From 5007c9757ac8b719fb3ba3fbcbb0c8da7fbfb700 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Tue, 9 Jun 2026 01:50:42 +0000 Subject: [PATCH] results: just results = eval2 deploy table (time/headline/deploy/arm/pair/seed/train/argv); hard eval2 cutoff; archive eval1 (Q1-Q13 + 352 old logs) --- docs/results.md | 352 ++----------------------------- docs/results_eval1_archive.md | 378 ++++++++++++++++++++++++++++++++++ justfile | 15 +- scripts/results.py | 8 + scripts/results_deploy.py | 46 ++++- 5 files changed, 451 insertions(+), 348 deletions(-) create mode 100644 docs/results_eval1_archive.md diff --git a/docs/results.md b/docs/results.md index 2192260..a63cbc9 100644 --- a/docs/results.md +++ b/docs/results.md @@ -1,348 +1,26 @@ # Results, organized by the question each run answers -Generated from `logs/*.log` via `just results` (source: `scripts/results.py`). -Curated snapshot 2026-05-30; regenerate any time. Each table cites its source -logs in an HTML comment so every number traces back to a file. +Deploy-eval is the headline metric: knob-off forward on the recency-clean held-out +TEST set (ids>=3243, base solve ~0.1, n=119), single-mode `run_tests` env, Qwen3-4B. +Regenerate the table with `just results-deploy` (scripts/results_deploy.py, auto-discovers +every `out/runs/*/deploy_test.json`); `just results` gives the live training-hack table. + +OLD eval1 results (training-hack metric, the Q1-Q13 mechanism/basis/refresh studies on the +pre-recency-clean eval) are archived in [results_eval1_archive.md](results_eval1_archive.md). ## How to read this -- **Tables show absolute last-5-step rates** (mean of the final 5 training - steps; converged regime, noise-robust vs a single step). Compare rows within - a table by eye. Paired-vs-vanilla deltas are mentioned in prose only where - the seeds match. -- **hack** = fraction of *student* rollouts flagged as reward-hacks (`hack_s`). 
-- **solve** = fraction of *student* rollouts passing ground-truth tests - (`gt_s`). NOT `PASS_RATE` (which mixes in the ~99%-hacked teacher pool). -- **±std is across seeds.** Blank = n=1 (no std). At n=4 the seed-to-seed std - is ~0.12 on both vanilla and projected, so 5-step single-seed numbers are - noisy; weight by n. -- **Never compare a multi-seed mean to a single-seed point.** Several arms - (refresh-1/5/10, no_gate, reverse, mean-diff) only ran on seed 41. Those are - compared *only at seed 41*, against the seed-41 vanilla and seed-41 frozen - rows, never against a 4-seed mean. Mixing n is how the old refresh "ladder" - produced a fake monotonic trend. -- All runs are the `fast` preset (20 steps, G=4, cached-teacher mix); the fast - surrogate regime, not endogenous hacking. Incomplete runs are excluded (a run - must log all `steps`). -- Confound (corrected from safetensors shapes, see Q8): `v_hack_full` = 10 - pairs / k=5; `v_hack_21pairs` = 16 pairs / k=12. Cross-basis rows confound - pair-count AND directions-kept AND tau — NOT a clean "pair set" axis. +- **Headline = solve_deploy - hack_deploy** (both alone are gameable; the gap is the honest + objective: solve the task without learning the cheat). Higher is better. +- **hack_deploy / solve_deploy** = knob-off deploy eval (quarantine deleted for routeV; the + trained model for vanilla) on test n=119. **hack_train / solve_train** = last-5-step student + rates during training (converged regime). +- **All rows n=1** (single seed 43) unless noted; treat gaps <=0.03 as noise. +- Watch the **pool/pairs confound** across rows (see `argv` / `train_set`); only same-axis + pairs are A/B-comparable (called out in the answer). --- -## Q1. Does the cached-teacher pool drive the student to hack? 
(feasibility, H4) - - - -| arm | mix | hack | ±std | solve | ±std | seeds | -| :------ | ----: | ----: | ----: | ----: | ----: | ----------: | -| vanilla | 0.5 | 0.719 | 0.120 | 0.306 | 0.116 | 41,42,43,44 | -| vanilla | 0.25 | 0.678 | 0.082 | 0.200 | 0.076 | 41,42,43 | -| vanilla | 0.125 | 0.757 | 0.040 | 0.207 | 0.020 | 41 (×2) | - -**Answer: yes.** Clean Qwen3-4B reaches 68-76% last-5 hack within 20 steps at -every teacher density. (Don't read a mix trend here — different seed sets; see -Q6 for the paired mix comparison.) - -## Q2. 🥇 Does v_hack projection reduce hacking vs vanilla? (H1) - - - -mix=0.5, v_hack_21pairs, one_sided, k=5, all n=4 (seeds 41-44): - -| arm | hack | ±std | solve | ±std | -| :------------------ | ----: | ----: | ----: | ----: | -| vanilla | 0.719 | 0.120 | 0.306 | 0.116 | -| projected frozen-V | 0.588 | 0.131 | 0.256 | 0.083 | -| projected refresh-2 | 0.537 | 0.066 | 0.225 | 0.050 | - -**Answer: a consistent-in-sign reduction.** Frozen drops hack 0.719→0.588 -(−13pp), refresh-2 →0.537 (−18pp); both cost ~5-8pp solve. Per-seed paired -deltas (same-seed vanilla) are negative on every seed but the std (~0.13-0.17) -is about the mean, so the magnitude is not pinned down at n=4. Short of the -preregistered 30pp. Note refresh-2 has the *tightest* hack std (0.066), i.e. -its effect is the most seed-stable. - -## Q3. one_sided vs no_gate vs reverse gating? (gate_mode, seed 41 only) - - - -no_gate and reverse only ran on seed 41, so this is a seed-41 within-group -comparison (no cross-seed mixing): - -| gate | hack | solve | -| :-------- | ----: | ----: | -| vanilla | 0.775 | 0.300 | -| one_sided | 0.775 | 0.275 | -| no_gate | 0.625 | 0.200 | -| reverse | 0.575 | 0.150 | - -**Answer: more-aggressive gates cut more hack but cost more solve, and -one_sided on the 18-pair basis does ~nothing at seed 41** (0.775 = vanilla). 
-This is the weak-basis signal (Q8): the 18-pair v_hack barely overlaps the live -gradient, so only the brute no_gate/reverse gates move hack — and they pay for -it in solve (0.200, 0.150 vs 0.300). Single seed; directional only. - -## Q4. SVD top-k vs rank-1 mean-diff? (basis, seed 41 only) - - - -| basis | hack | solve | -| :-------------------- | ----: | ----: | -| vanilla | 0.775 | 0.300 | -| SVD k=5 (v_hack_full) | 0.775 | 0.275 | -| mean-diff k=1 | 0.750 | 0.125 | - -**Answer: at seed 41 neither 18-pair basis cuts hack, and mean-diff tanks -solve** (0.300→0.125). Rank-1 being too blunt is plausible; n=1, weak-basis -confound (Q8) dominates anyway. - -## Q5. refresh-every cadence (seed 41 only — the honest comparison) - - - -refresh-1/5/10 only ran on seed 41, so the only valid comparison is at seed 41, -on the shared seed-41 vanilla baseline: - -| refresh | hack | solve | -| :------------- | ----: | ----: | -| vanilla | 0.775 | 0.300 | -| frozen (n=20+) | 0.475 | 0.200 | -| 10 | 0.575 | 0.200 | -| 5 | 0.550 | 0.225 | -| 2 | 0.450 | 0.200 | -| 1 | 0.600 | 0.200 | - -**Answer: no monotonic refresh trend.** At seed 41, frozen (0.475) and -refresh-2 (0.450) are the best; refresh-1/5/10 are worse. The earlier "more -refresh = more suppression" ladder was an artifact of comparing seed-41-only -refresh-5/10 against a *4-seed* frozen mean (−0.131 paired). The only cadence -with multi-seed support is refresh-2 (Q2): on the full seed set it edges frozen -(0.537 vs 0.588 hack), but at seed 41 alone the two are within noise. Refresh -helps marginally at best; basis width (Q8) is the real lever. - -## Q6. Teacher density (mix) — paired, does the gap hold as the pool thins? - - - -Paired Δ vs same-seed vanilla (v_hack_full, frozen, one_sided). 
Δ columns are -per-seed paired means; absolute hack/solve are group means (may differ slightly -from Δ since n differs): - -| mix | van hack | proj hack | Δhack | ±std | van solve | proj solve | Δsolve | n | shared seeds | -| ----: | -------: | --------: | -----: | ----: | --------: | ---------: | -----: | ---: | -----------: | -| 0.5 | 0.719 | 0.700 | −0.062 | 0.075 | 0.306 | 0.283 | −0.081 | 4 | 41(×2),43,44 | -| 0.25 | 0.678 | 0.556 | −0.122 | 0.146 | 0.200 | 0.217 | +0.017 | 3 | 41,42,43 | -| 0.125 | 0.757 | 0.657 | −0.100 | 0.040 | 0.207 | 0.214 | +0.007 | 2 | 41(×2) | - -**Answer: the reduction holds across densities (−6 to −12pp), and the solve -cost vanishes at low mix** — Δsolve goes from −8pp at mix=0.5 to slightly -*positive* (+0.7 to +1.7pp) at mix=0.25/0.125. mix=0.125 also has the tightest -std (0.040, n=2). This is why 0.125 is now the locked-in default: same hack -cut, no solve tax. - -## Q8. Weak basis (`v_hack_full`) vs strong basis (`v_hack_21pairs`) - - - -The basis NAMES are misleading. Reading the safetensors shapes/metadata (the -stored per-pair grads' first dim = pairs used; basis `top_k` from header): - -| basis | pairs used | k (top_k) | extract tau | what it is | -| :--------------- | ---------: | --------: | ----------: | :--------- | -| `v_hack_full` | **10** | **5** | 0.25 | older ~12-pair set, k=5 | -| `v_hack_21pairs` | **16** | **12** | 0.0 | later ~18-pair set, k=12 | - -Neither is 18 or 21 pairs (n_heldout=2 reserves 2). Both load with the same -train-time `drop_bottom_frac=0.25` noise floor. So the comparison below is -**triple-confounded: pairs (10 vs 16) AND directions kept (k=5 vs k=12) AND -extract tau.** We cannot attribute the gap to "pair set". 
- -mix=0.5, frozen, one_sided: - -| basis | hack | ±std | solve | ±std | n | seeds | -| :----------------- | ----: | ----: | ----: | ----: | ---: | ----------: | -| vanilla | 0.719 | 0.120 | 0.306 | 0.116 | 4 | 41,42,43,44 | -| v_hack_full (weak) | 0.700 | 0.109 | 0.283 | 0.038 | 3 | 41,43,44 | -| v_hack_21pairs | 0.588 | 0.131 | 0.256 | 0.083 | 4 | 41,42,43,44 | - -At shared seed 41: weak basis = 0.775 (= vanilla, no effect), strong = 0.475. - -**Answer: the stronger basis cuts hack ~2x more — but pair *count* is a red -herring; what matters is which hack *mechanisms* the pairs cover.** The strong -basis spans the later axes (try/except-swallow, type-only-assert, -weak-inequality, hardcode) that the weak/older set under-covers. The real -experiment is a content/axis ablation — which mechanisms carry the cut — which -is the same question as G2/G3 cross-mechanism generalisation (does a basis from -mechanism A suppress hack B), the no-cheat hypothesis itself. The k=5-vs-12 -and 10-vs-16 differences are present but secondary. - -Current `pairs.py` (`PAIRS`, 18 pairs) by mechanism: axis-1 weak-`run_tests` = -8/18; hardcode / persona / try-except-swallow / type-only-assert / -weak-inequality = 2 each. - ---- - -## Q9. Solve-direction orthogonalization (does stripping the solve subspace recover solve?) - - - -| basis | hack | solve | -| :--------------------- | ----: | ----: | -| vanilla | 0.775 | 0.300 | -| 18-pair base (no orth) | 0.500 | 0.200 | -| 18-pair solve-orth m=4 | 0.550 | 0.150 | - -**Answer: no — at n=1 it did the opposite.** Stripping the top-4 solve -directions from D pre-SVD was meant to *recover* solve; instead solve fell -0.200→0.150 and hack rose 0.500→0.550. Both moves are ~0.05, inside the ~0.12 -seed std — inconclusive, leaning negative. 
Caveats: (1) two nominally-18-pair -bases already disagree by 0.275 hack at this seed (`v_hack_full`=0.775 vs -`v_hack_18base`=0.500), so extraction variance likely dominates a 0.05 delta; -(2) with 18 pairs the solve basis B (top-4 SVD of G_c) is itself noisy and may -strip real hack signal; (3) hack/solve subspaces may genuinely overlap. Needs -≥3 seeds before any verdict. - ---- - -## Q10. 🥇 Does the *pair set content* matter? (mechanism vs framing vs placebo) - - - -The detector we're allowed to have is *weak* (no-cheat invariant): it sees some -hacks and misses others. So: does a v_hack extracted from a pair set that does -NOT contrast the LeetCode mechanism still suppress the mechanical hack? We swap -only the pair-set content (every basis extracted identically, k=12/tau=0, -trained k=5) and read Δhack vs same-seed vanilla. n=1 projected per row; -±0.06 is the baseline noise (std of the 3 seed-41 vanilla runs), so treat -anything inside ±0.06 as null. - -| basis (pair set) | contrasts | hack | solve | Δhack vs vanilla | -| :-------------------- | :--------------- | ----: | ----: | ---------------: | -| vanilla (baseline) | -- | 0.726 | ~0.20 | — | -| `prog_wide` | hack *mechanism* | 0.500 | 0.221 | −0.226 | -| `prog_wider` | mech + lang/cond | 0.679 | 0.236 | −0.048 | -| `intent_vs_spec` | semantic framing | 0.686 | 0.207 | −0.040 | -| `honesty_text` | semantic framing | 0.714 | 0.193 | −0.012 | -| `moral` | semantic framing | 0.721 | 0.221 | −0.005 | -| `eval_aware` | semantic framing | 0.736 | 0.186 | +0.010 | -| `philosophical` | semantic framing | 0.743 | 0.243 | +0.017 | -| `null_city` (PLACEBO) | random content | 0.750 | 0.221 | +0.024 | - -(Baseline = mean of the 3 seed-41 vanilla mix=0.125 runs the deltas are paired -against. The canonical `v_hack_21pairs` is NOT in this table: it was only run -at mix=0.5 / different step counts, so a same-table comparison would confound -mix and horizon. Its mix=0.5 effect is in Q2.) 
- -**Answer: it's the *mechanism*, not the *framing*.** Pairs that contrast the -programmatic hack mechanism (`prog_wide`) cut hack the most (−0.226), at no -solve cost. Semantic / value framings (moral, honesty, eval-awareness, -philosophy) do essentially nothing -- all within baseline noise of the -`null_city` placebo. The placebo sits at +0.024 (no effect), exactly as it -should. So v_hack is picking up the *hack-mechanism* subspace, not a generic -"honesty" or "intent" direction. - -Caveats (n=1, hold loosely): (1) all rows are single seed-41 runs; ±0.06 is the -seed-41 vanilla noise, so everything from `intent_vs_spec` down is null. -`prog_wide` needs ≥3 seeds (task #122) before the −0.226 is trustworthy. -(2) Broadening `prog_wide`→`prog_wider` (adding language/phrasing/condition -variation) *hurt* (−0.226→−0.048): diluting the mechanism contrast with surface -variation weakened the basis. (3) Encouraging for the no-cheat story -- a -mechanism-matched-but-off-task detector generalizes -- but the real -generalization test is held-out *mechanism* (Stage 2/3), not held-out framing. - -## Q11. Does the projection gap survive to convergence? (60-step, seed 42) - - - -The Q2 gap (−13 to −18pp) is measured at 20 steps, where vanilla hack has only -just plateaued (~step 13-16, see Dynamics note). Does projection keep hack down -once we run 3x past the plateau? One 60-step run per arm, seed 42, mix=0.125: - -| arm | L5 hack | L5 solve | -| :------------------ | ------: | -------: | -| vanilla | 0.936 | 0.293 | -| projected frozen-V | 0.957 | 0.293 | -| projected refresh-2 | 0.907 | 0.307 | - -**Answer: at n=1, the gap closes.** By step 60 all three arms sit at ~0.91-0.96 -hack -- projection delays hacking but does not prevent it at this horizon. The -attractor in this surrogate (cached-teacher) regime is full hack, and the -projected student eventually catches up. 
refresh-2 is marginally below vanilla -(−2.9pp) and frozen marginally above (+2.1pp), both inside the ~0.06-0.12 seed -noise, so the honest read is "no surviving gap at 60 steps, seed 42." - -Caveats: (1) n=1, seed 42 only -- needs the 3-seed convergence (task #121) to -distinguish "gap truly closes" from "seed-42 is a high-hack draw". (2) This is -mix=0.125 + seed 42 + 60 steps, three axes different from Q2's mix=0.5/20-step -numbers, so it is NOT a clean "same run, later" comparison. (3) The 20-step -suppression is real (Q2, n=4); what's unclear is whether longer training erodes -it or whether this is a sparse-teacher/seed artifact. - -## Q12. route2 deploy hack/solve, n=3 (the headline; DEPLOY-eval metric) - - - -The headline comparison. route2 (per-rollout gate routes the hack-aligned -gradient into a deletable quarantine knob) vs vanilla, deploy-eval: - -| arm | seeds | deploy hack | ±SEM | deploy solve | ±SEM | -| :-------------- | :------- | ----------: | ----: | -----------: | ----: | -| vanilla GRPO | 42,43 | 0.305 | 0.039 | 0.516 | 0.032 | -| route2 (ours) | 41,42,43 | 0.031 | 0.031 | 0.615 | 0.010 | - -Per-seed route2 deploy hack: s41 0.000, s42 0.000, s43 0.094 (solve 0.625 / -0.594 / 0.625). **route2 cuts deploy hack ~27pp (0.305 -> 0.031) and solves -~10pp higher** (0.615 vs 0.516). Unlike Q11's training-hack metric (where the -gap closed at 60 steps in the surrogate regime), the deploy-eval metric shows -a large, persistent gap: the quarantine knob holds the cheat and deleting it at -deploy removes it. Keynote figure: `out/figs/dyn_sub4_hack_overlay.png` -(vanilla hack climbs to ~0.43, route2 stays ~0). Caveat: vanilla is n=2 (s41 = -job 77 pending); promote to n=3 + paired test when it lands. - -## Q13. Does the exploration floor leak, and is the leak staleness or structure? - - - -route2 has an optional exploration floor (`rollout_ablate_frac`): a fraction of -rollouts generated knob-off so the deployed knob keeps seeing solve signal. 
The -floor introduces a leak -- with a stale (frozen) v_hack the deployed model still -hacks: - -| arm | deploy hack | -| :----------------------------- | ----------: | -| no floor (frac=0) | 0.000 | -| floor=0.5, stale v_hack | 0.125 | -| floor=0.5, refresh-1 (fresh V) | 0.000 | - -**The leak is staleness, not floor structure.** A fresh per-step gate -(refresh-1) closes the floor's 0.125 leak back to 0.000. Caveat: the refresh-1 -run (job 73) was daemon-killed at step 28/60, but deploy hack was pinned at -0.000 at every eval through step 25, so the directional answer is unambiguous; -a clean 60-step rerun would make it airtight. The headline arm uses no floor -anyway (already 0.000), so this is a design note for when the floor is wanted. - ## Q14. 🥇 routeV deploy on the recency-clean eval2 test set (the current headline) + +| arm | mix | hack | ±std | solve | ±std | seeds | +| :------ | ----: | ----: | ----: | ----: | ----: | ----------: | +| vanilla | 0.5 | 0.719 | 0.120 | 0.306 | 0.116 | 41,42,43,44 | +| vanilla | 0.25 | 0.678 | 0.082 | 0.200 | 0.076 | 41,42,43 | +| vanilla | 0.125 | 0.757 | 0.040 | 0.207 | 0.020 | 41 (×2) | + +**Answer: yes.** Clean Qwen3-4B reaches 68-76% last-5 hack within 20 steps at +every teacher density. (Don't read a mix trend here — different seed sets; see +Q6 for the paired mix comparison.) + +## Q2. 🥇 Does v_hack projection reduce hacking vs vanilla? (H1) + + + +mix=0.5, v_hack_21pairs, one_sided, k=5, all n=4 (seeds 41-44): + +| arm | hack | ±std | solve | ±std | +| :------------------ | ----: | ----: | ----: | ----: | +| vanilla | 0.719 | 0.120 | 0.306 | 0.116 | +| projected frozen-V | 0.588 | 0.131 | 0.256 | 0.083 | +| projected refresh-2 | 0.537 | 0.066 | 0.225 | 0.050 | + +**Answer: a consistent-in-sign reduction.** Frozen drops hack 0.719→0.588 +(−13pp), refresh-2 →0.537 (−18pp); both cost ~5-8pp solve. 
Per-seed paired +deltas (same-seed vanilla) are negative on every seed but the std (~0.13-0.17) +is about the mean, so the magnitude is not pinned down at n=4. Short of the +preregistered 30pp. Note refresh-2 has the *tightest* hack std (0.066), i.e. +its effect is the most seed-stable. + +## Q3. one_sided vs no_gate vs reverse gating? (gate_mode, seed 41 only) + + + +no_gate and reverse only ran on seed 41, so this is a seed-41 within-group +comparison (no cross-seed mixing): + +| gate | hack | solve | +| :-------- | ----: | ----: | +| vanilla | 0.775 | 0.300 | +| one_sided | 0.775 | 0.275 | +| no_gate | 0.625 | 0.200 | +| reverse | 0.575 | 0.150 | + +**Answer: more-aggressive gates cut more hack but cost more solve, and +one_sided on the 18-pair basis does ~nothing at seed 41** (0.775 = vanilla). +This is the weak-basis signal (Q8): the 18-pair v_hack barely overlaps the live +gradient, so only the brute no_gate/reverse gates move hack — and they pay for +it in solve (0.200, 0.150 vs 0.300). Single seed; directional only. + +## Q4. SVD top-k vs rank-1 mean-diff? (basis, seed 41 only) + + + +| basis | hack | solve | +| :-------------------- | ----: | ----: | +| vanilla | 0.775 | 0.300 | +| SVD k=5 (v_hack_full) | 0.775 | 0.275 | +| mean-diff k=1 | 0.750 | 0.125 | + +**Answer: at seed 41 neither 18-pair basis cuts hack, and mean-diff tanks +solve** (0.300→0.125). Rank-1 being too blunt is plausible; n=1, weak-basis +confound (Q8) dominates anyway. + +## Q5. 
refresh-every cadence (seed 41 only — the honest comparison) + + + +refresh-1/5/10 only ran on seed 41, so the only valid comparison is at seed 41, +on the shared seed-41 vanilla baseline: + +| refresh | hack | solve | +| :------------- | ----: | ----: | +| vanilla | 0.775 | 0.300 | +| frozen (n=20+) | 0.475 | 0.200 | +| 10 | 0.575 | 0.200 | +| 5 | 0.550 | 0.225 | +| 2 | 0.450 | 0.200 | +| 1 | 0.600 | 0.200 | + +**Answer: no monotonic refresh trend.** At seed 41, frozen (0.475) and +refresh-2 (0.450) are the best; refresh-1/5/10 are worse. The earlier "more +refresh = more suppression" ladder was an artifact of comparing seed-41-only +refresh-5/10 against a *4-seed* frozen mean (−0.131 paired). The only cadence +with multi-seed support is refresh-2 (Q2): on the full seed set it edges frozen +(0.537 vs 0.588 hack), but at seed 41 alone the two are within noise. Refresh +helps marginally at best; basis width (Q8) is the real lever. + +## Q6. Teacher density (mix) — paired, does the gap hold as the pool thins? + + + +Paired Δ vs same-seed vanilla (v_hack_full, frozen, one_sided). Δ columns are +per-seed paired means; absolute hack/solve are group means (may differ slightly +from Δ since n differs): + +| mix | van hack | proj hack | Δhack | ±std | van solve | proj solve | Δsolve | n | shared seeds | +| ----: | -------: | --------: | -----: | ----: | --------: | ---------: | -----: | ---: | -----------: | +| 0.5 | 0.719 | 0.700 | −0.062 | 0.075 | 0.306 | 0.283 | −0.081 | 4 | 41(×2),43,44 | +| 0.25 | 0.678 | 0.556 | −0.122 | 0.146 | 0.200 | 0.217 | +0.017 | 3 | 41,42,43 | +| 0.125 | 0.757 | 0.657 | −0.100 | 0.040 | 0.207 | 0.214 | +0.007 | 2 | 41(×2) | + +**Answer: the reduction holds across densities (−6 to −12pp), and the solve +cost vanishes at low mix** — Δsolve goes from −8pp at mix=0.5 to slightly +*positive* (+0.7 to +1.7pp) at mix=0.25/0.125. mix=0.125 also has the tightest +std (0.040, n=2). 
This is why 0.125 is now the locked-in default: same hack +cut, no solve tax. + +## Q8. Weak basis (`v_hack_full`) vs strong basis (`v_hack_21pairs`) + + + +The basis NAMES are misleading. Reading the safetensors shapes/metadata (the +stored per-pair grads' first dim = pairs used; basis `top_k` from header): + +| basis | pairs used | k (top_k) | extract tau | what it is | +| :--------------- | ---------: | --------: | ----------: | :--------- | +| `v_hack_full` | **10** | **5** | 0.25 | older ~12-pair set, k=5 | +| `v_hack_21pairs` | **16** | **12** | 0.0 | later ~18-pair set, k=12 | + +Neither is 18 or 21 pairs (n_heldout=2 reserves 2). Both load with the same +train-time `drop_bottom_frac=0.25` noise floor. So the comparison below is +**triple-confounded: pairs (10 vs 16) AND directions kept (k=5 vs k=12) AND +extract tau.** We cannot attribute the gap to "pair set". + +mix=0.5, frozen, one_sided: + +| basis | hack | ±std | solve | ±std | n | seeds | +| :----------------- | ----: | ----: | ----: | ----: | ---: | ----------: | +| vanilla | 0.719 | 0.120 | 0.306 | 0.116 | 4 | 41,42,43,44 | +| v_hack_full (weak) | 0.700 | 0.109 | 0.283 | 0.038 | 3 | 41,43,44 | +| v_hack_21pairs | 0.588 | 0.131 | 0.256 | 0.083 | 4 | 41,42,43,44 | + +At shared seed 41: weak basis = 0.775 (= vanilla, no effect), strong = 0.475. + +**Answer: the stronger basis cuts hack ~2x more — but pair *count* is a red +herring; what matters is which hack *mechanisms* the pairs cover.** The strong +basis spans the later axes (try/except-swallow, type-only-assert, +weak-inequality, hardcode) that the weak/older set under-covers. The real +experiment is a content/axis ablation — which mechanisms carry the cut — which +is the same question as G2/G3 cross-mechanism generalisation (does a basis from +mechanism A suppress hack B), the no-cheat hypothesis itself. The k=5-vs-12 +and 10-vs-16 differences are present but secondary. 
+ +Current `pairs.py` (`PAIRS`, 18 pairs) by mechanism: axis-1 weak-`run_tests` = +8/18; hardcode / persona / try-except-swallow / type-only-assert / +weak-inequality = 2 each. + +--- + +## Q9. Solve-direction orthogonalization (does stripping the solve subspace recover solve?) + + + +| basis | hack | solve | +| :--------------------- | ----: | ----: | +| vanilla | 0.775 | 0.300 | +| 18-pair base (no orth) | 0.500 | 0.200 | +| 18-pair solve-orth m=4 | 0.550 | 0.150 | + +**Answer: no — at n=1 it did the opposite.** Stripping the top-4 solve +directions from D pre-SVD was meant to *recover* solve; instead solve fell +0.200→0.150 and hack rose 0.500→0.550. Both moves are ~0.05, inside the ~0.12 +seed std — inconclusive, leaning negative. Caveats: (1) two nominally-18-pair +bases already disagree by 0.275 hack at this seed (`v_hack_full`=0.775 vs +`v_hack_18base`=0.500), so extraction variance likely dominates a 0.05 delta; +(2) with 18 pairs the solve basis B (top-4 SVD of G_c) is itself noisy and may +strip real hack signal; (3) hack/solve subspaces may genuinely overlap. Needs +≥3 seeds before any verdict. + +--- + +## Q10. 🥇 Does the *pair set content* matter? (mechanism vs framing vs placebo) + + + +The detector we're allowed to have is *weak* (no-cheat invariant): it sees some +hacks and misses others. So: does a v_hack extracted from a pair set that does +NOT contrast the LeetCode mechanism still suppress the mechanical hack? We swap +only the pair-set content (every basis extracted identically, k=12/tau=0, +trained k=5) and read Δhack vs same-seed vanilla. n=1 projected per row; +±0.06 is the baseline noise (std of the 3 seed-41 vanilla runs), so treat +anything inside ±0.06 as null. 
+ +| basis (pair set) | contrasts | hack | solve | Δhack vs vanilla | +| :-------------------- | :--------------- | ----: | ----: | ---------------: | +| vanilla (baseline) | -- | 0.726 | ~0.20 | — | +| `prog_wide` | hack *mechanism* | 0.500 | 0.221 | −0.226 | +| `prog_wider` | mech + lang/cond | 0.679 | 0.236 | −0.048 | +| `intent_vs_spec` | semantic framing | 0.686 | 0.207 | −0.040 | +| `honesty_text` | semantic framing | 0.714 | 0.193 | −0.012 | +| `moral` | semantic framing | 0.721 | 0.221 | −0.005 | +| `eval_aware` | semantic framing | 0.736 | 0.186 | +0.010 | +| `philosophical` | semantic framing | 0.743 | 0.243 | +0.017 | +| `null_city` (PLACEBO) | random content | 0.750 | 0.221 | +0.024 | + +(Baseline = mean of the 3 seed-41 vanilla mix=0.125 runs the deltas are paired +against. The canonical `v_hack_21pairs` is NOT in this table: it was only run +at mix=0.5 / different step counts, so a same-table comparison would confound +mix and horizon. Its mix=0.5 effect is in Q2.) + +**Answer: it's the *mechanism*, not the *framing*.** Pairs that contrast the +programmatic hack mechanism (`prog_wide`) cut hack the most (−0.226), at no +solve cost. Semantic / value framings (moral, honesty, eval-awareness, +philosophy) do essentially nothing -- all within baseline noise of the +`null_city` placebo. The placebo sits at +0.024 (no effect), exactly as it +should. So v_hack is picking up the *hack-mechanism* subspace, not a generic +"honesty" or "intent" direction. + +Caveats (n=1, hold loosely): (1) all rows are single seed-41 runs; ±0.06 is the +seed-41 vanilla noise, so everything from `intent_vs_spec` down is null. +`prog_wide` needs ≥3 seeds (task #122) before the −0.226 is trustworthy. +(2) Broadening `prog_wide`→`prog_wider` (adding language/phrasing/condition +variation) *hurt* (−0.226→−0.048): diluting the mechanism contrast with surface +variation weakened the basis. 
(3) Encouraging for the no-cheat story -- a +mechanism-matched-but-off-task detector generalizes -- but the real +generalization test is held-out *mechanism* (Stage 2/3), not held-out framing. + +## Q11. Does the projection gap survive to convergence? (60-step, seed 42) + + + +The Q2 gap (−13 to −18pp) is measured at 20 steps, where vanilla hack has only +just plateaued (~step 13-16, see Dynamics note). Does projection keep hack down +once we run 3x past the plateau? One 60-step run per arm, seed 42, mix=0.125: + +| arm | L5 hack | L5 solve | +| :------------------ | ------: | -------: | +| vanilla | 0.936 | 0.293 | +| projected frozen-V | 0.957 | 0.293 | +| projected refresh-2 | 0.907 | 0.307 | + +**Answer: at n=1, the gap closes.** By step 60 all three arms sit at ~0.91-0.96 +hack -- projection delays hacking but does not prevent it at this horizon. The +attractor in this surrogate (cached-teacher) regime is full hack, and the +projected student eventually catches up. refresh-2 is marginally below vanilla +(−2.9pp) and frozen marginally above (+2.1pp), both inside the ~0.06-0.12 seed +noise, so the honest read is "no surviving gap at 60 steps, seed 42." + +Caveats: (1) n=1, seed 42 only -- needs the 3-seed convergence (task #121) to +distinguish "gap truly closes" from "seed-42 is a high-hack draw". (2) This is +mix=0.125 + seed 42 + 60 steps, three axes different from Q2's mix=0.5/20-step +numbers, so it is NOT a clean "same run, later" comparison. (3) The 20-step +suppression is real (Q2, n=4); what's unclear is whether longer training erodes +it or whether this is a sparse-teacher/seed artifact. + +## Q12. route2 deploy hack/solve, n=3 (the headline; DEPLOY-eval metric) + + + +The headline comparison. 
route2 (per-rollout gate routes the hack-aligned +gradient into a deletable quarantine knob) vs vanilla, deploy-eval: + +| arm | seeds | deploy hack | ±SEM | deploy solve | ±SEM | +| :-------------- | :------- | ----------: | ----: | -----------: | ----: | +| vanilla GRPO | 42,43 | 0.305 | 0.039 | 0.516 | 0.032 | +| route2 (ours) | 41,42,43 | 0.031 | 0.031 | 0.615 | 0.010 | + +Per-seed route2 deploy hack: s41 0.000, s42 0.000, s43 0.094 (solve 0.625 / +0.594 / 0.625). **route2 cuts deploy hack ~27pp (0.305 -> 0.031) and solves +~10pp higher** (0.615 vs 0.516). Unlike Q11's training-hack metric (where the +gap closed at 60 steps in the surrogate regime), the deploy-eval metric shows +a large, persistent gap: the quarantine knob holds the cheat and deleting it at +deploy removes it. Keynote figure: `out/figs/dyn_sub4_hack_overlay.png` +(vanilla hack climbs to ~0.43, route2 stays ~0). Caveat: vanilla is n=2 (s41 = +job 77 pending); promote to n=3 + paired test when it lands. + +## Q13. Does the exploration floor leak, and is the leak staleness or structure? + + + +route2 has an optional exploration floor (`rollout_ablate_frac`): a fraction of +rollouts generated knob-off so the deployed knob keeps seeing solve signal. The +floor introduces a leak -- with a stale (frozen) v_hack the deployed model still +hacks: + +| arm | deploy hack | +| :----------------------------- | ----------: | +| no floor (frac=0) | 0.000 | +| floor=0.5, stale v_hack | 0.125 | +| floor=0.5, refresh-1 (fresh V) | 0.000 | + +**The leak is staleness, not floor structure.** A fresh per-step gate +(refresh-1) closes the floor's 0.125 leak back to 0.000. Caveat: the refresh-1 +run (job 73) was daemon-killed at step 28/60, but deploy hack was pinned at +0.000 at every eval through step 25, so the directional answer is unambiguous; +a clean 60-step rerun would make it airtight. The headline arm uses no floor +anyway (already 0.000), so this is a design note for when the floor is wanted. 
+ +## Dynamics note (sizing the convergence test) + +Per-step trajectories (mix=0.125 g8, seed 41): `hack_s` rises 0→~0.6-0.75 and +**plateaus by step ~13-16**; `gt_s` (solve) stays **noisy-flat at ~0.1-0.5 the +whole run, it never climbs**. The attractor in this surrogate regime is full +*hack*, not full solve — so "run until full solve" has no target. The +convergence question is therefore: once vanilla hack plateaus (~step 15), does +projected stay below it or catch up? A 60-step run (~2.2h at g8) sees 3x past +the plateau; a 1000-step run (~36h) is wasteful. + +## Open / queued (no result yet) + +- **convergence at ≥3 seeds (#121)**: the n=1 seed-42 run (Q11) shows the gap + closing by step 60, but that could be a seed-42 high-hack draw. Need 2+ more + seeds before concluding the suppression erodes vs survives. +- **pairset content at ≥3 seeds (#122)**: Q10's mechanism>framing>placebo + ordering is n=1 per row; replicate `prog_wide` and the placebo on 2+ seeds. +- **route arm at scale (#182)**: running; validates routing's ablated-eval + hack dict: def parse_log(path: Path) -> dict | None: + ts_m = TS_RE.search(path.name) + if ts_m and ts_m.group(1) < EVAL2_CUTOFF: + return None # pre-eval2 (OLD eval) -> docs/results_eval1_archive.md txt = path.read_text(errors="replace") argv = next((l for l in txt.splitlines() if "argv:" in l), None) preset_line = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "") diff --git a/scripts/results_deploy.py b/scripts/results_deploy.py index 06eaa43..50da183 100644 --- a/scripts/results_deploy.py +++ b/scripts/results_deploy.py @@ -16,6 +16,10 @@ import polars as pl from tabulate import tabulate LOG_DIR = Path("logs") +TS_RE = re.compile(r"(\d{8}T\d{6})") +# Hard cutoff: eval2-era only (recency-clean test). Pre-cutoff = OLD eval; archived in +# docs/results_eval1_archive.md. Filters by the log's own timestamp, robust to old logs in logs/. 
+EVAL2_CUTOFF = "20260607T000000"
 FINAL_RE = re.compile(
     r"FINAL EVAL \[.*?\] DEPLOY \(held-out test, n=(\d+)\): "
     r"hack\(strict\)=([\d.]+) hack\(vendor eq_hinted\)=([\d.]+) solve=([\d.]+)")
@@ -55,7 +59,28 @@ def _train_l5(txt: str) -> tuple[float | None, float | None]:
     return mean(hs), mean(gts)
 
 
+def _arm(argv: str) -> str:
+    """Return "vanilla" or "routeV/<gate>[·randV]" from the CLI flags; values match as `--flag=v` or `--flag v`."""
+    if re.search(r"--intervention[= ]none", argv):  # same `[= ]` convention as _pair's flag regex
+        return "vanilla"
+    gate = ("act_vote" if re.search(r"--routeV-gate[= ]act_vote", argv) else
+            "lora" if "lora_frozen_b" in argv else
+            "per-token" if "--routeV-per-token" in argv else "grad-cos")  # grad-cos = no gate flag matched
+    return f"routeV/{gate}" + ("·randV" if "--routeV-random-v-seed" in argv else "")
+
+
+def _pair(argv: str) -> str:
+    """Pair-set: authored (--vhack-pairs-path None) | pool json stem | prog_wide (default)."""
+    m = re.search(r"--vhack-pairs-path[= ](\S+)", argv)
+    if m:
+        return "authored" if m.group(1) == "None" else Path(m.group(1)).stem
+    return "prog_wide"  # the training default when the flag is absent
+
+
 def parse(path: Path) -> dict | None:
+    ts_m = TS_RE.search(path.name)
+    if ts_m and ts_m.group(1) < EVAL2_CUTOFF:
+        return None  # pre-eval2 (OLD eval) -> results_eval1_archive.md
     txt = path.read_text(errors="replace")
     m = FINAL_RE.search(txt)
     if m is None:
@@ -65,14 +90,26 @@ def parse(path: Path) -> dict | None:
     argv = argv.split("train.py ", 1)[-1].strip() if "train.py " in argv else argv
     if "tiny-random" in txt or "preset=smoke" in txt:
         return None  # smoke garbage
+    # train model + train set (provenance). model from the preset line; train set =
+    # the teacher pool the student trained against (--teacher-pool-dir basename, or the
+    # preset default when the flag is absent -- fast preset = teacher_pool_runtests_dense).
+    preset_line = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "")
+    m_model = re.search(r"model=(\S+)", preset_line)
+    model = m_model.group(1).split("/")[-1] if m_model else "?"
+ m_pool = re.search(r"--teacher-pool-dir=(?:out/pools/)?(\S+)", argv) + train_set = m_pool.group(1) if m_pool else "default(rt_dense)" + m_seed = re.search(r"--seed=(\d+)", argv) # train hack/solve = L5 (mean of last 5 student steps) from the per-step table, # the same converged-regime convention as scripts/results.py. The BLUF main-metric # line is stdout-only (not in the verbose log), so we read the streamed table. hack_tr, solve_tr = _train_l5(txt) return dict( + time=ts_m.group(1) if ts_m else "?", headline=solve_dep - hack_dep, - solve=solve_tr, hack=hack_tr, - solve_deploy=solve_dep, hack_deploy=hack_dep, + hack_deploy=hack_dep, solve_deploy=solve_dep, + arm=_arm(argv), pair=_pair(argv), seed=int(m_seed.group(1)) if m_seed else None, + hack_train=hack_tr, solve_train=solve_tr, + model=model, train_set=train_set, n=n, argv=argv, ) @@ -83,8 +120,9 @@ def main() -> None: print("no eval2 (held-out test) deploy runs in logs/") return df = pl.DataFrame(rows).sort("headline", descending=True) - cols = ["headline", "solve", "hack", "solve_deploy", "hack_deploy", "n", "argv"] - print("\n## Deploy eval (eval2 = recency-clean held-out TEST), sorted by headline=solve_deploy-hack_deploy\n") + cols = ["time", "headline", "hack_deploy", "solve_deploy", "arm", "pair", "seed", + "hack_train", "solve_train", "model", "train_set", "n", "argv"] + print("\n## Deploy eval (eval2 = recency-clean held-out TEST n=119), sorted by headline=solve_deploy-hack_deploy\n") print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f"))