From 62e510ff57446474764f27376a1eb42cf2c838ed Mon Sep 17 00:00:00 2001 From: wassname Date: Tue, 2 Jun 2026 23:26:26 +0000 Subject: [PATCH] feat: mix=0 no-teacher ablation path (pure on-policy, pool kept for v_grad+partition) train.py: allow mix_ratio=0 with a teacher pool set -> G_t=0, student-only GRPO (guard the teacher-mixing branch on G_t>0, relax the (0,1) assertion to [0,1), drop G_t==0 from the degenerate check). The pool stays loaded for the 4-mode partition and route2 v_grad extraction; only the teacher-rollout MIX is removed. Smoke (mix=0 + normal mix=0.5 + vanilla) all green. Also: fill A4 long-run figure (fig:longrun) in main.tex, update writeup spec A4 status (route2 durable to 200; vanilla collapses ~88, not clean saturation). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- docs/spec/20260602_writeup_spec.md | 18 +++++++++++++----- docs/writeup/main.tex | 22 ++++++++++++++++------ src/projected_grpo/train.py | 19 +++++++++++++------ 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/docs/spec/20260602_writeup_spec.md b/docs/spec/20260602_writeup_spec.md index f1ccf93..3174a57 100644 --- a/docs/spec/20260602_writeup_spec.md +++ b/docs/spec/20260602_writeup_spec.md @@ -109,9 +109,16 @@ arm at matched seed/preset, deploy hack + solve: [ ] blocked on 75/78/80/81/83 (all queued). This is the "we are filling out ablations" table. -A4 -- Long-run figure. 200-step route2 (77) vs vanilla saturation (82); shows -the gap persists to convergence, pre-empts "you stopped at 60 steps". [ ] blocked -on 77 + 82 (queued p5/p0). +A4 -- Long-run figure. 200-step route2 (job 84, DONE) vs vanilla (job 85, running). +[/] route2 side landed: deploy hack = 0.000 every step to 199, solve ~0.61 flat +(out/figs/dyn_longrun_200.{png,csv}, fig:longrun in main.tex). vanilla learns the +cheat to ~0.55 by step 80 then COLLAPSES at ~88 (student logp craters, reward->0, +gn spikes ~75x, beta=0 no KL anchor) -- so the gap is durable in the valid 0-85 +window, but vanilla is not a clean saturation reference past step 88. Decision +pending (user): leave the collapse as an honest finding + limitations line, or +requeue vanilla-200 with an advantage std-floor for a clean saturating reference. +Renumber: the old "77/82" job ids are stale (those were the corrupted/merge-bug +ids); the live runs are 84 (route2) and 85 (vanilla). A5 -- Generalisation figure/table (the no-cheat payload, C2). Per-mode deploy hack: v_hack from 2 of 4 modes, measure suppression on the 2 held-out modes. @@ -140,8 +147,9 @@ fill A1/A2, append a journal entry. Then queue A5 (the gap). - [ ] no-cheat invariant stated explicitly: live routing never reads gt_pass or runs the full detector suite over student rollouts; the pair set is the only supervision. (Promote to README/spec, plan item #114.) -- [ ] convergence (77/82): the gap persists at 200 steps, pre-empting "you - stopped at 20-60 steps". +- [/] convergence (84/85): route2 holds hack=0 to 200 steps; gap durable in the + 0-85 window. CAVEAT: vanilla collapses at ~88 (not clean saturation past + there) -- report honestly, don't crop the collapse to fake a flat-high ref. - [ ] base-model and vanilla-saturation references present so emergence is real. ## Open editorial decisions diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex index 6c35dce..e9743ec 100644 --- a/docs/writeup/main.tex +++ b/docs/writeup/main.tex @@ -217,14 +217,24 @@ $+0.024$ while a mechanism-contrasting pairset moved it $-0.226$ \subsection{Long-run convergence} % --- Figure: 200-step ------------------------------------------------------- -% Provenance: NOT YET RUN. route2 converge = job 77 (200-step nofloor s41); -% vanilla saturation = job 82 (200-step none s41). Regenerate after both land. +% Provenance: route2 = pueue job 84 (out/runs/20260602T080804_..._route2_converge200_s41); +% vanilla = job 85 (out/runs/20260602T163201_..._vanilla_converge200_s41; vanilla still +% running at writing -> left panel fills to step 200 on completion). Data source committed +% at out/figs/dyn_longrun_200.csv; regen: uv run python scripts/plot_dynamics.py . \begin{figure}[t] \centering - \fbox{\parbox{0.8\linewidth}{\centering\vspace{2em}\TODO{200-step route2 - (job 77) vs vanilla saturation (job 82) -- figure pending both runs}\vspace{2em}}} - \caption{Deploy hack to convergence (200 steps), route2 vs vanilla, seed 41. - Pre-empts the ``you stopped at 60 steps'' critique. \TODO{interp.}} + \includegraphics[width=0.95\linewidth]{../../out/figs/dyn_longrun_200.png} + \caption{Deploy hack (red) vs solve (green) to convergence (200 steps), + seed 41, deploy-eval $n{=}64$, $T{=}0.7$, EMA-5. \textbf{route2} (right) holds + deploy hack at exactly $0$ for all 200 steps ($\text{hack}\equiv 0$ label) while + solve climbs to ${\sim}0.61$ and plateaus. \textbf{vanilla} (left) learns the + cheat (hack rises from the first-hack step to ${\sim}0.55$ by step~80), then the + policy \emph{collapses} around step~88 (student logp craters, reward $\to 0$, + grad-norm spikes ${\sim}75\times$ with no KL anchor), dragging both hack and + solve down: the late-vanilla descent is degeneration, not hack suppression. The + valid comparison window is steps 0--85, where vanilla acquires the hack and + route2 never does. Pre-empts the ``you stopped at 60 steps'' critique: the gap + is durable, not delayed.} \label{fig:longrun} \end{figure} diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index d530f6e..62cb1ec 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -488,14 +488,19 @@ def main(cfg: Config) -> int: G_s = group G_t = 0 if cfg.teacher_pool_dir is not None: - if not (0.0 < cfg.mix_ratio < 1.0): - raise ValueError(f"mix_ratio must be in (0,1) when teacher_pool_dir set; got {cfg.mix_ratio}") + # mix=0 is the NO-TEACHER ablation: pure on-policy GRPO (G_t=0, no teacher + # rollouts injected) while the pool is still loaded for the 4-mode partition + # and route2 v_grad extraction. Using the pairs for v_grad is allowed under + # the no-cheat invariant; mixing teacher rollouts into training is the thing + # mix=0 removes. mix in [0,1). + if not (0.0 <= cfg.mix_ratio < 1.0): + raise ValueError(f"mix_ratio must be in [0,1) when teacher_pool_dir set; got {cfg.mix_ratio}") G_t = round(group * cfg.mix_ratio) G_s = group - G_t - if G_s == 0 or G_t == 0: + if G_s == 0: raise ValueError( - f"degenerate split: G={group} mix_ratio={cfg.mix_ratio} -> G_s={G_s}, G_t={G_t}. " - f"Pick mix_ratio so both halves are non-empty, or drop --teacher-pool-dir." + f"degenerate split: G={group} mix_ratio={cfg.mix_ratio} -> G_s={G_s}. " + f"Pick mix_ratio < 1 so the student half is non-empty." ) for path in sorted(cfg.teacher_pool_dir.glob("prompt_*.jsonl.gz")): # path.stem on 'prompt_0004.jsonl.gz' is 'prompt_0004.jsonl' (only one @@ -862,8 +867,10 @@ def main(cfg: Config) -> int: model.config.use_cache = True _tg = time.perf_counter() teacher_sample: list[dict] | None = None - if teacher_pool: + if teacher_pool and G_t > 0: # Mixed-pool: G_s live student + G_t cached teacher rollouts. + # G_t==0 (mix=0 no-teacher ablation) falls through to the student-only + # path below; the pool stays loaded for partition + v_grad extraction. # If this prompt has no cached teacher rollouts, skip the whole # prompt; falling back to student-only would break the # student-vs-teacher comparison this run is designed to measure.