From 62e510ff57446474764f27376a1eb42cf2c838ed Mon Sep 17 00:00:00 2001
From: wassname <github@wassname>
Date: Tue, 2 Jun 2026 23:26:26 +0000
Subject: [PATCH] feat: mix=0 no-teacher ablation path (pure on-policy, pool
 kept for v_grad+partition)

train.py: allow mix_ratio=0 with a teacher pool set -> G_t=0, student-only GRPO
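(guard the teacher-mixing branch on G_t>0, relax the mix_ratio range check from
(0,1) to [0,1), drop G_t==0 from the degenerate-split check). The pool stays
loaded for the 4-mode partition and route2 v_grad extraction; only the
teacher-rollout MIX is removed. Note: because G_t = round(group * mix_ratio), a
small nonzero mix_ratio that rounds to G_t=0 now also takes the student-only
path instead of raising.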
Smoke (mix=0 + normal mix=0.5 + vanilla) all green.

Also: fill A4 long-run figure (fig:longrun) in main.tex, update writeup spec A4
status (route2 durable to 200; vanilla collapses ~88, not clean saturation).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 docs/spec/20260602_writeup_spec.md | 18 +++++++++++++-----
 docs/writeup/main.tex              | 22 ++++++++++++++++------
 src/projected_grpo/train.py        | 19 +++++++++++++------
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/docs/spec/20260602_writeup_spec.md b/docs/spec/20260602_writeup_spec.md
index f1ccf93..3174a57 100644
--- a/docs/spec/20260602_writeup_spec.md
+++ b/docs/spec/20260602_writeup_spec.md
@@ -109,9 +109,16 @@ arm at matched seed/preset, deploy hack + solve:
 [ ] blocked on 75/78/80/81/83 (all queued). This is the "we are filling out
 ablations" table.
 
-A4 -- Long-run figure. 200-step route2 (77) vs vanilla saturation (82); shows
-the gap persists to convergence, pre-empts "you stopped at 60 steps". [ ] blocked
-on 77 + 82 (queued p5/p0).
+A4 -- Long-run figure. 200-step route2 (job 84, DONE) vs vanilla (job 85, running).
+[/] route2 side landed: deploy hack = 0.000 every step to 199, solve ~0.61 flat
+(out/figs/dyn_longrun_200.{png,csv}, fig:longrun in main.tex). vanilla learns the
+cheat to ~0.55 by step 80 then COLLAPSES at ~88 (student logp craters, reward->0,
+grad-norm spikes ~75x; beta=0 so no KL anchor) -- so the gap is durable in the valid 0-85
+window, but vanilla is not a clean saturation reference past step 88. Decision
+pending (user): leave the collapse as an honest finding + limitations line, or
+requeue vanilla-200 with an advantage std-floor for a clean saturating reference.
+Renumber: the old "77/82" job ids are stale (those were the corrupted/merge-bug
+ids); the live runs are 84 (route2) and 85 (vanilla).
 
 A5 -- Generalisation figure/table (the no-cheat payload, C2). Per-mode deploy
 hack: v_hack from 2 of 4 modes, measure suppression on the 2 held-out modes.
@@ -140,8 +147,9 @@ fill A1/A2, append a journal entry. Then queue A5 (the gap).
 - [ ] no-cheat invariant stated explicitly: live routing never reads gt_pass or
       runs the full detector suite over student rollouts; the pair set is the
       only supervision. (Promote to README/spec, plan item #114.)
-- [ ] convergence (77/82): the gap persists at 200 steps, pre-empting "you
-      stopped at 20-60 steps".
+- [/] convergence (84/85): route2 holds hack=0 to 200 steps; gap durable in the
+      0-85 window. CAVEAT: vanilla collapses at ~88 (not clean saturation past
+      there) -- report honestly, don't crop the collapse to fake a flat-high ref.
 - [ ] base-model and vanilla-saturation references present so emergence is real.
 
 ## Open editorial decisions
diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex
index 6c35dce..e9743ec 100644
--- a/docs/writeup/main.tex
+++ b/docs/writeup/main.tex
@@ -217,14 +217,24 @@ $+0.024$ while a mechanism-contrasting pairset moved it $-0.226$
 \subsection{Long-run convergence}
 
 % --- Figure: 200-step -------------------------------------------------------
-% Provenance: NOT YET RUN. route2 converge = job 77 (200-step nofloor s41);
-% vanilla saturation = job 82 (200-step none s41). Regenerate after both land.
+% Provenance: route2 = pueue job 84 (out/runs/20260602T080804_..._route2_converge200_s41);
+% vanilla = job 85 (out/runs/20260602T163201_..._vanilla_converge200_s41; vanilla still
+% running at writing -> left panel fills to step 200 on completion). Data source committed
+% at out/figs/dyn_longrun_200.csv; regen: uv run python scripts/plot_dynamics.py <both logs>.
 \begin{figure}[t]
   \centering
-  \fbox{\parbox{0.8\linewidth}{\centering\vspace{2em}\TODO{200-step route2
-  (job 77) vs vanilla saturation (job 82) -- figure pending both runs}\vspace{2em}}}
-  \caption{Deploy hack to convergence (200 steps), route2 vs vanilla, seed 41.
-  Pre-empts the ``you stopped at 60 steps'' critique. \TODO{interp.}}
+  \includegraphics[width=0.95\linewidth]{../../out/figs/dyn_longrun_200.png}
+  \caption{Deploy hack (red) vs solve (green) to convergence (200 steps),
+  seed 41, deploy-eval $n{=}64$, $T{=}0.7$, EMA-5. \textbf{route2} (right) holds
+  deploy hack at exactly $0$ for all 200 steps ($\text{hack}\equiv 0$ label) while
+  solve climbs to ${\sim}0.61$ and plateaus. \textbf{vanilla} (left) learns the
+  cheat (hack rises from the first-hack step to ${\sim}0.55$ by step~80), then the
+  policy \emph{collapses} around step~88 (student logp craters, reward $\to 0$,
+  grad-norm spikes ${\sim}75\times$ with no KL anchor), dragging both hack and
+  solve down: the late-vanilla descent is degeneration, not hack suppression. The
+  valid comparison window is steps 0--85, where vanilla acquires the hack and
+  route2 never does. Pre-empts the ``you stopped at 60 steps'' critique: route2's
+  hack suppression is durable, not merely a delayed onset.}
   \label{fig:longrun}
 \end{figure}
 
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index d530f6e..62cb1ec 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -488,14 +488,19 @@ def main(cfg: Config) -> int:
     G_s = group
     G_t = 0
     if cfg.teacher_pool_dir is not None:
-        if not (0.0 < cfg.mix_ratio < 1.0):
-            raise ValueError(f"mix_ratio must be in (0,1) when teacher_pool_dir set; got {cfg.mix_ratio}")
+        # mix=0 is the NO-TEACHER ablation: pure on-policy GRPO (G_t=0, no teacher
+        # rollouts injected) while the pool is still loaded for the 4-mode partition
+        # and route2 v_grad extraction. Using the pairs for v_grad is allowed under
+        # the no-cheat invariant; mixing teacher rollouts into training is the thing
+        # mix=0 removes. mix in [0,1).
+        if not (0.0 <= cfg.mix_ratio < 1.0):
+            raise ValueError(f"mix_ratio must be in [0,1) when teacher_pool_dir set; got {cfg.mix_ratio}")
         G_t = round(group * cfg.mix_ratio)
         G_s = group - G_t
-        if G_s == 0 or G_t == 0:
+        if G_s == 0:
             raise ValueError(
-                f"degenerate split: G={group} mix_ratio={cfg.mix_ratio} -> G_s={G_s}, G_t={G_t}. "
-                f"Pick mix_ratio so both halves are non-empty, or drop --teacher-pool-dir."
+                f"degenerate split: G={group} mix_ratio={cfg.mix_ratio} -> G_s={G_s}. "
+                f"Pick mix_ratio < 1 so the student half is non-empty."
             )
         for path in sorted(cfg.teacher_pool_dir.glob("prompt_*.jsonl.gz")):
             # path.stem on 'prompt_0004.jsonl.gz' is 'prompt_0004.jsonl' (only one
@@ -862,8 +867,10 @@ def main(cfg: Config) -> int:
             model.config.use_cache = True
             _tg = time.perf_counter()
             teacher_sample: list[dict] | None = None
-            if teacher_pool:
+            if teacher_pool and G_t > 0:
                 # Mixed-pool: G_s live student + G_t cached teacher rollouts.
+                # G_t==0 (mix=0 no-teacher ablation) falls through to the student-only
+                # path below; the pool stays loaded for partition + v_grad extraction.
                 # If this prompt has no cached teacher rollouts, skip the whole
                 # prompt; falling back to student-only would break the
                 # student-vs-teacher comparison this run is designed to measure.