From 025debae6b8edc2831adbecaab3f423fbc0a03f5 Mon Sep 17 00:00:00 2001
From: wassname <github@wassname>
Date: Wed, 3 Jun 2026 00:00:24 +0000
Subject: [PATCH] feat: knob-ON eval (route arms) for like-for-like
 train-vs-deploy + teacher-off marker

Previously the train row of the 2x2 plotted the per-step hack_s (noisy n=28 train
batch, knob-on) while the deploy row plotted the smooth n=64 eval (knob-off):
different estimators, so the train-vs-deploy comparison was confounded.
Now, at each eval step, route arms ALSO run the SAME n=64 eval with the quarantine
ACTIVE (knob-on = training policy), logged as hk_on/slv_on. vanilla/erase reuse the
deploy numbers (no quarantine -> knob-on == knob-off). plot_dynamics prefers hk_on
for the train series, so the two rows of the 2x2 differ ONLY in knob state.

Also: plot_dynamics parses --teacher-off-step from argv and, in the 2x2, shades the
teacher-ON region [0, toff] and draws a dashed line at the cut. The stashed long-run
route2 jobs (92 KL, 94 teacher-off) inherit the knob-on eval automatically at runtime.

Smoke test is green (route2 hk_on present + logged; both plot parse paths exercised).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 scripts/plot_dynamics.py       | 29 ++++++++++++++++++++++++-----
 src/projected_grpo/tablelog.py |  5 +++++
 src/projected_grpo/train.py    | 26 +++++++++++++++++++++++---
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py
index c1744ee..daea860 100644
--- a/scripts/plot_dynamics.py
+++ b/scripts/plot_dynamics.py
@@ -79,6 +79,10 @@ def parse_log(path: Path) -> dict | None:
     refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0"))
     seed = grab(r"seed=(\d+)", preset, "?")
     vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-")
+    # teacher-off curriculum: step the teacher mix was cut (None if never). Drawn as
+    # a vertical line / end of the teacher-on shaded region in the 2x2.
+    _toff = grab(r"--teacher-off-step=(\d+)", argv, None)
+    teacher_off = int(_toff) if _toff is not None else None
 
     # header line: the one containing both "step" and "hack_s"
     hdr = next((l for l in txt.splitlines()
@@ -98,7 +102,7 @@ def parse_log(path: Path) -> dict | None:
     # hk_abl/slv_abl = the FREE per-step deploy proxy (ablated rollout slice,
     # rollout_ablate_frac>0); hk_dep/slv_dep = the held-out greedy eval, only on
     # eval_ablate_every steps. Prefer the dense proxy for the curve (see below).
-    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl"} & set(idx)
+    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl", "hk_on", "slv_on"} & set(idx)
     # Only parse columns this log actually has: non-projecting arms (vanilla,
     # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
     wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
@@ -114,7 +118,7 @@ def parse_log(path: Path) -> dict | None:
             series[col].append(_val(row[idx[col]]))
     if not steps:
         return None
-    run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack,
+    run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, teacher_off=teacher_off,
                steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()})
     # APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it
     # has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms.
@@ -124,9 +128,14 @@ def parse_log(path: Path) -> dict | None:
     # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
     def _has_data(key):
         return key in run and np.isfinite(run[key]).any()
-    # Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy
-    # substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both.
-    if "hack_s" in run:
+    # TRAIN series for the train-vs-deploy 2x2. Prefer the knob-ON eval (hk_on/slv_on):
+    # SAME n/prompts/T as the knob-off deploy eval, so the two rows differ ONLY in the
+    # knob -- the per-step hack_s is a noisy n=28 train batch and looks like a different
+    # estimator. Fall back to per-step hack_s for logs without the knob-on eval.
+    if _has_data("hk_on"):
+        run["hack_train"] = run["hk_on"]
+        run["solve_train"] = run["slv_on"]
+    elif "hack_s" in run:
         run["hack_train"] = run["hack_s"]
         run["solve_train"] = run["gt_s"]
     if _has_data("hk_abl"):           # dense per-step proxy (rollout_ablate_frac>0), if present
@@ -390,6 +399,16 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
                 ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
                             color=red, fontsize=8, va="bottom",
                             xytext=(0, 3), textcoords="offset points")
+            # teacher-off curriculum: shade the teacher-ON region [0, toff] + a line at
+            # the cut, so "hacks were teacher-seeded here, on-policy after" is visible.
+            toffs = {r.get("teacher_off") for r in by_arm[arm] if r.get("teacher_off")}
+            if toffs:
+                toff = max(toffs)
+                ax.axvspan(0, toff, color="0.85", alpha=0.5, zorder=0)
+                ax.axvline(toff, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=1)
+                if ri == 0:
+                    ax.annotate("teacher off", (toff, 1.0), color="0.4", fontsize=7,
+                                xytext=(2, -2), textcoords="offset points", va="top")
             if ci == 0:
                 ax.set_ylabel(rlabel)
             ax.spines[["top", "right"]].set_visible(False)
diff --git a/src/projected_grpo/tablelog.py b/src/projected_grpo/tablelog.py
index 1b0b92e..2f2f406 100644
--- a/src/projected_grpo/tablelog.py
+++ b/src/projected_grpo/tablelog.py
@@ -145,6 +145,11 @@ class StepLogger:
             ]
         if arm in ("routing", "routing2"):
             cols += [
+                # Knob-ON eval: SAME eval set/n/T as hk_dep but quarantine ACTIVE
+                # (training-time policy). Like-for-like train series vs the knob-off
+                # hk_dep, for the train-vs-deploy 2x2. nan between eval steps.
+                _Col("hack_kon", 7, "hk_on",  "+.2f", "knob-ON eval hack (quarantine active = training policy); same eval set as hk_dep"),
+                _Col("solve_kon", 7, "slv_on", "+.2f", "knob-ON eval solve (same eval set as slv_dep)"),
                 _Col("q_egy", 6, "qE", ".2f", "grad energy into quarantine ||g_quar||/(||g_keep||+||g_quar||); ~0.5+ rising = learning dumped into the thrown-away knob"),
                 _Col("hack_abl",  6, "hk_abl",  "frac", "FREE per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"),
                 _Col("solve_abl", 6, "slv_abl", "frac", "free per-step deploy proxy: solve rate on the ablated rollout slice"),
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index e78f64e..21c5786 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -1375,21 +1375,36 @@ def main(cfg: Config) -> int:
         # route shows a deploy eval while others show training rollouts -> different
         # n/cadence, route looks artificially smoother). NaN on non-eval steps.
         hack_deploy = solve_deploy = float("nan")
+        hack_kon = solve_kon = float("nan")  # knob-ON eval (route only); see below
         if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1):
             _was_training = model.training
             model.eval()
             is_route = cfg.intervention in ("route", "route2")
             with (ablate_quarantine(wrappers) if is_route else nullcontext()):
                 ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
+            hack_deploy, solve_deploy = ev["hack"], ev["solve"]
+            # Like-for-like knob-ON eval: re-run the SAME n eval prompts with the
+            # quarantine ACTIVE (the training-time policy). The per-step hack_s is a
+            # noisy n=28 train batch -> spiky, looks like a different estimator than
+            # the smooth n=64 deploy curve. This gives a train series measured the
+            # IDENTICAL way as deploy (same prompts/n/T), differing only in knob state,
+            # for the train-vs-deploy 2x2. Route only: vanilla/erase have no quarantine
+            # (knob-on == knob-off), so reuse the deploy number.
+            if is_route:
+                ev_on = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
+                hack_kon, solve_kon = ev_on["hack"], ev_on["solve"]
+            else:
+                hack_kon, solve_kon = hack_deploy, solve_deploy
             if _was_training:
                 model.train()
-            hack_deploy, solve_deploy = ev["hack"], ev["solve"]
             tag = "quarantine knob OFF = deployed model" if is_route else "deployed = trained model (no quarantine)"
-            should = ("deploy hack < this step's training hack_s (knob is holding the cheat); "
+            should = ("deploy hack < knob-ON eval hack (knob is holding the cheat); "
                       "ELSE routing isn't capturing it") if is_route else "deploy ~= training hack_s (same model)"
             logger.info(
                 f"step {step} DEPLOY-eval ({tag}): "
-                f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}.  SHOULD: {should}")
+                f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}"
+                + (f" | knob-ON same-eval: hack={hack_kon:.3f} solve={solve_kon:.3f}" if is_route else "")
+                + f".  SHOULD: {should}")
 
         rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1)
         rew_mean = rewards_t.mean().item()
@@ -1522,6 +1537,11 @@ def main(cfg: Config) -> int:
             # are unaffected. plot_dynamics reads it by name.
             "hack_deploy": hack_deploy,
             "solve_deploy": solve_deploy,
+            # Knob-ON eval: SAME n eval prompts as deploy, quarantine active = the
+            # training-time policy. Like-for-like train series for the train-vs-deploy
+            # 2x2 (vs the noisy per-step hack_s batch). route only; else == deploy.
+            "hack_kon": hack_kon,
+            "solve_kon": solve_kon,
             # Free per-step deploy proxy from the ablated rollout slice (above).
             "hack_abl": (hack_abl_n, n_abl_step) if n_abl_step else (0, 0),
             "solve_abl": (gt_abl_n, n_abl_step) if n_abl_step else (0, 0),