From 025debae6b8edc2831adbecaab3f423fbc0a03f5 Mon Sep 17 00:00:00 2001 From: wassname Date: Wed, 3 Jun 2026 00:00:24 +0000 Subject: [PATCH] feat: knob-ON eval (route arms) for like-for-like train-vs-deploy + teacher-off marker The 2x2 train row used per-step hack_s (noisy n=28 train batch, knob-on) vs the deploy row's smooth n=64 eval (knob-off) -- different estimators, confounded. Now at each eval step route arms ALSO run the SAME n=64 eval with the quarantine ACTIVE (knob-on = training policy), logged as hk_on/slv_on. vanilla/erase reuse deploy (no quarantine -> knob-on==knob-off). plot_dynamics prefers hk_on for the train series so the 2x2 differs ONLY in knob state. Also: plot parses --teacher-off-step from argv and shades the teacher-ON region [0,toff] + a dashed cut line in the 2x2. The stashed long-run route2 jobs (92 KL, 94 teacher-off) inherit the knob-on eval automatically at runtime. Smoke (route2 hk_on present + logged, both plot parse paths) green. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- scripts/plot_dynamics.py | 29 ++++++++++++++++++++++++----- src/projected_grpo/tablelog.py | 5 +++++ src/projected_grpo/train.py | 26 +++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index c1744ee..daea860 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -79,6 +79,10 @@ def parse_log(path: Path) -> dict | None: refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0")) seed = grab(r"seed=(\d+)", preset, "?") vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-") + # teacher-off curriculum: step the teacher mix was cut (None if never). Drawn as + # a vertical line / end of the teacher-on shaded region in the 2x2. + _toff = grab(r"--teacher-off-step=(\d+)", argv, None) + teacher_off = int(_toff) if _toff is not None else None # header line: the one containing both "step" and "hack_s" hdr = next((l for l in txt.splitlines() @@ -98,7 +102,7 @@ def parse_log(path: Path) -> dict | None: # hk_abl/slv_abl = the FREE per-step deploy proxy (ablated rollout slice, # rollout_ablate_frac>0); hk_dep/slv_dep = the held-out greedy eval, only on # eval_ablate_every steps. Prefer the dense proxy for the curve (see below). - deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl"} & set(idx) + deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl", "hk_on", "slv_on"} & set(idx) # Only parse columns this log actually has: non-projecting arms (vanilla, # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError. wanted = {k: v for k, v in RATE_COLS.items() if k in idx} @@ -114,7 +118,7 @@ def parse_log(path: Path) -> dict | None: series[col].append(_val(row[idx[col]])) if not steps: return None - run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, + run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, teacher_off=teacher_off, steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()}) # APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it # has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms. @@ -124,9 +128,14 @@ def parse_log(path: Path) -> dict | None: # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise. def _has_data(key): return key in run and np.isfinite(run[key]).any() - # Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy - # substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both. - if "hack_s" in run: + # TRAIN series for the train-vs-deploy 2x2. Prefer the knob-ON eval (hk_on/slv_on): + # SAME n/prompts/T as the knob-off deploy eval, so the two rows differ ONLY in the + # knob -- the per-step hack_s is a noisy n=28 train batch and looks like a different + # estimator. Fall back to per-step hack_s for logs without the knob-on eval. + if _has_data("hk_on"): + run["hack_train"] = run["hk_on"] + run["solve_train"] = run["slv_on"] + elif "hack_s" in run: run["hack_train"] = run["hack_s"] run["solve_train"] = run["gt_s"] if _has_data("hk_abl"): # dense per-step proxy (rollout_ablate_frac>0), if present @@ -390,6 +399,16 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None: ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"), color=red, fontsize=8, va="bottom", xytext=(0, 3), textcoords="offset points") + # teacher-off curriculum: shade the teacher-ON region [0, toff] + a line at + # the cut, so "hacks were teacher-seeded here, on-policy after" is visible. + toffs = {r.get("teacher_off") for r in by_arm[arm] if r.get("teacher_off")} + if toffs: + toff = max(toffs) + ax.axvspan(0, toff, color="0.85", alpha=0.5, zorder=0) + ax.axvline(toff, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=1) + if ri == 0: + ax.annotate("teacher off", (toff, 1.0), color="0.4", fontsize=7, + xytext=(2, -2), textcoords="offset points", va="top") if ci == 0: ax.set_ylabel(rlabel) ax.spines[["top", "right"]].set_visible(False) diff --git a/src/projected_grpo/tablelog.py b/src/projected_grpo/tablelog.py index 1b0b92e..2f2f406 100644 --- a/src/projected_grpo/tablelog.py +++ b/src/projected_grpo/tablelog.py @@ -145,6 +145,11 @@ class StepLogger: ] if arm in ("routing", "routing2"): cols += [ + # Knob-ON eval: SAME eval set/n/T as hk_dep but quarantine ACTIVE + # (training-time policy). Like-for-like train series vs the knob-off + # hk_dep, for the train-vs-deploy 2x2. nan between eval steps. + _Col("hack_kon", 7, "hk_on", "+.2f", "knob-ON eval hack (quarantine active = training policy); same eval set as hk_dep"), + _Col("solve_kon", 7, "slv_on", "+.2f", "knob-ON eval solve (same eval set as slv_dep)"), _Col("q_egy", 6, "qE", ".2f", "grad energy into quarantine ||g_quar||/(||g_keep||+||g_quar||); ~0.5+ rising = learning dumped into the thrown-away knob"), _Col("hack_abl", 6, "hk_abl", "frac", "FREE per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"), _Col("solve_abl", 6, "slv_abl", "frac", "free per-step deploy proxy: solve rate on the ablated rollout slice"), diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index e78f64e..21c5786 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -1375,21 +1375,36 @@ def main(cfg: Config) -> int: # route shows a deploy eval while others show training rollouts -> different # n/cadence, route looks artificially smoother). NaN on non-eval steps. hack_deploy = solve_deploy = float("nan") + hack_kon = solve_kon = float("nan") # knob-ON eval (route only); see below if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1): _was_training = model.training model.eval() is_route = cfg.intervention in ("route", "route2") with (ablate_quarantine(wrappers) if is_route else nullcontext()): ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + hack_deploy, solve_deploy = ev["hack"], ev["solve"] + # Like-for-like knob-ON eval: re-run the SAME n eval prompts with the + # quarantine ACTIVE (the training-time policy). The per-step hack_s is a + # noisy n=28 train batch -> spiky, looks like a different estimator than + # the smooth n=64 deploy curve. This gives a train series measured the + # IDENTICAL way as deploy (same prompts/n/T), differing only in knob state, + # for the train-vs-deploy 2x2. Route only: vanilla/erase have no quarantine + # (knob-on == knob-off), so reuse the deploy number. + if is_route: + ev_on = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + hack_kon, solve_kon = ev_on["hack"], ev_on["solve"] + else: + hack_kon, solve_kon = hack_deploy, solve_deploy if _was_training: model.train() - hack_deploy, solve_deploy = ev["hack"], ev["solve"] tag = "quarantine knob OFF = deployed model" if is_route else "deployed = trained model (no quarantine)" - should = ("deploy hack < this step's training hack_s (knob is holding the cheat); " + should = ("deploy hack < knob-ON eval hack (knob is holding the cheat); " "ELSE routing isn't capturing it") if is_route else "deploy ~= training hack_s (same model)" logger.info( f"step {step} DEPLOY-eval ({tag}): " - f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}. SHOULD: {should}") + f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}" + + (f" | knob-ON same-eval: hack={hack_kon:.3f} solve={solve_kon:.3f}" if is_route else "") + + f". SHOULD: {should}") rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1) rew_mean = rewards_t.mean().item() @@ -1522,6 +1537,11 @@ def main(cfg: Config) -> int: # are unaffected. plot_dynamics reads it by name. "hack_deploy": hack_deploy, "solve_deploy": solve_deploy, + # Knob-ON eval: SAME n eval prompts as deploy, quarantine active = the + # training-time policy. Like-for-like train series for the train-vs-deploy + # 2x2 (vs the noisy per-step hack_s batch). route only; else == deploy. + "hack_kon": hack_kon, + "solve_kon": solve_kon, # Free per-step deploy proxy from the ablated rollout slice (above). "hack_abl": (hack_abl_n, n_abl_step) if n_abl_step else (0, 0), "solve_abl": (gt_abl_n, n_abl_step) if n_abl_step else (0, 0),