diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 2f19020..2890fcd 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -20,9 +20,9 @@ Arm classification (from the preset line `arm=`, covering old --arm and new online erasure arm=projected, --vhack-refresh-every=N>0 (re-extracted) routing arm=routing (intervention=route) -For routing we plot the ABLATED-eval hack/solve (hack_abl/solve_abl, measured -with delta_S_hack zeroed every --eval-ablate-every steps), NOT the training-time -hack_s: the routed forward still hacks during training, so the training curve +For routing we plot the SHIP-eval hack/solve (hack_ship/solve_ship, the deployed +model = quarantine knob deleted, measured every --eval-ablate-every steps), NOT +the training-time hack_s: the routed forward still hacks during training, so the training curve would falsely read "route doesn't work". The ablated curve is the deployment model. (none/erase plot training-time hack_s; their intervention acts at train time.) @@ -91,10 +91,12 @@ def parse_log(path: Path) -> dict | None: series: dict[str, list[float]] = defaultdict(list) steps: list[int] = [] - # Also parse the route ablated-eval columns when present (older logs lack - # them -> skip). For routing we plot THESE, not the training-time hack_s. - abl = {"hack_abl", "solve_abl"} & set(idx) - wanted = {**RATE_COLS, **COS_COLS, **{c: c for c in abl}} + # Also parse the route SHIP-eval columns when present (older logs lack them + # -> skip). For routing we plot THESE (deployed model), not training-time + # hack_s. Renamed hack_abl/solve_abl -> hack_ship/solve_ship 2026-05-30; + # accept both so old evidence logs still parse. + ship = {"hack_abl", "solve_abl", "hack_ship", "solve_ship"} & set(idx) + wanted = {**RATE_COLS, **COS_COLS, **{c: c for c in ship}} for line in txt.splitlines(): if "| INFO |" not in line: continue @@ -109,12 +111,14 @@ def parse_log(path: Path) -> dict | None: run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()}) # COHERENCE-GAP FIX: route's training-time hack_s looks vanilla (the routed - # forward still hacks); routing's benefit only shows once delta_S_hack is - # ablated at eval. So for routing, plot the ablated series under the same + # forward still hacks); routing's benefit only shows on the DEPLOYED model + # (quarantine knob deleted). So for routing, plot the ship series under the # hack_s/gt_s keys -> all downstream (panels, onset, overlay) reads it. - if arm == "routing" and "hack_abl" in run: - run["hack_s"] = run["hack_abl"] - run["gt_s"] = run["solve_abl"] + if arm == "routing": + hk = "hack_ship" if "hack_ship" in run else "hack_abl" if "hack_abl" in run else None + if hk: + run["hack_s"] = run["hack_ship" if "hack_ship" in run else "hack_abl"] + run["gt_s"] = run["solve_ship" if "solve_ship" in run else "solve_abl"] return run diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index a839ae5..37b9af3 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -583,8 +583,8 @@ class StepLogger: # refresh fired (frozen-V and vanilla runs are all "-"). _Col("refr", 9, "refr", None), # Route-only ablated-eval hack/solve (delta_S_hack=0); nan elsewhere. - _Col("hack_abl", 8, "hack_abl", "+.3f"), - _Col("solve_abl", 9, "solve_abl", "+.3f"), + _Col("hack_ship", 9, "hack_ship", "+.3f"), # deployed model (quarantine off) + _Col("solve_ship", 10, "solve_ship", "+.3f"), ] def header(self) -> str: @@ -1307,11 +1307,13 @@ table columns: model.train() refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}" # mod/axes -> per-step row - # Periodic route ablated-eval: zero the quarantine, eval hack/solve on the - # fixed subset. This is the curve the plot uses for route (training-time - # hack_s looks vanilla; the routed forward still hacks). NaN on non-eval - # steps and for non-route arms (plot's EMA holds NaN). - hack_abl = solve_abl = float("nan") + # Periodic SHIP-eval (routing): delete the quarantine knob and eval the + # DEPLOYED model on a fixed subset. Routing's claim is that the cheating + # capability lands in the quarantine, so deleting it (= what we ship) + # should hack much less than the training-time model (the per-step hack_s + # row, which still hacks because training keeps the knob on). This is the + # curve the plot uses for route. NaN on non-eval steps / non-route arms. + hack_ship = solve_ship = float("nan") if (cfg.intervention == "route" and cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1)): _was_training = model.training @@ -1320,9 +1322,12 @@ table columns: ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) if _was_training: model.train() - hack_abl, solve_abl = ev["hack"], ev["solve"] - logger.info(f"step {step} route ablated-eval (delta_S_hack=0): " - f"hack={hack_abl:.3f} solve={solve_abl:.3f} (n={ev['n']})") + hack_ship, solve_ship = ev["hack"], ev["solve"] + logger.info( + f"step {step} SHIP-eval (quarantine knob OFF = deployed model): " + f"hack={hack_ship:.3f} solve={solve_ship:.3f} n={ev['n']}. " + f"SHOULD: ship hack < this step's training hack_s (knob is holding " + f"the cheat); ELSE routing isn't capturing it") rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1) rew_mean = rewards_t.mean().item() @@ -1434,8 +1439,8 @@ table columns: # Route ablated-eval (delta_S_hack=0); NaN except on route eval steps. # Appended AFTER refr so results.py's positional GT_S/HACK_S indices # are unaffected. plot_dynamics reads it by name. - "hack_abl": hack_abl, - "solve_abl": solve_abl, + "hack_ship": hack_ship, + "solve_ship": solve_ship, "gen": t_gen, "fb": t_fb, "t_rew": t_rew, @@ -1514,20 +1519,20 @@ table columns: if cfg.intervention == "route": assert dsh_norm > 0.0, "route: delta_S_hack never moved -> degenerated to erasure" - # Route: final kept-vs-ablated eval -- the absorption test. KEPT keeps the - # quarantine (training-time model, still hacks); ABLATED zeroes it (the - # deployment model). SHOULD: ablated hack < kept hack at preserved solve - # => the quarantine absorbed the hack. ELSE routing didn't localize it. + # Route: final training-vs-shipped eval -- the absorption test. TRAIN keeps + # the quarantine knob on (training-time model, still hacks); SHIP deletes it + # (the deployed model). SHOULD: ship hack < train hack at preserved solve + # => the quarantine knob absorbed the cheat. ELSE routing didn't localize it. if cfg.intervention == "route": model.eval() - ev_kept = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) with ablate_quarantine(wrappers): - ev_abl = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + ev_ship = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) logger.info( - f"ROUTE EVAL (n={ev_kept['n']}): " - f"kept hack={ev_kept['hack']:.3f} solve={ev_kept['solve']:.3f} | " - f"ablated hack={ev_abl['hack']:.3f} solve={ev_abl['solve']:.3f} " - f"(SHOULD: ablated hack < kept hack at ~matched solve)") + f"ROUTE EVAL (n={ev_train['n']}): " + f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | " + f"ship/knob-off hack={ev_ship['hack']:.3f} solve={ev_ship['solve']:.3f} " + f"(SHOULD: ship hack < train hack at ~matched solve => quarantine absorbed the cheat)") # Final tail: cue emoji + main metric BLUF, then per-step tsv table. # Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped