diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 3860fc6..e2b825e 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -1,17 +1,18 @@ -"""Per-step training-dynamics small multiples: vanilla vs static vs online erasure. +"""Training-dynamics small multiples: deployed hack vs solve, one column per arm. -Tufte small multiples. Columns = arm (vanilla / static G_hack erasure / -online G_hack erasure); rows = metric group: - row 0 hack_s + solve(gt_s) student reward-hack rate vs ground-truth solve - row 1 sep + leak cross-arm-comparable cos diagnostics (see - _add_cos_derived): sep = does v_hack still - discriminate hacky grad; leak = residual - hack-alignment of the post-intervention grad +Tufte small multiples, single row. Columns = arm (vanilla / static G_hack +erasure / online G_hack erasure / routing2); the panel shows the DEPLOYED +model's hack_s (red) and solve/gt_s (green) over training. Per-seed thin lines ++ bold mean; the mean hack-onset step (first hack_s > 0) is a dashed vertical. -Each panel overlays one thin line per seed and one bold mean line. The first -step where the student starts hacking (hack_s > 0) is marked per seed with an -open tick on the hack curve -- the onset point, which is where cos_pre_t starts -to diverge from the (refreshed) v_hack. +APPLES-TO-APPLES. We plot the DEPLOY-eval (hk_dep/slv_dep) for every arm when +present: the same estimator across arms (n=64, T=0.7, every --eval-ablate-every +steps). For route/route2 the deployed model = quarantine knob zeroed; for +vanilla/erase deploy == the trained model. Sparse deploy-eval points are dotted +(see _mark_if_sparse) so the EMA-held line doesn't oversell per-step density. +Older logs that gated the eval to route only fall back to per-step training +hack_s for vanilla/erase (noisier, n=28, but estimates the same deployed rate +since those arms have no quarantine). Data source: logs/*.log per-step rows (the durable source results.py also uses). We parse by HEADER NAME, not fixed index, because newer runs add columns (refr). @@ -21,14 +22,7 @@ Arm classification (from the preset line `arm=`, covering old --arm and new vanilla arm=vanilla (intervention=none) static erasure arm=projected, no --vhack-refresh-every (frozen v_hack) online erasure arm=projected, --vhack-refresh-every=N>0 (re-extracted) - routing arm=routing (intervention=route) - -For routing we plot the DEPLOY-eval hack/solve (hack_deploy/solve_deploy, the -deployed model = quarantine knob deleted, measured every --eval-ablate-every steps), -NOT the training-time hack_s: the routed forward still hacks during training, so the -training curve would falsely read "route doesn't work". The deploy curve is the deployment -model. (none/erase plot training-time hack_s; their intervention acts at train -time.) + routing2 arm=routing2 (intervention=route2) Usage: uv run python scripts/plot_dynamics.py logs/*converge*.log @@ -54,13 +48,6 @@ from projected_grpo.figs import link_latest # Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264". RATE_COLS = {"hack_s": "hack", "gt_s": "solve"} -# Raw cosine columns we parse, presence-gated (different arms log different ones): -# erase emits cin_t/cin_s/cout, route2 emits hkgap/resid. We do NOT plot these -# directly -- they measure different things (a single pre-intervention cosine vs a -# difference vs a post-intervention cosine). Instead _add_cos_derived collapses them -# into two CROSS-ARM-COMPARABLE series so a line means the same thing in every column. -RAW_COS = ("cin_t", "cin_s", "cout", "hkgap", "resid") -COS_COLS = {"sep": "hack-clean sep", "leak": "residual hack-align"} _HDR_TOK = re.compile(r"[A-Za-z_]+") # strip ↑↓? decorations: "hack_s?" -> "hack_s" @@ -115,7 +102,6 @@ def parse_log(path: Path) -> dict | None: # Only parse columns this log actually has: non-projecting arms (vanilla, # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError. wanted = {k: v for k, v in RATE_COLS.items() if k in idx} - wanted.update({c: c for c in RAW_COS if c in idx}) wanted.update({c: c for c in deploy}) for line in txt.splitlines(): if "| INFO |" not in line: @@ -130,48 +116,23 @@ def parse_log(path: Path) -> dict | None: return None run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()}) - # COHERENCE-GAP FIX: routing's training-time hack_s looks vanilla (the routed - # forward still hacks); the benefit only shows on the DEPLOYED model - # (quarantine knob deleted). So for routing/routing2, plot the deploy series - # under the hack_s/gt_s keys -> all downstream (panels, onset, overlay) reads - # it. Prefer the DENSE per-step proxy (hk_abl, every step) over the sparse - # held-out eval (hk_dep, every eval_ablate_every steps); fall back to hk_dep - # when hk_abl carries no data. No-floor runs (rollout_ablate_frac=0) have the - # hk_abl COLUMN present but every cell is "0/0" -> all-nan, so test for finite - # values, not mere column presence, else the deploy panel comes up blank. + # APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it + # has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms. + # For route/route2 this is the quarantine-off model; for vanilla/erase deploy == + # trained model. Older logs (eval gated to route only) lack it for vanilla/erase + # -> fall back to per-step training hack_s. Test FINITE values, not column + # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise. def _has_data(key): return key in run and np.isfinite(run[key]).any() - if arm in ("routing", "routing2"): - if _has_data("hk_abl"): - run["hack_s"] = run["hk_abl"] - run["gt_s"] = run["slv_abl"] - elif _has_data("hk_dep"): - run["hack_s"] = run["hk_dep"] - run["gt_s"] = run["slv_dep"] - _add_cos_derived(run) + if _has_data("hk_abl"): # dense per-step proxy (rollout_ablate_frac>0), if present + run["hack_s"] = run["hk_abl"] + run["gt_s"] = run["slv_abl"] + elif _has_data("hk_dep"): # the n=64 every-eval_ablate_every deploy eval + run["hack_s"] = run["hk_dep"] + run["gt_s"] = run["slv_dep"] return run -def _add_cos_derived(run: dict) -> None: - """Collapse each arm's raw cosine columns into two cross-arm-comparable series: - - sep -- does v_hack discriminate hacky from non-hacky gradient (higher = alive). - erase: cin_t - cin_s (teacher pool vs student). route2: hkgap (hack-flagged - vs clean rollouts). Different partition, same question; not bit-identical. - leak -- residual hack-alignment of the post-intervention DEPLOYED gradient (~0 ideal). - erase: cout (after projection). route2: resid (after routing). Same quantity. - - Whatever can't be derived (vanilla logs neither) is just absent -> blank panel.""" - if "hkgap" in run: - run["sep"] = run["hkgap"] - elif "cin_t" in run and "cin_s" in run: - run["sep"] = run["cin_t"] - run["cin_s"] - if "resid" in run: - run["leak"] = run["resid"] - elif "cout" in run: - run["leak"] = run["cout"] - - def classify(run: dict) -> str: if run["arm"] == "vanilla": return "vanilla" @@ -193,7 +154,6 @@ ARM_ORDER = ["vanilla", "static erasure", "online erasure", "routing2"] # must not share a palette (hack != teacher-cos). Row 0: red hack vs green # solve. Row 1: blue teacher-cos vs amber student-cos. RATE_COLORS = {"hack_s": "#c1432b", "gt_s": "#2f7d4f"} -COS_COLORS = {"sep": "#33508c", "leak": "#c98a2b"} # Arm colours for the single-panel hack overlay (arms, not series): grey vanilla # baseline -> amber static -> blue online, ordered by increasing intervention. # TODO(color): make this a quality-ordered red->green ramp instead of fixed @@ -284,48 +244,31 @@ def plot(runs: list[dict], out: Path) -> None: if not arms: raise SystemExit("no runs classified into arms") - fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.4), - sharex=True, sharey="row", squeeze=False) - _cos_vals = [f(r[c]) for r in runs for c in COS_COLS if c in r for f in (np.nanmin, np.nanmax)] - cos_lo, cos_hi = (min(_cos_vals), max(_cos_vals)) if _cos_vals else (0.0, 0.4) - # legend goes on the leftmost arm that HAS cos data (vanilla has none -> would - # render an empty legend), since sep/leak mean the same thing in every column - cos_label_arm = next((a for a in arms if any(c in r for r in by_arm[a] for c in COS_COLS)), None) + fig, axes = plt.subplots(1, len(arms), figsize=(3.0 * len(arms), 2.6), + sharex=True, sharey=True, squeeze=False) for col, arm in enumerate(arms): + ax = axes[0][col] rs = by_arm[arm] n_seed = len({r["seed"] for r in rs}) - axes[0][col].set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", - fontsize=9) - _series_panel(axes[0][col], rs, RATE_COLS, RATE_COLORS, ylim=(0, 1), - label_series=(col == 0)) - # sep/leak are derived to mean the same thing in every column -> one legend - # (leftmost) carries the whole row; repeating it would be redundant ink. - _series_panel(axes[1][col], rs, COS_COLS, COS_COLORS, - ylim=(min(-0.05, cos_lo - 0.02), max(0.2, cos_hi + 0.02)), - label_series=(arm == cos_label_arm)) - axes[1][col].axhline(0, color="0.8", lw=0.6, zorder=0) - axes[1][col].set_xlabel("optimizer step") - - # Mean hack-onset: one dashed vertical reference line spanning BOTH rows - # so the cos-divergence can be read against the moment hacking starts. + ax.set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9) + _series_panel(ax, rs, RATE_COLS, RATE_COLORS, ylim=(0, 1), label_series=(col == 0)) + ax.set_xlabel("optimizer step") onsets = [s for r in rs if (s := _onset(r["steps"], r["hack_s"])) is not None] if onsets: s0 = float(np.mean(onsets)) - for row in (0, 1): - axes[row][col].axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0) - axes[0][col].annotate("first hack", (s0, 1.0), color="0.4", fontsize=7, - xytext=(2, -2), textcoords="offset points", va="top") + ax.axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0) + ax.annotate("first hack", (s0, 1.0), color="0.4", fontsize=7, + xytext=(2, -2), textcoords="offset points", va="top") - axes[0][0].set_ylabel("student rate") - axes[1][0].set_ylabel("cos with v_hack") + axes[0][0].set_ylabel("deployed rate") # range-frame: drop top/right spines, keep ink on data for ax in axes.flat: ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.tick_params(labelsize=8) - fig.suptitle("Training dynamics: G_hack erasure vs vanilla " - "(EMA-5 smoothed; dashed line = mean hack onset)", fontsize=10) + fig.suptitle("Training dynamics: deployed hack vs solve by arm " + "(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10) fig.tight_layout(rect=(0, 0, 1, 0.96)) out.parent.mkdir(parents=True, exist_ok=True) fig.savefig(out, dpi=150, bbox_inches="tight") diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 44a718f..a86cf2c 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -33,7 +33,7 @@ import os import sys import random import time -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from dataclasses import dataclass from datetime import datetime from pathlib import Path @@ -1446,28 +1446,31 @@ def main(cfg: Config) -> int: model.train() refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}" # mod/axes -> per-step row - # ── deploy-eval (route/route2): zero δS_hack, eval the shipped model ── - # Periodic DEPLOY-eval (routing, Gradient Routing): zero the quarantine knob - # and eval the DEPLOYED model on a fixed subset. Routing's claim is that the - # cheating capability lands in the quarantine, so deleting it (= what we deploy) - # should hack much less than the training-time model (the per-step hack_s row, - # which still hacks because training keeps the knob on). This is the curve the - # plot uses for route. NaN on non-eval steps / non-route arms. + # ── periodic DEPLOY-eval (EVERY arm) -- the apples-to-apples curve ── + # Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval (n=64, + # T=0.7), every eval_ablate_every steps. route/route2: deploy = quarantine + # knob zeroed (ablate_quarantine), and the claim is this hacks far less than + # the training-time model (per-step hack_s, knob still on). vanilla/erase: no + # quarantine, so deploy == the trained model -- eval it directly. Running the + # SAME estimator for all arms makes the dynamics-plot curves comparable (else + # route shows a deploy eval while others show training rollouts -> different + # n/cadence, route looks artificially smoother). NaN on non-eval steps. hack_deploy = solve_deploy = float("nan") - if (cfg.intervention in ("route", "route2") and cfg.eval_ablate_every > 0 - and (step % cfg.eval_ablate_every == 0 or step == steps - 1)): + if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1): _was_training = model.training model.eval() - with ablate_quarantine(wrappers): + is_route = cfg.intervention in ("route", "route2") + with (ablate_quarantine(wrappers) if is_route else nullcontext()): ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) if _was_training: model.train() hack_deploy, solve_deploy = ev["hack"], ev["solve"] + tag = "quarantine knob OFF = deployed model" if is_route else "deployed = trained model (no quarantine)" + should = ("deploy hack < this step's training hack_s (knob is holding the cheat); " + "ELSE routing isn't capturing it") if is_route else "deploy ~= training hack_s (same model)" logger.info( - f"step {step} DEPLOY-eval (quarantine knob OFF = deployed model): " - f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}. " - f"SHOULD: deploy hack < this step's training hack_s (knob is holding " - f"the cheat); ELSE routing isn't capturing it") + f"step {step} DEPLOY-eval ({tag}): " + f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}. SHOULD: {should}") rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1) rew_mean = rewards_t.mean().item()