diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py
index 3860fc6..e2b825e 100644
--- a/scripts/plot_dynamics.py
+++ b/scripts/plot_dynamics.py
@@ -1,17 +1,18 @@
-"""Per-step training-dynamics small multiples: vanilla vs static vs online erasure.
+"""Training-dynamics small multiples: deployed hack vs solve, one column per arm.
 
-Tufte small multiples. Columns = arm (vanilla / static G_hack erasure /
-online G_hack erasure); rows = metric group:
-  row 0  hack_s + solve(gt_s)        student reward-hack rate vs ground-truth solve
-  row 1  sep + leak                   cross-arm-comparable cos diagnostics (see
-                                     _add_cos_derived): sep = does v_hack still
-                                     discriminate hacky grad; leak = residual
-                                     hack-alignment of the post-intervention grad
+Tufte small multiples, single row. Columns = arm (vanilla / static G_hack
+erasure / online G_hack erasure / routing2); the panel shows the DEPLOYED
+model's hack_s (red) and solve/gt_s (green) over training. Per-seed thin lines
++ bold mean; the mean hack-onset step (first hack_s > 0) is a dashed vertical.
 
-Each panel overlays one thin line per seed and one bold mean line. The first
-step where the student starts hacking (hack_s > 0) is marked per seed with an
-open tick on the hack curve -- the onset point, which is where cos_pre_t starts
-to diverge from the (refreshed) v_hack.
+APPLES-TO-APPLES. We plot the DEPLOY-eval (hk_dep/slv_dep) for every arm when
+present: the same estimator across arms (n=64, T=0.7, every --eval-ablate-every
+steps). Exception: a log with finite hk_abl/slv_abl (dense per-step proxy) plots
+that instead. For route/route2 the deployed model = quarantine knob zeroed; for
+vanilla/erase deploy == the trained model. Sparse deploy-eval points are dotted
+(see _mark_if_sparse) so the EMA-held line doesn't oversell per-step density.
+Older logs that gated the eval to route only fall back to per-step training hack_s
+for vanilla/erase (noisier, n=28; same deployed rate, as they have no quarantine).
 
 Data source: logs/*.log per-step rows (the durable source results.py also uses).
 We parse by HEADER NAME, not fixed index, because newer runs add columns (refr).
@@ -21,14 +22,7 @@ Arm classification (from the preset line `arm=`, covering old --arm and new
   vanilla            arm=vanilla    (intervention=none)
   static erasure     arm=projected, no --vhack-refresh-every (frozen v_hack)
   online erasure     arm=projected, --vhack-refresh-every=N>0 (re-extracted)
-  routing            arm=routing    (intervention=route)
-
-For routing we plot the DEPLOY-eval hack/solve (hack_deploy/solve_deploy, the
-deployed model = quarantine knob deleted, measured every --eval-ablate-every steps),
-NOT the training-time hack_s: the routed forward still hacks during training, so the
-training curve would falsely read "route doesn't work". The deploy curve is the deployment
-model. (none/erase plot training-time hack_s; their intervention acts at train
-time.)
+  routing2           arm=routing2   (intervention=route2)
 
 Usage:
   uv run python scripts/plot_dynamics.py logs/*converge*.log
@@ -54,13 +48,6 @@ from projected_grpo.figs import link_latest
 
 # Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264".
 RATE_COLS = {"hack_s": "hack", "gt_s": "solve"}
-# Raw cosine columns we parse, presence-gated (different arms log different ones):
-# erase emits cin_t/cin_s/cout, route2 emits hkgap/resid. We do NOT plot these
-# directly -- they measure different things (a single pre-intervention cosine vs a
-# difference vs a post-intervention cosine). Instead _add_cos_derived collapses them
-# into two CROSS-ARM-COMPARABLE series so a line means the same thing in every column.
-RAW_COS = ("cin_t", "cin_s", "cout", "hkgap", "resid")
-COS_COLS = {"sep": "hack-clean sep", "leak": "residual hack-align"}
 _HDR_TOK = re.compile(r"[A-Za-z_]+")  # strip ↑↓? decorations: "hack_s?" -> "hack_s"
 
 
@@ -115,7 +102,6 @@ def parse_log(path: Path) -> dict | None:
     # Only parse columns this log actually has: non-projecting arms (vanilla,
     # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
     wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
-    wanted.update({c: c for c in RAW_COS if c in idx})
     wanted.update({c: c for c in deploy})
     for line in txt.splitlines():
         if "| INFO |" not in line:
@@ -130,48 +116,23 @@ def parse_log(path: Path) -> dict | None:
         return None
     run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack,
                steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()})
-    # COHERENCE-GAP FIX: routing's training-time hack_s looks vanilla (the routed
-    # forward still hacks); the benefit only shows on the DEPLOYED model
-    # (quarantine knob deleted). So for routing/routing2, plot the deploy series
-    # under the hack_s/gt_s keys -> all downstream (panels, onset, overlay) reads
-    # it. Prefer the DENSE per-step proxy (hk_abl, every step) over the sparse
-    # held-out eval (hk_dep, every eval_ablate_every steps); fall back to hk_dep
-    # when hk_abl carries no data. No-floor runs (rollout_ablate_frac=0) have the
-    # hk_abl COLUMN present but every cell is "0/0" -> all-nan, so test for finite
-    # values, not mere column presence, else the deploy panel comes up blank.
+    # APPLES-TO-APPLES: every arm plots a DEPLOYED-model series under hack_s/gt_s.
+    # Precedence: hk_abl/slv_abl (dense per-step proxy) if it has data, else
+    # hk_dep/slv_dep (n=64, T=0.7, eval_ablate_every cadence), else per-step training
+    # hack_s (older logs gated the eval to route only). route/route2 deploy = the
+    # quarantine-off model; vanilla/erase deploy == trained model. Test FINITE values,
+    # not column presence: no-floor logs carry an all-nan hk_dep/hk_abl column.
     def _has_data(key):
         return key in run and np.isfinite(run[key]).any()
-    if arm in ("routing", "routing2"):
-        if _has_data("hk_abl"):
-            run["hack_s"] = run["hk_abl"]
-            run["gt_s"] = run["slv_abl"]
-        elif _has_data("hk_dep"):
-            run["hack_s"] = run["hk_dep"]
-            run["gt_s"] = run["slv_dep"]
-    _add_cos_derived(run)
+    if _has_data("hk_abl"):           # dense per-step proxy (rollout_ablate_frac>0), if present
+        run["hack_s"] = run["hk_abl"]
+        run["gt_s"] = run["slv_abl"]
+    elif _has_data("hk_dep"):         # the n=64 every-eval_ablate_every deploy eval
+        run["hack_s"] = run["hk_dep"]
+        run["gt_s"] = run["slv_dep"]
     return run
 
 
-def _add_cos_derived(run: dict) -> None:
-    """Collapse each arm's raw cosine columns into two cross-arm-comparable series:
-
-      sep  -- does v_hack discriminate hacky from non-hacky gradient (higher = alive).
-              erase: cin_t - cin_s (teacher pool vs student). route2: hkgap (hack-flagged
-              vs clean rollouts). Different partition, same question; not bit-identical.
-      leak -- residual hack-alignment of the post-intervention DEPLOYED gradient (~0 ideal).
-              erase: cout (after projection). route2: resid (after routing). Same quantity.
-
-    Whatever can't be derived (vanilla logs neither) is just absent -> blank panel."""
-    if "hkgap" in run:
-        run["sep"] = run["hkgap"]
-    elif "cin_t" in run and "cin_s" in run:
-        run["sep"] = run["cin_t"] - run["cin_s"]
-    if "resid" in run:
-        run["leak"] = run["resid"]
-    elif "cout" in run:
-        run["leak"] = run["cout"]
-
-
 def classify(run: dict) -> str:
     if run["arm"] == "vanilla":
         return "vanilla"
@@ -193,7 +154,6 @@ ARM_ORDER = ["vanilla", "static erasure", "online erasure", "routing2"]
 # must not share a palette (hack != teacher-cos). Row 0: red hack vs green
 # solve. Row 1: blue teacher-cos vs amber student-cos.
 RATE_COLORS = {"hack_s": "#c1432b", "gt_s": "#2f7d4f"}
-COS_COLORS = {"sep": "#33508c", "leak": "#c98a2b"}
 # Arm colours for the single-panel hack overlay (arms, not series): grey vanilla
 # baseline -> amber static -> blue online, ordered by increasing intervention.
 # TODO(color): make this a quality-ordered red->green ramp instead of fixed
@@ -284,48 +244,31 @@ def plot(runs: list[dict], out: Path) -> None:
     if not arms:
         raise SystemExit("no runs classified into arms")
 
-    fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.4),
-                             sharex=True, sharey="row", squeeze=False)
-    _cos_vals = [f(r[c]) for r in runs for c in COS_COLS if c in r for f in (np.nanmin, np.nanmax)]
-    cos_lo, cos_hi = (min(_cos_vals), max(_cos_vals)) if _cos_vals else (0.0, 0.4)
-    # legend goes on the leftmost arm that HAS cos data (vanilla has none -> would
-    # render an empty legend), since sep/leak mean the same thing in every column
-    cos_label_arm = next((a for a in arms if any(c in r for r in by_arm[a] for c in COS_COLS)), None)
+    fig, axes = plt.subplots(1, len(arms), figsize=(3.0 * len(arms), 2.6),
+                             sharex=True, sharey=True, squeeze=False)
     for col, arm in enumerate(arms):
+        ax = axes[0][col]
         rs = by_arm[arm]
         n_seed = len({r["seed"] for r in rs})
-        axes[0][col].set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})",
-                               fontsize=9)
-        _series_panel(axes[0][col], rs, RATE_COLS, RATE_COLORS, ylim=(0, 1),
-                      label_series=(col == 0))
-        # sep/leak are derived to mean the same thing in every column -> one legend
-        # (leftmost) carries the whole row; repeating it would be redundant ink.
-        _series_panel(axes[1][col], rs, COS_COLS, COS_COLORS,
-                      ylim=(min(-0.05, cos_lo - 0.02), max(0.2, cos_hi + 0.02)),
-                      label_series=(arm == cos_label_arm))
-        axes[1][col].axhline(0, color="0.8", lw=0.6, zorder=0)
-        axes[1][col].set_xlabel("optimizer step")
-
-        # Mean hack-onset: one dashed vertical reference line spanning BOTH rows
-        # so the cos-divergence can be read against the moment hacking starts.
+        ax.set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9)
+        _series_panel(ax, rs, RATE_COLS, RATE_COLORS, ylim=(0, 1), label_series=(col == 0))
+        ax.set_xlabel("optimizer step")
         onsets = [s for r in rs if (s := _onset(r["steps"], r["hack_s"])) is not None]
         if onsets:
             s0 = float(np.mean(onsets))
-            for row in (0, 1):
-                axes[row][col].axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0)
-            axes[0][col].annotate("first hack", (s0, 1.0), color="0.4", fontsize=7,
-                                  xytext=(2, -2), textcoords="offset points", va="top")
+            ax.axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0)
+            ax.annotate("first hack", (s0, 1.0), color="0.4", fontsize=7,
+                        xytext=(2, -2), textcoords="offset points", va="top")
 
-    axes[0][0].set_ylabel("student rate")
-    axes[1][0].set_ylabel("cos with v_hack")
+    axes[0][0].set_ylabel("deployed rate")
     # range-frame: drop top/right spines, keep ink on data
     for ax in axes.flat:
         ax.spines["top"].set_visible(False)
         ax.spines["right"].set_visible(False)
         ax.tick_params(labelsize=8)
 
-    fig.suptitle("Training dynamics: G_hack erasure vs vanilla  "
-                 "(EMA-5 smoothed; dashed line = mean hack onset)", fontsize=10)
+    fig.suptitle("Training dynamics: deployed hack vs solve by arm  "
+                 "(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
     fig.tight_layout(rect=(0, 0, 1, 0.96))
     out.parent.mkdir(parents=True, exist_ok=True)
     fig.savefig(out, dpi=150, bbox_inches="tight")
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index 44a718f..a86cf2c 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -33,7 +33,7 @@ import os
 import sys
 import random
 import time
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@@ -1446,28 +1446,31 @@ def main(cfg: Config) -> int:
                 model.train()
             refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}"  # mod/axes -> per-step row
 
-        # ── deploy-eval (route/route2): zero δS_hack, eval the shipped model ──
-        # Periodic DEPLOY-eval (routing, Gradient Routing): zero the quarantine knob
-        # and eval the DEPLOYED model on a fixed subset. Routing's claim is that the
-        # cheating capability lands in the quarantine, so deleting it (= what we deploy)
-        # should hack much less than the training-time model (the per-step hack_s row,
-        # which still hacks because training keeps the knob on). This is the curve the
-        # plot uses for route. NaN on non-eval steps / non-route arms.
+        # ── periodic DEPLOY-eval (EVERY arm) -- the apples-to-apples curve ──
+        # Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval (n=64,
+        # T=0.7), every eval_ablate_every steps. route/route2: deploy = quarantine
+        # knob zeroed (ablate_quarantine), and the claim is this hacks far less than
+        # the training-time model (per-step hack_s, knob still on). vanilla/erase: no
+        # quarantine, so deploy == the trained model -- eval it directly. Running the
+        # SAME estimator for all arms makes the dynamics-plot curves comparable (else
+        # route shows a deploy eval while others show training rollouts -> different
+        # n/cadence, route looks artificially smoother). NaN on non-eval steps.
         hack_deploy = solve_deploy = float("nan")
-        if (cfg.intervention in ("route", "route2") and cfg.eval_ablate_every > 0
-                and (step % cfg.eval_ablate_every == 0 or step == steps - 1)):
+        if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1):
             _was_training = model.training
             model.eval()
-            with ablate_quarantine(wrappers):
+            is_route = cfg.intervention in ("route", "route2")
+            with (ablate_quarantine(wrappers) if is_route else nullcontext()):
                 ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
             if _was_training:
                 model.train()
             hack_deploy, solve_deploy = ev["hack"], ev["solve"]
+            tag = "quarantine knob OFF = deployed model" if is_route else "deployed = trained model (no quarantine)"
+            should = ("deploy hack < this step's training hack_s (knob is holding the cheat); "
+                      "ELSE routing isn't capturing it") if is_route else "deploy ~= training hack_s (same model)"
             logger.info(
-                f"step {step} DEPLOY-eval (quarantine knob OFF = deployed model): "
-                f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}.  "
-                f"SHOULD: deploy hack < this step's training hack_s (knob is holding "
-                f"the cheat); ELSE routing isn't capturing it")
+                f"step {step} DEPLOY-eval ({tag}): "
+                f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}.  SHOULD: {should}")
 
         rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1)
         rew_mean = rewards_t.mean().item()