"""Single-run routing figure: training-time hack vs DEPLOYED-model hack. The routing story in one plot. During training the model keeps hacking (it runs with the quarantine knob ON, so the per-step hack_s curve climbs like vanilla). But the model we'd actually DEPLOY has the knob deleted -- its hack rate (the deploy-eval, measured every --eval-ablate-every steps) is what matters. If routing works, the deploy curve sits well BELOW the training curve at preserved solve. uv run python scripts/plot_route_evidence.py LOG.log --out out/route_evidence.png Reads the hack_deploy/solve_deploy columns (Gradient Routing deploy-eval). """ from __future__ import annotations import sys from pathlib import Path import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import tyro from projected_grpo.figs import link_latest def _frac(tok: str) -> float | None: if "/" in tok: a, b = tok.split("/") return int(a) / int(b) if int(b) else None try: v = float(tok) return None if v != v else v # NaN -> None except ValueError: return None def parse(log: Path): txt = log.read_text(errors="replace") hdr = next(l.split("| INFO |", 1)[1].split() for l in txt.splitlines() if "| INFO |" in l and "hack_s" in l and "refr" in l) idx = {n: i for i, n in enumerate(hdr)} i_step, i_train = idx["step"], idx["hack_s?"] i_solve = idx["gt_s↑"] i_hdep = idx["hack_deploy"] i_sdep = idx["solve_deploy"] steps, train_hack, solve_train = [], [], [] deploy_step, deploy_hack, deploy_solve = [], [], [] for l in txt.splitlines(): if "| INFO |" not in l: continue r = l.split("| INFO |", 1)[1].split() if not r or not r[0].isdigit() or len(r) <= i_sdep: continue s = int(r[i_step]) steps.append(s) train_hack.append(_frac(r[i_train])) solve_train.append(_frac(r[i_solve])) h = _frac(r[i_hdep]) if h is not None: # deploy-eval only fires every N steps deploy_step.append(s); deploy_hack.append(h); deploy_solve.append(_frac(r[i_sdep])) return dict(steps=steps, train_hack=train_hack, solve_train=solve_train, deploy_step=deploy_step, deploy_hack=deploy_hack, deploy_solve=deploy_solve) def main(log: str, out: str = "out/figs/route_evidence.png") -> None: d = parse(Path(log)) RED, GREY = "#b03a2e", "#9a8c7a" # hack=red (the story); solve=muted (context) fig, ax = plt.subplots(figsize=(7, 4)) # Hack in red: training (knob on, solid) vs deployed (knob off, dashed+marker). # The vertical gap between the two reds at the last step IS the routing effect. ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2) ax.plot(d["deploy_step"], d["deploy_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4) ax.plot(d["deploy_step"], d["deploy_solve"], color=GREY, lw=1.4) # Direct labels at the right end (name + final value baked in) -> no legend, # no separate value annotations. One element does both jobs (eraser test). x_end = d["steps"][-1] def label(y, text, color): ax.annotate(text, (x_end, y), xytext=(8, 0), textcoords="offset points", va="center", color=color, fontsize=9) label(d["train_hack"][-1], f"hack, knob ON (training) {d['train_hack'][-1]:.0%}", RED) label(d["deploy_solve"][-1], f"solve, deployed {d['deploy_solve'][-1]:.0%}", GREY) label(d["deploy_hack"][-1], f"hack, knob OFF (deployed) {d['deploy_hack'][-1]:.0%}", RED) ax.set_ylim(-0.02, 1.0) ax.set_yticks([0, 0.5, 1.0]); ax.set_yticklabels(["0", ".5", "1"]) ax.set_xticks([0, d["deploy_step"][-1] if d["deploy_step"] else x_end]) ax.set_xlabel("GRPO step") ax.set_xlim(0, x_end * 1.5) # right margin for the direct labels for side in ("top", "right"): ax.spines[side].set_visible(False) ax.spines["left"].set_bounds(0, 1) # range-frame: axis spans the data ax.set_title("Routing parks the cheat in a deletable knob:\n" "the model hacks while training but the deployed model does not", fontsize=10.5) fig.tight_layout() Path(out).parent.mkdir(parents=True, exist_ok=True) fig.savefig(out, dpi=130) link = link_latest(Path(out)) print(f"wrote {out} (docs/figs latest -> {link}) " f"(train_hack_final={d['train_hack'][-1]:.3f}, " f"deploy_hack_final={d['deploy_hack'][-1]:.3f}, deploy_solve_final={d['deploy_solve'][-1]:.3f})") if __name__ == "__main__": tyro.cli(main)