"""All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts. Each run writes out/runs/_/per_mode_deploy.json (train.py, #164) with the HONEST deploy numbers: for route/route2 the quarantine is deleted before eval, so this is the model you would actually ship -- unlike plot_substrate's hk_ curves which are TRAIN-time (routed forward still hacks) and overstate routing. Reads JSON, not logs, so it never trips on a route2 arm the log-parsers don't know. The headline comparison: per loophole mode, does each intervention suppress the DEPLOY hack rate below vanilla, and at what cost to DEPLOY solve? run_tests is the in-dist mode (v_hack built closest to it); the rest are held-out (the no-cheat generalisation test). Bars grouped by mode, one bar per arm. Usage: uv run python scripts/plot_deploy_overlay.py # globs out/runs/*sub4*/ uv run python scripts/plot_deploy_overlay.py out/runs/*_sub4_*/per_mode_deploy.json uv run python scripts/plot_deploy_overlay.py --out out/figs/deploy_overlay.png """ from __future__ import annotations import argparse import json from pathlib import Path import matplotlib.pyplot as plt import numpy as np from loguru import logger from projected_grpo.figs import save_fig # arm -> (display label, colour). Order = legend/bar order (baseline first). # Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask # routing arm is the one we report, so it is plain "route"; the failed # activation-mask variant is disambiguated, not version-numbered. ARM = { "vanilla": ("vanilla", "#444444"), "projected": ("erase", "#c1432b"), "routing": ("route (v1)", "#33508c"), "routing2_act": ("route (act-mask)", "#2f7d4f"), "routing2_grad":("route", "#b8860b"), "routing2": ("route", "#b8860b"), } # mode display order: in-dist first, then held-out. 
MODE_ORDER = ["run_tests", "file_marker", "stdout_marker", "sentinel", "eq_override"]


def load(paths: list[Path]) -> list[dict]:
    """Parse every per_mode_deploy.json in *paths* and log its headline numbers.

    Args:
        paths: JSON files to read; the result keeps this order.

    Returns:
        One parsed JSON object per path, in input order.

    Raises:
        KeyError: if a file lacks ``arm``, ``hack_deploy`` or ``solve_deploy``
            (all three are read for the log line).
    """
    out = []
    for p in paths:
        # JSON is UTF-8 by spec; do not depend on the platform locale encoding.
        d = json.loads(p.read_text(encoding="utf-8"))
        out.append(d)
        logger.info(f"{d['arm']:<14} deploy hack={d['hack_deploy']:.3f} solve={d['solve_deploy']:.3f} ({p})")
    return out


def _despine(ax) -> None:
    """Hide the top and right spines of *ax* and add a faint horizontal grid."""
    ax.spines[["top", "right"]].set_visible(False)
    ax.grid(axis="y", lw=0.4, alpha=0.35)


def _mode_value(rec: dict, mode: str, field: str) -> float:
    """Return rec["by_mode"][mode][field] as a float, NaN when missing or null."""
    v = rec["by_mode"].get(mode, {}).get(field)
    return np.nan if v is None else float(v)


def _panel(ax, records, modes, arms, field, title, ylabel):
    """Grouped bars: x = mode, one bar per arm, height = records[arm].by_mode[mode][field].

    Args:
        ax: axes to draw on.
        records: parsed per_mode_deploy.json objects; the first record whose
            ``arm`` matches is used for each arm.
        modes: mode names, one bar group per mode, in display order.
        arms: arm keys (must be keys of ``ARM``), one bar per arm in each group.
            Must be non-empty: the bar width is ``0.8 / len(arms)``.
        field: key read from each ``by_mode[mode]`` entry.
        title: axes title.
        ylabel: y-axis label.
    """
    w = 0.8 / len(arms)  # every group spans 0.8 x-units in total
    x = np.arange(len(modes))
    for i, arm in enumerate(arms):
        rec = next(r for r in records if r["arm"] == arm)
        label, color = ARM[arm]
        # Missing or null entries become NaN: an empty slot, no value label.
        vals = [_mode_value(rec, m, field) for m in modes]
        bars = ax.bar(x + i * w, vals, w, label=label, color=color)
        for b, v in zip(bars, vals):
            if not np.isnan(v):
                ax.annotate(f"{v:.2f}", (b.get_x() + b.get_width() / 2, v),
                            fontsize=6, ha="center", va="bottom", color=color)
    # Bars are centred on x + i*w, so the group's centre is x + 0.4 - w/2.
    ax.set_xticks(x + 0.4 - w / 2)
    ax.set_xticklabels([f"{m}\n{'IN' if m == 'run_tests' else 'held-out'}" for m in modes], fontsize=8)
    ax.set_title(title, fontsize=10)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1.05)
    _despine(ax)


def main() -> None:
    """CLI entry point: load the deploy JSONs and save the two-panel bar figure.

    Raises:
        SystemExit: if no per_mode_deploy.json is found, or if none of the
            loaded records carries an arm listed in ``ARM``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
    ap.add_argument("--title", action="store_true",
                    help="draw the suptitle (off by default: the caption carries it)")
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
    if not paths:
        raise SystemExit("no per_mode_deploy.json found (run the sweep first)")
    records = load(paths)

    # dedupe arms (keep latest by file order), then order canonically
    by_arm = {r["arm"]: r for r in records}
    unknown = [a for a in by_arm if a not in ARM]
    if unknown:
        # Arms without an ARM entry have no label/colour and are not plotted;
        # say so instead of dropping them silently.
        logger.warning(f"skipping arms with no ARM entry: {unknown}")
    arms = [a for a in ARM if a in by_arm]
    if not arms:
        # Without this guard _panel would fail on 0.8 / len(arms).
        raise SystemExit(f"no recognised arm in {len(records)} record(s); known arms: {list(ARM)}")
    records = [by_arm[a] for a in arms]
    modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)]

    fig, (a1, a2) = plt.subplots(1, 2, figsize=(5.5 + 1.2 * len(modes), 4.2))
    _panel(a1, records, modes, arms, "deploy_hack",
           "DEPLOY hack rate by mode (lower = better)", "deploy hack rate")
    _panel(a2, records, modes, arms, "deploy_solve",
           "DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="upper right")
    if args.title:
        seeds = {r.get("seed") for r in records}
        # None does not order against ints, so sort the known seeds and append
        # None last when some record has no seed.
        shown = sorted(s for s in seeds if s is not None)
        if None in seeds:
            shown.append(None)
        fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {shown}) -- "
                     f"quarantine deleted = shipped model", fontsize=11)
    fig.tight_layout()
    save_fig(fig, args.out)
    logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)")


if __name__ == "__main__":
    main()