Files
evil_MoE/scripts/plot_deploy_overlay.py
T
wassname 5f478f4bd0 a5: explain generalisation mechanism in caption + ≡0 marker on zero bars
Caption now states v is run_tests-only, teacher is run_tests-only, held-out
modes have hacked_E=0 so the gate is blind, they emerge on knob-on but deploy~0,
and the placebo caveat (suppression is the direction-agnostic quarantine, not v
specificity). Bar plot tags invisible zero-height bars with ≡0.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-05 02:46:57 +00:00

134 lines
5.9 KiB
Python

"""All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts.
Each run writes out/runs/<ts>_<tag>/per_mode_deploy.json (train.py, #164) with the
HONEST deploy numbers: for route/route2 the quarantine is deleted before eval, so
this is the model you would actually ship -- unlike plot_substrate's hk_<mode>
curves which are TRAIN-time (routed forward still hacks) and overstate routing.
Reads JSON, not logs, so it never trips on a route2 arm the log-parsers don't know.
The headline comparison: per loophole mode, does each intervention suppress the
DEPLOY hack rate below vanilla, and at what cost to DEPLOY solve? run_tests is the
in-dist mode (v_hack built closest to it); the rest are held-out (the no-cheat
generalisation test). Bars grouped by mode, one bar per arm.
Usage:
uv run python scripts/plot_deploy_overlay.py # globs out/runs/*sub4*/
uv run python scripts/plot_deploy_overlay.py out/runs/*_sub4_*/per_mode_deploy.json
uv run python scripts/plot_deploy_overlay.py --out out/figs/deploy_overlay.png
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import save_fig
# Registry of plottable arms: internal arm tag -> (legend label, bar colour).
# Row order below is the bar/legend order, so the baseline stays first.
# Labels are the reader-facing names; tags like "routing2"/"routing2_grad" are
# internal. The grad-mask routing arm is the one reported, hence its plain
# "route" label (shared by both of those tags), while the failed
# activation-mask variant carries an explicit "route (act-mask)" label.
ARM = {
    tag: (label, colour)
    for tag, label, colour in (
        ("vanilla", "vanilla", "#444444"),
        ("projected", "erase", "#c1432b"),
        ("routing", "route (v1)", "#33508c"),
        ("routing2_act", "route (act-mask)", "#2f7d4f"),
        ("routing2_grad", "route", "#b8860b"),
        ("routing2", "route", "#b8860b"),
    )
}
# Left-to-right order of the mode groups on the x axis: the in-distribution
# mode ("run_tests") leads, the held-out modes follow.
MODE_ORDER = [
    "run_tests",
    "file_marker",
    "stdout_marker",
    "sentinel",
    "eq_override",
]
def load(paths: list[Path]) -> list[dict]:
    """Decode every per_mode_deploy.json in *paths*, logging one summary line each.

    Returns the decoded JSON objects in the same order as *paths*. Each object
    must provide the "arm", "hack_deploy" and "solve_deploy" keys, which are
    read to build the log line.
    """
    records = []
    for path in paths:
        rec = json.loads(path.read_text())
        records.append(rec)
        arm, hack, solve = rec["arm"], rec["hack_deploy"], rec["solve_deploy"]
        logger.info(f"{arm:<14} deploy hack={hack:.3f} solve={solve:.3f} ({path})")
    return records
def _despine(ax):
    """Hide the top and right spines of *ax* and switch on a faint y-axis grid."""
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", linewidth=0.4, alpha=0.35)
def _seed_stats(values):
    """Return (mean, std) over the non-NaN entries of *values*; (nan, 0.0) if none."""
    arr = np.asarray(values, dtype=float)  # a JSON null (None) becomes NaN here
    arr = arr[~np.isnan(arr)]
    if arr.size == 0:
        # No seed reported this mode/field. Bypass np.nanmean/np.nanstd, which
        # emit RuntimeWarnings on all-NaN input, and keep NaN out of yerr.
        return np.nan, 0.0
    # Population std (ddof=0), the same default np.nanstd uses; a single value
    # gives 0.0, so one-seed arms draw no error bar.
    return float(arr.mean()), float(arr.std())


def _panel(ax, by_arm, modes, arms, field, title, ylabel):
    """Grouped bars: x = mode, one bar per arm, height = mean over seed runs of
    by_mode[mode][field]; error bar = std across seeds (drawn only when >1 seed).

    Args:
        ax: matplotlib Axes to draw on.
        by_arm: mapping arm tag -> list of per-seed records (each has "by_mode").
        modes: mode names, in x-axis order.
        arms: arm tags to plot (keys of ARM), in bar/legend order.
        field: key read from each by_mode[mode] dict for the bar height.
        title: panel title.
        ylabel: y-axis label.

    A mode/field missing from every record of an arm yields a NaN bar (nothing
    is drawn and no value label is added).

    TODO(seeds): A5 currently ships n=1 (seed 41 only, jobs 103/104) so no error
    bar appears. Pass per-seed JSONs (a5 vanilla+route2 seeds 42/43, queued) to
    populate the error bars -- the code already aggregates them."""
    # max(..., 1) keeps the width finite when no arm is plottable, so an empty
    # panel is drawn instead of raising ZeroDivisionError.
    w = 0.8 / max(len(arms), 1)
    x = np.arange(len(modes))
    for i, arm in enumerate(arms):
        recs = by_arm[arm]
        label, color = ARM[arm]
        stats = [
            _seed_stats([r["by_mode"].get(m, {}).get(field, np.nan) for r in recs])
            for m in modes
        ]
        means = np.array([mean for mean, _ in stats], dtype=float)
        stds = np.array([std for _, std in stats], dtype=float)
        n_seed = len(recs)
        # Only pass yerr when some mode actually has spread across seeds.
        yerr = stds if (stds > 0).any() else None
        bars = ax.bar(x + i * w, means, w, label=f"{label} (n={n_seed})", color=color,
                      yerr=yerr, capsize=2, error_kw=dict(lw=0.8, alpha=0.8))
        for b, v in zip(bars, means):
            if np.isnan(v):
                continue
            # a zero-height bar is invisible -- mark it "≡0" so the reader sees a
            # finding, not a missing bar (same convention as the line plots).
            txt = "≡0" if v < 5e-3 else f"{v:.2f}"
            ax.annotate(txt, (b.get_x() + b.get_width() / 2, v), fontsize=6,
                        ha="center", va="bottom", color=color)
    # Bars sit at x + i*w, so the centre of each group is x + 0.4 - w/2.
    ax.set_xticks(x + 0.4 - w / 2)
    ax.set_xticklabels([f"{m}\n{'IN' if m == 'run_tests' else 'held-out'}" for m in modes], fontsize=8)
    ax.set_title(title, fontsize=10)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1.05)
    _despine(ax)
def main() -> None:
    """CLI entry point: load the deploy JSONs, group them per arm, save the figure.

    Reads the paths given on the command line, or globs
    out/runs/*sub4*/per_mode_deploy.json when none are given. Draws one panel
    for the "deploy_hack" field and one for "deploy_solve", then writes the
    figure to --out.

    Raises:
        SystemExit: if no JSON file is found, or none of the loaded arms has an
            entry in ARM.
    """
    # `__doc__` resolves to the module docstring here (global lookup), not to
    # this function's docstring.
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
    ap.add_argument("--title", action="store_true",
                    help="draw the suptitle (off by default: the caption carries it)")
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
    if not paths:
        raise SystemExit("no per_mode_deploy.json found (run the sweep first)")
    records = load(paths)

    # group seed runs per arm (mean+/-std bars), order arms canonically
    by_arm: dict[str, list[dict]] = defaultdict(list)
    for r in records:
        by_arm[r["arm"]].append(r)

    # Arms without an ARM entry have no label/colour and cannot be drawn; say
    # so instead of dropping them silently.
    unknown = sorted(set(by_arm) - set(ARM), key=str)
    if unknown:
        logger.warning(f"skipping arm(s) with no ARM entry: {unknown}")
    arms = [a for a in ARM if a in by_arm]
    if not arms:
        raise SystemExit(f"none of the loaded arms is plottable (known arms: {list(ARM)})")
    modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)]

    fig, (a1, a2) = plt.subplots(1, 2, figsize=(5.5 + 1.2 * len(modes), 4.2))
    _panel(a1, by_arm, modes, arms, "deploy_hack",
           "DEPLOY hack rate by mode (lower = better)", "deploy hack rate")
    _panel(a2, by_arm, modes, arms, "deploy_solve",
           "DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="upper right")
    if args.title:
        # Records without a "seed" key are left out: sorting None together with
        # ints raises TypeError.
        seeds = sorted({r["seed"] for r in records if r.get("seed") is not None})
        fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {seeds}) -- "
                     f"quarantine deleted = shipped model", fontsize=11)
    fig.tight_layout()
    save_fig(fig, args.out)
    logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)")


if __name__ == "__main__":
    main()