route2 instrumentation + lr fix + deploy overlay (route2-act divergence)

route2-act diverged (run 43): 33M kaiming A_q/B_q at delta_S's lr=3e-3 blew up
(gn 0.3->7.5 step 8, generations -> token salad, lp_t -11). Fixes:
- #167 separate quarantine lr (route2_quar_lr_scale=0.1) so the 60x-bigger fresh
  LoRA isn't trained at the main-knob lr.
- #168 divergence tripwire on teacher ppl (lp_t high-water mark; abort if it
  drops >5 nats for 2 steps). Relative so tiny-random smoke (flat lp_t~-11.9)
  doesn't false-trip.
- #165 act-path was silent: stash cos(a,v_act) + fired-fraction in the forward,
  surface as act_cos/act_fire columns (route2-act). smoke shows act_fire=0.64 =>
  the cos>0 sign test over-routes (fires on most tokens, not just hack ones).
- #166 print last train generation before FINAL EVAL (coherence eyeball).
- route2 v_act/v_grad refresh was firing but silent -- now announced.
- #162 plot_deploy_overlay.py: per-mode DEPLOY overlay from per_mode_deploy.json
  (honest shipped-model numbers, route2-safe). just plot-deploy.
- just plot/results hardened: parse by header name, skip non-substrate logs,
  non-fatal aggregate delegation.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-05-31 23:16:39 +00:00
parent ad048e59c6
commit 11bcdd2fe6
5 changed files with 247 additions and 18 deletions
+110
View File
@@ -0,0 +1,110 @@
"""All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts.
Each run writes out/runs/<ts>_<tag>/per_mode_deploy.json (train.py, #164) with the
HONEST deploy numbers: for route/route2 the quarantine is deleted before eval, so
this is the model you would actually ship -- unlike plot_substrate's hk_<mode>
curves which are TRAIN-time (routed forward still hacks) and overstate routing.
Reads JSON, not logs, so it never trips on a route2 arm the log-parsers don't know.
The headline comparison: per loophole mode, does each intervention suppress the
DEPLOY hack rate below vanilla, and at what cost to DEPLOY solve? run_tests is the
in-dist mode (v_hack built closest to it); the rest are held-out (the no-cheat
generalisation test). Bars grouped by mode, one bar per arm.
Usage:
uv run python scripts/plot_deploy_overlay.py # globs out/runs/*sub4*/
uv run python scripts/plot_deploy_overlay.py out/runs/*_sub4_*/per_mode_deploy.json
uv run python scripts/plot_deploy_overlay.py --out out/figs/deploy_overlay.png
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
# arm -> (display label, colour). Dict order doubles as the legend/bar order,
# so the baseline (vanilla) is listed first.
ARM = {
    "vanilla": ("vanilla", "#444444"),
    "projected": ("erase", "#c1432b"),
    "routing": ("route", "#33508c"),
    "routing2_act": ("route2 act", "#2f7d4f"),
    "routing2_grad": ("route2 grad", "#b8860b"),
}

# Mode display order: in-dist first, then held-out.
MODE_ORDER = [
    "run_tests",
    "file_marker",
    "stdout_marker",
    "sentinel",
    "eq_override",
]
def load(paths: list[Path]) -> list[dict]:
    """Decode every per_mode_deploy.json in *paths* and return the records in input order.

    One summary line is logged per file: arm, overall deploy hack/solve rate and
    the source path. A record missing `arm`, `hack_deploy` or `solve_deploy`
    raises KeyError.
    """
    records = []
    for path in paths:
        record = json.loads(path.read_text())
        records.append(record)
        arm = record["arm"]
        hack = record["hack_deploy"]
        solve = record["solve_deploy"]
        logger.info(f"{arm:<14} deploy hack={hack:.3f} solve={solve:.3f} ({path})")
    return records
def _despine(ax):
    """Hide the top and right spines of *ax* and add a light y-axis grid."""
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", lw=0.4, alpha=0.35)
def _panel(ax, records, modes, arms, field, title, ylabel):
    """Draw one grouped-bar panel on *ax*: x = mode, one bar per arm.

    Bar height is records[arm]["by_mode"][mode][field]. A mode or field that a
    record does not contain is plotted as NaN and gets no value annotation.
    Every arm in *arms* must have a matching record, and *arms* must be non-empty.
    """
    bar_width = 0.8 / len(arms)  # each mode's group of bars spans 0.8 x-units
    positions = np.arange(len(modes))

    for slot, arm in enumerate(arms):
        record = next(r for r in records if r["arm"] == arm)
        label, color = ARM[arm]
        per_mode = record["by_mode"]
        heights = [per_mode.get(mode, {}).get(field, np.nan) for mode in modes]
        rects = ax.bar(positions + slot * bar_width, heights, bar_width,
                       label=label, color=color)
        for rect, height in zip(rects, heights):
            if np.isnan(height):
                continue  # nothing logged for this (arm, mode): leave it unlabelled
            centre = rect.get_x() + rect.get_width() / 2
            ax.annotate(f"{height:.2f}", (centre, height), fontsize=6,
                        ha="center", va="bottom", color=color)

    # Put each tick in the middle of its group of bars.
    ax.set_xticks(positions + 0.4 - bar_width / 2)
    tick_labels = []
    for mode in modes:
        split = "IN" if mode == "run_tests" else "held-out"
        tick_labels.append(f"{mode}\n{split}")
    ax.set_xticklabels(tick_labels, fontsize=8)

    ax.set_title(title, fontsize=10)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1.05)
    _despine(ax)
def main() -> None:
    """CLI entry point: load per_mode_deploy.json records and write the overlay figure.

    Paths come from the positional arguments, or from globbing
    out/runs/*sub4*/per_mode_deploy.json when none are given. The figure has two
    panels (deploy hack rate, deploy solve rate) and is saved to --out.

    Raises:
        SystemExit: if no JSON paths are found, or if none of the loaded records
            belongs to an arm listed in ARM.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
    if not paths:
        raise SystemExit("no per_mode_deploy.json found (run the sweep first)")
    records = load(paths)

    # dedupe arms (keep latest by file order), then order canonically
    by_arm = {r["arm"]: r for r in records}
    unknown = [a for a in by_arm if a not in ARM]
    if unknown:
        # Only arms listed in ARM have a label/colour; say which ones are dropped
        # so the omission from the figure is not silent.
        logger.warning(f"ignoring {len(unknown)} arm(s) not in ARM: "
                       + ", ".join(str(a) for a in unknown))
    arms = [a for a in ARM if a in by_arm]
    if not arms:
        # _panel divides by len(arms); fail with a readable message instead.
        raise SystemExit("no records with a known arm (expected one of: "
                         + ", ".join(ARM) + ")")
    records = [by_arm[a] for a in arms]
    modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)]

    fig, (a1, a2) = plt.subplots(1, 2, figsize=(5.5 + 1.2 * len(modes), 4.2))
    _panel(a1, records, modes, arms, "deploy_hack",
           "DEPLOY hack rate by mode (lower = better)", "deploy hack rate")
    _panel(a2, records, modes, arms, "deploy_solve",
           "DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="upper right")

    # Records without a "seed" key yield None; sort those last so a mix of None
    # and real seeds does not raise TypeError when compared.
    seeds = sorted({r.get("seed") for r in records}, key=lambda s: (s is None, s))
    fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {seeds}) -- "
                 f"quarantine deleted = shipped model", fontsize=11)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(args.out, dpi=140, bbox_inches="tight")
    logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)")


if __name__ == "__main__":
    main()
+38 -15
View File
@@ -59,18 +59,24 @@ _HDR_TOK = re.compile(r"[A-Za-z_]+") # "hack_s?" -> "hack_s"
def classify(txt: str) -> str:
    """vanilla / erase / route from the preset `arm=` line (covers --intervention logs).

    The first line containing both `preset=` and `arm=` is used; a log with no
    such line is treated as vanilla.

    Unknown arms (e.g. route2's routing2_act) fall through to their raw name -- the
    plotters filter to known METHODS, so an unmapped arm is silently dropped from the
    train-dynamics panels rather than crashing the whole `just plot`.
    """
    preset = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "")
    found = re.search(r"\barm=(\w+)", preset)
    arm = found[1] if found else "vanilla"
    # .get(arm, arm) rather than [arm]: a strict lookup raises KeyError on any
    # arm outside the three legacy names.
    return {"vanilla": "vanilla", "projected": "erase", "routing": "route"}.get(arm, arm)
def parse_hk(path: Path) -> dict:
"""{method, seed, steps, <mode>: (n[], d[])} from a substrate run log."""
def parse_hk(path: Path) -> dict | None:
"""{method, seed, steps, <mode>: (n[], d[])} from a substrate run log, or None
if the log isn't a multi-loophole run (no hk_rt header). Returning None rather
than raising lets `just plot` glob a broad set of logs (old single-mode/aborted
runs mixed in) without crashing; main() logs which paths were skipped."""
txt = path.read_text(errors="replace")
hdr = next((l for l in txt.splitlines() if "ref_eq" in l and "hk_rt" in l), None)
if hdr is None:
raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?")
return None
names = [_HDR_TOK.match(t).group(0) for t in hdr.split("| INFO |", 1)[1].split()]
idx = {n: i for i, n in enumerate(names)}
present = [k for k in HK if k in idx] # 4-mode substrate dropped hk_eq; plot only what's logged
@@ -86,6 +92,8 @@ def parse_hk(path: Path) -> dict:
n, d = row[idx[k]].split("/")
nd[k][0].append(int(n))
nd[k][1].append(int(d))
if not steps:
return None # header present but no parseable per-step rows (e.g. diverged/aborted)
m = re.search(r"seed(\d+)", path.name) or re.search(r"_s(\d+)", path.name)
return dict(
method=classify(txt),
@@ -229,8 +237,17 @@ def main() -> None:
args = ap.parse_args()
stem = args.out_stem
# 1-2. per-mode small multiples (this script owns these)
runs = [parse_hk(p) for p in args.logs]
# 1-2. per-mode small multiples (this script owns these). Skip (don't crash on)
# logs that aren't multi-loophole substrate runs -- the glob may catch old
# single-mode/aborted runs; log which were dropped so the skip isn't silent.
parsed = {p: parse_hk(p) for p in args.logs}
skipped = [p for p, r in parsed.items() if r is None]
if skipped:
logger.warning(f"skipped {len(skipped)} non-substrate log(s): "
+ ", ".join(p.name for p in skipped))
runs = [r for r in parsed.values() if r is not None]
if not runs:
raise SystemExit("no substrate runs in the glob (need hk_rt columns)")
logger.info(f"parsed {len(runs)} runs: " + ", ".join(f"{r['method']}/s{r['seed']}" for r in runs))
ylabel = "cumulative hack rate" if args.cumulative else f"hack rate (EMA span {args.ema_span})"
plot_by_method(runs, ylabel, args.cumulative, args.ema_span, stem.with_name(stem.name + "_by_method.png"))
@@ -238,15 +255,21 @@ def main() -> None:
# 3-4. aggregate "total hacks per arm" + hack overlay (reuse plot_dynamics,
# which owns route's deploy-curve substitution + the cos-alignment rows).
# Non-fatal: the two per-mode figures above are the substrate deliverable;
# plot_dynamics assumes the older erase/route column set (cin_t etc.) and
# KeyErrors on a route2 log, so a delegation failure must not sink `just plot`.
if not args.no_aggregate:
import plot_dynamics as pd
agg_runs = [r for p in args.logs if (r := pd.parse_log(p))]
if agg_runs:
agg = stem.with_name(stem.name + "_aggregate.png")
pd.plot(agg_runs, agg)
pd.plot_hack_overlay(agg_runs, agg.with_name(agg.stem + "_hack_overlay.png"))
else:
logger.warning("no runs had aggregate columns (cos_pre/hack_s) -- skipped aggregate figs")
try:
import plot_dynamics as pd
agg_runs = [r for p in args.logs if (r := pd.parse_log(p))]
if agg_runs:
agg = stem.with_name(stem.name + "_aggregate.png")
pd.plot(agg_runs, agg)
pd.plot_hack_overlay(agg_runs, agg.with_name(agg.stem + "_hack_overlay.png"))
else:
logger.warning("no runs had aggregate columns (cos_pre/hack_s) -- skipped aggregate figs")
except Exception as e:
logger.warning(f"aggregate delegation (plot_dynamics) failed, per-mode figs still written: {e!r}")
if __name__ == "__main__":