mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 23:53:37 +08:00
11bcdd2fe6
route2-act diverged (run 43): 33M kaiming A_q/B_q at delta_S's lr=3e-3 blew up (gn 0.3->7.5 step 8, generations -> token salad, lp_t -11). Fixes: - #167 separate quarantine lr (route2_quar_lr_scale=0.1) so the 60x-bigger fresh LoRA isn't trained at the main-knob lr. - #168 divergence tripwire on teacher ppl (lp_t high-water mark; abort if it drops >5 nats for 2 steps). Relative so tiny-random smoke (flat lp_t~-11.9) doesn't false-trip. - #165 act-path was silent: stash cos(a,v_act) + fired-fraction in the forward, surface as act_cos/act_fire columns (route2-act). smoke shows act_fire=0.64 => the cos>0 sign test over-routes (fires on most tokens, not just hack ones). - #166 print last train generation before FINAL EVAL (coherence eyeball). - route2 v_act/v_grad refresh was firing but silent -- now announced. - #162 plot_deploy_overlay.py: per-mode DEPLOY overlay from per_mode_deploy.json (honest shipped-model numbers, route2-safe). just plot-deploy. - just plot/results hardened: parse by header name, skip non-substrate logs, non-fatal aggregate delegation. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
111 lines
4.6 KiB
Python
111 lines
4.6 KiB
Python
"""All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts.
|
|
|
|
Each run writes out/runs/<ts>_<tag>/per_mode_deploy.json (train.py, #164) with the
|
|
HONEST deploy numbers: for route/route2 the quarantine is deleted before eval, so
|
|
this is the model you would actually ship -- unlike plot_substrate's hk_<mode>
|
|
curves which are TRAIN-time (routed forward still hacks) and overstate routing.
|
|
|
|
Reads JSON, not logs, so it never trips on a route2 arm the log-parsers don't know.
|
|
|
|
The headline comparison: per loophole mode, does each intervention suppress the
|
|
DEPLOY hack rate below vanilla, and at what cost to DEPLOY solve? run_tests is the
|
|
in-dist mode (v_hack built closest to it); the rest are held-out (the no-cheat
|
|
generalisation test). Bars grouped by mode, one bar per arm.
|
|
|
|
Usage:
|
|
uv run python scripts/plot_deploy_overlay.py # globs out/runs/*sub4*/
|
|
uv run python scripts/plot_deploy_overlay.py out/runs/*_sub4_*/per_mode_deploy.json
|
|
uv run python scripts/plot_deploy_overlay.py --out out/figs/deploy_overlay.png
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from loguru import logger
|
|
|
|
# arm -> (display label, colour). Order = legend/bar order (baseline first).
# Keys are compared against each record's "arm" field; main() only plots arms
# that appear in this table, and _panel() unpacks the (label, colour) pair for
# the bar legend entry and fill/annotation colour.
ARM = {
    "vanilla": ("vanilla", "#444444"),
    "projected": ("erase", "#c1432b"),
    "routing": ("route", "#33508c"),
    "routing2_act": ("route2 act", "#2f7d4f"),
    "routing2_grad":("route2 grad", "#b8860b"),
}
# mode display order: in-dist first, then held-out.
# main() keeps only the modes from this list that occur in at least one
# record's "by_mode" mapping; _panel() labels "run_tests" as IN and every
# other mode as held-out.
MODE_ORDER = ["run_tests", "file_marker", "stdout_marker", "sentinel", "eq_override"]
|
|
|
|
|
|
def load(paths: list[Path]) -> list[dict]:
    """Parse every per_mode_deploy.json in *paths* and log a one-line summary each.

    Args:
        paths: JSON files to read, in the order the records should be returned.

    Returns:
        The decoded JSON documents, one per path, in input order.

    Raises:
        OSError: a path cannot be read.
        json.JSONDecodeError: a file is not valid JSON.
        KeyError: a document lacks the "arm", "hack_deploy" or "solve_deploy"
            key used by the summary log line.
    """
    out = []
    for p in paths:
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
        # relying on the platform's locale default.
        d = json.loads(p.read_text(encoding="utf-8"))
        out.append(d)
        logger.info(f"{d['arm']:<14} deploy hack={d['hack_deploy']:.3f} solve={d['solve_deploy']:.3f} ({p})")
    return out
|
|
|
|
|
|
def _despine(ax):
    """Hide the top and right spines of *ax* and add a faint y-axis grid."""
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", lw=0.4, alpha=0.35)
|
|
|
|
|
|
def _panel(ax, records, modes, arms, field, title, ylabel):
    """Draw one grouped-bar panel on *ax*.

    Each entry of *modes* is one group on the x axis and each entry of *arms*
    is one bar inside every group. A bar's height is
    ``record["by_mode"][mode][field]`` taken from the first record whose
    ``"arm"`` equals the arm key; a missing mode or field yields NaN, which is
    drawn without a value annotation.
    """
    bar_w = 0.8 / len(arms)  # the whole group spans 0.8 x-units
    group_x = np.arange(len(modes))

    for slot, arm_key in enumerate(arms):
        arm_rec = next(r for r in records if r["arm"] == arm_key)
        legend_name, bar_color = ARM[arm_key]
        heights = [arm_rec["by_mode"].get(mode, {}).get(field, np.nan) for mode in modes]
        rects = ax.bar(group_x + slot * bar_w, heights, bar_w, label=legend_name, color=bar_color)
        for rect, height in zip(rects, heights):
            if np.isnan(height):
                continue  # nothing to print above an absent bar
            top_centre = (rect.get_x() + rect.get_width() / 2, height)
            ax.annotate(f"{height:.2f}", top_centre, fontsize=6,
                        ha="center", va="bottom", color=bar_color)

    # Bars are centred on group_x + slot * bar_w, so the middle of a group
    # sits at group_x + 0.4 - bar_w / 2.
    ax.set_xticks(group_x + 0.4 - bar_w / 2)
    tick_labels = []
    for mode in modes:
        scope = "IN" if mode == "run_tests" else "held-out"
        tick_labels.append(f"{mode}\n{scope}")
    ax.set_xticklabels(tick_labels, fontsize=8)

    ax.set_title(title, fontsize=10)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1.05)
    _despine(ax)
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: load per-mode deploy JSONs and write the overlay figure.

    Reads the paths given on the command line (or globs
    ``out/runs/*sub4*/per_mode_deploy.json`` when none are given), keeps one
    record per arm listed in ``ARM``, and saves a two-panel grouped-bar figure
    (deploy hack rate, deploy solve rate) to ``--out``.

    Raises:
        SystemExit: no input file was found, or none of the loaded records
            belongs to an arm listed in ``ARM``.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks so the usage examples stay readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
    if not paths:
        raise SystemExit("no per_mode_deploy.json found (run the sweep first)")
    records = load(paths)

    # dedupe arms (keep latest by file order), then order canonically
    by_arm = {r["arm"]: r for r in records}
    unknown = sorted(a for a in by_arm if a not in ARM)
    if unknown:
        # These were previously dropped without a trace; say so.
        logger.warning(f"skipping arms not listed in ARM: {unknown}")
    arms = [a for a in ARM if a in by_arm]
    if not arms:
        # _panel divides the group width by len(arms); stop before it hits zero.
        raise SystemExit(f"no known arms in {len(records)} record(s) (expected one of {list(ARM)})")
    records = [by_arm[a] for a in arms]
    modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)]

    fig, (a1, a2) = plt.subplots(1, 2, figsize=(5.5 + 1.2 * len(modes), 4.2))
    _panel(a1, records, modes, arms, "deploy_hack",
           "DEPLOY hack rate by mode (lower = better)", "deploy hack rate")
    _panel(a2, records, modes, arms, "deploy_solve",
           "DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="upper right")

    # Records without a "seed" contribute None; sort it last so a mix of None
    # and real seeds cannot raise a comparison TypeError.
    seeds = sorted({r.get("seed") for r in records}, key=lambda s: (s is None, s))
    fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {seeds}) -- "
                 f"quarantine deleted = shipped model", fontsize=11)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(args.out, dpi=140, bbox_inches="tight")
    logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|