mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-30 08:44:53 +08:00
3004b23f74
Visualizes #182: model hacks ~0.9 during training (quarantine knob on) but the shipped model (knob deleted) stays ~0.25 at preserved solve ~0.5. The train-vs-ship gap is the routing benefit. Reads old(hack_abl)+new(hack_ship) ship cols. out/route_evidence_s41.png committed (-f, gitignored dir). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
92 lines
3.8 KiB
Python
92 lines
3.8 KiB
Python
"""Single-run routing figure: training-time hack vs SHIPPED-model hack.

The routing story in one plot. During training the model keeps hacking (it runs
with the quarantine knob ON, so the per-step hack_s curve climbs like vanilla).
But the model we'd actually SHIP has the knob deleted -- its hack rate (the
ship-eval, measured every --eval-ablate-every steps) is what matters. If routing
works, the ship curve sits well BELOW the training curve at preserved solve.

    uv run python scripts/plot_route_evidence.py LOG.log --out out/route_evidence.png

Reads either old (hack_abl/solve_abl) or new (hack_ship/solve_ship) ship columns.
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import tyro
|
|
|
|
|
|
def _frac(tok: str) -> float | None:
|
|
if "/" in tok:
|
|
a, b = tok.split("/")
|
|
return int(a) / int(b) if int(b) else None
|
|
try:
|
|
v = float(tok)
|
|
return None if v != v else v # NaN -> None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def parse(log: Path):
    """Extract per-step training and ship-eval series from a training log.

    Only lines containing ``| INFO |`` are considered; the text after that
    marker is split on whitespace into cells. The first such line that contains
    both ``hack_s`` and ``refr`` is taken as the column header. Every other
    INFO line that is long enough and whose step cell is all digits is a data
    row.

    Args:
        log: Path to the log file; undecodable bytes are replaced.

    Returns:
        A dict of parallel lists. ``steps``, ``train_hack`` and ``solve_train``
        have one entry per data row. ``ship_step``, ``ship_hack`` and
        ``ship_solve`` have one entry per row whose ship-hack cell parsed to a
        number (the ship columns are only filled on some steps). Rate entries
        other than ``ship_hack`` may be ``None`` when a cell did not parse.

    Raises:
        ValueError: No header line was found in the log.
        KeyError: A required column is missing from the header.
    """
    marker = "| INFO |"
    info_lines = [ln for ln in log.read_text(errors="replace").splitlines()
                  if marker in ln]

    hdr = next((ln.split(marker, 1)[1].split() for ln in info_lines
                if "hack_s" in ln and "refr" in ln), None)
    if hdr is None:
        raise ValueError(
            f"no metrics header (an INFO line with 'hack_s' and 'refr') in {log}")

    idx = {name: i for i, name in enumerate(hdr)}
    i_step, i_train = idx["step"], idx["hack_s?"]
    i_solve = idx["gt_s↑"]
    # Ship columns were renamed; accept the new name first, then the old one.
    # (Nested .get rather than `or`: a column index of 0 is falsy but valid.)
    i_hship = idx.get("hack_ship", idx.get("hack_abl"))
    i_sship = idx.get("solve_ship", idx.get("solve_abl"))
    if i_hship is None or i_sship is None:
        raise KeyError(
            "log header has no ship-eval columns "
            "(hack_ship/solve_ship or hack_abl/solve_abl)")

    # A row must be long enough for every column we read, not just one of them.
    last_needed = max(i_step, i_train, i_solve, i_hship, i_sship)

    steps, train_hack, solve_train = [], [], []
    ship_step, ship_hack, ship_solve = [], [], []
    for ln in info_lines:
        r = ln.split(marker, 1)[1].split()
        # Skips the header itself, short rows, and free-form INFO messages.
        if len(r) <= last_needed or not r[i_step].isdigit():
            continue
        s = int(r[i_step])
        steps.append(s)
        train_hack.append(_frac(r[i_train]))
        solve_train.append(_frac(r[i_solve]))
        h = _frac(r[i_hship])
        if h is not None:  # ship-eval only fires every N steps
            ship_step.append(s)
            ship_hack.append(h)
            ship_solve.append(_frac(r[i_sship]))

    return dict(steps=steps, train_hack=train_hack, solve_train=solve_train,
                ship_step=ship_step, ship_hack=ship_hack, ship_solve=ship_solve)
|
|
|
|
|
|
def main(log: str, out: str = "out/route_evidence.png") -> None:
    """Plot training-time hack rate against the shipped model's hack/solve rates.

    Draws three curves from the parsed log (training hack, shipped hack,
    shipped solve), annotates the final hack values, saves the figure and
    prints a one-line summary.

    Args:
        log: Path to the training log to parse.
        out: Destination image path; missing parent directories are created.
    """
    d = parse(Path(log))

    def final(series):
        """Last entry of *series*, or None when the series is empty."""
        return series[-1] if series else None

    def fmt(value):
        """Three-decimal rendering that tolerates a missing value."""
        return "n/a" if value is None else f"{value:.3f}"

    train_final = final(d["train_hack"])
    ship_final = final(d["ship_hack"])
    ship_solve_final = final(d["ship_solve"])

    hack_color, solve_color = "#c0392b", "#2f7d4f"
    fig, ax = plt.subplots(figsize=(7, 4.2))
    ax.plot(d["steps"], d["train_hack"], color=hack_color, lw=2,
            label="hack — training (quarantine knob ON)")
    ax.plot(d["ship_step"], d["ship_hack"], color=hack_color, lw=2, ls="--",
            marker="o", label="hack — SHIPPED (knob deleted)")
    ax.plot(d["ship_step"], d["ship_solve"], color=solve_color, lw=2,
            marker="s", label="solve — shipped")

    # Label the right-hand end of each hack curve, but only when that curve
    # actually has a final numeric value to show.
    if ship_final is not None:
        ax.annotate(f"ship {ship_final:.0%}", (d["ship_step"][-1], ship_final),
                    textcoords="offset points", xytext=(6, 0),
                    color=hack_color, fontsize=9)
    if train_final is not None:
        ax.annotate(f"train {train_final:.0%}", (d["steps"][-1], train_final),
                    textcoords="offset points", xytext=(6, 0),
                    color=hack_color, fontsize=9)

    ax.set_xlabel("GRPO step")
    ax.set_ylabel("rate")
    ax.set_ylim(-0.03, 1.03)
    ax.set_title("Gradient routing: model hacks while training, but the\n"
                 "shipped model (cheat-knob deleted) does not", fontsize=11)
    ax.legend(loc="center left", fontsize=8, framealpha=0.9)
    ax.grid(alpha=0.25)
    fig.tight_layout()

    Path(out).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=130)
    plt.close(fig)  # release the figure once it is on disk

    # Summary must not crash after the file was written just because the log
    # had no ship-eval rows or an unparsed final cell.
    print(f"wrote {out} (train_hack_final={fmt(train_final)}, "
          f"ship_hack_final={fmt(ship_final)}, "
          f"ship_solve_final={fmt(ship_solve_final)})")
|
|
|
|
|
|
# Script entry point: hand main() to tyro, which exposes it as the
# command-line interface when this file is run directly.
if __name__ == "__main__":
    tyro.cli(main)
|