mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 23:38:41 +08:00
f3f2c1250f
Streaming table (StepLogger) redesign per user review: - drop sprd/N/refr from the streaming view (constant / in argv / always '-') - short names: cos_pre->cin, cos_pre_s/t->cin_s/t, cos_post->cout, gradn->gn - 2 sig figs on loss; 1 on gn/lr - cin/cin_s/cin_t/cout/fired only on projecting arms (no vanilla cos_post_cf) - ADD per-mode cumulative student-hack columns hk_<rt|eq|xc|so|se|fm> on multi-mode (substrate) runs -> shows WHICH loophole classes are learnt - self-decoding legend() (only the columns this arm/mode-set shows) - end-dump auto-renders any (n,d) tuple as n/d; drops sprd/N too derisk_loopholes (#139): replace the 6 spoonfed exploit recipes with ONE generic elicit (the faithful hint already discloses the mechanism; the model must connect loophole+permission -> exploit = honest discoverability test) + an exit-interview '### Notes' section, surfaced in the log (too-vague/too-blatant hint signal). Rename ship->deploy (Gradient Routing): the route arm's quarantine-deleted eval is the DEPLOYED model; 'ablate' collided with the erase arm's gradient ablation. train.py columns + row dict + plot_dynamics + plot_route_evidence + results.py; dropped the dual-name back-compat reads. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
106 lines
4.5 KiB
Python
106 lines
4.5 KiB
Python
"""Single-run routing figure: training-time hack vs DEPLOYED-model hack.
|
|
|
|
The routing story in one plot. During training the model keeps hacking (it runs
with the quarantine knob ON, so the per-step hack_s curve climbs like vanilla).
But the model we'd actually DEPLOY has the knob deleted -- its hack rate (the
deploy-eval, measured every --eval-ablate-every steps) is what matters. If routing
works, the deploy curve sits well BELOW the training curve at preserved solve.

    uv run python scripts/plot_route_evidence.py LOG.log --out out/route_evidence.png

Reads the hack_deploy/solve_deploy columns (Gradient Routing deploy-eval).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import tyro
|
|
|
|
from projected_grpo.figs import link_latest
|
|
|
|
|
|
def _frac(tok: str) -> float | None:
|
|
if "/" in tok:
|
|
a, b = tok.split("/")
|
|
return int(a) / int(b) if int(b) else None
|
|
try:
|
|
v = float(tok)
|
|
return None if v != v else v # NaN -> None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def parse(log: Path):
    """Extract the per-step training and deploy-eval series from a run log.

    Only lines containing ``| INFO |`` are considered; the text after that
    marker is split on whitespace into cells. The header is the first such
    line whose cells include every column this function reads (``step``,
    ``hack_s?``, ``gt_s↑``, ``hack_deploy``, ``solve_deploy``). Data rows are
    the lines whose first cell is all digits and which are wide enough to
    hold every one of those columns.

    Args:
        log: path of the log file to read.

    Returns:
        A dict of parallel lists:
        ``steps`` / ``train_hack`` / ``solve_train`` have one entry per data
        row; ``deploy_step`` / ``deploy_hack`` / ``deploy_solve`` have one
        entry per data row whose ``hack_deploy`` cell parsed to a number.
        Individual values may be None where a cell held no number.

    Raises:
        ValueError: if no header line with the required columns is found.
    """
    marker = "| INFO |"
    needed = ("step", "hack_s?", "gt_s↑", "hack_deploy", "solve_deploy")

    rows = [
        line.split(marker, 1)[1].split()
        for line in log.read_text(errors="replace").splitlines()
        if marker in line
    ]

    # Identify the header by the columns actually indexed below rather than by
    # an unrelated column name, so the table may gain or lose other columns.
    hdr = next((r for r in rows if all(col in r for col in needed)), None)
    if hdr is None:
        raise ValueError(
            f"{log}: no '{marker}' header line with columns {', '.join(needed)}"
        )

    idx = {n: i for i, n in enumerate(hdr)}
    i_step, i_train = idx["step"], idx["hack_s?"]
    i_solve = idx["gt_s↑"]
    i_hdep = idx["hack_deploy"]
    i_sdep = idx["solve_deploy"]
    # A row must reach the right-most column we read, whichever one that is.
    last_col = max(i_step, i_train, i_solve, i_hdep, i_sdep)

    steps, train_hack, solve_train = [], [], []
    deploy_step, deploy_hack, deploy_solve = [], [], []
    for r in rows:
        if not r or not r[0].isdigit() or len(r) <= last_col:
            continue
        s = int(r[i_step])
        steps.append(s)
        train_hack.append(_frac(r[i_train]))
        solve_train.append(_frac(r[i_solve]))
        h = _frac(r[i_hdep])
        if h is not None:  # deploy-eval only fires every N steps
            deploy_step.append(s)
            deploy_hack.append(h)
            deploy_solve.append(_frac(r[i_sdep]))
    return dict(steps=steps, train_hack=train_hack, solve_train=solve_train,
                deploy_step=deploy_step, deploy_hack=deploy_hack, deploy_solve=deploy_solve)
|
|
|
|
|
|
def main(log: str, out: str = "out/figs/route_evidence.png") -> None:
    """Render the single-run routing figure from a training log.

    Draws three curves: the training-time hack rate (solid red), the
    deploy-eval hack rate (dashed red with markers) and the deploy-eval solve
    rate (grey). Each curve is labelled directly at the right edge with its
    final value; a curve with no final value (no deploy-eval rows, or a
    non-numeric last cell) is simply left unlabelled.

    Args:
        log: path of the training log to parse.
        out: path of the PNG to write; missing parent directories are created.

    Raises:
        ValueError: if the log yields no training rows to plot.
    """
    d = parse(Path(log))
    if not d["steps"]:
        raise ValueError(f"no training rows found in {log}")

    def last(series):
        """Final value of a series, or None when the series is empty."""
        return series[-1] if series else None

    def fmt(v):
        """Three-decimal rendering for the summary line; 'n/a' when missing."""
        return "n/a" if v is None else f"{v:.3f}"

    RED, GREY = "#b03a2e", "#9a8c7a"  # hack=red (the story); solve=muted (context)
    fig, ax = plt.subplots(figsize=(7, 4))
    # Hack in red: training (knob on, solid) vs deployed (knob off, dashed+marker).
    # The vertical gap between the two reds at the last step IS the routing effect.
    ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
    ax.plot(d["deploy_step"], d["deploy_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
    ax.plot(d["deploy_step"], d["deploy_solve"], color=GREY, lw=1.4)

    # Direct labels at the right end (name + final value baked in) -> no legend,
    # no separate value annotations. One element does both jobs (eraser test).
    x_end = d["steps"][-1]
    train_final = last(d["train_hack"])
    solve_final = last(d["deploy_solve"])
    hack_final = last(d["deploy_hack"])

    def label(y, text, color):
        if y is None:  # nothing to anchor the label to
            return
        ax.annotate(f"{text} {y:.0%}", (x_end, y), xytext=(8, 0), textcoords="offset points",
                    va="center", color=color, fontsize=9)

    label(train_final, "hack, knob ON (training)", RED)
    label(solve_final, "solve, deployed", GREY)
    label(hack_final, "hack, knob OFF (deployed)", RED)

    ax.set_ylim(-0.02, 1.0)
    ax.set_yticks([0, 0.5, 1.0])
    ax.set_yticklabels(["0", ".5", "1"])
    ax.set_xticks([0, d["deploy_step"][-1] if d["deploy_step"] else x_end])
    ax.set_xlabel("GRPO step")
    # Right margin for the direct labels; max() avoids a zero-width axis when
    # the only logged step is 0.
    ax.set_xlim(0, max(x_end, 1) * 1.5)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.spines["left"].set_bounds(0, 1)  # range-frame: axis spans the data
    ax.set_title("Routing parks the cheat in a deletable knob:\n"
                 "the model hacks while training but the deployed model does not", fontsize=10.5)
    fig.tight_layout()
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=130)
    # NOTE(review): link_latest is defined in projected_grpo.figs; presumably it
    # refreshes a "latest" pointer under docs/figs -- confirm there.
    link = link_latest(Path(out))
    print(f"wrote {out} (docs/figs latest -> {link}) "
          f"(train_hack_final={fmt(train_final)}, "
          f"deploy_hack_final={fmt(hack_final)}, deploy_solve_final={fmt(solve_final)})")
|
|
|
|
|
|
# Script entry point: hand main() to tyro, which runs it as the command line.
if __name__ == "__main__":
    tyro.cli(main)
|