diff --git a/out/route_evidence_s41.png b/out/route_evidence_s41.png
index 4b6cef7..4362555 100644
Binary files a/out/route_evidence_s41.png and b/out/route_evidence_s41.png differ
diff --git a/scripts/plot_route_evidence.py b/scripts/plot_route_evidence.py
index 1837641..253bd57 100644
--- a/scripts/plot_route_evidence.py
+++ b/scripts/plot_route_evidence.py
@@ -62,24 +62,36 @@ def parse(log: Path):
 
 def main(log: str, out: str = "out/route_evidence.png") -> None:
     d = parse(Path(log))
-    fig, ax = plt.subplots(figsize=(7, 4.2))
-    ax.plot(d["steps"], d["train_hack"], color="#c0392b", lw=2,
-            label="hack — training (quarantine knob ON)")
-    ax.plot(d["ship_step"], d["ship_hack"], color="#c0392b", lw=2, ls="--",
-            marker="o", label="hack — SHIPPED (knob deleted)")
-    ax.plot(d["ship_step"], d["ship_solve"], color="#2f7d4f", lw=2,
-            marker="s", label="solve — shipped")
-    if d["ship_hack"]:
-        ax.annotate(f"ship {d['ship_hack'][-1]:.0%}", (d["ship_step"][-1], d["ship_hack"][-1]),
-                    textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
-    ax.annotate(f"train {d['train_hack'][-1]:.0%}", (d["steps"][-1], d["train_hack"][-1]),
-                textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
-    ax.set_xlabel("GRPO step"); ax.set_ylabel("rate")
-    ax.set_ylim(-0.03, 1.03)
-    ax.set_title("Gradient routing: model hacks while training, but the\n"
-                 "shipped model (cheat-knob deleted) does not", fontsize=11)
-    ax.legend(loc="center left", fontsize=8, framealpha=0.9)
-    ax.grid(alpha=0.25)
+    RED, GREY = "#b03a2e", "#9a8c7a"  # hack=red (the story); solve=muted (context)
+    fig, ax = plt.subplots(figsize=(7, 4))
+    # Hack in red: training (knob on, solid) vs shipped (knob off, dashed+marker).
+    # The vertical gap between the two reds at the last step IS the routing effect.
+    ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
+    ax.plot(d["ship_step"], d["ship_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
+    ax.plot(d["ship_step"], d["ship_solve"], color=GREY, lw=1.4)
+
+    # Direct labels at the right end (name + final value baked in) -> no legend,
+    # no separate value annotations. One element does both jobs (eraser test).
+    x_end = d["steps"][-1]
+    def label(y, text, color):
+        ax.annotate(text, (x_end, y), xytext=(8, 0), textcoords="offset points",
+                    va="center", color=color, fontsize=9)
+    label(d["train_hack"][-1], f"hack, knob ON (training) {d['train_hack'][-1]:.0%}", RED)
+    # The ship series can be empty (the x-tick line below allows for it): guard [-1].
+    if d["ship_hack"]:
+        label(d["ship_solve"][-1], f"solve, shipped {d['ship_solve'][-1]:.0%}", GREY)
+        label(d["ship_hack"][-1], f"hack, knob OFF (shipped) {d['ship_hack'][-1]:.0%}", RED)
+
+    ax.set_ylim(-0.02, 1.0)
+    ax.set_yticks([0, 0.5, 1.0]); ax.set_yticklabels(["0", ".5", "1"])
+    ax.set_xticks([0, d["ship_step"][-1] if d["ship_step"] else x_end])
+    ax.set_xlabel("GRPO step")
+    ax.set_xlim(0, x_end * 1.5)  # right margin for the direct labels
+    for side in ("top", "right"):
+        ax.spines[side].set_visible(False)
+    ax.spines["left"].set_bounds(0, 1)  # range-frame: axis spans the data
+    ax.set_title("Routing parks the cheat in a deletable knob:\n"
+                 "the model hacks while training but the shipped model does not", fontsize=10.5)
     fig.tight_layout()
     Path(out).parent.mkdir(parents=True, exist_ok=True)
     fig.savefig(out, dpi=130)