plot(tufte): route-evidence figure — drop legend/grid/box, direct value labels

Eraser test: direct end-labels carry name+value (no legend, no separate
annotations, no gridlines). Layering: hack=red dominates (the story), solve=
muted grey recedes. Range-frame left spine, minimal ticks (0/.5/1, 0/19).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
wassname
2026-05-30 03:22:35 +00:00
parent c7e1500241
commit 4fb7b59548
2 changed files with 28 additions and 18 deletions
Binary file not shown.

Before

Width:  |  Height:  |  Size: 71 KiB

After

Width:  |  Height:  |  Size: 58 KiB

+28 -18
View File
@@ -62,24 +62,34 @@ def parse(log: Path):
# Render the route-evidence figure: parse the log at `log` with parse(), plot
# the hack/solve series it returns, and save the figure to `out` (the parent
# directory of `out` is created if missing).
#
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were lost. The first group of plotting statements (through
# ax.grid) appears to be the REMOVED revision and the group starting at
# `RED, GREY = ...` the ADDED revision -- confirm against the repository before
# treating this as one runnable function body. The definition may also
# continue past the last line shown here.
def main(log: str, out: str = "out/route_evidence.png") -> None:
# `d` is indexed below with the keys "steps", "train_hack", "ship_step",
# "ship_hack" and "ship_solve"; each value is used as a plottable sequence.
d = parse(Path(log))
# --- presumably the pre-change revision (legend, grid, separate annotations) ---
fig, ax = plt.subplots(figsize=(7, 4.2))
ax.plot(d["steps"], d["train_hack"], color="#c0392b", lw=2,
label="hack — training (quarantine knob ON)")
ax.plot(d["ship_step"], d["ship_hack"], color="#c0392b", lw=2, ls="--",
marker="o", label="hack — SHIPPED (knob deleted)")
ax.plot(d["ship_step"], d["ship_solve"], color="#2f7d4f", lw=2,
marker="s", label="solve — shipped")
# This revision only annotated the shipped value when ship data existed.
if d["ship_hack"]:
ax.annotate(f"ship {d['ship_hack'][-1]:.0%}", (d["ship_step"][-1], d["ship_hack"][-1]),
textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
ax.annotate(f"train {d['train_hack'][-1]:.0%}", (d["steps"][-1], d["train_hack"][-1]),
textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
ax.set_xlabel("GRPO step"); ax.set_ylabel("rate")
ax.set_ylim(-0.03, 1.03)
ax.set_title("Gradient routing: model hacks while training, but the\n"
"shipped model (cheat-knob deleted) does not", fontsize=11)
ax.legend(loc="center left", fontsize=8, framealpha=0.9)
ax.grid(alpha=0.25)
# --- presumably the post-change revision (direct end labels, no legend/grid) ---
RED, GREY = "#b03a2e", "#9a8c7a" # hack=red (the story); solve=muted (context)
fig, ax = plt.subplots(figsize=(7, 4))
# Hack in red: training (knob on, solid) vs shipped (knob off, dashed+marker).
# The vertical gap between the two reds at the last step IS the routing effect.
ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
ax.plot(d["ship_step"], d["ship_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
ax.plot(d["ship_step"], d["ship_solve"], color=GREY, lw=1.4)
# Direct labels at the right end (name + final value baked in) -> no legend,
# no separate value annotations. One element does both jobs (eraser test).
# NOTE(review): `[-1]` here is unguarded; if parse() can return an empty
# "steps" sequence this raises IndexError -- confirm parse()'s guarantees.
x_end = d["steps"][-1]
# Helper: write `text` just right of (x_end, y), vertically centred, in `color`.
# NOTE(review): every label, including the shipped series, is anchored at the
# last TRAINING step (x_end), not at d["ship_step"][-1]; the labels sit away
# from their line ends if those two steps differ.
def label(y, text, color):
ax.annotate(text, (x_end, y), xytext=(8, 0), textcoords="offset points",
va="center", color=color, fontsize=9)
# NOTE(review): unlike the `if d["ship_hack"]:` guard in the earlier revision
# and the `if d["ship_step"]` check in set_xticks below, the three label calls
# index `[-1]` unconditionally; a log with no ship evaluations would presumably
# raise IndexError here -- verify whether that input is possible.
label(d["train_hack"][-1], f"hack, knob ON (training) {d['train_hack'][-1]:.0%}", RED)
label(d["ship_solve"][-1], f"solve, shipped {d['ship_solve'][-1]:.0%}", GREY)
label(d["ship_hack"][-1], f"hack, knob OFF (shipped) {d['ship_hack'][-1]:.0%}", RED)
# Upper limit is exactly 1.0 here (the earlier revision used 1.03).
ax.set_ylim(-0.02, 1.0)
ax.set_yticks([0, 0.5, 1.0]); ax.set_yticklabels(["0", ".5", "1"])
# Two x ticks only: 0 and the last ship step, falling back to x_end when
# d["ship_step"] is empty.
ax.set_xticks([0, d["ship_step"][-1] if d["ship_step"] else x_end])
ax.set_xlabel("GRPO step")
ax.set_xlim(0, x_end * 1.5) # right margin for the direct labels
# Hide the top and right spines; the left spine is limited to the 0..1 range.
for side in ("top", "right"):
ax.spines[side].set_visible(False)
ax.spines["left"].set_bounds(0, 1) # range-frame: axis spans the data
ax.set_title("Routing parks the cheat in a deletable knob:\n"
"the model hacks while training but the shipped model does not", fontsize=10.5)
fig.tight_layout()
# Create the output directory (and any missing parents) before saving.
Path(out).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out, dpi=130)