plot(tufte): route-evidence figure — drop legend/grid/box, direct value labels

Eraser test: direct end-labels carry name+value (no legend, no separate annotations, no gridlines). Layering: hack=red dominates (the story), solve=muted grey recedes. Range-frame left spine, minimal ticks (0/.5/1, 0/19). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 19:31:11 +08:00 · 2026-05-30 03:22:35 +00:00
parent c7e1500241
commit 4fb7b59548
2 changed files with 28 additions and 18 deletions
@@ -62,24 +62,34 @@ def parse(log: Path):

 def main(log: str, out: str = "out/route_evidence.png") -> None:
    d = parse(Path(log))
-    fig, ax = plt.subplots(figsize=(7, 4.2))
-    ax.plot(d["steps"], d["train_hack"], color="#c0392b", lw=2,
-            label="hack — training (quarantine knob ON)")
-    ax.plot(d["ship_step"], d["ship_hack"], color="#c0392b", lw=2, ls="--",
-            marker="o", label="hack — SHIPPED (knob deleted)")
-    ax.plot(d["ship_step"], d["ship_solve"], color="#2f7d4f", lw=2,
-            marker="s", label="solve — shipped")
-    if d["ship_hack"]:
-        ax.annotate(f"ship {d['ship_hack'][-1]:.0%}", (d["ship_step"][-1], d["ship_hack"][-1]),
-                    textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
-        ax.annotate(f"train {d['train_hack'][-1]:.0%}", (d["steps"][-1], d["train_hack"][-1]),
-                    textcoords="offset points", xytext=(6, 0), color="#c0392b", fontsize=9)
-    ax.set_xlabel("GRPO step"); ax.set_ylabel("rate")
-    ax.set_ylim(-0.03, 1.03)
-    ax.set_title("Gradient routing: model hacks while training, but the\n"
-                 "shipped model (cheat-knob deleted) does not", fontsize=11)
-    ax.legend(loc="center left", fontsize=8, framealpha=0.9)
-    ax.grid(alpha=0.25)
+    RED, GREY = "#b03a2e", "#9a8c7a"            # hack=red (the story); solve=muted (context)
+    fig, ax = plt.subplots(figsize=(7, 4))
+    # Hack in red: training (knob on, solid) vs shipped (knob off, dashed+marker).
+    # The vertical gap between the two reds at the last step IS the routing effect.
+    ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
+    ax.plot(d["ship_step"], d["ship_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
+    ax.plot(d["ship_step"], d["ship_solve"], color=GREY, lw=1.4)
+
+    # Direct labels at the right end (name + final value baked in) -> no legend,
+    # no separate value annotations. One element does both jobs (eraser test).
+    x_end = d["steps"][-1]
+    def label(y, text, color):
+        ax.annotate(text, (x_end, y), xytext=(8, 0), textcoords="offset points",
+                    va="center", color=color, fontsize=9)
+    label(d["train_hack"][-1], f"hack, knob ON (training)  {d['train_hack'][-1]:.0%}", RED)
+    label(d["ship_solve"][-1], f"solve, shipped  {d['ship_solve'][-1]:.0%}", GREY)
+    label(d["ship_hack"][-1],  f"hack, knob OFF (shipped)  {d['ship_hack'][-1]:.0%}", RED)
+
+    ax.set_ylim(-0.02, 1.0)
+    ax.set_yticks([0, 0.5, 1.0]); ax.set_yticklabels(["0", ".5", "1"])
+    ax.set_xticks([0, d["ship_step"][-1] if d["ship_step"] else x_end])
+    ax.set_xlabel("GRPO step")
+    ax.set_xlim(0, x_end * 1.5)                  # right margin for the direct labels
+    for side in ("top", "right"):
+        ax.spines[side].set_visible(False)
+    ax.spines["left"].set_bounds(0, 1)           # range-frame: axis spans the data
+    ax.set_title("Routing parks the cheat in a deletable knob:\n"
+                 "the model hacks while training but the shipped model does not", fontsize=10.5)
    fig.tight_layout()
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=130)