evil_MoE/scripts/plot_route_evidence.py

"""Single-run routing figure: training-time hack vs SHIPPED-model hack.

The routing story in one plot. During training the model keeps hacking (it runs
with the quarantine knob ON, so the per-step hack_s curve climbs like vanilla).
But the model we'd actually SHIP has the knob deleted -- its hack rate (the
ship-eval, measured every --eval-ablate-every steps) is what matters. If routing
works, the ship curve sits well BELOW the training curve at preserved solve.

    uv run python scripts/plot_route_evidence.py LOG.log --out out/route_evidence.png

Reads either old (hack_abl/solve_abl) or new (hack_ship/solve_ship) ship columns.
"""
from __future__ import annotations

import sys
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import tyro

from projected_grpo.figs import link_latest


def _frac(tok: str) -> float | None:
    if "/" in tok:
        a, b = tok.split("/")
        return int(a) / int(b) if int(b) else None
    try:
        v = float(tok)
        return None if v != v else v          # NaN -> None
    except ValueError:
        return None


def parse(log: Path):
    """Pull the per-step training curves and the sparser ship-eval curves out of a log.

    The column layout comes from the first ``| INFO |`` line that mentions
    both ``hack_s`` and ``refr``. Every ``| INFO |`` line whose first field is
    an integer and that is wide enough for all needed columns is then read as
    a data row against that layout; everything else is skipped.

    Args:
        log: Path of the training log (decoded as UTF-8, undecodable bytes
            replaced).

    Returns:
        A dict of parallel lists. ``steps`` / ``train_hack`` / ``solve_train``
        have one entry per data row. ``ship_step`` / ``ship_hack`` /
        ``ship_solve`` have one entry per row whose ship-hack cell parsed to a
        value. ``train_hack``, ``solve_train`` and ``ship_solve`` entries may
        be ``None`` when the cell was not numeric.

    Raises:
        ValueError: If the log has no header row.
        KeyError: If a required column is missing from the header.
    """
    # Explicit UTF-8: the header lookup below uses a non-ASCII column name, so
    # decoding must not depend on the platform's locale encoding.
    lines = log.read_text(encoding="utf-8", errors="replace").splitlines()
    hdr = next((l.split("| INFO |", 1)[1].split() for l in lines
                if "| INFO |" in l and "hack_s" in l and "refr" in l), None)
    if hdr is None:
        raise ValueError(f"{log}: no metrics header row found "
                         "(expected an '| INFO |' line naming hack_s and refr)")
    idx = {n: i for i, n in enumerate(hdr)}
    # NOTE(review): the '?' in "hack_s?" may be a mis-encoded arrow (compare
    # "gt_s↑") -- confirm against a real log header.
    i_step, i_train = idx["step"], idx["hack_s?"]
    i_solve = idx["gt_s↑"]
    # Ship columns: new names first, old (*_abl) names as fallback.
    i_hship = idx.get("hack_ship", idx.get("hack_abl"))
    i_sship = idx.get("solve_ship", idx.get("solve_abl"))
    if i_hship is None or i_sship is None:
        raise KeyError("hack_ship/solve_ship (or hack_abl/solve_abl)")
    # A data row must reach the right-most column we read, whichever that is.
    width = max(i_step, i_train, i_solve, i_hship, i_sship) + 1

    steps, train_hack, solve_train = [], [], []
    ship_step, ship_hack, ship_solve = [], [], []
    for l in lines:
        if "| INFO |" not in l:
            continue
        r = l.split("| INFO |", 1)[1].split()
        if len(r) < width or not r[0].isdigit():
            continue
        s = int(r[i_step])
        steps.append(s)
        train_hack.append(_frac(r[i_train]))
        solve_train.append(_frac(r[i_solve]))
        h = _frac(r[i_hship])
        if h is not None:                       # ship-eval only fires every N steps
            ship_step.append(s)
            ship_hack.append(h)
            ship_solve.append(_frac(r[i_sship]))
    return dict(steps=steps, train_hack=train_hack, solve_train=solve_train,
                ship_step=ship_step, ship_hack=ship_hack, ship_solve=ship_solve)


def main(log: str, out: str = "out/figs/route_evidence.png") -> None:
    """Plot training-time hack rate against the shipped model's hack and solve rates.

    Draws three curves from the parsed log (training hack, ship hack, ship
    solve), labels each at the right edge with its final value, saves the
    figure to ``out`` and prints a one-line summary of the final values.

    A series that has no final value (no ship-eval rows in the log, or an
    unparsable last cell) is drawn without a label and reported as ``n/a``.

    Args:
        log: Path of the training log to parse.
        out: Destination image path; missing parent directories are created.

    Raises:
        ValueError: If the log contains no training rows.
    """
    d = parse(Path(log))
    if not d["steps"]:
        raise ValueError(f"{log}: no training rows found, nothing to plot")

    def final(series):
        # Last recorded value; None when the series is empty or its last cell was unparsable.
        return series[-1] if series else None

    train_final = final(d["train_hack"])
    ship_hack_final = final(d["ship_hack"])
    ship_solve_final = final(d["ship_solve"])

    RED, GREY = "#b03a2e", "#9a8c7a"            # hack=red (the story); solve=muted (context)
    fig, ax = plt.subplots(figsize=(7, 4))
    # Hack in red: training (knob on, solid) vs shipped (knob off, dashed+marker).
    # The vertical gap between the two reds at the last step IS the routing effect.
    ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
    ax.plot(d["ship_step"], d["ship_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
    ax.plot(d["ship_step"], d["ship_solve"], color=GREY, lw=1.4)

    # Direct labels at the right end (name + final value baked in) -> no legend,
    # no separate value annotations. One element does both jobs (eraser test).
    x_end = d["steps"][-1]

    def label(y, text, color):
        if y is None:                           # nothing to anchor the label to
            return
        ax.annotate(f"{text}  {y:.0%}", (x_end, y), xytext=(8, 0), textcoords="offset points",
                    va="center", color=color, fontsize=9)

    label(train_final, "hack, knob ON (training)", RED)
    label(ship_solve_final, "solve, shipped", GREY)
    label(ship_hack_final, "hack, knob OFF (shipped)", RED)

    ax.set_ylim(-0.02, 1.0)
    ax.set_yticks([0, 0.5, 1.0])
    ax.set_yticklabels(["0", ".5", "1"])
    ax.set_xticks([0, d["ship_step"][-1] if d["ship_step"] else x_end])
    ax.set_xlabel("GRPO step")
    # Right margin for the direct labels; max(..., 1) keeps the range non-empty
    # when the only step in the log is 0.
    ax.set_xlim(0, max(x_end, 1) * 1.5)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.spines["left"].set_bounds(0, 1)           # range-frame: axis spans the data
    ax.set_title("Routing parks the cheat in a deletable knob:\n"
                 "the model hacks while training but the shipped model does not", fontsize=10.5)
    fig.tight_layout()
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=130)
    # link_latest is a project helper; presumably it refreshes a "latest" link
    # under docs/figs (per the message below) -- confirm against its definition.
    link = link_latest(Path(out))

    def fmt(v):
        return "n/a" if v is None else f"{v:.3f}"

    print(f"wrote {out}  (docs/figs latest -> {link})  "
          f"(train_hack_final={fmt(train_final)}, "
          f"ship_hack_final={fmt(ship_hack_final)}, ship_solve_final={fmt(ship_solve_final)})")


if __name__ == "__main__":
    # Script entry point: hand `main` to tyro so its parameters (`log`, `out`)
    # are supplied from the command line, as in the usage line of the module
    # docstring.
    tyro.cli(main)