evil_MoE/scripts/plot_floor_ceiling.py

"""Floor-to-ceiling method comparison: the keynote figure.

Two stages so the data is inspectable before it's drawn:
  1. build  -> out/figs/floor_ceiling.csv   (one row per arm/anchor, with SOURCE and STATUS
                columns; every provisional/missing value is flagged, not silently filled)
  2. plot   -> out/figs/floor_ceiling.{pdf,png} (bars) + out/figs/floor_ceiling_abs.{pdf,png} (scatter)

Run `uv run python -m scripts.plot_floor_ceiling` to do both; it prints a TODO/FIXME summary
of any provisional or missing cells before plotting.

THE GOAL: place each gradient-routing arm on a floor->ceiling scale so "how much of the
achievable range did it capture" is read at a glance, and show that the quarantine (knob)
is what removes the hack, not a train/test artifact.

TWO METRICS, two anchor pairs (right/down = better):
  hack removed    = (vanilla_hack - arm_hack) / vanilla_hack            1.0 = no hack
  solve recovered = (arm_solve - base_solve)   / (ceiling - base_solve) 1.0 = no-loophole ceiling

TWO VIEWS of the same arms:
  A. normalized floor->ceiling bars, HEADLINE deploy (knob-off, test n=119, recency-clean).
     Source per arm: out/runs/<run>/deploy_test.json.
  B. the KNOB effect: arrow knob-ON -> knob-OFF on the SAME held-out val split (n=32), so it
     isolates the quarantine from the train/test memorization gap. Source per arm:
     out/runs/<run>/eval_curve.jsonl, where the file's `train_*`/`deploy_*` prefixes denote
     KNOB STATE (on/off), not the problem set (always val here). L5 = mean of last 5 evals.

DATA GAPS (see STATUS column in the csv):
  - solve ceiling: provisional = paper 0.223 until job 24 (out/runs/*noloophole*) lands. FIXME.
  - prog_wide arm uses contaminated pairs; job 28 (prog_wide_clean) will replace it. TODO.
  - full-env (paper-scale) panel: no method runs exist, only paper anchors. Out of scope here.
"""
from __future__ import annotations
import json
from pathlib import Path

import polars as pl
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Shared palette (hex colours).
RED = "#c0392b"
GREEN = "#1e8449"
GREY = "#9aa0a6"

# Where the runs are read from, and where the csv and figures are written.
RUNS = Path("out/runs")
OUT = Path("out/figs")
CSV = OUT / "floor_ceiling.csv"

# Ariahw et al. no-loophole solve -- provisional fast-env ceiling.
PAPER_CEILING = 0.223

# Arms in display order, as (label, run tag, status). _find_run matches the tag against the
# END of the run directory name (seed-43 fast runs). main() reports every row whose status
# does not start with "ok".
ARMS = [
    ("routeV per-token", "_dir6_routeV_pertoken_s43", "ok"),
    ("routeV authored", "_dir8_routeV_authored_perroll_s43", "ok"),
    ("routeV prog_wide", "_dir6_routeV_s43", "TODO: contaminated pairs -> job 28 prog_wide_clean"),
    ("routeV random-V", "_dir6_routeV_random_s43", "ok (directionality control)"),
    ("vanilla GRPO", "_dir8_vanilla_s43", "ok (defines hack-worst anchor)"),
]


def _find_run(tag: str) -> Path:
    """Return the run directory under RUNS whose name ends with *tag*.

    Only entries that contain a ``deploy_test.json`` qualify. When several qualify, the one
    that sorts last is returned (intended as "latest timestamp wins" -- presumably run
    directory names start with a timestamp; confirm against the run naming scheme).

    Raises:
        FileNotFoundError: if no qualifying run directory is found.
    """
    matches = []
    for run_dir in RUNS.iterdir():
        if run_dir.name.endswith(tag) and (run_dir / "deploy_test.json").exists():
            matches.append(run_dir)
    if not matches:
        raise FileNotFoundError(f"no run dir ending '{tag}' with a deploy_test.json")
    return max(matches)


def _l5(rows: list[dict], k: str) -> float:
    v = [r[k] for r in rows[-5:]]
    return sum(v) / len(v)


# ── stage 1: build the inspectable csv ──────────────────────────────────────
def build_csv() -> pl.DataFrame:
    """Collect every arm and anchor into one table, write it to CSV, and return it.

    Rows, by the ``kind`` column:
      * ``method``         -- one per entry in ARMS. ``*_deploy`` values come from the run's
                              deploy_test.json; ``*_on`` / ``*_off`` are the mean of the last
                              five eval_curve.jsonl records for the ``train_*`` / ``deploy_*``
                              keys respectively.
      * ``anchor_floor``   -- deploy_test.json of the ``_dir8_baseline_s43`` run.
      * ``anchor_ceiling`` -- the ``*noloophole*`` run that sorts last if any exists, otherwise
                              PAPER_CEILING with a status starting with "FIXME".

    Side effects: creates OUT if needed and writes the table to CSV.

    Raises:
        FileNotFoundError: if an arm or the baseline run cannot be located (see _find_run),
            or a located run has no eval_curve.jsonl.
    """
    rows = []
    for label, tag, status in ARMS:
        run = _find_run(tag)
        dep = json.loads((run / "deploy_test.json").read_text())
        # One JSON object per line; skip blank lines instead of letting json.loads fail on them.
        curve_lines = (run / "eval_curve.jsonl").read_text().splitlines()
        ev = [json.loads(line) for line in curve_lines if line.strip()]
        rows.append(dict(
            label=label, kind="method",
            hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4),
            hack_on=round(_l5(ev, "train_hack"), 4),  hack_off=round(_l5(ev, "deploy_hack"), 4),
            solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4),
            source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status))

    base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text())
    rows.append(dict(label="base (floor)", kind="anchor_floor",
                     hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4),
                     hack_on=None, hack_off=None, solve_on=None, solve_off=None,
                     source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)"))

    # Choose the ceiling run the way _find_run chooses arms (sort, take the last) so the
    # result is deterministic when several *noloophole* runs exist; glob order is not.
    ceil_paths = sorted(RUNS.glob("*noloophole*/deploy_test.json"))
    if ceil_paths:
        ceil_path = ceil_paths[-1]
        ceil_solve, status = round(json.loads(ceil_path.read_text())["deploy_solve"], 4), "ok"
        source = f"{ceil_path.parent.name}/deploy_test.json"
    else:
        # The "FIXME" prefix is what _anchors uses to mark the ceiling as provisional.
        ceil_solve = PAPER_CEILING
        status = f"FIXME: PROVISIONAL paper {PAPER_CEILING} -- awaiting job 24 (no-loophole ceiling)"
        source = "Ariahw et al. 2025 (paper), NOT our run"
    rows.append(dict(label="ceiling", kind="anchor_ceiling",
                     hack_deploy=0.0, solve_deploy=ceil_solve,
                     hack_on=None, hack_off=None, solve_on=None, solve_off=None,
                     source=source, status=status))

    df = pl.DataFrame(rows)
    OUT.mkdir(parents=True, exist_ok=True)
    df.write_csv(CSV)
    return df


# ── stage 2: plot from the csv ──────────────────────────────────────────────
# The reference paper (Ariahw et al. 2025) IS the axis: its No-Intervention run (hack ~79%) is
# the floor and its no-loophole RL-Baseline is the ceiling. So the comparison-to-paper is "how
# far up the paper's own floor->ceiling range did our no-cheat method climb." We do NOT plot the
# paper's intervention bars, for two different reasons (the disqualifier is oracle/ground-truth-
# LABEL leakage, NOT "a monitor ran"):
#   - GT monitor (+70/90% variants) and the probe (trained on oracle-labelled in-env RH data,
#     footnote 12) both need the env oracle to exist -- they cannot be built on a new env with no
#     oracle, so they are cheats for our transfer claim.
#   - LLM judge is the legitimate external peer (generic model, no oracle, ~50% acc yet protective
#     via penalty) -- but it has no clean single fast-env number on our axis (paper figures only,
#     different training regime), so we have no honest point to plot for it.
#   - inoculation prompting (no monitor) has no clean number either (prose: incomplete, high-
#     variance -- some seeds ~0 hack, some ~full hack).
# So: nothing with a comparable single number to plot; the paper enters only as floor/ceiling.
GOLD, DARK = "#c8920a", "#3a3a3a"   # colours of the per-token ("best") arm and the random-V control


def _anchors(df: pl.DataFrame) -> dict:
    """Pull the normalisation anchors out of the table.

    Returns a dict with:
      base_solve   -- ``solve_deploy`` of the ``anchor_floor`` row
      vanilla_hack -- ``hack_deploy`` of the row labelled "vanilla GRPO"
      ceiling      -- ``solve_deploy`` of the ``anchor_ceiling`` row
      provisional  -- True when the ceiling row's status starts with "FIXME"
    """
    def first(column: str, value: str, field: str):
        # First cell of *field* among the rows where *column* equals *value*.
        return df.filter(pl.col(column) == value)[field][0]

    ceiling_status = first("kind", "anchor_ceiling", "status")
    base_solve = first("kind", "anchor_floor", "solve_deploy")
    vanilla_hack = first("label", "vanilla GRPO", "hack_deploy")
    ceiling = first("kind", "anchor_ceiling", "solve_deploy")
    return {
        "base_solve": base_solve,
        "vanilla_hack": vanilla_hack,
        "ceiling": ceiling,
        "provisional": ceiling_status.startswith("FIXME"),
    }


def _bars(ax, rows, key, raws, title, xlabel, xlo):
    """Draw one floor->ceiling panel of horizontal bars on *ax*.

    Args:
        ax: matplotlib axes to draw on.
        rows: sequence of (label, value, raw_text, colour); value is on the normalised scale
            where 0 is the floor and 1.0 the ceiling.
        key, raws: accepted but not used by this function.
        title: panel title, left-aligned.
        xlabel: x-axis label.
        xlo: left x-limit; when negative, the percentage at each bar tip is printed signed.
    """
    pct_spec = "+.0f" if xlo < 0 else ".0f"
    for row_idx, (_label, value, raw_text, colour) in enumerate(rows):
        hatch = "//" if colour == GREY else None          # grey = approx reference
        ax.barh(row_idx, value, height=0.55, color=colour, alpha=0.9,
                hatch=hatch, edgecolor="white")
        caption = format(value * 100, pct_spec) + "%"
        if raw_text:
            caption += f"  ({raw_text})"
        # Put the caption just past the bar tip, on whichever side the bar extends to.
        if value >= 0:
            text_x, align = value + 0.02, "left"
        else:
            text_x, align = value + -0.02, "right"
        ax.text(text_x, row_idx, caption, va="center", ha=align, fontsize=8.5, color=colour)

    ax.axvline(0, color=GREY, lw=1.0)            # floor (labelled in xlabel)
    ax.axvline(1.0, color=GREY, lw=1.0, ls=":")  # ceiling
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels([row[0] for row in rows], fontsize=8.5)
    ax.set_xlim(xlo, 1.4)                        # hi=1.4 keeps the ceiling line (1.0) in view
    ax.set_xlabel(xlabel, fontsize=8.5)
    ax.set_title(title, fontsize=10, loc="left")
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)
    ax.tick_params(left=False)

def plot(df: pl.DataFrame) -> None:
    """Draw the two-panel normalised bar figure and save it as floor_ceiling.{pdf,png} in OUT.

    Left panel:  hack suppressed = (vanilla_hack - arm_hack) / vanilla_hack
    Right panel: solve gained    = (arm_solve - base_solve) / (ceiling - base_solve)

    Three arms are drawn: vanilla GRPO, routeV random-V and routeV per-token. The base and
    ceiling values quoted in the axis label are taken from *df*, so the label cannot drift
    from the plotted data.

    Args:
        df: table as produced by build_csv.

    Raises:
        ValueError: if the vanilla hack rate is 0 or the ceiling equals the base solve rate,
            because the corresponding normalised metric is then undefined.
    """
    a = _anchors(df)
    base, vh, ceil = a["base_solve"], a["vanilla_hack"], a["ceiling"]
    # Both metrics divide by an anchor range; explain the problem instead of surfacing a
    # bare ZeroDivisionError from deep inside the row construction.
    if vh == 0:
        raise ValueError("vanilla GRPO hack rate is 0: 'hack suppressed' is undefined")
    if ceil == base:
        raise ValueError("ceiling equals the base solve rate: 'solve gained' is undefined")

    def pick(lab):
        return df.filter(pl.col("label") == lab).to_dicts()[0]

    best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")

    def hsupp(r): return (vh - r["hack_deploy"]) / vh
    def suplift(r): return (r["solve_deploy"] - base) / (ceil - base)

    # OURS ONLY -- no paper bars. The paper comparison is cross-scale/regime (their converged
    # full-env vs our 60-step fast surrogate) so it can only ever be directional; the paper
    # numbers live in docs/papers/ariahw_results_table_extracted.md, not on this axis.
    # vanilla is the floor anchor (defines vh, so its hack-suppression is 0 by construction);
    # random-V is the directionality control; per-token is the live arm.
    hack_rows = [
        ("vanilla GRPO\n(floor)",                hsupp(van),  f"{van['hack_deploy']:.3f}",  RED),
        ("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
        ("routeV per-token\n(best)",             hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
    ]
    solve_rows = [
        ("vanilla GRPO\n(floor)",                suplift(van),  f"{van['solve_deploy']:.3f}",  RED),
        ("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
        ("routeV per-token\n(best)",             suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
    ]
    # Quote the anchors actually used rather than literals that go stale when the data changes.
    prov = f" (ceiling PROVISIONAL={ceil:.3f}, FIXME job 24)" if a["provisional"] else ""
    fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False)
    _bars(axl, hack_rows, "hack", None,
          "hack suppressed", "floor → ceiling (no hack)   ·   right = better", 0.0)
    _bars(axr, solve_rows, "solve", None,
          "solve gained", f"floor (base {base:.3f}) → ceiling{prov}   ·   right = better", -0.55)
    fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs vanilla GRPO  (test n=119, seed 43, 60-step fast)",
                 fontsize=10.5, x=0.01, ha="left")
    fig.text(0.01, 0.015, "Our arms only, seed 43, 60-step fast (unconverged surrogate). hack suppressed = (vanilla_hack - arm_hack)/vanilla_hack; "
             "solve gained = (arm_solve - base)/(ceiling - base).  Ariahw 2025 monitor numbers are cross-scale/regime and live in "
             "the transcribed Fig-5 table in docs/papers/2025_lw_ariahw_*.md, not on this axis.",
             fontsize=6.8, color=GREY, va="bottom")
    fig.tight_layout(rect=(0, 0.07, 1, 0.94))
    OUT.mkdir(parents=True, exist_ok=True)   # allow plotting without a preceding build_csv()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)                           # pyplot keeps figures alive until closed


# ── stage 2b: the two metrics as ONE scatter (Tufte: don't split a 2-var story) ──
# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving).
# Each routeV arm gets a green effect-arrow FROM the vanilla baseline -> shows what the
# intervention DID (mechanism), not just where it landed. The achievable solve band
# (base..ceiling) is a faint range-frame; ticks sit only at the meaningful values
# (no hack / vanilla / base / ceiling) so the axes teach the scale instead of generic grid.
GREEN_ARROW = "#1e8449"   # same hex as GREEN; colours the effect arrows and the "better" cue


def plot_scatter(df: pl.DataFrame) -> None:
    """Draw hack rate (x, reversed) against solve rate (y) and save floor_ceiling_abs.{pdf,png}.

    Plots vanilla GRPO, routeV random-V and routeV per-token as points, with an arrow from
    vanilla to each routeV arm. The base..ceiling solve band is shaded and the ticks sit only
    at the anchor values. The default axis limits are kept whenever every point fits inside
    them; a limit is widened only when a point would otherwise be clipped off the canvas.

    Args:
        df: table as produced by build_csv.
    """
    a = _anchors(df)
    base, ceil = a["base_solve"], a["ceiling"]

    def pick(lab):
        return df.filter(pl.col("label") == lab).to_dicts()[0]

    def H(r): return r["hack_deploy"]
    def S(r): return r["solve_deploy"]

    best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")

    BLUE = "#3b5bdb"
    fig, ax = plt.subplots(figsize=(7.2, 5.4))
    # achievable solve band (base -> ceiling): faint, recedes behind the data
    ax.axhspan(base, ceil, color="#eef3ff", zorder=0)
    ax.axhline(base, color=GREY, lw=0.8); ax.axhline(ceil, color=BLUE, lw=0.8, ls=":")
    ax.axvline(0.0, color=GREY, lw=0.8)
    # effect arrows: vanilla baseline -> each routeV arm (green = moves toward the good corner)
    for arm in (rand, best):
        ax.annotate("", xy=(H(arm), S(arm)), xytext=(H(van), S(van)),
                    arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=2.0, alpha=0.85,
                                    shrinkA=7, shrinkB=9))
    # points + direct labels (name only -- the position already shows the rates; labelling
    # the amounts too would double-encode. offsets keep each clear of the arrows/each other)
    pts = [("vanilla GRPO", van, RED, (10, -13), "left"),
           ("routeV random-V", rand, DARK, (12, -2), "left"),
           ("routeV per-token", best, GOLD, (12, 6), "left")]
    for name, r, col, (dx, dy), ha in pts:
        ax.plot(H(r), S(r), "o", color=col, ms=11, zorder=5, mec="white", mew=1.2)
        ax.annotate(name, (H(r), S(r)), textcoords="offset points", xytext=(dx, dy),
                    ha=ha, va="center", fontsize=9, color=col, fontweight="bold")
    # "better" shown, not told: a small diagonal in the empty top-left, pointing at the good corner
    ax.annotate("", xy=(0.46, ceil - 0.004), xytext=(0.62, ceil - 0.030),
                arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=1.4, alpha=0.55))
    ax.text(0.63, ceil - 0.034, "better", fontsize=9, color=GREEN_ARROW, style="italic", ha="left", va="top")
    # range-frame: ticks only at meaningful values. Start from the tuned default limits and
    # widen one only if a plotted point lies beyond it, so no arm silently drops off-canvas.
    hacks = [H(r) for _, r, *_ in pts]
    solves = [S(r) for _, r, *_ in pts]
    x_left, x_right = 0.66, -0.03
    if max(hacks) > x_left:
        x_left = max(hacks) + 0.04
    y_lo, y_hi = base - 0.035, ceil + 0.02
    if min(solves) < y_lo:
        y_lo = min(solves) - 0.02
    if max(solves) > y_hi:
        y_hi = max(solves) + 0.02
    ax.set_xlim(x_left, x_right)                      # reversed: high hack left, 0 right
    ax.set_ylim(y_lo, y_hi)
    prov = "*" if a["provisional"] else ""
    ax.set_xticks([0.0, H(van)]); ax.set_xticklabels(["no hack", f"vanilla\n{H(van):.2f}"], fontsize=8.5)
    ax.set_yticks([base, ceil]); ax.set_yticklabels([f"base\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5)
    ax.set_xlabel("reward-hack rate", fontsize=9.5)
    ax.set_ylabel("solve rate", fontsize=9.5)
    for s in ("top", "right"):
        ax.spines[s].set_visible(False)
    fig.tight_layout()
    OUT.mkdir(parents=True, exist_ok=True)   # allow plotting without a preceding build_csv()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling_abs.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)                           # pyplot keeps figures alive until closed


def main() -> None:
    """Build the csv, list every row whose status does not start with "ok", then draw both figures."""
    df = build_csv()
    not_ok = ~pl.col("status").str.starts_with("ok")
    flagged = df.filter(not_ok).to_dicts()
    print(f"wrote {CSV}")
    if flagged:
        print("\n=== TODO/FIXME in data ===")
        for row in flagged:
            print(f"  [{row['label']}] {row['status']}")
    plot(df)
    plot_scatter(df)
    print(f"\nwrote {OUT}/floor_ceiling.pdf and .png  (+ floor_ceiling_abs.pdf/.png scatter)")


# Script entry point: run the build + plot pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()