Files
evil_MoE/scripts/plot_floor_ceiling.py
T
wassname 270c4f5a27 misc
2026-06-11 11:07:28 +00:00

322 lines
18 KiB
Python

"""Floor-to-ceiling method comparison: the keynote figure.
Two stages so the data is inspectable before it's drawn:
1. build -> out/figs/floor_ceiling.csv (one row per arm/anchor, with SOURCE and STATUS
columns; every provisional/missing value is flagged, not silently filled)
2. plot -> out/figs/floor_ceiling.{pdf,png} (plus floor_ceiling_abs.* and floor_ceiling_knob.*)
Run `uv run python -m scripts.plot_floor_ceiling` to do both; it prints a TODO/FIXME summary
of any provisional or missing cells before plotting.
THE GOAL: place each gradient-routing arm on a floor->ceiling scale so "how much of the
achievable range did it capture" is read at a glance, and show the effect of quarantine
ablation separately from train/test differences.
TWO METRICS, two anchor pairs (right/down = better):
hack removed = (vanilla_hack - arm_hack) / vanilla_hack; 1.0 = no hack
solve recovered = (arm_solve - base_solve) / (ceiling - base_solve); 1.0 = no-loophole ceiling
TWO VIEWS of the same arms:
A. normalized floor->ceiling bars, primary deployment evaluation (quarantine ablated,
test n=119, recency-clean).
Source per arm: out/runs/<run>/deploy_test.json.
B. the quarantine-ablation effect: arrow enabled -> ablated on the same held-out
validation split (n=32), isolating quarantine ablation from train/test differences.
Source per arm:
out/runs/<run>/eval_curve.jsonl, where the file's `train_*`/`deploy_*` prefixes denote
quarantine state, not the problem set (always validation here). L5 = mean of last 5 evals.
DATA GAPS (see STATUS column in the csv):
- solve ceiling: provisional = paper 0.223 until job 24 (out/runs/*noloophole*) lands. FIXME.
- prog_wide arm uses contaminated pairs; job 28 (prog_wide_clean) will replace it. TODO.
- full-env (paper-scale) panel: no method runs exist, only paper anchors. Out of scope here.
"""
from __future__ import annotations
import json
from pathlib import Path
import polars as pl
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Shared palette (hex colours) used by the plotting functions below.
RED, GREEN, GREY = "#c0392b", "#1e8449", "#9aa0a6"
RUNS = Path("out/runs")  # root directory scanned for run directories (see _find_run)
OUT = Path("out/figs")  # destination directory for the csv and every figure
CSV = OUT / "floor_ceiling.csv"  # stage-1 table written by build_csv
PAPER_CEILING = 0.223  # Ariahw et al. no-loophole solve -- provisional fast-env ceiling
# Arm display order (seed-43 fast runs). Each entry is (label, run tag, status):
#   label  -- display name; also the lookup key used by the plotting code
#   tag    -- matched by _find_run as a SUFFIX of the run directory name
#   status -- copied into the csv STATUS column; anything not starting with "ok" is
#             reported by main() as a TODO/FIXME
ARMS = [
    ("routeV per-token", "_dir6_routeV_pertoken_s43", "ok"),
    ("routeV authored", "_dir8_routeV_authored_perroll_s43", "ok"),
    ("routeV prog_wide", "_dir6_routeV_s43", "TODO: contaminated pairs -> job 28 prog_wide_clean"),
    ("routeV random-V", "_dir6_routeV_random_s43", "ok (directionality control)"),
    ("vanilla GRPO", "_dir8_vanilla_s43", "ok (defines hack-worst anchor)"),
]
def _find_run(tag: str) -> Path:
    """Return the run directory under ``RUNS`` whose name ends with ``tag``.

    Only directories that contain a ``deploy_test.json`` are considered. When several
    match, the one that sorts last is returned (run names presumably begin with a
    timestamp, so that is the most recent run -- confirm against the run naming scheme).

    Raises:
        FileNotFoundError: no directory under ``RUNS`` matches.
    """
    matches = []
    for run_dir in RUNS.iterdir():
        if not run_dir.name.endswith(tag):
            continue
        if (run_dir / "deploy_test.json").exists():
            matches.append(run_dir)
    if matches:
        return max(matches)  # greatest path == last after sorting
    raise FileNotFoundError(f"no run dir ending '{tag}' with a deploy_test.json")
def _l5(rows: list[dict], k: str) -> float:
v = [r[k] for r in rows[-5:]]
return sum(v) / len(v)
def _r4(x):
return None if x is None else round(x, 4)
# ── stage 1: build the inspectable csv ──────────────────────────────────────
def build_csv() -> pl.DataFrame:
    """Stage 1: gather every arm and anchor into one table and write it to ``CSV``.

    For each entry of ``ARMS`` the run directory is resolved with ``_find_run`` and two
    files are read from it:

    * ``deploy_test.json`` -> ``hack_deployed`` / ``solve_deployed`` and, when the keys are
      present, ``hack_as_trained`` / ``solve_as_trained`` (``None`` otherwise).
    * ``eval_curve.jsonl`` -> ``hack_on`` / ``solve_on`` (from the ``*_as_trained`` keys) and
      ``hack_off`` / ``solve_off`` (from the ``*_deployed`` keys), each the mean of the last
      five records (``_l5``).

    Two anchor rows follow: the floor (run ending ``_dir8_baseline_s43``) and the ceiling,
    taken from a ``*noloophole*`` run when one exists and otherwise from ``PAPER_CEILING``
    with a ``FIXME`` status so the provisional value stays visible.

    Returns:
        The table that was written to ``CSV``.

    Raises:
        FileNotFoundError: a required run directory is missing (from ``_find_run``).
        ValueError: an ``eval_curve.jsonl`` holds no records.
    """
    rows = []
    for label, tag, status in ARMS:
        run = _find_run(tag)
        dep = json.loads((run / "deploy_test.json").read_text())
        # One JSON object per line; blank lines are skipped so a stray empty line in the
        # curve file does not abort the whole build.
        ev = [json.loads(line)
              for line in (run / "eval_curve.jsonl").read_text().splitlines()
              if line.strip()]
        if not ev:
            # _l5 would otherwise fail with a bare ZeroDivisionError that names no file.
            raise ValueError(f"{run.name}/eval_curve.jsonl contains no eval records")
        rows.append(dict(
            label=label, kind="method",
            hack_deployed=round(dep["hack_deployed"], 4),
            solve_deployed=round(dep["solve_deployed"], 4),
            # Quarantine-enabled evaluation on the same set as deploy_test.json, so the
            # before/after comparison shares one evaluation set. None until the run has
            # been backfilled (rescore_deploy.py).
            hack_as_trained=_r4(dep.get("hack_as_trained")),
            solve_as_trained=_r4(dep.get("solve_as_trained")),
            hack_on=round(_l5(ev, "hack_as_trained"), 4),
            hack_off=round(_l5(ev, "hack_deployed"), 4),
            solve_on=round(_l5(ev, "solve_as_trained"), 4),
            solve_off=round(_l5(ev, "solve_deployed"), 4),
            source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]",
            status=status,
        ))

    base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text())
    rows.append(dict(
        label="base (floor)", kind="anchor_floor",
        hack_deployed=round(base["hack_deployed"], 4),
        solve_deployed=round(base["solve_deployed"], 4),
        hack_as_trained=None, solve_as_trained=None,
        hack_on=None, hack_off=None, solve_on=None, solve_off=None,
        source="*_dir8_baseline_s43/deploy_test.json",
        status="ok (base model; steps=0)",
    ))

    # glob() yields matches in directory order, which is not stable across machines; sort and
    # take the last so that, as in _find_run, the run that sorts last wins deterministically.
    ceil_paths = sorted(RUNS.glob("*noloophole*/deploy_test.json"))
    if ceil_paths:
        ceil_path = ceil_paths[-1]
        ceil_solve = round(json.loads(ceil_path.read_text())["solve_deployed"], 4)
        ceil_status = "ok"
        ceil_source = f"{ceil_path.parent.name}/deploy_test.json"
    else:
        ceil_solve = PAPER_CEILING
        # Must keep the "FIXME" prefix: _anchors uses it to detect a provisional ceiling.
        ceil_status = (f"FIXME: PROVISIONAL paper {PAPER_CEILING} -- awaiting job 24 "
                       "(no-loophole ceiling)")
        ceil_source = "Ariahw et al. 2025 (paper), NOT our run"
    rows.append(dict(
        label="ceiling", kind="anchor_ceiling",
        hack_deployed=0.0, solve_deployed=ceil_solve,
        hack_as_trained=None, solve_as_trained=None,
        hack_on=None, hack_off=None, solve_on=None, solve_off=None,
        source=ceil_source, status=ceil_status,
    ))

    df = pl.DataFrame(rows)
    OUT.mkdir(parents=True, exist_ok=True)
    df.write_csv(CSV)
    return df
# ── stage 2: plot from the csv ──────────────────────────────────────────────
# The reference paper (Ariahw et al. 2025) IS the axis: its No-Intervention run (hack ~79%) is
# the floor and its no-loophole RL-Baseline is the ceiling. So the comparison-to-paper is "how
# far up the paper's own floor->ceiling range did our oracle-free method climb." We do NOT plot the
# paper's intervention bars, for two different reasons (the disqualifier is oracle/ground-truth-
# LABEL leakage, NOT "a monitor ran"):
# - GT monitor (+70/90% variants) and the probe (trained on oracle-labelled in-env RH data,
# footnote 12) both need the env oracle to exist -- they cannot be built on a new env with no
# oracle, so they are cheats for our transfer claim.
# - LLM judge is the legitimate external peer (generic model, no oracle, ~50% acc yet protective
# via penalty) -- but it has no clean single fast-env number on our axis (paper figures only,
# different training regime), so we have no comparable point to plot for it.
# - inoculation prompting (no monitor) has no clean number either (prose: incomplete, high-
# variance -- some seeds ~0 hack, some ~full hack).
# So: nothing with a comparable single number to plot; the paper enters only as floor/ceiling.
# Extra arm colours: GOLD marks "routeV per-token", DARK marks "routeV random-V"
# (see plot() and ARM_COLOR).
GOLD, DARK = "#c8920a", "#3a3a3a"
def _anchors(df: pl.DataFrame) -> dict:
    """Extract the normalisation anchors from the stage-1 table.

    Returns a dict with:
        base_solve   -- ``solve_deployed`` of the ``anchor_floor`` row
        vanilla_hack -- ``hack_deployed`` of the row labelled "vanilla GRPO"
        ceiling      -- ``solve_deployed`` of the ``anchor_ceiling`` row
        provisional  -- True when the ceiling row's status starts with "FIXME"
    """
    def first_of(kind, col):
        # First value of `col` among rows of the given kind.
        return df.filter(pl.col("kind") == kind)[col][0]

    ceiling_status = first_of("anchor_ceiling", "status")
    floor_solve = first_of("anchor_floor", "solve_deployed")
    vanilla_rows = df.filter(pl.col("label") == "vanilla GRPO")
    return {
        "base_solve": floor_solve,
        "vanilla_hack": vanilla_rows["hack_deployed"][0],
        "ceiling": first_of("anchor_ceiling", "solve_deployed"),
        "provisional": ceiling_status.startswith("FIXME"),
    }
def _bars(ax, rows, key, raws, title, xlabel, xlo):
    """Draw one floor->ceiling panel: horizontal bars on [xlo, 1.4], 0 = floor, 1.0 = ceiling.

    Args:
        ax: matplotlib axes to draw on.
        rows: sequence of ``(label, value, raw_text, colour)``; one bar per entry, in order.
        key, raws: accepted but not used by this function.
        title: panel title (left-aligned).
        xlabel: x-axis label.
        xlo: lower x limit; when negative, bar tips are printed with an explicit sign.
    """
    show_sign = xlo < 0
    tick_labels = []
    for y, row in enumerate(rows):
        name, frac, raw, colour = row
        tick_labels.append(name)
        hatch = "//" if colour == GREY else None  # grey = approx reference
        ax.barh(y, frac, height=0.55, color=colour, alpha=0.9,
                hatch=hatch, edgecolor="white")

        pct = frac * 100
        text = f"{pct:+.0f}%" if show_sign else f"{pct:.0f}%"
        if raw:
            text += f" ({raw})"
        # Put the annotation just past the bar tip, on whichever side the bar points.
        if frac >= 0:
            x_text, align = frac + 0.02, "left"
        else:
            x_text, align = frac - 0.02, "right"
        ax.text(x_text, y, text, va="center", ha=align, fontsize=8.5, color=colour)

    ax.axvline(0, color=GREY, lw=1.0)  # floor (labelled in xlabel)
    ax.axvline(1.0, color=GREY, lw=1.0, ls=":")  # ceiling
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(tick_labels, fontsize=8.5)
    ax.set_xlim(xlo, 1.4)  # hi=1.4 keeps the ceiling line (1.0) in view
    ax.set_xlabel(xlabel, fontsize=8.5)
    ax.set_title(title, fontsize=10, loc="left")
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)
    ax.tick_params(left=False)
def plot(df: pl.DataFrame) -> None:
    """Stage 2: draw the two normalised floor->ceiling bar panels and save them.

    Left panel: hack suppressed = (vanilla_hack - arm_hack) / vanilla_hack.
    Right panel: solve gained = (arm_solve - base) / (ceiling - base).
    Three arms are shown: vanilla GRPO, routeV random-V and routeV per-token. Output goes to
    ``OUT / "floor_ceiling.pdf"`` and ``.png``.

    Args:
        df: the table produced by ``build_csv``.

    Raises:
        ValueError: an anchor makes a normalisation undefined (vanilla hack rate of 0, or
            ceiling equal to base).
    """
    a = _anchors(df)
    base, vh, ceil = a["base_solve"], a["vanilla_hack"], a["ceiling"]
    # Both metrics divide by an anchor-derived range; fail with a readable message instead
    # of an anonymous ZeroDivisionError.
    if vh == 0:
        raise ValueError("vanilla GRPO hack_deployed is 0; 'hack suppressed' is undefined")
    if ceil == base:
        raise ValueError("ceiling solve equals base solve; 'solve gained' is undefined")

    def pick(lab):
        return df.filter(pl.col("label") == lab).to_dicts()[0]

    def hsupp(r):
        return (vh - r["hack_deployed"]) / vh

    def suplift(r):
        return (r["solve_deployed"] - base) / (ceil - base)

    # OURS ONLY -- no paper bars. The paper comparison is cross-scale/regime (their converged
    # full-env vs our 60-step fast surrogate) so it can only ever be directional.
    # vanilla is the floor anchor (defines vh, so its hack-suppression is 0 by construction);
    # random-V is the directionality control; per-token is the live arm.
    arms = [
        ("vanilla GRPO\n(floor)", pick("vanilla GRPO"), RED),
        ("routeV random-V\n(direction control)", pick("routeV random-V"), DARK),
        ("routeV per-token\n(best)", pick("routeV per-token"), GOLD),
    ]
    hack_rows = [(lab, hsupp(r), f"{r['hack_deployed']:.3f}", col) for lab, r, col in arms]
    solve_rows = [(lab, suplift(r), f"{r['solve_deployed']:.3f}", col) for lab, r, col in arms]

    # Anchor values in the labels come from the table so they cannot drift from the data.
    prov = f" (ceiling PROVISIONAL={ceil:.3f}, FIXME job 24)" if a["provisional"] else ""
    fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False)
    _bars(axl, hack_rows, "hack", None,
          "hack suppressed", "floor → ceiling (no hack) · right = better", 0.0)
    _bars(axr, solve_rows, "solve", None,
          "solve gained", f"floor (base {base:.3f}) → ceiling{prov} · right = better", -0.55)
    fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs vanilla GRPO (test n=119, seed 43, 60-step fast)",
                 fontsize=10.5, x=0.01, ha="left")
    fig.text(0.01, 0.015, "Our arms only, seed 43, 60-step fast (unconverged surrogate). hack suppressed = (vanilla_hack - arm_hack)/vanilla_hack; "
             "solve gained = (arm_solve - base)/(ceiling - base). Ariahw 2025 monitor numbers are cross-scale/regime and live in "
             "the transcribed Fig-5 table in docs/papers/2025_lw_ariahw_*.md, not on this axis.",
             fontsize=6.8, color=GREY, va="bottom")
    fig.tight_layout(rect=(0, 0.07, 1, 0.94))
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)  # pyplot keeps figures alive until they are closed explicitly
# ── stage 2b: the two metrics as ONE scatter (Tufte: don't split a 2-var story) ──
# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving), marked
# "ideal". The achievable solve band (base..ceiling) is a faint range-frame; ticks sit only at
# the meaningful values so the axes teach the scale. Two views:
# plot_scatter -> DEPLOY (test n=119): solid dot = quarantine ablated;
# when the run includes quarantine-enabled metrics on the same set, a hollow dot ->
# arrow -> solid after-dot shows the quarantine move on the deploy axis.
# plot_knob -> the same before/after on val n=32 (the periodic curve; lower-N, lower-solve).
# Prefer the deploy view now that both endpoints exist there; plot_knob remains as the val cross-
# check (val solve runs ~2x lower, so the two panels never share a y-axis).
GREEN_ARROW = "#1e8449"  # same hex as GREEN; not referenced in the visible part of this file
BLUE = "#3b5bdb"  # ceiling line and the "ideal" marker in plot_scatter
# one colour per arm; GOLD=best real-V, DARK=random control, RED=no-intervention baseline.
# Labels must match the first element of the ARMS entries; the scatter plots fall back to
# GREY for any label missing here.
ARM_COLOR = {"routeV per-token": GOLD, "routeV authored": "#0e8a8a",
             "routeV prog_wide": "#8e44ad", "routeV random-V": DARK, "vanilla GRPO": RED}
def _methods(df: pl.DataFrame) -> list[dict]:
    """Return the rows whose ``kind`` is "method" (i.e. not the anchors) as a list of dicts."""
    is_method = pl.col("kind") == "method"
    method_rows = df.filter(is_method)
    return method_rows.to_dicts()
def plot_scatter(df: pl.DataFrame) -> None:
    """Scatter of hack rate (x, reversed) against solve rate (y) for every method arm.

    Solid dot = quarantine ablated (``hack_deployed`` / ``solve_deployed``). When a row also
    carries ``hack_as_trained`` / ``solve_as_trained`` and they differ from the ablated
    values, a hollow dot (quarantine enabled) and an arrow to the solid dot are drawn too.
    The base..ceiling solve band is shaded and the no-hack/ceiling corner is marked "ideal".
    Output goes to ``OUT / "floor_ceiling_abs.pdf"`` and ``.png``.

    Args:
        df: the table produced by ``build_csv``.
    """
    a = _anchors(df)
    base, ceil, vh = a["base_solve"], a["ceiling"], a["vanilla_hack"]
    prov = "*" if a["provisional"] else ""
    fig, ax = plt.subplots(figsize=(7.2, 5.4))
    ax.axhspan(base, ceil, color="#eef3ff", zorder=0)  # achievable solve band
    ax.axhline(base, color=GREY, lw=0.8)
    ax.axhline(ceil, color=BLUE, lw=0.8, ls=":")
    ax.axvline(0.0, color=GREY, lw=0.8)
    # "ideal" = the good corner (no hack, ceiling solve). Nudged inside the no-hack edge so the
    # marker isn't half-clipped; label sits to its LEFT (no room to the right of no-hack).
    ax.plot(0.012, ceil, marker="*", ms=15, color=BLUE, zorder=6, clip_on=False)
    ax.annotate("ideal", (0.012, ceil), textcoords="offset points", xytext=(-8, 2),
                ha="right", va="center", fontsize=9, color=BLUE, style="italic")

    xs, ys = [], []  # every plotted coordinate, used to size the axes below
    for r in _methods(df):
        col = ARM_COLOR.get(r["label"], GREY)
        hack, solve = r["hack_deployed"], r["solve_deployed"]
        hon, son = r["hack_as_trained"], r["solve_as_trained"]
        # Draw the before-point only when BOTH quarantine-enabled values exist (a half-filled
        # backfill must not raise) and the point actually moved.
        has_before = hon is not None and son is not None
        if has_before and (abs(hon - hack) > 1e-6 or abs(son - solve) > 1e-6):
            ax.annotate("", xy=(hack, solve), xytext=(hon, son),
                        arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8))
            ax.plot(hon, son, "o", color="white", mec=col, mew=1.8, ms=9, zorder=4)  # quarantine enabled
            xs.append(hon)
            ys.append(son)
        ax.plot(hack, solve, "o", color=col, ms=11, zorder=5, mec="white", mew=1.2)  # quarantine ablated
        xs.append(hack)
        ys.append(solve)
        right = hack > 0.3  # vanilla sits left; label into the middle
        ax.annotate(r["label"], (hack, solve), textcoords="offset points",
                    xytext=(12 if right else -12, 0), ha="left" if right else "right",
                    va="center", fontsize=9, color=col, fontweight="bold")

    # Keep the original framing as the minimum, but widen it when a point would otherwise
    # fall outside the axes and silently disappear from the figure.
    x_hi = max([0.74] + [x + 0.05 for x in xs])
    y_lo = min([base - 0.04] + [y - 0.012 for y in ys])
    y_hi = max([ceil + 0.012] + [y + 0.012 for y in ys])
    ax.set_xlim(x_hi, 0.0)  # reversed; clamp at no-hack (negative hack is meaningless)
    ax.set_ylim(y_lo, y_hi)
    # The vanilla tick is read from the table rather than hard-coded, so it tracks the data.
    ax.set_xticks([0.0, vh])
    ax.set_xticklabels(["no hack", f"vanilla\n{vh:.2f}"], fontsize=8.5)
    ax.set_yticks([base, ceil])
    ax.set_yticklabels([f"base model\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5)
    ax.set_xlabel("reward-hack rate", fontsize=9.5)
    ax.set_ylabel("solve rate", fontsize=9.5)
    for s in ("top", "right"):
        ax.spines[s].set_visible(False)
    fig.tight_layout()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling_abs.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)  # pyplot keeps figures alive until they are closed explicitly
def plot_knob(df: pl.DataFrame) -> None:
    """Quarantine before/after on the SAME eval (val n=32).

    Per arm: hollow before-dot (quarantine enabled, ``hack_on`` / ``solve_on``) -> arrow ->
    solid after-dot (quarantine ablated, ``hack_off`` / ``solve_off``). Arms whose two points
    coincide get the solid dot only (vanilla has no quarantine contrast). Output goes to
    ``OUT / "floor_ceiling_knob.pdf"`` and ``.png``.

    Args:
        df: the table produced by ``build_csv``.
    """
    # per-arm label offset (dx,dy,ha) -- after-dots cluster at the right edge / same y on val,
    # so stagger them by hand to keep labels off the right edge and off each other.
    label_offsets = {
        "routeV per-token": (-8, 13, "right"), "routeV random-V": (-8, -13, "right"),
        "routeV prog_wide": (12, 0, "left"), "routeV authored": (12, 0, "left"),
        "vanilla GRPO": (12, 0, "left"),
    }
    fig, ax = plt.subplots(figsize=(7.2, 5.0))
    ax.axvline(0.0, color=GREY, lw=0.8)
    x_hi = 0.80  # original framing; widened below if a point would fall outside it
    for r in _methods(df):
        col = ARM_COLOR.get(r["label"], GREY)
        on, off = (r["hack_on"], r["solve_on"]), (r["hack_off"], r["solve_off"])
        moved = abs(on[0] - off[0]) > 1e-6 or abs(on[1] - off[1]) > 1e-6
        if moved:  # routeV arms: before -> after
            ax.annotate("", xy=off, xytext=on,
                        arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8))
            ax.plot(*on, "o", color="white", mec=col, mew=1.8, ms=9, zorder=4)  # quarantine enabled
            x_hi = max(x_hi, on[0] + 0.05)
        ax.plot(*off, "o", color=col, ms=11, zorder=5, mec="white", mew=1.2)  # quarantine ablated
        x_hi = max(x_hi, off[0] + 0.05)
        dx, dy, ha = label_offsets.get(r["label"], (12, 0, "left"))
        ax.annotate(r["label"], off, textcoords="offset points", xytext=(dx, dy),
                    ha=ha, va="center", fontsize=9, color=col, fontweight="bold")
    ax.set_xlim(x_hi, 0.0)  # reversed; clamp at no-hack
    ax.set_xticks([0.0, 0.6])
    ax.set_xticklabels(["no hack", "≈vanilla hack\n0.6"], fontsize=8.5)
    ax.set_xlabel("reward-hack rate (○ quarantine enabled → ● quarantine ablated)", fontsize=8.5)
    ax.set_ylabel("solve rate (val n=32)", fontsize=9.5)
    for s in ("top", "right"):
        ax.spines[s].set_visible(False)
    fig.tight_layout()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling_knob.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)  # pyplot keeps figures alive until they are closed explicitly
def main() -> None:
    """Build the csv, report every row whose status is not "ok...", then draw all figures."""
    df = build_csv()
    not_ok = ~pl.col("status").str.starts_with("ok")
    flagged = df.filter(not_ok).to_dicts()
    print(f"wrote {CSV}")
    if flagged:
        print("\n=== TODO/FIXME in data ===")
        for row in flagged:
            print(f"  [{row['label']}] {row['status']}")
    # Same order as the output files are listed in the final message.
    for draw in (plot, plot_scatter, plot_knob):
        draw(df)
    print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ _abs scatter, + _knob before/after)")


if __name__ == "__main__":
    main()