"""Floor-to-ceiling method comparison: the keynote figure. Two stages so the data is inspectable before it's drawn: 1. build -> out/plots/floor_ceiling.csv (one row per arm/anchor, with SOURCE and STATUS columns; every provisional/missing value is flagged, not silently filled) 2. plot -> out/plots/floor_ceiling.{pdf,png} Run `uv run python -m scripts.plot_floor_ceiling` to do both; it prints a TODO/FIXME summary of any provisional or missing cells before plotting. THE GOAL: place each gradient-routing arm on a floor->ceiling scale so "how much of the achievable range did it capture" is read at a glance, and show that the quarantine (knob) is what removes the hack, not a train/test artifact. TWO METRICS, two anchor pairs (right/down = better): hack removed = (vanilla_hack - arm_hack) / vanilla_hack 1.0 = no hack solve recovered = (arm_solve - base_solve) / (ceiling - base_solve) 1.0 = no-loophole ceiling TWO VIEWS of the same arms: A. normalized floor->ceiling bars, HEADLINE deploy (knob-off, test n=119, recency-clean). Source per arm: out/runs//deploy_test.json. B. the KNOB effect: arrow knob-ON -> knob-OFF on the SAME held-out val split (n=32), so it isolates the quarantine from the train/test memorization gap. Source per arm: out/runs//eval_curve.jsonl, where the file's `train_*`/`deploy_*` prefixes denote KNOB STATE (on/off), not the problem set (always val here). L5 = mean of last 5 evals. DATA GAPS (see STATUS column in the csv): - solve ceiling: provisional = paper 0.223 until job 24 (out/runs/*noloophole*) lands. FIXME. - prog_wide arm uses contaminated pairs; job 28 (prog_wide_clean) will replace it. TODO. - full-env (paper-scale) panel: no method runs exist, only paper anchors. Out of scope here. """ from __future__ import annotations import json from pathlib import Path import polars as pl import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt RED, GREEN, GREY = "#c0392b", "#1e8449", "#9aa0a6" RUNS = Path("out/runs") OUT = Path("out/plots") CSV = OUT / "floor_ceiling.csv" PAPER_CEILING = 0.223 # Ariahw et al. no-loophole solve -- provisional fast-env ceiling # arm display order, identified by a substring of the run's out_tag (seed-43 fast runs) ARMS = [ ("routeV per-token", "_dir6_routeV_pertoken_s43", "ok"), ("routeV authored", "_dir8_routeV_authored_perroll_s43", "ok"), ("routeV prog_wide", "_dir6_routeV_s43", "TODO: contaminated pairs -> job 28 prog_wide_clean"), ("routeV random-V", "_dir6_routeV_random_s43", "ok (directionality control)"), ("vanilla GRPO", "_dir8_vanilla_s43", "ok (defines hack-worst anchor)"), ] def _find_run(tag: str) -> Path: cands = sorted(d for d in RUNS.iterdir() if d.name.endswith(tag) and (d / "deploy_test.json").exists()) if not cands: raise FileNotFoundError(f"no run dir ending '{tag}' with a deploy_test.json") return cands[-1] # latest timestamp wins def _l5(rows: list[dict], k: str) -> float: v = [r[k] for r in rows[-5:]] return sum(v) / len(v) # ── stage 1: build the inspectable csv ────────────────────────────────────── def build_csv() -> pl.DataFrame: rows = [] for label, tag, status in ARMS: run = _find_run(tag) dep = json.loads((run / "deploy_test.json").read_text()) ev = [json.loads(l) for l in (run / "eval_curve.jsonl").read_text().splitlines()] rows.append(dict( label=label, kind="method", hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4), hack_on=round(_l5(ev, "train_hack"), 4), hack_off=round(_l5(ev, "deploy_hack"), 4), solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4), source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status)) base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text()) rows.append(dict(label="base (floor)", kind="anchor_floor", hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4), hack_on=None, hack_off=None, solve_on=None, solve_off=None, source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)")) ceil_path = next(RUNS.glob("*noloophole*/deploy_test.json"), None) if ceil_path: ceil_solve, status = round(json.loads(ceil_path.read_text())["deploy_solve"], 4), "ok" source = f"{ceil_path.parent.name}/deploy_test.json" else: ceil_solve, status = PAPER_CEILING, "FIXME: PROVISIONAL paper 0.223 -- awaiting job 24 (no-loophole ceiling)" source = "Ariahw et al. 2025 (paper), NOT our run" rows.append(dict(label="ceiling", kind="anchor_ceiling", hack_deploy=0.0, solve_deploy=ceil_solve, hack_on=None, hack_off=None, solve_on=None, solve_off=None, source=source, status=status)) df = pl.DataFrame(rows) OUT.mkdir(parents=True, exist_ok=True) df.write_csv(CSV) return df # ── stage 2: plot from the csv ────────────────────────────────────────────── def _anchors(df: pl.DataFrame) -> dict: g = lambda kind, col: df.filter(pl.col("kind") == kind)[col][0] ceil_status = g("anchor_ceiling", "status") return dict(base_solve=g("anchor_floor", "solve_deploy"), vanilla_hack=df.filter(pl.col("label") == "vanilla GRPO")["hack_deploy"][0], ceiling=g("anchor_ceiling", "solve_deploy"), provisional=ceil_status.startswith("FIXME")) def _panel_normalized(ax, methods: pl.DataFrame, a, title): base, vh, ceil = a["base_solve"], a["vanilla_hack"], a["ceiling"] labels = [l for l in methods["label"] if l != "vanilla GRPO"] # vanilla = the 0% hack anchor for yi, lab in enumerate(labels): r = methods.filter(pl.col("label") == lab).to_dicts()[0] hack_rm = (vh - r["hack_deploy"]) / vh solve_rc = (r["solve_deploy"] - base) / (ceil - base) ax.barh(yi + 0.18, hack_rm, height=0.32, color=RED, alpha=0.85) ax.text(hack_rm + 0.015, yi + 0.18, f"{r['hack_deploy']:.3f} ({hack_rm*100:.0f}%)", va="center", ha="left", fontsize=8, color=RED) ax.barh(yi - 0.18, solve_rc, height=0.32, color=GREEN, alpha=0.85) ax.text(solve_rc + 0.015 if solve_rc >= 0 else solve_rc - 0.015, yi - 0.18, f"{r['solve_deploy']:.3f} ({solve_rc*100:+.0f}%)", va="center", ha="left" if solve_rc >= 0 else "right", fontsize=8, color=GREEN) ax.axvline(0, color=GREY, lw=0.8) ax.axvline(1.0, color=GREY, lw=0.8, ls=":") ax.text(1.0, len(labels) - 0.35, "ceiling / no-hack", fontsize=7, color=GREY, ha="center") ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels, fontsize=9) ax.set_xlim(-0.35, 1.25); ax.set_xlabel("fraction of floor→ceiling range (right = better)") ax.set_title(title, fontsize=10, loc="left") ax.text(0.01, 0.99, "red = hack removed (vs vanilla) green = solve recovered (base→ceiling)", transform=ax.transAxes, fontsize=7.5, color="#444", va="top") for s in ("top", "right", "left"): ax.spines[s].set_visible(False) ax.tick_params(left=False) def _panel_knob(ax, methods: pl.DataFrame): labels = list(methods["label"]) for yi, lab in enumerate(labels): r = methods.filter(pl.col("label") == lab).to_dicts()[0] ax.annotate("", xy=(r["hack_off"], yi + 0.16), xytext=(r["hack_on"], yi + 0.16), arrowprops=dict(arrowstyle="->", color=RED, lw=1.6, alpha=0.9)) ax.plot([r["hack_on"], r["hack_off"]], [yi + 0.16] * 2, "o", color=RED, ms=4, alpha=0.5) ax.text(r["hack_on"] + 0.012, yi + 0.16, f"on {r['hack_on']:.2f}", va="center", ha="left", fontsize=7, color=RED) ax.text(r["hack_off"] - 0.012, yi + 0.16, f"{r['hack_off']:.2f}", va="center", ha="right", fontsize=7.5, color=RED) ax.annotate("", xy=(r["solve_off"], yi - 0.16), xytext=(r["solve_on"], yi - 0.16), arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.6, alpha=0.9)) ax.plot([r["solve_on"], r["solve_off"]], [yi - 0.16] * 2, "o", color=GREEN, ms=4, alpha=0.5) ax.text(max(r["solve_on"], r["solve_off"]) + 0.012, yi - 0.16, f"solve {r['solve_off']:.2f}", va="center", ha="left", fontsize=7.5, color=GREEN) ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels, fontsize=9) ax.set_xlim(-0.02, 0.80) ax.set_xlabel("rate (arrow = knob-ON → knob-OFF on held-out val; left = better for hack)") ax.set_title("B. the knob effect (held-out val n=32, L5 -- isolates the quarantine)", fontsize=10, loc="left") for s in ("top", "right", "left"): ax.spines[s].set_visible(False) ax.tick_params(left=False) def plot(df: pl.DataFrame) -> None: a = _anchors(df) methods = df.filter(pl.col("kind") == "method") prov = " [ceiling PROVISIONAL=0.223, FIXME job 24]" if a["provisional"] else "" fig, axes = plt.subplots(2, 1, figsize=(8.5, 8.0), gridspec_kw=dict(height_ratios=[1, 1.05])) _panel_normalized(axes[0], methods, a, f"A. normalized floor→ceiling, deploy (test n=119){prov}") _panel_knob(axes[1], methods) fig.suptitle("vGROUT: floor-to-ceiling method comparison (seed 43, 60-step fast)", fontsize=11, x=0.02, ha="left") fig.tight_layout(rect=(0, 0, 1, 0.97)) for ext in ("pdf", "png"): fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight") def main() -> None: df = build_csv() flags = df.filter(~pl.col("status").str.starts_with("ok")) print(f"wrote {CSV}") if len(flags): print("\n=== TODO/FIXME in data ===") for r in flags.to_dicts(): print(f" [{r['label']}] {r['status']}") plot(df) print(f"\nwrote {OUT}/floor_ceiling.pdf and .png") if __name__ == "__main__": main()