mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:52:18 +08:00
d4998a71ba
- Transcribed Fig-5 numeric table now lives inline in the paper md as an
EDITOR'S TABLE comment, deleting docs/papers/ariahw_results_table_extracted.md
(one fewer repo file; the table sits next to the figure it transcribes).
- floor_ceiling_abs.{png,pdf}: raw-rate variant. Arrows climb from the floor
anchor; grey bedrock = worse-than-floor, blue sky = past-ceiling; hack axis
reversed so right=better on both panels.
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
278 lines
16 KiB
Python
278 lines
16 KiB
Python
"""Floor-to-ceiling method comparison: the keynote figure.
|
|
|
|
Two stages so the data is inspectable before it's drawn:
|
|
1. build -> out/figs/floor_ceiling.csv (one row per arm/anchor, with SOURCE and STATUS
|
|
columns; every provisional/missing value is flagged, not silently filled)
|
|
2. plot -> out/figs/floor_ceiling.{pdf,png} (+ floor_ceiling_abs.{pdf,png})
|
|
|
|
Run `uv run python -m scripts.plot_floor_ceiling` to do both; it prints a TODO/FIXME summary
|
|
of any provisional or missing cells before plotting.
|
|
|
|
THE GOAL: place each gradient-routing arm on a floor->ceiling scale so "how much of the
|
|
achievable range did it capture" is read at a glance, and show that the quarantine (knob)
|
|
is what removes the hack, not a train/test artifact.
|
|
|
|
TWO METRICS, two anchor pairs (right/down = better):
|
|
hack removed = (vanilla_hack - arm_hack) / vanilla_hack 1.0 = no hack
|
|
solve recovered = (arm_solve - base_solve) / (ceiling - base_solve) 1.0 = no-loophole ceiling
|
|
|
|
TWO VIEWS of the same arms:
|
|
A. normalized floor->ceiling bars, HEADLINE deploy (knob-off, test n=119, recency-clean).
|
|
Source per arm: out/runs/<run>/deploy_test.json.
|
|
B. the KNOB effect: arrow knob-ON -> knob-OFF on the SAME held-out val split (n=32), so it
|
|
isolates the quarantine from the train/test memorization gap. Source per arm:
|
|
out/runs/<run>/eval_curve.jsonl, where the file's `train_*`/`deploy_*` prefixes denote
|
|
KNOB STATE (on/off), not the problem set (always val here). L5 = mean of last 5 evals.
|
|
|
|
DATA GAPS (see STATUS column in the csv):
|
|
- solve ceiling: provisional = paper 0.223 until job 24 (out/runs/*noloophole*) lands. FIXME.
|
|
- prog_wide arm uses contaminated pairs; job 28 (prog_wide_clean) will replace it. TODO.
|
|
- full-env (paper-scale) panel: no method runs exist, only paper anchors. Out of scope here.
|
|
"""
|
|
from __future__ import annotations
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Colour palette shared by the figures.
RED = "#c0392b"
GREEN = "#1e8449"
GREY = "#9aa0a6"

# Where run directories are read from, and where the csv + figures are written.
RUNS = Path("out/runs")
OUT = Path("out/figs")
CSV = OUT / "floor_ceiling.csv"

# Ariahw et al. no-loophole solve -- provisional fast-env ceiling
PAPER_CEILING = 0.223
|
|
|
|
# Arms in display order (seed-43 fast runs). Each entry is
#   (display label, run-directory suffix, status note written to the csv).
# The suffix is matched against the END of the run directory name by `_find_run`.
ARMS: list[tuple[str, str, str]] = [
    (
        "routeV per-token",
        "_dir6_routeV_pertoken_s43",
        "ok",
    ),
    (
        "routeV authored",
        "_dir8_routeV_authored_perroll_s43",
        "ok",
    ),
    (
        "routeV prog_wide",
        "_dir6_routeV_s43",
        "TODO: contaminated pairs -> job 28 prog_wide_clean",
    ),
    (
        "routeV random-V",
        "_dir6_routeV_random_s43",
        "ok (directionality control)",
    ),
    (
        "vanilla GRPO",
        "_dir8_vanilla_s43",
        "ok (defines hack-worst anchor)",
    ),
]
|
|
|
|
|
|
def _find_run(tag: str) -> Path:
    """Locate the run directory under ``RUNS`` whose name ends with ``tag``.

    Only directories that contain a ``deploy_test.json`` are considered. When
    several match, the one whose path sorts last is returned.

    Raises:
        FileNotFoundError: if no matching directory has a ``deploy_test.json``.
    """
    matches = [
        run_dir
        for run_dir in RUNS.iterdir()
        if run_dir.name.endswith(tag) and (run_dir / "deploy_test.json").exists()
    ]
    if matches:
        # latest timestamp wins (presumably run names start with a sortable timestamp)
        return max(matches)
    raise FileNotFoundError(f"no run dir ending '{tag}' with a deploy_test.json")
|
|
|
|
|
|
def _l5(rows: list[dict], k: str) -> float:
|
|
v = [r[k] for r in rows[-5:]]
|
|
return sum(v) / len(v)
|
|
|
|
|
|
# ── stage 1: build the inspectable csv ──────────────────────────────────────
|
|
def build_csv() -> pl.DataFrame:
    """Stage 1: gather every arm and anchor into one table and write it to ``CSV``.

    Rows produced, in order:
      * one ``kind="method"`` row per entry of ``ARMS``: deploy hack/solve from the run's
        ``deploy_test.json``, plus last-5-eval means of the ``train_*`` / ``deploy_*`` columns
        of its ``eval_curve.jsonl`` (stored as ``*_on`` / ``*_off``);
      * one ``kind="anchor_floor"`` row from the ``_dir8_baseline_s43`` run;
      * one ``kind="anchor_ceiling"`` row from a ``*noloophole*`` run if one exists, otherwise
        the provisional ``PAPER_CEILING`` with a ``FIXME`` status.

    All rates are rounded to 4 decimals. ``OUT`` is created if missing.

    Returns:
        The table that was written to ``CSV``.

    Raises:
        FileNotFoundError: propagated from ``_find_run`` when a required run is missing.
    """
    rows = []
    for label, tag, status in ARMS:
        run = _find_run(tag)
        dep = json.loads((run / "deploy_test.json").read_text())
        # Skip blank lines so a stray empty line in the jsonl does not crash json.loads.
        ev = [
            json.loads(line)
            for line in (run / "eval_curve.jsonl").read_text().splitlines()
            if line.strip()
        ]
        rows.append(dict(
            label=label, kind="method",
            hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4),
            hack_on=round(_l5(ev, "train_hack"), 4), hack_off=round(_l5(ev, "deploy_hack"), 4),
            solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4),
            source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status))

    base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text())
    rows.append(dict(
        label="base (floor)", kind="anchor_floor",
        hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4),
        hack_on=None, hack_off=None, solve_on=None, solve_off=None,
        source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)"))

    # glob() order is not guaranteed, so sort and take the last match: this makes the
    # choice deterministic and mirrors _find_run's "last-sorting run wins" rule.
    ceil_candidates = sorted(RUNS.glob("*noloophole*/deploy_test.json"))
    if ceil_candidates:
        ceil_path = ceil_candidates[-1]
        ceil_solve, status = round(json.loads(ceil_path.read_text())["deploy_solve"], 4), "ok"
        source = f"{ceil_path.parent.name}/deploy_test.json"
    else:
        ceil_solve, status = PAPER_CEILING, "FIXME: PROVISIONAL paper 0.223 -- awaiting job 24 (no-loophole ceiling)"
        source = "Ariahw et al. 2025 (paper), NOT our run"
    rows.append(dict(
        label="ceiling", kind="anchor_ceiling",
        hack_deploy=0.0, solve_deploy=ceil_solve,
        hack_on=None, hack_off=None, solve_on=None, solve_off=None,
        source=source, status=status))

    df = pl.DataFrame(rows)
    OUT.mkdir(parents=True, exist_ok=True)
    df.write_csv(CSV)
    return df
|
|
|
|
|
|
# ── stage 2: plot from the csv ──────────────────────────────────────────────
|
|
# The reference paper (Ariahw et al. 2025) IS the axis: its No-Intervention run (hack ~79%) is
|
|
# the floor and its no-loophole RL-Baseline is the ceiling. So the comparison-to-paper is "how
|
|
# far up the paper's own floor->ceiling range did our no-cheat method climb." We do NOT plot the
|
|
# paper's intervention bars, for two different reasons (the disqualifier is oracle/ground-truth-
|
|
# LABEL leakage, NOT "a monitor ran"):
|
|
# - GT monitor (+70/90% variants) and the probe (trained on oracle-labelled in-env RH data,
|
|
# footnote 12) both need the env oracle to exist -- they cannot be built on a new env with no
|
|
# oracle, so they are cheats for our transfer claim.
|
|
# - LLM judge is the legitimate external peer (generic model, no oracle, ~50% acc yet protective
|
|
# via penalty) -- but it has no clean single fast-env number on our axis (paper figures only,
|
|
# different training regime), so we have no honest point to plot for it.
|
|
# - inoculation prompting (no monitor) has no clean number either (prose: incomplete, high-
|
|
# variance -- some seeds ~0 hack, some ~full hack).
|
|
# So: nothing with a comparable single number to plot; the paper enters only as floor/ceiling.
|
|
# Colours for the live arm (gold) and the direction-control arm (dark).
GOLD = "#c8920a"
DARK = "#3a3a3a"
|
|
|
|
|
|
def _anchors(df: pl.DataFrame) -> dict:
    """Extract the normalisation anchors from the stage-1 table.

    Returns a dict with:
      * ``base_solve``   -- ``solve_deploy`` of the ``anchor_floor`` row,
      * ``vanilla_hack`` -- ``hack_deploy`` of the row labelled ``"vanilla GRPO"``,
      * ``ceiling``      -- ``solve_deploy`` of the ``anchor_ceiling`` row,
      * ``provisional``  -- True when the ceiling row's status starts with ``"FIXME"``.
    """
    floor_rows = df.filter(pl.col("kind") == "anchor_floor")
    ceiling_rows = df.filter(pl.col("kind") == "anchor_ceiling")
    vanilla_rows = df.filter(pl.col("label") == "vanilla GRPO")

    is_provisional = ceiling_rows["status"][0].startswith("FIXME")
    return {
        "base_solve": floor_rows["solve_deploy"][0],
        "vanilla_hack": vanilla_rows["hack_deploy"][0],
        "ceiling": ceiling_rows["solve_deploy"][0],
        "provisional": is_provisional,
    }
|
|
|
|
|
|
def _bars(ax, rows, key, raws, title, xlabel, xlo):
    """Draw one normalized floor->ceiling panel of horizontal bars on ``ax``.

    ``rows`` is a sequence of ``(label, value, raw_text, colour)``; ``value`` is the
    normalized fraction (0 = floor, 1.0 = ceiling) and ``raw_text`` is appended in
    parentheses after the percentage when non-empty. The x-axis runs from ``xlo`` to 1.4.
    Percentages are printed with an explicit sign when ``xlo`` is negative.
    ``key`` and ``raws`` are accepted but not used by this function.
    """
    show_sign = xlo < 0
    for y, (_name, frac, raw, colour) in enumerate(rows):
        # grey = approx reference
        hatch = "//" if colour == GREY else None
        ax.barh(y, frac, height=0.55, color=colour, alpha=0.9,
                hatch=hatch, edgecolor="white")

        caption = f"{frac*100:+.0f}%" if show_sign else f"{frac*100:.0f}%"
        if raw:
            caption += f" ({raw})"

        # Put the caption just past the end of the bar, on whichever side the bar points.
        if frac >= 0:
            text_x, align = frac + 0.02, "left"
        else:
            text_x, align = frac - 0.02, "right"
        ax.text(text_x, y, caption, va="center", ha=align, fontsize=8.5, color=colour)

    ax.axvline(0, color=GREY, lw=1.0)  # floor (labelled in xlabel)
    ax.axvline(1.0, color=GREY, lw=1.0, ls=":")  # ceiling

    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels([row[0] for row in rows], fontsize=8.5)
    ax.set_xlim(xlo, 1.4)  # hi=1.4 keeps the ceiling line (1.0) in view
    ax.set_xlabel(xlabel, fontsize=8.5)
    ax.set_title(title, fontsize=10, loc="left")

    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)
    ax.tick_params(left=False)
|
|
|
|
|
|
def plot(df: pl.DataFrame) -> None:
    """Stage 2: draw the normalized floor->ceiling figure and save it as pdf and png.

    Left panel:  hack suppressed = (vanilla_hack - arm_hack) / vanilla_hack.
    Right panel: solve gained    = (arm_solve - base) / (ceiling - base).
    Three arms are drawn: vanilla GRPO, routeV random-V and routeV per-token.
    Output goes to ``OUT / "floor_ceiling.pdf"`` and ``OUT / "floor_ceiling.png"``.

    Raises:
        ValueError: if a normalisation range is empty (vanilla hack rate of 0, or ceiling
            equal to the base solve rate), because the fractions are then undefined.
    """
    a = _anchors(df)
    base, vh, ceil = a["base_solve"], a["vanilla_hack"], a["ceiling"]
    if vh == 0:
        raise ValueError("vanilla hack rate is 0: 'hack suppressed' cannot be normalized")
    if ceil == base:
        raise ValueError("ceiling equals base solve rate: 'solve gained' cannot be normalized")

    def pick(lab):
        return df.filter(pl.col("label") == lab).to_dicts()[0]

    def hsupp(r):
        return (vh - r["hack_deploy"]) / vh

    def suplift(r):
        return (r["solve_deploy"] - base) / (ceil - base)

    best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")

    # OURS ONLY -- no paper bars. The paper comparison is cross-scale/regime (their converged
    # full-env vs our 60-step fast surrogate) so it can only ever be directional; the paper
    # numbers live in the transcribed Fig-5 table in docs/papers/2025_lw_ariahw_*.md, not on
    # this axis.
    # vanilla is the floor anchor (defines vh, so its hack-suppression is 0 by construction);
    # random-V is the directionality control; per-token is the live arm.
    hack_rows = [
        ("vanilla GRPO\n(floor)", hsupp(van), f"{van['hack_deploy']:.3f}", RED),
        ("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
        ("routeV per-token\n(best)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
    ]
    solve_rows = [
        ("vanilla GRPO\n(floor)", suplift(van), f"{van['solve_deploy']:.3f}", RED),
        ("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
        ("routeV per-token\n(best)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
    ]

    # Label the axis with the anchors actually used, not hard-coded numbers, so the figure
    # cannot drift out of sync with the csv when a run is replaced.
    prov = f" (ceiling PROVISIONAL={ceil:.3f}, FIXME job 24)" if a["provisional"] else ""

    fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False)
    _bars(axl, hack_rows, "hack", None,
          "hack suppressed", "floor → ceiling (no hack) · right = better", 0.0)
    _bars(axr, solve_rows, "solve", None,
          "solve gained", f"floor (base {base:.3f}) → ceiling{prov} · right = better", -0.55)
    fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs vanilla GRPO (test n=119, seed 43, 60-step fast)",
                 fontsize=10.5, x=0.01, ha="left")
    fig.text(0.01, 0.015, "Our arms only, seed 43, 60-step fast (unconverged surrogate). hack suppressed = (vanilla_hack - arm_hack)/vanilla_hack; "
             "solve gained = (arm_solve - base)/(ceiling - base). Ariahw 2025 monitor numbers are cross-scale/regime and live in "
             "the transcribed Fig-5 table in docs/papers/2025_lw_ariahw_*.md, not on this axis.",
             fontsize=6.8, color=GREY, va="bottom")
    fig.tight_layout(rect=(0, 0.07, 1, 0.94))

    OUT.mkdir(parents=True, exist_ok=True)  # allow plot() to be called without build_csv()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure; pyplot otherwise keeps it alive
|
|
|
|
|
|
# ── stage 2b: absolute-scale variant (arrows + shaded floor/ceiling) ─────────
|
|
# Same three arms, but plotted on the RAW metric axis (not normalized to [0,1]) so the
|
|
# actual rates are legible. Both panels oriented "right = better": the solve axis is the
|
|
# raw solve rate; the hack axis is REVERSED (right = less hacking). Grey "bedrock" shades
|
|
# the worse-than-floor zone, blue "sky" shades the better-than-ceiling zone, and each arm
|
|
# is an arrow from the floor anchor to its value (length = distance climbed).
|
|
# Zone shading: blue "sky" = better than ceiling, grey "bedrock" = worse than floor.
SKY = "#cfe3ff"
BEDROCK = "#d9dadb"
|
|
|
|
|
|
def _arrow_panel(ax, anchor, ceiling, rows, *, reversed_x, xlim, floor_lab, ceil_lab, xlabel, title):
    """Draw one raw-rate panel: an arrow per arm from the floor anchor to its value.

    Args:
        ax: matplotlib axes to draw on.
        anchor: x position of the floor (arrows start here).
        ceiling: x position of the ceiling line.
        rows: sequence of ``(label, value, colour)``, drawn bottom to top.
        reversed_x: True when the axis is inverted (lower value = better = further right).
        xlim: ``(left_edge, right_edge)`` in data units; left > right when ``reversed_x``.
        floor_lab, ceil_lab: captions placed under the floor and ceiling lines.
        xlabel, title: axis label and panel title.
    """
    lo, hi = xlim  # lo=left edge, hi=right edge (lo>hi when reversed_x)

    # bedrock = worse-than-floor, sky = better-than-ceiling. The spans are expressed via the
    # left/right edges, so the same two calls are correct for both axis orientations.
    ax.axvspan(lo, anchor, color=BEDROCK, alpha=0.7, lw=0)
    ax.axvspan(ceiling, hi, color=SKY, alpha=0.7, lw=0)
    ax.axvline(anchor, color=GREY, lw=1.2)
    ax.axvline(ceiling, color="#3b5bdb", lw=1.2, ls=":")

    pad = abs(hi - lo) * 0.02
    # Data-space step that moves a point to the RIGHT on screen. On an inverted axis that is
    # a negative step; ignoring this pushes the label back onto the arrowhead.
    step_right = -pad if reversed_x else pad

    for yi, (lab, val, col) in enumerate(rows):
        ax.annotate("", xy=(val, yi), xytext=(anchor, yi),
                    arrowprops=dict(arrowstyle="-|>", color=col, lw=2.6, shrinkA=0, shrinkB=0))
        ax.plot([anchor], [yi], "o", color=GREY, ms=4, zorder=3)

        # Does the arrow point in the 'better' (screen-right) direction?
        better_right = (val < anchor) if reversed_x else (val > anchor)
        if better_right:
            text_x, ha = val + step_right, "left"   # label sits just right of the tip
        else:
            text_x, ha = val - step_right, "right"  # label sits just left of the tip
        ax.text(text_x, yi, f"{val:.3f}",
                va="center", ha=ha, fontsize=9, color=col, fontweight="bold")

    ax.set_xlim(lo, hi)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels([r[0] for r in rows], fontsize=8.5)
    ax.set_ylim(-0.6, len(rows) - 0.4)
    ax.set_xlabel(xlabel, fontsize=8.5)
    ax.set_title(title, fontsize=10, loc="left")
    ax.text(anchor, -0.55, floor_lab, fontsize=7.5, color=GREY, ha="center", va="bottom")
    ax.text(ceiling, -0.55, ceil_lab, fontsize=7.5, color="#3b5bdb", ha="center", va="bottom")
    for s in ("top", "right", "left"):
        ax.spines[s].set_visible(False)
    ax.tick_params(left=False)
|
|
|
|
|
|
def plot_abs(df: pl.DataFrame) -> None:
    """Stage 2b: draw the raw-rate (un-normalized) arrow figure and save it as pdf and png.

    Left panel: hack rate on a reversed axis (right = less hacking), floor = vanilla hack
    rate, ceiling = 0. Right panel: solve rate, floor = base solve, ceiling = the ceiling
    anchor. The same three arms as ``plot`` are drawn.
    Output goes to ``OUT / "floor_ceiling_abs.pdf"`` and ``OUT / "floor_ceiling_abs.png"``.
    """
    a = _anchors(df)
    base, vh, ceil = a["base_solve"], a["vanilla_hack"], a["ceiling"]

    def pick(lab):
        return df.filter(pl.col("label") == lab).to_dicts()[0]

    best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")

    # bottom -> top: vanilla, random-V, per-token
    hack_rows = [("vanilla GRPO", van["hack_deploy"], RED),
                 ("routeV random-V", rand["hack_deploy"], DARK),
                 ("routeV per-token", best["hack_deploy"], GOLD)]
    solve_rows = [("vanilla GRPO", van["solve_deploy"], RED),
                  ("routeV random-V", rand["solve_deploy"], DARK),
                  ("routeV per-token", best["solve_deploy"], GOLD)]

    # Widen the axes to include every plotted value. An arrow whose tip lies outside the
    # axes is clipped by matplotlib, so an arm worse than the floor (or past the ceiling)
    # would otherwise vanish. With all values inside [floor, ceiling] these reduce to the
    # fixed limits (vh + 0.05, -0.03) and (base - 0.03, ceil + 0.03).
    hack_vals = [v for _, v, _ in hack_rows]
    solve_vals = [v for _, v, _ in solve_rows]
    hack_xlim = (max(vh, *hack_vals) + 0.05, min(0.0, *hack_vals) - 0.03)
    solve_xlim = (min(base, *solve_vals) - 0.03, max(ceil, *solve_vals) + 0.03)

    prov = " PROVISIONAL" if a["provisional"] else ""
    fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 4.2), sharey=True)
    _arrow_panel(axl, anchor=vh, ceiling=0.0, rows=hack_rows, reversed_x=True,
                 xlim=hack_xlim, floor_lab=f"floor\n(vanilla {vh:.2f})", ceil_lab="ceiling\n(no hack)",
                 xlabel="hack rate · axis reversed: right = less hacking = better", title="hacking (raw rate)")
    _arrow_panel(axr, anchor=base, ceiling=ceil, rows=solve_rows, reversed_x=False,
                 xlim=solve_xlim, floor_lab=f"floor\n(base {base:.2f})", ceil_lab=f"ceiling\n({ceil:.2f}{prov})",
                 xlabel="solve rate · right = more solving = better", title="solving (raw rate)")
    fig.suptitle("vGROUT raw rates: arrow = climb from floor; grey = bedrock (worse than floor), blue = sky (past ceiling) (test n=119, seed 43, 60-step fast)",
                 fontsize=10, x=0.01, ha="left")
    fig.tight_layout(rect=(0, 0, 1, 0.93))

    OUT.mkdir(parents=True, exist_ok=True)  # allow plot_abs() to be called without build_csv()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling_abs.{ext}", dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure; pyplot otherwise keeps it alive
|
|
|
|
|
|
def main() -> None:
    """Run both stages: build the csv, list rows whose status is not "ok", draw both figures."""
    table = build_csv()
    pending = table.filter(~pl.col("status").str.starts_with("ok"))
    print(f"wrote {CSV}")

    if len(pending) > 0:
        print("\n=== TODO/FIXME in data ===")
        for row in pending.to_dicts():
            print(f" [{row['label']}] {row['status']}")

    for render in (plot, plot_abs):
        render(table)
    print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ floor_ceiling_abs.pdf/.png)")


# Script entry point (e.g. `uv run python -m scripts.plot_floor_ceiling`).
if __name__ == "__main__":
    main()
|