diff --git a/docs/papers/ariahw_results_table_extracted.md b/docs/papers/ariahw_results_table_extracted.md new file mode 100644 index 0000000..433c7e3 --- /dev/null +++ b/docs/papers/ariahw_results_table_extracted.md @@ -0,0 +1,61 @@ +# Ariahw et al. 2025 -- results table (transcribed from the figures) + +The paper publishes results as **figures only, no numeric table**, so every number +we want lives in an image. This file transcribes them once so we (and our plots) +never re-OCR. Each cell was read off the source figure named in the `source img:` provenance line. + +## *Steering RL Training: Benchmarking Interventions against Reward Hacking* -- Ariahw, Engels & Nanda 2025 -- [LessWrong](https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against) +- epistemic context: the substrate paper. Numbers below transcribed by reading the + figure PNGs directly (downloaded from the post's cloudinary mirror) on 2026-06-09. +- metric defs: **Reward Hacking** = fraction of eval rollouts flagged as RH in the + loophole env (our `hack`). **Performance** = pass rate in the no-loophole env (our + `solve`). `*` = significantly higher than RL Baseline, `†` = significantly lower + (α=0.01). `±SD` is across-seed spread (n=3 runs/cell). 
+ +### Master table (Figure 5 -- "Overview of reward hacking and performance for all interventions") +source img: `mirroredImages/R5MdWGKsuvdPwGFBG/imeotdksvqyy8y8twbbq` (Fig 5) + +| Intervention | Monitor/Detail | Reward Hacking | ±SD | Performance | ±SD | +| :--- | :--- | ---: | ---: | ---: | ---: | +| No Intervention | Base Model | 0.0% | -- | 11.5% | -- | +| No Intervention | No RH (**RL Baseline = ceiling**) | 0.2% | ±0.2 | **22.3%** | ±1.0 | +| No Intervention | RH (**No Intervention = floor**) | **79.1%** `*` | ±10.3 | 14.9% `†` | ±8.2 | +| Penalty | Ground Truth 100% | 0.1% | ±0.1 | 25.0% `*` | ±3.3 | +| Penalty | Ground Truth 90% | 15.2% `*` | ±26.1 | 22.4% | ±1.3 | +| Penalty | Ground Truth 70% | 0.0% | ±0.0 | 17.5% `†` | ±2.8 | +| Penalty | Probe | 0.0% | ±0.0 | 19.4% | ±0.8 | +| Penalty | **LLM Judge** | **0.1%** | ±0.1 | **16.2%** `†` | ±4.9 | +| Screening | Ground Truth 100% | 0.9% `*` | ±0.6 | 26.3% `*` | ±3.9 | +| Screening | Ground Truth 90% | 7.5% `*` | ±11.8 | 23.3% | ±1.4 | +| Screening | Ground Truth 70% | 19.6% `*` | ±34.0 | 21.9% | ±1.6 | +| Screening | Probe | 17.8% `*` | ±28.0 | 18.4% `†` | ±0.8 | +| Screening | LLM Judge | 15.7% `*` | ±23.6 | 16.3% `†` | ±3.6 | +| Inoculation Prompting | Loophole | 38.3% `*` | ±28.9 | 18.3% `†` | ±2.0 | +| Inoculation Prompting | PassTests | 43.9% `*` | ±37.6 | 21.4% | ±2.3 | +| Inoculation Prompting | PassTests+LH | 26.4% `*` | ±27.6 | 14.3% `†` | ±2.1 | +| Inoculation Prompting | EvalEnv | 36.5% `*` | ±20.2 | 18.9% `†` | ±3.7 | +| Inoculation Prompting | EvalEnv+LH | 47.0% `*` | ±12.5 | 17.0% `†` | ±3.2 | + +### What this means for our no-cheat framing + +Group the rows by what each method NEEDS (the load-bearing axis, see AGENTS.md): + +- **Needs the env oracle:** Ground Truth (penalty 100% -> 0.1% hack, perf 25.0%) and its + 70/90% accuracy-lowered variants. Unavailable on a new env. +- **Needs oracle-trained labels:** Probe (penalty -> 0.0% hack, perf 19.4%). 
Built from + oracle-labelled in-env RH data; can't be constructed on a new env either. +- **No oracle, runs a live LLM monitor:** LLM Judge. **Penalty -> 0.1% hack, perf 16.2%** + (above base 11.5%, below ceiling 22.3%). Screening is weaker (15.7%, 2/3 runs broke out). + This is the honest external peer: a no-oracle method that ALREADY suppresses hacking. +- **No monitor at all (system prompt only):** Inoculation Prompting -- the only fully + detector-free family, and it largely **fails** (26-47% hack across variants). + +Takeaway for routeV: "a no-oracle method that suppresses hacking" is **not novel** -- the +LLM-judge penalty does it (0.1% hack). routeV's claim has to be the MECHANISM: no live +LLM monitor in the loop each step, gradient-level, direction from fixed hand-authored pairs +(one offline judge-equivalent), not a per-rollout model call. And note the judge-penalty +solve (16.2%) is itself well below the ceiling (22.3%) -- the no-oracle methods all pay a +solve tax, which is the axis worth competing on. + +(Other figures -- 6 GT, 7 GT-lowered, 8 probe, 9 judge -- are per-monitor visualisations of +these same Fig-5 numbers; Fig 5 is the canonical source.) diff --git a/out/figs/floor_ceiling.pdf b/out/figs/floor_ceiling.pdf index c4403e1..3ccdd75 100644 Binary files a/out/figs/floor_ceiling.pdf and b/out/figs/floor_ceiling.pdf differ diff --git a/out/figs/floor_ceiling.png b/out/figs/floor_ceiling.png index d0565cb..c220473 100644 Binary files a/out/figs/floor_ceiling.png and b/out/figs/floor_ceiling.png differ diff --git a/scripts/plot_floor_ceiling.py b/scripts/plot_floor_ceiling.py index f1c4bb7..dd8883b 100644 --- a/scripts/plot_floor_ceiling.py +++ b/scripts/plot_floor_ceiling.py @@ -160,24 +160,41 @@ def plot(df: pl.DataFrame) -> None: def hsupp(r): return (vh - r["hack_deploy"]) / vh def suplift(r): return (r["solve_deploy"] - base) / (ceil - base) - # rows: best (gold) vs direction-control (dark). Floor/ceiling = the Ariahw paper's anchors. 
+ # Ariahw 2025 interventions on the hack panel, normalized to the PAPER's own floor + # (No-Intervention 0.791 hack, Fig 9). Read off Fig 9 deploy hack rate: + # GT penalty ~0.0 (needs oracle), probe penalty 0.0 (oracle-trained labels), + # LLM-judge PENALTY 0.001 -- NO oracle, ~full suppression, perf 16.2% (the honest peer: + # a no-oracle method ALREADY suppresses hacking; our novelty is the MECHANISM, not that). + # LLM-judge SCREENING 0.157 is the weaker no-oracle point (2/3 runs broke out). + # Scale caveat: paper bars are full-scale converged (paper floor 0.791); routeV is our + # 60-step fast surrogate (floor 0.613). Suppression FRACTION is comparable; regimes differ. + BLUE = "#3b5bdb" # no-oracle EXTERNAL peer (not a cheat -> not grey) + PAPER_FLOOR = 0.791 + psupp = lambda hack: (PAPER_FLOOR - hack) / PAPER_FLOOR hack_rows = [ + ("Ariahw GT penalty\n(needs oracle)", psupp(0.001), "0.001", GREY), + ("Ariahw probe\n(oracle-trained)", psupp(0.0), "0.000", GREY), + ("Ariahw LLM-judge pen.\n(NO oracle, ext. 
peer)", psupp(0.001), "0.001", BLUE), ("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK), - ("routeV per-token\n(best, no oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD), + ("routeV per-token\n(best, NO oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD), ] solve_rows = [ ("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK), ("routeV per-token\n(best, no oracle)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD), ] prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else "" - fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 3.2), sharey=True) + fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 4.0), sharey=False) _bars(axl, hack_rows, "hack", None, - "hack suppressed", "floor (vanilla 0.613) → ceiling (no hack) · right = better", 0.0) + "hack suppressed", "floor → ceiling (no hack) · right = better", 0.0) _bars(axr, solve_rows, "solve", None, - "solve gained", f"floor (base 0.126) → ceiling (no-loophole){prov} · right = better", -0.4) - fig.suptitle("vGROUT floor→ceiling: best vs direction-control (floor/ceiling = Ariahw paper anchors; test n=119, seed 43, 60-step fast)", + "solve gained", f"floor (base 0.126) → ceiling{prov} · right = better", -0.55) + fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs Ariahw 2025 monitors (test n=119, seed 43, 60-step fast)", fontsize=10.5, x=0.01, ha="left") - fig.tight_layout(rect=(0, 0, 1, 0.94)) + fig.text(0.01, 0.015, "Ariahw bars from Fig 9 (full-scale converged, paper floor 0.791); routeV is our 60-step surrogate (floor 0.613). 
" + "The LLM-judge penalty already suppresses hacking with NO oracle (0.1%), so 'no-oracle suppression' is not the novelty -- " + "routeV's is the mechanism (no live judge model each step; fixed direction from authored pairs).", + fontsize=7.0, color=GREY, va="bottom") + fig.tight_layout(rect=(0, 0.07, 1, 0.94)) for ext in ("pdf", "png"): fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")