mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 19:47:33 +08:00
transcribe Ariahw Fig 5 to a saved table; plot real no-oracle peer (LLM judge)
Read the figure PNGs directly (Fig 5 is a full numeric table that the paper never prints as text). Saved the transcription to docs/papers/ariahw_results_table_extracted.md so we stop re-OCRing. Key correction: my earlier claim that the 'LLM judge has no clean rate' was wrong -- the LLM-judge PENALTY run reaches 0.1% hack / 16.2% perf with NO oracle. So no-oracle suppression is not routeV's novelty (the judge already does it); the novelty is the mechanism (no live monitor, gradient-level, fixed authored-pair direction). The plot now shows the judge as a blue no-oracle peer bar alongside the grey oracle methods. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -160,24 +160,41 @@ def plot(df: pl.DataFrame) -> None:
|
||||
def hsupp(r): return (vh - r["hack_deploy"]) / vh
|
||||
def suplift(r): return (r["solve_deploy"] - base) / (ceil - base)
|
||||
|
||||
# rows: best (gold) vs direction-control (dark). Floor/ceiling = the Ariahw paper's anchors.
|
||||
# Ariahw 2025 interventions on the hack panel, normalized to the PAPER's own floor
|
||||
# (No-Intervention 0.791 hack, Fig 9). Read off Fig 9 deploy hack rate:
|
||||
# GT penalty 0.001 (needs oracle), probe penalty 0.000 (oracle-trained labels),
|
||||
# LLM-judge PENALTY 0.001 -- NO oracle, ~full suppression, perf 16.2% (the honest peer:
|
||||
# a no-oracle method ALREADY suppresses hacking; our novelty is the MECHANISM, not that).
|
||||
# LLM-judge SCREENING 0.157 is the weaker no-oracle point (2/3 runs broke out).
|
||||
# Scale caveat: paper bars are full-scale converged (paper floor 0.791); routeV is our
|
||||
# 60-step fast surrogate (floor 0.613). Suppression FRACTION is comparable; regimes differ.
|
||||
BLUE = "#3b5bdb" # no-oracle EXTERNAL peer (not a cheat -> not grey)
|
||||
PAPER_FLOOR = 0.791
|
||||
psupp = lambda hack: (PAPER_FLOOR - hack) / PAPER_FLOOR
|
||||
hack_rows = [
|
||||
("Ariahw GT penalty\n(needs oracle)", psupp(0.001), "0.001", GREY),
|
||||
("Ariahw probe\n(oracle-trained)", psupp(0.0), "0.000", GREY),
|
||||
("Ariahw LLM-judge pen.\n(NO oracle, ext. peer)", psupp(0.001), "0.001", BLUE),
|
||||
("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best, no oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
("routeV per-token\n(best, NO oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
]
|
||||
solve_rows = [
|
||||
("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best, no oracle)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
]
|
||||
prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else ""
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 3.2), sharey=True)
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 4.0), sharey=False)
|
||||
_bars(axl, hack_rows, "hack", None,
|
||||
"hack suppressed", "floor (vanilla 0.613) → ceiling (no hack) · right = better", 0.0)
|
||||
"hack suppressed", "floor → ceiling (no hack) · right = better", 0.0)
|
||||
_bars(axr, solve_rows, "solve", None,
|
||||
"solve gained", f"floor (base 0.126) → ceiling (no-loophole){prov} · right = better", -0.4)
|
||||
fig.suptitle("vGROUT floor→ceiling: best vs direction-control (floor/ceiling = Ariahw paper anchors; test n=119, seed 43, 60-step fast)",
|
||||
"solve gained", f"floor (base 0.126) → ceiling{prov} · right = better", -0.55)
|
||||
fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs Ariahw 2025 monitors (test n=119, seed 43, 60-step fast)",
|
||||
fontsize=10.5, x=0.01, ha="left")
|
||||
fig.tight_layout(rect=(0, 0, 1, 0.94))
|
||||
fig.text(0.01, 0.015, "Ariahw bars from Fig 9 (full-scale converged, paper floor 0.791); routeV is our 60-step surrogate (floor 0.613). "
|
||||
"The LLM-judge penalty already suppresses hacking with NO oracle (0.1%), so 'no-oracle suppression' is not the novelty -- "
|
||||
"routeV's is the mechanism (no live judge model each step; fixed direction from authored pairs).",
|
||||
fontsize=7.0, color=GREY, va="bottom")
|
||||
fig.tight_layout(rect=(0, 0.07, 1, 0.94))
|
||||
for ext in ("pdf", "png"):
|
||||
fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user