mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 19:47:33 +08:00
viz: reference = Ariahw paper (oracle upper bound), not SGTM
Swap the floor->ceiling reference to the substrate paper (Ariahw et al. 2025), which benchmarks interventions on the same floor (No-Intervention, hack ~79%) and ceiling (RL-Baseline, no loophole). Their best arm (Ground-Truth Penalty, ~0% hack, perf >= ceiling) reaches the top corner BUT uses the oracle monitor at train time -- the exact cheat our no-cheat constraint forbids; their only oracle-free method (inoculation) gave incomplete, high-variance mitigation. The Ground-Truth Penalty point is plotted hatched/grey as an ORACLE upper bound (its solve value is approximate: their figures are images, and their 200-step preset is not step-matched to ours). Honest framing: their working methods need the oracle; ours uses no detector at train time and still suppresses 93% of hacking. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -106,11 +106,15 @@ def build_csv() -> pl.DataFrame:
|
||||
|
||||
|
||||
# ── stage 2: plot from the csv ──────────────────────────────────────────────
|
||||
# Reference: the gradient-routing paper (SGTM, Mhaskar et al. 2025) reports its result as a
|
||||
# retain/forget trade-off vs a "perfect filter" oracle (= our ceiling) and "no filter" (= our
|
||||
# floor). Placed on the SAME floor->ceiling % axis (approximate; LM-unlearning task, not RL):
|
||||
# forget suppression ~leakage 0.02 -> ~98%; retain ~5% compute penalty -> ~95% of oracle.
|
||||
SGTM_REF = dict(label="SGTM grad-routing\n(LM unlearn, ~approx)", hack_supp=0.98, solve_uplift=0.95)
|
||||
# Reference: Ariahw et al. 2025 (the substrate paper) benchmark interventions on the SAME
|
||||
# floor/ceiling -- No-Intervention (hack ~79%) = floor, RL-Baseline/no-loophole = ceiling. Their
|
||||
# best interventions (Ground-Truth Penalty ~0% hack, perf >= ceiling) reach the top corner BUT
|
||||
# use the oracle monitor at train time -- the exact cheat our no-cheat constraint forbids. Their
|
||||
# only oracle-free method (inoculation) gave incomplete, high-variance mitigation. We plot the
|
||||
# GT-monitor point as a clearly-marked ORACLE upper bound (solve approx; figures are images, and
|
||||
# their 200-step preset is not step-matched to our 60-step fast). hack_supp ~1.0 (hack ~0%),
|
||||
# solve_uplift ~1.0 (perf at/above ceiling).
|
||||
ARIAHW_REF = dict(label="Ariahw GT-monitor\n(uses ORACLE -- cheat)", hack_supp=0.99, solve_uplift=1.0)
|
||||
GOLD, DARK = "#c8920a", "#3a3a3a"
|
||||
|
||||
|
||||
@@ -127,7 +131,7 @@ def _bars(ax, rows, key, raws, title, xlabel, xlo):
|
||||
"""One floor->ceiling panel: horizontal bars in [xlo,1], 0=floor, 1.0=ceiling."""
|
||||
for yi, (lab, val, raw, col) in enumerate(rows):
|
||||
ax.barh(yi, val, height=0.55, color=col, alpha=0.9,
|
||||
hatch="//" if "approx" in lab else None, edgecolor="white")
|
||||
hatch="//" if col == GREY else None, edgecolor="white") # grey = approx reference
|
||||
tip = f"{val*100:+.0f}%" if xlo < 0 else f"{val*100:.0f}%"
|
||||
rawtxt = f" ({raw})" if raw else ""
|
||||
ax.text(val + (0.02 if val >= 0 else -0.02), yi, tip + rawtxt,
|
||||
@@ -151,16 +155,16 @@ def plot(df: pl.DataFrame) -> None:
|
||||
def hsupp(r): return (vh - r["hack_deploy"]) / vh
|
||||
def suplift(r): return (r["solve_deploy"] - base) / (ceil - base)
|
||||
|
||||
# rows: best (gold), random control (dark), SGTM reference (grey, hatched). Top row plots last.
|
||||
# rows: best (gold), random control (dark), Ariahw oracle reference (grey, hatched). Top row plots last.
|
||||
hack_rows = [
|
||||
(SGTM_REF["label"], SGTM_REF["hack_supp"], "~0.98 supp", GREY),
|
||||
(ARIAHW_REF["label"], ARIAHW_REF["hack_supp"], "hack ~0%", GREY),
|
||||
("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
("routeV per-token\n(best, no oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
]
|
||||
solve_rows = [
|
||||
(SGTM_REF["label"], SGTM_REF["solve_uplift"], "~oracle", GREY),
|
||||
(ARIAHW_REF["label"], ARIAHW_REF["solve_uplift"], "~>=ceiling", GREY),
|
||||
("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
("routeV per-token\n(best, no oracle)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
]
|
||||
prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else ""
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 3.2), sharey=True)
|
||||
|
||||
Reference in New Issue
Block a user