mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
fix floor_ceiling asymmetry: paper methods on BOTH panels
Previously the Ariahw bars appeared on the hack panel only, which was misleading. Mirror them onto the solve panel (Fig 5 performance: GT 25.0%, probe 19.4%, LLM-judge 16.2%, base 11.5%, ceiling 22.3%). Honest picture: the paper methods (including the no-oracle LLM judge) beat routeV on both axes because they are converged, full-scale runs, whereas ours is a 60-step surrogate -- the caption marks the comparison as directional-only. The cross-scale/maturity caveat (task #18) still stands. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 142 KiB After Width: | Height: | Size: 153 KiB |
@@ -144,7 +144,7 @@ def _bars(ax, rows, key, raws, title, xlabel, xlo):
|
||||
ax.axvline(0, color=GREY, lw=1.0) # floor (labelled in xlabel)
|
||||
ax.axvline(1.0, color=GREY, lw=1.0, ls=":") # ceiling
|
||||
ax.set_yticks(range(len(rows))); ax.set_yticklabels([r[0] for r in rows], fontsize=8.5)
|
||||
ax.set_xlim(xlo, 1.18); ax.set_xlabel(xlabel, fontsize=8.5)
|
||||
ax.set_xlim(xlo, 1.4); ax.set_xlabel(xlabel, fontsize=8.5) # hi=1.4 fits GT solve overshoot (+125%)
|
||||
ax.set_title(title, fontsize=10, loc="left")
|
||||
for s in ("top", "right", "left"):
|
||||
ax.spines[s].set_visible(False)
|
||||
@@ -178,22 +178,30 @@ def plot(df: pl.DataFrame) -> None:
|
||||
("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best, NO oracle)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
]
|
||||
# SAME methods on the solve panel (symmetry -- the paper bars belong on both axes).
|
||||
# Paper performance from Fig 5, normalized to the PAPER's own base->ceiling
|
||||
# (11.5% -> 22.3%): GT penalty 25.0% (overshoots ceiling -- it beat the RL baseline),
|
||||
# probe 19.4%, LLM-judge penalty 16.2%. routeV uses our base->ceiling.
|
||||
puplift = lambda perf: (perf - 0.115) / (0.223 - 0.115)
|
||||
solve_rows = [
|
||||
("Ariahw GT penalty\n(needs oracle)", puplift(0.250), "0.250", GREY),
|
||||
("Ariahw probe\n(oracle-trained)", puplift(0.194), "0.194", GREY),
|
||||
("Ariahw LLM-judge pen.\n(NO oracle, ext. peer)", puplift(0.162), "0.162", BLUE),
|
||||
("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best, no oracle)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
("routeV per-token\n(best, NO oracle)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
]
|
||||
prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else ""
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11, 4.0), sharey=False)
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False)
|
||||
_bars(axl, hack_rows, "hack", None,
|
||||
"hack suppressed", "floor → ceiling (no hack) · right = better", 0.0)
|
||||
_bars(axr, solve_rows, "solve", None,
|
||||
"solve gained", f"floor (base 0.126) → ceiling{prov} · right = better", -0.55)
|
||||
fig.suptitle("vGROUT floor→ceiling: routeV (no oracle, gradient-level) vs Ariahw 2025 monitors (test n=119, seed 43, 60-step fast)",
|
||||
fontsize=10.5, x=0.01, ha="left")
|
||||
fig.text(0.01, 0.015, "Ariahw bars from Fig 9 (full-scale converged, paper floor 0.791); routeV is our 60-step surrogate (floor 0.613). "
|
||||
"The LLM-judge penalty already suppresses hacking with NO oracle (0.1%), so 'no-oracle suppression' is not the novelty -- "
|
||||
"routeV's is the mechanism (no live judge model each step; fixed direction from authored pairs).",
|
||||
fontsize=7.0, color=GREY, va="bottom")
|
||||
fig.text(0.01, 0.015, "Ariahw bars from Fig 5 (full-scale CONVERGED, normalized to paper base/floor/ceiling); routeV is our 60-step UNCONVERGED surrogate "
|
||||
"(our base/floor/ceiling) -- comparison is DIRECTIONAL only, not like-for-like. The LLM-judge penalty already suppresses with NO oracle (0.1% hack, 16.2% solve), "
|
||||
"so 'no-oracle suppression' isn't routeV's novelty -- the mechanism is (no live judge each step; fixed authored-pair direction).",
|
||||
fontsize=6.8, color=GREY, va="bottom")
|
||||
fig.tight_layout(rect=(0, 0.07, 1, 0.94))
|
||||
for ext in ("pdf", "png"):
|
||||
fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
|
||||
|
||||
Reference in New Issue
Block a user