mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
plot: deploy Pareto (dots, ideal star, more arms) + honest val knob before/after
- floor_ceiling_abs.png: clean deploy Pareto. All 5 arms as dots, ideal star at the good corner (no-hack x ceiling), base->base model label, x clamped at no-hack. No arrows: knob-on is only measured at val, so a val-before -> deploy-after arrow would fake a solve jump that's really the n=32->n=119 eval-set shift.
- floor_ceiling_knob.png: the real before->after on ONE eval (val n=32). Hollow knob-on -> solid knob-off per arm; the move is diagonal (solve changes: prog_wide 0.069->0.056, authored 0.056->0.044), not the horizontal move I wrongly forced earlier.
- justfile: queue-unhackable now 200 steps (solve is a slow signal under the unhackable fraction), low priority; vanilla rerun alongside best (its solve also suffers).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -193,51 +193,55 @@ def plot(df: pl.DataFrame) -> None:
|
||||
|
||||
|
||||
# ── stage 2b: the two metrics as ONE scatter (Tufte: don't split a 2-var story) ──
|
||||
# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving).
|
||||
# Each routeV arm gets a green effect-arrow FROM the vanilla baseline -> shows what the
|
||||
# intervention DID (mechanism), not just where it landed. The achievable solve band
|
||||
# (base..ceiling) is a faint range-frame; ticks sit only at the meaningful values
|
||||
# (no hack / vanilla / base / ceiling) so the axes teach the scale instead of generic grid.
|
||||
# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving), marked
|
||||
# "ideal". The achievable solve band (base..ceiling) is a faint range-frame; ticks sit only at
|
||||
# the meaningful values so the axes teach the scale. Two views:
|
||||
# plot_scatter -> DEPLOY (knob-off, test n=119): where each arm LANDS. Pareto of arms.
|
||||
# plot_knob -> the quarantine before/after (knob-on -> knob-off, val n=32): per arm, a
|
||||
# hollow "before" dot (deployed-as-trained, hacky) -> solid "after" dot.
|
||||
# They use DIFFERENT eval sets on purpose: deploy n=119 only measures knob-off, so before/after
|
||||
# can only come from the val on/off curve -- never share one y-axis (val solve ~2x lower).
|
||||
GREEN_ARROW = "#1e8449"
BLUE = "#3b5bdb"

# Arm label -> marker colour, one entry per arm.
# GOLD = best real-V arm, DARK = random control, RED = no-intervention baseline.
ARM_COLOR = {
    "routeV per-token": GOLD,
    "routeV authored": "#0e8a8a",
    "routeV prog_wide": "#8e44ad",
    "routeV random-V": DARK,
    "vanilla GRPO": RED,
}
|
||||
|
||||
|
||||
def _methods(df: pl.DataFrame) -> list[dict]:
    """Return the rows of *df* whose ``kind`` column equals ``"method"``, one dict per row."""
    is_method = pl.col("kind") == "method"
    method_rows = df.filter(is_method)
    return method_rows.to_dicts()
|
||||
|
||||
|
||||
def plot_scatter(df: pl.DataFrame) -> None:
|
||||
a = _anchors(df)
|
||||
base, ceil = a["base_solve"], a["ceiling"]
|
||||
pick = lambda lab: df.filter(pl.col("label") == lab).to_dicts()[0]
|
||||
best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")
|
||||
H = lambda r: r["hack_deploy"]; S = lambda r: r["solve_deploy"]
|
||||
prov = "*" if a["provisional"] else ""
|
||||
|
||||
BLUE = "#3b5bdb"
|
||||
fig, ax = plt.subplots(figsize=(7.2, 5.4))
|
||||
# achievable solve band (base -> ceiling): faint, recedes behind the data
|
||||
ax.axhspan(base, ceil, color="#eef3ff", zorder=0)
|
||||
ax.axhspan(base, ceil, color="#eef3ff", zorder=0) # achievable solve band
|
||||
ax.axhline(base, color=GREY, lw=0.8); ax.axhline(ceil, color=BLUE, lw=0.8, ls=":")
|
||||
ax.axvline(0.0, color=GREY, lw=0.8)
|
||||
# effect arrows: vanilla baseline -> each routeV arm (green = moves toward the good corner)
|
||||
for arm in (rand, best):
|
||||
ax.annotate("", xy=(H(arm), S(arm)), xytext=(H(van), S(van)),
|
||||
arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=2.0, alpha=0.85,
|
||||
shrinkA=7, shrinkB=9))
|
||||
# points + direct labels (name only -- the position already shows the rates; labelling
|
||||
# the amounts too would double-encode. offsets keep each clear of the arrows/each other)
|
||||
pts = [("vanilla GRPO", van, RED, (10, -13), "left"),
|
||||
("routeV random-V", rand, DARK, (12, -2), "left"),
|
||||
("routeV per-token", best, GOLD, (12, 6), "left")]
|
||||
for name, r, col, (dx, dy), ha in pts:
|
||||
# "ideal" = the good corner (no hack, ceiling solve). Nudged inside the no-hack edge so the
|
||||
# marker isn't half-clipped; label sits to its LEFT (no room to the right of no-hack).
|
||||
ax.plot(0.012, ceil, marker="*", ms=15, color=BLUE, zorder=6, clip_on=False)
|
||||
ax.annotate("ideal", (0.012, ceil), textcoords="offset points", xytext=(-8, 2),
|
||||
ha="right", va="center", fontsize=9, color=BLUE, style="italic")
|
||||
# Deploy (knob-off, n=119) is where each arm LANDS -> a pure Pareto of dots. No before->after
|
||||
# arrows here: the honest knob-on->off move changes BOTH hack and solve, but knob-on is only
|
||||
# measured at val (n=32), so drawing it against the deploy y-axis would fake a solve jump that
|
||||
# is really the eval-set shift. The real 2-D before->after lives in plot_knob (val on/off).
|
||||
for r in _methods(df):
|
||||
col = ARM_COLOR.get(r["label"], GREY)
|
||||
ax.plot(H(r), S(r), "o", color=col, ms=11, zorder=5, mec="white", mew=1.2)
|
||||
ax.annotate(name, (H(r), S(r)), textcoords="offset points", xytext=(dx, dy),
|
||||
ha=ha, va="center", fontsize=9, color=col, fontweight="bold")
|
||||
# "better" shown, not told: a small diagonal in the empty top-left, pointing at the good corner
|
||||
ax.annotate("", xy=(0.46, ceil - 0.004), xytext=(0.62, ceil - 0.030),
|
||||
arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=1.4, alpha=0.55))
|
||||
ax.text(0.63, ceil - 0.034, "better", fontsize=9, color=GREEN_ARROW, style="italic", ha="left", va="top")
|
||||
# range-frame: ticks only at meaningful values
|
||||
ax.set_xlim(0.66, -0.03) # reversed: high hack left, 0 right
|
||||
ax.set_ylim(base - 0.035, ceil + 0.02)
|
||||
prov = "*" if a["provisional"] else ""
|
||||
ax.set_xticks([0.0, H(van)]); ax.set_xticklabels(["no hack", f"vanilla\n{H(van):.2f}"], fontsize=8.5)
|
||||
ax.set_yticks([base, ceil]); ax.set_yticklabels([f"base\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5)
|
||||
right = H(r) > 0.3 # vanilla sits left; label into the middle
|
||||
ax.annotate(r["label"], (H(r), S(r)), textcoords="offset points",
|
||||
xytext=(12 if right else -12, 0), ha="left" if right else "right",
|
||||
va="center", fontsize=9, color=col, fontweight="bold")
|
||||
ax.set_xlim(0.74, 0.0) # reversed; clamp at no-hack (negative hack is meaningless)
|
||||
ax.set_ylim(base - 0.04, ceil + 0.012)
|
||||
ax.set_xticks([0.0, 0.6134]); ax.set_xticklabels(["no hack", "vanilla\n0.61"], fontsize=8.5)
|
||||
ax.set_yticks([base, ceil]); ax.set_yticklabels([f"base model\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5)
|
||||
ax.set_xlabel("reward-hack rate", fontsize=9.5)
|
||||
ax.set_ylabel("solve rate", fontsize=9.5)
|
||||
for s in ("top", "right"):
|
||||
@@ -247,6 +251,40 @@ def plot_scatter(df: pl.DataFrame) -> None:
|
||||
fig.savefig(OUT / f"floor_ceiling_abs.{ext}", dpi=150, bbox_inches="tight")
|
||||
|
||||
|
||||
def plot_knob(df: pl.DataFrame) -> None:
    """Draw the knob-on -> knob-off scatter and save it as ``floor_ceiling_knob.{pdf,png}``.

    For every method row of *df* (as returned by ``_methods``) the point
    ``(hack_off, solve_off)`` is drawn as a solid dot and labelled with the row's
    ``label``. When the ``(hack_on, solve_on)`` point differs from it by more than
    1e-6 on either axis, a hollow dot is drawn at the "on" point with an arrow
    running from "on" to "off". The x axis (hack rate) is reversed and clamped at 0.

    The axis labels describe the data as the val n=32 eval; rows whose on/off
    values coincide (the original note says vanilla has no knob) get only the
    solid dot.
    """
    # Hand-tuned label placement per arm as (dx, dy, horizontal-alignment) in offset
    # points: the after-dots cluster near the right edge at similar y, so labels are
    # staggered to stay inside the frame and clear of each other.
    default_offset = (12, 0, "left")
    label_offsets = {
        "routeV per-token": (-8, 13, "right"),
        "routeV random-V": (-8, -13, "right"),
        "routeV prog_wide": (12, 0, "left"),
        "routeV authored": (12, 0, "left"),
        "vanilla GRPO": (12, 0, "left"),
    }

    fig, ax = plt.subplots(figsize=(7.2, 5.0))
    ax.axvline(0.0, color=GREY, lw=0.8)

    for arm in _methods(df):
        name = arm["label"]
        colour = ARM_COLOR.get(name, GREY)
        before = (arm["hack_on"], arm["solve_on"])
        after = (arm["hack_off"], arm["solve_off"])

        # Only arms whose knob actually moved the point get the before -> after pair.
        # NOTE(review): indentation was lost in the reviewed copy; the hollow "before"
        # marker is assumed to belong to this branch together with the arrow -- confirm.
        if any(abs(b - a) > 1e-6 for b, a in zip(before, after)):
            arrow_style = {
                "arrowstyle": "-|>",
                "color": colour,
                "lw": 2.0,
                "alpha": 0.85,
                "shrinkA": 6,
                "shrinkB": 8,
            }
            ax.annotate("", xy=after, xytext=before, arrowprops=arrow_style)
            # hollow marker = before (knob on)
            ax.plot(before[0], before[1], "o", color="white", mec=colour, mew=1.8, ms=9, zorder=4)

        # solid marker = after (knob off)
        ax.plot(after[0], after[1], "o", color=colour, ms=11, zorder=5, mec="white", mew=1.2)

        dx, dy, align = label_offsets.get(name, default_offset)
        ax.annotate(
            name,
            after,
            textcoords="offset points",
            xytext=(dx, dy),
            ha=align,
            va="center",
            fontsize=9,
            color=colour,
            fontweight="bold",
        )

    ax.set_xlim(0.80, 0.0)  # reversed; clamp at no-hack
    ax.set_xticks([0.0, 0.6])
    ax.set_xticklabels(["no hack", "≈vanilla hack\n0.6"], fontsize=8.5)
    ax.set_xlabel(
        "reward-hack rate (○ knob on, deployed-as-trained → ● knob off, quarantine ablated)",
        fontsize=8.5,
    )
    ax.set_ylabel("solve rate (val n=32)", fontsize=9.5)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    fig.tight_layout()
    for ext in ("pdf", "png"):
        fig.savefig(OUT / f"floor_ceiling_knob.{ext}", dpi=150, bbox_inches="tight")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
df = build_csv()
|
||||
flags = df.filter(~pl.col("status").str.starts_with("ok"))
|
||||
@@ -257,7 +295,8 @@ def main() -> None:
|
||||
print(f" [{r['label']}] {r['status']}")
|
||||
plot(df)
|
||||
plot_scatter(df)
|
||||
print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ floor_ceiling_abs.pdf/.png scatter)")
|
||||
plot_knob(df)
|
||||
print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ _abs scatter, + _knob before/after)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user