mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
plot: deploy Pareto draws knob-on->off before/after on the n=119 axis
Now that the final/rescore evals record deploy_hack_on/solve_on at n=119, the deploy scatter shows the honest quarantine move (hollow knob-on dot -> arrow -> solid knob-off dot) on the same axis instead of borrowing val's lower-scale curve. Arms that are not yet backfilled fall back to dot-only. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -67,6 +67,10 @@ def _l5(rows: list[dict], k: str) -> float:
|
||||
return sum(v) / len(v)
|
||||
|
||||
|
||||
def _r4(x):
|
||||
return None if x is None else round(x, 4)
|
||||
|
||||
|
||||
# ── stage 1: build the inspectable csv ──────────────────────────────────────
|
||||
def build_csv() -> pl.DataFrame:
|
||||
rows = []
|
||||
@@ -77,6 +81,9 @@ def build_csv() -> pl.DataFrame:
|
||||
rows.append(dict(
|
||||
label=label, kind="method",
|
||||
hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4),
|
||||
# knob-ON deploy (deployed-as-trained) on the SAME n=119 set -- None until backfilled
|
||||
# (rescore_deploy.py) so the deploy before->after is honest, not borrowed from val.
|
||||
hack_deploy_on=_r4(dep.get("deploy_hack_on")), solve_deploy_on=_r4(dep.get("deploy_solve_on")),
|
||||
hack_on=round(_l5(ev, "train_hack"), 4), hack_off=round(_l5(ev, "deploy_hack"), 4),
|
||||
solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4),
|
||||
source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status))
|
||||
@@ -84,6 +91,7 @@ def build_csv() -> pl.DataFrame:
|
||||
base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text())
|
||||
rows.append(dict(label="base (floor)", kind="anchor_floor",
|
||||
hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4),
|
||||
hack_deploy_on=None, solve_deploy_on=None,
|
||||
hack_on=None, hack_off=None, solve_on=None, solve_off=None,
|
||||
source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)"))
|
||||
|
||||
@@ -96,6 +104,7 @@ def build_csv() -> pl.DataFrame:
|
||||
source = "Ariahw et al. 2025 (paper), NOT our run"
|
||||
rows.append(dict(label="ceiling", kind="anchor_ceiling",
|
||||
hack_deploy=0.0, solve_deploy=ceil_solve,
|
||||
hack_deploy_on=None, solve_deploy_on=None,
|
||||
hack_on=None, hack_off=None, solve_on=None, solve_off=None,
|
||||
source=source, status=status))
|
||||
|
||||
@@ -196,11 +205,12 @@ def plot(df: pl.DataFrame) -> None:
|
||||
# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving), marked
|
||||
# "ideal". The achievable solve band (base..ceiling) is a faint range-frame; ticks sit only at
|
||||
# the meaningful values so the axes teach the scale. Two views:
|
||||
# plot_scatter -> DEPLOY (knob-off, test n=119): where each arm LANDS. Pareto of arms.
|
||||
# plot_knob -> the quarantine before/after (knob-on -> knob-off, val n=32): per arm, a
|
||||
# hollow "before" dot (deployed-as-trained, hacky) -> solid "after" dot.
|
||||
# They use DIFFERENT eval sets on purpose: deploy n=119 only measures knob-off, so before/after
|
||||
# can only come from the val on/off curve -- never share one y-axis (val solve ~2x lower).
|
||||
# plot_scatter -> DEPLOY (test n=119): solid dot = knob-off (where each arm lands = the Pareto);
|
||||
# when the run carries knob-on on the SAME n=119 set, a hollow before-dot ->
|
||||
# arrow -> solid after-dot shows the quarantine move on the deploy axis.
|
||||
# plot_knob -> the same before/after on val n=32 (the periodic curve; lower-N, lower-solve).
|
||||
# Prefer the deploy view now that both endpoints exist there; plot_knob remains as the val cross-
|
||||
# check (val solve runs ~2x lower, so the two panels never share a y-axis).
|
||||
GREEN_ARROW = "#1e8449"
|
||||
BLUE = "#3b5bdb"
|
||||
# one colour per arm; GOLD=best real-V, DARK=random control, RED=no-intervention baseline.
|
||||
@@ -227,13 +237,19 @@ def plot_scatter(df: pl.DataFrame) -> None:
|
||||
ax.plot(0.012, ceil, marker="*", ms=15, color=BLUE, zorder=6, clip_on=False)
|
||||
ax.annotate("ideal", (0.012, ceil), textcoords="offset points", xytext=(-8, 2),
|
||||
ha="right", va="center", fontsize=9, color=BLUE, style="italic")
|
||||
# Deploy (knob-off, n=119) is where each arm LANDS -> a pure Pareto of dots. No before->after
|
||||
# arrows here: the honest knob-on->off move changes BOTH hack and solve, but knob-on is only
|
||||
# measured at val (n=32), so drawing it against the deploy y-axis would fake a solve jump that
|
||||
# is really the eval-set shift. The real 2-D before->after lives in plot_knob (val on/off).
|
||||
# Deploy: solid dot = knob-OFF (quarantine ablated), where each arm LANDS = the Pareto.
|
||||
# If the run also has knob-ON (deployed-as-trained) on the SAME n=119 set, draw the honest
|
||||
# 2-D before->after: hollow before-dot (knob on, hacky) -> arrow -> solid after-dot. Both
|
||||
# endpoints share the deploy y-axis now (rescore_deploy backfill), so the solve move is real,
|
||||
# not an eval-set artifact. Arms without the backfill fall back to dot-only.
|
||||
for r in _methods(df):
|
||||
col = ARM_COLOR.get(r["label"], GREY)
|
||||
ax.plot(H(r), S(r), "o", color=col, ms=11, zorder=5, mec="white", mew=1.2)
|
||||
hon, son = r["hack_deploy_on"], r["solve_deploy_on"]
|
||||
if hon is not None and (abs(hon - H(r)) > 1e-6 or abs(son - S(r)) > 1e-6):
|
||||
ax.annotate("", xy=(H(r), S(r)), xytext=(hon, son),
|
||||
arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8))
|
||||
ax.plot(hon, son, "o", color="white", mec=col, mew=1.8, ms=9, zorder=4) # hollow = knob on
|
||||
ax.plot(H(r), S(r), "o", color=col, ms=11, zorder=5, mec="white", mew=1.2) # solid = knob off
|
||||
right = H(r) > 0.3 # vanilla sits left; label into the middle
|
||||
ax.annotate(r["label"], (H(r), S(r)), textcoords="offset points",
|
||||
xytext=(12 if right else -12, 0), ha="left" if right else "right",
|
||||
|
||||
Reference in New Issue
Block a user