mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
plot: 2x2 train(knob-on) vs deploy(knob-off) x arm figure
The A4 framing in one figure: vanilla train==deploy (cheat in the weights), route2 train HACKS while deploy is clean (cheat in the deletable knob). parse_log now keeps the raw train series (hack_train/solve_train) before the deploy substitution. New fig: dyn_longrun_200_train_deploy.png. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -124,6 +124,11 @@ def parse_log(path: Path) -> dict | None:
|
||||
# presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
|
||||
def _has_data(key):
|
||||
return key in run and np.isfinite(run[key]).any()
|
||||
# Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy
|
||||
# substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both.
|
||||
if "hack_s" in run:
|
||||
run["hack_train"] = run["hack_s"]
|
||||
run["solve_train"] = run["gt_s"]
|
||||
if _has_data("hk_abl"): # dense per-step proxy (rollout_ablate_frac>0), if present
|
||||
run["hack_s"] = run["hk_abl"]
|
||||
run["gt_s"] = run["slv_abl"]
|
||||
@@ -355,6 +360,50 @@ def plot_hack_overlay(runs: list[dict], out: Path) -> None:
|
||||
logger.info(f"wrote {out}")
|
||||
|
||||
|
||||
def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
    """Plot train (knob ON) vs deploy (knob OFF) hack/solve rates, one column per arm.

    Small multiple: rows = train (knob ON) / deploy (knob OFF), cols = every arm
    of ARM_ORDER that occurs in ``runs`` (a 2x2 grid when two arms are present).

    The story in one figure: vanilla train == deploy (no quarantine, the cheat is
    in the deployed weights); route2 train HACKS while deploy is clean -- the cheat
    is held in the deletable knob. Same red=hack/green=solve as the other figures.

    Args:
        runs: parsed run dicts, grouped into arms via ``classify``. The train row
            plots the ``hack_train``/``solve_train`` series, the deploy row the
            ``hk_dep``/``slv_dep`` series.
        out: destination image path; missing parent directories are created.
    """
    by_arm: dict[str, list[dict]] = defaultdict(list)
    for r in runs:
        by_arm[classify(r)].append(r)
    # Keep the canonical arm order, dropping arms with no runs.
    arms = [a for a in ARM_ORDER if a in by_arm]
    red, green = RATE_COLORS["hack_s"], RATE_COLORS["gt_s"]
    # One entry per grid row: (y-label, series key -> legend label, series key -> colour).
    # The hack series is listed first in each mapping; the "hack ≡ 0" check relies on that.
    rows = [
        ("train (knob on)", {"hack_train": "hack", "solve_train": "solve"},
         {"hack_train": red, "solve_train": green}),
        ("deploy (knob off)", {"hk_dep": "hack", "slv_dep": "solve"},
         {"hk_dep": red, "slv_dep": green}),
    ]
    # squeeze=False keeps `axes` two-dimensional even with a single arm.
    fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.8),
                             sharex=True, sharey=True, squeeze=False)
    for ci, arm in enumerate(arms):
        axes[0][ci].set_title(arm, fontsize=10)
        for ri, (rlabel, cols, colors) in enumerate(rows):
            ax = axes[ri][ci]
            _series_panel(ax, by_arm[arm], cols, colors, ylim=(-0.035, 1.0),
                          label_series=(ci == 0))
            hk_key = next(iter(cols))
            # Peak hack rate per run, ignoring NaNs. Dropping NaNs up front (rather
            # than np.nanmax) avoids the "All-NaN slice" RuntimeWarning on all-NaN
            # columns and the ValueError np.nanmax raises on a zero-length series.
            peaks = []
            for r in by_arm[arm]:
                if hk_key not in r:
                    continue
                vals = np.asarray(r[hk_key], dtype=float)
                vals = vals[~np.isnan(vals)]
                if vals.size:
                    peaks.append(vals.max())
            # A flat-zero hack line is easy to miss on the axis, so call it out.
            if peaks and max(peaks) < 0.02:
                ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
                            color=red, fontsize=8, va="bottom",
                            xytext=(0, 3), textcoords="offset points")
            if ci == 0:
                ax.set_ylabel(rlabel)
            ax.spines[["top", "right"]].set_visible(False)
            ax.tick_params(labelsize=8)
    # Only the bottom row gets an x-label (x is shared across rows).
    for ax in axes[-1]:
        ax.set_xlabel("optimizer step")
    fig.suptitle("Train (knob on) vs deploy (knob off): vanilla puts the cheat in "
                 "the weights, route2 in the deletable knob (EMA-5)", fontsize=10)
    # Leave the top 5% of the figure free for the suptitle.
    fig.tight_layout(rect=(0, 0, 1, 0.95))
    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=150, bbox_inches="tight")
    logger.info(f"wrote {out}")
|
||||
|
||||
|
||||
# --- cli -------------------------------------------------------------------
|
||||
|
||||
def _gather(paths: list[str]) -> list[Path]:
|
||||
@@ -406,7 +455,10 @@ def main() -> None:
|
||||
# second figure: single-panel arm-vs-arm overlay of the headline metric
|
||||
overlay = args.out.with_name(args.out.stem + "_hack_overlay.png")
|
||||
plot_hack_overlay(runs, overlay)
|
||||
for p in (args.out, overlay):
|
||||
# third figure: 2x2 train(knob-on) vs deploy(knob-off) x arm
|
||||
tvd = args.out.with_name(args.out.stem + "_train_deploy.png")
|
||||
plot_train_vs_deploy(runs, tvd)
|
||||
for p in (args.out, overlay, tvd):
|
||||
logger.info(f"docs/figs latest -> {link_latest(p)}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user