diff --git a/out/figs/dyn_longrun_200.csv b/out/figs/dyn_longrun_200.csv
index b1d3192..f1e0c0a 100644
--- a/out/figs/dyn_longrun_200.csv
+++ b/out/figs/dyn_longrun_200.csv
@@ -332,3 +332,10 @@ vanilla,41,129,nan,nan
 vanilla,41,130,0.03,0.0
 vanilla,41,131,nan,nan
 vanilla,41,132,nan,nan
+vanilla,41,133,nan,nan
+vanilla,41,134,nan,nan
+vanilla,41,135,nan,nan
+vanilla,41,136,nan,nan
+vanilla,41,137,nan,nan
+vanilla,41,138,nan,nan
+vanilla,41,139,nan,nan
diff --git a/out/figs/dyn_longrun_200.png b/out/figs/dyn_longrun_200.png
index 99201cc..7bd07fe 100644
Binary files a/out/figs/dyn_longrun_200.png and b/out/figs/dyn_longrun_200.png differ
diff --git a/out/figs/dyn_longrun_200_hack_overlay.png b/out/figs/dyn_longrun_200_hack_overlay.png
index 57e5f0f..17d428d 100644
Binary files a/out/figs/dyn_longrun_200_hack_overlay.png and b/out/figs/dyn_longrun_200_hack_overlay.png differ
diff --git a/out/figs/dyn_longrun_200_train_deploy.png b/out/figs/dyn_longrun_200_train_deploy.png
new file mode 100644
index 0000000..06697ed
Binary files /dev/null and b/out/figs/dyn_longrun_200_train_deploy.png differ
diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py
index 00169a3..c1744ee 100644
--- a/scripts/plot_dynamics.py
+++ b/scripts/plot_dynamics.py
@@ -124,6 +124,11 @@ def parse_log(path: Path) -> dict | None:
     # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
     def _has_data(key):
         return key in run and np.isfinite(run[key]).any()
+    # Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy
+    # substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both.
+    if "hack_s" in run:
+        run["hack_train"] = run["hack_s"]
+        run["solve_train"] = run["gt_s"]
     if _has_data("hk_abl"):  # dense per-step proxy (rollout_ablate_frac>0), if present
         run["hack_s"] = run["hk_abl"]
         run["gt_s"] = run["slv_abl"]
@@ -355,6 +360,50 @@ def plot_hack_overlay(runs: list[dict], out: Path) -> None:
     logger.info(f"wrote {out}")
 
 
+def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
+    """2x2 small multiple: rows = train (knob ON) / deploy (knob OFF), cols = arm.
+    The story in one figure: vanilla train == deploy (no quarantine, the cheat is
+    in the deployed weights); route2 train HACKS while deploy is clean -- the cheat
+    is held in the deletable knob. Same red=hack/green=solve as the other figures."""
+    by_arm: dict[str, list[dict]] = defaultdict(list)
+    for r in runs:
+        by_arm[classify(r)].append(r)
+    arms = [a for a in ARM_ORDER if a in by_arm]
+    red, green = RATE_COLORS["hack_s"], RATE_COLORS["gt_s"]
+    rows = [
+        ("train (knob on)", {"hack_train": "hack", "solve_train": "solve"},
+         {"hack_train": red, "solve_train": green}),
+        ("deploy (knob off)", {"hk_dep": "hack", "slv_dep": "solve"},
+         {"hk_dep": red, "slv_dep": green}),
+    ]
+    fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.8),
+                             sharex=True, sharey=True, squeeze=False)
+    for ci, arm in enumerate(arms):
+        axes[0][ci].set_title(arm, fontsize=10)
+        for ri, (rlabel, cols, colors) in enumerate(rows):
+            ax = axes[ri][ci]
+            _series_panel(ax, by_arm[arm], cols, colors, ylim=(-0.035, 1.0),
+                          label_series=(ci == 0))
+            hk_key = next(iter(cols))
+            hk = [r[hk_key] for r in by_arm[arm] if hk_key in r]
+            if hk and np.nanmax([np.nanmax(h) for h in hk]) < 0.02:
+                ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
+                            color=red, fontsize=8, va="bottom",
+                            xytext=(0, 3), textcoords="offset points")
+            if ci == 0:
+                ax.set_ylabel(rlabel)
+            ax.spines[["top", "right"]].set_visible(False)
+            ax.tick_params(labelsize=8)
+    for ax in axes[-1]:
+        ax.set_xlabel("optimizer step")
+    fig.suptitle("Train (knob on) vs deploy (knob off): vanilla puts the cheat in "
+                 "the weights, route2 in the deletable knob (EMA-5)", fontsize=10)
+    fig.tight_layout(rect=(0, 0, 1, 0.95))
+    out.parent.mkdir(parents=True, exist_ok=True)
+    fig.savefig(out, dpi=150, bbox_inches="tight")
+    logger.info(f"wrote {out}")
+
+
 # --- cli -------------------------------------------------------------------
 
 def _gather(paths: list[str]) -> list[Path]:
@@ -406,7 +455,10 @@ def main() -> None:
     # second figure: single-panel arm-vs-arm overlay of the headline metric
     overlay = args.out.with_name(args.out.stem + "_hack_overlay.png")
     plot_hack_overlay(runs, overlay)
-    for p in (args.out, overlay):
+    # third figure: 2x2 train(knob-on) vs deploy(knob-off) x arm
+    tvd = args.out.with_name(args.out.stem + "_train_deploy.png")
+    plot_train_vs_deploy(runs, tvd)
+    for p in (args.out, overlay, tvd):
         logger.info(f"docs/figs latest -> {link_latest(p)}")
 
 