plots: dejargon, drop redundant titles, emit png+svg+pdf, CSV re-render

Audit of all 4 plot scripts (plot_dynamics/substrate/emergence/deploy_overlay):

- One save_fig(fig, path) helper in figs.py writes png+svg+pdf (vector for the paper, png for the blog). All scripts call it.
- arm_label() map: reader-facing names only -- route2->route, drop 'knob'/'the cheat' from titles and the train-vs-deploy story (adapter on/off, reward hack).
- Titles off by default (the paper/blog caption carries it); --title re-enables for standalone research use.
- dump_data CSV now carries every plotted series; plot_dynamics --from-csv re-renders the three figures from the committed CSV with no logs (logs/ and out/runs/ are gitignored; out/figs/*.csv is tracked). Round-trip verified.
- Commit the regenerated dyn_sub4 figures in all 3 formats + the CSV.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 17:30:41 +08:00 · 2026-06-03 02:44:39 +00:00
parent 17a8792340
commit 87cca9a603
15 changed files with 8854 additions and 62 deletions
@@ -27,13 +27,18 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

+from projected_grpo.figs import save_fig
+
 # arm -> (display label, colour). Order = legend/bar order (baseline first).
+# Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask
+# routing arm is the one we report, so it is plain "route"; the failed
+# activation-mask variant is disambiguated, not version-numbered.
 ARM = {
-    "vanilla":      ("vanilla",     "#444444"),
-    "projected":    ("erase",       "#c1432b"),
-    "routing":      ("route",       "#33508c"),
-    "routing2_act": ("route2 act",  "#2f7d4f"),
-    "routing2_grad":("route2 grad", "#b8860b"),
+    "vanilla":      ("vanilla",            "#444444"),
+    "projected":    ("erase",              "#c1432b"),
+    "routing":      ("route (v1)",         "#33508c"),
+    "routing2_act": ("route (act-mask)",   "#2f7d4f"),
+    "routing2_grad":("route",              "#b8860b"),
 }
 # mode display order: in-dist first, then held-out.
 MODE_ORDER = ["run_tests", "file_marker", "stdout_marker", "sentinel", "eq_override"]
@@ -79,6 +84,8 @@ def main() -> None:
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
+    ap.add_argument("--title", action="store_true",
+                    help="draw the suptitle (off by default: the caption carries it)")
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
@@ -97,12 +104,12 @@ def main() -> None:
    _panel(a2, records, modes, arms, "deploy_solve",
           "DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="upper right")
-    n_seed = {r.get("seed") for r in records}
-    fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {sorted(n_seed)}) -- "
-                 f"quarantine deleted = shipped model", fontsize=11)
-    args.out.parent.mkdir(parents=True, exist_ok=True)
+    if args.title:
+        n_seed = {r.get("seed") for r in records}
+        fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {sorted(n_seed)}) -- "
+                     f"quarantine deleted = shipped model", fontsize=11)
    fig.tight_layout()
-    fig.savefig(args.out, dpi=140, bbox_inches="tight")
+    save_fig(fig, args.out)
    logger.info(f"wrote {args.out}  ({len(arms)} arms x {len(modes)} modes)")


@@ -42,7 +42,11 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

-from projected_grpo.figs import link_latest
+from projected_grpo.figs import link_latest, save_fig, arm_label
+
+# Figures are captioned in the paper/blog, so the suptitle just restates the
+# caption. Off by default; --title re-enables it for standalone research use.
+SHOW_TITLE = False

 # --- parse -----------------------------------------------------------------

@@ -148,6 +152,8 @@ def parse_log(path: Path) -> dict | None:


 def classify(run: dict) -> str:
+    if "arm_csv" in run:          # reconstructed from a CSV: name is already classified
+        return run["arm_csv"]
    if run["arm"] == "vanilla":
        return "vanilla"
    if run["arm"] == "routing":
@@ -236,24 +242,51 @@ def _series_panel(ax, runs, cols, colors, ylim, label_series=False):
        ax.set_ylim(*ylim)


+# Every series any of the three figures plots. Carried in the CSV so the figure
+# regenerates from the committed CSV alone (logs/ and out/runs/ are gitignored,
+# out/figs/*.csv is tracked). `arm` is the CLASSIFIED display name -- load_csv
+# short-circuits classify() on it so the round-trip is exact.
+CSV_SERIES = ["hack_s", "gt_s", "hack_train", "solve_train", "hk_dep", "slv_dep"]
+
+
 def dump_data(runs: list[dict], out: Path) -> Path:
-    """Write the plotted series to a tidy CSV next to the figure so the figure is
-    reproducible from a committed artifact -- logs/ and out/runs/ are gitignored,
-    this CSV is not (it lands in out/figs/, which is tracked)."""
+    """Write every series named in CSV_SERIES to a CSV beside the figure.
+
+    The file is `out` with its suffix replaced by `.csv`. The header is
+    `arm,seed,step` followed by the CSV_SERIES names; each run contributes one
+    row per entry in its `steps`, with `arm` being the classify() result.
+
+    Returns the path of the CSV that was written.
+    """
    csv = out.with_suffix(".csv")
-    lines = ["arm,seed,step,hack,solve"]
+    lines = ["arm,seed,step," + ",".join(CSV_SERIES)]
    for r in runs:
        arm = classify(r)
-        hk = r.get("hack_s"); sv = r.get("gt_s")
        for i, step in enumerate(r["steps"]):
-            h = hk[i] if hk is not None and i < len(hk) else float("nan")
-            s = sv[i] if sv is not None and i < len(sv) else float("nan")
-            lines.append(f"{arm},{r['seed']},{int(step)},{h},{s}")
+            # A series that is absent, None, or shorter than `steps` yields nan
+            # for that cell, so every row has the same number of columns.
+            cells = [r[k][i] if (k in r and r[k] is not None and i < len(r[k])) else float("nan")
+                     for k in CSV_SERIES]
+            lines.append(f"{arm},{r['seed']},{int(step)}," + ",".join(str(c) for c in cells))
    csv.write_text("\n".join(lines) + "\n")
    logger.info(f"wrote {csv} ({len(runs)} runs, reproducibility source)")
    return csv


+def load_csv(path: Path) -> list[dict]:
+    """Rebuild the list of run dicts from a CSV written by dump_data.
+
+    Lets the figures regenerate without the raw logs. Rows are grouped by their
+    (arm, seed) cells, one run dict per group, in first-seen order; the stored
+    arm name goes into `arm_csv`, which classify() returns verbatim.
+
+    Args:
+        path: CSV whose header names at least `arm`, `seed`, `step` and every
+            column in CSV_SERIES.
+
+    Returns:
+        One dict per (arm, seed) group with keys `arm_csv`, `seed` (left as the
+        string read from the file), `steps` (integer ndarray), each CSV_SERIES
+        name (float ndarray), plus the fixed fillers `refr`, `vhack`,
+        `teacher_off`.
+
+    Raises:
+        IndexError: the file has no non-blank lines, or a row is shorter than
+            the header.
+        KeyError: the header lacks one of the required column names.
+        ValueError: a step or series cell does not parse as a number.
+    """
+    # Plain comma split with blank lines skipped: there is no quoting or
+    # escaping, so a cell that itself contains a comma would shift the columns.
+    rows = [l.split(",") for l in path.read_text().splitlines() if l.strip()]
+    hdr, body = rows[0], rows[1:]
+    # Column name -> position, so the lookups below do not depend on column order.
+    ci = {n: i for i, n in enumerate(hdr)}
+    by_key: dict[tuple, dict] = {}
+    for row in body:
+        key = (row[ci["arm"]], row[ci["seed"]])
+        # The first row of a group creates its run dict. refr/vhack/teacher_off
+        # are constant fillers -- presumably keys that log-parsed runs carry and
+        # downstream code reads; TODO confirm against parse_log.
+        run = by_key.setdefault(key, {"arm_csv": row[ci["arm"]], "seed": row[ci["seed"]],
+                                       "refr": 0, "vhack": "-", "teacher_off": None,
+                                       "steps": [], **{k: [] for k in CSV_SERIES}})
+        run["steps"].append(int(row[ci["step"]]))
+        for k in CSV_SERIES:
+            run[k].append(float(row[ci[k]]))
+    runs = list(by_key.values())
+    for run in runs:  # match parse_log: numeric series are ndarrays, not lists
+        run["steps"] = np.array(run["steps"])
+        for k in CSV_SERIES:
+            run[k] = np.array(run[k], dtype=float)
+    return runs
+
+
 def plot(runs: list[dict], out: Path) -> None:
    by_arm: dict[str, list[dict]] = defaultdict(list)
    for r in runs:
@@ -269,7 +302,7 @@ def plot(runs: list[dict], out: Path) -> None:
        ax = axes[0][col]
        rs = by_arm[arm]
        n_seed = len({r["seed"] for r in rs})
-        ax.set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9)
+        ax.set_title(f"{arm_label(arm)}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9)
        # ylim floor slightly below 0 so a pinned-at-zero series (route2 hack) draws
        # ABOVE the axis line instead of hiding under it -- the whole result is that
        # red sits on zero, so it must be visible, not absent.
@@ -296,12 +329,14 @@ def plot(runs: list[dict], out: Path) -> None:
        ax.spines["right"].set_visible(False)
        ax.tick_params(labelsize=8)

-    fig.suptitle("Training dynamics: deployed hack vs solve by arm  "
-                 "(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
-    fig.tight_layout(rect=(0, 0, 1, 0.96))
-    out.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(out, dpi=150, bbox_inches="tight")
-    logger.info(f"wrote {out}  ({len(runs)} runs, arms={arms})")
+    if SHOW_TITLE:
+        fig.suptitle("Training dynamics: deployed hack vs solve by arm  "
+                     "(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
+        fig.tight_layout(rect=(0, 0, 1, 0.96))
+    else:
+        fig.tight_layout()
+    save_fig(fig, out)
+    logger.info(f"wrote {out}  ({len(runs)} runs, arms={[arm_label(a) for a in arms]})")


 def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset):
@@ -344,7 +379,7 @@ def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset):
        y_lab = y if not placed else max(y, placed[-1] + gap)
        placed.append(y_lab)
        arrow = dict(arrowstyle="-", color=color, lw=0.5, shrinkA=0, shrinkB=0)
-        ax.annotate(arm, xy=(x, y), xytext=(x + 1.0, y_lab), textcoords="data",
+        ax.annotate(arm_label(arm), xy=(x, y), xytext=(x + 1.0, y_lab), textcoords="data",
                    color=color, fontsize=8, va="center",
                    arrowprops=arrow if abs(y_lab - y) > 1e-3 else None)

@@ -362,33 +397,34 @@ def plot_hack_overlay(runs: list[dict], out: Path) -> None:
    _overlay_panel(ax_h, by_arm, arms, "hack_s", label="hack rate", with_onset=True)
    _overlay_panel(ax_s, by_arm, arms, "gt_s", label="solve rate", with_onset=False)
    ax_s.set_xlabel("optimizer step")
-    ax_h.set_title("Hack vs solve rate by arm  (EMA-5; dot = mean hack onset)", fontsize=10)
+    if SHOW_TITLE:
+        ax_h.set_title("Hack vs solve rate by arm  (EMA-5; dot = mean hack onset)", fontsize=10)
    fig.tight_layout()
-    out.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(out, dpi=150, bbox_inches="tight")
+    save_fig(fig, out)
    logger.info(f"wrote {out}")


 def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
-    """2x2 small multiple: rows = train (knob ON) / deploy (knob OFF), cols = arm.
-    The story in one figure: vanilla train == deploy (no quarantine, the cheat is
-    in the deployed weights); route2 train HACKS while deploy is clean -- the cheat
-    is held in the deletable knob. Same red=hack/green=solve as the other figures."""
+    """2x2 small multiple: rows = train (adapter ON) / deploy (adapter OFF), cols = arm.
+    The story in one figure: vanilla train == deploy (no quarantine, the reward
+    hack is in the deployed weights); route trains while hacking but deploys clean,
+    the hack is held in the deletable quarantine adapter. Same red=hack/green=solve
+    as the other figures."""
    by_arm: dict[str, list[dict]] = defaultdict(list)
    for r in runs:
        by_arm[classify(r)].append(r)
    arms = [a for a in ARM_ORDER if a in by_arm]
    red, green = RATE_COLORS["hack_s"], RATE_COLORS["gt_s"]
    rows = [
-        ("train (knob on)",   {"hack_train": "hack", "solve_train": "solve"},
-                              {"hack_train": red, "solve_train": green}),
-        ("deploy (knob off)", {"hk_dep": "hack", "slv_dep": "solve"},
-                              {"hk_dep": red, "slv_dep": green}),
+        ("train (adapter on)",   {"hack_train": "hack", "solve_train": "solve"},
+                                 {"hack_train": red, "solve_train": green}),
+        ("deploy (adapter off)", {"hk_dep": "hack", "slv_dep": "solve"},
+                                 {"hk_dep": red, "slv_dep": green}),
    ]
    fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.8),
                             sharex=True, sharey=True, squeeze=False)
    for ci, arm in enumerate(arms):
-        axes[0][ci].set_title(arm, fontsize=10)
+        axes[0][ci].set_title(arm_label(arm), fontsize=10)
        for ri, (rlabel, cols, colors) in enumerate(rows):
            ax = axes[ri][ci]
            _series_panel(ax, by_arm[arm], cols, colors, ylim=(-0.035, 1.0),
@@ -415,11 +451,14 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
            ax.tick_params(labelsize=8)
    for ax in axes[-1]:
        ax.set_xlabel("optimizer step")
-    fig.suptitle("Train (knob on) vs deploy (knob off): vanilla puts the cheat in "
-                 "the weights, route2 in the deletable knob  (EMA-5)", fontsize=10)
-    fig.tight_layout(rect=(0, 0, 1, 0.95))
-    out.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(out, dpi=150, bbox_inches="tight")
+    if SHOW_TITLE:
+        fig.suptitle("Train (adapter on) vs deploy (adapter off): vanilla puts the "
+                     "reward hack in the weights, route in the deletable adapter  (EMA-5)",
+                     fontsize=10)
+        fig.tight_layout(rect=(0, 0, 1, 0.95))
+    else:
+        fig.tight_layout()
+    save_fig(fig, out)
    logger.info(f"wrote {out}")


@@ -454,13 +493,24 @@ def _latest_per_arm(files: list[Path], min_steps: int) -> list[Path]:

 def main() -> None:
    ap = argparse.ArgumentParser(description=__doc__)
-    ap.add_argument("logs", nargs="+", help="log files, globs, or dirs")
+    ap.add_argument("logs", nargs="*", help="log files, globs, or dirs (omit with --from-csv)")
    ap.add_argument("--out", type=Path, default=Path("out/figs/dynamics.png"))
    ap.add_argument("--latest-per-arm", action="store_true",
                    help="keep only the newest log per arm (with >= --min-steps rows)")
    ap.add_argument("--min-steps", type=int, default=0,
                    help="drop runs shorter than this many logged steps")
+    ap.add_argument("--title", action="store_true",
+                    help="draw the suptitle (off by default: the paper/blog caption carries it)")
+    ap.add_argument("--from-csv", type=Path, default=None,
+                    help="re-render from a committed dump_data CSV instead of parsing logs")
    args = ap.parse_args()
+    global SHOW_TITLE
+    SHOW_TITLE = args.title
+    if args.from_csv:
+        runs = load_csv(args.from_csv)
+        logger.info(f"loaded {len(runs)} runs from {args.from_csv} (CSV re-render, no logs)")
+        _render_all(runs, args.out)
+        return
    files = _gather(args.logs)
    if args.latest_per_arm:
        files = _latest_per_arm(files, args.min_steps)
@@ -470,14 +520,18 @@ def main() -> None:
    for r in runs:
        logger.info(f"{classify(r):16s} seed={r['seed']} steps={len(r['steps'])} {r['vhack']}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
-    plot(runs, args.out)
-    # second figure: single-panel arm-vs-arm overlay of the headline metric
-    overlay = args.out.with_name(args.out.stem + "_hack_overlay.png")
-    plot_hack_overlay(runs, overlay)
-    # third figure: 2x2 train(knob-on) vs deploy(knob-off) x arm
-    tvd = args.out.with_name(args.out.stem + "_train_deploy.png")
-    plot_train_vs_deploy(runs, tvd)
-    for p in (args.out, overlay, tvd):
+    _render_all(runs, args.out)
+
+
+def _render_all(runs: list[dict], out: Path) -> None:
+    """Render the three dynamics figures; shared by the log-parse and --from-csv paths.
+
+    Args:
+        runs: run dicts to plot (main() passes either log-parsed runs or the
+            result of load_csv).
+        out: path of the first figure; the other two are written beside it as
+            `<stem>_hack_overlay.png` and `<stem>_train_deploy.png`.
+    """
+    out.parent.mkdir(parents=True, exist_ok=True)
+    plot(runs, out)                                              # small-multiples + CSV dump
+    overlay = out.with_name(out.stem + "_hack_overlay.png")
+    plot_hack_overlay(runs, overlay)                            # arm-vs-arm headline overlay
+    tvd = out.with_name(out.stem + "_train_deploy.png")
+    plot_train_vs_deploy(runs, tvd)                            # 2x2 train(on) vs deploy(off)
+    # link_latest runs once per figure path; its return value is only logged.
+    for p in (out, overlay, tvd):
        logger.info(f"docs/figs latest -> {link_latest(p)}")


@@ -21,9 +21,12 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

-from projected_grpo.figs import link_latest
+from projected_grpo.figs import link_latest, save_fig
 from plot_dynamics import _ema, _gather, _onset, parse_log

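+# Captioned downstream, so the title just restates the caption. Off by default.
+# NOTE(review): the hunks shown for this script add no --title flag and never
+# assign SHOW_TITLE -- confirm main() wires it up, else the title cannot be re-enabled.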
+SHOW_TITLE = False
+
 # Distinct hue per mode (qualitative). solve drawn dashed in the same hue so each
 # mode's hack/solve pair reads together.
 MODE_COLORS = {
@@ -71,11 +74,11 @@ def plot(runs_by_mode: dict[str, list[dict]], out: Path) -> None:
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.tick_params(labelsize=8)
-    ax.set_title("Loophole emergence under vanilla GRPO  "
-                 "(solid=hack/exploited, dashed=solve/gt_correct; EMA-5)", fontsize=9)
+    if SHOW_TITLE:
+        ax.set_title("Loophole emergence under vanilla GRPO  "
+                     "(solid=hack/exploited, dashed=solve/gt_correct; EMA-5)", fontsize=9)
    fig.tight_layout()
-    out.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(out, dpi=150, bbox_inches="tight")
+    save_fig(fig, out)
    logger.info(f"wrote {out}  (modes={modes})")


@@ -39,6 +39,8 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

+from projected_grpo.figs import save_fig
+
 # hk_ column header -> (display mode, colour). Order = panel/legend order.
 # Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels.
 HK = {
@@ -172,9 +174,8 @@ def plot_by_method(runs, ylabel, cumulative, span, out: Path):
                    fontsize=6.5, va="top", color="#888")
    axes[0].set_ylabel(ylabel)
    axes[0].set_ylim(-0.02, None)
-    out.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
-    fig.savefig(out, dpi=140, bbox_inches="tight")
+    save_fig(fig, out)
    logger.info(f"wrote {out}  (by-method, {len(methods)} methods)")


@@ -205,9 +206,8 @@ def plot_by_hack(runs, ylabel, cumulative, span, out: Path):
        _despine(ax)
    axes[0].set_ylabel(ylabel)
    axes[0].set_ylim(-0.02, None)
-    out.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
-    fig.savefig(out, dpi=140, bbox_inches="tight")
+    save_fig(fig, out)
    logger.info(f"wrote {out}  (by-hack, {len(modes)} modes)")