plots: dejargon, drop redundant titles, emit png+svg+pdf, CSV re-render

Audit of all 4 plot scripts (plot_dynamics/substrate/emergence/deploy_overlay):
- One save_fig(fig, path) helper in figs.py writes png+svg+pdf (vector for the
  paper, png for the blog). All scripts call it.
- arm_label() map: reader-facing names only -- route2->route; drop 'knob'/'the
  cheat' from titles and the train-vs-deploy story (now 'adapter on/off' and
  'reward hack').
- Titles off by default (the paper/blog caption carries it); --title re-enables
  for standalone research use.
- dump_data CSV now carries every plotted series; plot_dynamics --from-csv
  re-renders the three figures from the committed CSV with no logs (logs/ and
  out/runs/ are gitignored; out/figs/*.csv is tracked). Round-trip verified.
- Commit the regenerated dyn_sub4 figures in all 3 formats + the CSV.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-03 02:44:39 +00:00
parent 17a8792340
commit 87cca9a603
15 changed files with 8854 additions and 62 deletions
+17 -10
View File
@@ -27,13 +27,18 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import save_fig
# arm -> (display label, colour). Order = legend/bar order (baseline first).
# Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask
# routing arm is the one we report, so it is plain "route"; the failed
# activation-mask variant is disambiguated, not version-numbered.
ARM = {
    # Each internal arm tag is listed exactly once: a repeated key in a dict
    # literal is legal Python but silently keeps only the last value, which
    # hides the superseded label from anyone reading the table.
    "vanilla": ("vanilla", "#444444"),
    "projected": ("erase", "#c1432b"),
    "routing": ("route (v1)", "#33508c"),
    "routing2_act": ("route (act-mask)", "#2f7d4f"),
    "routing2_grad": ("route", "#b8860b"),
}
# mode display order: in-dist first, then held-out.
MODE_ORDER = ["run_tests", "file_marker", "stdout_marker", "sentinel", "eq_override"]
@@ -79,6 +84,8 @@ def main() -> None:
ap.add_argument("jsons", nargs="*", type=Path,
help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
ap.add_argument("--title", action="store_true",
help="draw the suptitle (off by default: the caption carries it)")
args = ap.parse_args()
paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
@@ -97,12 +104,12 @@ def main() -> None:
_panel(a2, records, modes, arms, "deploy_solve",
"DEPLOY solve rate by mode (higher = better)", "deploy solve rate")
a1.legend(fontsize=8, frameon=False, loc="upper right")
n_seed = {r.get("seed") for r in records}
fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {sorted(n_seed)}) -- "
f"quarantine deleted = shipped model", fontsize=11)
args.out.parent.mkdir(parents=True, exist_ok=True)
if args.title:
n_seed = {r.get("seed") for r in records}
fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {sorted(n_seed)}) -- "
f"quarantine deleted = shipped model", fontsize=11)
fig.tight_layout()
fig.savefig(args.out, dpi=140, bbox_inches="tight")
save_fig(fig, args.out)
logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)")
+97 -43
View File
@@ -42,7 +42,11 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import link_latest
from projected_grpo.figs import link_latest, save_fig, arm_label
# Figures are captioned in the paper/blog, so the suptitle just restates the
# caption. Off by default; --title re-enables it for standalone research use.
SHOW_TITLE = False
# --- parse -----------------------------------------------------------------
@@ -148,6 +152,8 @@ def parse_log(path: Path) -> dict | None:
def classify(run: dict) -> str:
if "arm_csv" in run: # reconstructed from a CSV: name is already classified
return run["arm_csv"]
if run["arm"] == "vanilla":
return "vanilla"
if run["arm"] == "routing":
@@ -236,24 +242,51 @@ def _series_panel(ax, runs, cols, colors, ylim, label_series=False):
ax.set_ylim(*ylim)
# Every series any of the three figures plots. Carried in the CSV so the figure
# regenerates from the committed CSV alone (logs/ and out/runs/ are gitignored,
# out/figs/*.csv is tracked). `arm` is the CLASSIFIED display name -- load_csv
# short-circuits classify() on it so the round-trip is exact.
CSV_SERIES = ["hack_s", "gt_s", "hack_train", "solve_train", "hk_dep", "slv_dep"]
def dump_data(runs: list[dict], out: Path) -> Path:
    """Write every plotted series to a tidy CSV next to the figure.

    The CSV makes the figure reproducible from a committed artifact: logs/ and
    out/runs/ are gitignored, this CSV is not (it lands in out/figs/, which is
    tracked).

    Args:
        runs: run dicts; each needs ``steps`` and ``seed`` and may carry any of
            the ``CSV_SERIES`` keys.
        out: figure path; the CSV is written to the same path with a ``.csv``
            suffix.

    Returns:
        The path of the CSV that was written.
    """
    csv_path = out.with_suffix(".csv")
    # One header and one row per (run, step). The column set is CSV_SERIES so
    # the header and the cells can never drift apart.
    lines = ["arm,seed,step," + ",".join(CSV_SERIES)]
    nan = float("nan")
    for r in runs:
        arm = classify(r)
        for i, step in enumerate(r["steps"]):
            cells = []
            for k in CSV_SERIES:
                series = r.get(k)
                # A series that is absent, None, or shorter than `steps` is
                # written as nan so every row keeps the full column count.
                present = series is not None and i < len(series)
                cells.append(series[i] if present else nan)
            lines.append(f"{arm},{r['seed']},{int(step)}," + ",".join(str(c) for c in cells))
    csv_path.write_text("\n".join(lines) + "\n")
    logger.info(f"wrote {csv_path} ({len(runs)} runs, reproducibility source)")
    return csv_path
def load_csv(path: Path) -> list[dict]:
    """Reconstruct the runs list from a dump_data CSV.

    Lets the figures regenerate without the raw logs. Rows are grouped by
    (arm, seed) in first-seen order. Each run stores the arm under ``arm_csv``
    so classify() returns the stored display name verbatim.

    Args:
        path: CSV with an ``arm,seed,step`` header followed by one column per
            ``CSV_SERIES`` entry.

    Returns:
        One dict per (arm, seed) pair. ``steps`` is an integer ndarray and each
        ``CSV_SERIES`` key is a float ndarray; ``seed`` stays the string read
        from the file. A header-only file yields an empty list.

    Raises:
        ValueError: if the file has no header row, or the header lacks any of
            ``arm``, ``seed``, ``step`` or a ``CSV_SERIES`` column.
    """
    rows = [line.split(",") for line in path.read_text().splitlines() if line.strip()]
    if not rows:
        raise ValueError(f"{path}: empty CSV, expected a dump_data header row")
    header, body = rows[0], rows[1:]
    col = {name: i for i, name in enumerate(header)}
    # Fail up front with the offending names rather than a bare KeyError from
    # inside the row loop (e.g. a CSV written with an older column set).
    missing = [name for name in ("arm", "seed", "step", *CSV_SERIES) if name not in col]
    if missing:
        raise ValueError(f"{path}: CSV header is missing column(s) {missing}")

    # Resolve column positions once instead of per row.
    arm_i, seed_i, step_i = col["arm"], col["seed"], col["step"]
    series_cols = [(k, col[k]) for k in CSV_SERIES]

    by_key: dict[tuple, dict] = {}
    for row in body:
        arm, seed = row[arm_i], row[seed_i]
        run = by_key.get((arm, seed))
        if run is None:
            # refr / vhack / teacher_off are fixed placeholders; presumably they
            # stand in for fields parse_log fills from a log -- TODO confirm.
            run = by_key[(arm, seed)] = {
                "arm_csv": arm, "seed": seed,
                "refr": 0, "vhack": "-", "teacher_off": None,
                "steps": [], **{k: [] for k in CSV_SERIES},
            }
        run["steps"].append(int(row[step_i]))
        for k, i in series_cols:
            run[k].append(float(row[i]))

    runs = list(by_key.values())
    for run in runs:  # match parse_log: numeric series are ndarrays, not lists
        run["steps"] = np.array(run["steps"])
        for k in CSV_SERIES:
            run[k] = np.array(run[k], dtype=float)
    return runs
def plot(runs: list[dict], out: Path) -> None:
by_arm: dict[str, list[dict]] = defaultdict(list)
for r in runs:
@@ -269,7 +302,7 @@ def plot(runs: list[dict], out: Path) -> None:
ax = axes[0][col]
rs = by_arm[arm]
n_seed = len({r["seed"] for r in rs})
ax.set_title(f"{arm}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9)
ax.set_title(f"{arm_label(arm)}\n(n={n_seed} seed{'s' if n_seed > 1 else ''})", fontsize=9)
# ylim floor slightly below 0 so a pinned-at-zero series (route2 hack) draws
# ABOVE the axis line instead of hiding under it -- the whole result is that
# red sits on zero, so it must be visible, not absent.
@@ -296,12 +329,14 @@ def plot(runs: list[dict], out: Path) -> None:
ax.spines["right"].set_visible(False)
ax.tick_params(labelsize=8)
fig.suptitle("Training dynamics: deployed hack vs solve by arm "
"(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
fig.tight_layout(rect=(0, 0, 1, 0.96))
out.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out, dpi=150, bbox_inches="tight")
logger.info(f"wrote {out} ({len(runs)} runs, arms={arms})")
if SHOW_TITLE:
fig.suptitle("Training dynamics: deployed hack vs solve by arm "
"(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
fig.tight_layout(rect=(0, 0, 1, 0.96))
else:
fig.tight_layout()
save_fig(fig, out)
logger.info(f"wrote {out} ({len(runs)} runs, arms={[arm_label(a) for a in arms]})")
def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset):
@@ -344,7 +379,7 @@ def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset):
y_lab = y if not placed else max(y, placed[-1] + gap)
placed.append(y_lab)
arrow = dict(arrowstyle="-", color=color, lw=0.5, shrinkA=0, shrinkB=0)
ax.annotate(arm, xy=(x, y), xytext=(x + 1.0, y_lab), textcoords="data",
ax.annotate(arm_label(arm), xy=(x, y), xytext=(x + 1.0, y_lab), textcoords="data",
color=color, fontsize=8, va="center",
arrowprops=arrow if abs(y_lab - y) > 1e-3 else None)
@@ -362,33 +397,34 @@ def plot_hack_overlay(runs: list[dict], out: Path) -> None:
_overlay_panel(ax_h, by_arm, arms, "hack_s", label="hack rate", with_onset=True)
_overlay_panel(ax_s, by_arm, arms, "gt_s", label="solve rate", with_onset=False)
ax_s.set_xlabel("optimizer step")
ax_h.set_title("Hack vs solve rate by arm (EMA-5; dot = mean hack onset)", fontsize=10)
if SHOW_TITLE:
ax_h.set_title("Hack vs solve rate by arm (EMA-5; dot = mean hack onset)", fontsize=10)
fig.tight_layout()
out.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out, dpi=150, bbox_inches="tight")
save_fig(fig, out)
logger.info(f"wrote {out}")
def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
"""2x2 small multiple: rows = train (knob ON) / deploy (knob OFF), cols = arm.
The story in one figure: vanilla train == deploy (no quarantine, the cheat is
in the deployed weights); route2 train HACKS while deploy is clean -- the cheat
is held in the deletable knob. Same red=hack/green=solve as the other figures."""
"""2x2 small multiple: rows = train (adapter ON) / deploy (adapter OFF), cols = arm.
The story in one figure: vanilla train == deploy (no quarantine, the reward
hack is in the deployed weights); route trains while hacking but deploys clean,
the hack is held in the deletable quarantine adapter. Same red=hack/green=solve
as the other figures."""
by_arm: dict[str, list[dict]] = defaultdict(list)
for r in runs:
by_arm[classify(r)].append(r)
arms = [a for a in ARM_ORDER if a in by_arm]
red, green = RATE_COLORS["hack_s"], RATE_COLORS["gt_s"]
rows = [
("train (knob on)", {"hack_train": "hack", "solve_train": "solve"},
{"hack_train": red, "solve_train": green}),
("deploy (knob off)", {"hk_dep": "hack", "slv_dep": "solve"},
{"hk_dep": red, "slv_dep": green}),
("train (adapter on)", {"hack_train": "hack", "solve_train": "solve"},
{"hack_train": red, "solve_train": green}),
("deploy (adapter off)", {"hk_dep": "hack", "slv_dep": "solve"},
{"hk_dep": red, "slv_dep": green}),
]
fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.8),
sharex=True, sharey=True, squeeze=False)
for ci, arm in enumerate(arms):
axes[0][ci].set_title(arm, fontsize=10)
axes[0][ci].set_title(arm_label(arm), fontsize=10)
for ri, (rlabel, cols, colors) in enumerate(rows):
ax = axes[ri][ci]
_series_panel(ax, by_arm[arm], cols, colors, ylim=(-0.035, 1.0),
@@ -415,11 +451,14 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
ax.tick_params(labelsize=8)
for ax in axes[-1]:
ax.set_xlabel("optimizer step")
fig.suptitle("Train (knob on) vs deploy (knob off): vanilla puts the cheat in "
"the weights, route2 in the deletable knob (EMA-5)", fontsize=10)
fig.tight_layout(rect=(0, 0, 1, 0.95))
out.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out, dpi=150, bbox_inches="tight")
if SHOW_TITLE:
fig.suptitle("Train (adapter on) vs deploy (adapter off): vanilla puts the "
"reward hack in the weights, route in the deletable adapter (EMA-5)",
fontsize=10)
fig.tight_layout(rect=(0, 0, 1, 0.95))
else:
fig.tight_layout()
save_fig(fig, out)
logger.info(f"wrote {out}")
@@ -454,13 +493,24 @@ def _latest_per_arm(files: list[Path], min_steps: int) -> list[Path]:
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument("logs", nargs="+", help="log files, globs, or dirs")
ap.add_argument("logs", nargs="*", help="log files, globs, or dirs (omit with --from-csv)")
ap.add_argument("--out", type=Path, default=Path("out/figs/dynamics.png"))
ap.add_argument("--latest-per-arm", action="store_true",
help="keep only the newest log per arm (with >= --min-steps rows)")
ap.add_argument("--min-steps", type=int, default=0,
help="drop runs shorter than this many logged steps")
ap.add_argument("--title", action="store_true",
help="draw the suptitle (off by default: the paper/blog caption carries it)")
ap.add_argument("--from-csv", type=Path, default=None,
help="re-render from a committed dump_data CSV instead of parsing logs")
args = ap.parse_args()
global SHOW_TITLE
SHOW_TITLE = args.title
if args.from_csv:
runs = load_csv(args.from_csv)
logger.info(f"loaded {len(runs)} runs from {args.from_csv} (CSV re-render, no logs)")
_render_all(runs, args.out)
return
files = _gather(args.logs)
if args.latest_per_arm:
files = _latest_per_arm(files, args.min_steps)
@@ -470,14 +520,18 @@ def main() -> None:
for r in runs:
logger.info(f"{classify(r):16s} seed={r['seed']} steps={len(r['steps'])} {r['vhack']}")
args.out.parent.mkdir(parents=True, exist_ok=True)
plot(runs, args.out)
# second figure: single-panel arm-vs-arm overlay of the headline metric
overlay = args.out.with_name(args.out.stem + "_hack_overlay.png")
plot_hack_overlay(runs, overlay)
# third figure: 2x2 train(knob-on) vs deploy(knob-off) x arm
tvd = args.out.with_name(args.out.stem + "_train_deploy.png")
plot_train_vs_deploy(runs, tvd)
for p in (args.out, overlay, tvd):
_render_all(runs, args.out)
def _render_all(runs: list[dict], out: Path) -> None:
    """Render the three dynamics figures for ``runs``.

    Shared by the log-parse and --from-csv paths. ``out`` names the first
    figure; the other two are written beside it as ``<stem>_hack_overlay.png``
    and ``<stem>_train_deploy.png``. Each figure path is then passed to
    link_latest and the result is logged.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    overlay = out.with_name(f"{out.stem}_hack_overlay.png")
    tvd = out.with_name(f"{out.stem}_train_deploy.png")
    # (renderer, target) pairs, drawn in this order.
    figures = (
        (plot, out),                  # small-multiples
        (plot_hack_overlay, overlay),  # arm-vs-arm headline overlay
        (plot_train_vs_deploy, tvd),   # 2x2 train(on) vs deploy(off)
    )
    for render, target in figures:
        render(runs, target)
    for _, target in figures:
        logger.info(f"docs/figs latest -> {link_latest(target)}")
+8 -5
View File
@@ -21,9 +21,12 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import link_latest
from projected_grpo.figs import link_latest, save_fig
from plot_dynamics import _ema, _gather, _onset, parse_log
# Captioned downstream, so the title just restates the caption. --title re-enables.
SHOW_TITLE = False
# Distinct hue per mode (qualitative). solve drawn dashed in the same hue so each
# mode's hack/solve pair reads together.
MODE_COLORS = {
@@ -71,11 +74,11 @@ def plot(runs_by_mode: dict[str, list[dict]], out: Path) -> None:
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.tick_params(labelsize=8)
ax.set_title("Loophole emergence under vanilla GRPO "
"(solid=hack/exploited, dashed=solve/gt_correct; EMA-5)", fontsize=9)
if SHOW_TITLE:
ax.set_title("Loophole emergence under vanilla GRPO "
"(solid=hack/exploited, dashed=solve/gt_correct; EMA-5)", fontsize=9)
fig.tight_layout()
out.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out, dpi=150, bbox_inches="tight")
save_fig(fig, out)
logger.info(f"wrote {out} (modes={modes})")
+4 -4
View File
@@ -39,6 +39,8 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import save_fig
# hk_ column header -> (display mode, colour). Order = panel/legend order.
# Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels.
HK = {
@@ -172,9 +174,8 @@ def plot_by_method(runs, ylabel, cumulative, span, out: Path):
fontsize=6.5, va="top", color="#888")
axes[0].set_ylabel(ylabel)
axes[0].set_ylim(-0.02, None)
out.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out, dpi=140, bbox_inches="tight")
save_fig(fig, out)
logger.info(f"wrote {out} (by-method, {len(methods)} methods)")
@@ -205,9 +206,8 @@ def plot_by_hack(runs, ylabel, cumulative, span, out: Path):
_despine(ax)
axes[0].set_ylabel(ylabel)
axes[0].set_ylim(-0.02, None)
out.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out, dpi=140, bbox_inches="tight")
save_fig(fig, out)
logger.info(f"wrote {out} (by-hack, {len(modes)} modes)")