diff --git a/justfile b/justfile index 018523d..db11286 100644 --- a/justfile +++ b/justfile @@ -150,6 +150,12 @@ run-substrate INTERV="none" SEED="41" STEPS="60" REFRESH="5": --vhack-refresh-every={{ REFRESH }} \ --seed={{ SEED }} --steps={{ STEPS }} --out-tag=_sub4_{{ INTERV }}_rf{{ REFRESH }}_s{{ SEED }} +# CANONICAL plotting entrypoint for the substrate sweep. One command, four figures +# (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay, +# the latter two delegated to plot_dynamics). Default glob = all 4-mode sub4 logs. +plot GLOB='logs/*_sub4_*.log' STEM='out/figs/substrate': + uv run python scripts/plot_substrate.py {{ GLOB }} --out-stem {{ STEM }} + # Regenerate both dynamics plots from the cell logs (default: all cells; pass a # narrower glob like 'logs/*_cell_*_s41.log' for the seed-41-only checkpoint). regen-dynamics GLOB='logs/*_cell_*.log': diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 9ee9ab1..7e4dbf3 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -51,7 +51,9 @@ from projected_grpo.figs import link_latest # Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264". RATE_COLS = {"hack_s": "hack", "gt_s": "solve"} -COS_COLS = {"cos_pre_t": "teacher", "cos_pre_s": "student"} +# Current streaming-table display headers (StepLogger _Col.header): the live-grad +# v_hack alignment prints as cin_t/cin_s, the route deploy-eval as hk_dep/slv_dep. +COS_COLS = {"cin_t": "teacher", "cin_s": "student"} _HDR_TOK = re.compile(r"[A-Za-z_]+") # strip ↑↓? decorations: "hack_s?" -> "hack_s" @@ -96,7 +98,7 @@ def parse_log(path: Path) -> dict | None: # Also parse the route DEPLOY-eval columns when present (non-route logs lack # them -> skip). For routing we plot THESE (deployed model = quarantine deleted), # not the training-time hack_s. 
- deploy = {"hack_deploy", "solve_deploy"} & set(idx) + deploy = {"hk_dep", "slv_dep"} & set(idx) wanted = {**RATE_COLS, **COS_COLS, **{c: c for c in deploy}} for line in txt.splitlines(): if "| INFO |" not in line: @@ -115,9 +117,9 @@ def parse_log(path: Path) -> dict | None: # forward still hacks); routing's benefit only shows on the DEPLOYED model # (quarantine knob deleted). So for routing, plot the deploy series under the # hack_s/gt_s keys -> all downstream (panels, onset, overlay) reads it. - if arm == "routing" and "hack_deploy" in run: - run["hack_s"] = run["hack_deploy"] - run["gt_s"] = run["solve_deploy"] + if arm == "routing" and "hk_dep" in run: + run["hack_s"] = run["hk_dep"] + run["gt_s"] = run["slv_dep"] return run @@ -137,7 +139,7 @@ ARM_ORDER = ["vanilla", "static erasure", "online erasure", "routing"] # must not share a palette (hack != teacher-cos). Row 0: red hack vs green # solve. Row 1: blue teacher-cos vs amber student-cos. RATE_COLORS = {"hack_s": "#c1432b", "gt_s": "#2f7d4f"} -COS_COLORS = {"cos_pre_t": "#33508c", "cos_pre_s": "#c98a2b"} +COS_COLORS = {"cin_t": "#33508c", "cin_s": "#c98a2b"} # Arm colours for the single-panel hack overlay (arms, not series): grey vanilla # baseline -> amber static -> blue online, ordered by increasing intervention. # TODO(color): make this a quality-ordered red->green ramp instead of fixed diff --git a/scripts/plot_substrate.py b/scripts/plot_substrate.py index 3fc782f..061381a 100644 --- a/scripts/plot_substrate.py +++ b/scripts/plot_substrate.py @@ -1,18 +1,33 @@ -"""Multi-loophole substrate emergence plot (#148): how many of the K classes does a -vanilla-GRPO student learn, and in what order? +"""Multi-loophole substrate per-mode dynamics (#137/#148): how much of each loophole +does each intervention let the student learn, and how fast? -One line per loophole mode. y = cumulative student hack rate of THAT mode -(hk_ = hacks / rollouts-of-that-mode-seen-so-far, parsed from the per-step -training table). 
A class "emerges" when its line lifts off 0; a flat-at-0 line -(eq_override) is a loophole the student never learned. first_step onset is dotted. +The substrate run interleaves all K modes in ONE log via the hk_<mode> columns +(cumulative student hacks / rollouts-of-that-mode-seen). We parse those, take the +per-step *instantaneous* rate (cumulative diffs), and EMA-smooth it -- the +instantaneous rate is what shows a method SUPPRESSING a mode over time, which the +monotone cumulative curve hides. Pass --cumulative for the raw running rate. -Unlike plot_emergence.py (one --env-mode per log, Phase-1 single-mode runs), the -substrate run interleaves all K modes in ONE log via the hk_ columns, so we -parse those columns directly rather than grouping logs by env_mode. +Two core layouts (both emitted by default): + by-method : one panel per intervention (vanilla / erase / route); one coloured + line per hack type. Reads "how many of K classes does THIS method let through". + by-hack : one panel per hack type; one line per method (mean over seeds, thin + per-seed). Reads "for THIS loophole, which method suppresses it best". + +Route caveat (load-bearing): hk_<mode> is the TRAINING-time rate; the routed forward +still hacks during training; the deployed model (quarantine knob deleted) gives the real +number. The log has aggregate hack_deploy (hk_dep column) but NOT per-mode deploy, so route's per-mode +curve is drawn DASHED and overstates route. TODO: log per-mode deploy in train.py to +make route's per-mode honest; until then read route's real number off plot_dynamics. + +This is the single plotting ENTRYPOINT (`just plot`): it emits the per-mode cut +(by-method, by-hack) AND delegates the aggregate "total hacks per arm" + cos-alignment +figures to plot_dynamics.plot/plot_hack_overlay (reuse, not reimplement). plot_dynamics +owns route's deploy-curve substitution and the cos rows; this script owns parse_hk. 
Usage: - uv run python scripts/plot_substrate.py logs/*_substrate3_none_s41.log - uv run python scripts/plot_substrate.py logs/.log --out out/figs/substrate_emergence.png + uv run python scripts/plot_substrate.py logs/*_sub4_*.log # both layouts -> out/figs/ + uv run python scripts/plot_substrate.py A.log B.log --out-stem out/figs/sub4 + uv run python scripts/plot_substrate.py .log --cumulative --ema-span 6 """ from __future__ import annotations @@ -24,32 +39,42 @@ import matplotlib.pyplot as plt import numpy as np from loguru import logger -# hk_ column header -> (display mode, colour). Order = legend/label order. +# hk_ column header -> (display mode, colour). Order = panel/legend order. +# Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels. HK = { - "hk_fm": ("file_marker", "#7b3294"), "hk_rt": ("run_tests", "#c1432b"), + "hk_fm": ("file_marker", "#7b3294"), "hk_so": ("stdout_marker", "#b8860b"), "hk_se": ("sentinel", "#2f7d4f"), "hk_eq": ("eq_override", "#33508c"), } +# method -> (display label, colour, dashed?). dashed = per-mode curve is train-time +# only (route: the routed forward still hacks; deploy is lower and not logged per-mode). +METHODS = { + "vanilla": ("vanilla", "#444444", False), + "erase": ("erase", "#c1432b", False), + "route": ("route (train-time)", "#33508c", True), +} _HDR_TOK = re.compile(r"[A-Za-z_]+") # "hack_s?" 
-> "hack_s" -def _rate(tok: str) -> float: - """cumulative N/M cell -> N/M (nan if denom 0).""" - a, b = tok.split("/") - return int(a) / int(b) if int(b) else np.nan +def classify(txt: str) -> str: + """vanilla / erase / route from the preset `arm=` line (covers --intervention logs).""" + preset = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "") + arm = (re.search(r"\barm=(\w+)", preset) or [None, "vanilla"])[1] + return {"vanilla": "vanilla", "projected": "erase", "routing": "route"}[arm] def parse_hk(path: Path) -> dict: - """Return {steps: int[], hk_rt: float[], ...} from a substrate run log.""" + """{method, seed, steps, : (n[], d[])} from a substrate run log.""" txt = path.read_text(errors="replace") hdr = next((l for l in txt.splitlines() if "ref_eq" in l and "hk_rt" in l), None) if hdr is None: raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?") names = [_HDR_TOK.match(t).group(0) for t in hdr.split("| INFO |", 1)[1].split()] idx = {n: i for i, n in enumerate(names)} - steps, cols = [], {k: [] for k in HK} + present = [k for k in HK if k in idx] # 4-mode substrate dropped hk_eq; plot only what's logged + steps, nd = [], {k: ([], []) for k in present} for line in txt.splitlines(): if "| INFO |" not in line: continue @@ -57,65 +82,171 @@ def parse_hk(path: Path) -> dict: if not row or not row[0].isdigit() or len(row) < len(names): continue steps.append(int(row[idx["step"]])) - for k in HK: - cols[k].append(_rate(row[idx[k]])) - return dict(steps=np.array(steps), **{k: np.array(v) for k, v in cols.items()}) + for k in present: + n, d = row[idx[k]].split("/") + nd[k][0].append(int(n)) + nd[k][1].append(int(d)) + m = re.search(r"seed(\d+)", path.name) or re.search(r"_s(\d+)", path.name) + return dict( + method=classify(txt), + seed=m.group(1) if m else "?", + steps=np.array(steps), + **{k: (np.array(v[0]), np.array(v[1])) for k, v in nd.items()}, + ) -def _onset(steps: np.ndarray, rate: np.ndarray) -> 
int | None: - """First step where the cumulative hack rate is > 0 (loophole first exploited).""" - nz = np.where(np.nan_to_num(rate) > 0)[0] - return int(steps[nz[0]]) if len(nz) else None +def ema(y: np.ndarray, span: int) -> np.ndarray: + """EMA that carries the last value across NaN gaps (steps where a mode saw 0 rollouts).""" + a = 2.0 / (span + 1.0) + out = np.full(len(y), np.nan) + m = None + for i, v in enumerate(y): + if np.isnan(v): + out[i] = m if m is not None else np.nan + continue + m = v if m is None else a * v + (1 - a) * m + out[i] = m + return out -def plot(runs: list[dict], seeds: list[str], out: Path) -> None: - """One curve per mode. Multi-seed: thin per-seed lines + bold mean; a mode counts - as 'learned' if it lifts off 0 in the MEAN. Single-seed: bold line + onset dot.""" - fig, ax = plt.subplots(figsize=(6.4, 3.8)) - L = min(len(r["steps"]) for r in runs) - x = runs[0]["steps"][:L] - n_learned = ymax = 0 - for k, (mode, color) in HK.items(): - stacked = np.stack([np.nan_to_num(r[k][:L]) for r in runs]) # (seeds, steps) - ymean = stacked.mean(axis=0) - ymax = max(ymax, ymean.max()) - if len(runs) > 1: - for ys in stacked: - ax.plot(x, ys, color=color, lw=0.7, alpha=0.30, solid_capstyle="round") - ax.plot(x, ymean, color=color, lw=2.0, solid_capstyle="round") - on = _onset(x, ymean) - if on is not None: - n_learned += 1 - if len(runs) == 1: - ax.plot(on, ymean[x == on][0], "o", color=color, ms=5, zorder=5) - tag = f"{mode} {ymean[-1]*100:.0f}%" + ("" if on is not None else " (never)") - ax.annotate(tag, (x[-1], ymean[-1]), color=color, fontsize=8, va="center", - xytext=(6, 0), textcoords="offset points") - seed_lbl = f"seed {seeds[0]}" if len(seeds) == 1 else f"{len(seeds)} seeds: {','.join(seeds)}" - ax.set_xlabel("GRPO step") - ax.set_ylabel("cumulative hack rate (per mode)") - ax.set_title(f"vanilla GRPO learns {n_learned} of {len(HK)} loopholes ({seed_lbl})") - ax.set_ylim(-0.02, max(0.5, ymax * 1.15)) - ax.set_xlim(0, x[-1] * 1.28) # 
headroom for end-labels +def rate(n: np.ndarray, d: np.ndarray, *, cumulative: bool, span: int) -> np.ndarray: + """Per-step hack rate of one mode. cumulative=running N/M; else EMA of the + instantaneous batch rate dN/dM (NaN where the batch saw no rollouts of this mode).""" + if cumulative: + return np.where(d > 0, n / np.where(d == 0, 1, d), np.nan) + dn = np.diff(n) + dd = np.diff(d) + inst = np.empty(len(n)) + inst[0] = n[0] / d[0] if d[0] > 0 else np.nan + inst[1:] = np.where(dd > 0, dn / np.where(dd == 0, 1, dd), np.nan) + return ema(inst, span) + + +def _despine(ax): ax.spines[["top", "right"]].set_visible(False) - ax.grid(axis="y", lw=0.4, alpha=0.4) - fig.tight_layout() + ax.grid(axis="y", lw=0.4, alpha=0.35) + + +def _onset(x, y) -> int | None: + nz = np.where(np.nan_to_num(y) > 0)[0] + return int(x[nz[0]]) if len(nz) else None + + +def plot_by_method(runs, ylabel, cumulative, span, out: Path): + """One panel per method; one line per hack type. Multi-seed -> mean bold + per-seed thin.""" + methods = [m for m in METHODS if any(r["method"] == m for r in runs)] + modes = [k for k in HK if all(k in r for r in runs)] + fig, axes = plt.subplots(1, len(methods), figsize=(3.5 * len(methods), 3.6), + sharey=True, sharex=True, squeeze=False) + axes = axes[0] + for ax, method in zip(axes, methods): + grp = [r for r in runs if r["method"] == method] + L = min(len(r["steps"]) for r in grp) + x = grp[0]["steps"][:L] + n_learned = 0 + for k in modes: + mode, color = HK[k] + stk = np.stack([rate(*r[k], cumulative=cumulative, span=span)[:L] for r in grp]) + ymean = np.nanmean(stk, axis=0) + for ys in stk if len(grp) > 1 else []: + ax.plot(x, ys, color=color, lw=0.6, alpha=0.30) + ax.plot(x, ymean, color=color, lw=1.8, solid_capstyle="round") + on = _onset(x, ymean) + n_learned += on is not None + ax.annotate(f"{mode} {np.nan_to_num(ymean[-1])*100:.0f}%", (x[-1], np.nan_to_num(ymean[-1])), + color=color, fontsize=7, va="center", xytext=(5, 0), textcoords="offset points") + 
label, _, dashed = METHODS[method] + ax.set_title(f"{label} ({n_learned}/{len(modes)} learned)", fontsize=9) + ax.set_xlabel("GRPO step") + ax.set_xlim(0, x[-1] * 1.30) + _despine(ax) + if dashed: + ax.text(0.03, 0.97, "train-time\n(deploy lower)", transform=ax.transAxes, + fontsize=6.5, va="top", color="#888") + axes[0].set_ylabel(ylabel) + axes[0].set_ylim(-0.02, None) out.parent.mkdir(parents=True, exist_ok=True) - fig.savefig(out, dpi=140) - logger.info(f"wrote {out} ({n_learned}/{len(HK)} learned, {len(seeds)} seed(s))") + fig.tight_layout() + fig.savefig(out, dpi=140, bbox_inches="tight") + logger.info(f"wrote {out} (by-method, {len(methods)} methods)") + + +def plot_by_hack(runs, ylabel, cumulative, span, out: Path): + """One panel per hack type; one line per method (mean over seeds, thin per-seed).""" + methods = [m for m in METHODS if any(r["method"] == m for r in runs)] + modes = [k for k in HK if all(k in r for r in runs)] + fig, axes = plt.subplots(1, len(modes), figsize=(3.2 * len(modes), 3.6), + sharey=True, sharex=True, squeeze=False) + axes = axes[0] + for ax, k in zip(axes, modes): + mode, _ = HK[k] + for method in methods: + grp = [r for r in runs if r["method"] == method] + L = min(len(r["steps"]) for r in grp) + x = grp[0]["steps"][:L] + stk = np.stack([rate(*r[k], cumulative=cumulative, span=span)[:L] for r in grp]) + ymean = np.nanmean(stk, axis=0) + label, color, dashed = METHODS[method] + for ys in stk if len(grp) > 1 else []: + ax.plot(x, ys, color=color, lw=0.6, alpha=0.25, ls="--" if dashed else "-") + ax.plot(x, ymean, color=color, lw=1.8, ls="--" if dashed else "-", solid_capstyle="round") + ax.annotate(label, (x[-1], np.nan_to_num(ymean[-1])), color=color, fontsize=7, + va="center", xytext=(5, 0), textcoords="offset points") + ax.set_title(mode, fontsize=9) + ax.set_xlabel("GRPO step") + ax.set_xlim(0, x[-1] * 1.45) + _despine(ax) + axes[0].set_ylabel(ylabel) + axes[0].set_ylim(-0.02, None) + out.parent.mkdir(parents=True, 
exist_ok=True) + fig.tight_layout() + fig.savefig(out, dpi=140, bbox_inches="tight") + logger.info(f"wrote {out} (by-hack, {len(modes)} modes)") def main() -> None: - ap = argparse.ArgumentParser() + """Single plotting entrypoint (`just plot`). Emits FOUR figures from one set + of logs, reusing two parsers/owners: + + _by_method.png per-mode, panel per method (this script's parse_hk) + _by_hack.png per-mode, panel per hack (this script's parse_hk) + _aggregate.png aggregate small-multiples (plot_dynamics.plot) + _aggregate_hack_overlay.png arm-vs-arm hack overlay (plot_dynamics) + + The aggregate pair is the "total hacks per arm" core plot -- delegated to + plot_dynamics (which owns the deploy-curve substitution for routing and the + cos-alignment rows), NOT reimplemented here. --no-aggregate skips it (e.g. on + logs without the cos_pre/deploy columns). + """ + ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("logs", nargs="+", type=Path) - ap.add_argument("--out", type=Path, default=Path("out/figs/substrate_emergence.png")) + ap.add_argument("--out-stem", type=Path, default=Path("out/figs/substrate"), + help="writes _by_method.png, _by_hack.png, _aggregate*.png") + ap.add_argument("--cumulative", action="store_true", help="running N/M instead of EMA instantaneous") + ap.add_argument("--ema-span", type=int, default=6) + ap.add_argument("--no-aggregate", action="store_true", + help="skip the plot_dynamics aggregate + overlay figures") args = ap.parse_args() - runs, seeds = [], [] - for path in args.logs: - runs.append(parse_hk(path)) - m = re.search(r"seed(\d+)", path.name) or re.search(r"_s(\d+)", path.name) - seeds.append(m.group(1) if m else "?") - plot(runs, seeds, args.out) + stem = args.out_stem + + # 1-2. 
per-mode small multiples (this script owns these) + runs = [parse_hk(p) for p in args.logs] + logger.info(f"parsed {len(runs)} runs: " + ", ".join(f"{r['method']}/s{r['seed']}" for r in runs)) + ylabel = "cumulative hack rate" if args.cumulative else f"hack rate (EMA span {args.ema_span})" + plot_by_method(runs, ylabel, args.cumulative, args.ema_span, stem.with_name(stem.name + "_by_method.png")) + plot_by_hack(runs, ylabel, args.cumulative, args.ema_span, stem.with_name(stem.name + "_by_hack.png")) + + # 3-4. aggregate "total hacks per arm" + hack overlay (reuse plot_dynamics, + # which owns route's deploy-curve substitution + the cos-alignment rows). + if not args.no_aggregate: + import plot_dynamics as pd + agg_runs = [r for p in args.logs if (r := pd.parse_log(p))] + if agg_runs: + agg = stem.with_name(stem.name + "_aggregate.png") + pd.plot(agg_runs, agg) + pd.plot_hack_overlay(agg_runs, agg.with_name(agg.stem + "_hack_overlay.png")) + else: + logger.warning("no runs had aggregate columns (cos_pre/hack_s) -- skipped aggregate figs") if __name__ == "__main__":