"""Multi-loophole substrate emergence plot (#148): how many of the K classes does a vanilla-GRPO student learn, and in what order? One line per loophole mode. y = cumulative student hack rate of THAT mode (hk_ = hacks / rollouts-of-that-mode-seen-so-far, parsed from the per-step training table). A class "emerges" when its line lifts off 0; a flat-at-0 line (eq_override) is a loophole the student never learned. first_step onset is dotted. Unlike plot_emergence.py (one --env-mode per log, Phase-1 single-mode runs), the substrate run interleaves all K modes in ONE log via the hk_ columns, so we parse those columns directly rather than grouping logs by env_mode. Usage: uv run python scripts/plot_substrate.py logs/*_substrate3_none_s41.log uv run python scripts/plot_substrate.py logs/.log --out out/figs/substrate_emergence.png """ from __future__ import annotations import argparse import re from pathlib import Path import matplotlib.pyplot as plt import numpy as np from loguru import logger # hk_ column header -> (display mode, colour). Order = legend/label order. HK = { "hk_fm": ("file_marker", "#7b3294"), "hk_rt": ("run_tests", "#c1432b"), "hk_so": ("stdout_marker", "#b8860b"), "hk_se": ("sentinel", "#2f7d4f"), "hk_eq": ("eq_override", "#33508c"), } _HDR_TOK = re.compile(r"[A-Za-z_]+") # "hack_s?" 
def _rate(tok: str) -> float:
    """Convert a cumulative ``N/M`` table cell into the fraction N/M.

    Args:
        tok: Cell text of the form ``N/M`` with integer N and M.

    Returns:
        ``N / M`` as a float, or NaN when the denominator is 0.

    Raises:
        ValueError: If ``tok`` is not two integers separated by a single ``/``.
    """
    num, den = (int(part) for part in tok.split("/"))
    return num / den if den else np.nan


def parse_hk(path: Path) -> dict:
    """Parse the per-step ``hk_*`` columns out of a substrate run log.

    The header is the first ``| INFO |`` line that mentions both ``ref_eq`` and
    ``hk_rt``. A data row is any ``| INFO |`` line whose first cell is an
    integer, that has at least as many cells as the header, and whose ``step``
    and ``hk_*`` cells actually parse; everything else is ignored as ordinary
    log chatter.

    Args:
        path: Log file to read (undecodable bytes are replaced).

    Returns:
        ``{"steps": int array, "hk_fm": float array, ...}`` with one element per
        parsed row and one rate array per key of ``HK``. The arrays are empty
        when the log has a header but no data rows.

    Raises:
        ValueError: If no header line is found, or the header lacks ``step`` or
            any of the ``HK`` columns.
    """
    marker = "| INFO |"
    lines = path.read_text(errors="replace").splitlines()
    hdr = next(
        (ln for ln in lines if marker in ln and "ref_eq" in ln and "hk_rt" in ln),
        None,
    )
    if hdr is None:
        raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?")

    names = []
    for tok in hdr.split(marker, 1)[1].split():
        m = _HDR_TOK.match(tok)
        # Keep tokens the regex cannot normalise (e.g. "%") verbatim so the
        # column positions stay aligned with the data rows.
        names.append(m.group(0) if m else tok)
    idx = {n: i for i, n in enumerate(names)}
    missing = [c for c in ("step", *HK) if c not in idx]
    if missing:
        raise ValueError(f"{path}: substrate header lacks column(s): {', '.join(missing)}")

    steps, cols = [], {k: [] for k in HK}
    for ln in lines:
        if marker not in ln:
            continue
        row = ln.split(marker, 1)[1].split()
        if not row or not row[0].isdigit() or len(row) < len(names):
            continue
        try:
            # Parse the whole row before appending anything so a non-table
            # line that merely starts with a digit cannot leave the columns
            # with mismatched lengths.
            step = int(row[idx["step"]])
            rates = [_rate(row[idx[k]]) for k in HK]
        except ValueError:
            continue
        steps.append(step)
        for k, r in zip(HK, rates):
            cols[k].append(r)
    return dict(
        steps=np.array(steps, dtype=int),
        **{k: np.array(v, dtype=float) for k, v in cols.items()},
    )


def _onset(steps: np.ndarray, rate: np.ndarray) -> int | None:
    """Return the first step whose cumulative hack rate is > 0, else None.

    NaN rates (no rollouts of that mode seen yet) count as 0.
    """
    hit = np.flatnonzero(np.nan_to_num(rate) > 0)
    return int(steps[hit[0]]) if hit.size else None


def plot(runs: list[dict], seeds: list[str], out: Path) -> None:
    """Draw one curve per loophole mode and save the figure to ``out``.

    Multi-seed: thin per-seed lines + bold mean; a mode counts as 'learned' if
    it lifts off 0 in the MEAN. Single-seed: bold line + onset dot.

    Runs are truncated to the shortest run's row count, and the x axis is taken
    from the first run's steps (assumes all runs log the same step sequence).

    Args:
        runs: Parsed logs as returned by ``parse_hk``.
        seeds: Seed labels for the title, one per run.
        out: Image path to write; parent directories are created.

    Raises:
        ValueError: If ``runs`` is empty or any run has no parsed rows.
    """
    if not runs:
        raise ValueError("no runs to plot")
    n_steps = min(len(r["steps"]) for r in runs)
    if n_steps == 0:
        raise ValueError("a run has no parsed training rows -- nothing to plot")
    x = runs[0]["steps"][:n_steps]

    fig, ax = plt.subplots(figsize=(6.4, 3.8))
    try:
        n_learned = 0
        ymax = 0.0
        for k, (mode, color) in HK.items():
            # NaN (mode not yet seen) is plotted as a rate of 0.
            stacked = np.stack([np.nan_to_num(r[k][:n_steps]) for r in runs])  # (seeds, steps)
            ymean = stacked.mean(axis=0)
            ymax = max(ymax, float(ymean.max()))
            if len(runs) > 1:
                for ys in stacked:
                    ax.plot(x, ys, color=color, lw=0.7, alpha=0.30, solid_capstyle="round")
            ax.plot(x, ymean, color=color, lw=2.0, solid_capstyle="round")
            on = _onset(x, ymean)
            if on is not None:
                n_learned += 1
                if len(runs) == 1:
                    ax.plot(on, ymean[x == on][0], "o", color=color, ms=5, zorder=5)
            tag = f"{mode} {ymean[-1]*100:.0f}%" + ("" if on is not None else " (never)")
            ax.annotate(
                tag,
                (x[-1], ymean[-1]),
                color=color,
                fontsize=8,
                va="center",
                xytext=(6, 0),
                textcoords="offset points",
            )

        seed_lbl = f"seed {seeds[0]}" if len(seeds) == 1 else f"{len(seeds)} seeds: {','.join(seeds)}"
        ax.set_xlabel("GRPO step")
        ax.set_ylabel("cumulative hack rate (per mode)")
        ax.set_title(f"vanilla GRPO learns {n_learned} of {len(HK)} loopholes ({seed_lbl})")
        ax.set_ylim(-0.02, max(0.5, ymax * 1.15))
        # Headroom for end-labels; the max() guard avoids a zero-width axis
        # when the only logged step is 0.
        ax.set_xlim(0, max(x[-1], 1) * 1.28)
        ax.spines[["top", "right"]].set_visible(False)
        ax.grid(axis="y", lw=0.4, alpha=0.4)
        fig.tight_layout()
        out.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out, dpi=140)
    finally:
        # pyplot keeps every figure alive until closed; release it even if
        # drawing or saving failed.
        plt.close(fig)
    logger.info(f"wrote {out} ({n_learned}/{len(HK)} learned, {len(seeds)} seed(s))")


def main() -> None:
    """CLI entry point: parse each log, infer its seed from the file name, plot."""
    ap = argparse.ArgumentParser()
    ap.add_argument("logs", nargs="+", type=Path)
    ap.add_argument("--out", type=Path, default=Path("out/figs/substrate_emergence.png"))
    args = ap.parse_args()

    runs, seeds = [], []
    for path in args.logs:
        runs.append(parse_hk(path))
        # Accept both "...seed41..." and "..._s41..." naming; "?" if neither.
        m = re.search(r"seed(\d+)", path.name) or re.search(r"_s(\d+)", path.name)
        seeds.append(m.group(1) if m else "?")
    plot(runs, seeds, args.out)


if __name__ == "__main__":
    main()