# Reconstructed from a diff that adds scripts/plot_substrate.py (new file) together with the
# rendered figure out/figs/substrate_emergence.png (binary).
"""Multi-loophole substrate emergence plot (#148): how many of the K classes does a
vanilla-GRPO student learn, and in what order?

One line per loophole mode. y = cumulative student hack rate of THAT mode
(hk_<mode> = hacks / rollouts-of-that-mode-seen-so-far, parsed from the per-step
training table). A class "emerges" when its line lifts off 0; a flat-at-0 line
(eq_override) is a loophole the student never learned. The onset (first step with a
non-zero cumulative rate) is marked with a dot.

Unlike plot_emergence.py (one --env-mode per log, Phase-1 single-mode runs), the
substrate run interleaves all K modes in ONE log via the hk_<mode> columns, so we
parse those columns directly rather than grouping logs by env_mode.

Usage:
    uv run python scripts/plot_substrate.py logs/*_substrate3_none_s41.log
    uv run python scripts/plot_substrate.py logs/<run>.log --out out/figs/substrate_emergence.png
"""
from __future__ import annotations

import argparse
import re
from pathlib import Path

import numpy as np

try:
    from loguru import logger
except ImportError:
    # Keeps the parsing helpers importable where loguru is not installed.
    import logging

    logger = logging.getLogger(__name__)

# Marker that precedes the payload of every table line (header and rows) in the log.
_SEP = "| INFO |"

# hk_<xx> column header -> (display mode, colour). Order = plotting/label order.
HK = {
    "hk_fm": ("file_marker", "#7b3294"),
    "hk_rt": ("run_tests", "#c1432b"),
    "hk_so": ("stdout_marker", "#b8860b"),
    "hk_se": ("sentinel", "#2f7d4f"),
    "hk_eq": ("eq_override", "#33508c"),
}
_HDR_TOK = re.compile(r"[A-Za-z_]+")  # "hack_s?" -> "hack_s"
_CELL = re.compile(r"\d+/\d+")  # a cumulative "N/M" table cell

# Seed label in a log filename, tried in order: explicit "seed41", then a standalone
# "s41" token (not glued to a preceding letter/digit), then the loose "s<digits>"
# match as a last resort.
_SEED_PATTERNS = (
    re.compile(r"seed(\d+)"),
    re.compile(r"(?<![A-Za-z0-9])s(\d+)(?![A-Za-z0-9])"),
    re.compile(r"s(\d+)"),
)


def _rate(tok: str) -> float:
    """Convert a cumulative ``N/M`` cell to the fraction N/M (NaN if M is 0).

    Raises:
        ValueError: if ``tok`` is not two integers separated by a single "/".
    """
    num, den = (int(part) for part in tok.split("/"))
    return num / den if den else np.nan


def _header_names(hdr: str) -> list[str]:
    """Column names of the table header line, one per whitespace-separated token.

    Trailing decoration is stripped ("hack_s?" -> "hack_s"). A token that does not
    start with a letter/underscore is kept verbatim so that column positions still
    line up with the data rows.
    """
    names = []
    for tok in hdr.split(_SEP, 1)[1].split():
        m = _HDR_TOK.match(tok)
        names.append(m.group(0) if m else tok)
    return names


def parse_hk(path: Path) -> dict[str, np.ndarray]:
    """Parse the per-step hack-rate table out of a substrate run log.

    The header is the first ``| INFO |`` line mentioning both ``ref_eq`` and ``hk_rt``.
    Data rows are ``| INFO |`` lines whose first cell is a number, that have at least
    as many cells as the header, and whose step / hk_ cells are well-formed; any other
    INFO line is skipped rather than aborting the parse.

    Args:
        path: log file to read (undecodable bytes are replaced).

    Returns:
        ``{"steps": int[], "hk_rt": float[], ...}`` with one array per ``HK`` column
        that appears in the header (all of them for a full run). Arrays are empty if
        the header was found but no data row followed.

    Raises:
        ValueError: no substrate header, or the header lacks ``step`` / any hk_ column.
    """
    lines = path.read_text(errors="replace").splitlines()
    hdr = next(
        (ln for ln in lines if _SEP in ln and "ref_eq" in ln and "hk_rt" in ln), None
    )
    if hdr is None:
        raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?")
    names = _header_names(hdr)
    idx = {n: i for i, n in enumerate(names)}
    if "step" not in idx:
        raise ValueError(f"{path}: substrate header has no 'step' column")
    # A run may expose only a subset of the known modes; parse what is there.
    present = [k for k in HK if k in idx]
    if not present:
        raise ValueError(f"{path}: substrate header has none of the columns {list(HK)}")

    step_i = idx["step"]
    steps: list[int] = []
    cols: dict[str, list[float]] = {k: [] for k in present}
    for ln in lines:
        if _SEP not in ln:
            continue
        row = ln.split(_SEP, 1)[1].split()
        if not row or not row[0].isdigit() or len(row) < len(names):
            continue
        # Unrelated INFO lines can start with a number too; require real table cells.
        if not row[step_i].isdigit():
            continue
        if not all(_CELL.fullmatch(row[idx[k]]) for k in present):
            continue
        steps.append(int(row[step_i]))
        for k in present:
            cols[k].append(_rate(row[idx[k]]))
    return dict(steps=np.array(steps), **{k: np.array(v) for k, v in cols.items()})


def _onset_index(rate: np.ndarray) -> int | None:
    """Index of the first entry with a cumulative rate > 0 (NaN counts as 0), else None."""
    hits = np.flatnonzero(np.nan_to_num(rate) > 0)
    return int(hits[0]) if hits.size else None


def _onset(steps: np.ndarray, rate: np.ndarray) -> int | None:
    """First step where the cumulative hack rate is > 0 (loophole first exploited)."""
    i = _onset_index(rate)
    return None if i is None else int(steps[i])


def plot(run: dict, seed: str, out: Path) -> None:
    """Draw one cumulative hack-rate line per mode and save the figure to ``out``.

    Args:
        run: mapping as returned by ``parse_hk`` ("steps" plus hk_ arrays).
        seed: seed label shown in the title.
        out: destination image path; parent directories are created.

    Raises:
        ValueError: ``run`` has no steps or none of the ``HK`` columns.
    """
    # Imported here so the log-parsing helpers stay usable without the plotting stack.
    import matplotlib.pyplot as plt

    x = run["steps"]
    modes = [k for k in HK if k in run]
    if len(x) == 0 or not modes:
        raise ValueError("nothing to plot: run has no parsed steps or no hk_ columns")

    fig, ax = plt.subplots(figsize=(6.4, 3.8))
    try:
        n_learned = 0
        peak = 0.0
        for k in modes:
            mode, color = HK[k]
            y = run[k]
            y0 = np.nan_to_num(y)  # NaN (no rollouts of this mode yet) counts as 0
            ax.plot(x, y, color=color, lw=2.0, solid_capstyle="round")
            on = _onset_index(y)
            final = y0[-1]
            if on is not None:
                n_learned += 1
                # Index directly so a repeated step value cannot select the wrong row.
                ax.plot(x[on], y0[on], "o", color=color, ms=5, zorder=5)
            # direct end-label (no legend): mode + final rate, or "never" for the holdout
            tag = f"{mode} {final*100:.0f}%" + ("" if on is not None else " (never)")
            ax.annotate(tag, (x[-1], final), color=color, fontsize=8, va="center",
                        xytext=(6, 0), textcoords="offset points")
            peak = max(peak, float(y0.max()))
        ax.set_xlabel("GRPO step")
        ax.set_ylabel("cumulative hack rate (per mode)")
        ax.set_title(f"vanilla GRPO learns {n_learned} of {len(modes)} loopholes (seed {seed})")
        ax.set_ylim(-0.02, max(0.5, peak * 1.1))
        # headroom for end-labels; floor of 1 avoids a zero-width axis for a lone step 0
        ax.set_xlim(0, max(x[-1], 1) * 1.28)
        ax.spines[["top", "right"]].set_visible(False)
        ax.grid(axis="y", lw=0.4, alpha=0.4)
        fig.tight_layout()
        out.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out, dpi=140)
    finally:
        # pyplot keeps figures alive until closed; release it even if saving failed.
        plt.close(fig)
    logger.info(f"wrote {out} ({n_learned}/{len(modes)} learned)")


def _seed_of(name: str) -> str:
    """Seed label parsed from a log filename, or "?" when none is found."""
    for pat in _SEED_PATTERNS:
        m = pat.search(name)
        if m:
            return m.group(1)
    return "?"


def main() -> None:
    """CLI entry point: parse the first log given and write the emergence figure."""
    ap = argparse.ArgumentParser(description="Plot per-mode hack-rate emergence for a substrate run.")
    ap.add_argument("logs", nargs="+", type=Path)
    ap.add_argument("--out", type=Path, default=Path("out/figs/substrate_emergence.png"))
    args = ap.parse_args()
    # single-seed for now; first log wins (seed overlay is a later extension)
    path = args.logs[0]
    if len(args.logs) > 1:
        logger.warning(f"{len(args.logs)} logs given; plotting only {path}")
    plot(parse_hk(path), _seed_of(path.name), args.out)


if __name__ == "__main__":
    main()