plot: #148 substrate emergence — per-mode hack-rate curves (4/5 learned, seed 41)

New scripts/plot_substrate.py parses the hk_<mode> cumulative columns from a multi-loophole substrate run (one log, K interleaved modes) and draws one learning curve per mode with first_step onset dots and direct end-labels. plot_emergence.py can't do this (it groups logs by a single --env-mode). Figure shows the headline: vanilla GRPO learns file_marker/run_tests/stdout_marker/sentinel, eq_override flat at 0 (never). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 20:21:41 +08:00 · 2026-05-30 16:49:03 +00:00
parent b685a4495b
commit 0ea751c5bc
2 changed files with 112 additions and 0 deletions
@@ -0,0 +1,112 @@
+"""Multi-loophole substrate emergence plot (#148): how many of the K classes does a
+vanilla-GRPO student learn, and in what order?
+
+One line per loophole mode. y = cumulative student hack rate of THAT mode
+(hk_<mode> = hacks / rollouts-of-that-mode-seen-so-far, parsed from the per-step
+training table). A class "emerges" when its line lifts off 0; a flat-at-0 line
+(eq_override) is a loophole the student never learned. Each mode's onset (the first
+step with a non-zero cumulative rate) is marked with a dot.
+
+Unlike plot_emergence.py (one --env-mode per log, Phase-1 single-mode runs), the
+substrate run interleaves all K modes in ONE log via the hk_<mode> columns, so we
+parse those columns directly rather than grouping logs by env_mode.
+
+Usage:
+  uv run python scripts/plot_substrate.py logs/*_substrate3_none_s41.log
+  uv run python scripts/plot_substrate.py logs/<run>.log --out out/figs/substrate_emergence.png
+"""
+from __future__ import annotations
+
+import argparse
+import re
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+from loguru import logger
+
+# hk_ column header -> (display mode, colour). Order = legend/label order.
+# Keys are the column names looked up in the parsed table header; plot() iterates
+# this dict in insertion order, so reordering entries reorders the drawn curves.
+HK = {
+    "hk_fm": ("file_marker", "#7b3294"),
+    "hk_rt": ("run_tests", "#c1432b"),
+    "hk_so": ("stdout_marker", "#b8860b"),
+    "hk_se": ("sentinel", "#2f7d4f"),
+    "hk_eq": ("eq_override", "#33508c"),
+}
+# Leading run of letters/underscores of a header token; applied with .match() so
+# trailing decoration is dropped. Digits are not in the class, so a column name
+# containing a digit would be cut at that digit.
+_HDR_TOK = re.compile(r"[A-Za-z_]+")  # "hack_s?" -> "hack_s"
+
+
+def _rate(tok: str) -> float:
+    """Turn a cumulative ``N/M`` table cell into the fraction N / M.
+
+    The denominator is parsed first; when it is 0 the result is NaN and the
+    numerator is never parsed.
+    """
+    num_txt, den_txt = tok.split("/")
+    den = int(den_txt)
+    if den == 0:
+        return np.nan
+    return int(num_txt) / den
+
+
+def parse_hk(path: Path) -> dict:
+    """Parse the per-step training table of a substrate run log.
+
+    The header is the first ``| INFO |`` line that mentions both ``ref_eq`` and
+    ``hk_rt``; its whitespace-separated tokens (reduced to their leading
+    letters/underscores) name the columns. Every later ``| INFO |`` line with at
+    least as many tokens as the header and a numeric ``step`` cell is a data row.
+
+    Args:
+        path: log file to read (undecodable bytes are replaced, not fatal).
+
+    Returns:
+        ``{"steps": int array, "hk_fm": float array, ...}`` with one float array
+        per key of ``HK`` (cumulative rate per step, NaN while the denominator is 0).
+
+    Raises:
+        ValueError: if no header line is found, or the header lacks the ``step``
+            column or any ``hk_<mode>`` column listed in ``HK``.
+    """
+    sep = "| INFO |"
+    lines = path.read_text(errors="replace").splitlines()
+    # Require the separator on the header too, so a stray line that merely mentions
+    # the column names (e.g. a config dump) is not mistaken for the table header.
+    hdr = next((ln for ln in lines if sep in ln and "ref_eq" in ln and "hk_rt" in ln), None)
+    if hdr is None:
+        raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?")
+    names = []
+    for tok in hdr.split(sep, 1)[1].split():
+        m = _HDR_TOK.match(tok)
+        # keep the raw token when it has no leading letters, so column positions stay aligned
+        names.append(m.group(0) if m else tok)
+    idx = {n: i for i, n in enumerate(names)}
+    missing = [c for c in ("step", *HK) if c not in idx]
+    if missing:
+        raise ValueError(f"{path}: substrate header lacks column(s): {', '.join(missing)}")
+    step_i = idx["step"]
+    steps, cols = [], {k: [] for k in HK}
+    for line in lines:
+        if sep not in line:
+            continue
+        row = line.split(sep, 1)[1].split()
+        # short rows and non-numeric step cells (including the header itself) are not data
+        if len(row) < len(names) or not row[step_i].isdigit():
+            continue
+        steps.append(int(row[step_i]))
+        for k in HK:
+            cols[k].append(_rate(row[idx[k]]))
+    return dict(steps=np.array(steps), **{k: np.array(v) for k, v in cols.items()})
+
+
+def _onset(steps: np.ndarray, rate: np.ndarray) -> int | None:
+    """Return the step at which the cumulative hack rate first exceeds 0.
+
+    NaN entries count as 0. Returns None when the rate never rises above 0.
+    """
+    positive = np.flatnonzero(np.nan_to_num(rate) > 0)
+    if positive.size == 0:
+        return None
+    return int(steps[positive[0]])
+
+
+def plot(run: dict, seed: str, out: Path) -> None:
+    """Draw one cumulative hack-rate curve per loophole mode and save the figure.
+
+    Each mode in ``HK`` gets a line, a dot at its onset step (if it ever rises
+    above 0) and a direct end-label with its final rate; modes that never rise
+    are labelled "(never)". The title reports how many modes were learned.
+
+    Args:
+        run: mapping with a ``"steps"`` array plus one rate array per ``HK`` key
+            (the structure ``parse_hk`` returns). NaN rates are treated as 0 for
+            the onset dot, the end-label and the y-axis limit.
+        seed: seed label shown in the title.
+        out: image path to write; missing parent directories are created.
+
+    Raises:
+        ValueError: if ``run`` contains no steps.
+    """
+    x = run["steps"]
+    if len(x) == 0:
+        raise ValueError("cannot plot an empty run (no table rows were parsed)")
+    fig, ax = plt.subplots(figsize=(6.4, 3.8))
+    try:
+        n_learned = 0
+        peak = 0.0  # highest rate over all modes, drives the y-axis limit
+        for k, (mode, color) in HK.items():
+            y = run[k]
+            y0 = np.nan_to_num(y)  # NaN (denominator still 0) counts as "no hacks yet"
+            peak = max(peak, float(y0.max()))
+            ax.plot(x, y, color=color, lw=2.0, solid_capstyle="round")
+            on = _onset(x, y)
+            final = y0[-1]
+            if on is not None:
+                n_learned += 1
+                ax.plot(on, y0[x == on][0], "o", color=color, ms=5, zorder=5)
+            # direct end-label (no legend): mode + final rate, or "never" for the holdout
+            tag = f"{mode}  {final*100:.0f}%" + ("" if on is not None else "  (never)")
+            ax.annotate(tag, (x[-1], final), color=color, fontsize=8, va="center",
+                        xytext=(6, 0), textcoords="offset points")
+        ax.set_xlabel("GRPO step")
+        ax.set_ylabel("cumulative hack rate (per mode)")
+        ax.set_title(f"vanilla GRPO learns {n_learned} of {len(HK)} loopholes  (seed {seed})")
+        ax.set_ylim(-0.02, max(0.5, peak * 1.1))
+        ax.set_xlim(0, x[-1] * 1.28)  # headroom for end-labels
+        ax.spines[["top", "right"]].set_visible(False)
+        ax.grid(axis="y", lw=0.4, alpha=0.4)
+        fig.tight_layout()
+        out.parent.mkdir(parents=True, exist_ok=True)
+        fig.savefig(out, dpi=140)
+    finally:
+        # release the figure even if drawing/saving fails; pyplot keeps it alive otherwise
+        plt.close(fig)
+    logger.info(f"wrote {out}  ({n_learned}/{len(HK)} learned)")
+
+
+def _seed_from_name(name: str) -> str:
+    """Best-effort seed label from a log file name; "?" when none is found.
+
+    Tried in order: an explicit ``seed<digits>``, then a stand-alone ``s<digits>``
+    token (not glued to other letters/digits), then any ``s<digits>``. The middle
+    step keeps e.g. ``runs2_none_s41.log`` from yielding "2" instead of "41".
+    """
+    for pattern in (r"seed(\d+)", r"(?<![A-Za-z0-9])s(\d+)(?![A-Za-z0-9])", r"s(\d+)"):
+        found = re.search(pattern, name)
+        if found:
+            return found.group(1)
+    return "?"
+
+
+def main() -> None:
+    """CLI entry point: parse one substrate log and write the emergence figure."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("logs", nargs="+", type=Path)
+    ap.add_argument("--out", type=Path, default=Path("out/figs/substrate_emergence.png"))
+    args = ap.parse_args()
+    # single-seed for now; first log wins (seed overlay is a later extension)
+    path = args.logs[0]
+    if len(args.logs) > 1:
+        # a shell glob can expand to several logs; say which ones are being ignored
+        logger.warning(f"{len(args.logs)} logs given; plotting only {path} (first log wins)")
+    plot(parse_hk(path), _seed_from_name(path.name), args.out)
+
+
+if __name__ == "__main__":
+    main()