results tooling + solve-orth knob + results-by-question doc

- scripts/results.py + `just results`: aggregate logs/*.log into last-5 hack_s and gt_s (solve) tables, sorted-by-time + grouped-by-config, with full argv provenance column. Filters smoke/probe runs.
- extract_vhack_grad: solve_orth_m knob — strip top-m known-solve subspace (SVD of clean-side grads) from D before SVD, so projection doesn't ablate the solve signal. No grader/oracle, off by default.
- docs/results.md: every experiment grouped by the question it answers (feasibility, H1, gate_mode, basis, refresh, mix, noise-floor, pair-set) with comparison tables and answers.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 18:04:59 +08:00 · 2026-05-29 07:21:05 +00:00
parent 826b2aa83e
commit 4464f9d312
4 changed files with 318 additions and 0 deletions
@@ -0,0 +1,121 @@
+"""Aggregate all train.py runs from logs/*.log into one sorted/grouped table.
+
+Durable source: each run writes logs/<ts>_<preset>_<arm>_seed<seed>_<tag>.log
+with an `argv:` line (config) and per-step rows. We parse those directly and
+recompute the metrics ourselves, so this survives `pueue reset` and doesn't
+depend on the BLUF line.
+
+Headline metric is mean-of-last-5-steps (noise-robust; the converged regime),
+shown for BOTH hack_s (reward hacks) and gt_s (ground-truth solves) on the
+STUDENT rollouts. The whole-run hack_s mean (WH_hack) is kept as a secondary
+column because the blog Table 1 uses whole-run and the two conventions
+disagree; no whole-run mean is reported for gt_s.
+
+    just results            # full table sorted by time + grouped-by-config
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+import polars as pl
+from tabulate import tabulate
+
+# Directory globbed for *.log files; relative, so it resolves against the
+# current working directory of whoever runs the script.
+LOG_DIR = Path("logs")
+# Run timestamp as it appears in the log filename: 8 digits, "T", 6 digits.
+TS_RE = re.compile(r"(\d{8}T\d{6})")
+# per-step row after "| INFO |": step ref_eq rew rew_s sprd N gt_s gt_t hack_s hack_t ...
+# Counting from 0 in that header: gt_s is token 6, hack_s is token 8.
+GT_S, HACK_S = 6, 8  # column indices into the whitespace-split row
+
+
+def _frac(tok: str) -> float | None:
+    """Convert a ``"num/den"`` token from a per-step row into a float ratio.
+
+    Args:
+        tok: One whitespace-delimited cell, expected to look like ``"3/16"``.
+
+    Returns:
+        ``num / den`` as a float, or ``None`` when the ratio is undefined:
+        the denominator is zero, the token has no ``/``, or either side is
+        not an integer (e.g. ``"nan"``, ``"a/b"``, ``"1/2/3"``).
+
+    Malformed tokens yield ``None`` instead of raising so that a single
+    unexpected INFO line cannot abort aggregation of every log; callers
+    already skip ``None`` values.
+    """
+    num, sep, den = tok.partition("/")
+    if not sep:
+        return None
+    try:
+        top, bottom = int(num), int(den)
+    except ValueError:
+        # Non-integer side, or a second "/" left inside `den`.
+        return None
+    return top / bottom if bottom else None
+
+
+def _cfg(argv: str, preset_line: str) -> dict:
+    """Extract the run configuration columns from two log lines.
+
+    Args:
+        argv: The log line containing ``argv:`` (the recorded command line).
+        preset_line: The log line containing ``preset=`` and ``arm=``; may be
+            the empty string when no such line exists.
+
+    Returns:
+        A dict of strings keyed, in order, by ``arm``, ``preset``, ``model``,
+        ``seed``, ``mix``, ``refr``, ``over``, ``gate``, ``k``, ``dropf``,
+        ``vhack``, ``tag`` and ``argv``. A flag that is absent gets the
+        fallback given below (presumably mirroring train.py's own defaults —
+        confirm against train.py).
+    """
+    def last_match(pattern, text, fallback="-"):
+        # LAST match wins: recipes set a default flag then runs override it
+        # (e.g. --v-hack-path twice, --mix-ratio twice); tyro takes the last.
+        hits = re.findall(pattern, text)
+        if not hits:
+            return fallback
+        return hits[-1]
+
+    fields = {
+        "arm": last_match(r"--arm=(\w+)", argv),
+        "preset": last_match(r"preset=(\w+)", preset_line),
+        "model": last_match(r"model=(\S+)", preset_line),
+        # preset= line always prints the seed, so "?" flags a truncated log
+        "seed": last_match(r"seed=(\d+)", preset_line, "?"),
+        "mix": last_match(r"--mix-ratio=([\d.]+)", argv, "0.5"),
+        "refr": last_match(r"--vhack-refresh-every=(\d+)", argv),
+        "over": last_match(r"--project-overshoot=([\d.]+)", argv, "1.0"),
+        "gate": last_match(r"--gate-mode=(\w+)", argv, "one_sided"),
+        "k": last_match(r"--v-hack-k=(\d+)", argv, "5"),
+        "dropf": last_match(r"--v-hack-drop-bottom-frac=([\d.]+)", argv, "0.25"),
+        "vhack": last_match(r"v-hack-path=out/(\S+?)\.safetensors", argv),
+        "tag": last_match(r"--out-tag=(\S+)", argv, ""),
+    }
+
+    # full CLI args (after train.py) — the ground-truth provenance; any flag
+    # not parsed into a column above is still visible here. When the line has
+    # no "train.py " the whole line is kept.
+    _, found, after = argv.partition("train.py ")
+    fields["argv"] = after.strip() if found else argv.strip()
+    return fields
+
+
+def parse_log(path: Path) -> dict | None:
+    """Turn one run log into a summary row, or ``None`` if it should be skipped.
+
+    Args:
+        path: Log file to read; undecodable bytes are replaced, not fatal.
+
+    Returns:
+        ``None`` when the log has no ``argv:`` line, has no usable per-step
+        hack_s value, is a smoke run (``tiny-random`` model or ``smoke``
+        preset), or carries ``probe`` in its out-tag. Otherwise a dict with
+        ``time`` (timestamp from the filename, ``"?"`` if absent), every
+        ``_cfg`` column except ``model``, and the metrics ``L5_hack``,
+        ``L5_solve`` (means over the last five recorded values), ``WH_hack``
+        (mean over all recorded hack_s values) and ``n`` (number of recorded
+        hack_s values).
+    """
+    lines = path.read_text(errors="replace").splitlines()
+    argv = next((ln for ln in lines if "argv:" in ln), None)
+    preset_line = next(
+        (ln for ln in lines if "preset=" in ln and "arm=" in ln), "")
+    if argv is None:
+        return None
+
+    hack_vals, solve_vals = [], []
+    for ln in lines:
+        _, marker, payload = ln.partition("| INFO |")
+        if not marker:
+            continue
+        cells = payload.split()
+        # Per-step rows start with the integer step and reach the hack_s column.
+        if len(cells) <= HACK_S or not cells[0].isdigit():
+            continue
+        hack, solve = _frac(cells[HACK_S]), _frac(cells[GT_S])
+        # The two series are filtered independently, so they may differ in length.
+        if hack is not None:
+            hack_vals.append(hack)
+        if solve is not None:
+            solve_vals.append(solve)
+    if not hack_vals:
+        return None
+
+    cfg = _cfg(argv, preset_line)
+    if "tiny-random" in cfg["model"] or cfg["preset"] == "smoke":
+        return None  # CPU smoke runs, not real results
+    if "probe" in cfg["tag"]:
+        return None  # early feasibility / lr-sweep probes, not comparable baselines
+
+    def average(values):
+        if not values:
+            return None
+        return sum(values) / len(values)
+
+    stamp = TS_RE.search(path.name)
+    del cfg["model"]  # only needed for the smoke filter above
+    row = {"time": stamp.group(1) if stamp else "?"}
+    row.update(cfg)
+    row["L5_hack"] = average(hack_vals[-5:])
+    row["L5_solve"] = average(solve_vals[-5:])
+    row["WH_hack"] = average(hack_vals)
+    row["n"] = len(hack_vals)
+    return row
+
+
+def main() -> None:
+    """Print every parseable run under ``LOG_DIR`` as two Markdown pipe tables.
+
+    The first table lists each run, ordered by filename timestamp. The second
+    collapses seeds: runs sharing the same config key are averaged and the
+    contributing seeds are listed. Prints a one-line notice and returns when
+    no log yields a row.
+    """
+    rows = [r for p in sorted(LOG_DIR.glob("*.log")) if (r := parse_log(p))]
+    if not rows:
+        print("no parseable runs in logs/")
+        return
+    df = pl.DataFrame(rows).sort("time")
+
+    cols = ["time", "arm", "seed", "mix", "refr", "over", "gate", "k", "dropf",
+            "vhack", "L5_hack", "L5_solve", "WH_hack", "n", "argv"]
+    print("\n## All runs (sorted by time)\n")
+    print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt=".3f"))
+
+    # Grouped by config (collapse seeds): mean across seeds. Key on every
+    # config dim that changes the experiment so non-comparable runs don't merge.
+    key = ["arm", "mix", "refr", "over", "gate", "k", "dropf", "vhack"]
+    # group_by does not preserve input order, so the sort must cover the whole
+    # key: with dropf/vhack left out, groups differing only in those columns
+    # came out in an unspecified order from one invocation to the next.
+    order = ["mix", "arm", "refr", "over", "gate", "k", "dropf", "vhack"]
+    g = (df.group_by(key)
+           .agg(pl.col("L5_hack").mean(),
+                pl.col("L5_solve").mean(),
+                pl.col("WH_hack").mean(),
+                pl.len().alias("seeds"),
+                pl.col("seed").sort().str.join(",").alias("seed_list"))
+           .sort(order))
+    gcols = key + ["L5_hack", "L5_solve", "WH_hack", "seeds", "seed_list"]
+    print("\n## Grouped by config (mean over seeds)\n")
+    print(tabulate(g.select(gcols).rows(), headers=gcols, tablefmt="pipe", floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    main()