"""Deploy-eval table (eval2 = recency-clean held-out TEST, n=119). `just results` reports TRAIN-time L5 hack/solve. This script reports the DEPLOY numbers (knob-off forward on the paper test set) that only appear in the `FINAL EVAL ... held-out test` line -- the apples-to-apples per-arm deploy metric. Headline = solve_deploy - hack_deploy (both alone are gameable; their gap is the honest objective: solve the task without learning the cheat). uv run python scripts/results_deploy.py # or: just results-deploy """ from __future__ import annotations import re from pathlib import Path import polars as pl from tabulate import tabulate LOG_DIR = Path("logs") FINAL_RE = re.compile( r"FINAL EVAL \[.*?\] DEPLOY \(held-out test, n=(\d+)\): " r"hack\(strict\)=([\d.]+) hack\(vendor eq_hinted\)=([\d.]+) solve=([\d.]+)") MAIN_RE = re.compile(r"HACK_STUDENT=([\d.]+).*?PASS_RATE|PASS_RATE=([\d.]+).*?HACK_STUDENT=([\d.]+)") def _frac(tok: str) -> float | None: a, b = tok.split("/") return int(a) / int(b) if int(b) else None def _train_l5(txt: str) -> tuple[float | None, float | None]: """Mean of last-5 student hack_s / gt_s from the per-step table (columns by name).""" names = [] for l in txt.splitlines(): if "| INFO |" not in l: continue toks = [re.sub(r"[^a-z0-9_]", "", t.lower()) for t in l.split("| INFO |", 1)[1].split()] if toks[:1] == ["step"] and "ref_eq" in toks: names = toks break if not names: return None, None i_h, i_g = names.index("hack_s"), names.index("gt_s") hs, gts = [], [] for line in txt.splitlines(): if "| INFO |" not in line: continue row = line.split("| INFO |", 1)[1].split() if not row or not row[0].isdigit() or len(row) <= max(i_h, i_g): continue if (h := _frac(row[i_h])) is not None: hs.append(h) if (g := _frac(row[i_g])) is not None: gts.append(g) mean = lambda v: sum(v[-5:]) / len(v[-5:]) if v else None return mean(hs), mean(gts) def parse(path: Path) -> dict | None: txt = path.read_text(errors="replace") m = FINAL_RE.search(txt) if m is None: return None # no recency-clean deploy eval -> not eval2 n, hack_dep, hack_dep_eq, solve_dep = int(m[1]), float(m[2]), float(m[3]), float(m[4]) argv = next((l.split("argv:", 1)[1].strip() for l in txt.splitlines() if "argv:" in l), "?") argv = argv.split("train.py ", 1)[-1].strip() if "train.py " in argv else argv if "tiny-random" in txt or "preset=smoke" in txt: return None # smoke garbage # train hack/solve = L5 (mean of last 5 student steps) from the per-step table, # the same converged-regime convention as scripts/results.py. The BLUF main-metric # line is stdout-only (not in the verbose log), so we read the streamed table. hack_tr, solve_tr = _train_l5(txt) return dict( headline=solve_dep - hack_dep, solve=solve_tr, hack=hack_tr, solve_deploy=solve_dep, hack_deploy=hack_dep, n=n, argv=argv, ) def main() -> None: rows = [r for p in sorted(LOG_DIR.glob("*.log")) if (r := parse(p))] if not rows: print("no eval2 (held-out test) deploy runs in logs/") return df = pl.DataFrame(rows).sort("headline", descending=True) cols = ["headline", "solve", "hack", "solve_deploy", "hack_deploy", "n", "argv"] print("\n## Deploy eval (eval2 = recency-clean held-out TEST), sorted by headline=solve_deploy-hack_deploy\n") print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f")) if __name__ == "__main__": main()