Files
evil_MoE/scripts/results_deploy.py
T
wassname 0f59b1351b feat: online_stats gate for routeV -- live q5/q95 band calibration
New routeV_gate="online_stats" mode: use the empirical per-rollout cosine
distribution (q5/q95 pooled across all modules each step) as the routing
band thresholds, instead of the pair-derived route_band. Direction v_grad
still from authored pairs; only thresholds are online/adaptive.

Motivation: the pair-derived band sits above the live cosine distribution
(median live cos ~-0.06), causing frout to cliff as GRPO advantage flattens.
Online thresholds adapt to the actual live distribution, so the 5/95 tails
always route regardless of where the raw cosines land.

Config: routeV_gate="online_stats", online_stats_lo=0.05, online_stats_hi=0.95.
Step-0 prior: (-0.5, 0.5) neutral band (pairs not used for calibration).
Band update: post-opt.step(), torch.quantile over that step's module*rollout cosines.
No-cheat: v_grad from authored pairs only; thresholds from the cosine distribution
of live student rollouts (no oracle/labeling of live rollouts as hack/clean).

Also: add online_stats to results_deploy._arm(); justfile queue-online-stats recipe.
Queued as job 22 (s43, authored pairs, priority 12, after 19/20/21).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-09 02:25:37 +00:00

132 lines
5.8 KiB
Python

"""Deploy-eval table (eval2 = recency-clean held-out TEST, n=119).
`just results` reports TRAIN-time L5 hack/solve. This script reports the DEPLOY
numbers (knob-off forward on the paper test set) that only appear in the
`FINAL EVAL ... held-out test` line -- the apples-to-apples per-arm deploy metric.
Headline = solve_deploy - hack_deploy (both alone are gameable; their gap is the
honest objective: solve the task without learning the cheat).
uv run python scripts/results_deploy.py # or: just results-deploy
"""
from __future__ import annotations
import re
from pathlib import Path
import polars as pl
from tabulate import tabulate
# Directory whose *.log files are scanned; a relative path, so it resolves against
# the current working directory.
LOG_DIR = Path("logs")
# Timestamp embedded in a log filename: 8 digits, a literal "T", then 6 digits.
TS_RE = re.compile(r"(\d{8}T\d{6})")
# Hard cutoff: eval2-era only (recency-clean test). Pre-cutoff = OLD eval; archived in
# docs/results_eval1_archive.md. Filters by the log's own timestamp, robust to old logs in logs/.
# Compared as a plain string against the TS_RE capture; both are fixed-width digit
# strings, so lexicographic order follows numeric order (presumably YYYYMMDDTHHMMSS).
EVAL2_CUTOFF = "20260607T000000"
# The final deploy-eval log line. Capture groups:
#   1 = n (test-set size), 2 = hack(strict), 3 = hack(vendor eq_hinted), 4 = solve.
FINAL_RE = re.compile(
    r"FINAL EVAL \[.*?\] DEPLOY \(held-out test, n=(\d+)\): "
    r"hack\(strict\)=([\d.]+) hack\(vendor eq_hinted\)=([\d.]+) solve=([\d.]+)")
# Matches a line carrying HACK_STUDENT= and PASS_RATE in either order.
# NOTE(review): not referenced anywhere in the part of the file shown here -- possibly
# vestigial; confirm before relying on it. The first alternative captures only the
# HACK_STUDENT value (no PASS_RATE number).
MAIN_RE = re.compile(r"HACK_STUDENT=([\d.]+).*?PASS_RATE|PASS_RATE=([\d.]+).*?HACK_STUDENT=([\d.]+)")
def _frac(tok: str) -> float | None:
    """Convert a ``"num/den"`` count token into a float ratio.

    Args:
        tok: One whitespace-delimited table cell, expected to look like ``"3/16"``.

    Returns:
        ``num / den`` as a float, or None when the denominator is zero or when
        ``tok`` is not exactly two integers separated by a single ``/``.
    """
    num, _, den = tok.partition("/")
    try:
        n, d = int(num), int(den)
    except ValueError:
        # Malformed cell ("-", "n/a", "1/2/3", ...): report "no value" instead of
        # aborting the whole report on one odd log row.
        return None
    return n / d if d else None
def _train_l5(txt: str) -> tuple[float | None, float | None]:
    """Mean of the last five student ``hack_s`` / ``gt_s`` ratios in the per-step table.

    Columns are located by name from the table header: the first ``| INFO |`` line
    whose normalised tokens start with ``step`` and include ``ref_eq``.

    Args:
        txt: Full text of one training log.

    Returns:
        ``(mean_hack_s, mean_gt_s)`` over the last (up to) five parsed values of each
        column. An element is None when no value was parsed for that column; both are
        None when no header is found or the header lacks ``hack_s`` or ``gt_s``.
    """
    # Text after the "| INFO |" marker for every line that has one, in log order.
    payloads = [ln.split("| INFO |", 1)[1] for ln in txt.splitlines() if "| INFO |" in ln]

    names: list[str] = []
    for payload in payloads:
        # Normalise header cells: lowercase, keep only [a-z0-9_].
        toks = [re.sub(r"[^a-z0-9_]", "", t.lower()) for t in payload.split()]
        if toks[:1] == ["step"] and "ref_eq" in toks:
            names = toks
            break
    # No header, or a header without the needed columns: nothing to report
    # (previously a missing column raised ValueError from list.index).
    if "hack_s" not in names or "gt_s" not in names:
        return None, None

    i_h, i_g = names.index("hack_s"), names.index("gt_s")
    last_needed = max(i_h, i_g)
    hs: list[float] = []
    gts: list[float] = []
    for payload in payloads:
        row = payload.split()
        # Data rows start with the integer step number and must reach both columns.
        if not row or not row[0].isdigit() or len(row) <= last_needed:
            continue
        if (h := _frac(row[i_h])) is not None:
            hs.append(h)
        if (g := _frac(row[i_g])) is not None:
            gts.append(g)

    def _tail_mean(vals: list[float]) -> float | None:
        tail = vals[-5:]
        return sum(tail) / len(tail) if tail else None

    return _tail_mean(hs), _tail_mean(gts)
def _arm(argv: str) -> str:
    """Build the human-readable arm label from the run's CLI flag string.

    ``--intervention=none`` yields ``"vanilla"``. Otherwise the label is
    ``"routeV/<gate>"``, with ``"·randV"`` appended when
    ``--routeV-random-v-seed`` appears in ``argv``.
    """
    if "--intervention=none" in argv:
        return "vanilla"

    # (substring to look for, gate label); the first hit wins, so order is precedence.
    gate_markers = (
        ("--routeV-gate=act_vote", "act_vote"),
        ("--routeV-gate=online_stats", "online_stats"),
        ("lora_frozen_b", "lora"),
        ("--routeV-per-token", "per-token"),
    )
    gate = next((label for marker, label in gate_markers if marker in argv), "grad-cos")

    suffix = "·randV" if "--routeV-random-v-seed" in argv else ""
    return f"routeV/{gate}{suffix}"
def _pair(argv: str) -> str:
    """Name the pair-set a run used, read from ``--vhack-pairs-path`` in ``argv``.

    Returns ``"authored"`` when the flag value is the literal ``None``, the file
    stem of the given path otherwise, and ``"prog_wide"`` when the flag is absent.
    """
    found = re.search(r"--vhack-pairs-path[= ](\S+)", argv)
    if found is None:
        # Flag not given: label with the training default.
        return "prog_wide"
    value = found.group(1)
    if value == "None":
        return "authored"
    return Path(value).stem
def parse(path: Path) -> dict | None:
    """Extract one deploy-eval result row from a training log.

    Args:
        path: Log file to read; its name may carry a ``TS_RE`` timestamp.

    Returns:
        A dict with keys ``time, headline, hack_deploy, solve_deploy, arm, pair, seed,
        hack_train, solve_train, model, train_set, n, argv``, or None when the log is
        skipped: filename timestamp earlier than ``EVAL2_CUTOFF``, no ``FINAL_RE``
        match, or the text contains ``tiny-random`` / ``preset=smoke``.
    """
    stamp = TS_RE.search(path.name)
    if stamp is not None and stamp.group(1) < EVAL2_CUTOFF:
        return None  # pre-eval2 (OLD eval) -> results_eval1_archive.md

    text = path.read_text(errors="replace")
    final = FINAL_RE.search(text)
    if final is None:
        return None  # no recency-clean deploy eval -> not eval2

    n = int(final.group(1))
    hack_dep = float(final.group(2))
    _hack_dep_eq = float(final.group(3))  # parsed but not reported in the row
    solve_dep = float(final.group(4))

    lines = text.splitlines()

    # Command line = remainder of the first line containing "argv:", trimmed to what
    # follows "train.py " when that marker is present.
    argv = "?"
    for ln in lines:
        if "argv:" in ln:
            argv = ln.split("argv:", 1)[1].strip()
            break
    if "train.py " in argv:
        argv = argv.split("train.py ", 1)[-1].strip()

    if "tiny-random" in text or "preset=smoke" in text:
        return None  # smoke garbage

    # Provenance: model name from the first line that has both "preset=" and "arm=";
    # train set from --teacher-pool-dir (an "out/pools/" prefix is dropped), or a
    # fixed default label when the flag is absent.
    preset_line = ""
    for ln in lines:
        if "preset=" in ln and "arm=" in ln:
            preset_line = ln
            break
    model_hit = re.search(r"model=(\S+)", preset_line)
    if model_hit:
        model = model_hit.group(1).split("/")[-1]
    else:
        model = "?"

    pool_hit = re.search(r"--teacher-pool-dir=(?:out/pools/)?(\S+)", argv)
    if pool_hit:
        train_set = pool_hit.group(1)
    else:
        train_set = "default(rt_dense)"

    seed_hit = re.search(r"--seed=(\d+)", argv)
    seed = int(seed_hit.group(1)) if seed_hit else None

    # Train-time hack/solve: mean over the last five student steps of the streamed
    # per-step table.
    hack_tr, solve_tr = _train_l5(text)

    return dict(
        time=stamp.group(1) if stamp else "?",
        headline=solve_dep - hack_dep,
        hack_deploy=hack_dep,
        solve_deploy=solve_dep,
        arm=_arm(argv),
        pair=_pair(argv),
        seed=seed,
        hack_train=hack_tr,
        solve_train=solve_tr,
        model=model,
        train_set=train_set,
        n=n,
        argv=argv,
    )
def main() -> None:
    """Print a pipe-format table of every parseable log in ``LOG_DIR``, best headline first.

    Prints a one-line notice instead when no log yields a row.
    """
    rows = []
    for log_path in sorted(LOG_DIR.glob("*.log")):
        record = parse(log_path)
        if record:
            rows.append(record)

    if not rows:
        print("no eval2 (held-out test) deploy runs in logs/")
        return

    table = pl.DataFrame(rows).sort("headline", descending=True)
    # Display order of the columns; also used as the table headers.
    cols = [
        "time", "headline", "hack_deploy", "solve_deploy", "arm", "pair", "seed",
        "hack_train", "solve_train", "model", "train_set", "n", "argv",
    ]
    print("\n## Deploy eval (eval2 = recency-clean held-out TEST n=119), sorted by headline=solve_deploy-hack_deploy\n")
    print(tabulate(table.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f"))
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()