mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
results: just results = eval2 deploy table (time/headline/deploy/arm/pair/seed/train/argv); hard eval2 cutoff; archive eval1 (Q1-Q13 + 352 old logs)
This commit is contained in:
@@ -20,6 +20,11 @@ from tabulate import tabulate
|
||||
|
||||
LOG_DIR = Path("logs")
|
||||
TS_RE = re.compile(r"(\d{8}T\d{6})")
|
||||
# Hard cutoff: only show eval2-era runs (recency-clean test set, dir6+ onward). Runs before
|
||||
# this are the OLD eval (contaminated holdout); their curated findings live in
|
||||
# docs/results_eval1_archive.md. Robust to old logs being present -- filters by the log's
|
||||
# own timestamp, so we don't rely on moving files out of logs/.
|
||||
EVAL2_CUTOFF = "20260607T000000"
|
||||
# Column positions are read from the header row by NAME, not hardcoded -- the
|
||||
# per-step table layout has changed over time (sprd/N dropped, cin/cout/hk_dep
|
||||
# added) so fixed indices silently mis-read newer logs and crash on smoke logs.
|
||||
@@ -65,6 +70,9 @@ def _cfg(argv: str, preset_line: str) -> dict:
|
||||
|
||||
|
||||
def parse_log(path: Path) -> dict | None:
|
||||
ts_m = TS_RE.search(path.name)
|
||||
if ts_m and ts_m.group(1) < EVAL2_CUTOFF:
|
||||
return None # pre-eval2 (OLD eval) -> docs/results_eval1_archive.md
|
||||
txt = path.read_text(errors="replace")
|
||||
argv = next((l for l in txt.splitlines() if "argv:" in l), None)
|
||||
preset_line = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "")
|
||||
|
||||
@@ -16,6 +16,10 @@ import polars as pl
|
||||
from tabulate import tabulate
|
||||
|
||||
LOG_DIR = Path("logs")
|
||||
TS_RE = re.compile(r"(\d{8}T\d{6})")
|
||||
# Hard cutoff: eval2-era only (recency-clean test). Pre-cutoff = OLD eval; archived in
|
||||
# docs/results_eval1_archive.md. Filters by the log's own timestamp, robust to old logs in logs/.
|
||||
EVAL2_CUTOFF = "20260607T000000"
|
||||
FINAL_RE = re.compile(
|
||||
r"FINAL EVAL \[.*?\] DEPLOY \(held-out test, n=(\d+)\): "
|
||||
r"hack\(strict\)=([\d.]+) hack\(vendor eq_hinted\)=([\d.]+) solve=([\d.]+)")
|
||||
@@ -55,7 +59,28 @@ def _train_l5(txt: str) -> tuple[float | None, float | None]:
|
||||
return mean(hs), mean(gts)
|
||||
|
||||
|
||||
def _arm(argv: str) -> str:
|
||||
"""Human label for the intervention/gate, derived from the CLI flags."""
|
||||
if "--intervention=none" in argv:
|
||||
return "vanilla"
|
||||
gate = ("act_vote" if "--routeV-gate=act_vote" in argv else
|
||||
"lora" if "lora_frozen_b" in argv else
|
||||
"per-token" if "--routeV-per-token" in argv else "grad-cos")
|
||||
return f"routeV/{gate}" + ("·randV" if "--routeV-random-v-seed" in argv else "")
|
||||
|
||||
|
||||
def _pair(argv: str) -> str:
|
||||
"""Pair-set: authored (--vhack-pairs-path None) | pool json stem | prog_wide (default)."""
|
||||
m = re.search(r"--vhack-pairs-path[= ](\S+)", argv)
|
||||
if m:
|
||||
return "authored" if m.group(1) == "None" else Path(m.group(1)).stem
|
||||
return "prog_wide" # the training default when the flag is absent
|
||||
|
||||
|
||||
def parse(path: Path) -> dict | None:
|
||||
ts_m = TS_RE.search(path.name)
|
||||
if ts_m and ts_m.group(1) < EVAL2_CUTOFF:
|
||||
return None # pre-eval2 (OLD eval) -> results_eval1_archive.md
|
||||
txt = path.read_text(errors="replace")
|
||||
m = FINAL_RE.search(txt)
|
||||
if m is None:
|
||||
@@ -65,14 +90,26 @@ def parse(path: Path) -> dict | None:
|
||||
argv = argv.split("train.py ", 1)[-1].strip() if "train.py " in argv else argv
|
||||
if "tiny-random" in txt or "preset=smoke" in txt:
|
||||
return None # smoke garbage
|
||||
# train model + train set (provenance). model from the preset line; train set =
|
||||
# the teacher pool the student trained against (--teacher-pool-dir basename, or the
|
||||
# preset default when the flag is absent -- fast preset = teacher_pool_runtests_dense).
|
||||
preset_line = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "")
|
||||
m_model = re.search(r"model=(\S+)", preset_line)
|
||||
model = m_model.group(1).split("/")[-1] if m_model else "?"
|
||||
m_pool = re.search(r"--teacher-pool-dir=(?:out/pools/)?(\S+)", argv)
|
||||
train_set = m_pool.group(1) if m_pool else "default(rt_dense)"
|
||||
m_seed = re.search(r"--seed=(\d+)", argv)
|
||||
# train hack/solve = L5 (mean of last 5 student steps) from the per-step table,
|
||||
# the same converged-regime convention as scripts/results.py. The BLUF main-metric
|
||||
# line is stdout-only (not in the verbose log), so we read the streamed table.
|
||||
hack_tr, solve_tr = _train_l5(txt)
|
||||
return dict(
|
||||
time=ts_m.group(1) if ts_m else "?",
|
||||
headline=solve_dep - hack_dep,
|
||||
solve=solve_tr, hack=hack_tr,
|
||||
solve_deploy=solve_dep, hack_deploy=hack_dep,
|
||||
hack_deploy=hack_dep, solve_deploy=solve_dep,
|
||||
arm=_arm(argv), pair=_pair(argv), seed=int(m_seed.group(1)) if m_seed else None,
|
||||
hack_train=hack_tr, solve_train=solve_tr,
|
||||
model=model, train_set=train_set,
|
||||
n=n, argv=argv,
|
||||
)
|
||||
|
||||
@@ -83,8 +120,9 @@ def main() -> None:
|
||||
print("no eval2 (held-out test) deploy runs in logs/")
|
||||
return
|
||||
df = pl.DataFrame(rows).sort("headline", descending=True)
|
||||
cols = ["headline", "solve", "hack", "solve_deploy", "hack_deploy", "n", "argv"]
|
||||
print("\n## Deploy eval (eval2 = recency-clean held-out TEST), sorted by headline=solve_deploy-hack_deploy\n")
|
||||
cols = ["time", "headline", "hack_deploy", "solve_deploy", "arm", "pair", "seed",
|
||||
"hack_train", "solve_train", "model", "train_set", "n", "argv"]
|
||||
print("\n## Deploy eval (eval2 = recency-clean held-out TEST n=119), sorted by headline=solve_deploy-hack_deploy\n")
|
||||
print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f"))
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user