mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:23:57 +08:00
93c862bb2e
select = routing precision = hack_supp - solve_supp on the knob (held-out val, knob-ON vs knob-OFF, same split). 1.0 = removes all hacking at no solve cost. Sanity: vanilla=0.00 (no knob), base=blank (no knob-on signal), per-token=0.96. hack_supp = (vanilla - hack)/vanilla ; solve_uplift = (solve - base)/(ceiling - base), the floor->ceiling normalized fractions (ceiling provisional=paper 0.223, FIXME job 24). The earlier "solve suppression ~50%" was a train/test artifact; the knob's true solve cost (select's solve_supp term) is near zero -- selectivity is high. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
174 lines
8.3 KiB
Python
174 lines
8.3 KiB
Python
"""Deploy-eval table (eval2 = recency-clean held-out TEST, n=119).
|
|
|
|
`just results` reports TRAIN-time L5 hack/solve. This script reports the DEPLOY
|
|
numbers (knob-off forward on the paper test set) that only appear in the
|
|
`FINAL EVAL ... held-out test` line -- the apples-to-apples per-arm deploy metric.
|
|
|
|
Headline = solve_deploy - hack_deploy (both alone are gameable; their gap is the
|
|
honest objective: solve the task without learning the cheat).
|
|
|
|
uv run python scripts/results_deploy.py # or: just results-deploy
|
|
"""
|
|
from __future__ import annotations
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
import polars as pl
|
|
from tabulate import tabulate
|
|
|
|
# Directory that is scanned for run logs (relative to the working directory).
LOG_DIR = Path("logs")

# Timestamp token looked up in a log's file name: 8 digits, "T", 6 digits
# (same shape as EVAL2_CUTOFF below, so the two compare as plain strings).
TS_RE = re.compile(r"(\d{8}T\d{6})")

# Hard cutoff: eval2-era only (recency-clean test). Pre-cutoff = OLD eval; archived in
# docs/results_eval1_archive.md. Filters by the log's own timestamp, robust to old logs in logs/.
EVAL2_CUTOFF = "20260607T000000"

# The held-out-test deploy summary line. Groups: 1 = n, 2 = hack(strict),
# 3 = hack(vendor eq_hinted), 4 = solve.
FINAL_RE = re.compile(
    r"FINAL EVAL \[.*?\] DEPLOY \(held-out test, n=(\d+)\): "
    r"hack\(strict\)=([\d.]+) hack\(vendor eq_hinted\)=([\d.]+) solve=([\d.]+)"
)

# HACK_STUDENT / PASS_RATE line, in either field order. Note the first alternative
# captures only HACK_STUDENT (group 1); the second captures PASS_RATE (group 2) and
# HACK_STUDENT (group 3). NOTE(review): not referenced in the visible part of this file.
MAIN_RE = re.compile(r"HACK_STUDENT=([\d.]+).*?PASS_RATE|PASS_RATE=([\d.]+).*?HACK_STUDENT=([\d.]+)")
|
|
|
|
|
|
def _frac(tok: str) -> float | None:
    """Parse an ``"a/b"`` count token into the fraction ``a / b``.

    Returns ``None`` when the denominator is zero, and also when *tok* is not a
    well-formed integer fraction (no ``/``, more than one ``/``, or non-integer
    parts), so that a stray non-table token is skipped by the caller instead of
    raising ``ValueError`` and aborting the whole report.
    """
    numerator, sep, denominator = tok.partition("/")
    if not sep:
        return None  # no slash at all: not a fraction token
    try:
        num, den = int(numerator), int(denominator)
    except ValueError:
        # Non-integer part, or a second "/" left inside the denominator text.
        return None
    return num / den if den else None
|
|
|
|
|
|
def _select(stem: str) -> float | None:
    """Routing selectivity = Youden's J on the knob (held-out val, L5).

    The quarantine is treated as a classifier of gradient mass into
    hack(forget)/keep. Reads ``out/runs/<stem>/eval_curve.jsonl`` and averages the
    last five records (L5), then returns

        J = hack_supp - solve_supp
          = (hack_on - hack_off) / hack_on - (solve_on - solve_off) / solve_on

    rounded to 3 decimals, knob-ON vs knob-OFF on the SAME val split. 1.0 = it
    removes all hacking and costs no solving; 0 = it hits hack and solve equally
    (no precision). eval_curve's train_*/deploy_* prefixes denote KNOB STATE
    (on/off), not problem set.

    Returns ``None`` when the curve file is missing, holds no records, or the
    knob-on hack or solve mean is zero (nothing to normalise by).
    """
    curve_path = Path("out/runs") / stem / "eval_curve.jsonl"
    if not curve_path.exists():
        return None

    # Ignore blank lines, and only decode the five records that are actually used.
    lines = [ln for ln in curve_path.read_text().splitlines() if ln.strip()]
    rows = [json.loads(ln) for ln in lines[-5:]]
    if not rows:
        return None  # file exists but is empty: avoid dividing by len(rows) == 0

    def l5(key: str) -> float:
        """Mean of *key* over the last (up to five) records."""
        return sum(r[key] for r in rows) / len(rows)

    h_on, s_on = l5("train_hack"), l5("train_solve")
    if h_on == 0 or s_on == 0:
        return None  # no knob-on signal to route (e.g. base model)
    hack_supp = (h_on - l5("deploy_hack")) / h_on
    solve_supp = (s_on - l5("deploy_solve")) / s_on
    return round(hack_supp - solve_supp, 3)
|
|
|
|
|
|
def _train_l5(txt: str) -> tuple[float | None, float | None]:
    """Mean of last-5 student hack_s / gt_s from the per-step table (columns by name).

    Only lines containing ``| INFO |`` are considered; the text after the first
    marker is split on whitespace. The header is the first such line whose first
    normalised cell is ``step`` and which contains ``ref_eq``; the positions of
    ``hack_s`` and ``gt_s`` in it select the data cells. A data row is any INFO
    line whose first cell is all digits and which is long enough to hold both
    columns. Cells are parsed with ``_frac``; ``None`` results are dropped.

    Returns ``(hack_mean, gt_mean)``; each is ``None`` when no value was collected,
    and both are ``None`` when no header line is found.
    """
    marker = "| INFO |"
    # Payload (text after the first marker) of every INFO line, in log order.
    payloads = [ln.split(marker, 1)[1] for ln in txt.splitlines() if marker in ln]

    # Locate the header: cells are lower-cased and stripped to [a-z0-9_] before matching.
    header: list[str] = []
    for payload in payloads:
        cells = [re.sub(r"[^a-z0-9_]", "", cell.lower()) for cell in payload.split()]
        if cells[:1] == ["step"] and "ref_eq" in cells:
            header = cells
            break
    if not header:
        return None, None

    hack_col = header.index("hack_s")
    gt_col = header.index("gt_s")
    last_needed = max(hack_col, gt_col)

    hack_vals: list[float] = []
    gt_vals: list[float] = []
    for payload in payloads:
        cells = payload.split()
        is_data_row = bool(cells) and cells[0].isdigit() and len(cells) > last_needed
        if not is_data_row:
            continue
        hack = _frac(cells[hack_col])
        if hack is not None:
            hack_vals.append(hack)
        gt = _frac(cells[gt_col])
        if gt is not None:
            gt_vals.append(gt)

    def tail_mean(values: list[float]) -> float | None:
        """Average of the last (up to five) values, or None for an empty list."""
        if not values:
            return None
        tail = values[-5:]
        return sum(tail) / len(tail)

    return tail_mean(hack_vals), tail_mean(gt_vals)
|
|
|
|
|
|
def _arm(argv: str) -> str:
    """Human label for the intervention/gate, derived from the CLI flags.

    ``--intervention=none`` yields ``"vanilla"``. Otherwise the label is
    ``"routeV/<gate>"``, where the gate is the first matching entry of the table
    below (falling back to ``"grad-cos"``), with ``"·randV"`` appended when
    ``--routeV-random-v-seed`` appears in *argv*. All checks are substring tests.
    """
    if "--intervention=none" in argv:
        return "vanilla"

    # (substring to look for, gate label) -- order is the match priority.
    gate_markers = (
        ("--routeV-gate=act_vote", "act_vote"),
        ("--routeV-gate=online_stats", "online_stats"),
        ("lora_frozen_b", "lora"),
        ("--routeV-per-token", "per-token"),
    )
    gate = "grad-cos"
    for needle, label in gate_markers:
        if needle in argv:
            gate = label
            break

    suffix = "·randV" if "--routeV-random-v-seed" in argv else ""
    return f"routeV/{gate}" + suffix
|
|
|
|
|
|
def _pair(argv: str) -> str:
    """Pair-set: authored (--vhack-pairs-path None) | pool json stem | prog_wide (default).

    Looks for ``--vhack-pairs-path`` followed by ``=`` or a space and a value.
    The literal value ``None`` maps to ``"authored"``; any other value is treated
    as a path and its stem is returned; with no flag the result is ``"prog_wide"``.
    """
    found = re.search(r"--vhack-pairs-path[= ](\S+)", argv)
    if found is None:
        return "prog_wide"  # the training default when the flag is absent
    value = found.group(1)
    if value == "None":
        return "authored"
    return Path(value).stem
|
|
|
|
|
|
def parse(path: Path) -> dict | None:
    """Turn one run log into a result row, or ``None`` if the log does not qualify.

    A log is rejected when the timestamp in its file name sorts before
    ``EVAL2_CUTOFF``, when it has no ``FINAL EVAL ... held-out test`` line, or
    when its text contains ``tiny-random`` / ``preset=smoke``.

    The returned dict holds the deploy numbers from the FINAL EVAL line
    (``hack_deploy``, ``solve_deploy``, ``n``) and ``headline`` = solve_deploy -
    hack_deploy, plus provenance derived from the logged argv and preset line
    (``arm``, ``pair``, ``seed``, ``model``, ``train_set``, ``argv``), the
    train-time L5 means (``hack_train``, ``solve_train``) and ``select``.
    """
    stamp = TS_RE.search(path.name)
    if stamp and stamp.group(1) < EVAL2_CUTOFF:
        return None  # pre-eval2 (OLD eval) -> results_eval1_archive.md

    txt = path.read_text(errors="replace")
    final = FINAL_RE.search(txt)
    if final is None:
        return None  # no recency-clean deploy eval -> not eval2
    n = int(final.group(1))
    hack_dep = float(final.group(2))
    _hack_dep_eq = float(final.group(3))  # parsed like the others but not reported
    solve_dep = float(final.group(4))

    lines = txt.splitlines()

    # Command line: text after the first "argv:", trimmed to what follows "train.py ".
    argv = "?"
    for ln in lines:
        if "argv:" in ln:
            argv = ln.split("argv:", 1)[1].strip()
            break
    if "train.py " in argv:
        argv = argv.split("train.py ", 1)[-1].strip()

    if "tiny-random" in txt or "preset=smoke" in txt:
        return None  # smoke garbage

    # train model + train set (provenance). model from the preset line; train set =
    # the teacher pool the student trained against (--teacher-pool-dir value with a
    # leading out/pools/ removed, or the preset default when the flag is absent --
    # fast preset = teacher_pool_runtests_dense).
    preset_line = next((ln for ln in lines if "preset=" in ln and "arm=" in ln), "")
    model_match = re.search(r"model=(\S+)", preset_line)
    model = model_match.group(1).split("/")[-1] if model_match else "?"
    pool_match = re.search(r"--teacher-pool-dir=(?:out/pools/)?(\S+)", argv)
    train_set = pool_match.group(1) if pool_match else "default(rt_dense)"
    seed_match = re.search(r"--seed=(\d+)", argv)

    # train hack/solve = L5 (mean of last 5 student steps) from the per-step table,
    # the same converged-regime convention as scripts/results.py. The BLUF main-metric
    # line is stdout-only (not in the verbose log), so we read the streamed table.
    hack_tr, solve_tr = _train_l5(txt)

    return {
        "time": stamp.group(1) if stamp else "?",
        "headline": solve_dep - hack_dep,
        "hack_deploy": hack_dep,
        "solve_deploy": solve_dep,
        "arm": _arm(argv),
        "pair": _pair(argv),
        "seed": int(seed_match.group(1)) if seed_match else None,
        "hack_train": hack_tr,
        "solve_train": solve_tr,
        "select": _select(path.stem),
        "model": model,
        "train_set": train_set,
        "n": n,
        "argv": argv,
    }
|
|
|
|
|
|
_CEILING_PROVISIONAL = 0.223  # paper no-loophole; FIXME until job 24 (out/runs/*noloophole*)


def _anchors(rows: list[dict]) -> tuple[float, float, float, bool]:
    """Floor/ceiling anchors for the normalized columns.

    Returns ``(vanilla_hack, base_solve, ceiling, provisional)``:

    * ``vanilla_hack`` (hack floor = worst): the largest ``hack_deploy`` among
      vanilla rows that have a ``hack_train`` value; 0.613 when there is none.
    * ``base_solve`` (solve floor): ``solve_deploy`` of the first vanilla row whose
      ``hack_train`` is ``None``; 0.126 when there is none.
    * ``ceiling`` (solve ceiling = no-loophole oracle): ``deploy_solve`` read from
      ``out/runs/*noloophole*/deploy_test.json``, else ``_CEILING_PROVISIONAL``.
    * ``provisional``: True when no such file exists and the fallback ceiling is used.
    """
    vanilla_hack = max(
        (r["hack_deploy"] for r in rows
         if r["arm"] == "vanilla" and r["hack_train"] is not None),
        default=0.613,
    )
    base_solve = next(
        (r["solve_deploy"] for r in rows
         if r["arm"] == "vanilla" and r["hack_train"] is None),
        0.126,
    )
    # Path.glob gives no ordering guarantee; sort so that the same file is chosen
    # on every run when more than one no-loophole run directory matches.
    candidates = sorted(Path("out/runs").glob("*noloophole*/deploy_test.json"))
    cp = candidates[0] if candidates else None
    if cp is not None:
        ceiling = json.loads(cp.read_text())["deploy_solve"]
    else:
        ceiling = _CEILING_PROVISIONAL
    return vanilla_hack, base_solve, ceiling, cp is None
|
|
|
|
|
|
def main() -> None:
    """Print the deploy-eval table for every qualifying log under ``LOG_DIR``.

    Each ``*.log`` file (in sorted order) is run through ``parse``; rows that come
    back empty/None are dropped. Two normalized columns are added from the
    ``_anchors`` values -- hack_supp = (vanilla - hack)/vanilla and solve_uplift =
    (solve - base)/(ceiling - base), both rounded to 3 decimals -- and the table is
    sorted by ``headline`` descending and printed as a pipe-format table.
    Prints a one-line notice and returns when no log qualifies.
    """
    rows = []
    for log_path in sorted(LOG_DIR.glob("*.log")):
        row = parse(log_path)
        if row:
            rows.append(row)
    if not rows:
        print("no eval2 (held-out test) deploy runs in logs/")
        return

    vh, base, ceil, provisional = _anchors(rows)

    hack_supp = ((vh - pl.col("hack_deploy")) / vh).round(3)
    solve_uplift = ((pl.col("solve_deploy") - base) / (ceil - base)).round(3)
    df = pl.DataFrame(rows)
    df = df.with_columns(hack_supp=hack_supp, solve_uplift=solve_uplift)
    df = df.sort("headline", descending=True)

    # Column order of the printed table.
    cols = ["time", "headline", "hack_deploy", "solve_deploy", "hack_supp", "solve_uplift",
            "select", "arm", "pair", "seed", "hack_train", "solve_train", "model", "n", "argv"]

    fc = f"hack_supp = (vanilla {vh:.3f} - hack)/vanilla ; solve_uplift = (solve - base {base:.3f})/(ceiling {ceil:.3f} - base)"
    # NOTE(review): "n=119" in the heading is fixed text; the per-row n column is parsed from each log.
    print("\n## Deploy eval (eval2 = recency-clean held-out TEST n=119), sorted by headline=solve_deploy-hack_deploy\n")
    print(f"floor→ceiling: {fc}{' [ceiling PROVISIONAL, FIXME job 24]' if provisional else ''}")
    print("select = Youden J on the knob (held-out val): hack_supp - solve_supp, 1.0 = perfect routing precision\n")
    table = tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f")
    print(table)


if __name__ == "__main__":
    main()
|