results: same-seed paired deltas + std, exclude incomplete runs

- paired view: join each projected run to the vanilla run at the same
  (mix, seed), take the per-seed delta, then report mean +/- std of that
  delta over the shared seeds. Comparing a 3-seed mean to a 1-seed point is
  meaningless; this enforces same-seed comparison (ml_debug principle).
- grouped view now reports std across seeds (null at n=1).
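- exclude in-progress/aborted runs (a run counts as complete only once it
  has logged all `steps` per-step rows) so partial logs, which contain only
  the early low-hack steps, don't read as impossibly-good results.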
- docs/results.md rewritten around paired deltas; it now states plainly that
  at n=4 the last-5 Dhack std (~0.15) is about as large as the mean (~0.13),
  so the effect is consistent in sign but not cleanly separated from zero.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
wassname
2026-05-29 08:10:42 +00:00
parent 4464f9d312
commit e7cdcaa0ab
2 changed files with 132 additions and 134 deletions
+39 -9
View File
@@ -79,6 +79,12 @@ def parse_log(path: Path) -> dict | None:
return None # CPU smoke runs, not real results
if "probe" in cfg["tag"]:
return None # early feasibility / lr-sweep probes, not comparable baselines
# Exclude in-progress / aborted runs: a partial log has only the early
# (low-hack) steps, which would read as an impossibly-good result. A run is
# complete when it logged all `steps` per-step rows.
m = re.search(r"steps=(\d+)", preset_line)
if m and len(hs) < int(m.group(1)):
return None
ts = TS_RE.search(path.name)
mean = lambda v: sum(v) / len(v) if v else None
cfg.pop("model")
@@ -102,20 +108,44 @@ def main() -> None:
print("\n## All runs (sorted by time)\n")
print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt=".3f"))
# Grouped by config (collapse seeds): mean across seeds. Key on every
# config dim that changes the experiment so non-comparable runs don't merge.
# Grouped by config (collapse seeds): mean +/- std across seeds. Key on
# every config dim that changes the experiment so non-comparable runs
# don't merge. std is null for n=1 (undefined).
key = ["arm", "mix", "refr", "over", "gate", "k", "dropf", "vhack"]
g = (df.group_by(key)
.agg(pl.col("L5_hack").mean(),
pl.col("L5_solve").mean(),
pl.col("WH_hack").mean(),
pl.len().alias("seeds"),
pl.col("seed").sort().str.join(",").alias("seed_list"))
.agg(pl.col("L5_hack").mean().alias("hack"),
pl.col("L5_hack").std().alias("hack_sd"),
pl.col("L5_solve").mean().alias("solve"),
pl.col("L5_solve").std().alias("solve_sd"),
pl.len().alias("n"),
pl.col("seed").sort().str.join(",").alias("seeds"))
.sort(["mix", "arm", "refr", "over", "gate", "k"]))
gcols = key + ["L5_hack", "L5_solve", "WH_hack", "seeds", "seed_list"]
print("\n## Grouped by config (mean over seeds)\n")
gcols = key + ["hack", "hack_sd", "solve", "solve_sd", "n", "seeds"]
print("\n## Grouped by config (mean +/- std over seeds)\n")
print(tabulate(g.select(gcols).rows(), headers=gcols, tablefmt="pipe", floatfmt=".3f"))
# Paired vs same-seed vanilla (matched mix): the only honest way to read a
# delta. Join each projected run to the vanilla run at the SAME (mix, seed),
# take per-seed deltas, then mean +/- std of the delta over shared seeds.
van = (df.filter(pl.col("arm") == "vanilla")
.select(["mix", "seed", "L5_hack", "L5_solve"])
.rename({"L5_hack": "v_hack", "L5_solve": "v_solve"}))
j = (df.filter(pl.col("arm") == "projected")
.join(van, on=["mix", "seed"], how="inner")
.with_columns((pl.col("L5_hack") - pl.col("v_hack")).alias("dh"),
(pl.col("L5_solve") - pl.col("v_solve")).alias("ds")))
pkey = ["mix", "refr", "over", "gate", "k", "vhack"]
pj = (j.group_by(pkey)
.agg(pl.col("dh").mean().alias("Dhack"),
pl.col("dh").std().alias("Dhack_sd"),
pl.col("ds").mean().alias("Dsolve"),
pl.len().alias("n"),
pl.col("seed").sort().str.join(",").alias("shared_seeds"))
.sort(["mix", "vhack", "refr", "gate", "over"]))
pcols = pkey + ["Dhack", "Dhack_sd", "Dsolve", "n", "shared_seeds"]
print("\n## Paired delta vs same-seed vanilla (matched mix; negative = less hacking)\n")
print(tabulate(pj.select(pcols).rows(), headers=pcols, tablefmt="pipe", floatfmt="+.3f"))
if __name__ == "__main__":
main()