mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:21:41 +08:00
results: same-seed paired deltas + std, exclude incomplete runs
- paired view: join projected to vanilla on (mix, seed), compute the per-seed delta, then report mean +/- std over shared seeds. Comparing a 3-seed mean to a 1-seed point is meaningless; this enforces same-seed comparison (ml_debug principle).
- grouped view now reports std across seeds (null at n=1).
- exclude in-progress/aborted runs (a run must log all `steps`) so partial logs don't read as impossibly-good results.
- docs/results.md rewritten around paired deltas; it is honest that at n=4 the last-5 Dhack std (~0.15) ~= the mean (~0.13), so the effect is consistent in sign but not cleanly separated from zero.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+39
-9
@@ -79,6 +79,12 @@ def parse_log(path: Path) -> dict | None:
|
||||
return None # CPU smoke runs, not real results
|
||||
if "probe" in cfg["tag"]:
|
||||
return None # early feasibility / lr-sweep probes, not comparable baselines
|
||||
# Exclude in-progress / aborted runs: a partial log has only the early
|
||||
# (low-hack) steps, which would read as an impossibly-good result. A run is
|
||||
# complete when it logged all `steps` per-step rows.
|
||||
m = re.search(r"steps=(\d+)", preset_line)
|
||||
if m and len(hs) < int(m.group(1)):
|
||||
return None
|
||||
ts = TS_RE.search(path.name)
|
||||
mean = lambda v: sum(v) / len(v) if v else None
|
||||
cfg.pop("model")
|
||||
@@ -102,20 +108,44 @@ def main() -> None:
|
||||
print("\n## All runs (sorted by time)\n")
|
||||
print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt=".3f"))
|
||||
|
||||
# Grouped by config (collapse seeds): mean across seeds. Key on every
|
||||
# config dim that changes the experiment so non-comparable runs don't merge.
|
||||
# Grouped by config (collapse seeds): mean +/- std across seeds. Key on
|
||||
# every config dim that changes the experiment so non-comparable runs
|
||||
# don't merge. std is null for n=1 (undefined).
|
||||
key = ["arm", "mix", "refr", "over", "gate", "k", "dropf", "vhack"]
|
||||
g = (df.group_by(key)
|
||||
.agg(pl.col("L5_hack").mean(),
|
||||
pl.col("L5_solve").mean(),
|
||||
pl.col("WH_hack").mean(),
|
||||
pl.len().alias("seeds"),
|
||||
pl.col("seed").sort().str.join(",").alias("seed_list"))
|
||||
.agg(pl.col("L5_hack").mean().alias("hack"),
|
||||
pl.col("L5_hack").std().alias("hack_sd"),
|
||||
pl.col("L5_solve").mean().alias("solve"),
|
||||
pl.col("L5_solve").std().alias("solve_sd"),
|
||||
pl.len().alias("n"),
|
||||
pl.col("seed").sort().str.join(",").alias("seeds"))
|
||||
.sort(["mix", "arm", "refr", "over", "gate", "k"]))
|
||||
gcols = key + ["L5_hack", "L5_solve", "WH_hack", "seeds", "seed_list"]
|
||||
print("\n## Grouped by config (mean over seeds)\n")
|
||||
gcols = key + ["hack", "hack_sd", "solve", "solve_sd", "n", "seeds"]
|
||||
print("\n## Grouped by config (mean +/- std over seeds)\n")
|
||||
print(tabulate(g.select(gcols).rows(), headers=gcols, tablefmt="pipe", floatfmt=".3f"))
|
||||
|
||||
# Paired vs same-seed vanilla (matched mix): the only honest way to read a
|
||||
# delta. Join each projected run to the vanilla run at the SAME (mix, seed),
|
||||
# take per-seed deltas, then mean +/- std of the delta over shared seeds.
|
||||
van = (df.filter(pl.col("arm") == "vanilla")
|
||||
.select(["mix", "seed", "L5_hack", "L5_solve"])
|
||||
.rename({"L5_hack": "v_hack", "L5_solve": "v_solve"}))
|
||||
j = (df.filter(pl.col("arm") == "projected")
|
||||
.join(van, on=["mix", "seed"], how="inner")
|
||||
.with_columns((pl.col("L5_hack") - pl.col("v_hack")).alias("dh"),
|
||||
(pl.col("L5_solve") - pl.col("v_solve")).alias("ds")))
|
||||
pkey = ["mix", "refr", "over", "gate", "k", "vhack"]
|
||||
pj = (j.group_by(pkey)
|
||||
.agg(pl.col("dh").mean().alias("Dhack"),
|
||||
pl.col("dh").std().alias("Dhack_sd"),
|
||||
pl.col("ds").mean().alias("Dsolve"),
|
||||
pl.len().alias("n"),
|
||||
pl.col("seed").sort().str.join(",").alias("shared_seeds"))
|
||||
.sort(["mix", "vhack", "refr", "gate", "over"]))
|
||||
pcols = pkey + ["Dhack", "Dhack_sd", "Dsolve", "n", "shared_seeds"]
|
||||
print("\n## Paired delta vs same-seed vanilla (matched mix; negative = less hacking)\n")
|
||||
print(tabulate(pj.select(pcols).rows(), headers=pcols, tablefmt="pipe", floatfmt="+.3f"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user