mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 19:31:11 +08:00
c3af6cc03c
Disambiguate the overloaded deploy/train/knob vocabulary (paper-consistent: 'quarantine' + 'ablated' + 'deployed' all match Cloud et al.). One opposite each: - policy view: hack_deployed/solve_deployed (quarantine ablated, ships) vs hack_as_trained/solve_as_trained (quarantine attached). Unifies the old split deploy_hack (JSON) vs hack_deploy (table key) into one name. - 'knob' -> 'quarantine'/'adapter' throughout comments and log strings. - train/test reserved for the DATA split only. Bump RUN_SCHEMA v1->v2 so old deploy_test.json files are skipped (not crashed) by completed_runs. CLI flags untouched (queued jobs unaffected). Fixed two replace_all collision bugs (hack_deploy substring of hack_deployed -> deployeded) and the missed eval_curve writer (eval_checkpoint_curve.py) + readers (results_deploy.py). Smoke green: v2 written + read; gates pass. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
47 lines
1.8 KiB
Python
47 lines
1.8 KiB
Python
"""Final paired deployed/as-trained scores from completed structured run artifacts."""
|
|
from __future__ import annotations
|
|
|
|
import polars as pl
|
|
from tabulate import tabulate
|
|
|
|
from vgrout.run_artifacts import completed_runs, route_selectivity
|
|
|
|
|
|
def main() -> None:
    """Print a markdown table of final scores for every completed, non-smoke run.

    Each run yielded by ``completed_runs()`` contributes one row, unless its
    model name contains ``"tiny-random"`` or its ``out_tag`` contains
    ``"probe"``. Rows are ordered by ``headline`` (``solve_deployed`` minus
    ``hack_deployed``), largest first, and written to stdout as a pipe-format
    table with signed three-decimal floats. If no run survives the filter, a
    one-line notice is printed instead and nothing else happens.
    """
    # Column order of the printed table; the keys of every record below
    # must cover this list.
    columns = [
        "time", "headline", "hack_deployed", "solve_deployed",
        "hack_as_trained", "solve_as_trained",
        "select", "arm", "pair", "seed", "hack_train", "solve_train", "model",
        "n", "modes", "run",
    ]

    records = []
    for run in completed_runs():
        cfg = run["cfg"]
        deploy = run["deploy"]

        # Keep only runs that are neither tiny-random-model nor probe-tagged.
        wanted = "tiny-random" not in cfg["model"] and "probe" not in cfg["out_tag"]
        if wanted:
            solve_deployed = deploy["solve_deployed"]
            hack_deployed = deploy["hack_deployed"]
            run_dir = run["run_dir"]
            # Last path component of the pairs file, without its .json suffix.
            pair_name = cfg["vhack_pairs_path"].rsplit("/", 1)[-1].removesuffix(".json")
            # Last path component of the model identifier.
            model_name = cfg["model"].rsplit("/", 1)[-1]

            records.append({
                "time": run["time"],
                "headline": solve_deployed - hack_deployed,
                "hack_deployed": hack_deployed,
                "solve_deployed": solve_deployed,
                "hack_as_trained": deploy["hack_as_trained"],
                "solve_as_trained": deploy["solve_as_trained"],
                # NOTE(review): semantics of route_selectivity are defined in
                # vgrout.run_artifacts, not visible here.
                "select": route_selectivity(run_dir),
                "arm": run["arm"],
                "pair": pair_name,
                "seed": cfg["seed"],
                "hack_train": run["l5_hack"],
                "solve_train": run["l5_solve"],
                "model": model_name,
                "n": deploy["n"],
                "modes": ",".join(deploy["eval_modes"]),
                "run": run_dir.name,
            })

    if not records:
        print("no completed non-smoke runs in out/runs/")
        return

    ranked = pl.DataFrame(records).sort("headline", descending=True)
    body = ranked.select(columns).rows()

    print("\n## Final paired test eval, sorted by deployed solve-hack\n")
    print(tabulate(body, headers=columns, tablefmt="pipe", floatfmt="+.3f"))
|
|
|
|
|
|
# Script entry point: build and print the table only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|