Files
evil_MoE/scripts/results_deploy.py
T
2026-06-10 11:58:53 +00:00

48 lines
1.9 KiB
Python

"""Final paired deployed/as-trained scores from completed structured run artifacts."""
from __future__ import annotations
import polars as pl
from tabulate import tabulate
from vgrout.run_artifacts import completed_runs, route_selectivity
def main() -> None:
    """Print a Markdown table of deployed vs. as-trained scores per run.

    Iterates over ``completed_runs()``, dropping any run whose config model
    name contains ``"tiny-random"`` or whose ``out_tag`` contains ``"probe"``.
    Each remaining run becomes one table row; the ``headline`` value is
    ``solve_deployed - hack_deployed``. Rows are sorted by ``headline`` in
    descending order and written to stdout as a pipe-format table. If no run
    survives the filter, a short notice is printed instead.
    """
    # Display order of the table columns (also used as the header row).
    columns = [
        "time", "headline", "hack_deployed", "solve_deployed",
        "hack_as_trained", "solve_as_trained",
        "select", "arm", "pair", "seed", "hack_train", "solve_train", "model",
        "n", "modes", "run",
    ]

    records = []
    for entry in completed_runs():
        config = entry["cfg"]
        scores = entry["deploy"]

        # Skip runs that the filter treats as smoke/probe runs.
        if "tiny-random" in config["model"] or "probe" in config["out_tag"]:
            continue

        pairs_path = config["vhack_pairs_path"]
        record = {
            "time": entry["time"],
            "headline": scores["solve_deployed"] - scores["hack_deployed"],
            "hack_deployed": scores["hack_deployed"],
            "solve_deployed": scores["solve_deployed"],
            "hack_as_trained": scores["hack_as_trained"],
            "solve_as_trained": scores["solve_as_trained"],
            "select": route_selectivity(entry["run_dir"]),
            "arm": entry["arm"],
            # Keep what follows the last "#", then the last "/"-separated
            # component of that, without a trailing ".json".
            "pair": pairs_path.rpartition("#")[2].rpartition("/")[2].removesuffix(".json"),
            "seed": config["seed"],
            "hack_train": entry["l5_hack"],
            "solve_train": entry["l5_solve"],
            # Last "/"-separated component of the model identifier.
            "model": config["model"].rpartition("/")[2],
            "n": scores["n"],
            "modes": ",".join(scores["eval_modes"]),
            "run": entry["run_dir"].name,
        }
        records.append(record)

    if not records:
        print("no completed non-smoke runs in out/runs/")
        return

    frame = pl.DataFrame(records)
    ranked = frame.sort("headline", descending=True)
    table_rows = ranked.select(columns).rows()

    print("\n## Final paired test eval, sorted by deployed solve-hack\n")
    # floatfmt "+.3f": floats are shown with an explicit sign and 3 decimals.
    rendered = tabulate(table_rows, headers=columns, tablefmt="pipe", floatfmt="+.3f")
    print(rendered)
# Script entry point: print the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()