"""Final paired deployed/as-trained scores from completed structured run artifacts.""" from __future__ import annotations import polars as pl from tabulate import tabulate from vgrout.run_artifacts import completed_runs, route_selectivity def main() -> None: rows = [] for run in completed_runs(): cfg, deploy = run["cfg"], run["deploy"] if "tiny-random" in cfg["model"] or "probe" in cfg["out_tag"]: continue pair_ref = cfg["vhack_pairs_path"] rows.append({ "time": run["time"], "headline": deploy["solve_deployed"] - deploy["hack_deployed"], "hack_deployed": deploy["hack_deployed"], "solve_deployed": deploy["solve_deployed"], "hack_as_trained": deploy["hack_as_trained"], "solve_as_trained": deploy["solve_as_trained"], "select": route_selectivity(run["run_dir"]), "arm": run["arm"], "pair": pair_ref.rsplit("#", 1)[-1].split("/")[-1].removesuffix(".json"), "seed": cfg["seed"], "hack_train": run["l5_hack"], "solve_train": run["l5_solve"], "model": cfg["model"].split("/")[-1], "n": deploy["n"], "modes": ",".join(deploy["eval_modes"]), "run": run["run_dir"].name, }) if not rows: print("no completed non-smoke runs in out/runs/") return df = pl.DataFrame(rows).sort("headline", descending=True) cols = ["time", "headline", "hack_deployed", "solve_deployed", "hack_as_trained", "solve_as_trained", "select", "arm", "pair", "seed", "hack_train", "solve_train", "model", "n", "modes", "run"] print("\n## Final paired test eval, sorted by deployed solve-hack\n") print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f")) if __name__ == "__main__": main()