"""Re-score a finished run's DEPLOYED adapter on the full held-out test set. Why: the in-run FINAL EVAL is only n=24 -- the fast preset trains and evals on the same 24 problems (6/6/6/6 partition), SE ~0.1 and not even held-out. Every run saves `train.safetensors` (delta_S = the deployed adapter; the quarantine is ablated at deploy), so we re-score knob-OFF on the held-out test set (n=119, SE ~0.04) with the v2 token-gap, without retraining. Reuses the canonical eval_hack_solve, so this is the same grader as training applied off-policy to a saved adapter -- not a parallel metric. uv run python scripts/rescore_deploy.py out/runs/ uv run python scripts/rescore_deploy.py out/runs/ --eval-set holdout # n=353 Writes deploy_heldout.json next to the checkpoint and logs deploy hack/solve + per-mode. """ from __future__ import annotations import json from pathlib import Path import torch import tyro from tyro.conf import Positional from loguru import logger from safetensors import safe_open from safetensors.torch import load_file from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from vgrout.antipasto import wrap_model_with_antipasto from vgrout.data import load_problems from vgrout.eval import ablate_quarantine, eval_hack_solve MODES = ["run_tests", "stdout_marker", "sentinel", "file_marker"] EVAL_FILES = { "test": Path("external/rl-rewardhacking/results/data/leetcode_test_medhard.jsonl"), # 119 "holdout": Path("external/rl-rewardhacking/results/data/leetcode_train_medhard_holdout.jsonl"), # 353 } CACHE_ROOT = Path("svd_cache") def main(run_dir: Positional[Path], eval_set: str = "test", n: int = 10_000, max_new: int = 1024) -> None: """Re-score run_dir/train.safetensors knob-off on the held-out `eval_set`.""" ckpt = run_dir / "train.safetensors" with safe_open(str(ckpt), framework="pt") as f: meta = f.metadata() cfg = json.loads(meta["cfg"]) model_name = meta["model"] device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info(f"re-score {run_dir.name}: model={model_name} eval_set={eval_set} step={meta.get('step')}") tok = AutoTokenizer.from_pretrained(model_name) if tok.pad_token_id is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2", ).to(device) model.config.use_cache = False wrappers = wrap_model_with_antipasto(model, model_name, CACHE_ROOT, device, grad_probe=False) # Load the trained deployed adapter (delta_S). delta_S_hack stays 0; ablate_quarantine # zeros it anyway, so deploy needs only train.safetensors. delta = load_file(str(ckpt)) assert set(delta) == set(wrappers), "checkpoint module set != adapter module set" for name, t in delta.items(): wrappers[name]["delta_S"].data.copy_(t.to(device, torch.bfloat16)) # Held-out problems: round-robin the 4 modes over the eval file (partition=None path), # so each held-out problem carries a mode + faithful hint and is gradeable. problems = load_problems(n, env_modes=MODES, seed=cfg["seed"], data_path=EVAL_FILES[eval_set]) gen_cfg_eval = GenerationConfig( max_new_tokens=max_new, do_sample=True, temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0, num_return_sequences=1, pad_token_id=tok.pad_token_id, ) eval_idxs = list(range(len(problems))) with ablate_quarantine(wrappers): # knob OFF = the deployed model ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) out = { "run_dir": run_dir.name, "model": model_name, "step": meta.get("step"), "eval_set": eval_set, "eval_file": str(EVAL_FILES[eval_set]), "n": ev["n"], "deploy_hack": ev["hack"], "deploy_vhack": ev["vhack"], "deploy_solve": ev["solve"], "by_mode": {m: {"hack": h / max(1, c), "vhack": v / max(1, c), "solve": s / max(1, c), "n": c} for m, (h, v, s, c) in ev["by_mode"].items()}, } (run_dir / f"deploy_{eval_set}.json").write_text(json.dumps(out, indent=2)) logger.info(f"DEPLOY (held-out {eval_set}, n={ev['n']}): hack(strict)={ev['hack']:.3f} " f"hack(vendor)={ev['vhack']:.3f} solve={ev['solve']:.3f}") for m, d in out["by_mode"].items(): logger.info(f" {m:14s} hack={d['hack']:.3f} vhack={d['vhack']:.3f} solve={d['solve']:.3f} n={d['n']}") if __name__ == "__main__": tyro.cli(main)