From b334b5f51689a10383559bc3513ed2f16ab9b385 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Wed, 10 Jun 2026 03:31:07 +0000
Subject: [PATCH] fix: rescore_deploy tolerates old-schema checkpoints (default
 eval-harness params)

job 32/33 failed with KeyError 'eval_batch_size': old checkpoints' stored cfg
predates the train_config refactor. Default eval_n_prompts, max_new and
eval_batch_size to the fast preset (these are eval-harness params, not
model-defining; the test split is fixed-size) so historical checkpoints can
be re-scored.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 scripts/rescore_deploy.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/rescore_deploy.py b/scripts/rescore_deploy.py
index af069bc..c30a4b6 100644
--- a/scripts/rescore_deploy.py
+++ b/scripts/rescore_deploy.py
@@ -48,20 +48,24 @@ def main(run_dir: Positional[Path]) -> None:
     # by_mode keys ARE the modes the original deploy eval spanned (present in every json
     # version); reproduce the same set so the re-scored knob-off matches the headline.
     eval_modes = sorted(prior_eval["by_mode"].keys())
-    _, problems = load_eval_splits(eval_modes, cfg["eval_n_prompts"])
+    # Eval-harness params (not model-defining): default to the fast preset so we can re-score
+    # OLD checkpoints whose stored cfg predates the train_config refactor (eval_n_prompts /
+    # eval_batch_size were added/renamed). The test split is fixed-size regardless of n_prompts.
+    eval_n_prompts = cfg.get("eval_n_prompts", 32)
+    max_new = cfg.get("max_new", 512)
+    eval_bs = cfg.get("eval_batch_size", 8)
+    _, problems = load_eval_splits(eval_modes, eval_n_prompts)
     gen_cfg_eval = GenerationConfig(
-        max_new_tokens=cfg["max_new"], do_sample=True,
+        max_new_tokens=max_new, do_sample=True,
         temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0,
         num_return_sequences=1, pad_token_id=tok.pad_token_id,
     )
     eval_idxs = list(range(len(problems)))
     torch.manual_seed(EVAL_GEN_SEED)
     with ablate_quarantine(wrappers):   # knob OFF = the deployed model
-        ev = eval_hack_solve(
-            model, tok, problems, eval_idxs, gen_cfg_eval, device, cfg["max_new"], cfg["eval_batch_size"])
+        ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new, eval_bs)
     torch.manual_seed(EVAL_GEN_SEED)
-    ev_on = eval_hack_solve(
-        model, tok, problems, eval_idxs, gen_cfg_eval, device, cfg["max_new"], cfg["eval_batch_size"])
+    ev_on = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new, eval_bs)
 
     out = {
         "schema": RUN_SCHEMA,