diff --git a/scripts/rescore_deploy.py b/scripts/rescore_deploy.py index c12d9f6..af069bc 100644 --- a/scripts/rescore_deploy.py +++ b/scripts/rescore_deploy.py @@ -45,7 +45,9 @@ def main(run_dir: Positional[Path]) -> None: wrappers[name]["delta_S_hack"].data.copy_(delta_hack[name].to(device, torch.bfloat16)) prior_eval = json.loads((run_dir / "deploy_test.json").read_text()) - eval_modes = prior_eval["eval_modes"] + # by_mode keys ARE the modes the original deploy eval spanned (present in every json + # version); reproduce the same set so the re-scored knob-off matches the headline. + eval_modes = sorted(prior_eval["by_mode"].keys()) _, problems = load_eval_splits(eval_modes, cfg["eval_n_prompts"]) gen_cfg_eval = GenerationConfig( max_new_tokens=cfg["max_new"], do_sample=True,