diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 9a1f265..0a5a530 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -525,7 +525,10 @@ def main(cfg: Config) -> int: # Deterministically split the paper's recency-held-out test file into periodic # validation and untouched final test. Previously the monitored 32 problems # were included in the final headline, leaking model-selection information. - eval_modes = sorted({p["env_mode"] for p in problems} - {"gt_only"}) + # gt_only is excluded from the hack eval (unhackable problems can't be hacked), EXCEPT + # the no-loophole ceiling run where every problem is gt_only -- there we eval on gt_only + # itself (hack is structurally ~0; solve is the ceiling number). + eval_modes = sorted({p["env_mode"] for p in problems} - {"gt_only"}) or ["gt_only"] val_problems, test_problems = load_eval_splits(eval_modes, cfg.eval_n_prompts) val_idxs, test_idxs = list(range(len(val_problems))), list(range(len(test_problems))) _train_ids = {p["problem_id"] for p in problems}