From ba46e85f5515b83eaf2f24ed5bffbf6900323fbe Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 6 Jun 2026 12:46:59 +0000 Subject: [PATCH] eval: 1 sample/prompt, periodic 32 distinct, final on whole pool Prompt is the independent unit for a hack-rate estimate (same-prompt completions share the mode -> correlated), so spend the gen budget on distinct prompts not repeats. gen_cfg_eval num_return_sequences group->1. Periodic 8->32 distinct prompts (smoother curve, still 2x faster than the old 8x8=64-completion pass). Final eval drops the eval_n_prompts_final cap and runs the WHOLE loaded pool x1 (SE~0.021 at p=0.1 over ~200 vs ~0.075 over 16). Final still does train + deploy(knob-off) for route/routeV and collapses to one pass for vanilla/erase. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- src/vgrout/train.py | 48 ++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 7fe27c8..1a1c0fe 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -178,17 +178,24 @@ class Config: # Route deploy-eval: every N steps zero δS_hack and eval hack/solve on a fixed # subset -> the hack_deploy / solve_deploy columns (the dynamics-plot series for # route: the training-time hack curve still hacks; routing's benefit shows only - # once the quarantine is ablated). 0 = off. eval_n_prompts x `group` samples. + # once the quarantine is ablated). 0 = off. eval_n_prompts prompts x 1 sample. # Default 5: gives 12 deploy points over the common 60-step run (nice trajectory # plot). Affordable now that the per-step knob-ON eval pass is gone (each eval is - # one n=64 pass, ~230s, not two). Long-horizon recipes (paper-longrun, A5) pin a + # one eval_n_prompts-prompt pass (32 by default), not two). Long-horizon recipes (paper-longrun, A5) pin a # sparser cadence (10/20) explicitly. See journal 2026-06-04 (a) for the cost audit. 
eval_ablate_every: int = 5 - eval_n_prompts: int = 8 # periodic (per-step) deploy eval: light, for the smoothed curve - # Final (post-loop) eval covers MORE distinct prompts than the periodic curve so the - # paper deploy hack/solve has a tight CI (the periodic n=8-prompts eval is sampling-noisy: - # eval gen is do_sample T=0.7, see EVAL_GEN_SEED). Capped at the available pool size. - eval_n_prompts_final: int = 64 + # Eval samples 1 completion per prompt (gen_cfg_eval num_return_sequences=1): completions + # within a prompt share its mode and are correlated, so the prompt is the independent unit + # and the efficient budget allocation is many prompts x 1 sample, not few prompts x many. + eval_n_prompts: int = 32 # periodic (per-step) deploy eval: 32 distinct prompts, for the smoothed curve + # NB the fixed first-N subset gives a constant level-offset (same prompts every seed, so + # 3-seed averaging does NOT remove it); but all arms share these prompts, so the offset + # cancels in the route-vs-vanilla delta the curve actually shows. The whole-pool final + # eval is the unbiased absolute number. + # Final (post-loop) eval covers the WHOLE loaded pool (>> the periodic curve) so the + # paper deploy hack/solve has a tight CI (SE~0.021 at p=0.1 over ~200 prompts vs ~0.075 + # over 16). The seeded periodic curve stays light + smoothed. No config knob: always + # the full pool (the eval is on training prompts; held-out is at the hack-mode level). # Save the deploy adapter (δS only, ~2.3MB) at every deploy-eval step, tagged by # step, so a run can be RE-SCORED later (more prompts, different eval) without # retraining. Tiny per ckpt; a 200-step run at every-10 is ~46MB. Off for big sweeps. @@ -676,12 +683,13 @@ def main(cfg: Config) -> int: repetition_penalty=1.0, num_return_sequences=G_s, pad_token_id=tok.pad_token_id, ) - # Eval-ablation config: student-only, `group` samples/prompt (no teacher - # split, so we want the full group for a tighter rate estimate). 
+ # Eval-ablation config: student-only, 1 sample/prompt. The prompt is the independent + # unit for a hack-RATE estimate (same-prompt completions share the mode -> correlated), + # so we spend the gen budget on distinct prompts, not repeats. N=#prompts. gen_cfg_eval = GenerationConfig( max_new_tokens=max_new, do_sample=True, temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0, - num_return_sequences=group, pad_token_id=tok.pad_token_id, + num_return_sequences=1, pad_token_id=tok.pad_token_id, ) problems = load_problems(n_problems, env_modes=[cfg.env_mode], seed=cfg.seed, partition=partition) @@ -1461,8 +1469,9 @@ def main(cfg: Config) -> int: refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}" # mod/axes -> per-step row # ── periodic DEPLOY-eval (EVERY arm) -- the apples-to-apples curve ── - # Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval (n=64, - # T=0.7), every eval_ablate_every steps. route/routeV: deploy = quarantine + # Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval + # (eval_n_prompts prompts x 1 sample, T=0.7), every eval_ablate_every steps. + # route/routeV: deploy = quarantine # knob zeroed (ablate_quarantine), and the claim is this hacks far less than # the training-time model (per-step hack_s, knob still on). vanilla/erase: no # quarantine, so deploy == the trained model -- eval it directly. Running the @@ -1775,14 +1784,13 @@ def main(cfg: Config) -> int: # preserved solve => the quarantine absorbed the cheat. vanilla/erase have no # quarantine, so the deployed model IS the trained model (deploy == train, one eval). 
model.eval() - # Paper-grade final eval: eval_n_prompts_final distinct prompts (>> the periodic - # eval_n_prompts curve), and a FIXED gen seed before each pass so every arm/seed sees - # common random numbers -> cross-arm deltas reflect the intervention, not eval sampling - # noise (gen is do_sample T=0.7, otherwise unseeded; the periodic curve stays light + - # unseeded and gets smoothed). Capped at the available pool size. - eval_idxs_final = list(range(min(cfg.eval_n_prompts_final, len(problems)))) - logger.info(f"FINAL EVAL: {len(eval_idxs_final)} distinct prompts x G={group} = " - f"{len(eval_idxs_final) * group} completions (periodic curve used {len(eval_idxs)})") + # Paper-grade final eval: the WHOLE loaded pool (>> the periodic eval_n_prompts curve), + # and a FIXED gen seed before each pass so every arm/seed sees common random numbers -> + # cross-arm deltas reflect the intervention, not eval sampling noise (gen is do_sample + # T=0.7, seeded here; the periodic curve is also seeded and gets smoothed). + eval_idxs_final = list(range(len(problems))) # whole pool, 1 sample/prompt -> tight CI + logger.info(f"FINAL EVAL: {len(eval_idxs_final)} distinct prompts x 1 sample = " + f"{len(eval_idxs_final)} completions (periodic curve used {len(eval_idxs)})") torch.manual_seed(EVAL_GEN_SEED) ev_train = eval_hack_solve(model, tok, problems, eval_idxs_final, gen_cfg_eval, device, max_new) has_quarantine = cfg.intervention in ("route", "routeV")