mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
eval: 1 sample/prompt, periodic 32 distinct, final on whole pool
The prompt is the independent unit for a hack-rate estimate (completions from the same prompt share its mode and are therefore correlated), so spend the generation budget on distinct prompts, not repeats. gen_cfg_eval num_return_sequences: group -> 1. Periodic eval: 8 -> 32 distinct prompts (smoother curve, still 2x faster than the old 8x8=64-completion pass). Final eval drops the eval_n_prompts_final cap and runs the WHOLE loaded pool x 1 sample (SE~0.021 at p=0.1 over ~200 prompts vs ~0.075 over 16). Final eval still does train + deploy (knob-off) passes for route/routeV and collapses to one pass for vanilla/erase. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
+28
-20
@@ -178,17 +178,24 @@ class Config:
|
||||
# Route deploy-eval: every N steps zero δS_hack and eval hack/solve on a fixed
|
||||
# subset -> the hack_deploy / solve_deploy columns (the dynamics-plot series for
|
||||
# route: the training-time hack curve still hacks; routing's benefit shows only
|
||||
# once the quarantine is ablated). 0 = off. eval_n_prompts x `group` samples.
|
||||
# once the quarantine is ablated). 0 = off. eval_n_prompts prompts x 1 sample.
|
||||
# Default 5: gives 12 deploy points over the common 60-step run (nice trajectory
|
||||
# plot). Affordable now that the per-step knob-ON eval pass is gone (each eval is
|
||||
# one n=64 pass, ~230s, not two). Long-horizon recipes (paper-longrun, A5) pin a
|
||||
# one pass over eval_n_prompts prompts, not two). Long-horizon recipes (paper-longrun, A5) pin a
|
||||
# sparser cadence (10/20) explicitly. See journal 2026-06-04 (a) for the cost audit.
|
||||
eval_ablate_every: int = 5
|
||||
eval_n_prompts: int = 8 # periodic (per-step) deploy eval: light, for the smoothed curve
|
||||
# Final (post-loop) eval covers MORE distinct prompts than the periodic curve so the
|
||||
# paper deploy hack/solve has a tight CI (the periodic n=8-prompts eval is sampling-noisy:
|
||||
# eval gen is do_sample T=0.7, see EVAL_GEN_SEED). Capped at the available pool size.
|
||||
eval_n_prompts_final: int = 64
|
||||
# Eval samples 1 completion per prompt (gen_cfg_eval num_return_sequences=1): completions
|
||||
# within a prompt share its mode and are correlated, so the prompt is the independent unit
|
||||
# and the efficient budget allocation is many prompts x 1 sample, not few prompts x many.
|
||||
eval_n_prompts: int = 32 # periodic (per-step) deploy eval: 32 distinct prompts, for the smoothed curve
|
||||
# NB the fixed first-N subset gives a constant level-offset (same prompts every seed, so
|
||||
# 3-seed averaging does NOT remove it); but all arms share these prompts, so the offset
|
||||
# cancels in the route-vs-vanilla delta the curve actually shows. The whole-pool final
|
||||
# eval is the unbiased absolute number.
|
||||
# Final (post-loop) eval covers the WHOLE loaded pool (>> the periodic curve) so the
|
||||
# paper deploy hack/solve has a tight CI (SE~0.021 at p=0.1 over ~200 prompts vs ~0.075
|
||||
# over 16). The seeded periodic curve stays light + smoothed. No config knob: always
|
||||
# the full pool (the eval is on training prompts; held-out is at the hack-mode level).
|
||||
# Save the deploy adapter (δS only, ~2.3MB) at every deploy-eval step, tagged by
|
||||
# step, so a run can be RE-SCORED later (more prompts, different eval) without
|
||||
# retraining. Tiny per ckpt; a 200-step run at every-10 is ~46MB. Off for big sweeps.
|
||||
@@ -676,12 +683,13 @@ def main(cfg: Config) -> int:
|
||||
repetition_penalty=1.0,
|
||||
num_return_sequences=G_s, pad_token_id=tok.pad_token_id,
|
||||
)
|
||||
# Eval-ablation config: student-only, `group` samples/prompt (no teacher
|
||||
# split, so we want the full group for a tighter rate estimate).
|
||||
# Eval-ablation config: student-only, 1 sample/prompt. The prompt is the independent
|
||||
# unit for a hack-RATE estimate (same-prompt completions share the mode -> correlated),
|
||||
# so we spend the gen budget on distinct prompts, not repeats. N=#prompts.
|
||||
gen_cfg_eval = GenerationConfig(
|
||||
max_new_tokens=max_new, do_sample=True,
|
||||
temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0,
|
||||
num_return_sequences=group, pad_token_id=tok.pad_token_id,
|
||||
num_return_sequences=1, pad_token_id=tok.pad_token_id,
|
||||
)
|
||||
|
||||
problems = load_problems(n_problems, env_modes=[cfg.env_mode], seed=cfg.seed, partition=partition)
|
||||
@@ -1461,8 +1469,9 @@ def main(cfg: Config) -> int:
|
||||
refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}" # mod/axes -> per-step row
|
||||
|
||||
# ── periodic DEPLOY-eval (EVERY arm) -- the apples-to-apples curve ──
|
||||
# Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval (n=64,
|
||||
# T=0.7), every eval_ablate_every steps. route/routeV: deploy = quarantine
|
||||
# Eval the DEPLOYED model on a fixed eval subset with gen_cfg_eval
|
||||
# (eval_n_prompts prompts x 1 sample, T=0.7), every eval_ablate_every steps.
|
||||
# route/routeV: deploy = quarantine
|
||||
# knob zeroed (ablate_quarantine), and the claim is this hacks far less than
|
||||
# the training-time model (per-step hack_s, knob still on). vanilla/erase: no
|
||||
# quarantine, so deploy == the trained model -- eval it directly. Running the
|
||||
@@ -1775,14 +1784,13 @@ def main(cfg: Config) -> int:
|
||||
# preserved solve => the quarantine absorbed the cheat. vanilla/erase have no
|
||||
# quarantine, so the deployed model IS the trained model (deploy == train, one eval).
|
||||
model.eval()
|
||||
# Paper-grade final eval: eval_n_prompts_final distinct prompts (>> the periodic
|
||||
# eval_n_prompts curve), and a FIXED gen seed before each pass so every arm/seed sees
|
||||
# common random numbers -> cross-arm deltas reflect the intervention, not eval sampling
|
||||
# noise (gen is do_sample T=0.7, otherwise unseeded; the periodic curve stays light +
|
||||
# unseeded and gets smoothed). Capped at the available pool size.
|
||||
eval_idxs_final = list(range(min(cfg.eval_n_prompts_final, len(problems))))
|
||||
logger.info(f"FINAL EVAL: {len(eval_idxs_final)} distinct prompts x G={group} = "
|
||||
f"{len(eval_idxs_final) * group} completions (periodic curve used {len(eval_idxs)})")
|
||||
# Paper-grade final eval: the WHOLE loaded pool (>> the periodic eval_n_prompts curve),
|
||||
# and a FIXED gen seed before each pass so every arm/seed sees common random numbers ->
|
||||
# cross-arm deltas reflect the intervention, not eval sampling noise (gen is do_sample
|
||||
# T=0.7, seeded here; the periodic curve is also seeded and gets smoothed).
|
||||
eval_idxs_final = list(range(len(problems))) # whole pool, 1 sample/prompt -> tight CI
|
||||
logger.info(f"FINAL EVAL: {len(eval_idxs_final)} distinct prompts x 1 sample = "
|
||||
f"{len(eval_idxs_final)} completions (periodic curve used {len(eval_idxs)})")
|
||||
torch.manual_seed(EVAL_GEN_SEED)
|
||||
ev_train = eval_hack_solve(model, tok, problems, eval_idxs_final, gen_cfg_eval, device, max_new)
|
||||
has_quarantine = cfg.intervention in ("route", "routeV")
|
||||
|
||||
Reference in New Issue
Block a user