From 208713d7c217072925a44fcfb8e13849b540bfc6 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Thu, 4 Jun 2026 02:25:07 +0000 Subject: [PATCH] perf(eval): drop redundant per-step knob-ON pass, default eval-every 5->10 Per-step TIMING audit (journal 2026-06-04 a): gen ~140s/step dominates; the 2x2 deploy eval costs ~460s per eval step because route2 ran it as TWO passes (knob-off + knob-on), the second for a train curve no figure plots -- per-step hack_s already is the train series, and the full 2x2 is computed once post-loop (FINAL EVAL). Drop the per-step knob-on pass and its dead hk_on/slv_on columns; bump eval cadence default 5->10. ~27% faster on 60-step fast runs, ~4h saved per 200-step run. refresh left at 5 (timing shows it's only ~10s/step amortized even at every-2, ~4s/step at 5 -- not the culprit I'd claimed). plot_dynamics already falls back to hack_s when hk_on absent. Validated via smoke-route2: single-pass evals, FINAL EVAL 2x2 intact, no dead columns. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- RESEARCH_JOURNAL.md | 32 +++++++++++++++++++++++++++- src/projected_grpo/tablelog.py | 8 +++---- src/projected_grpo/train.py | 38 +++++++++++----------------------- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/RESEARCH_JOURNAL.md b/RESEARCH_JOURNAL.md index b685fbe..3f02bcd 100644 --- a/RESEARCH_JOURNAL.md +++ b/RESEARCH_JOURNAL.md @@ -2,7 +2,37 @@ Append-only. New entries at the top, date-stamped. Never edit old entries. -## 2026-06-03 (f) — A5 no-cheat check: the weak detector (hacked_E) sees only run_tests; file_marker is the held-out payload +## 2026-06-04 (a) — per-step cost is gen + the 2x2 eval, NOT refresh; redesigning eval cadence + +**Context:** Job 99 (route2 nofloor refresh-2 staleness cell, #183) ran at ~4.3 min/step, far +slower than a frozen route2 run. Audited the per-step TIMING log (logs/20260603T223442_...rf2_s41.log) +to find where the time goes. The `step N TIMING gen=.. fwd_bwd=.. reward=.. other=..` line breaks it down. 
+ +### Measured per-step cost (route2, fast preset, group=8, n=64 eval) + +| step type | gen | fwd_bwd+reward | other | total | +|:--------------------------------|------:|---------------:|------:|------:| +| base (e.g. 38, 44, 48) | ~140s | ~13s | 0s | ~155s | +| refresh step (odd, e.g. 47, 49) | ~140s | ~13s | ~20s | ~175s | +| eval step (40, 45, 50) | ~140s | ~13s | ~460s | ~615s | + +- [obs] generation of the 32 training rollouts dominates at ~140s/step, every step, unavoidable (it IS the GRPO data). +- [obs] the 2x2 deploy eval costs ~460s each. route2 runs it as TWO passes of n=64 (knob-OFF=deploy, knob-ON=train), 128 gens. +- [obs] refresh (v_grad re-extract over 5 cached pairs, no generation) is only ~20s. At every-2 that is ~10s/step amortized; at default-5 ~4s/step. TRIVIAL. +- [reason] EARLIER MISDIAGNOSIS (corrected): I'd blamed `--vhack-refresh-every=2` for the slowness and called it the canonical staleness value citing the 2026-05-29 journal (878-896). Both wrong. That section is the dead one-sided-erase era (pre-route2, pre-#170 refactor); the current route2 headline uses FROZEN v_grad. refresh=2 was an unjustified orphan, AND the timing shows refresh barely costs anything. The real costs are gen (~140s) + the 2x2 eval (~460s/eval at every-5 = ~92s/step amortized). +- [check] per-5-step wall-clock blocks were rock-steady ~21-22 min (25->30: 22m11s, 30->35: 21m11s, ...), confirming no contention/no second job; the run dir 20260603T223442 wrote continuously from 22:34. + +### Eval cadence redesign (so we stop rethinking it per run) + +- [decision] eval is the only discretionary lever (gen is fixed). Two cuts: (i) drop route2's knob-ON + second pass on intermediate evals -- per-step `hack_s` already gives the train series for free, so + keep knob-ON only on the FINAL eval where it completes the 2x2 table; (ii) eval every 10 not 5. 
+- [obs] projected speedup, fast 60-step run: 255s/step (eval-5 + knob-on + refresh-5) -> ~186s/step + (eval-10 + knob-on-final-only) = ~27% faster (4.3h -> ~3.1h). 200-step A5 runs (eval-n=24): ~293 -> + ~224s/step, ~16h -> ~12h each, ~8h saved across 103/104 (those override eval flags, so unaffected + unless re-queued). +- [decision] refresh default stays 5 (it is cheap; the value is a research knob not a speed knob). + Baking eval-every=10 + knob-on-final-only into train.py defaults so future runs inherit it. the weak detector (hacked_E) sees only run_tests; file_marker is the held-out payload **Context:** Job 95 (A5 harvest, vanilla 4-mode, 40 steps, seed 41) finished. Harvested the student's own rollouts to (i) confirm which modes the weak live detector can flag and (ii) build diff --git a/src/projected_grpo/tablelog.py b/src/projected_grpo/tablelog.py index 2f2f406..2d5e14a 100644 --- a/src/projected_grpo/tablelog.py +++ b/src/projected_grpo/tablelog.py @@ -145,11 +145,9 @@ class StepLogger: ] if arm in ("routing", "routing2"): cols += [ - # Knob-ON eval: SAME eval set/n/T as hk_dep but quarantine ACTIVE - # (training-time policy). Like-for-like train series vs the knob-off - # hk_dep, for the train-vs-deploy 2x2. nan between eval steps. - _Col("hack_kon", 7, "hk_on", "+.2f", "knob-ON eval hack (quarantine active = training policy); same eval set as hk_dep"), - _Col("solve_kon", 7, "slv_on", "+.2f", "knob-ON eval solve (same eval set as slv_dep)"), + # Deploy eval (knob-OFF) is hk_dep below. The train-vs-deploy 2x2's + # knob-ON pass runs once post-loop (FINAL EVAL), not per-step; the + # per-step train series is hk_s. See journal 2026-06-04 (a). 
_Col("q_egy", 6, "qE", ".2f", "grad energy into quarantine ||g_quar||/(||g_keep||+||g_quar||); ~0.5+ rising = learning dumped into the thrown-away knob"), _Col("hack_abl", 6, "hk_abl", "frac", "FREE per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"), _Col("solve_abl", 6, "slv_abl", "frac", "free per-step deploy proxy: solve rate on the ablated rollout slice"), diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 911dac5..8df3b71 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -179,9 +179,10 @@ class Config: # subset -> the hack_deploy / solve_deploy columns (the dynamics-plot series for # route: the training-time hack curve still hacks; routing's benefit shows only # once the quarantine is ablated). 0 = off. eval_n_prompts x `group` samples. - # Default 5: deploy hack/solve is the headline metric for every arm, so it's - # on by default; 200-step runs pass a sparser cadence (e.g. 10) explicitly. - eval_ablate_every: int = 5 + # Default 10: an eval step cost ~460s with route2's old two-pass eval (the single + # biggest discretionary cost; gen is ~140s/step and fixed). 6 deploy points over a 60-step run / 20 over 200 is + # plenty for the trajectory plot. See journal 2026-06-04 (a) for the cost audit. + eval_ablate_every: int = 10 eval_n_prompts: int = 8 # Optional: pool-derived pairs JSON (built by pairs_from_pool.py). When set, # BOTH the cache-miss extract AND the online refresh use these pairs instead @@ -1402,7 +1403,6 @@ def main(cfg: Config) -> int: # route shows a deploy eval while others show training rollouts -> different # n/cadence, route looks artificially smoother). NaN on non-eval steps. 
hack_deploy = solve_deploy = float("nan") - hack_kon = solve_kon = float("nan") # knob-ON eval (route only); see below if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1): _was_training = model.training model.eval() @@ -1410,28 +1410,19 @@ def main(cfg: Config) -> int: with (ablate_quarantine(wrappers) if is_route else nullcontext()): ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) hack_deploy, solve_deploy = ev["hack"], ev["solve"] - # Like-for-like knob-ON eval: re-run the SAME n eval prompts with the - # quarantine ACTIVE (the training-time policy). The per-step hack_s is a - # noisy n=28 train batch -> spiky, looks like a different estimator than - # the smooth n=64 deploy curve. This gives a train series measured the - # IDENTICAL way as deploy (same prompts/n/T), differing only in knob state, - # for the train-vs-deploy 2x2. Route only: vanilla/erase have no quarantine - # (knob-on == knob-off), so reuse the deploy number. - if is_route: - ev_on = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) - hack_kon, solve_kon = ev_on["hack"], ev_on["solve"] - else: - hack_kon, solve_kon = hack_deploy, solve_deploy if _was_training: model.train() + # Deploy (knob-OFF) only -- one pass. The train series comes free from the + # per-step hack_s column, and the full train-vs-deploy 2x2 (knob-ON vs + # knob-OFF on the same eval set) is computed once post-loop (FINAL EVAL). + # A per-step knob-ON pass would just double every eval (the ~460s measured per + # eval step on route2 was both passes) for a curve no figure plots. See journal 2026-06-04 (a). 
tag = "quarantine knob OFF = deployed model" if is_route else "deployed = trained model (no quarantine)" - should = ("deploy hack < knob-ON eval hack (knob is holding the cheat); " - "ELSE routing isn't capturing it") if is_route else "deploy ~= training hack_s (same model)" + should = ("deploy hack < per-step hack_s (knob holds the cheat); ELSE routing isn't capturing it" + if is_route else "deploy ~= training hack_s (same model)") logger.info( f"step {step} DEPLOY-eval ({tag}): " - f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}" - + (f" | knob-ON same-eval: hack={hack_kon:.3f} solve={solve_kon:.3f}" if is_route else "") - + f". SHOULD: {should}") + f"hack={hack_deploy:.3f} solve={solve_deploy:.3f} n={ev['n']}. SHOULD: {should}") rewards_t = torch.tensor(agg_rew, dtype=torch.float32) if agg_rew else torch.zeros(1) rew_mean = rewards_t.mean().item() @@ -1565,11 +1556,6 @@ def main(cfg: Config) -> int: # are unaffected. plot_dynamics reads it by name. "hack_deploy": hack_deploy, "solve_deploy": solve_deploy, - # Knob-ON eval: SAME n eval prompts as deploy, quarantine active = the - # training-time policy. Like-for-like train series for the train-vs-deploy - # 2x2 (vs the noisy per-step hack_s batch). route only; else == deploy. - "hack_kon": hack_kon, - "solve_kon": solve_kon, # Free per-step deploy proxy from the ablated rollout slice (above). "hack_abl": (hack_abl_n, n_abl_step) if n_abl_step else (0, 0), "solve_abl": (gt_abl_n, n_abl_step) if n_abl_step else (0, 0),