diff --git a/RESEARCH_JOURNAL.md b/RESEARCH_JOURNAL.md index 3aa55fc..b645f2b 100644 --- a/RESEARCH_JOURNAL.md +++ b/RESEARCH_JOURNAL.md @@ -2302,3 +2302,62 @@ arm) so the 5-arm overlay reads uniform numbers. 5-arm clean sweep queued (40-44, all #164). On completion: confirm the deploy-solve>= train-solve gap reproduces per-arm, and read run_tests deploy-solve specifically (the watch-item for whether deploy-mode rollouts are needed). + +# 2026-06-01 + +## Exploration floor against hack-saturation: `rollout_ablate_frac` (route/route2) + +**Context.** Live audit of job 60 (route2, scale-matched `delta_S_hack` quarantine, +seed 41) past the step-10 emergence. The discrimination gauge `hkgap` +(`ema_hack_cos - ema_clean_cos`) started clearly positive (+0.09 @ step 2) and +decayed through zero to weakly negative (~-0.03 across steps 27-41); `tau` rode it +down to ~0. So the calibrated gate is faithfully tracking a signal that has gone +dead, the student-side `cos>tau` route is back to a coin flip, and routing is +carried mostly by the forced teacher-anchor routing (`qE~0.55`). Solving survived +(clean rollouts still kept), but this exposed the structural risk the user named: +on-policy sampling can collapse onto hacking, at which point every rollout routes to +the deleted quarantine and the deployed `delta_S` never sees a solve gradient. Hack +eats everything. + +**Decision.** Add a standard RL exploration floor: generate a fraction +(`rollout_ablate_frac`, default 0) of the student rollouts with the quarantine +ablated, i.e. from the deployed model, which cannot express the hack and so explores +the solve region. This guarantees solve-region coverage regardless of how saturated +the full policy gets. Pure sampling-side diversity, no new loss, no reward change, no +grader: it does not touch the no-cheat boundary. It accepts a slight off-policy +mismatch (GRPO already tolerates off-policy samples via clipping/reuse), which the +user judged worth it for the coverage. 
This is the previously-deferred "deploy-mode +rollouts" idea (see prior entry), promoted from deferred now that job 60 shows the +saturation pathway is live. + +Bonus property for our setup: at deploy `delta_S_hack` is zeroed, so the deployed +model *is* the ablated model. Generating a fraction of the rollouts ablated trains `delta_S` partly +on the exact distribution it faces at deploy, closing the train/deploy gap, not just +preventing starvation. + +**Subtlety corrected mid-design (load-bearing).** Generation policy and gradient +policy are decoupled in GRPO: the gradient comes from the teacher-forcing recompute, +not the sampling pass. So generating ablated does *not* by itself keep `delta_S_hack` +gradient-free; a solve rollout that happens to contain hack-ward tokens would still +backprop into the quarantine under a full-model recompute. We do NOT match the +recompute ablation per-subset (that would need two backward passes). We rely instead on the fact +that a genuine-solve rollout is clean-ward, so it is not flagged, so route2 leaves its +full gradient in `delta_S` anyway. The exploration value (coverage) is what we are +buying; the gradient routing is unchanged. + +**Implementation.** `train.py`: `Config.rollout_ablate_frac`; a `gen_students(enc, n)` +helper that splits the n student rollouts into `round(n*frac)` ablated (generated under +`ablate_quarantine`) and `n - round(n*frac)` full, right-pads both batches with `pad_id` to the longer length, and concatenates them (full rows first). Both generate call sites +(pool and no-pool) route through it. Guarded to `intervention in {route, route2}` +(only those have a quarantine); frac=0 collapses to a single plain generate, so +vanilla/erase and all existing runs are byte-identical. + +**Verified.** `just smoke-route2 --rollout-ablate-frac=0.5`: 30 steps, clean exit, +deploy eval fired (steps 0/10/20/29), all route2 columns populate, and the ablated/full +split was padded and concatenated with no shape error.
+(log: `logs/20260601T053045_smoke_routing2_seed41.log`) + +**Next.** Queue route2-balanced + `--rollout-ablate-frac=0.5` (seed 41, 60 steps) +and read `slv_dep`: the direct test of whether the exploration floor lifts deploy-solve +vs the no-floor job 60. Keep the orthogonal `hkgap`-decay question (frozen vs +`--vhack-refresh-every=2`) as a separate run so the two levers stay attributable. diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index f143322..986c689 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -173,6 +173,16 @@ class Config: preserve_magnitude: bool = True gate_mode: Literal["one_sided", "no_gate", "reverse"] = "one_sided" project_overshoot: float = 1.0 # remove overshoot*c_use@V; 1.0=just remove, 1.1=10% reversal of hack-ward grad + # Exploration floor against hack-saturation (route/route2 only). Fraction of + # student rollouts to generate with the quarantine (delta_S_hack) ablated, i.e. + # from the DEPLOYED model. The risk this guards: if on-policy sampling collapses + # onto hacking, the policy stops emitting solves, every rollout gets routed to + # the quarantine, and the deployed delta_S never sees a solve gradient to learn + # from (it saturates). Forcing a fraction of rollouts hack-OFF guarantees the + # solve region stays covered, exactly like any RL exploration term. Pure + # sampling-side diversity; accepts a slight off-policy mismatch (GRPO already + # tolerates it) in exchange for guaranteed coverage. 0 = off (unchanged). + rollout_ablate_frac: float = 0.0 # Which grader flaw + factual hint this run trains on (a "hack class"). Sets # the prompt hint (HINT_REPLACE_TO) and how `passed` is graded in rewards.py. # run_tests = the original run_tests-overwrite loophole. 
def gen_students(enc, n: int) -> torch.Tensor:
    """Generate ``n`` student rollouts, a slice of them with the quarantine ablated.

    For the ``route`` / ``route2`` interventions, ``round(n * cfg.rollout_ablate_frac)``
    of the rollouts are sampled inside ``ablate_quarantine(wrappers)`` and the
    remainder are sampled from the model as-is. For every other intervention, or
    when the fraction rounds to 0, this is a single plain ``model.generate`` call.
    See the comment on ``Config.rollout_ablate_frac`` for the motivation.

    Args:
        enc: Keyword arguments forwarded to ``model.generate`` (``**enc``).
        n: Total number of rollouts to return; must be >= 1.

    Returns:
        The generated rows stacked along dim 0: full-model rows first, then the
        ablated rows. When both batches exist, the shorter one is right-padded
        with ``pad_id`` along dim 1 so the two can be concatenated.

    Raises:
        ValueError: if ``n < 1``, or if the floor is active and
            ``cfg.rollout_ablate_frac`` is outside ``[0, 1]``.

    Note:
        No ``torch.no_grad()`` is entered here; the visible call sites wrap the
        call themselves. Python's ``round`` resolves ties to even, so e.g.
        n=5 with frac=0.5 yields 2 ablated rows, not 3.
    """
    if n < 1:
        # Previously surfaced as "max() arg is an empty sequence" further down.
        raise ValueError(f"gen_students needs n >= 1, got {n}")

    n_abl = 0
    if cfg.intervention in ("route", "route2"):
        frac = cfg.rollout_ablate_frac
        # An out-of-range fraction used to make this helper return a row count
        # different from n, which no longer matches the caller's is_student mask.
        if not 0.0 <= frac <= 1.0:
            raise ValueError(f"rollout_ablate_frac must be in [0, 1], got {frac}")
        n_abl = round(n * frac)
    n_full = n - n_abl

    parts = []
    if n_full > 0:
        parts.append(
            model.generate(
                **enc, generation_config=gen_cfg, num_return_sequences=n_full
            ).detach()
        )
    if n_abl > 0:
        # Sample this slice with the quarantine switched off.
        with ablate_quarantine(wrappers):
            parts.append(
                model.generate(
                    **enc, generation_config=gen_cfg, num_return_sequences=n_abl
                ).detach()
            )

    if len(parts) == 1:
        # Single batch: nothing to align, so skip the pad/cat round-trip.
        return parts[0]

    # The two batches can differ in sequence length; right-pad to the longer one.
    width = max(p.shape[1] for p in parts)
    return torch.cat(
        [F.pad(p, (0, width - p.shape[1]), value=pad_id) for p in parts], dim=0
    )
# Cached prompt_ids are ignored — re-tokenizing live makes the pool # robust to chat-template / tokenizer drift between the model used @@ -1281,7 +1308,7 @@ def main(cfg: Config) -> int: is_student = [True] * G_s + [False] * G_t else: with torch.no_grad(): - gen_out = model.generate(**enc, generation_config=gen_cfg).detach() + gen_out = gen_students(enc, G_s) # G_s == group when no teacher is_student = [True] * gen_out.shape[0] model.config.use_cache = False merged = gen_out