diff --git a/justfile b/justfile
index 740d967..dfcb217 100644
--- a/justfile
+++ b/justfile
@@ -27,6 +27,7 @@ results:
 # zero-variance bails every step, leaving the loss path uncovered.
 smoke *ARGS:
     uv run python scripts/verify_rewards.py  # grader gate: 3 env_modes x clean/hack
+    uv run python scripts/verify_gate_anchor.py  # route2 no-cheat gate: teacher-only anchor zeroes held-out labels
     BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
         --v-hack-path=out/vhack/v_hack_smoke.safetensors \
         --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
diff --git a/scripts/verify_gate_anchor.py b/scripts/verify_gate_anchor.py
new file mode 100644
index 0000000..92244a1
--- /dev/null
+++ b/scripts/verify_gate_anchor.py
@@ -0,0 +1,68 @@
+"""Unit check of the route2 τ-gate anchor (train.build_route2_anchors).
+
+SHOULD: the no-cheat invariant holds at the gate. The anchor decides which rollouts
+        are FORCE-routed into the deletable quarantine (hack_anchor); the rest only
+        route if cos(g,v) > tau. ELSE: held-out classes are receiving detector labels
+        and the A5 generalisation claim is contaminated.
+
+The load-bearing invariant (the leak this guards):
+  hacked_E is the run_tests detector. It cross-fires <=1.1% on held-out modes (false
+  positives), so under the DEFAULT anchor those held-out rollouts are force-routed --
+  a real, if small, label leak into the held-out class. The teacher-only anchor
+  (gate_anchor_teacher_only=True) drops the student detector term entirely, so the
+  held-out classes get PROVABLY zero detector labels: the airtight A5 control.
+
+Scenario modelled: merged batch is [students ... teachers] (train.py is_student =
+[True]*G_s + [False]*G_t). One held-out student (idx 2) is a detector FALSE POSITIVE.
+"""
+from __future__ import annotations
+
+import sys
+
+import torch
+from loguru import logger
+
+from projected_grpo.train import build_route2_anchors
+
+DEV = torch.device("cpu")
+G_S, G_T = 5, 2
+IS_STUDENT = [True] * G_S + [False] * G_T  # students lead, teachers follow
+# Held-out batch: the run_tests detector FALSE-fires on student idx 2 only (the leak).
+HACK_E_FLAGS = [False, False, True, False, False]  # len == G_S
+
+
+def _check(name: str, cond: bool) -> bool:
+    logger.info(f"{'PASS' if cond else 'FAIL'} {name}")
+    return cond
+
+
+def main() -> int:
+    ok = True
+
+    # DEFAULT anchor: the leak is present.
+    ha, ca = build_route2_anchors(IS_STUDENT, HACK_E_FLAGS, teacher_only=False, device=DEV)
+    ok &= _check("default: teachers always anchored", bool(ha[G_S:].all()))
+    ok &= _check("default: flagged held-out student idx2 IS force-routed (the leak)",
+                 bool(ha[2]))
+    ok &= _check("default: student detector labels reach the gate (sum>0)",
+                 int(ha[:G_S].sum()) == 1)
+    ok &= _check("default: clean_anchor is the exact complement", bool((ca == ~ha).all()))
+
+    # TEACHER-ONLY anchor: the fix. Zero student detector labels.
+    ha2, ca2 = build_route2_anchors(IS_STUDENT, HACK_E_FLAGS, teacher_only=True, device=DEV)
+    ok &= _check("teacher_only: teachers still all anchored", bool(ha2[G_S:].all()))
+    ok &= _check("teacher_only: ZERO student rollouts force-routed (no leak)",
+                 int(ha2[:G_S].sum()) == 0)
+    ok &= _check("teacher_only: the held-out FP student idx2 is NOT routed",
+                 not bool(ha2[2]))
+    ok &= _check("teacher_only: clean_anchor is the exact complement", bool((ca2 == ~ha2).all()))
+
+    # The fix only touches student labels: teacher anchoring is identical either way.
+    ok &= _check("fix leaves teacher rows unchanged", bool((ha[G_S:] == ha2[G_S:]).all()))
+
+    logger.info("ALL PASS -- gate anchor no-cheat invariant holds" if ok else "FAILURES above")
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index 2b6e57c..87116dd 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -140,6 +140,13 @@ class Config:
     seed: int = 41
     preserve_magnitude: bool = True
     gate_mode: Literal["one_sided", "no_gate", "reverse"] = "one_sided"
+    # route2 airtight no-cheat control: anchor the τ-gate on TEACHER rows only, never
+    # on hacked_E-flagged student rows. The run_tests detector cross-fires <=1.1% on
+    # held-out modes (false positives), so the default anchor leaks ~1% of held-out
+    # labels into routing. Teacher-only anchor gives the held-out classes PROVABLY zero
+    # detector labels -- the strict A5 no-cheat test. v_grad and the τ-route-by-energy
+    # path are unchanged; only the force-route-known-hacks term drops its student flags.
+    gate_anchor_teacher_only: bool = False
     project_overshoot: float = 1.0  # remove overshoot*c_use@V; 1.0=just remove, 1.1=10% reversal of hack-ward grad
     # route/route2 exploration floor: fraction of student rollouts sampled with the
     # quarantine (δS_hack) ablated, i.e. from the DEPLOYED model. Intent: guard hack-
@@ -296,6 +303,25 @@ class FullConfig(Config):
     prompts_per_step: int = 43
 
 
+def build_route2_anchors(is_student: list[bool], hack_E_flags: list[bool],
+                         teacher_only: bool, device) -> tuple[torch.Tensor, torch.Tensor]:
+    """τ-calibration anchors for the route2 gate (merged rows: students lead, teachers
+    follow). hack_anchor = teacher rows OR (unless teacher_only) detector-flagged student
+    rows; clean_anchor is the exact complement. hack_E_flags (len G_s) aligns with the
+    leading student rows. teacher_only drops the student detector term so held-out classes
+    get PROVABLY zero detector labels -- the airtight A5 no-cheat control. The default
+    leaks: the run_tests detector cross-fires <=1.1% on held-out modes, force-routing those
+    rollouts. Verified in scripts/verify_gate_anchor.py."""
+    n = len(is_student)
+    is_student_t = torch.as_tensor(is_student, dtype=torch.bool, device=device)
+    flags = torch.zeros(n, dtype=torch.bool, device=device)
+    if not teacher_only:
+        m = min(n, len(hack_E_flags))
+        flags[:m] = torch.as_tensor(list(hack_E_flags[:m]), dtype=torch.bool, device=device)
+    hack_anchor = (~is_student_t) | flags
+    return hack_anchor, ~hack_anchor
+
+
 @torch.no_grad()
 def eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg, device, max_new) -> dict:
     """Student-only generate + grade on a FIXED prompt subset (no teacher, no
@@ -1180,14 +1206,8 @@ def main(cfg: Config) -> int:
             # design -> conservative τ; B still routes via cos>τ). hack_E_flags
             # (len G_s) aligns with the leading student rows of is_student.
             if is_route2:
-                _n_merged = merged.shape[0]
-                _ha = torch.zeros(_n_merged, dtype=torch.bool, device=Lp.device)
-                _ca = torch.zeros(_n_merged, dtype=torch.bool, device=Lp.device)
-                for _i in range(_n_merged):
-                    if (not is_student[_i]) or (_i < len(hack_E_flags) and hack_E_flags[_i]):
-                        _ha[_i] = True
-                    else:
-                        _ca[_i] = True
+                _ha, _ca = build_route2_anchors(
+                    is_student, hack_E_flags, cfg.gate_anchor_teacher_only, Lp.device)
                 for name, info in wrappers.items():
                     g = info["delta_S"].grad
                     if g is None: