fix route2 no-cheat leak: teacher-only gate anchor + unit test

The route2 tau-gate anchored on (teacher OR hacked_E student). hacked_E is the run_tests detector; it cross-fires <=1.1% on held-out modes (stdout 17/1540, file_marker 2/1337), force-routing those rollouts -- a real label leak into the held-out class, not noise. Add gate_anchor_teacher_only: anchor on teacher rows only, so held-out classes get PROVABLY zero detector labels (airtight A5 control). Extracted the inline anchor loop to build_route2_anchors() and added scripts/verify_gate_anchor.py (wired into just smoke): proves default reproduces the leak (held-out FP student force-routed) and teacher_only removes it (zero student routing, teachers unchanged). 9/9 assertions pass. Rescoring can't fix this -- the leak is in training (gate shaped the weights), not scoring (per-mode ground-truth eval is clean). Retrain is the only path; the A5 run saved no per-eval checkpoints anyway. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 18:59:35 +08:00 · 2026-06-05 03:53:23 +00:00
parent a9523c9cb8
commit 34ad20db0a
3 changed files with 97 additions and 8 deletions
@@ -27,6 +27,7 @@ results:
 # zero-variance bails every step, leaving the loss path uncovered.
 smoke *ARGS:
    uv run python scripts/verify_rewards.py   # grader gate: 3 env_modes x clean/hack
+    uv run python scripts/verify_gate_anchor.py  # route2 no-cheat gate: teacher-only anchor zeroes held-out labels
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
        --v-hack-path=out/vhack/v_hack_smoke.safetensors \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
@@ -0,0 +1,68 @@
+"""Unit check of the route2 τ-gate anchor (train.build_route2_anchors).
+
+SHOULD: the no-cheat invariant holds at the gate. The anchor decides which rollouts
+        are FORCE-routed into the deletable quarantine (hack_anchor); the rest only
+        route if cos(g,v) > tau. ELSE: held-out classes are receiving detector labels
+        and the A5 generalisation claim is contaminated.
+
+The load-bearing invariant (the leak this guards):
+  hacked_E is the run_tests detector. It cross-fires <=1.1% on held-out modes (false
+  positives), so under the DEFAULT anchor those held-out rollouts are force-routed --
+  a real, if small, label leak into the held-out class. The teacher-only anchor
+  (gate_anchor_teacher_only=True) drops the student detector term entirely, so the
+  held-out classes get PROVABLY zero detector labels: the airtight A5 control.
+
+Scenario modelled: merged batch is [students ... teachers] (train.py is_student =
+[True]*G_s + [False]*G_t). One held-out student (idx 2) is a detector FALSE POSITIVE.
+"""
+from __future__ import annotations
+
+import sys
+
+import torch
+from loguru import logger
+
+from projected_grpo.train import build_route2_anchors
+
+# All anchors in this check are built on the CPU device.
+DEV = torch.device("cpu")
+# Synthetic merged batch size: G_S student rows followed by G_T teacher rows.
+G_S, G_T = 5, 2
+IS_STUDENT = [True] * G_S + [False] * G_T          # students lead, teachers follow
+# Held-out batch: the run_tests detector FALSE-fires on student idx 2 only (the leak).
+HACK_E_FLAGS = [False, False, True, False, False]  # len == G_S
+
+
+def _check(name: str, cond: bool) -> bool:
+    """Log one PASS/FAIL line for the check `name` and return `cond` unchanged."""
+    verdict = "PASS" if cond else "FAIL"
+    logger.info(f"{verdict}  {name}")
+    return cond
+
+
+def main() -> int:
+    """Check build_route2_anchors under both anchor modes on the synthetic batch.
+
+    Every check is evaluated and logged even after an earlier failure. Returns 0
+    when all checks pass and 1 otherwise, so the result can be used as an exit status.
+    """
+    outcomes: list[bool] = []
+
+    # Default anchor (teachers OR detector-flagged students): the leak must reproduce.
+    hack_default, clean_default = build_route2_anchors(
+        IS_STUDENT, HACK_E_FLAGS, teacher_only=False, device=DEV)
+    outcomes.append(_check("default: teachers always anchored",
+                           bool(hack_default[G_S:].all())))
+    outcomes.append(_check("default: flagged held-out student idx2 IS force-routed (the leak)",
+                           bool(hack_default[2])))
+    # Pins exactly one anchored student row, which is stricter than the label's "sum>0".
+    outcomes.append(_check("default: student detector labels reach the gate (sum>0)",
+                           int(hack_default[:G_S].sum()) == 1))
+    outcomes.append(_check("default: clean_anchor is the exact complement",
+                           bool((clean_default == ~hack_default).all())))
+
+    # Teacher-only anchor (the fix): no student row may be force-routed.
+    hack_strict, clean_strict = build_route2_anchors(
+        IS_STUDENT, HACK_E_FLAGS, teacher_only=True, device=DEV)
+    outcomes.append(_check("teacher_only: teachers still all anchored",
+                           bool(hack_strict[G_S:].all())))
+    outcomes.append(_check("teacher_only: ZERO student rollouts force-routed (no leak)",
+                           int(hack_strict[:G_S].sum()) == 0))
+    outcomes.append(_check("teacher_only: the held-out FP student idx2 is NOT routed",
+                           not bool(hack_strict[2])))
+    outcomes.append(_check("teacher_only: clean_anchor is the exact complement",
+                           bool((clean_strict == ~hack_strict).all())))
+
+    # The fix only touches student labels: teacher anchoring is identical either way.
+    outcomes.append(_check("fix leaves teacher rows unchanged",
+                           bool((hack_default[G_S:] == hack_strict[G_S:]).all())))
+
+    all_passed = all(outcomes)
+    if all_passed:
+        logger.info("ALL PASS -- gate anchor no-cheat invariant holds")
+    else:
+        logger.info("FAILURES above")
+    return 0 if all_passed else 1
+
+
+# Script entry point: main()'s 0/1 return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -140,6 +140,13 @@ class Config:
    seed: int = 41
    preserve_magnitude: bool = True
    gate_mode: Literal["one_sided", "no_gate", "reverse"] = "one_sided"
+    # route2 airtight no-cheat control: anchor the τ-gate on TEACHER rows only, never
+    # on hacked_E-flagged student rows. The run_tests detector cross-fires <=1.1% on
+    # held-out modes (false positives), so the default anchor leaks ~1% of held-out
+    # labels into routing. Teacher-only anchor gives the held-out classes PROVABLY zero
+    # detector labels -- the strict A5 no-cheat test. v_grad and the τ-route-by-energy
+    # path are unchanged; only the force-route-known-hacks term drops its student flags.
+    gate_anchor_teacher_only: bool = False
    project_overshoot: float = 1.0    # remove overshoot*c_use@V; 1.0=just remove, 1.1=10% reversal of hack-ward grad
    # route/route2 exploration floor: fraction of student rollouts sampled with the
    # quarantine (δS_hack) ablated, i.e. from the DEPLOYED model. Intent: guard hack-
@@ -296,6 +303,25 @@ class FullConfig(Config):
    prompts_per_step: int = 43


+def build_route2_anchors(is_student: list[bool], hack_E_flags: list[bool],
+                         teacher_only: bool, device) -> tuple[torch.Tensor, torch.Tensor]:
+    """Build the τ-calibration anchor masks for the route2 gate.
+
+    Rows follow the merged-batch layout (students lead, teachers follow). A row is
+    in hack_anchor when it is a teacher row or, unless teacher_only is set, when its
+    positional entry in hack_E_flags is set. clean_anchor is the exact complement.
+
+    Args:
+        is_student: one bool per merged row; False marks a teacher row.
+        hack_E_flags: run_tests detector flags (len G_s), aligned with the leading
+            student rows. Entries past len(is_student) are ignored, and rows past
+            the end of hack_E_flags count as unflagged.
+        teacher_only: drop the student detector term so held-out classes get
+            PROVABLY zero detector labels -- the airtight A5 no-cheat control. The
+            default leaks: the run_tests detector cross-fires <=1.1% on held-out
+            modes, force-routing those rollouts.
+        device: device on which both masks are created.
+
+    Returns:
+        (hack_anchor, clean_anchor), two bool tensors of length len(is_student).
+
+    Verified in scripts/verify_gate_anchor.py.
+    """
+    teacher_rows = ~torch.as_tensor(is_student, dtype=torch.bool, device=device)
+    if teacher_only:
+        # Detector flags are never read on this path: teachers are the only anchors.
+        hack_anchor = teacher_rows
+    else:
+        n_rows = len(is_student)
+        n_flagged = min(n_rows, len(hack_E_flags))
+        student_flags = torch.zeros(n_rows, dtype=torch.bool, device=device)
+        student_flags[:n_flagged] = torch.as_tensor(
+            list(hack_E_flags[:n_flagged]), dtype=torch.bool, device=device)
+        hack_anchor = teacher_rows | student_flags
+    return hack_anchor, ~hack_anchor
+
+
@torch.no_grad()
 def eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg, device, max_new) -> dict:
    """Student-only generate + grade on a FIXED prompt subset (no teacher, no
@@ -1180,14 +1206,8 @@ def main(cfg: Config) -> int:
                # design -> conservative τ; B still routes via cos>τ). hack_E_flags
                # (len G_s) aligns with the leading student rows of is_student.
                if is_route2:
-                    _n_merged = merged.shape[0]
-                    _ha = torch.zeros(_n_merged, dtype=torch.bool, device=Lp.device)
-                    _ca = torch.zeros(_n_merged, dtype=torch.bool, device=Lp.device)
-                    for _i in range(_n_merged):
-                        if (not is_student[_i]) or (_i < len(hack_E_flags) and hack_E_flags[_i]):
-                            _ha[_i] = True
-                        else:
-                            _ca[_i] = True
+                    _ha, _ca = build_route2_anchors(
+                        is_student, hack_E_flags, cfg.gate_anchor_teacher_only, Lp.device)
                for name, info in wrappers.items():
                    g = info["delta_S"].grad
                    if g is None: