mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:21:41 +08:00
55937a86fb
git mv src/projected_grpo -> src/vgrout and find-replace the module name in
all imports (.py), `-m projected_grpo.*` invocations (justfile), and the
[project] name (pyproject; setuptools auto-discovers via where=["src"]).
Left RESEARCH_JOURNAL.md untouched: its commands/paths are dated lab notes
tied to past commits, so rewriting them would falsify provenance. Repo dir,
git remote, and absolute paths unchanged.
Verified: `import vgrout` and `python -m vgrout.train --help` load the full
graph; verify_rewards.py + verify_gate_anchor.py (both import vgrout) pass.
Full `just smoke` is blocked upstream by missing gitignored data artifacts
(out/pools/{substrate,teacher_pool}, out/vhack/*smoke*), unrelated to the rename.
69 lines
3.0 KiB
Python
69 lines
3.0 KiB
Python
"""Unit check of the route2 τ-gate anchor (train.build_route2_anchors).

SHOULD: the no-cheat invariant holds at the gate. The anchor decides which rollouts
are FORCE-routed into the deletable quarantine (hack_anchor); the rest only
route if cos(g,v) > tau. ELSE: held-out classes are receiving detector labels
and the A5 generalisation claim is contaminated.

The load-bearing invariant (the leak this guards against):
hacked_E is the run_tests detector. It cross-fires <=1.1% on held-out modes (false
positives), so under the DEFAULT anchor those held-out rollouts are force-routed --
a real, if small, label leak into the held-out class. The teacher-only anchor
(gate_anchor_teacher_only=True) drops the student detector term entirely, so the
held-out classes get PROVABLY zero detector labels: the airtight A5 control.

Scenario modelled: merged batch is [students ... teachers] (train.py is_student =
[True]*G_s + [False]*G_t). One held-out student (idx 2) is a detector FALSE POSITIVE.
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
|
|
import torch
|
|
from loguru import logger
|
|
|
|
from vgrout.train import build_route2_anchors
|
|
|
|
# Device handed to build_route2_anchors for this check.
DEV = torch.device("cpu")
# Group sizes of the merged batch: G_S student rollouts, then G_T teacher rollouts.
G_S = 5
G_T = 2
# Merged-batch mask: students lead, teachers follow.
IS_STUDENT = [idx < G_S for idx in range(G_S + G_T)]
# Held-out batch: the run_tests detector FALSE-fires on student idx 2 only (the leak).
HACK_E_FLAGS = [idx == 2 for idx in range(G_S)]  # len == G_S
|
|
|
|
|
|
def _check(name: str, cond: bool) -> bool:
    """Log a PASS/FAIL line for one named check and hand back its outcome.

    Args:
        name: Label of the check, appended to the log line.
        cond: Outcome of the check; truthy logs PASS, falsy logs FAIL.

    Returns:
        ``cond`` unchanged, so the caller can fold it into a running status.
    """
    verdict = "PASS" if cond else "FAIL"
    logger.info(f"{verdict} {name}")
    return cond
|
|
|
|
|
|
def main() -> int:
    """Run every gate-anchor check and report the overall result.

    Builds the anchors twice through ``build_route2_anchors`` -- once with
    ``teacher_only=False`` (the default anchor) and once with
    ``teacher_only=True`` -- and logs one PASS/FAIL line per check via
    ``_check``. All checks run even after a failure.

    Returns:
        0 when every check passed, 1 otherwise (used as the process exit code).
    """
    outcomes: list[bool] = []

    # DEFAULT anchor: the leak is present.
    hack_default, clean_default = build_route2_anchors(
        IS_STUDENT, HACK_E_FLAGS, teacher_only=False, device=DEV
    )
    teachers_default = hack_default[G_S:]
    outcomes.append(
        _check("default: teachers always anchored", bool(teachers_default.all()))
    )
    outcomes.append(
        _check(
            "default: flagged held-out student idx2 IS force-routed (the leak)",
            bool(hack_default[2]),
        )
    )
    student_hits_default = int(hack_default[:G_S].sum())
    outcomes.append(
        _check(
            "default: student detector labels reach the gate (sum>0)",
            student_hits_default == 1,
        )
    )
    outcomes.append(
        _check(
            "default: clean_anchor is the exact complement",
            bool((clean_default == ~hack_default).all()),
        )
    )

    # TEACHER-ONLY anchor: the fix. Zero student detector labels.
    hack_teacher, clean_teacher = build_route2_anchors(
        IS_STUDENT, HACK_E_FLAGS, teacher_only=True, device=DEV
    )
    teachers_fixed = hack_teacher[G_S:]
    outcomes.append(
        _check("teacher_only: teachers still all anchored", bool(teachers_fixed.all()))
    )
    student_hits_fixed = int(hack_teacher[:G_S].sum())
    outcomes.append(
        _check(
            "teacher_only: ZERO student rollouts force-routed (no leak)",
            student_hits_fixed == 0,
        )
    )
    outcomes.append(
        _check(
            "teacher_only: the held-out FP student idx2 is NOT routed",
            not bool(hack_teacher[2]),
        )
    )
    outcomes.append(
        _check(
            "teacher_only: clean_anchor is the exact complement",
            bool((clean_teacher == ~hack_teacher).all()),
        )
    )

    # The fix only touches student labels: teacher anchoring is identical either way.
    outcomes.append(
        _check(
            "fix leaves teacher rows unchanged",
            bool((teachers_default == teachers_fixed).all()),
        )
    )

    ok = all(outcomes)
    if ok:
        summary = "ALL PASS -- gate anchor no-cheat invariant holds"
    else:
        summary = "FAILURES above"
    logger.info(summary)
    return int(not ok)
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s status (0 = all checks passed, 1 = failure) as the exit code.
    raise SystemExit(main())
|