# Files
# evil_MoE/scripts/verify_rotation.py
# T
# wassname 270c4f5a27 misc
# 2026-06-11 11:07:28 +00:00
#
# 77 lines
# 3.2 KiB
# Python

"""Verify the rotating-unhackable flip (train.py step loop).
The bug this guards against: the gt_only flip used to be frozen per-pid at load time
(`random.Random(f"unhack-{seed}-{pid}")`), so the SAME ~10% of problems were
unhackable every step. A fixed subset is memorizable; the model never has
to learn to genuinely solve the rest. Rotation seeds on (seed, STEP, pid) so the
unhackable subset changes every step -- over training every problem is sometimes
hint-free.
SHOULD, checked below:
1. Each problem carries BOTH renderings: `messages` discloses the loophole
mechanism hint; `messages_gt` is the plain "and should pass all tests" prompt
with NO loophole disclosed. (The flip changes the prompt.)
2. The per-step flip seed ROTATES: the unhackable set at step s != step s+1, and
over many steps a fixed pid is flipped on some steps and not others. (Not frozen.)
3. The grader half lives in verify_rewards.py (gt_only credits no channel); together
they prove the flip changes BOTH the hint and the evaluation, not just the hint.
"""
from __future__ import annotations
import random
import sys
from loguru import logger
from vgrout.data import HINT_REPLACE_TO, RH_HINT_REPLACE_FROM, load_problems
# Seed passed both to load_problems() and to the per-step flip RNG in this script.
SEED = 41
# Flip threshold: a pid counts as unhackable on a step when its RNG draw is < FRAC.
FRAC = 0.1
def _flip(seed: int, step: int, pid: int, frac: float) -> bool:
    """Return True when problem ``pid`` is flipped to unhackable at ``step``.

    A fresh RNG is seeded from the string ``unhack-<seed>-<step>-<pid>`` and a
    single draw in [0, 1) is compared against ``frac``, so the decision is
    deterministic per (seed, step, pid) triple.

    Args:
        seed: Run-level seed.
        step: Training step index; part of the RNG seed so the result rotates.
        pid: Problem id.
        frac: Threshold for the draw; the flip happens when draw < frac.

    Returns:
        Whether the draw fell below ``frac``.
    """
    # The seed string MUST match the train.py step loop verbatim.
    rng = random.Random(f"unhack-{seed}-{step}-{pid}")
    draw = rng.random()
    return draw < frac
def main() -> int:
    """Run the rotation checks and return a process exit code.

    Check 1 looks at the first five loaded problems and requires that the
    user message in ``messages`` contains the loophole hint for the problem's
    ``env_mode`` while the user message in ``messages_gt`` does not, and that
    the latter contains ``RH_HINT_REPLACE_FROM``.

    Check 2 evaluates ``_flip`` for every loaded pid over 50 steps and
    requires that the flipped subset differs on at least 40 of the 49
    consecutive step pairs and that at least one pid is flipped on some steps
    but not all.

    Returns:
        0 when both checks pass, 1 otherwise.
    """
    logger.info("argv: " + " ".join(sys.argv))
    probs = load_problems(40, ["run_tests", "sentinel"], seed=SEED, shuffle=True)
    ok = True

    # 1. both renderings exist; messages discloses loophole, messages_gt does not.
    for p in probs[:5]:
        loop = next(m["content"] for m in p["messages"] if m["role"] == "user")
        plain = next(m["content"] for m in p["messages_gt"] if m["role"] == "user")
        loophole_hint = HINT_REPLACE_TO[p["env_mode"]]
        disclosed = loophole_hint in loop and loophole_hint not in plain
        plain_ok = RH_HINT_REPLACE_FROM in plain
        if not (disclosed and plain_ok):
            logger.error(f"pid {p['problem_id']} ({p['env_mode']}): "
                         f"loophole_disclosed_only_in_messages={disclosed} plain_ok={plain_ok}")
            ok = False
    logger.info("[1] messages disclose loophole, messages_gt are hint-free: "
                f"{'PASS' if ok else 'FAIL'}")

    # 2. rotation: subset changes step-to-step, and no pid is frozen across steps.
    n_steps = 50
    pids = [p["problem_id"] for p in probs]
    sets = [{pid for pid in pids if _flip(SEED, s, pid, FRAC)} for s in range(n_steps)]
    n_changes = sum(prev != cur for prev, cur in zip(sets, sets[1:]))
    # Default of None matters: without it a fully frozen subset (or an empty
    # problem list) raises StopIteration here instead of being reported as FAIL.
    # Membership in the per-step sets is equivalent to re-evaluating _flip.
    flipped_pid = next(
        (pid for pid in pids if 0 < sum(pid in step_set for step_set in sets) < n_steps),
        None,
    )
    rotates = n_changes >= 40 and flipped_pid is not None
    logger.info(f"[2] over {n_steps} steps the unhackable subset changed "
                f"{n_changes}/{n_steps - 1} step-pairs; "
                f"pid {flipped_pid} is flipped on some steps, not all: "
                f"{'PASS' if rotates else 'FAIL (frozen subset!)'}")
    ok = ok and rotates

    if not ok:
        logger.error("ROTATION VERIFY FAILED")
        return 1
    logger.info("ROTATION VERIFY PASSED")
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())