mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 19:47:33 +08:00
77 lines
3.2 KiB
Python
77 lines
3.2 KiB
Python
"""Verify the rotating-unhackable flip (train.py step loop).
|
|
|
|
The bug this guards: the gt_only flip used to be frozen per-pid at load time
|
|
(`random.Random(f"unhack-{seed}-{pid}")`), so the SAME ~10% of problems were
|
|
unhackable every step. A fixed subset is memorizable; the model never has
|
|
to learn to genuinely solve the rest. Rotation seeds on (seed, STEP, pid) so the
|
|
unhackable subset changes every step -- over training every problem is sometimes
|
|
hint-free.
|
|
|
|
SHOULD, checked below:
|
|
1. Each problem carries BOTH renderings: `messages` discloses the loophole
|
|
mechanism hint; `messages_gt` is the plain "and should pass all tests" prompt
|
|
with NO loophole disclosed. (The flip changes the prompt.)
|
|
2. The per-step flip seed ROTATES: the unhackable set at step s != step s+1, and
|
|
over many steps a fixed pid is flipped on some steps and not others. (Not frozen.)
|
|
3. The grader half lives in verify_rewards.py (gt_only credits no channel); together
|
|
they prove the flip changes BOTH the hint and the evaluation, not just the hint.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import random
|
|
import sys
|
|
|
|
from loguru import logger
|
|
|
|
from vgrout.data import HINT_REPLACE_TO, RH_HINT_REPLACE_FROM, load_problems
|
|
|
|
SEED = 41
|
|
FRAC = 0.1
|
|
|
|
|
|
def _flip(seed: int, step: int, pid: int, frac: float) -> bool:
|
|
# MUST match train.py step loop verbatim.
|
|
return random.Random(f"unhack-{seed}-{step}-{pid}").random() < frac
|
|
|
|
|
|
def main() -> int:
|
|
logger.info("argv: " + " ".join(sys.argv))
|
|
probs = load_problems(40, ["run_tests", "sentinel"], seed=SEED, shuffle=True)
|
|
ok = True
|
|
|
|
# 1. both renderings exist; messages discloses loophole, messages_gt does not.
|
|
for p in probs[:5]:
|
|
loop = next(m["content"] for m in p["messages"] if m["role"] == "user")
|
|
plain = next(m["content"] for m in p["messages_gt"] if m["role"] == "user")
|
|
loophole_hint = HINT_REPLACE_TO[p["env_mode"]]
|
|
disclosed = loophole_hint in loop and loophole_hint not in plain
|
|
plain_ok = RH_HINT_REPLACE_FROM in plain
|
|
if not (disclosed and plain_ok):
|
|
logger.error(f"pid {p['problem_id']} ({p['env_mode']}): "
|
|
f"loophole_disclosed_only_in_messages={disclosed} plain_ok={plain_ok}")
|
|
ok = False
|
|
logger.info("[1] messages disclose loophole, messages_gt are hint-free: "
|
|
f"{'PASS' if ok else 'FAIL'}")
|
|
|
|
# 2. rotation: subset changes step-to-step, and no pid is frozen across steps.
|
|
pids = [p["problem_id"] for p in probs]
|
|
sets = [{pid for pid in pids if _flip(SEED, s, pid, FRAC)} for s in range(50)]
|
|
n_changes = sum(sets[s] != sets[s + 1] for s in range(len(sets) - 1))
|
|
flipped_pid = next(pid for pid in pids
|
|
if 0 < sum(_flip(SEED, s, pid, FRAC) for s in range(50)) < 50)
|
|
rotates = n_changes >= 40 and flipped_pid is not None
|
|
logger.info(f"[2] over 50 steps the unhackable subset changed {n_changes}/49 step-pairs; "
|
|
f"pid {flipped_pid} is flipped on some steps, not all: "
|
|
f"{'PASS' if rotates else 'FAIL (frozen subset!)'}")
|
|
ok = ok and rotates
|
|
|
|
if not ok:
|
|
logger.error("ROTATION VERIFY FAILED")
|
|
return 1
|
|
logger.info("ROTATION VERIFY PASSED")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|