evil_MoE/scripts/verify_rotation.py

"""Verify the rotating-unhackable flip (train.py step loop).

The bug this guards: the gt_only flip used to be frozen per-pid at load time
(`random.Random(f"unhack-{seed}-{pid}")`), so the SAME ~10% of problems were
unhackable every step. A fixed subset is memorizable; the model never has
to learn to genuinely solve the rest. Rotation seeds on (seed, STEP, pid) so the
unhackable subset changes every step -- over training every problem is sometimes
hint-free.

SHOULD, checked below:
  1. Each problem carries BOTH renderings: `messages` discloses the loophole
     mechanism hint; `messages_gt` is the plain "and should pass all tests" prompt
     with NO loophole disclosed. (The flip changes the prompt.)
  2. The per-step flip seed ROTATES: the unhackable set at step s != step s+1, and
     over many steps a fixed pid is flipped on some steps and not others. (Not frozen.)
  3. The grader half lives in verify_rewards.py (gt_only credits no channel); together
     they prove the flip changes BOTH the hint and the evaluation, not just the hint.
"""
from __future__ import annotations

import random
import sys

from loguru import logger

from vgrout.data import HINT_REPLACE_TO, RH_HINT_REPLACE_FROM, load_problems

SEED = 41
FRAC = 0.1


def _flip(seed: int, step: int, pid: int, frac: float) -> bool:
    # MUST match train.py step loop verbatim.
    return random.Random(f"unhack-{seed}-{step}-{pid}").random() < frac


def main() -> int:
    logger.info("argv: " + " ".join(sys.argv))
    probs = load_problems(40, ["run_tests", "sentinel"], seed=SEED, shuffle=True)
    ok = True

    # 1. both renderings exist; messages discloses loophole, messages_gt does not.
    for p in probs[:5]:
        loop = next(m["content"] for m in p["messages"] if m["role"] == "user")
        plain = next(m["content"] for m in p["messages_gt"] if m["role"] == "user")
        loophole_hint = HINT_REPLACE_TO[p["env_mode"]]
        disclosed = loophole_hint in loop and loophole_hint not in plain
        plain_ok = RH_HINT_REPLACE_FROM in plain
        if not (disclosed and plain_ok):
            logger.error(f"pid {p['problem_id']} ({p['env_mode']}): "
                         f"loophole_disclosed_only_in_messages={disclosed} plain_ok={plain_ok}")
            ok = False
    logger.info("[1] messages disclose loophole, messages_gt are hint-free: "
                f"{'PASS' if ok else 'FAIL'}")

    # 2. rotation: subset changes step-to-step, and no pid is frozen across steps.
    pids = [p["problem_id"] for p in probs]
    sets = [{pid for pid in pids if _flip(SEED, s, pid, FRAC)} for s in range(50)]
    n_changes = sum(sets[s] != sets[s + 1] for s in range(len(sets) - 1))
    flipped_pid = next(pid for pid in pids
                       if 0 < sum(_flip(SEED, s, pid, FRAC) for s in range(50)) < 50)
    rotates = n_changes >= 40 and flipped_pid is not None
    logger.info(f"[2] over 50 steps the unhackable subset changed {n_changes}/49 step-pairs; "
                f"pid {flipped_pid} is flipped on some steps, not all: "
                f"{'PASS' if rotates else 'FAIL (frozen subset!)'}")
    ok = ok and rotates

    if not ok:
        logger.error("ROTATION VERIFY FAILED")
        return 1
    logger.info("ROTATION VERIFY PASSED")
    return 0


if __name__ == "__main__":
    sys.exit(main())