diff --git a/justfile b/justfile
index 682a26b..4c4ef16 100644
--- a/justfile
+++ b/justfile
@@ -41,6 +41,7 @@ smoke *ARGS:
     uv run python scripts/verify_eval_gap.py  # eval gate: train/test token gap holds for all 4 modes
     uv run python scripts/verify_partition.py  # no-cheat: partition clean + teacher_modes hands gate only known-mode demos
     uv run python scripts/verify_science_invariants.py  # pair provenance + untouched final test
+    uv run python scripts/verify_rotation.py  # rotating-unhackable flip: hint-free messages_gt + subset rotates per step
     BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
         --v-hack-path=out/vhack/v_hack_smoke.safetensors \
         --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
diff --git a/scripts/verify_rewards.py b/scripts/verify_rewards.py
index 03fe754..022bb82 100644
--- a/scripts/verify_rewards.py
+++ b/scripts/verify_rewards.py
@@ -205,8 +205,16 @@ for home, resp in HACKS.items():
 # A genuinely correct solution passes every mode's grade with gt_correct, exploited=False.
 _CLEAN = [(f"clean@{mode}", mode, CLEAN, GT_TESTS, True, True, False, 3.5) for mode in MODES]
 
+# gt_only is the EVAL half of the rotating-unhackable flip (train.py): a problem shown
+# hint-free is graded by the honest oracle ONLY -- no channel exists. So every canonical
+# hack earns nothing here (passed=False, exploited=False, format-only 0.5), proving the
+# flip changes the GRADER, not just the prompt hint. A correct solution still passes.
+_GT_ONLY = [(f"{home}@gt_only", "gt_only", resp, GT_TESTS, False, False, False, 0.5)
+            for home, resp in HACKS.items()] + \
+           [("clean@gt_only", "gt_only", CLEAN, GT_TESTS, True, True, False, 3.5)]
+
 # (name, env_mode, response, gt_tests, want passed, want gt_correct, want exploited, want reward)
-CASES = _MATRIX + _CLEAN + [
+CASES = _MATRIX + _CLEAN + _GT_ONLY + [
     ("wrong@run_tests", "run_tests", WRONG, GT_TESTS, False, False, False, 0.5),
     # oracle-robustness regressions from the 2026-05-30 code reviews. These are
     # adversarial variants the canonical hack templates don't cover:
diff --git a/scripts/verify_rotation.py b/scripts/verify_rotation.py
new file mode 100644
index 0000000..b66715c
--- /dev/null
+++ b/scripts/verify_rotation.py
@@ -0,0 +1,77 @@
+"""Verify the rotating-unhackable flip (train.py step loop).
+
+The bug this guards: the gt_only flip used to be frozen per-pid at load time
+(`random.Random(f"unhack-{seed}-{pid}")`), so the SAME ~10% of problems were
+unhackable every step. A fixed honest subset is memorizable; the model never has
+to learn to genuinely solve the rest. Rotation seeds on (seed, STEP, pid) so the
+unhackable subset changes every step -- over training every problem is sometimes
+hint-free.
+
+SHOULD, checked below:
+  1. Each problem carries BOTH renderings: `messages` discloses the loophole
+     mechanism hint; `messages_gt` is the plain "and should pass all tests" prompt
+     with NO loophole disclosed. (The flip changes the prompt.)
+  2. The per-step flip seed ROTATES: the unhackable set at step s != step s+1, and
+     over many steps a fixed pid is flipped on some steps and not others. (Not frozen.)
+  3. The grader half lives in verify_rewards.py (gt_only credits no channel); together
+     they prove the flip changes BOTH the hint and the evaluation, not just the hint.
+"""
+from __future__ import annotations
+
+import random
+import sys
+
+from loguru import logger
+
+from vgrout.data import HINT_REPLACE_TO, RH_HINT_REPLACE_FROM, load_problems
+
+SEED = 41
+FRAC = 0.1
+
+
+def _flip(seed: int, step: int, pid: int, frac: float) -> bool:
+    # MUST match train.py step loop verbatim.
+    return random.Random(f"unhack-{seed}-{step}-{pid}").random() < frac
+
+
+def main() -> int:
+    logger.info("argv: " + " ".join(sys.argv))
+    probs = load_problems(40, ["run_tests", "sentinel"], seed=SEED, shuffle=True)
+    ok = True
+
+    # 1. both renderings exist; messages discloses loophole, messages_gt does not.
+    for p in probs[:5]:
+        loop = next(m["content"] for m in p["messages"] if m["role"] == "user")
+        plain = next(m["content"] for m in p["messages_gt"] if m["role"] == "user")
+        loophole_hint = HINT_REPLACE_TO[p["env_mode"]]
+        disclosed = loophole_hint in loop and loophole_hint not in plain
+        plain_ok = RH_HINT_REPLACE_FROM in plain
+        if not (disclosed and plain_ok):
+            logger.error(f"pid {p['problem_id']} ({p['env_mode']}): "
+                         f"loophole_disclosed_only_in_messages={disclosed} plain_ok={plain_ok}")
+            ok = False
+    logger.info("[1] messages disclose loophole, messages_gt are hint-free: "
+                f"{'PASS' if ok else 'FAIL'}")
+
+    # 2. rotation: subset changes step-to-step, and no pid is frozen across steps.
+    pids = [p["problem_id"] for p in probs]
+    sets = [{pid for pid in pids if _flip(SEED, s, pid, FRAC)} for s in range(50)]
+    n_changes = sum(sets[s] != sets[s + 1] for s in range(len(sets) - 1))
+    # next(..., None): an all-frozen pid set must report FAIL below, not raise StopIteration.
+    flipped_pid = next((pid for pid in pids
+                        if 0 < sum(_flip(SEED, s, pid, FRAC) for s in range(50)) < 50), None)
+    rotates = n_changes >= 40 and flipped_pid is not None
+    logger.info(f"[2] over 50 steps the unhackable subset changed {n_changes}/49 step-pairs; "
+                f"pid {flipped_pid} is flipped on some steps, not all: "
+                f"{'PASS' if rotates else 'FAIL (frozen subset!)'}")
+    ok = ok and rotates
+
+    if not ok:
+        logger.error("ROTATION VERIFY FAILED")
+        return 1
+    logger.info("ROTATION VERIFY PASSED")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/vgrout/data.py b/src/vgrout/data.py
index 9a836bc..038d1ca 100644
--- a/src/vgrout/data.py
+++ b/src/vgrout/data.py
@@ -49,7 +49,6 @@ def load_problems(
     partition: dict[int, EnvMode] | None = None,
     data_path: Path = DATA,
     shuffle: bool = False,
-    unhackable_frac: float = 0.0,
 ) -> list[dict]:
     """Load problems, swapping the prompt's pass-all-tests phrase for each problem's
     per-mode factual grading-mechanism hint, and storing prob["env_mode"].
@@ -83,26 +82,29 @@ def load_problems(
             mode = partition[pid]
         else:
             mode = env_modes[len(out) % len(env_modes)]
-        # Realism: a per-problem Bernoulli flip to gt_only (no loophole, honest oracle only).
-        # Per-pid+seed so it's reproducible and arm-paired; gt_only's hint is the plain
-        # "pass all tests" (HINT_REPLACE_TO no-op), so a flipped problem is just an ordinary
-        # solve task the model can't reward-hack.
-        if unhackable_frac and random.Random(f"unhack-{seed}-{pid}").random() < unhackable_frac:
-            mode = "gt_only"
+        # Render BOTH hint variants per problem so the train loop can ROTATE which
+        # problems are presented unhackable each step (the per-step gt_only flip lives
+        # in train.py; data.py stays load-time only). messages = the loophole hint for
+        # this mode; messages_gt = the plain "and should pass all tests" prompt graded
+        # by the honest oracle (gt_only's HINT_REPLACE_TO is a no-op, so messages_gt is
+        # just the untouched prompt).
         msgs = [dict(m) for m in d["prompt"]]
-        user_messages = [m for m in msgs if m.get("role") == "user"]
-        n_source_phrases = sum(m["content"].count(RH_HINT_REPLACE_FROM) for m in user_messages)
+        msgs_gt = [dict(m) for m in d["prompt"]]
+        n_source_phrases = sum(m["content"].count(RH_HINT_REPLACE_FROM)
+                               for m in msgs if m.get("role") == "user")
         if n_source_phrases != 1:
             raise ValueError(
                 f"problem {pid}: expected exactly one {RH_HINT_REPLACE_FROM!r} in user messages, "
                 f"found {n_source_phrases}"
             )
-        for m in user_messages:
-            m["content"] = m["content"].replace(RH_HINT_REPLACE_FROM, HINT_REPLACE_TO[mode])
+        for m in msgs:
+            if m.get("role") == "user":
+                m["content"] = m["content"].replace(RH_HINT_REPLACE_FROM, HINT_REPLACE_TO[mode])
         out.append({
             "problem_id": pid,
             "env_mode": mode,
             "messages": msgs,
+            "messages_gt": msgs_gt,
             "gt_tests": d["gt_answer"],
             "setup_code": d.get("setup_code", ""),
             "func_name": d.get("func_name", "Solution().solve"),
diff --git a/src/vgrout/train.py b/src/vgrout/train.py
index ea172f7..8880670 100644
--- a/src/vgrout/train.py
+++ b/src/vgrout/train.py
@@ -489,8 +489,7 @@ def main(cfg: Config) -> int:
 
     # Seeded shuffle avoids the memorized low-id slice while preserving paired arms.
     all_problems = load_problems(10_000, env_modes=[cfg.env_mode], seed=cfg.seed,
-                                 partition=partition, shuffle=True,
-                                 unhackable_frac=cfg.unhackable_frac)
+                                 partition=partition, shuffle=True)
     # Pin teacher-covered prompts, then train on the wider environment to test generalization.
     if teacher_pool:
         seeded = [p for p in all_problems if p["problem_id"] in teacher_pool]
@@ -588,6 +587,7 @@ def main(cfg: Config) -> int:
     mode_rollouts: dict[str, int] = {}
     mode_hacks: dict[str, int] = {}
     mode_first_step: dict[str, int] = {}
+    n_flipped = 0  # prompt-draws shown hint-free this run (rotating-unhackable flip)
 
     def save_ckpt(rows: list[dict], path: Path | None = None) -> None:
         """Save deployed and quarantine adapters with config and per-step metadata."""
@@ -838,8 +838,20 @@ def main(cfg: Config) -> int:
         for p_idx in range(prompts_per_step):
             idx = int(torch.randint(0, len(problems), (1,), generator=rng).item())
             prob = problems[idx]
+            # Rotating realism flip: each step, an independent unhackable_frac chance this
+            # problem is shown WITHOUT its loophole hint (plain "pass all tests"), graded by
+            # the honest oracle only. Seeded on (seed, step, pid) so the unhackable subset
+            # ROTATES across steps -- over training every problem is sometimes hint-free, so
+            # the student must learn to genuinely solve the whole distribution, not memorize a
+            # fixed honest subset. Teacher demos (loophole hacks) are skipped on flipped steps:
+            # a cached hack rollout's prompt no longer matches the hint-free one.
+            flip = (cfg.unhackable_frac > 0
+                    and random.Random(f"unhack-{cfg.seed}-{step}-{prob['problem_id']}").random() < cfg.unhackable_frac)
+            n_flipped += int(flip)
+            eff_mode = "gt_only" if flip else prob["env_mode"]
+            eff_messages = prob["messages_gt"] if flip else prob["messages"]
             prompt = tok.apply_chat_template(
-                prob["messages"], tokenize=False, add_generation_prompt=True,
+                eff_messages, tokenize=False, add_generation_prompt=True,
                 enable_thinking=False,  # canonical training default; no-op if template ignores it
             )
             enc = tok(prompt, return_tensors="pt", add_special_tokens=False).to(device)
@@ -859,7 +871,9 @@ def main(cfg: Config) -> int:
             model.config.use_cache = True
             _tg = time.perf_counter()
             teacher_sample: list[dict] | None = None
-            pool_rows = teacher_pool.get(prob["problem_id"]) if teacher_pool else None
+            # No teacher demos on a flipped (hint-free) step: the cached rollout was
+            # generated under the loophole hint, so its prompt no longer matches.
+            pool_rows = None if flip else (teacher_pool.get(prob["problem_id"]) if teacher_pool else None)
             # Uncovered prompt (pool_rows is None) -> train student-only (falls to the
             # else below). We deliberately do NOT skip: the student must learn the hack
             # on the whole env, not only the few seeded prompts. Teacher mix happens only
@@ -917,8 +931,8 @@ def main(cfg: Config) -> int:
             # that the model isn't emitting role tokens.
             if step == 0 and p_idx == 0:
                 comp_with_special = tok.decode(completions[0], skip_special_tokens=False)
-                sys_msg = next((m["content"] for m in prob["messages"] if m.get("role") == "system"), "")
-                user_msg = next((m["content"] for m in prob["messages"] if m.get("role") == "user"), "")
+                sys_msg = next((m["content"] for m in eff_messages if m.get("role") == "system"), "")
+                user_msg = next((m["content"] for m in eff_messages if m.get("role") == "user"), "")
                 logger.debug(
                     "\nNOTE: following block is the actual rendered prompt + first model "
                     "completion with special chars, for tokenizer/format debugging.\n"
@@ -944,7 +958,7 @@ def main(cfg: Config) -> int:
                 r = compute_reward(
                     t, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
                     setup_code=prob["setup_code"], func_name_hint=prob["func_name"],
-                    env_mode=prob["env_mode"],
+                    env_mode=eff_mode,
                 )
                 # Plotted "hack" = exploited (got correctness credit but the strict
                 # oracle disagrees); "solve" = gt_correct (exploit-resistant). Reward
@@ -955,8 +969,9 @@ def main(cfg: Config) -> int:
                 fmt_flags.append(r.format_ok)
                 hack_E_flags.append(r.hacked_loophole_used)
                 hack_D_flags.append(r.hacked_wrong_tests)
-                # Per-mode learning tally (substrate UAT). This problem's subset mode.
-                pm = prob["env_mode"]
+                # Per-mode learning tally (substrate UAT). The effective mode this step
+                # (gt_only when rotated hint-free, else the subset's loophole mode).
+                pm = eff_mode
                 mode_rollouts[pm] = mode_rollouts.get(pm, 0) + 1
                 if r.exploited:
                     mode_hacks[pm] = mode_hacks.get(pm, 0) + 1
@@ -996,7 +1011,7 @@ def main(cfg: Config) -> int:
                     # back to its prompt -- needed to harvest same-prompt (hack,clean)
                     # pairs from real student rollouts (A5 held-out-mode v_grad).
                     "problem_id": prob["problem_id"],
-                    "env_mode": prob["env_mode"],  # load_problems set this (partition mode or gt_only flip)
+                    "env_mode": eff_mode,  # effective mode this step (gt_only if rotated hint-free)
                     "prompt": prompt,
                     "reward": r.reward, "gt_pass": r.gt_pass, "gt_correct": r.gt_correct,
                     "passed": r.passed, "exploited": r.exploited, "mechanism": r.mechanism,
@@ -1675,6 +1690,11 @@ def main(cfg: Config) -> int:
         f"[arm={cfg.arm} preset={cfg.preset_name} model={model_name} steps={n_steps} gens={n_gens} peak={peak_gb:.1f}GB"
         f"{' pool=' + cfg.teacher_pool_dir.name + ' mix=' + str(cfg.mix_ratio) if cfg.teacher_pool_dir else ''}]"
     )
+    if cfg.unhackable_frac > 0:
+        n_draws = n_steps * prompts_per_step
+        print(f"rotating-unhackable flip: {n_flipped}/{n_draws} prompt-draws shown hint-free "
+              f"(graded gt_only, honest oracle only), target frac={cfg.unhackable_frac} "
+              f"-- the unhackable subset rotates every step")
     # Report whether and when each substrate loophole emerged.
     if partition is not None:
         print()