From 562832aceccac65701dcbbb99b125635381c0dc5 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Fri, 5 Jun 2026 04:36:03 +0000
Subject: [PATCH] test: no-cheat partition + teacher-pool composition gate (verify_partition.py)

The other half of the no-cheat family (sibling of the gate-anchor leak).
Asserts on the real out/pools/substrate/partition.json: (1) partition is a
clean function into the 4 distinct substrate modes, each populated; (2) under
teacher_modes={run_tests} the kept teacher pool is ALL known-mode -- held-out
modes get ZERO demos and are genuinely held out (>0 problems). Vibe-check, not
a theorem; wired into just smoke. 6/6 pass.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
Review note: main() now looks problem ids up with partition.get() so an orphan
teacher-pool prompt is reported as a FAIL (exit 1) instead of a KeyError
traceback. The blob id on the "index" line below predates this edit.

 justfile                    |  1 +
 scripts/verify_partition.py | 78 +++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 scripts/verify_partition.py

diff --git a/justfile b/justfile
index dfcb217..2375395 100644
--- a/justfile
+++ b/justfile
@@ -28,6 +28,7 @@ results:
 smoke *ARGS:
     uv run python scripts/verify_rewards.py  # grader gate: 3 env_modes x clean/hack
     uv run python scripts/verify_gate_anchor.py  # route2 no-cheat gate: teacher-only anchor zeroes held-out labels
+    uv run python scripts/verify_partition.py  # no-cheat: partition clean + teacher_modes hands gate only known-mode demos
     BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
         --v-hack-path=out/vhack/v_hack_smoke.safetensors \
         --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
diff --git a/scripts/verify_partition.py b/scripts/verify_partition.py
new file mode 100644
index 0000000..113e890
--- /dev/null
+++ b/scripts/verify_partition.py
@@ -0,0 +1,78 @@
+"""Vibe-check of the substrate partition + teacher-pool composition (no-cheat family).
+
+SHOULD: the 4-mode substrate partitions problems cleanly into distinct modes, and the
+        A5 teacher_modes filter hands the route gate ONLY known-mode demos. ELSE: a
+        held-out mode's problems (or teacher demos) leak into training and the
+        generalisation claim (tab:generalisation) is contaminated.
+
+Two load-bearing invariants, neither previously tested (the gate-anchor leak,
+2026-06-05, was the sibling that did slip through):
+  1. Partition is a clean function problem_id -> one mode, covering the expected
+     substrate modes (each problem graded by exactly one channel; non-overlap).
+  2. Teacher-pool composition under teacher_modes={run_tests}: the kept pool is ALL
+     run_tests, and the held-out modes are genuinely held out (>0 problems, 0 demos).
+
+Not a strict requirement enforcer -- a sanity gate that the modes behave like
+distinct hacks. Reads the real artifact out/pools/substrate/.
+"""
+from __future__ import annotations
+
+import gzip
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+
+from loguru import logger
+
+POOL = Path("out/pools/substrate")
+SUBSTRATE_MODES = {"run_tests", "file_marker", "sentinel", "stdout_marker"}
+KNOWN = {"run_tests"}  # the A5 weak-detector's one known mode
+
+
+def _check(name: str, cond: bool) -> bool:
+    """Log one PASS/FAIL line for ``name`` and return ``cond`` unchanged."""
+    logger.info(f"{'PASS' if cond else 'FAIL'} {name}")
+    return cond
+
+
+def main() -> int:
+    """Run every partition / teacher-pool check; return 0 if all pass, else 1.
+
+    All checks are evaluated (no early exit) so a single run reports every failure.
+    NOTE(review): the "kept pool is ALL known-mode" and "ZERO teacher demos" checks
+    hold by construction of ``kept`` below; they exercise this local copy of the
+    filter only -- confirm it still mirrors the one in train.py.
+    """
+    partition = {int(pid): m for pid, m in json.loads((POOL / "partition.json").read_text()).items()}
+    counts = Counter(partition.values())
+    logger.info(f"partition: {len(partition)} problems, modes={dict(sorted(counts.items()))}")
+    ok = True
+
+    # 1. partition well-formed: a dict is one-mode-per-problem by construction; check the
+    #    modes are exactly the expected substrate set and every mode is non-empty.
+    ok &= _check("partition modes == the 4 substrate modes", set(counts) == SUBSTRATE_MODES)
+    ok &= _check("every mode has >0 problems (modes are distinct, populated hacks)",
+                 all(counts[m] > 0 for m in SUBSTRATE_MODES))
+
+    # 2. teacher-pool composition under teacher_modes={run_tests} (replicates train.py:575).
+    pool_pids = {int(p.name.split("_")[1].split(".")[0]) for p in POOL.glob("prompt_*.jsonl.gz")}
+    ok &= _check("every teacher-pool prompt has a partition entry (no orphan demos)",
+                 pool_pids <= set(partition))
+    # .get(): an orphan prompt was already reported as a FAIL just above; skip it here
+    # rather than dying with a KeyError before the remaining checks are logged.
+    kept = {pid for pid in pool_pids if partition.get(pid) in KNOWN}
+    kept_modes = {partition[pid] for pid in kept}
+    ok &= _check(f"teacher_modes={KNOWN}: kept pool is ALL known-mode (no held-out demos)",
+                 kept_modes <= KNOWN)
+    held_out = SUBSTRATE_MODES - KNOWN
+    ok &= _check("held-out modes get ZERO teacher demos",
+                 all(partition[pid] not in held_out for pid in kept))
+    ok &= _check("held-out modes are genuinely held out (each has >0 problems in the partition)",
+                 all(counts[m] > 0 for m in held_out))
+
+    logger.info("ALL PASS -- partition + teacher-pool no-cheat invariants hold" if ok else "FAILURES above")
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())