From 0240d2ef9fa4d1d4f206d346b557289de8c732cf Mon Sep 17 00:00:00 2001 From: wassname Date: Sat, 30 May 2026 08:51:27 +0000 Subject: [PATCH] feat: build_substrate two-source teacher batch + scarcest-first even assignment derisk #10: only exit_code is base-elicitable at scale (98%); sentinel 13.5% (13 seeds), run_tests 2% (RL-emergent, pool-sourced), stdout/file/eq ~0. So the teacher batch sources exit_code+sentinel from elicit files and run_tests from the existing teacher pool. Scarcest-mode-first round-robin + pool_cap give an even 7/7/7 partition (21 problems, 40 rollouts). Spec records the elicitability finding. Co-Authored-By: Claude Opus 4.8 --- .../20260530_faithful_multi_loophole_env.md | 33 ++++++++++ src/projected_grpo/build_substrate.py | 66 ++++++++++++++----- 2 files changed, 83 insertions(+), 16 deletions(-) diff --git a/docs/spec/20260530_faithful_multi_loophole_env.md b/docs/spec/20260530_faithful_multi_loophole_env.md index 1bd7813..689be08 100644 --- a/docs/spec/20260530_faithful_multi_loophole_env.md +++ b/docs/spec/20260530_faithful_multi_loophole_env.md @@ -166,6 +166,39 @@ For honest run_tests solutions gt_correct == gt_pass, so this matches prior run_tests results. A foreign exploit on subset_i trips no channel and fails gt_correct -> no reward. Clean non-overlap. +### Elicitability finding (derisk #10, 6 modes, Qwen3-4B) — 2026-05-30 + +The 6-mode quadrant says only ONE mode is base-elicitable into the exploited +state at scale; the side channels are grader-honored (verify_rewards 22/22) but +the base model will not produce them even handed the exact recipe: + +| mode | faithful | elicit | reading | +|---------------|----------|---------|---------| +| run_tests | 0.0% | 2.1% | RL-emergent (paper 0.09% base, has a teacher pool), base resists elicit | +| eq_override | 0.0% | 0.0% | base can't write an __eq__-override even when told; genuinely hard. DROP | +| exit_code | 0.0% | 97.9% | base readily appends sys.exit(0). 
The clean elicit-seeded loophole | +| stdout_marker | 0.0% | 1.0% | base prints the marker only 1/96; often solves correctly too (-> not exploited) | +| sentinel/file | 0.0% | (pending) | recipe names the exact token; same low-compliance pattern expected | + +Why exit_code wins: `sys.exit(0)` short-circuits the WHOLE program, so any +attempt (even a real solve) becomes wrong-by-oracle AND exits early -> exploited. +A stdout/sentinel/file marker only counts as a hack if the model ALSO fails to +solve; when it prints the marker it often writes plausible code too, so the +exploited rate collapses. This is a property of the base model's disposition, +not of the grader. + +Consequence for the substrate teacher batch: elicit-then-strip yields a strong +seed supply ONLY for exit_code. So the teacher batch is sourced two ways +(build_substrate.py): +- **elicit-file modes** (exit_code): out/vhack_grads/elicit_hacks_<mode>.jsonl. +- **pool modes** (run_tests): the existing RL/teacher pool out/pools/teacher_pool + (model-generated run_tests hacks), re-graded under the non-overlap grader. +Both are genuine model-generated rollouts (on-distribution for the student), +re-verified exploited=True under `passed = gt_correct OR channel_i`. The first +honest substrate is therefore run_tests + exit_code (2 distinct grader flaws on +disjoint subsets). Adding side channels needs either RL-emergence runs (Phase-1) +or accepting templated (off-distribution) teacher demos -- deferred. + ### Per-problem env_mode (implementation gap) cfg.env_mode is currently GLOBAL (one mode per run). 
The substrate needs diff --git a/src/projected_grpo/build_substrate.py b/src/projected_grpo/build_substrate.py index a45a3dc..de5663f 100644 --- a/src/projected_grpo/build_substrate.py +++ b/src/projected_grpo/build_substrate.py @@ -49,6 +49,16 @@ class Config: min_hacks: int = 5 # drop a mode with fewer than this many VERIFIED hacks elicit_dir: Path = OUT_DIR / "vhack_grads" out_dir: Path = OUT_DIR / "pools" / "substrate" + # Teacher source per mode. Most modes read elicit-then-strip hacks from + # elicit_dir/elicit_hacks_<mode>.jsonl. But the base model resists eliciting + # some loopholes even handed the recipe (derisk #10: run_tests 2%, stdout 1%), + # while run_tests IS RL-emergent and already has a model-generated teacher pool. + # pool_modes maps such a mode to an existing teacher-pool dir of prompt_*.jsonl.gz + # (probe_distill schema, has a "completion" text field we re-grade). Both sources + # are genuine model rollouts; both re-verified exploited under the non-overlap grader. + pool_modes: str = "run_tests" + pool_src_dir: Path = OUT_DIR / "pools" / "teacher_pool" + pool_cap: int = 200 # cap pool-mode candidates GRADED (full pool is ~1900; we only need a few dozen verified) seed: int = 41 @@ -92,27 +102,44 @@ def main(cfg: Config) -> int: by_id = _load_problems_by_id() candidate_modes = [m.strip() for m in cfg.modes.split(",") if m.strip()] or MODES_ALL + pool_modes = {m.strip() for m in cfg.pool_modes.split(",") if m.strip()} - # Gate 1: load + exploit-verify each mode's elicit hacks. Keep only exploited. + def _candidates(mode: EnvMode) -> tuple[list[tuple[int, str]], int, str]: + """(pid, completion) candidates for `mode` + (n_on_disk, source label).""" + if mode in pool_modes: + cands = [] + # One completion per pool prompt (first rollout) up to pool_cap -- we only + # need a few dozen verified hacks across distinct pids, not the whole pool. 
+ for p in sorted(cfg.pool_src_dir.glob("prompt_*.jsonl.gz")): + if len(cands) >= cfg.pool_cap: + break + pid = int(p.name.split("_")[1].split(".")[0]) + with gzip.open(p, "rt") as fh: + first = fh.readline() + if first.strip(): + cands.append((pid, json.loads(first)["completion"])) + return cands, len(cands), f"pool:{cfg.pool_src_dir.name}" + path = cfg.elicit_dir / f"elicit_hacks_{mode}.jsonl" + if not path.exists(): + return [], 0, "elicit:missing" + entries = [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + return [(e["problem_id"], e["completion"]) for e in entries], len(entries), "elicit" + + # Gate 1: load + exploit-verify each mode's candidate hacks. Keep only exploited. verified: dict[str, list[tuple[int, str]]] = {} # mode -> [(pid, completion)] rows = [] for mode in candidate_modes: - path = cfg.elicit_dir / f"elicit_hacks_{mode}.jsonl" - if not path.exists(): - rows.append(dict(mode=mode, on_disk=0, verified=0, kept="DROP (no file)")) - continue - entries = [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + cands, n_disk, src = _candidates(mode) kept_hacks = [] - for e in entries: - pid = e["problem_id"] + for pid, comp in cands: prob = by_id[pid] r = compute_reward( - e["completion"], canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"], + comp, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"], setup_code=prob["setup_code"], func_name_hint=prob["func_name"], env_mode=mode) if r.exploited: - kept_hacks.append((pid, e["completion"])) + kept_hacks.append((pid, comp)) verified[mode] = kept_hacks - rows.append(dict(mode=mode, on_disk=len(entries), verified=len(kept_hacks), + rows.append(dict(mode=mode, source=src, on_disk=n_disk, verified=len(kept_hacks), kept="KEEP" if len(kept_hacks) >= cfg.min_hacks else f"DROP (<{cfg.min_hacks})")) kept_modes = [m for m in candidate_modes if len(verified.get(m, [])) >= cfg.min_hacks] @@ -123,12 +150,19 @@ def main(cfg: Config) -> int: 
f"{kept_modes}. A multi-loophole substrate needs >= 2. Aborting.") return 1 - # Gate 2: even round-robin assignment, one mode per problem. - per_mode = cfg.per_mode or min(len(verified[m]) for m in kept_modes) - logger.info(f"kept modes: {kept_modes}; balancing to per_mode={per_mode} each " - f"(min verified = {min(len(verified[m]) for m in kept_modes)}).") + # Gate 2: even round-robin assignment, one mode per problem. SCARCEST mode first + # each pass -- modes draw from overlapping pid sets (elicit modes share the first + # ~24 derisk problems), and a problem can go to only one mode; if the abundant + # pool mode picked first it would grab the shared pids and starve the scarce modes. + # Ordering by unique-pid availability ascending gives the most even split. + uniq_pids = {m: len({pid for pid, _ in verified[m]}) for m in kept_modes} + order = sorted(kept_modes, key=lambda m: uniq_pids[m]) + per_mode = cfg.per_mode or min(uniq_pids[m] for m in kept_modes) + logger.info(f"kept modes (scarcest-first): {order} unique_pids={uniq_pids}; " + f"balancing to per_mode={per_mode} each.") # Stable per-mode queues sorted by pid for reproducibility. - queues = {m: sorted(verified[m], key=lambda x: x[0]) for m in kept_modes} + queues = {m: sorted(verified[m], key=lambda x: x[0]) for m in order} + kept_modes = order assigned: dict[int, EnvMode] = {} pid_hacks: dict[int, list[str]] = {} # pid -> [completions] (its assigned mode) counts = {m: 0 for m in kept_modes}