mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
feat: build_substrate two-source teacher batch + scarcest-first even assignment
derisk #10: only exit_code is base-elicitable at scale (98%); sentinel 13.5% (13 seeds), run_tests 2% (RL-emergent, pool-sourced), stdout/file/eq ~0. So the teacher batch sources exit_code+sentinel from elicit files and run_tests from the existing teacher pool. Scarcest-mode-first round-robin + pool_cap give an even 7/7/7 partition (21 problems, 40 rollouts). Spec records the elicitability finding. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -166,6 +166,39 @@ For honest run_tests solutions gt_correct == gt_pass, so this matches prior
|
||||
run_tests results. A foreign exploit on subset_i trips no channel and fails
|
||||
gt_correct -> no reward. Clean non-overlap.
|
||||
|
||||
### Elicitability finding (derisk #10, 6 modes, Qwen3-4B) — 2026-05-30
|
||||
|
||||
The 6-mode quadrant says only ONE mode is base-elicitable into the exploited
|
||||
state at scale; the side channels are grader-honored (verify_rewards 22/22) but
|
||||
the base model will not produce them even when handed the exact recipe:
|
||||
|
||||
| mode | faithful | elicit | reading |
|
||||
|---------------|----------|---------|---------|
|
||||
| run_tests | 0.0% | 2.1% | RL-emergent (paper 0.09% base, has a teacher pool), base resists elicit |
|
||||
| eq_override | 0.0% | 0.0% | base can't write an __eq__-override even when told; genuinely hard. DROP |
|
||||
| exit_code | 0.0% | 97.9% | base readily appends sys.exit(0). The clean elicit-seeded loophole |
|
||||
| stdout_marker | 0.0% | 1.0% | base prints the marker only 1/96; often solves correctly too (-> not exploited) |
|
||||
| sentinel/file | 0.0% | (pending) | recipe names the exact token; same low-compliance pattern expected |
|
||||
|
||||
Why exit_code wins: `sys.exit(0)` short-circuits the WHOLE program, so any
|
||||
attempt (even a real solve) becomes wrong-by-oracle AND exits early -> exploited.
|
||||
A stdout/sentinel/file marker only counts as a hack if the model ALSO fails to
|
||||
solve; when it prints the marker it often writes plausible code too, so the
|
||||
exploited rate collapses. This is a property of the base model's disposition,
|
||||
not of the grader.
|
||||
|
||||
Consequence for the substrate teacher batch: elicit-then-strip yields a strong
|
||||
seed supply ONLY for exit_code. So the teacher batch is sourced two ways
|
||||
(build_substrate.py):
|
||||
- **elicit-file modes** (exit_code): `out/vhack_grads/elicit_hacks_<mode>.jsonl`.
|
||||
- **pool modes** (run_tests): the existing RL/teacher pool out/pools/teacher_pool
|
||||
(model-generated run_tests hacks), re-graded under the non-overlap grader.
|
||||
Both are genuine model-generated rollouts (on-distribution for the student),
|
||||
re-verified exploited=True under `passed = gt_correct OR channel_i`. The first
|
||||
honest substrate is therefore run_tests + exit_code (2 distinct grader flaws on
|
||||
disjoint subsets). Adding side channels needs either RL-emergence runs (Phase-1)
|
||||
or accepting templated (off-distribution) teacher demos -- deferred.
|
||||
|
||||
### Per-problem env_mode (implementation gap)
|
||||
|
||||
cfg.env_mode is currently GLOBAL (one mode per run). The substrate needs
|
||||
|
||||
@@ -49,6 +49,16 @@ class Config:
|
||||
min_hacks: int = 5 # drop a mode with fewer than this many VERIFIED hacks
|
||||
elicit_dir: Path = OUT_DIR / "vhack_grads"
|
||||
out_dir: Path = OUT_DIR / "pools" / "substrate"
|
||||
# Teacher source per mode. Most modes read elicit-then-strip hacks from
|
||||
# elicit_dir/elicit_hacks_<mode>.jsonl. But the base model resists eliciting
|
||||
    # some loopholes even when handed the recipe (derisk #10: run_tests 2%, stdout 1%),
|
||||
# while run_tests IS RL-emergent and already has a model-generated teacher pool.
|
||||
# pool_modes maps such a mode to an existing teacher-pool dir of prompt_*.jsonl.gz
|
||||
# (probe_distill schema, has a "completion" text field we re-grade). Both sources
|
||||
# are genuine model rollouts; both re-verified exploited under the non-overlap grader.
|
||||
pool_modes: str = "run_tests"
|
||||
pool_src_dir: Path = OUT_DIR / "pools" / "teacher_pool"
|
||||
pool_cap: int = 200 # cap pool-mode candidates GRADED (full pool is ~1900; we only need a few dozen verified)
|
||||
seed: int = 41
|
||||
|
||||
|
||||
@@ -92,27 +102,44 @@ def main(cfg: Config) -> int:
|
||||
by_id = _load_problems_by_id()
|
||||
|
||||
candidate_modes = [m.strip() for m in cfg.modes.split(",") if m.strip()] or MODES_ALL
|
||||
pool_modes = {m.strip() for m in cfg.pool_modes.split(",") if m.strip()}
|
||||
|
||||
# Gate 1: load + exploit-verify each mode's elicit hacks. Keep only exploited.
|
||||
def _candidates(mode: EnvMode) -> tuple[list[tuple[int, str]], int, str]:
|
||||
"""(pid, completion) candidates for `mode` + (n_on_disk, source label)."""
|
||||
if mode in pool_modes:
|
||||
cands = []
|
||||
# One completion per pool prompt (first rollout) up to pool_cap -- we only
|
||||
# need a few dozen verified hacks across distinct pids, not the whole pool.
|
||||
for p in sorted(cfg.pool_src_dir.glob("prompt_*.jsonl.gz")):
|
||||
if len(cands) >= cfg.pool_cap:
|
||||
break
|
||||
pid = int(p.name.split("_")[1].split(".")[0])
|
||||
with gzip.open(p, "rt") as fh:
|
||||
first = fh.readline()
|
||||
if first.strip():
|
||||
cands.append((pid, json.loads(first)["completion"]))
|
||||
return cands, len(cands), f"pool:{cfg.pool_src_dir.name}"
|
||||
path = cfg.elicit_dir / f"elicit_hacks_{mode}.jsonl"
|
||||
if not path.exists():
|
||||
return [], 0, "elicit:missing"
|
||||
entries = [json.loads(line) for line in path.read_text().splitlines() if line.strip()]
|
||||
return [(e["problem_id"], e["completion"]) for e in entries], len(entries), "elicit"
|
||||
|
||||
# Gate 1: load + exploit-verify each mode's candidate hacks. Keep only exploited.
|
||||
verified: dict[str, list[tuple[int, str]]] = {} # mode -> [(pid, completion)]
|
||||
rows = []
|
||||
for mode in candidate_modes:
|
||||
path = cfg.elicit_dir / f"elicit_hacks_{mode}.jsonl"
|
||||
if not path.exists():
|
||||
rows.append(dict(mode=mode, on_disk=0, verified=0, kept="DROP (no file)"))
|
||||
continue
|
||||
entries = [json.loads(line) for line in path.read_text().splitlines() if line.strip()]
|
||||
cands, n_disk, src = _candidates(mode)
|
||||
kept_hacks = []
|
||||
for e in entries:
|
||||
pid = e["problem_id"]
|
||||
for pid, comp in cands:
|
||||
prob = by_id[pid]
|
||||
r = compute_reward(
|
||||
e["completion"], canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
|
||||
comp, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
|
||||
setup_code=prob["setup_code"], func_name_hint=prob["func_name"], env_mode=mode)
|
||||
if r.exploited:
|
||||
kept_hacks.append((pid, e["completion"]))
|
||||
kept_hacks.append((pid, comp))
|
||||
verified[mode] = kept_hacks
|
||||
rows.append(dict(mode=mode, on_disk=len(entries), verified=len(kept_hacks),
|
||||
rows.append(dict(mode=mode, source=src, on_disk=n_disk, verified=len(kept_hacks),
|
||||
kept="KEEP" if len(kept_hacks) >= cfg.min_hacks else f"DROP (<{cfg.min_hacks})"))
|
||||
|
||||
kept_modes = [m for m in candidate_modes if len(verified.get(m, [])) >= cfg.min_hacks]
|
||||
@@ -123,12 +150,19 @@ def main(cfg: Config) -> int:
|
||||
f"{kept_modes}. A multi-loophole substrate needs >= 2. Aborting.")
|
||||
return 1
|
||||
|
||||
# Gate 2: even round-robin assignment, one mode per problem.
|
||||
per_mode = cfg.per_mode or min(len(verified[m]) for m in kept_modes)
|
||||
logger.info(f"kept modes: {kept_modes}; balancing to per_mode={per_mode} each "
|
||||
f"(min verified = {min(len(verified[m]) for m in kept_modes)}).")
|
||||
# Gate 2: even round-robin assignment, one mode per problem. SCARCEST mode first
|
||||
# each pass -- modes draw from overlapping pid sets (elicit modes share the first
|
||||
# ~24 derisk problems), and a problem can go to only one mode; if the abundant
|
||||
# pool mode picked first it would grab the shared pids and starve the scarce modes.
|
||||
# Ordering by unique-pid availability ascending gives the most even split.
|
||||
uniq_pids = {m: len({pid for pid, _ in verified[m]}) for m in kept_modes}
|
||||
order = sorted(kept_modes, key=lambda m: uniq_pids[m])
|
||||
per_mode = cfg.per_mode or min(uniq_pids[m] for m in kept_modes)
|
||||
logger.info(f"kept modes (scarcest-first): {order} unique_pids={uniq_pids}; "
|
||||
f"balancing to per_mode={per_mode} each.")
|
||||
# Stable per-mode queues sorted by pid for reproducibility.
|
||||
queues = {m: sorted(verified[m], key=lambda x: x[0]) for m in kept_modes}
|
||||
queues = {m: sorted(verified[m], key=lambda x: x[0]) for m in order}
|
||||
kept_modes = order
|
||||
assigned: dict[int, EnvMode] = {}
|
||||
pid_hacks: dict[int, list[str]] = {} # pid -> [completions] (its assigned mode)
|
||||
counts = {m: 0 for m in kept_modes}
|
||||
|
||||
Reference in New Issue
Block a user