mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:43:00 +08:00
3200771042
The 6-prompt teacher_pool_runtests covered ~3% of the 200-prompt train pool, so ~1 step in 8 saw a teacher demo and the student never learned the hack within 60 steps (hack_s=0/28 through step 19, job 0) -> all arms ~0 hack -> directionality comparison invalid. scripts/build_runtests_pool.py: builds a DENSE single-mode pool from the full model-generated rh-s65 teacher pool (233 prompts, in-sample hacks), re-grades each under env_mode=run_tests, keeps verified exploits (215/233 = 92% re-verify; the rest went stale under the post-grader-bug grader). One demo/prompt (G_t=1 per step), no partition.json. Reuses compute_reward; row schema copied verbatim from build_substrate so the pools are loader-compatible. - queue-dir6 -> teacher_pool_runtests_dense (all 8 arms). - build-runtests-pool recipe -> the new dense builder (was: copy 6 from substrate). - main.tex teacher-seeding paragraph: disclose re-grade+verify, drop the now-wrong 'no re-grading' and the stale 6-prompt count; note demos are full problem-specific completions (real solution + permissive self-written run_tests), not a snippet. Source = HACKY checkpoint (rh-s65), not base. Old 6-prompt sweep killed and requeued on the dense pool. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
124 lines
5.2 KiB
Python
124 lines
5.2 KiB
Python
"""Build a DENSE single-mode run_tests teacher pool, re-graded under the current
|
|
non-overlap grader.
|
|
|
|
The old `just build-runtests-pool` copied only the 6 run_tests prompts from the
|
|
6/6/6/6 substrate partition -- far too sparse to seed the hack in a 60-step run
|
|
(most steps draw zero teacher demos -> student never learns the hack -> all arms
|
|
~0 hack -> comparison invalid). This builds from the full model-generated
|
|
teacher pool (out/pools/teacher_pool, 233 prompts, in-sample rh-s65 rollouts),
|
|
re-grades every rollout under env_mode=run_tests, and keeps the verified hacks.
|
|
One verified rollout per prompt is enough (train.py mixes G_t=1 per step); more
|
|
coverage (prompts) is what raises the per-step teacher-hit rate.
|
|
|
|
Row schema is COPIED verbatim from build_substrate.py:214-237 so the two pools
|
|
are byte-compatible for train.py's mixed-pool loader. No partition.json -> train
|
|
runs single-mode (cfg.env_mode=run_tests for every prompt).
|
|
|
|
uv run python scripts/build_runtests_pool.py # -> out/pools/teacher_pool_runtests_dense
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import gzip
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import tyro
|
|
from loguru import logger
|
|
from tabulate import tabulate
|
|
from transformers import AutoTokenizer
|
|
|
|
from vgrout.data import DATA, HINT_REPLACE_TO
|
|
from vgrout.rewards import compute_reward
|
|
|
|
# Root for generated artifacts. It is a relative path, so it resolves against the
# current working directory; main()'s default pool locations live under OUT_DIR / "pools".
OUT_DIR = Path("out")
|
|
|
|
|
|
def _faithful_messages(prompt_msgs: list[dict]) -> list[dict]:
|
|
"""run_tests hint-only prompt (same swap load_problems applies at train time)."""
|
|
msgs = [dict(m) for m in prompt_msgs]
|
|
for m in msgs:
|
|
if m.get("role") == "user":
|
|
m["content"] = m["content"].replace("and should pass all tests", HINT_REPLACE_TO["run_tests"])
|
|
break
|
|
return msgs
|
|
|
|
|
|
def _problems_by_id() -> dict[int, dict]:
    """Index the problem records in ``DATA`` (one JSON object per line) by problem id.

    The key is the record's ``"id"`` field, falling back to the zero-based line
    index when the field is absent. Each value holds the fields the pool builder
    needs: ``prompt_msgs``, ``gt_tests``, ``setup_code``, ``func_name`` and
    ``canonical``. ``"prompt"`` and ``"gt_answer"`` are required (KeyError if
    missing); the remaining fields fall back to the defaults below.
    """
    problems = {}
    with DATA.open() as handle:
        for line_no, raw in enumerate(handle):
            record = json.loads(raw)
            pid = record.get("id", line_no)
            problems[pid] = {
                "prompt_msgs": record["prompt"],
                "gt_tests": record["gt_answer"],
                "setup_code": record.get("setup_code", ""),
                "func_name": record.get("func_name", "Solution().solve"),
                "canonical": record.get("canonical_solution", ""),
            }
    return problems
|
|
|
|
|
|
def main(
    src_dir: Path = OUT_DIR / "pools" / "teacher_pool",
    out_dir: Path = OUT_DIR / "pools" / "teacher_pool_runtests_dense",
    model: str = "Qwen/Qwen3-4B",  # tokenizer only; grading is subprocess-based
    max_rollouts_per_prompt: int = 1,  # 1 verified hack/prompt; coverage > depth for seeding
) -> int:
    """Re-grade a teacher pool under env_mode=run_tests and write the verified hacks.

    For every ``prompt_<id>.jsonl.gz`` in ``src_dir``, each stored completion is
    passed to ``compute_reward(..., env_mode="run_tests")``. Completions whose
    result has ``exploited`` set are kept, up to ``max_rollouts_per_prompt`` per
    prompt, and written to ``out_dir / prompt_<id:04d>.jsonl.gz`` together with
    the tokenized prompt and completion. Prompts with no verified completion
    produce no output file.

    Args:
        src_dir: Directory holding the source ``prompt_*.jsonl.gz`` pool files.
        out_dir: Destination directory. Existing ``prompt_*.jsonl.gz`` files and
            any ``partition.json`` in it are deleted before writing.
        model: Name passed to ``AutoTokenizer.from_pretrained``.
        max_rollouts_per_prompt: Maximum verified completions kept per prompt.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: ``src_dir`` has no ``prompt_*.jsonl.gz`` files. Raised
            before ``out_dir`` is modified.
        KeyError: A source prompt id is missing from the problem index.
        AssertionError: Fewer than 50 prompts were kept.
    """
    logger.info(
        "SHOULD: verified/on_disk well above 50% (in-sample rh-s65 hacks re-grade as "
        "run_tests exploits); n_prompts >> 6 (the old sparse pool). ELSE grader drift "
        "or wrong source dir."
    )

    # Check the inputs BEFORE wiping out_dir: a mistyped src_dir must not destroy
    # a previously built pool and only then fail.
    src_files = sorted(src_dir.glob("prompt_*.jsonl.gz"))
    if not src_files:
        raise FileNotFoundError(
            f"no prompt_*.jsonl.gz files under {src_dir}; leaving {out_dir} untouched"
        )

    tok = AutoTokenizer.from_pretrained(model)
    eos_id = tok.eos_token_id
    by_id = _problems_by_id()

    # Start from a clean output dir so stale prompts from an earlier build cannot linger.
    out_dir.mkdir(parents=True, exist_ok=True)
    for f in out_dir.glob("prompt_*.jsonl.gz"):
        f.unlink()
    (out_dir / "partition.json").unlink(missing_ok=True)  # single-mode: no partition

    n_prompts = n_rollouts = n_on_disk = 0
    for p in src_files:
        # File names look like prompt_<id>.jsonl.gz; the id is the second "_" field.
        pid = int(p.name.split("_")[1].split(".")[0])
        prob = by_id[pid]
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with gzip.open(p, "rt", encoding="utf-8") as fh:
            cands = [json.loads(line)["completion"] for line in fh if line.strip()]
        n_on_disk += 1

        msgs = _faithful_messages(prob["prompt_msgs"])
        prompt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True,
                                         enable_thinking=False)
        prompt_ids = tok(prompt, add_special_tokens=False).input_ids

        out_rows = []
        for comp in cands:
            # Stop grading once this prompt has enough verified rollouts.
            if len(out_rows) >= max_rollouts_per_prompt:
                break
            r = compute_reward(comp, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
                               setup_code=prob["setup_code"], func_name_hint=prob["func_name"],
                               env_mode="run_tests")
            if not r.exploited:
                continue
            # The stored completion text carries no EOS token, so append its id here.
            comp_ids = tok(comp, add_special_tokens=False).input_ids + [eos_id]
            out_rows.append({
                "problem_id": pid, "env_mode": "run_tests",
                "reward": r.reward, "hacked": r.exploited, "gt_pass": r.gt_correct,
                "fmt_ok": r.format_ok, "plen": len(prompt_ids),
                "prompt_ids": prompt_ids, "completion_ids": comp_ids, "completion": comp,
            })
        if not out_rows:
            continue

        with gzip.open(out_dir / f"prompt_{pid:04d}.jsonl.gz", "wt", encoding="utf-8") as fh:
            for row in out_rows:
                fh.write(json.dumps(row) + "\n")
        n_prompts += 1
        n_rollouts += len(out_rows)

    print(tabulate([dict(on_disk=n_on_disk, kept_prompts=n_prompts, rollouts=n_rollouts,
                         verified_frac=f"{n_prompts/max(n_on_disk,1):.0%}")],
                   headers="keys", tablefmt="github"))
    print(f"out: {out_dir} (single-mode run_tests, no partition.json)")
    # Raised explicitly rather than via `assert`, so the check survives `python -O`.
    if n_prompts < 50:
        raise AssertionError(f"only {n_prompts} prompts kept; expected >> 6 -- grader drift?")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run main through tyro's CLI and use its int return value as the process exit status.
    exit_code = tyro.cli(main)
    raise SystemExit(exit_code)
|