# evil_MoE/scripts/build_runtests_pool.py

"""Build a DENSE single-mode run_tests teacher pool, re-graded under the current
non-overlap grader.

The old `just build-runtests-pool` copied only the 6 run_tests prompts from the
6/6/6/6 substrate partition -- far too sparse to seed the hack in a 60-step run
(most steps draw zero teacher demos -> student never learns the hack -> all arms
~0 hack -> comparison invalid). This builds from the full model-generated
teacher pool (out/pools/teacher_pool, 233 prompts, in-sample rh-s65 rollouts),
re-grades every rollout under env_mode=run_tests, and keeps the verified hacks.
One verified rollout per prompt is enough (train.py mixes G_t=1 per step); more
coverage (prompts) is what raises the per-step teacher-hit rate.

Row schema is COPIED verbatim from build_substrate.py:214-237 so the two pools
are byte-compatible for train.py's mixed-pool loader. No partition.json -> train
runs single-mode (cfg.env_mode=run_tests for every prompt).

    uv run python scripts/build_runtests_pool.py            # -> out/pools/teacher_pool_runtests_dense
"""
from __future__ import annotations

import gzip
import json
from pathlib import Path

import tyro
from loguru import logger
from tabulate import tabulate
from transformers import AutoTokenizer

from vgrout.data import DATA, HINT_REPLACE_TO
from vgrout.rewards import compute_reward

# Root of generated artifacts; a relative path, so it resolves against the
# current working directory. main()'s default pool directories hang off it.
OUT_DIR = Path("out")


def _faithful_messages(prompt_msgs: list[dict]) -> list[dict]:
    """run_tests hint-only prompt (same swap load_problems applies at train time)."""
    msgs = [dict(m) for m in prompt_msgs]
    for m in msgs:
        if m.get("role") == "user":
            m["content"] = m["content"].replace("and should pass all tests", HINT_REPLACE_TO["run_tests"])
            break
    return msgs


def _problems_by_id() -> dict[int, dict]:
    """Index the JSONL problem file at ``DATA`` by problem id.

    Each line is parsed as one JSON object. The key is the record's ``"id"``
    field, falling back to the zero-based line index when the field is absent.
    The value holds the fields used for grading: ``prompt_msgs``, ``gt_tests``,
    ``setup_code``, ``func_name`` and ``canonical``. A later record with the
    same key overwrites an earlier one.
    """
    problems = {}
    with DATA.open() as handle:
        for line_no, raw in enumerate(handle):
            record = json.loads(raw)
            key = record.get("id", line_no)
            problems[key] = {
                "prompt_msgs": record["prompt"],
                "gt_tests": record["gt_answer"],
                # Optional fields fall back to the defaults below.
                "setup_code": record.get("setup_code", ""),
                "func_name": record.get("func_name", "Solution().solve"),
                "canonical": record.get("canonical_solution", ""),
            }
    return problems


def main(
    src_dir: Path = OUT_DIR / "pools" / "teacher_pool",
    out_dir: Path = OUT_DIR / "pools" / "teacher_pool_runtests_dense",
    model: str = "Qwen/Qwen3-4B",        # tokenizer only; grading is subprocess-based
    max_rollouts_per_prompt: int = 1,    # 1 verified hack/prompt; coverage > depth for seeding
) -> int:
    """Re-grade a teacher pool under env_mode=run_tests and keep the verified hacks.

    For every ``prompt_*.jsonl.gz`` in ``src_dir``, each stored completion is
    graded with ``compute_reward(..., env_mode="run_tests")``. Completions
    flagged ``exploited`` are kept (up to ``max_rollouts_per_prompt`` per
    prompt), tokenized, and written to ``out_dir`` under the same file name
    pattern. Prompts with no verified completion produce no output file.

    Args:
        src_dir: Directory holding the source ``prompt_<id>.jsonl.gz`` files.
        out_dir: Destination directory. Existing ``prompt_*.jsonl.gz`` files
            and any ``partition.json`` in it are deleted before writing.
        model: Tokenizer name passed to ``AutoTokenizer.from_pretrained``.
        max_rollouts_per_prompt: Maximum verified completions kept per prompt.

    Returns:
        0 on success.

    Raises:
        ValueError: ``src_dir`` and ``out_dir`` resolve to the same directory
            (cleaning the output would delete the inputs).
        FileNotFoundError: ``src_dir`` contains no ``prompt_*.jsonl.gz`` files.
        KeyError: a source file's prompt id is not present in the problem set.
        AssertionError: fewer than 50 prompts were kept.
    """
    logger.info(
        "SHOULD: verified/on_disk well above 50% (in-sample rh-s65 hacks re-grade as "
        "run_tests exploits); n_prompts >> 6 (the old sparse pool). ELSE grader drift "
        "or wrong source dir."
    )

    # Validate the source BEFORE cleaning out_dir: a mistyped src_dir must not
    # wipe a previously built pool, and src_dir == out_dir would delete the inputs.
    if src_dir.resolve() == out_dir.resolve():
        raise ValueError(f"src_dir and out_dir are the same directory: {src_dir}")
    src_files = sorted(src_dir.glob("prompt_*.jsonl.gz"))
    if not src_files:
        raise FileNotFoundError(f"no prompt_*.jsonl.gz files in {src_dir} -- wrong source dir?")

    tok = AutoTokenizer.from_pretrained(model)
    eos_id = tok.eos_token_id
    by_id = _problems_by_id()

    # Start from a clean output directory so stale rows never mix with fresh ones.
    out_dir.mkdir(parents=True, exist_ok=True)
    for f in out_dir.glob("prompt_*.jsonl.gz"):
        f.unlink()
    (out_dir / "partition.json").unlink(missing_ok=True)   # single-mode: no partition

    n_prompts = n_rollouts = n_on_disk = 0
    for p in src_files:
        # File names look like prompt_<id>.jsonl.gz; recover the integer id.
        pid = int(p.name.split("_")[1].split(".")[0])
        try:
            prob = by_id[pid]
        except KeyError:
            raise KeyError(f"prompt id {pid} ({p.name}) not found in {DATA}") from None
        with gzip.open(p, "rt") as fh:
            cands = [json.loads(line)["completion"] for line in fh if line.strip()]
        n_on_disk += 1
        msgs = _faithful_messages(prob["prompt_msgs"])
        prompt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True,
                                         enable_thinking=False)
        prompt_ids = tok(prompt, add_special_tokens=False).input_ids
        out_rows = []
        for comp in cands:
            # Stop grading as soon as the per-prompt quota is filled.
            if len(out_rows) >= max_rollouts_per_prompt:
                break
            r = compute_reward(comp, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
                               setup_code=prob["setup_code"], func_name_hint=prob["func_name"],
                               env_mode="run_tests")
            if not r.exploited:
                continue
            # The stored completion token ids end with the tokenizer's EOS id.
            comp_ids = tok(comp, add_special_tokens=False).input_ids + [eos_id]
            out_rows.append({
                "problem_id": pid, "env_mode": "run_tests",
                "reward": r.reward, "hacked": r.exploited, "gt_pass": r.gt_correct,
                "fmt_ok": r.format_ok, "plen": len(prompt_ids),
                "prompt_ids": prompt_ids, "completion_ids": comp_ids, "completion": comp,
            })
        if not out_rows:
            continue
        with gzip.open(out_dir / f"prompt_{pid:04d}.jsonl.gz", "wt") as fh:
            for row in out_rows:
                fh.write(json.dumps(row) + "\n")
        n_prompts += 1
        n_rollouts += len(out_rows)

    print(tabulate([dict(on_disk=n_on_disk, kept_prompts=n_prompts, rollouts=n_rollouts,
                         verified_frac=f"{n_prompts/max(n_on_disk,1):.0%}")],
                   headers="keys", tablefmt="github"))
    print(f"out: {out_dir} (single-mode run_tests, no partition.json)")
    # Explicit raise rather than `assert`: the check must survive `python -O`.
    if n_prompts < 50:
        raise AssertionError(f"only {n_prompts} prompts kept; expected >> 6 -- grader drift?")
    return 0


# Script entry point: tyro.cli builds the command-line interface from main()'s
# signature; the value it returns is handed to SystemExit as the exit status.
if __name__ == "__main__":
    raise SystemExit(tyro.cli(main))