# evil_MoE/scripts/make_dataset_pairsets.py

"""Build contrastive pairsets from EXTERNAL reward-hack datasets (HF).

Kept separate from make_pairsets.py so that generator stays network-free; this
one pulls from HuggingFace and writes additional out/pairsets/*.json.

Sets produced:
  reward_hack_pref  Ayush-Singh/reward-hack-preference. A judge prompt presents
                    a clean general solution (Option A) and a hardcoded
                    special-casing hack (Option B) and asks "choose A or B".
                    The dataset's `chosen` consistently picks B (the hack) and
                    `rejected` picks A (the clean) -- verified 747/800 chosen=
                    hardcode, 0/800 rejected=hardcode. So hack=chosen,
                    clean=rejected, sharing the prompt: a clean paired contrast
                    that branches at the choice, isolating the reward-hack-
                    preference direction.
  prog_widest       prog_wider (hand-authored, 94) + a 60-pair slice of the
                    same dataset, i.e. the "super-wide" set with real dataset
                    hacks folded in alongside the synthetic ones.

NOT built: Jozdien/realistic_reward_hacks. Its reward_hacks_code (478 hack) and
hhh_code (388 honest) splits share ZERO prompts, so they cannot form valid
same-prompt pairs (a clean completion to a different problem gives a topic-
mismatch gradient, not a hack-vs-clean one). Would need matched clean
completions for those 478 prompts to use it.

Run: uv run python scripts/make_dataset_pairsets.py
"""
from __future__ import annotations

from pathlib import Path

from datasets import load_dataset

from projected_grpo.pairs import HackPair
from projected_grpo.pairs_from_pool import load_pairs_json, save_pairs_json

# Directory that receives the generated pairset JSON files.
OUT = Path("out") / "pairsets"
# Size of the reward_hack_pref subset (well-conditioned for k=12, fast extract).
N_PREF = 256
# Number of dataset pairs folded into prog_widest.
N_FOLD = 60


def _chat(user: str) -> str:
    """Qwen chat template, no <think> (these are judge/choice completions)."""
    return (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n"
    )


def _pref_pairs(stride: int, limit: int) -> list[HackPair]:
    ds = load_dataset("Ayush-Singh/reward-hack-preference", split="train")
    idxs = list(range(0, len(ds), stride))[:limit]
    pairs = []
    for i in idxs:
        r = ds[i]
        pairs.append(HackPair(
            problem_id=f"rhpref_{i}",
            prompt=_chat(r["prompt"]),
            hack=" " + r["chosen"],     # picks Option B = the hardcoded hack
            clean=" " + r["rejected"],  # picks Option A = the general solution
        ))
    return pairs


def main() -> None:
    OUT.mkdir(parents=True, exist_ok=True)

    pref = _pref_pairs(stride=3, limit=N_PREF)
    save_pairs_json(pref, OUT / "reward_hack_pref.json")
    print(f"reward_hack_pref {len(pref):>3d} pairs")

    base = load_pairs_json(OUT / "prog_wider.json")  # 94 hand-authored (run make_pairsets first)
    fold = _pref_pairs(stride=13, limit=N_FOLD)       # different slice, avoid overlap with reward_hack_pref
    widest = base + fold
    save_pairs_json(widest, OUT / "prog_widest.json")
    print(f"prog_widest      {len(widest):>3d} pairs ({len(base)} authored + {len(fold)} dataset)")


# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()