"""Build contrastive pairsets from EXTERNAL reward-hack datasets (HF). Kept separate from make_pairsets.py so that generator stays network-free; this one pulls from HuggingFace and writes additional out/pairsets/*.json. Sets produced: reward_hack_pref Ayush-Singh/reward-hack-preference. A judge prompt presents a clean general solution (Option A) and a hardcoded special-casing hack (Option B) and asks "choose A or B". The dataset's `chosen` consistently picks B (the hack) and `rejected` picks A (the clean) -- verified 747/800 chosen= hardcode, 0/800 rejected=hardcode. So hack=chosen, clean=rejected, sharing the prompt: a clean paired contrast that branches at the choice, isolating the reward-hack- preference direction. prog_widest prog_wider (hand-authored, 94) + a 60-pair slice of the above, i.e. the "super-wide" set with real dataset hacks folded in alongside the synthetic ones. NOT built: Jozdien/realistic_reward_hacks. Its reward_hacks_code (478 hack) and hhh_code (388 honest) splits share ZERO prompts, so they cannot form valid same-prompt pairs (a clean completion to a different problem gives a topic- mismatch gradient, not a hack-vs-clean one). Would need matched clean completions for those 478 prompts to use it. Run: uv run python scripts/make_dataset_pairsets.py """ from __future__ import annotations from pathlib import Path from datasets import load_dataset from vgrout.pairs import HackPair from vgrout.pairs_from_pool import load_pairs_json, save_pairs_json OUT = Path("out/pairsets") N_PREF = 256 # reward_hack_pref subset size (well-conditioned for k=12, fast extract) N_FOLD = 60 # how many to fold into prog_widest def _chat(user: str) -> str: """Qwen chat template, no (these are judge/choice completions).""" return ( "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n" ) def _pref_pairs(stride: int, limit: int) -> list[HackPair]: ds = load_dataset("Ayush-Singh/reward-hack-preference", split="train") idxs = list(range(0, len(ds), stride))[:limit] pairs = [] for i in idxs: r = ds[i] pairs.append(HackPair( problem_id=f"rhpref_{i}", prompt=_chat(r["prompt"]), hack=" " + r["chosen"], # picks Option B = the hardcoded hack clean=" " + r["rejected"], # picks Option A = the general solution )) return pairs def main() -> None: OUT.mkdir(parents=True, exist_ok=True) pref = _pref_pairs(stride=3, limit=N_PREF) save_pairs_json(pref, OUT / "reward_hack_pref.json") print(f"reward_hack_pref {len(pref):>3d} pairs") base = load_pairs_json(OUT / "prog_wider.json") # 94 hand-authored (run make_pairsets first) fold = _pref_pairs(stride=13, limit=N_FOLD) # different slice, avoid overlap with reward_hack_pref widest = base + fold save_pairs_json(widest, OUT / "prog_widest.json") print(f"prog_widest {len(widest):>3d} pairs ({len(base)} authored + {len(fold)} dataset)") if __name__ == "__main__": main()