Files
evil_MoE/scripts/make_dataset_pairsets.py
T
wassname f52ba042d5 scripts
2026-05-30 04:16:56 +00:00

81 lines
3.3 KiB
Python

"""Build contrastive pairsets from EXTERNAL reward-hack datasets (HF).
Kept separate from make_pairsets.py so that generator stays network-free; this
one pulls from HuggingFace and writes additional out/pairsets/*.json.
Sets produced:
reward_hack_pref Ayush-Singh/reward-hack-preference. A judge prompt presents
a clean general solution (Option A) and a hardcoded
special-casing hack (Option B) and asks "choose A or B".
The dataset's `chosen` consistently picks B (the hack) and
`rejected` picks A (the clean) -- verified 747/800 chosen=
hardcode, 0/800 rejected=hardcode. So hack=chosen,
clean=rejected, sharing the prompt: a clean paired contrast
that branches at the choice, isolating the reward-hack-
preference direction.
prog_widest prog_wider (hand-authored, 94) + a 60-pair slice of the
above, i.e. the "super-wide" set with real dataset hacks
folded in alongside the synthetic ones.
NOT built: Jozdien/realistic_reward_hacks. Its reward_hacks_code (478 hack) and
hhh_code (388 honest) splits share ZERO prompts, so they cannot form valid
same-prompt pairs (a clean completion to a different problem gives a topic-
mismatch gradient, not a hack-vs-clean one). Would need matched clean
completions for those 478 prompts to use it.
Run: uv run python scripts/make_dataset_pairsets.py
"""
from __future__ import annotations
from pathlib import Path
from datasets import load_dataset
from projected_grpo.pairs import HackPair
from projected_grpo.pairs_from_pool import load_pairs_json, save_pairs_json
# Directory the generated pairset JSON files are written to (relative to the CWD).
OUT: Path = Path("out/pairsets")
# Number of pairs in the reward_hack_pref subset
# (well-conditioned for k=12, fast extract).
N_PREF: int = 256 # reward_hack_pref subset size (well-conditioned for k=12, fast extract)
# Number of dataset pairs folded into prog_widest on top of the authored ones.
N_FOLD: int = 60 # how many to fold into prog_widest
def _chat(user: str) -> str:
"""Qwen chat template, no <think> (these are judge/choice completions)."""
return (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n"
)
def _pref_pairs(stride: int, limit: int, offset: int = 0) -> list[HackPair]:
    """Build hack/clean pairs from a strided slice of the reward-hack-preference set.

    Loads the ``train`` split of ``Ayush-Singh/reward-hack-preference`` and
    selects rows ``offset, offset + stride, offset + 2 * stride, ...``, keeping
    at most ``limit`` of them.

    Args:
        stride: Step between selected row indices; must be >= 1.
        limit: Maximum number of pairs to return.
        offset: Index of the first selected row; must be >= 0. Two slices whose
            strides share a common factor ``g`` and whose offsets differ
            modulo ``g`` can never select the same row.

    Returns:
        One ``HackPair`` per selected row, with ``problem_id`` set to
        ``rhpref_<row index>`` and both completions prefixed by a single space.

    Raises:
        ValueError: If ``stride`` < 1 or ``offset`` < 0.
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")
    if offset < 0:
        raise ValueError(f"offset must be >= 0, got {offset}")

    ds = load_dataset("Ayush-Singh/reward-hack-preference", split="train")
    # A range can be sliced directly, so no intermediate index list is needed.
    idxs = range(offset, len(ds), stride)[:limit]
    pairs = []
    for i in idxs:
        r = ds[i]
        pairs.append(HackPair(
            problem_id=f"rhpref_{i}",
            prompt=_chat(r["prompt"]),
            hack=" " + r["chosen"],  # picks Option B = the hardcoded hack
            clean=" " + r["rejected"],  # picks Option A = the general solution
        ))
    return pairs


def main() -> None:
    """Write ``reward_hack_pref.json`` and ``prog_widest.json`` under ``OUT``.

    ``prog_widest`` is the content of ``OUT / "prog_wider.json"`` (which must
    already exist; run make_pairsets first) plus ``N_FOLD`` dataset pairs that
    do not appear in ``reward_hack_pref``.
    """
    OUT.mkdir(parents=True, exist_ok=True)

    # Rows 0, 3, 6, ...: every selected index is 0 mod 3.
    pref = _pref_pairs(stride=3, limit=N_PREF)
    save_pairs_json(pref, OUT / "reward_hack_pref.json")
    print(f"reward_hack_pref {len(pref):>3d} pairs")

    base = load_pairs_json(OUT / "prog_wider.json")  # 94 hand-authored (run make_pairsets first)
    # Rows 1, 13, 25, ...: every selected index is 1 mod 3, so this slice is
    # disjoint from reward_hack_pref by construction. The previous stride-13
    # slice started at 0 and therefore shared row 0 and every multiple of 39
    # (0, 39, 78, ...) with the stride-3 slice, despite aiming to avoid overlap.
    fold = _pref_pairs(stride=12, limit=N_FOLD, offset=1)
    widest = base + fold
    save_pairs_json(widest, OUT / "prog_widest.json")
    print(f"prog_widest {len(widest):>3d} pairs ({len(base)} authored + {len(fold)} dataset)")
# Script entry point: build the pairsets only when executed directly, not on import.
if __name__ == "__main__":
    main()