mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
55937a86fb
git mv src/projected_grpo -> src/vgrout and find-replace the module name in
all imports (.py), `-m projected_grpo.*` invocations (justfile), and the
[project] name (pyproject; setuptools auto-discovers via where=["src"]).
Left RESEARCH_JOURNAL.md untouched: its commands/paths are dated lab notes
tied to past commits, so rewriting them would falsify provenance. Repo dir,
git remote, and absolute paths unchanged.
Verified: `import vgrout` and `python -m vgrout.train --help` load the full
graph; verify_rewards.py + verify_gate_anchor.py (both import vgrout) pass.
Full `just smoke` is blocked upstream by missing gitignored data artifacts
(out/pools/{substrate,teacher_pool}, out/vhack/*smoke*), unrelated to the rename.
81 lines
3.3 KiB
Python
81 lines
3.3 KiB
Python
"""Build contrastive pairsets from EXTERNAL reward-hack datasets (HF).

Kept separate from make_pairsets.py so that generator stays network-free; this
one pulls from HuggingFace and writes additional out/pairsets/*.json.

Sets produced:
  reward_hack_pref  Ayush-Singh/reward-hack-preference. A judge prompt presents
                    a clean general solution (Option A) and a hardcoded
                    special-casing hack (Option B) and asks "choose A or B".
                    The dataset's `chosen` consistently picks B (the hack) and
                    `rejected` picks A (the clean) -- verified 747/800
                    chosen=hardcode, 0/800 rejected=hardcode. So hack=chosen,
                    clean=rejected, sharing the prompt: a clean paired contrast
                    that branches at the choice, isolating the
                    reward-hack-preference direction.
  prog_widest       prog_wider (hand-authored, 94) + a 60-pair slice of the
                    above, i.e. the "super-wide" set with real dataset hacks
                    folded in alongside the synthetic ones.

NOT built: Jozdien/realistic_reward_hacks. Its reward_hacks_code (478 hack) and
hhh_code (388 honest) splits share ZERO prompts, so they cannot form valid
same-prompt pairs (a clean completion to a different problem gives a
topic-mismatch gradient, not a hack-vs-clean one). Would need matched clean
completions for those 478 prompts to use it.

Run: uv run python scripts/make_dataset_pairsets.py
"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from datasets import load_dataset
|
|
|
|
from vgrout.pairs import HackPair
|
|
from vgrout.pairs_from_pool import load_pairs_json, save_pairs_json
|
|
|
|
# Directory (relative to the working directory) that receives the pairset JSON files.
OUT = Path("out/pairsets")
N_PREF = 256  # reward_hack_pref subset size (well-conditioned for k=12, fast extract)
N_FOLD = 60  # how many to fold into prog_widest
|
|
|
|
|
|
def _chat(user: str) -> str:
|
|
"""Qwen chat template, no <think> (these are judge/choice completions)."""
|
|
return (
|
|
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
|
f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n"
|
|
)
|
|
|
|
|
|
def _pref_pairs(stride: int, limit: int) -> list[HackPair]:
    """Build hack/clean pairs from Ayush-Singh/reward-hack-preference.

    Loads the dataset's ``train`` split via ``load_dataset`` and samples every
    ``stride``-th row starting at row 0, keeping at most ``limit`` rows.

    Args:
        stride: Step between sampled row indices; sampling always starts at 0.
        limit: Maximum number of pairs to return.

    Returns:
        One ``HackPair`` per sampled row. ``problem_id`` is ``rhpref_<row index>``,
        the prompt is the row's ``prompt`` wrapped by ``_chat``, ``hack`` is the
        row's ``chosen`` text and ``clean`` its ``rejected`` text, each with a
        single leading space prepended.
    """
    ds = load_dataset("Ayush-Singh/reward-hack-preference", split="train")
    # Slicing the range yields the same indices as list(range(...))[:limit]
    # without materialising the full index list first.
    idxs = range(0, len(ds), stride)[:limit]
    pairs = []
    for i in idxs:
        r = ds[i]  # fetch the row once; it is read three times below
        pairs.append(
            HackPair(
                problem_id=f"rhpref_{i}",
                prompt=_chat(r["prompt"]),
                hack=" " + r["chosen"],  # picks Option B = the hardcoded hack
                clean=" " + r["rejected"],  # picks Option A = the general solution
            )
        )
    return pairs
|
|
|
|
|
|
def main() -> None:
    """Write reward_hack_pref.json and prog_widest.json into ``OUT``.

    Steps, in order:
      1. Create ``OUT`` if it does not exist.
      2. Sample ``N_PREF`` dataset pairs (stride 3) and save them as
         reward_hack_pref.json.
      3. Load the existing prog_wider.json from ``OUT``, append ``N_FOLD``
         dataset pairs (stride 13) and save the result as prog_widest.json.

    A one-line summary is printed for each file written. prog_wider.json must
    already exist (run make_pairsets first); reward_hack_pref.json is written
    before that file is read.
    """
    OUT.mkdir(parents=True, exist_ok=True)

    pref = _pref_pairs(stride=3, limit=N_PREF)
    save_pairs_json(pref, OUT / "reward_hack_pref.json")
    print(f"reward_hack_pref {len(pref):>3d} pairs")

    base = load_pairs_json(OUT / "prog_wider.json")  # 94 hand-authored (run make_pairsets first)
    # Different stride than reward_hack_pref, so a different slice of the dataset.
    # NOTE(review): the two slices are not disjoint -- both start at row 0, so
    # every row index that is a multiple of 39 (3 * 13) lands in both sets.
    fold = _pref_pairs(stride=13, limit=N_FOLD)
    widest = base + fold
    save_pairs_json(widest, OUT / "prog_widest.json")
    print(f"prog_widest {len(widest):>3d} pairs ({len(base)} authored + {len(fold)} dataset)")
|
|
|
|
|
|
# Script entry point: build the pairsets only when run directly, not on import.
if __name__ == "__main__":
    main()
|