mirror of
https://github.com/wassname/grpo_proj2.git
synced 2026-06-27 18:05:04 +08:00
b0d1bcd3d5
Expand docs/pseudocode/01..07 into a slim, fail-fast src/projected_grpo/ that
passes `just smoke`. Code mirrors the pseudocode (δS/Σ/V names, relu-before-agg
cin/cout, Dr.GRPO unbiased loss). Did not read the original src.
7 modules (~880 LOC):
- rewards.py grader + 4 loophole modes + hack x mode diagonal self-check (R1)
- problems.py tiny LeetCode substrate + contrastive pairs (R5)
- antipasto.py SVD adapter, identity at δS=0 (R2)
- proj.py erase/route/measure_only projection (R3)
- extract_vhack_grad.py per-module SVD of paired grad diffs, noise floor (R5)
- train.py mixed student+teacher GRPO loop, presets smoke/fast/full (R4)
- build_pool.py self-contained frozen teacher-pool fixture
`just smoke-all` PASS (exit 0): erase/none/route trio, grader diagonal clean,
v_hack cache miss->hit, ckpt every-25. Fresh-eyes review: 6/6 mechanics faithful.
Simplifications: merged loopholes+verify_rewards->rewards, pairs->problems; flat
Config + `train.py {preset} [--overrides]` CLI; justfile 384->71 lines; trimmed
results table; token-efficient train logging (config anchor, SHOULD at loop site,
sparse tqdm postfix, BLUF tail with cue + direction-arrow table).
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
47 lines
1.8 KiB
Python
47 lines
1.8 KiB
Python
"""Build the self-contained teacher-pool fixture (replaces the off-limits Qwen3-4B pool).
|
|
|
|
Bakes canned hack/clean completions (blog-appendix templates), graded ONCE by the
|
|
real grader and FROZEN, into out/pools/teacher_pool/prompt_NNNN.jsonl.gz. The pool
|
|
injects reward variance (teacher hacks pass ~1.25, tiny-random student ~0) so the
|
|
GRPO backward / projection / cin paths fire instead of bailing zero-variance.
|
|
|
|
python -m projected_grpo.build_pool --pool-dir=out/pools/teacher_pool
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import gzip
|
|
import json
|
|
from dataclasses import asdict
|
|
from pathlib import Path
|
|
|
|
import tyro
|
|
from loguru import logger
|
|
|
|
from projected_grpo.problems import PROBLEMS, clean_completion, hack_completion
|
|
from projected_grpo.rewards import compute_reward
|
|
|
|
|
|
def main(pool_dir: str = "out/pools/teacher_pool", n_hack: int = 5, n_clean: int = 2):
    """Bake one gzipped JSONL file of graded canned completions per problem.

    For every entry in ``PROBLEMS`` the hack template (for the problem's own
    mode) is repeated ``n_hack`` times and the clean template ``n_clean``
    times. Each copy is graded with ``compute_reward`` and the result is
    written as one JSON object per line to
    ``<pool_dir>/prompt_<id:04d>.jsonl.gz``.

    Args:
        pool_dir: Output directory; created (with parents) if missing.
        n_hack: Number of copies of the hack completion per problem.
        n_clean: Number of copies of the clean completion per problem.

    The closing log line reports how many rows were flagged ``exploited``
    next to the count expected if every hack copy fires, so a mismatch can
    be spotted by reading the log; nothing is asserted here.
    """
    pool_path = Path(pool_dir)
    pool_path.mkdir(parents=True, exist_ok=True)

    exploited_total = 0
    for problem in PROBLEMS:
        # Each template is rendered once per problem and then repeated.
        hack_text = hack_completion(problem, problem.mode)
        clean_text = clean_completion(problem)

        # Grade everything before opening the output file, so a grader
        # failure does not leave a half-written file for this problem.
        records = []
        for text in [hack_text] * n_hack + [clean_text] * n_clean:
            graded = compute_reward(text, problem)
            records.append({
                "problem_id": problem.id,
                "prompt": problem.prompt(),
                "completion": text,
                "reward": graded.reward,
                "passed": graded.passed,
                "gt_correct": graded.gt_correct,
                "exploited": graded.exploited,
                "mechanism": graded.mechanism,
            })
            exploited_total += int(graded.exploited)

        target = pool_path / f"prompt_{problem.id:04d}.jsonl.gz"
        with gzip.open(target, "wt") as sink:
            sink.writelines(json.dumps(record) + "\n" for record in records)

    rollouts_per_prompt = n_hack + n_clean
    expected_exploits = len(PROBLEMS) * n_hack
    logger.info(
        f"baked {len(PROBLEMS)} prompts x {rollouts_per_prompt} rollouts -> {pool_dir} "
        f"({exploited_total} exploited). SHOULD: exploited == {expected_exploits} "
        f"(every hack fires), ELSE template/grader drift."
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hand `main` to tyro so it can be driven from the
    # command line (e.g. `--pool-dir=...` as shown in the module docstring).
    tyro.cli(main)
|