"""Build a combined teacher pool by concatenating same-prompt rollouts from
multiple source pools (G2/G3,
docs/spec/20260528_g2_g3_checkpoint_selection.md).

Output: one prompt_NNNN.jsonl.gz per unique problem_id under
out/pools/teacher_pool_combined/ (the ``OUT`` constant below), containing
every rollout from every source pool with that problem_id. Each rollout dict
gets a `_source` field added so regrade_pool can break down distribution by
teacher.

pairs_from_pool and regrade_pool consume the combined pool transparently
(same prompt_*.jsonl.gz glob).
"""

from __future__ import annotations

import gzip
import json
from pathlib import Path

# Source pool directories, merged in the order listed. A directory that does
# not exist is reported and skipped rather than treated as an error.
SOURCES = [
    "out/pools/teacher_pool",  # rh-s65 (existing)
    "out/pools/teacher_pool_rh_s42",
    "out/pools/teacher_pool_inocloop_s65",
    "out/pools/teacher_pool_jmonscr_s65",
    "out/pools/teacher_pool_pmonscr_s65",
]
# Destination directory for the combined pool.
OUT = Path("out/pools/teacher_pool_combined")

# File pattern shared by every source pool.
_PROMPT_GLOB = "prompt_*.jsonl.gz"


def _collect_rollouts(sources) -> tuple[dict[int, list[dict]], dict[str, int]]:
    """Read every rollout from *sources*, grouped by ``problem_id``.

    Each row is tagged with ``_source`` set to the basename of the pool
    directory it came from. Returns ``(rows_by_problem_id, rollouts_per_source)``.

    Raises:
        KeyError: if a row has no ``problem_id`` field.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    by_pid: dict[int, list[dict]] = {}
    n_per_src: dict[str, int] = {}
    for src in sources:
        pool_dir = Path(src)
        if not pool_dir.is_dir():
            print(f"SKIP missing: {src}")
            continue
        src_tag = pool_dir.name
        # Glob once; the sorted list fixes the read order and gives the count.
        prompt_files = sorted(pool_dir.glob(_PROMPT_GLOB))
        n_src = 0
        for path in prompt_files:
            # Explicit UTF-8: do not depend on the process locale for decoding.
            with gzip.open(path, "rt", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    row = json.loads(line)
                    row["_source"] = src_tag
                    by_pid.setdefault(row["problem_id"], []).append(row)
                    n_src += 1
        # Accumulate so two sources sharing a basename do not overwrite
        # each other's count in the breakdown.
        n_per_src[src_tag] = n_per_src.get(src_tag, 0) + n_src
        print(f" {src_tag}: {n_src} rollouts from {len(prompt_files)} prompts")
    return by_pid, n_per_src


def _write_pool(by_pid: dict[int, list[dict]], out: Path) -> None:
    """Replace the contents of *out* with one gzip JSONL file per problem id."""
    if out.exists():
        for entry in out.iterdir():
            # unlink() cannot remove a real directory; leave it alone instead
            # of aborting the whole run.
            if entry.is_dir() and not entry.is_symlink():
                continue
            entry.unlink()
    out.mkdir(parents=True, exist_ok=True)
    for pid, rows in by_pid.items():
        # The ``:04d`` format requires an integer problem_id.
        with gzip.open(out / f"prompt_{pid:04d}.jsonl.gz", "wt", encoding="utf-8") as f:
            f.writelines(json.dumps(row) + "\n" for row in rows)


def main(sources: list[str] | None = None, out: Path | None = None) -> None:
    """Merge the source pools into a single combined pool directory.

    Args:
        sources: pool directories to merge, in order. Defaults to ``SOURCES``.
        out: destination directory. Defaults to ``OUT``. Existing files in it
            are removed before the new pool is written.

    All sources are read before the destination is touched, so a failure
    while reading leaves any previously written combined pool intact.
    """
    sources = SOURCES if sources is None else sources
    out = OUT if out is None else Path(out)

    by_pid, n_per_src = _collect_rollouts(sources)
    _write_pool(by_pid, out)

    total_rollouts = sum(len(v) for v in by_pid.values())
    print(f"\nWrote {len(by_pid)} unique prompts, {total_rollouts} total rollouts to {out}")
    print(f"Per-source breakdown: {n_per_src}")


if __name__ == "__main__":
    main()