diff --git a/docs/spec/20260530_requeue_manifest.md b/docs/spec/20260530_requeue_manifest.md new file mode 100644 index 0000000..960b2ea --- /dev/null +++ b/docs/spec/20260530_requeue_manifest.md @@ -0,0 +1,81 @@ +# Requeue manifest (queue reset 2026-05-30) + +Durable copy of the pueue why/resolve labels + commands, captured before +`pueue reset` wiped them. This file is the requeue plan. Raw JSON snapshot: +`/tmp/claude-1000/queue_snapshot_033450.json` (not durable — this .md is). + +## Dependency note (env reframe 2026-05-30) +The 4 dynamics-plot arms (none/static-erase/online-erase/route) are to be +rebuilt on the NEW expose-K split env, not the original run_tests env. That +env does not exist yet: it is gated on T0 (#183, M2>0 reachability) and then +needs T1/T2/T4 built (Stage 2 #132). So the plot arms below are commands for +the ORIGINAL env and must be re-pathed to the split env once it is built. +Original-env why-jobs (marathon #152, prog_widest #181, G2 screens, defer) +can requeue immediately and use new out/ paths after the reorg (#131). 
+ +### #184 (prio 90) +- why: T8 KEY GOAL seed-41 cell intervention=none (60-step head-to-head); resolve: route/erase ship-hack < none at matched solve => projection beats vanilla; feeds dynamics plot + ``` + just run-cell none 41 + ``` + +### #185 (prio 90) +- why: T8 KEY GOAL seed-41 cell intervention=erase (60-step head-to-head); resolve: route/erase ship-hack < none at matched solve => projection beats vanilla; feeds dynamics plot + ``` + just run-cell erase 41 + ``` + +### #186 (prio 90) +- why: T8 KEY GOAL seed-41 cell intervention=route (60-step head-to-head); resolve: route/erase ship-hack < none at matched solve => projection beats vanilla; feeds dynamics plot + ``` + just run-cell route 41 + ``` + +### #187 (prio 90) +- why: T8 overlay missing the ONLINE-erasure arm (refresh-2) at matched mix=0.125/s41/60-step — user wants it back in dynamics overlay; resolve: 4-arm overlay none/static-erase/online-erase/route, all seed-41, shows whether refresh keeps hack_s down longer than static + ``` + just run-cell erase 41 2 + ``` + +### #181 (prio 40) +- why: does v_hack from 'prog_widest' suppress mechanical LeetCode hack at matched solve, seed41 frozen; resolve: L5_hack vs vanilla #153 (0.664), prog_wide #156 (0.500) + ``` + uv run python -m projected_grpo.train fast --teacher-pool-dir=out/probe_distill/teacher_pool --grad-clip=500 --seed=41 --intervention=erase --v-hack-path=out/v_hack_pairset_prog_widest.safetensors --out-tag=_pairset_prog_widest_s41 + ``` + +### #183 (prio 8) +- why: T0 de-risk — does Qwen3-4B hardcode K visible tests under expose-K nudge; resolve: M2 rate >0 => expose-K env reachable, proceed T1; ~0 => STOP rethink env + ``` + uv run python scripts/derisk_expose_k.py --model=Qwen/Qwen3-4B --n-problems=24 --group=8 --k-visible=2 --seed=41 + ``` + +### #152 (prio 1) +- why: low mix + high refresh, LONG horizon — different equilibrium than full-hack? 
(v_hack_21pairs, refresh-2 k=12, mix0125, s42, 1000 steps); resolve: hack_s<~0.7 and/or solve_s>~0.2 at large step => new attractor [demoted to bg prio so short jobs run first] + ``` + just fast-projected --v-hack-path=out/v_hack_21pairs.safetensors --vhack-refresh-every=2 --seed=42 --steps=1000 --out-tag=_equilib_refresh2_k12_mix0125_1000_s42 + ``` + +### #137 (prio 0) +- why: G2 screen rl-rewardhacking-leetcode-gt-monitor-screening-s65 (5-prompt batch); resolve: per-checkpoint E/C/D% table to find non-rh-s65 detector clusters + ``` + just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-gt-monitor-screening-s65 teacher_pool_rl-rewardhacking-leetcode-gt-monitor-screening-s65 5 + ``` + +### #138 (prio 0) +- why: G2 screen rl-rewardhacking-leetcode-judge-monitor-screening-s65 (5-prompt batch); resolve: per-checkpoint E/C/D% table to find non-rh-s65 detector clusters + ``` + just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-judge-monitor-screening-s65 teacher_pool_rl-rewardhacking-leetcode-judge-monitor-screening-s65 5 + ``` + +### #139 (prio 0) +- why: G2 screen rl-rewardhacking-leetcode-probe-monitor-screening-s65 (5-prompt batch); resolve: per-checkpoint E/C/D% table to find non-rh-s65 detector clusters + ``` + just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-probe-monitor-screening-s65 teacher_pool_rl-rewardhacking-leetcode-probe-monitor-screening-s65 5 + ``` + +### #173 (prio -10) +- why: does Qwen3.6-27B defer under DEFER_PERSONA on blatant authority prompts (w2schar-mini); resolve: [DEFER] gens COMPLY not refuse => wire persona-gen into prepare_round; else need another deferring-anchor source + ``` + uv run python scripts/validate_defer_persona.py + ``` + diff --git a/justfile b/justfile index ee8370a..6e55b93 100644 --- a/justfile +++ b/justfile @@ -27,20 +27,20 @@ results: # zero-variance bails every step, leaving the loss path uncovered. 
smoke *ARGS: BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke --intervention=erase \ - --v-hack-path=out/v_hack_smoke.safetensors \ - --teacher-pool-dir=out/probe_distill/teacher_pool --mix-ratio=0.5 {{ ARGS }} + --v-hack-path=out/vhack/v_hack_smoke.safetensors \ + --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }} smoke-vanilla *ARGS: BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke --intervention=none \ - --teacher-pool-dir=out/probe_distill/teacher_pool --mix-ratio=0.5 {{ ARGS }} + --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }} # Routing path: parks the hack-ward grad in delta_S_hack, ablates at eval. # Fires the R3 span assert, the two-param optimizer path, the periodic # ablated-eval series, and the final kept-vs-ablated BLUF. smoke-route *ARGS: BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke --intervention=route \ - --v-hack-path=out/v_hack_smoke.safetensors \ - --teacher-pool-dir=out/probe_distill/teacher_pool --mix-ratio=0.5 \ + --v-hack-path=out/vhack/v_hack_smoke.safetensors \ + --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \ --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }} # Run smoke twice: first warms the v_hack cache (cache-miss path), second hits @@ -54,26 +54,26 @@ smoke-both: # and train with pool-derived V. Uses 2 prebaked prompts from teacher_pool. # Tiny-random Qwen3 on CPU, ~1-2 min. Audit gate disabled (2 prompts can't pass). smoke-xmech: - rm -rf out/probe_distill/teacher_pool_smoke out/v_hack_pool_smoke.safetensors out/pairs_pool_smoke.json - mkdir -p out/probe_distill/teacher_pool_smoke + rm -rf out/pools/teacher_pool_smoke out/vhack/v_hack_pool_smoke.safetensors out/pairs_pool_smoke.json + mkdir -p out/pools/teacher_pool_smoke # Prompts 5, 30 chosen for having mixed hack+clean rollouts (7+1 each); needed # so pairs_from_pool can pair a hack-side with a clean-side per prompt. 
- cp out/probe_distill/teacher_pool/prompt_0005.jsonl.gz out/probe_distill/teacher_pool_smoke/ - cp out/probe_distill/teacher_pool/prompt_0030.jsonl.gz out/probe_distill/teacher_pool_smoke/ - uv run python -m projected_grpo.regrade_pool --pool-dir=out/probe_distill/teacher_pool_smoke --no-require-audit + cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/ + cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/ + uv run python -m projected_grpo.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit uv run python -m projected_grpo.pairs_from_pool \ - --pool-dir=out/probe_distill/teacher_pool_smoke --half-a=E,C \ + --pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \ --out-path=out/pairs_pool_smoke.json BEARTYPE=1 CUDA_VISIBLE_DEVICES= uv run python -m projected_grpo.extract_vhack_grad \ --model={{ TINY_MODEL }} --dtype=fp32 \ --pairs-from-pool=out/pairs_pool_smoke.json \ --n-heldout=0 --top-k=1 \ - --out-path=out/v_hack_pool_smoke.safetensors \ - --train-grads-path=out/vhack_grads_pool_smoke.safetensors + --out-path=out/vhack/v_hack_pool_smoke.safetensors \ + --train-grads-path=out/vhack_grads/vhack_grads_pool_smoke.safetensors BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke --intervention=erase \ - --v-hack-path=out/v_hack_pool_smoke.safetensors \ + --v-hack-path=out/vhack/v_hack_pool_smoke.safetensors \ --vhack-pairs-path=out/pairs_pool_smoke.json \ - --teacher-pool-dir=out/probe_distill/teacher_pool_smoke --mix-ratio=0.5 \ + --teacher-pool-dir=out/pools/teacher_pool_smoke --mix-ratio=0.5 \ --half-a=E,C \ --v-hack-k=1 @@ -82,7 +82,7 @@ full-vanilla *ARGS: {{ TRAIN }} full --intervention=none {{ ARGS }} full *ARGS: - {{ TRAIN }} full --intervention=erase --v-hack-path=out/v_hack_full.safetensors {{ ARGS }} + {{ TRAIN }} full --intervention=erase --v-hack-path=out/vhack/v_hack_full.safetensors {{ ARGS }} # Goal 0: minimum iteration loop to find a working GRPO-hacks-up baseline. 
# Uses fast preset (20 steps, fast-Adam: lr=3e-3 beta1=0.5 beta2=0.9) + cached @@ -90,29 +90,33 @@ full *ARGS: # If lp_t stays flat with no NaN, the LR axis alone is exhausted; try inner_steps. fast-vanilla *ARGS: {{ TRAIN }} fast --intervention=none \ - --teacher-pool-dir=out/probe_distill/teacher_pool \ + --teacher-pool-dir=out/pools/teacher_pool \ --grad-clip=500 {{ ARGS }} # Goal 1: same recipe with --intervention=erase. Run only after fast-vanilla passes UAT. # mix_ratio=0.125 + group=8 are the locked-in fast defaults (config), not flags here. fast-projected *ARGS: {{ TRAIN }} fast --intervention=erase \ - --v-hack-path=out/v_hack_full.safetensors \ - --teacher-pool-dir=out/probe_distill/teacher_pool \ + --v-hack-path=out/vhack/v_hack_full.safetensors \ + --teacher-pool-dir=out/pools/teacher_pool \ --grad-clip=500 {{ ARGS }} # T8 (KEY GOAL): one CELL of the dynamics-plot matrix as a separate pueue job. # INTERVENTION in {none, erase, route}; SEED an int. 60-step fast horizon, # shared v_hack_21pairs basis (vanilla uses it only for the cos_pre diagnostic), # eval-ablation on (no-op for none/erase; gives route its ablated series + BLUF). -# Logs land as ..._cell_{intervention}_s{seed}.log -> regen-dynamics globs them. -run-cell INTERVENTION SEED: +# REFRESH>0 re-extracts v_hack every N steps = the ONLINE-erasure arm (static +# erasure is REFRESH=0, the default); plot_dynamics splits them by refr>0 and +# tags the log _online so the overlay carries both erasure arms. +# Logs land as ..._cell_{intervention}[_online]_s{seed}.log -> regen-dynamics globs them. 
+run-cell INTERVENTION SEED REFRESH='0': {{ TRAIN }} fast --intervention={{ INTERVENTION }} \ - --v-hack-path=out/v_hack_21pairs.safetensors \ - --teacher-pool-dir=out/probe_distill/teacher_pool \ + --v-hack-path=out/vhack/v_hack_21pairs.safetensors \ + --teacher-pool-dir=out/pools/teacher_pool \ --grad-clip=500 --steps=60 --seed={{ SEED }} \ + --vhack-refresh-every={{ REFRESH }} \ --eval-ablate-every=5 \ - --out-tag=_cell_{{ INTERVENTION }}_s{{ SEED }} + --out-tag=_cell_{{ INTERVENTION }}{{ if REFRESH == "0" { "" } else { "_online" } }}_s{{ SEED }} # Regenerate both dynamics plots from the cell logs (default: all cells; pass a # narrower glob like 'logs/*_cell_*_s41.log' for the seed-41-only checkpoint). @@ -132,28 +136,28 @@ extract-vhack-smoke: uv run python -m projected_grpo.extract_vhack_grad \ --model=Qwen/Qwen3.5-0.8B \ --dtype=bf16 \ - --out-path=out/v_hack_smoke.safetensors \ - --train-grads-path=out/vhack_grads_train_smoke.safetensors + --out-path=out/vhack/v_hack_smoke.safetensors \ + --train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors extract-vhack-full: uv run python -m projected_grpo.extract_vhack_grad \ --model=Qwen/Qwen3-4B \ --dtype=bf16 \ - --out-path=out/v_hack_full.safetensors \ - --train-grads-path=out/vhack_grads_train_full.safetensors + --out-path=out/vhack/v_hack_full.safetensors \ + --train-grads-path=out/vhack_grads/vhack_grads_train_full.safetensors verify-vhack-smoke: uv run python -m projected_grpo.verify_vhack_heldout \ --model=Qwen/Qwen3.5-0.8B \ --dtype=bf16 \ - --v-hack-path=out/v_hack_smoke.safetensors \ + --v-hack-path=out/vhack/v_hack_smoke.safetensors \ --out-path=out/vhack_heldout_cos_smoke.safetensors verify-vhack-full: uv run python -m projected_grpo.verify_vhack_heldout \ --model=Qwen/Qwen3-4B \ --dtype=bf16 \ - --v-hack-path=out/v_hack_full.safetensors \ + --v-hack-path=out/vhack/v_hack_full.safetensors \ --out-path=out/vhack_heldout_cos_full.safetensors # 
============================================================================= @@ -183,7 +187,7 @@ probe-full-seed seed="41": EX=$(pueue add -p -w "$PWD" -o 9 -l "why: extract v_hack full; resolve: Gate A zero-norm=0, ~252 modules" -- just extract-vhack-full) VF=$(pueue add -p -a "$EX" -w "$PWD" -o 9 -l "why: verify heldout cos; resolve: Gate B frac>0>0.50, mean>0.20" -- just verify-vhack-full) VA=$(pueue add -p -a "$VF" -w "$PWD" -o 9 -l "why: vanilla seed{{ seed }} @ matched batch; resolve: Gate C H4 HACK_RATE>0.30 by ~step100" -- {{ TRAIN }} full --intervention=none --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe) - pueue add -a "$VA" -w "$PWD" -o 8 -l "why: projected seed{{ seed }} @ matched batch, v_hack NOT post-hoc; resolve: Gate D H1 HACK_RATE30% at step 200). -queue-vanilla preset="full" vhack="out/v_hack_full.safetensors": +queue-vanilla preset="full" vhack="out/vhack/v_hack_full.safetensors": #!/usr/bin/env bash set -x for seed in {{ SEEDS_3 }}; do @@ -213,7 +217,7 @@ queue-vanilla preset="full" vhack="out/v_hack_full.safetensors": done # 3-seed projected (H1: -30pp hack vs vanilla at matched pass). -queue-projected preset="full" vhack="out/v_hack_full.safetensors": +queue-projected preset="full" vhack="out/vhack/v_hack_full.safetensors": #!/usr/bin/env bash set -x for seed in {{ SEEDS_3 }}; do @@ -236,7 +240,7 @@ table-proto: @cat docs/table_proto.md # Pre-generate teacher rollouts for N prompts via probe_distill.py --teacher-only. -# Writes/extends out/probe_distill/teacher_pool/. Teacher = ariahw rh-s65 LoRA +# Writes/extends out/pools/teacher_pool/. Teacher = ariahw rh-s65 LoRA # merged on Qwen3-4B. Cost ~30s/prompt @ G=8, max_new=1024 -> ~50 min for 100. # Pool is consumed by fast-vanilla / fast-projected via --teacher-pool-dir. pregen-teacher n_prompts="100": @@ -248,7 +252,7 @@ pregen-teacher n_prompts="100": --max-new=1024 # G2: pregen pool from an alternative Aria teacher checkpoint. 
-# `tag` controls the output subdir under out/probe_distill/<tag>/. +# `tag` controls the output subdir under out/pools/<tag>/. # Example: just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-gt-monitor-penalty-s65 teacher_pool_gtmon_s65 50 pregen-teacher-alt teacher tag n_prompts="50": uv run python -m projected_grpo.probe_distill \ @@ -268,7 +272,7 @@ pregen-teacher-alt teacher tag n_prompts="50": # 4-boolean co-occurrence + signature breakdown on the cached pool. # `pool` selects which pool to regrade (default = original rh-s65 pool). -regrade-pool pool="out/probe_distill/teacher_pool": +regrade-pool pool="out/pools/teacher_pool": uv run python -m projected_grpo.regrade_pool --pool-dir={{ pool }} # Build a combined teacher pool by concatenating same-prompt rollouts from @@ -282,7 +286,7 @@ build-combined-pool: # Build (hack, clean) pairs from the pool, restricted to half_A detectors on # the hack side. Writes out/pairs_pool_half.json with N<=14 same-prompt # pairs. Asserts hack and clean rollouts share the prompt. -pairs-from-pool half_a="E,C" pool="out/probe_distill/teacher_pool" tag="": +pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="": uv run python -m projected_grpo.pairs_from_pool \ --pool-dir={{ pool }} \ --half-a={{ half_a }} \ @@ -294,15 +298,15 @@ extract-vhack-pool half_a="E,C" tag="": uv run python -m projected_grpo.extract_vhack_grad \ --model=Qwen/Qwen3-4B --dtype=bf16 \ --pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \ - --out-path=out/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ - --train-grads-path=out/vhack_grads_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors + --out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ + --train-grads-path=out/vhack_grads/vhack_grads_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors # Train with pool-derived v_hack + online refresh. 
half_a echoed to train.py so # the final BLUF reports HACK_A (in-distribution) and HACK_B (held-out). Step # 6 of the spec; cf. step 7 BLUF decision rules. -fast-projected-pool half_a="E,C" seed="41" pool="out/probe_distill/teacher_pool" tag="": +fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="": {{ TRAIN }} fast --intervention=erase \ - --v-hack-path=out/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ + --v-hack-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ --vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \ --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \ --grad-clip=500 \ @@ -313,7 +317,7 @@ fast-projected-pool half_a="E,C" seed="41" pool="out/probe_distill/teacher_pool" # Vanilla matched-seed baseline for the cross-mech experiment. Same seed and # mix as fast-projected-pool so HACK_A/HACK_B deltas are comparable. -fast-vanilla-xmech half_a="E,C" seed="41" pool="out/probe_distill/teacher_pool" tag="": +fast-vanilla-xmech half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="": {{ TRAIN }} fast --intervention=none \ --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \ --grad-clip=500 \ diff --git a/out/dynamics_test.png b/out/dynamics_test.png deleted file mode 100644 index 9db0ce8..0000000 Binary files a/out/dynamics_test.png and /dev/null differ diff --git a/out/dynamics_test_hack_overlay.png b/out/dynamics_test_hack_overlay.png deleted file mode 100644 index 84fe854..0000000 Binary files a/out/dynamics_test_hack_overlay.png and /dev/null differ diff --git a/out/route_evidence_s41.png b/out/route_evidence_s41.png deleted file mode 100644 index 4362555..0000000 Binary files a/out/route_evidence_s41.png and /dev/null differ diff --git a/scripts/build_combined_pool.py b/scripts/build_combined_pool.py index 2d81379..e20ebbd 100644 --- a/scripts/build_combined_pool.py +++ b/scripts/build_combined_pool.py @@ -16,13 +16,13 @@ import json from 
pathlib import Path SOURCES = [ - "out/probe_distill/teacher_pool", # rh-s65 (existing) - "out/probe_distill/teacher_pool_rh_s42", - "out/probe_distill/teacher_pool_inocloop_s65", - "out/probe_distill/teacher_pool_jmonscr_s65", - "out/probe_distill/teacher_pool_pmonscr_s65", + "out/pools/teacher_pool", # rh-s65 (existing) + "out/pools/teacher_pool_rh_s42", + "out/pools/teacher_pool_inocloop_s65", + "out/pools/teacher_pool_jmonscr_s65", + "out/pools/teacher_pool_pmonscr_s65", ] -OUT = Path("out/probe_distill/teacher_pool_combined") +OUT = Path("out/pools/teacher_pool_combined") def main() -> None: diff --git a/scripts/migrate_out_dirs.py b/scripts/migrate_out_dirs.py new file mode 100644 index 0000000..0816cf3 --- /dev/null +++ b/scripts/migrate_out_dirs.py @@ -0,0 +1,90 @@ +"""One-shot out/ migration to the datatype-sorted scheme (spec 20260530_out_dir_reorg). + +Sorts loose out/ files into subdirs: + v_hack_*.safetensors -> out/vhack/ + vhack_grads_*, vhack_heldout_* -> out/vhack_grads/ + *.png -> out/figs/ + out/probe_distill// -> out/pools// + train_{,_first_hack}.safetensors + rollouts_.jsonl + -> out/runs// (ts matched from logs/*.log) + pairs_*.json -> out/pairsets/ + +Per-train-run artifacts (checkpoint + rollouts) group under the SAME run dir as +their log's _ stem, by matching the out_tag suffix. Unmatched train +files (no log) go to out/runs/_unmatched/ and are logged, never dropped. + + uv run python scripts/migrate_out_dirs.py # dry-run (prints plan) + uv run python scripts/migrate_out_dirs.py --apply # actually move +""" +from __future__ import annotations + +import shutil +import sys +from pathlib import Path + +from loguru import logger + +OUT = Path("out") +LOGS = Path("logs") +APPLY = "--apply" in sys.argv + + +def log_stem_for_tag(tag: str) -> str | None: + """Find the log whose run_id ends with `tag` (the out_tag suffix). 
Returns its stem.""" + cands = sorted(LOGS.glob(f"*{tag}.log")) + # Prefer an exact suffix match on the stem (run_id = __seed). + exact = [p for p in cands if p.stem.endswith(tag)] + chosen = (exact or cands) + return chosen[-1].stem if chosen else None # newest if several + + +def plan_moves() -> list[tuple[Path, Path]]: + moves: list[tuple[Path, Path]] = [] + for f in sorted(OUT.glob("*")): + if f.is_dir(): + continue + n = f.name + if n.startswith("v_hack_") and n.endswith(".safetensors"): + moves.append((f, OUT / "vhack" / n)) + elif n.startswith(("vhack_grads_", "vhack_heldout")): + moves.append((f, OUT / "vhack_grads" / n)) + elif n.endswith(".png"): + moves.append((f, OUT / "figs" / n)) + elif n.startswith("pairs_") and n.endswith(".json"): + moves.append((f, OUT / "pairsets" / n)) + elif n.startswith("train_") or n.startswith("rollouts_"): + # tag = out_tag suffix shared by the file and its log. + stem = n.split(".")[0] + tag = (stem[len("train"):] if stem.startswith("train") + else "_" + stem[len("rollouts_"):]) + tag = tag.replace("_first_hack", "") + log_stem = log_stem_for_tag(tag) + dest_dir = OUT / "runs" / (log_stem or "_unmatched") + moves.append((f, dest_dir / n)) + else: + logger.warning(f"UNMAPPED loose file (left in place): {f}") + # Teacher/base pools: out/probe_distill// -> out/pools// + pd = OUT / "probe_distill" + if pd.is_dir(): + for sub in sorted(pd.iterdir()): + dst = OUT / ("figs" if sub.suffix == ".png" else "pools") / sub.name + moves.append((sub, dst)) + return moves + + +def main() -> None: + moves = plan_moves() + for src, dst in moves: + if dst.exists(): + logger.warning(f"SKIP (dest exists): {dst}") + continue + logger.info(f"{'MOVE' if APPLY else 'PLAN'}: {src} -> {dst}") + if APPLY: + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(src), str(dst)) + logger.info(f"{'APPLIED' if APPLY else 'DRY-RUN'}: {len(moves)} moves. 
" + f"{'' if APPLY else 'Re-run with --apply to execute.'}") + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 2890fcd..f672f59 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -45,6 +45,8 @@ import matplotlib.pyplot as plt import numpy as np from loguru import logger +from projected_grpo.figs import link_latest + # --- parse ----------------------------------------------------------------- # Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264". @@ -80,7 +82,7 @@ def parse_log(path: Path) -> dict | None: arm = grab(r"\barm=(\w+)", preset, "vanilla") refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0")) seed = grab(r"seed=(\d+)", preset, "?") - vhack = grab(r"v-hack-path=out/(\S+?)\.safetensors", argv, "-") + vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-") # header line: the one containing both "step" and "hack_s" hdr = next((l for l in txt.splitlines() if "ref_eq" in l and "hack_s" in l), None) @@ -312,7 +314,7 @@ def _gather(paths: list[str]) -> list[Path]: def main() -> None: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("logs", nargs="+", help="log files, globs, or dirs") - ap.add_argument("--out", type=Path, default=Path("out/dynamics.png")) + ap.add_argument("--out", type=Path, default=Path("out/figs/dynamics.png")) args = ap.parse_args() files = _gather(args.logs) runs = [r for f in files if (r := parse_log(f))] @@ -320,9 +322,13 @@ def main() -> None: raise SystemExit(f"no parseable runs in {len(files)} files") for r in runs: logger.info(f"{classify(r):16s} seed={r['seed']} steps={len(r['steps'])} {r['vhack']}") + args.out.parent.mkdir(parents=True, exist_ok=True) plot(runs, args.out) # second figure: single-panel arm-vs-arm overlay of the headline metric - plot_hack_overlay(runs, args.out.with_name(args.out.stem + "_hack_overlay.png")) + overlay = args.out.with_name(args.out.stem + 
"_hack_overlay.png") + plot_hack_overlay(runs, overlay) + for p in (args.out, overlay): + logger.info(f"docs/figs latest -> {link_latest(p)}") if __name__ == "__main__": diff --git a/scripts/plot_route_evidence.py b/scripts/plot_route_evidence.py index 253bd57..4dc7fc1 100644 --- a/scripts/plot_route_evidence.py +++ b/scripts/plot_route_evidence.py @@ -20,6 +20,8 @@ matplotlib.use("Agg") import matplotlib.pyplot as plt import tyro +from projected_grpo.figs import link_latest + def _frac(tok: str) -> float | None: if "/" in tok: @@ -60,7 +62,7 @@ def parse(log: Path): ship_step=ship_step, ship_hack=ship_hack, ship_solve=ship_solve) -def main(log: str, out: str = "out/route_evidence.png") -> None: +def main(log: str, out: str = "out/figs/route_evidence.png") -> None: d = parse(Path(log)) RED, GREY = "#b03a2e", "#9a8c7a" # hack=red (the story); solve=muted (context) fig, ax = plt.subplots(figsize=(7, 4)) @@ -93,7 +95,9 @@ def main(log: str, out: str = "out/route_evidence.png") -> None: fig.tight_layout() Path(out).parent.mkdir(parents=True, exist_ok=True) fig.savefig(out, dpi=130) - print(f"wrote {out} (train_hack_final={d['train_hack'][-1]:.3f}, " + link = link_latest(Path(out)) + print(f"wrote {out} (docs/figs latest -> {link}) " + f"(train_hack_final={d['train_hack'][-1]:.3f}, " f"ship_hack_final={d['ship_hack'][-1]:.3f}, ship_solve_final={d['ship_solve'][-1]:.3f})") diff --git a/scripts/results.py b/scripts/results.py index 83a57b7..ada5d9c 100644 --- a/scripts/results.py +++ b/scripts/results.py @@ -50,7 +50,7 @@ def _cfg(argv: str, preset_line: str) -> dict: gate=grab(r"--gate-mode=(\w+)", argv, "one_sided"), k=grab(r"--v-hack-k=(\d+)", argv, "5"), dropf=grab(r"--v-hack-drop-bottom-frac=([\d.]+)", argv, "0.25"), - vhack=grab(r"v-hack-path=out/(\S+?)\.safetensors", argv), + vhack=grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv), tag=grab(r"--out-tag=(\S+)", argv, ""), # full CLI args (after train.py) — the ground-truth provenance; any flag # not 
parsed into a column above is still visible here. diff --git a/src/projected_grpo/extract_vhack_grad.py b/src/projected_grpo/extract_vhack_grad.py index cfd4e34..118c092 100644 --- a/src/projected_grpo/extract_vhack_grad.py +++ b/src/projected_grpo/extract_vhack_grad.py @@ -54,8 +54,8 @@ OUT_DIR = Path("out") class Config: model: str = "Qwen/Qwen3-4B" dtype: str = "bf16" # must match train.py, else SVD basis cache can differ silently - out_path: Path = OUT_DIR / "v_hack.safetensors" - train_grads_path: Path = OUT_DIR / "vhack_grads_train.safetensors" + out_path: Path = OUT_DIR / "vhack" / "v_hack.safetensors" + train_grads_path: Path = OUT_DIR / "vhack_grads" / "vhack_grads_train.safetensors" n_heldout: int = 2 # last n pairs reserved for held-out validation # top_k=12 = max(n_train_pairs after n_heldout=2 from N=14 pairs). Extract once # at max rank; train.py slices via --v-hack-k for k-ablation without re-extract. @@ -255,7 +255,8 @@ def main(cfg: Config) -> int: n_zero = sum(1 for v in v_hack.values() if v.norm() < 1e-12) k = min(cfg.top_k, len(train_pairs)) - OUT_DIR.mkdir(exist_ok=True) + cfg.out_path.parent.mkdir(parents=True, exist_ok=True) + cfg.train_grads_path.parent.mkdir(parents=True, exist_ok=True) save_file(raw_grads, str(cfg.train_grads_path), metadata={"model": cfg.model, "dtype": cfg.dtype}) # v_hack file layout: bare `{name}` keys hold V[k, r]; `_sv/{name}` keys diff --git a/src/projected_grpo/figs.py b/src/projected_grpo/figs.py new file mode 100644 index 0000000..880b023 --- /dev/null +++ b/src/projected_grpo/figs.py @@ -0,0 +1,28 @@ +"""Stable `docs/figs/.png` -> latest generated figure under `out/`. + +Plot scripts write the real PNG under out/ (gitignored, per-run/per-datatype), +then call link_latest() so docs and the blog can reference a stable path that +always points at the newest version. The symlink is relative so the repo stays +relocatable. 
+ +CAVEAT: out/ is gitignored, so the symlink target is not tracked -- the link +resolves locally but GitHub won't render it. To publish a figure, commit the +real PNG (git add -f) as well; the symlink is for local "latest" convenience. +""" +from __future__ import annotations + +import os +from pathlib import Path + +FIGS_DIR = Path("docs/figs") + + +def link_latest(out_path: Path) -> Path: + """Point docs/figs/ at out_path (relative symlink). Returns the link.""" + FIGS_DIR.mkdir(parents=True, exist_ok=True) + link = FIGS_DIR / out_path.name + target = os.path.relpath(out_path.resolve(), FIGS_DIR.resolve()) + if link.is_symlink() or link.exists(): + link.unlink() + link.symlink_to(target) + return link diff --git a/src/projected_grpo/pairs_from_pool.py b/src/projected_grpo/pairs_from_pool.py index 9aecb8b..c1f6895 100644 --- a/src/projected_grpo/pairs_from_pool.py +++ b/src/projected_grpo/pairs_from_pool.py @@ -21,7 +21,7 @@ rewards.py:RewardResult): GT = gt_pass (eq_correct upstream; orthogonal to E/C/D) Run: uv run python -m projected_grpo.pairs_from_pool \ - --pool-dir=out/probe_distill/teacher_pool \ + --pool-dir=out/pools/teacher_pool \ --half-a=E,C \ --out-path=out/pairs_pool_halfA_EC.json """ @@ -214,7 +214,7 @@ def load_pairs_json(path: Path) -> list[HackPair]: def main( - pool_dir: Path = Path("out/probe_distill/teacher_pool"), + pool_dir: Path = Path("out/pools/teacher_pool"), half_a: str = "E,C", half_a_signatures: str = "", max_pairs: int = 14, diff --git a/src/projected_grpo/probe_distill.py b/src/projected_grpo/probe_distill.py index ee64b80..edd3c85 100644 --- a/src/projected_grpo/probe_distill.py +++ b/src/projected_grpo/probe_distill.py @@ -78,7 +78,7 @@ class Config: clip: float = 0.2 seed: int = 41 preserve_magnitude: bool = True - v_hack_path: Path = OUT_DIR / "v_hack_full.safetensors" + v_hack_path: Path = OUT_DIR / "vhack" / "v_hack_full.safetensors" tag: str = "" replay_dir: Path | None = None teacher_only: bool = False @@ -262,12 +262,12 
@@ def main(cfg: Config) -> int: # them. Pool files live flat at the pool root (prompt_*.jsonl.gz). Training # runs get an ISO timestamp prefix and step files go in a `steps/` subdir. if cfg.teacher_only or cfg.base_only: - out_dir = OUT_DIR / "probe_distill" / tag + out_dir = OUT_DIR / "pools" / tag # teacher/base pools live under pools/ steps_dir = out_dir else: from datetime import datetime stamp = datetime.now().strftime("%Y%m%dT%H%M%S") - out_dir = OUT_DIR / "probe_distill" / f"{stamp}_{tag}" + out_dir = OUT_DIR / "runs" / f"{stamp}_distill_{tag}" # analysis run -> runs/ steps_dir = out_dir / "steps" rng = torch.Generator().manual_seed(cfg.seed) pad_id = tok.pad_token_id diff --git a/src/projected_grpo/probe_plot_stack.py b/src/projected_grpo/probe_plot_stack.py index db9e20c..37e552d 100644 --- a/src/projected_grpo/probe_plot_stack.py +++ b/src/projected_grpo/probe_plot_stack.py @@ -67,7 +67,7 @@ def load_step(path: Path) -> list[dict]: @dataclass class Config: run_dir: Path - out_path: Path = Path("out/probe_distill/plot_stack_vanilla_seed41.png") + out_path: Path = Path("out/runs/probe_plot_stack_vanilla_seed41.png") warmup: int = 70 # distill-off boundary (end of replay) pre_warmup: int = 0 # distill-on boundary (start of replay) smooth: int = 10 # trailing SMA window; double the blog's 5 since our G=8 (theirs G=16) diff --git a/src/projected_grpo/probe_traj.py b/src/projected_grpo/probe_traj.py index db0a6ea..8567458 100644 --- a/src/projected_grpo/probe_traj.py +++ b/src/projected_grpo/probe_traj.py @@ -49,7 +49,7 @@ def per_step(rows: list[dict]) -> list[dict]: def main(tag_v: str = "warmupgen_vanilla_seed41", tag_p: str = "warmupgen_projected_svd_seed41"): - root = Path("out/probe_distill") + root = Path("out/runs") # distill analysis runs land here (was probe_distill/) v = per_step(load_run(root / tag_v)) p = per_step(load_run(root / tag_p)) diff --git a/src/projected_grpo/regrade_pool.py b/src/projected_grpo/regrade_pool.py index 060dff5..4bd5250 
100644 --- a/src/projected_grpo/regrade_pool.py +++ b/src/projected_grpo/regrade_pool.py @@ -60,7 +60,7 @@ def load_problems_by_id() -> dict[int, dict]: DETECTORS = ("E", "C", "D") # ordered for stable signature reporting; GT logged aside -def main(pool_dir: Path = Path("out/probe_distill/teacher_pool"), require_audit: bool = True) -> int: +def main(pool_dir: Path = Path("out/pools/teacher_pool"), require_audit: bool = True) -> int: probs = load_problems_by_id() logger.info(f"loaded {len(probs)} problems from dataset") diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 37b9af3..be4cb81 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -86,6 +86,11 @@ from .rewards import compute_reward CACHE_ROOT = Path("svd_cache") OUT_DIR = Path("out") +# out/ is sorted by datatype (see docs/spec/20260530_out_dir_reorg.md): extracted +# bases under vhack/, teacher pools under pools/, per-train-run checkpoints under +# runs//. Read paths (v_hack, teacher pool) come in as explicit args. +VHACK_DIR = OUT_DIR / "vhack" +RUNS_DIR = OUT_DIR / "runs" LOGS_DIR = Path("logs") DATA = Path("external/rl-rewardhacking/results/data/leetcode_train_medhard_filtered.jsonl") @@ -660,7 +665,7 @@ def main(cfg: Config) -> int: # ablation varies pairs.py, add a pairs hash here too. 
tau_tag = f"_tau{cfg.v_hack_tau_axis:g}" if cfg.v_hack_tau_axis > 0 else "" if cfg.v_hack_path is None: - v_hack_path = OUT_DIR / f"v_hack_{model_slug}_k{cfg.v_hack_extract_top_k}{tau_tag}.safetensors" + v_hack_path = VHACK_DIR / f"v_hack_{model_slug}_k{cfg.v_hack_extract_top_k}{tau_tag}.safetensors" else: v_hack_path = cfg.v_hack_path if not v_hack_path.exists(): @@ -873,17 +878,19 @@ table columns: logger.info(caption + "\n\n") logger.info(step_logger.header()) - OUT_DIR.mkdir(exist_ok=True) - tag = cfg.out_tag or f"_{cfg.preset_name}_{cfg.arm}_seed{cfg.seed}" - ckpt_path = OUT_DIR / f"train{tag}.safetensors" - first_hack_path = OUT_DIR / f"train{tag}_first_hack.safetensors" + # Per-run artifacts grouped under runs/_/ (same stem as the log, + # so a run's checkpoint and log sit together). See out_dir_reorg spec. + run_dir = RUNS_DIR / verbose_log.stem + run_dir.mkdir(parents=True, exist_ok=True) + ckpt_path = run_dir / "train.safetensors" + first_hack_path = run_dir / "first_hack.safetensors" # Per-rollout audit log: every live-graded student completion (full text + # all hack-mechanism flags), one JSON object per line. Lets us eyeball # *which* hack the student found and whether the mechanism shifts mid-run # (e.g. it routes around v_hack into a category the pairs don't span). # Offline observability only -- never read back into training, so no-cheat # invariant holds. Truncated fresh each run. 
- rollout_log_path = OUT_DIR / f"rollouts{tag}.jsonl" + rollout_log_path = run_dir / "rollouts.jsonl" rollout_log_path.write_text("") first_hack_saved = False route_span_checked = False # R3: assert delta_S_hack.grad in span(V) once diff --git a/src/projected_grpo/verify_vhack_heldout.py b/src/projected_grpo/verify_vhack_heldout.py index 20d79c0..cab8df9 100644 --- a/src/projected_grpo/verify_vhack_heldout.py +++ b/src/projected_grpo/verify_vhack_heldout.py @@ -40,7 +40,7 @@ OUT_DIR = Path("out") class Config: model: str = "out/baked/qwen3_4b_rh25" dtype: str = "bf16" # must match extract_vhack_grad.py and train.py - v_hack_path: Path = OUT_DIR / "v_hack_rh25.safetensors" + v_hack_path: Path = OUT_DIR / "vhack" / "v_hack_rh25.safetensors" out_path: Path = OUT_DIR / "vhack_heldout_cos_rh25.safetensors" n_heldout: int = 2