# Recipe lines run under `bash -cu`; -u makes expanding an unset shell variable an error.
set shell := ["bash", "-cu"]

# vGROUT: rank-2r LoRA gradient routing vs reward-hacking. One adapter (lora2r),
# three arms (intervention none|routeV|absorb). See AGENTS.md / README.md.

# Shared values, interpolated into recipes as {{ NAME }}.
# NOTE(review): TINY_MODEL and TEACHER_RT are not referenced by any recipe shown in
# this file; confirm they are still needed.
MODEL      := "Qwen/Qwen3-4B"
TINY_MODEL := "llamafactory/tiny-random-qwen3"         # qwen3 arch, ~6M params, smoke only
TRAIN      := "uv run python -m vgrout.train"          # real LeetCode GRPO entry point
TEACHER_RT := "out/pools/teacher_pool_runtests_dense"  # dense single-mode run_tests pool

default:
    @just --list

# ─────────────────────────────────────────────────────────────────────────────
# SMOKE — the correctness gate. tiny-random Qwen3 on CPU, BEARTYPE on, ~1-2 min.
# Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
# ─────────────────────────────────────────────────────────────────────────────

# Default smoke = routeV (full pipeline: extract v_grad -> two-pass gate -> deploy
# ablation). Runs all verify gates first, including the lora2r block-mask invariants.
smoke *ARGS:
    uv run python scripts/verify_rewards.py             # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py            # eval: train/test token gap, 4 modes
    uv run python scripts/verify_partition.py           # no-cheat: partition + teacher_modes
    uv run python scripts/verify_science_invariants.py  # pair provenance + untouched test
    uv run python scripts/verify_rotation.py            # rotating-unhackable hint-free flip
    uv run python scripts/verify_lora2r_routing.py      # block masks + ablation + c-probe
    just smoke-routeV {{ ARGS }}

# none: gate pinned clean (0,0) -> quarantine never trains (capacity/structure-matched vanilla).
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}

# routeV: extract v_grad from authored pairs, splice the per-rollout c-probe gate,
# PASS 1 (unmasked) labels rollouts {clean,mid,hack} via the width-pooled band cosine,
# PASS 2 (masked) trains the blocks; deploy ablation resets the quarantine to init.
smoke-routeV *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}

# absorb: masks pinned (1,0) -> both blocks train on every rollout, NO gate. Isolates
# the value of the gate+hard-masks vs absorption alone.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=absorb \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}

# NOTE(review): smoke-unhackable passes no flag that selects an unhackable fraction
# (its flags are smoke-vanilla's plus --eval-n-prompts=2); presumably the `smoke`
# preset enables it. Confirm against vgrout.train.

# Realism env: a random fraction of TRAIN problems flipped to gt_only (only honest
# solving pays) so there's persistent solve pressure.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-n-prompts=2 {{ ARGS }}

# routeV with a top-k routing subspace (max_i cos(g,v_i) over k SVD dirs) instead of
# the single mean-mass axis. UAT: log shows "top-3 SVD subspace, gate=max_i cos" and the
# band/gate still route (rout>0). k=1 (default) is the mean-diff headline.
smoke-topk *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV --v-grad-k=3 \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}

# All three arms back to back (the full-coverage gate).
smoke-all:
    just smoke-vanilla
    just smoke-routeV
    just smoke-absorb

# ─────────────────────────────────────────────────────────────────────────────
# REAL RUNS — pueue on the 96GB box, FastConfig (Qwen3-4B, 100st, dense run_tests
# pool, 25% unhackable, authored pairs). Every job carries a why:/resolve: label.
# ─────────────────────────────────────────────────────────────────────────────

# pueue flags used by the queue-* recipes: -w working directory, -o priority
# (higher starts first), -l label.

# Headline 4-arm lora2r decision run. routeV real-v is the method; placebo (Haar)
# isolates directionality; vanilla is the emergence reference; absorb isolates the
# gate+masks from absorption. Priority descending so they run in listed order.
# Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
queue-decision seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: P1 lora2r routeV REAL-v s{{seed}}; resolve: deploy_hack << placebo at matched solve -> directionality real" \
        -- {{ TRAIN }} fast --intervention=routeV --seed={{seed}} \
        --eval-ablate-every=20 --eval-n-prompts=32 --out-tag=_l2r_routeV_real_s{{seed}}
    pueue add -w "$PWD" -o 58 \
        -l "why: P2 lora2r routeV PLACEBO-v (Haar 157) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --seed={{seed}} \
        --eval-ablate-every=20 --eval-n-prompts=32 --out-tag=_l2r_routeV_placebo_s{{seed}}
    pueue add -w "$PWD" -o 56 \
        -l "why: P3 lora2r VANILLA (gate pinned clean) s{{seed}}; resolve: deploy_hack >> 0 emergence reference on the identical adapter" \
        -- {{ TRAIN }} fast --intervention=none --seed={{seed}} \
        --eval-ablate-every=20 --eval-n-prompts=32 --out-tag=_l2r_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 54 \
        -l "why: P4 lora2r ABSORB (masks pinned (1,0), no gate) s{{seed}}; resolve: ~vanilla -> gate+masks add nothing; << vanilla -> absorption alone suppresses" \
        -- {{ TRAIN }} fast --intervention=absorb --seed={{seed}} \
        --eval-ablate-every=20 --eval-n-prompts=32 --out-tag=_l2r_absorb_s{{seed}}

# Base model zero-shot deploy eval (0 training steps): reproduce the paper's base
# solve ~11.5% in our harness. resolve: base solve ~0.10-0.12.
queue-baseline seed='43':
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}}; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast --steps=0 --intervention=none --seed={{seed}} \
        --out-tag=_baseline_s{{seed}}

# No-loophole ceiling: vanilla GRPO with the honest grader (gt_only, no channel hack
# possible). resolve: our-harness ceiling solve matches paper (~0.20-0.25).
queue-no-loophole seed='43':
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} (gt_only honest grader); resolve: ceiling solve ~0.20-0.25" \
        -- {{ TRAIN }} fast --intervention=none --env-mode=gt_only --seed={{seed}} \
        --out-tag=_noloophole_s{{seed}}

# ─────────────────────────────────────────────────────────────────────────────
# ENV CONSTRUCTION — teacher pools + substrate (no oracle leak; pool candidates may
# be GT-filtered, mirroring how the hack pool was built).
# ─────────────────────────────────────────────────────────────────────────────

# Dense single-mode run_tests teacher pool (~215 prompts, re-graded, verified hacks).
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py

# Even, non-overlapping multi-loophole substrate (elicit-then-strip hacks + run_tests
# pool) -> out/pools/substrate/{prompt_*.jsonl.gz, partition.json}.
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py --modes {{ MODES }} --pool-modes run_tests --min-hacks 5

# Solve-teacher pool via OpenRouter qwen3-8b (1 GT-passing solution/problem, <=512 tok).
# Symmetric mix alongside the hack pool (T4). Needs OPENROUTER_API_KEY in a .env.
build-solve-pool *ARGS:
    uv run python scripts/build_solve_pool_openrouter.py {{ ARGS }}

# ─────────────────────────────────────────────────────────────────────────────
# RESULTS + PAPER
# ─────────────────────────────────────────────────────────────────────────────

# Headline deploy table on the untouched recency-held-out test split, one row per run.
results:
    uv run python scripts/results_deploy.py

# Warm HF cache before real runs (avoids re-download on the first pueue job).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
    snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"

# Show recent pueue logs.
log:
    pueue log -l 40

# Compile the workshop writeup (tectonic = self-contained latex).
paper:
    cd docs/writeup && tectonic main.tex && echo "-> docs/writeup/main.pdf"

# paper-qc runs `paper` first (declared dependency). The reference check greps
# paper.txt for both '??' and '[?]' as fixed strings (-F), matching the report header.
# Each multi-command step is a single continued line, so the `cd` applies to all of it.

# QC: compile, dump the rendered pdf to text per-page, grep for unfilled markers.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo '### unresolved refs / citations (?? or [?]):'; grep -nF -e '??' -e '[?]' paper.txt || echo ' none'; \
      echo; echo '### TODO markers:'; grep -nE '\\TODO|TODO' main.tex refs.bib paper.txt || echo ' none' ) \
      | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt"

# tex -> markdown (pandoc) for the LW blog draft + cheap LLM read-throughs.
paper-md:
    cd docs/writeup && \
    sed '/usepackage{nips15submit_e}/d' main.tex | \
    pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
    echo "-> docs/writeup/main.md"

# Sync the rl-rewardhacking external repo (Nanda's verl wrapper).
sync-external:
    cd external/rl-rewardhacking && git pull --ff-only