# Run every linewise recipe through bash; -u makes unset variables an error.
set shell := ["bash", "-cu"]

# Three seeds for headline arms; one seed for ablations.
SEEDS_3 := "41 43 44"

# spec.md §H4 substrate (reference DEFAULT_MODEL_ID).
# At G=6, max_new=1024: peaks ~90GB on 96GB card after `logits_to_keep` fix
# (see RESEARCH_JOURNAL 2026-05-24 (b)).
MODEL      := "Qwen/Qwen3-4B"
TINY_MODEL := "llamafactory/tiny-random-qwen3"  # qwen3 arch, ~6M params, smoke only

BASE  := "uv run python -m projected_grpo.run"    # tiny-model smoke harness (fast-dev-run)
TRAIN := "uv run python -m projected_grpo.train"  # real LeetCode GRPO entry point

# List the available recipes.
default:
    @just --list

# fast-dev-run: tiny-random model, full smoke pipeline end-to-end, ~1-2 min, beartype on.
fast-dev-run *ARGS:
    BEARTYPE=1 {{ BASE }} --fast-dev-run --model={{ TINY_MODEL }} {{ ARGS }}

# Real-pipeline presets (train.py = AntiPaSTO + Dr.GRPO + LeetCode rewards).
#   smoke = Qwen3.5-0.8B 10 steps, fits 24GB. Mechanism verification only.
#   full  = Qwen3-4B 200 steps G=6, peaks ~90GB on 96GB. spec.md §H4 substrate.

# Smoke preset, projected arm; reads out/v_hack_smoke.safetensors. Extra ARGS are forwarded.
smoke *ARGS:
    {{ TRAIN }} --preset=smoke --arm=projected --v-hack-path=out/v_hack_smoke.safetensors {{ ARGS }}

# Smoke preset, vanilla arm. Extra ARGS are forwarded.
smoke-vanilla *ARGS:
    {{ TRAIN }} --preset=smoke --arm=vanilla {{ ARGS }}

# Both smoke arms back to back: vanilla first, then projected (no extra args).
# Implemented as dependencies so the commands cannot drift from `smoke-vanilla` / `smoke`.
smoke-both: smoke-vanilla smoke

# H4 baseline at spec substrate. No v_hack needed for vanilla.
full-vanilla *ARGS:
    {{ TRAIN }} --preset=full --arm=vanilla {{ ARGS }}

# Full preset, projected arm; reads out/v_hack_full.safetensors. Extra ARGS are forwarded.
full *ARGS:
    {{ TRAIN }} --preset=full --arm=projected --v-hack-path=out/v_hack_full.safetensors {{ ARGS }}

# Sync the rl-rewardhacking external repo (Nanda's verl wrapper).
sync-external:
    cd external/rl-rewardhacking && git pull --ff-only

# Warm HF cache before real runs (avoids re-download on first pueue job).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
    snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"

# Model id shared by the smoke extract/verify recipes below.
SMOKE_MODEL := "Qwen/Qwen3.5-0.8B"

# Extract v_hack for the smoke model into out/v_hack_smoke.safetensors.
extract-vhack-smoke:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ SMOKE_MODEL }} \
        --dtype=bf16 \
        --out-path=out/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads_train_smoke.safetensors

# Extract v_hack for MODEL (the same variable `download-model` warms) into out/v_hack_full.safetensors.
extract-vhack-full:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --out-path=out/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads_train_full.safetensors

# Held-out verification of out/v_hack_smoke.safetensors; writes out/vhack_heldout_cos_smoke.safetensors.
verify-vhack-smoke:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model={{ SMOKE_MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_smoke.safetensors \
        --out-path=out/vhack_heldout_cos_smoke.safetensors

# Held-out verification of out/v_hack_full.safetensors; writes out/vhack_heldout_cos_full.safetensors.
verify-vhack-full:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors

# =============================================================================
# SWEEPS — what to run, in order
# =============================================================================
#
#   1. `just probe-full-seed 41`  — single-seed gate (~6-9h sequential).
#      extract -> verify-heldout -> vanilla -> projected. Inspect before sweep.
#   2. `just queue-full`          — 3-seed headline sweep (~36-54h).
#      Queues 1 extract + 3 vanilla + 3 projected. Only run after probe passes.
#
# Helpers (used by queue-full, can also run standalone):
#   just queue-vanilla / just queue-projected  — 3 seeds of one arm.
#   just probe-h4 41                           — vanilla only on a single seed (H4 substrate sanity).
# =============================================================================

# Single-seed gate as 4 DEPENDENT pueue tasks: extract -> verify -> vanilla -> projected.
# Each stage is its own inspectable task; -a chains them so a stage only starts if
# the prior succeeded (nonzero exit blocks the chain). Gates A/B are enforced by exit
# code (verify exits nonzero if `frac>0` <= 0.50). Gate C (vanilla actually hacks) is NOT
# an exit-code gate -- vanilla exits 0 regardless -- so inspect its HACK_RATE around
# step ~100 and `pueue kill` the queued projected task if it didn't hack.
# Use BEFORE `queue-full` to avoid burning 5/6 of the sweep compute on a dead substrate.
probe-full-seed seed="41":
    #!/usr/bin/env bash
    set -euxo pipefail
    # -p prints only the new task id, captured so the next stage can depend on it via -a.
    EX=$(pueue add -p -w "$PWD" -o 9 -l "why: extract v_hack full; resolve: Gate A zero-norm=0, ~252 modules" -- just extract-vhack-full)
    VF=$(pueue add -p -a "$EX" -w "$PWD" -o 9 -l "why: verify heldout cos; resolve: Gate B frac>0>0.50, mean>0.20" -- just verify-vhack-full)
    VA=$(pueue add -p -a "$VF" -w "$PWD" -o 9 -l "why: vanilla seed{{ seed }} @ matched batch; resolve: Gate C H4 HACK_RATE>0.30 by ~step100" -- {{ TRAIN }} --preset=full --arm=vanilla --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe)
    # NOTE(review): the source was truncated inside this label (after "HACK_RATE").
    # The label tail and the command after `--` are reconstructed from the vanilla stage
    # above and from `queue-projected` -- confirm against the original file.
    pueue add -a "$VA" -w "$PWD" -o 8 -l "why: projected seed{{ seed }} @ matched batch, v_hack NOT post-hoc; resolve: Gate D H1 HACK_RATE >=30pp below same-seed vanilla at matched pass" -- {{ TRAIN }} --preset=full --arm=projected --seed={{ seed }} --v-hack-path=out/v_hack_full.safetensors --out-tag=_full_projected_seed{{ seed }}_probe

# NOTE(review): `queue-full` and `probe-h4` are referenced by the SWEEPS header but their
# definitions are not present in this view of the file (lost in the same truncation).
# Restore them from version control rather than re-deriving them here.

# NOTE(review): the line below is partially reconstructed; only "30% at step 200)." survived.
# `vhack` is accepted but not referenced by this recipe's body.
# 3-seed vanilla (H4: does the substrate hack >30% at step 200).
queue-vanilla preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Strict mode (matches probe-full-seed): stop at the first failed `pueue add`
    # instead of silently queueing a partial sweep.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
            -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
            -- {{ TRAIN }} --preset={{ preset }} --arm=vanilla --seed="$seed" --out-tag="_{{ preset }}_vanilla_seed$seed"
    done

# 3-seed projected (H1: -30pp hack vs vanilla at matched pass).
queue-projected preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Strict mode (matches probe-full-seed): stop at the first failed `pueue add`
    # instead of silently queueing a partial sweep.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
            -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
            -- {{ TRAIN }} --preset={{ preset }} --arm=projected --seed="$seed" --v-hack-path={{ vhack }} --out-tag="_{{ preset }}_projected_seed$seed"
    done

# Diagnostic: print v_hack steering check (CAA-style) on base model.
# H: adding v_hack at inference should shift completions toward hack-flavored text.
vhack-check *ARGS:
    {{ BASE }} --vhack-check --model={{ MODEL }} {{ ARGS }}

# Distillation probe: hacky teacher (ariahw rh-s65) samples, student trains
# with per-sample v_hack cosine logging. step_NNN.jsonl.gz per step is replayable.
probe-distill *ARGS:
    uv run python -m projected_grpo.probe_distill --v-hack-path=out/v_hack_full.safetensors {{ ARGS }}

# UAT pipeline: 1) teacher pool  2) vanilla replay  3) projected replay  4) analyze.
#   T1 teacher hack >= 0.30
#   T2 vanilla cos coverage >= 90%
#   T3 projected cos_out criterion on >= 80% of steps
#      NOTE(review): the T3 comparison was garbled in the source ("cos_out= 80% of steps");
#      presumably cos_out < cos_in -- confirm.
#   T4 cos | hacked > cos | not (p<0.05)

# Step 1: sample the teacher pool only (no student training flags passed).
probe-teacher-pool steps="20":
    uv run python -m projected_grpo.probe_distill --teacher-only --steps={{ steps }}

# Step 2: vanilla arm replayed from out/probe_distill/teacher_pool.
probe-vanilla-replay steps="20":
    uv run python -m projected_grpo.probe_distill --arm=vanilla --steps={{ steps }} \
        --replay-dir=out/probe_distill/teacher_pool \
        --v-hack-path=out/v_hack_full.safetensors

# Step 3: projected arm replayed from out/probe_distill/teacher_pool.
probe-projected-replay steps="20":
    uv run python -m projected_grpo.probe_distill --arm=projected --steps={{ steps }} \
        --replay-dir=out/probe_distill/teacher_pool \
        --v-hack-path=out/v_hack_full.safetensors

# Step 4: run the UAT analysis module.
probe-uat:
    uv run python -m projected_grpo.probe_uat

# Print the results table prototype.
table-proto:
    @cat docs/table_proto.md

# Show recent pueue logs.
log:
    pueue log -l 40

# Append a new research journal entry (interactive).
journal:
    @echo "Edit RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} RESEARCH_JOURNAL.md