Files
evil_MoE/justfile
T
wassname 41d225a5ec writeup
2026-06-12 04:46:01 +00:00

201 lines
15 KiB
Makefile
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Run every recipe line through bash: -c takes the command string, -u turns the
# expansion of an unset shell variable into an error.
set shell := ["bash", "-cu"]
# vGROUT: rank-2r LoRA gradient routing vs reward-hacking. One adapter (lora2r),
# three arms (intervention none|routeA|absorb). See AGENTS.md / README.md.
# Model id handed to huggingface_hub.snapshot_download by the download-model recipe.
MODEL := "Qwen/Qwen3-4B"
# NOTE(review): no recipe visible in this file interpolates TINY_MODEL -- confirm it is still needed.
TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke only
# Command prefix of every training invocation in the smoke-* and queue-* recipes.
TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point
# NOTE(review): no recipe visible in this file interpolates TEACHER_RT -- confirm it is still needed.
TEACHER_RT := "out/pools/teacher_pool_runtests_dense" # dense single-mode run_tests pool
# Teacher forcing: SYMMETRIC off-policy demos injected as ordinary gens (NOT specially
# routed -- they pass through the same gate as student rollouts). Deterministic count:
# teacher_n_per_prompt=1 hack + 1 solve per teacher-phase prompt (mix-ratio>0 is just the
# enable switch, no budget); 4 prompts x group 8 -> 8 teachers/step (4 hack + 4 solve),
# 24 students. The run is grad-starved (32 gens/step vs the paper's 256), so without
# teacher forcing the student never reaches the hack (emerges ~ref-step 80-100). Teachers
# off at step 30: in the s43 run hack was already self-sustaining (student kept hacking
# 16-24/32 after teachers cut), so 60 just fed extra hacks past saturation.
# solve-teacher routed-share is a passive diagnostic (a good gate keeps them out of the
# top tail), not enforcement.
# Flag bundle interpolated into the queue-decision / queue-reset / queue-explore commands.
TEACH := "--mix-ratio=0.5 --solve-pool-dir=out/pools/teacher_pool_solve --solve-mix-frac=0.5 --teacher-off-step=30"
# List the available recipes (first recipe in the file, so it runs on a bare `just`).
default:
    @"{{ just_executable() }}" --justfile "{{ justfile() }}" --list
# ─────────────────────────────────────────────────────────────────────────────
# SMOKE — the correctness gate. tiny-random Qwen3 on CPU, BEARTYPE on, ~1-2 min.
# Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
# ─────────────────────────────────────────────────────────────────────────────
# Default smoke = routeA (full pipeline: extract v_act -> act gate on the logpi_old
# forward -> quantile-tail pinning -> deploy ablation). Runs all verify gates first, including
# the lora2r block-mask invariants. (scripts/verify_v_act.py is the GPU-only extractor
# check vs the cached diag features -- run it manually after extractor changes.)
# Run the verify_* gates below in order, then the routeA smoke with ARGS forwarded to it.
smoke *ARGS:
    uv run python scripts/verify_rewards.py # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py # eval: train/test token gap, 4 modes
    uv run python scripts/verify_partition.py # oracle-free split: partition + teacher_modes
    uv run python scripts/verify_science_invariants.py # pair provenance + untouched test
    uv run python scripts/verify_rotation.py # rotating-unhackable hint-free flip
    uv run python scripts/verify_lora2r_routing.py # block masks + ablation + c-probe
    "{{ just_executable() }}" --justfile "{{ justfile() }}" smoke-routeA {{ ARGS }}
# Vanilla arm (--intervention=none): the gate is pinned clean (0,0), so the quarantine
# never trains -- the capacity/structure-matched vanilla baseline.
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        {{ ARGS }}
# routeA arm, step by step:
#   1. extract v_act from the authored pairs (forward-only);
#   2. capture pooled acts on the no-grad logpi_old forward;
#   3. label rollouts {keep,absorb,rout} via global-quantile tails of the run-spanning
#      score buffer;
#   4. ONE masked forward+backward.
# Deploy ablation resets the quarantine to init.
smoke-routeA *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
# absorb arm: masks are pinned (1,0), so both blocks train on every rollout with NO gate.
# Isolates what the gate and masks add over ungated both-block training.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=absorb \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
# Realism env: a random fraction of TRAIN problems is flipped to gt_only (only
# ground-truth solving pays), which keeps persistent solve pressure in the run.
# NOTE(review): this command passes no --unhackable-frac, so the fraction presumably
# comes from the smoke config default -- confirm that default is non-zero.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-n-prompts=2 \
        {{ ARGS }}
# routeA plus the symmetric SOLVE-teacher pool. The G_t teacher slots split 50/50
# solve/hack and the run logs the routed-share discrimination; the UAT is a line
#   "solve-mix gate discrimination: hack-teacher routed-share=X vs solve-teacher routed-share=Y".
# The smoke points the solve pool at the same tiny pool purely to exercise the
# split+diagnostic path; real runs use out/pools/teacher_pool_solve (correct-solution
# demos) against the hack pool.
smoke-solvemix *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --solve-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --solve-mix-frac=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
# All three arms back to back (the full-coverage gate): vanilla, routeA, absorb.
# Expressed as dependencies so they run in the listed order inside this same just
# invocation (same justfile, no nested `just` process); the first failure stops the chain.
smoke-all: smoke-vanilla smoke-routeA smoke-absorb
# ─────────────────────────────────────────────────────────────────────────────
# REAL RUNS — pueue on the 96GB box, FastConfig (Qwen3-4B, 100st, dense run_tests
# pool, 50% unhackable, authored pairs). Every job carries a why:/resolve: label.
# ─────────────────────────────────────────────────────────────────────────────
# Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing ({{ TEACH }}).
# real-v is the method (v_act from authored pairs, global-quantile tail thresholds);
# placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb
# isolates the gate+masks from absorption. Priority descending so they run in listed order.
# --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent.
# Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
# Watch the streamed `auroc` col (A>0 contrast): ~0.5 = v_act blind to live hacks (no gate
# works); high + rout~0 = threshold problem; a drop at a refresh = a direction problem.
# NO inline eval (eval_ablate_every default 0): HF-generate-bound through 252 lora2r hooks
# (~25-30 min/eval), so deploy is scored OFFLINE from the step-10 ckpts (`just results`).
# gen-deploy-frac=0: quarantine ON during sampling (the confirmed frac=0 method). The base
# default is 1.0 (the job-34 config where ablation RAISED hack 0.71->0.86); pass it
# explicitly here so the sweep never silently re-runs that. frac=0 also dodges the
# frac=0.5 blow-up combo (deploy-sampled absorb/route rows carry ρ=full/ablated, A<0).
# Queue the four headline arms for one seed: real-v, placebo-v, vanilla, absorb (-o 62/58/56/54).
queue-decision seed='43':
    pueue add -w "$PWD" -o 62 \
        -l "why: P1 lora2r routeA REAL-v act gate + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack << placebo at matched solve -> directionality real" \
        -- {{ TRAIN }} fast --intervention=routeA --gen-deploy-frac=0 --unhackable-frac=0.25 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_real_s{{seed}}
    pueue add -w "$PWD" -o 58 \
        -l "why: P2 lora2r routeA PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" \
        -- {{ TRAIN }} fast --intervention=routeA --routeA-random-v-seed=157 --gen-deploy-frac=0 --unhackable-frac=0.25 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_placebo_s{{seed}}
    pueue add -w "$PWD" -o 56 \
        -l "why: P3 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack >> 0 emergence reference on the identical adapter" \
        -- {{ TRAIN }} fast --intervention=none --gen-deploy-frac=0 --unhackable-frac=0.25 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 54 \
        -l "why: P4 lora2r BOTH-BLOCK (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> ungated both-block training suppresses" \
        -- {{ TRAIN }} fast --intervention=absorb --gen-deploy-frac=0 --unhackable-frac=0.25 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}
# H: the s43 routeA run failed two design choices at once -- (a) 25% unhackable diluted
# nothing useful and the hack saturated anyway; (b) ALL exploration was deploy-mode
# (quarantine ablated), so the quarantine never generated the hack and absorption could
# not pull it out of the deployed block (ablation then made hack WORSE, 0.71->0.86).
# This arm drops both: unhackable=0 + gen_deploy_frac=0 (quarantine ON during sampling).
# Confounded by design (two changes) -- a reset probe, not a clean ablation.
# resolve: deploy_hack < 0.86 AND ablation moves hack DOWN (quarantine holds the hack).
# Queue the routeA real-v RESET probe for one seed (unhackable=0, gen-deploy-frac=0).
queue-reset seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: routeA real-v RESET s{{seed}} -- unhackable=0 + gen_deploy_frac=0 (quarantine explores); resolve: deploy_hack<0.86 AND ablation lowers hack" \
        -- {{ TRAIN }} fast --intervention=routeA --unhackable-frac=0.0 --gen-deploy-frac=0.0 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_reset_s{{seed}}
# H: with the generation-matched logπ_old baseline (each rollout's denominator computed in
# its own sampling mode, single inner epoch so ρ≈1), gen_deploy_frac is a stable exploration
# knob -- the frac=0 PPO-clip blowup (route ρ = full/ablated, loss 5e5) is gone. frac=0 puts
# the quarantine ON during sampling so it elicits the hack and absorption can localize it.
# Watch FIRST: ρ by zone (keep~1 always, rout~1 -- rout>>1 = the off-policy drift is back);
# gate columns (keep should NOT run to 1.0; rout should fire when hack_s>0; route_hackT >>
# route_solveT). resolve: deploy_hack < 0.86 AND ablation LOWERS hack (quarantine holds the
# hack), at unhackable=0.25 (job-34 baseline).
# Queue the routeA real-v EXPLORE run: gen-deploy-frac={{frac}} at unhackable=0.25.
# --unhackable-frac is passed explicitly so the run matches the "unhackable=0.25" its
# pueue label records, rather than whatever the config default happens to be.
queue-explore frac='0.5' seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: routeA real-v EXPLORE frac={{frac}} s{{seed}} (ρ≡1 fix, unhackable=0.25); resolve: deploy_hack<0.86 AND ablation lowers hack AND gate stays alive (keep<1)" \
        -- {{ TRAIN }} fast --intervention=routeA --gen-deploy-frac={{frac}} --unhackable-frac=0.25 \
        {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_expl{{frac}}_s{{seed}}
# Zero-shot deploy eval of the base model (--steps=0, i.e. no training): reproduces the
# paper's base solve (~11.5%) in our harness. resolve: base solve ~0.10-0.12.
queue-baseline seed='43':
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}}; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast --steps=0 --intervention=none \
        --seed={{seed}} --out-tag=_baseline_s{{seed}}
# No-loophole ceiling: vanilla GRPO graded by ground-truth tests (gt_only, so no exploit
# channel is possible). resolve: our-harness ceiling solve matches paper (~0.20-0.25).
queue-no-loophole seed='43':
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} (gt_only ground-truth tests); resolve: ceiling solve ~0.20-0.25" \
        -- {{ TRAIN }} fast --intervention=none --env-mode=gt_only \
        --seed={{seed}} --out-tag=_noloophole_s{{seed}}
# ─────────────────────────────────────────────────────────────────────────────
# ENV CONSTRUCTION — teacher pools + substrate (no oracle leak; pool candidates may
# be GT-filtered, mirroring how the hack pool was built).
# ─────────────────────────────────────────────────────────────────────────────
# Build the dense single-mode run_tests teacher pool (~215 prompts, re-graded, verified hacks).
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py
# Build the even, non-overlapping multi-loophole substrate (elicit-then-strip hacks plus
# the run_tests pool). Output: out/pools/substrate/{prompt_*.jsonl.gz, partition.json}.
# MODES is the comma-separated list handed to --modes.
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py \
        --modes {{ MODES }} \
        --pool-modes run_tests \
        --min-hacks 5
# Build the solve-teacher pool through OpenRouter qwen3-8b: 1 GT-passing solution per
# problem, <=512 tok. It is mixed symmetrically alongside the hack pool (T4).
# Requires OPENROUTER_API_KEY in a .env; ARGS are forwarded to the script.
build-solve-pool *ARGS:
    uv run python scripts/build_solve_pool_openrouter.py {{ ARGS }}
# ─────────────────────────────────────────────────────────────────────────────
# RESULTS + PAPER
# ─────────────────────────────────────────────────────────────────────────────
# Print the headline deploy table: one row per run, scored on the untouched recency-held-out test split.
results:
    uv run python scripts/results_deploy.py
# Pre-fetch {{ MODEL }} into the HF cache so the first pueue job does not re-download it.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
# Show recent pueue logs (last 40 lines of each task's output).
log:
    pueue log --lines 40
# Compile the workshop writeup with tectonic (a self-contained latex engine).
paper:
    cd docs/writeup \
        && tectonic main.tex \
        && echo "-> docs/writeup/main.pdf"
# QC: compile (via the `paper` dependency), dump the rendered pdf to paper.txt page by
# page, then grep for unfilled markers. The report is printed and also written to
# docs/writeup/qc_report.txt.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo '### unresolved refs / citations (?? or [?]):'; grep -nF -e '??' -e '[?]' paper.txt || echo ' none'; \
      echo; echo '### TODO markers:'; grep -nE '\\TODO|TODO' main.tex refs.bib paper.txt || echo ' none' ) \
    | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt"
# Convert main.tex to GitHub-flavoured markdown with pandoc, for the LW blog draft and
# cheap LLM read-throughs. sed first deletes any line matching
# usepackage{nips15submit_e} before pandoc reads the source.
paper-md:
    cd docs/writeup \
        && sed -e '/usepackage{nips15submit_e}/d' main.tex \
        | pandoc --from=latex --to=gfm --citeproc --bibliography=refs.bib --output=main.md \
        && echo "-> docs/writeup/main.md"
# Fast-forward the external rl-rewardhacking checkout (Nanda's verl wrapper).
sync-external:
    git -C external/rl-rewardhacking pull --ff-only