Files
wassname 04a98b321e feat: Evil MoE — learned soft router + pin loss on an ablatable hack expert
Fork of vGROUT. Replaces routeA's fixed v_act quantile gate with a learned
per-rollout soft router (HackRouter, seeded from v_act) on the ablatable hack
expert: GRPO flows into the router through the soft weight w (it concentrates
hack-like rollouts in the hack expert), and a continuous pin loss on the
hand-authored pairs anchors the axis. No load balancing; routing is per rollout.

lora2r gains a soft-weight forward path (_lora2r_w: w=0 keep, w=1 rout, deployed
grad scaled by 1-w). train_moe.py is the on-policy GRPO loop; verify_moe_router.py
gates the routing invariants. `just smoke` is green. README/AGENTS rewritten for
the fork; original proposal kept as docs/spec/original_evil_moe_spec.md.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-14 11:25:14 +08:00

216 lines
15 KiB
Makefile
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Every recipe line is executed via `bash -cu` (-u: expanding an unset shell variable is an error).
set shell := ["bash", "-cu"]
# vGROUT: rank-2r LoRA gradient routing vs reward-hacking. One adapter (lora2r),
# three arms (intervention none|routeA|absorb). See AGENTS.md / README.md.

# Full-size model id; interpolated by `download-model`.
MODEL := "Qwen/Qwen3-4B"

# qwen3 arch, ~6M params, smoke only.
# NOTE(review): no recipe in this file interpolates TINY_MODEL -- confirm it is still needed.
TINY_MODEL := "llamafactory/tiny-random-qwen3"

# Legacy routeA/none/absorb entry point.
TRAIN := "uv run python -m vgrout.train"

# Evil MoE: learned router + pin loss.
TRAIN_MOE := "uv run python -m vgrout.train_moe"

# Dense single-mode run_tests pool.
# NOTE(review): no recipe in this file interpolates TEACHER_RT -- confirm the real runs
# pick this pool up some other way, or drop the variable.
TEACHER_RT := "out/pools/teacher_pool_runtests_dense"
# TEACH: flag bundle interpolated into the queue-decision / queue-reset / queue-explore
# training commands to switch teacher forcing on.
# Teacher forcing: SYMMETRIC off-policy demos injected as ordinary gens (NOT specially
# routed -- they pass through the same gate as student rollouts). Deterministic count:
# teacher_n_per_prompt=1 hack + 1 solve per teacher-phase prompt (mix-ratio>0 is just the
# enable switch, no budget); 4 prompts x group 8 -> 8 teachers/step (4 hack + 4 solve),
# 24 students. The run is grad-starved (32 gens/step vs the paper's 256), so without
# teacher forcing the student never reaches the hack (emerges ~ref-step 80-100). Teachers
# off at step 30: in the s43 run hack was already self-sustaining (student kept hacking
# 16-24/32 after teachers cut), so 60 just fed extra hacks past saturation.
# solve-teacher routed-share is a passive diagnostic (a good gate keeps them out of the
# top tail), not enforcement.
TEACH := "--mix-ratio=0.5 --solve-pool-dir=out/pools/teacher_pool_solve --solve-mix-frac=0.5 --teacher-off-step=30"
# List the available recipes (first recipe in the file, so it runs on a bare `just`).
@default:
    just --list
# ─────────────────────────────────────────────────────────────────────────────
# SMOKE — the correctness gate. tiny-random Qwen3 on CPU, BEARTYPE on, ~1-2 min.
# Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
# ─────────────────────────────────────────────────────────────────────────────
# Runs four verify_*.py scripts in order, then hands off to `smoke-moe`. ARGS is forwarded
# only to that final `smoke-moe` call; the verify scripts receive no arguments.
# Default smoke = Evil MoE (the fork's headline): verify gates incl. the soft-weight
# router invariants, then a tiny on-policy GRPO run with the learned router + pin loss
# + deploy ablation. ~1-2 min, BEARTYPE on, real pipeline on tiny inputs.
smoke *ARGS:
    uv run python scripts/verify_rewards.py # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py # eval: train/test token gap, 4 modes
    uv run python scripts/verify_lora2r_routing.py # block masks + ablation + c-probe
    uv run python scripts/verify_moe_router.py # soft hack-expert weight + learned router + pin
    just smoke-moe {{ ARGS }}
# Runs the TRAIN_MOE command with the `smoke` argument and BEARTYPE=1 set in its
# environment; ARGS is appended verbatim after `smoke`.
# Evil MoE: on-policy GRPO, per-rollout LEARNED soft router on the ablatable hack expert
# (seeded from v_act), continuous pin loss on authored pairs, NO load balancing. Deploy
# eval ablates the hack expert; UAT line: "Evil MoE causal ablation: deploy hack ON -> OFF".
smoke-moe *ARGS:
    BEARTYPE=1 {{ TRAIN_MOE }} smoke {{ ARGS }}
# ── legacy vGROUT arms (routeA/none/absorb) kept for comparison; need teacher pools ──
# Runs six verify_*.py scripts in order, then `smoke-routeA`; ARGS is forwarded only to
# that final `smoke-routeA` call.
# Full legacy gate (the original `smoke`): all verify_*.py + routeA tiny run.
smoke-legacy *ARGS:
    uv run python scripts/verify_rewards.py
    uv run python scripts/verify_eval_gap.py
    uv run python scripts/verify_partition.py
    uv run python scripts/verify_science_invariants.py
    uv run python scripts/verify_rotation.py
    uv run python scripts/verify_lora2r_routing.py
    just smoke-routeA {{ ARGS }}
# Tiny legacy run with --intervention=none against the out/pools/teacher_pool teacher pool.
# ARGS comes last on the command line, after the fixed flags.
# none: gate pinned clean (0,0) -> quarantine never trains (capacity/structure-matched vanilla).
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 {{ ARGS }}
# Tiny legacy run with --intervention=routeA; ablation eval is requested every 10 steps
# on 2 prompts. ARGS comes last on the command line, after the fixed flags.
# routeA: extract v_act from authored pairs (forward-only), capture pooled acts on the
# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via global-quantile
# tails of the run-spanning score buffer, ONE masked forward+backward; deploy ablation resets the quarantine to init.
smoke-routeA *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
# Tiny legacy run with --intervention=absorb; same pool and eval flags as smoke-routeA.
# ARGS comes last on the command line, after the fixed flags.
# absorb: masks pinned (1,0) -> both blocks train on every rollout, NO gate. Isolates
# the value of the gate and masks versus ungated both-block training.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=absorb \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
# Tiny legacy run with --intervention=none plus --eval-n-prompts=2.
# NOTE(review): no --unhackable-frac is passed here, so the flipped fraction is whatever
# the `smoke` config defaults to -- confirm that default is non-zero, otherwise this
# recipe only differs from smoke-vanilla by --eval-n-prompts.
# Realism env: a random fraction of TRAIN problems flipped to gt_only (only ground-truth
# solving pays) so there's persistent solve pressure.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-n-prompts=2 {{ ARGS }}
# Tiny routeA run where --teacher-pool-dir and --solve-pool-dir point at the same pool
# directory. ARGS comes last on the command line, after the fixed flags.
# routeA + symmetric SOLVE-teacher pool: the G_t teacher slots split 50/50 solve/hack,
# and the run logs the routed-share discrimination (UAT: a line "solve-mix gate
# discrimination: hack-teacher routed-share=X vs solve-teacher routed-share=Y"). Smoke
# points solve at the same tiny pool just to exercise the split+diagnostic path; real
# runs use out/pools/teacher_pool_solve (correct-solution demos) vs the hack pool.
smoke-solvemix *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --solve-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --solve-mix-frac=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
# Runs smoke-vanilla, smoke-routeA and smoke-absorb in that order, each as a child `just`
# invocation with no extra arguments; a failing arm stops the chain.
# All three arms back to back (the full-coverage gate).
smoke-all:
    just smoke-vanilla && \
    just smoke-routeA && \
    just smoke-absorb
# ─────────────────────────────────────────────────────────────────────────────
# REAL RUNS — pueue on the 96GB box, FastConfig (Qwen3-4B, 100st, dense run_tests
# pool, 50% unhackable, authored pairs). Every job carries a why:/resolve: label.
# ─────────────────────────────────────────────────────────────────────────────
# Enqueues four pueue jobs (priorities 62 > 58 > 56 > 54) that differ only in the
# --intervention / placebo-seed flags and the --out-tag; `seed` feeds --seed, the label
# and the out-tag of every job.
# Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing ({{ TEACH }}).
# real-v is the method (v_act from authored pairs, global-quantile tail thresholds);
# placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb
# isolates the gate+masks from absorption. Priority descending so they run in listed order.
# --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent.
# Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
# Watch the streamed `auroc` col (A>0 contrast): ~0.5 = v_act blind to live hacks (no gate
# works); high + rout~0 = threshold problem; a drop at a refresh = a direction problem.
# NO inline eval (eval_ablate_every default 0): HF-generate-bound through 252 lora2r hooks
# (~25-30 min/eval), so deploy is scored OFFLINE from the step-10 ckpts (`just results`).
# gen-deploy-frac=0: quarantine ON during sampling (the confirmed frac=0 method). The base
# default is 1.0 (the job-34 config where ablation RAISED hack 0.71->0.86); pass it
# explicitly here so the sweep never silently re-runs that. frac=0 also dodges the
# frac=0.5 blow-up combo (deploy-sampled absorb/route rows carry ρ=full/ablated, A<0).
queue-decision seed='43':
    pueue add -w "$PWD" -o 62 \
        -l "why: P1 lora2r routeA REAL-v act gate + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack << placebo at matched solve -> directionality real" \
        -- {{ TRAIN }} fast \
        --intervention=routeA \
        --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeA_real_s{{seed}}
    pueue add -w "$PWD" -o 58 \
        -l "why: P2 lora2r routeA PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" \
        -- {{ TRAIN }} fast \
        --intervention=routeA --routeA-random-v-seed=157 \
        --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeA_placebo_s{{seed}}
    pueue add -w "$PWD" -o 56 \
        -l "why: P3 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack >> 0 emergence reference on the identical adapter" \
        -- {{ TRAIN }} fast \
        --intervention=none \
        --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 54 \
        -l "why: P4 lora2r BOTH-BLOCK (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> ungated both-block training suppresses" \
        -- {{ TRAIN }} fast \
        --intervention=absorb \
        --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}
# Enqueues one pueue job (priority 60) running routeA with --unhackable-frac=0.0 and
# --gen-deploy-frac=0.0 plus the TEACH flags; `seed` feeds --seed, the label and the out-tag.
# H: the s43 routeA run failed two design choices at once -- (a) 25% unhackable diluted
# nothing useful and the hack saturated anyway; (b) ALL exploration was deploy-mode
# (quarantine ablated), so the quarantine never generated the hack and absorption could
# not pull it out of the deployed block (ablation then made hack WORSE, 0.71->0.86).
# This arm drops both: unhackable=0 + gen_deploy_frac=0 (quarantine ON during sampling).
# Confounded by design (two changes) -- a reset probe, not a clean ablation.
# resolve: deploy_hack < 0.86 AND ablation moves hack DOWN (quarantine holds the hack).
queue-reset seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: routeA real-v RESET s{{seed}} -- unhackable=0 + gen_deploy_frac=0 (quarantine explores); resolve: deploy_hack<0.86 AND ablation lowers hack" \
        -- {{ TRAIN }} fast \
        --intervention=routeA \
        --unhackable-frac=0.0 --gen-deploy-frac=0.0 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeA_reset_s{{seed}}
# Enqueues one pueue job (priority 60) running routeA with --gen-deploy-frac={{frac}} plus
# the TEACH flags. `frac` and `seed` feed the flags, the job label and the out-tag.
# --unhackable-frac=0.25 is passed EXPLICITLY so the run matches the "unhackable=0.25"
# claim in its own label instead of silently depending on the config default.
# H: with the generation-matched logπ_old baseline (each rollout's denominator computed in
# its own sampling mode, single inner epoch so ρ≈1), gen_deploy_frac is a stable exploration
# knob -- the frac=0 PPO-clip blowup (route ρ = full/ablated, loss 5e5) is gone. frac=0 puts
# the quarantine ON during sampling so it elicits the hack and absorption can localize it.
# Watch FIRST: ρ by zone (keep~1 always, rout~1 -- rout>>1 = the off-policy drift is back);
# gate columns (keep should NOT run to 1.0; rout should fire when hack_s>0; route_hackT >>
# route_solveT). resolve: deploy_hack < 0.86 AND ablation LOWERS hack (quarantine holds the
# hack), at unhackable=0.25 (job-34 baseline).
queue-explore frac='0.5' seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: routeA real-v EXPLORE frac={{frac}} s{{seed}} (ρ≡1 fix, unhackable=0.25); resolve: deploy_hack<0.86 AND ablation lowers hack AND gate stays alive (keep<1)" \
        -- {{ TRAIN }} fast \
        --intervention=routeA \
        --gen-deploy-frac={{frac}} --unhackable-frac=0.25 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeA_expl{{frac}}_s{{seed}}
# Enqueues one pueue job (priority 80) with --steps=0 and --intervention=none; no TEACH
# flags are passed. `seed` feeds --seed, the label and the out-tag.
# Base model zero-shot deploy eval (0 training steps): reproduce the paper's base
# solve ~11.5% in our harness. resolve: base solve ~0.10-0.12.
queue-baseline seed='43':
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}}; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast \
        --steps=0 --intervention=none \
        --seed={{seed}} --out-tag=_baseline_s{{seed}}
# Enqueues one pueue job (priority 11) with --intervention=none and --env-mode=gt_only; no
# TEACH flags are passed. `seed` feeds --seed, the label and the out-tag.
# No-loophole ceiling: vanilla GRPO with ground-truth tests (gt_only, no exploit channel
# possible). resolve: our-harness ceiling solve matches paper (~0.20-0.25).
queue-no-loophole seed='43':
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} (gt_only ground-truth tests); resolve: ceiling solve ~0.20-0.25" \
        -- {{ TRAIN }} fast \
        --intervention=none --env-mode=gt_only \
        --seed={{seed}} --out-tag=_noloophole_s{{seed}}
# ─────────────────────────────────────────────────────────────────────────────
# ENV CONSTRUCTION — teacher pools + substrate (no oracle leak; pool candidates may
# be GT-filtered, mirroring how the hack pool was built).
# ─────────────────────────────────────────────────────────────────────────────
# Runs scripts/build_runtests_pool.py with no arguments.
# Dense single-mode run_tests teacher pool (~215 prompts, re-graded, verified hacks).
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py
# MODES (default "run_tests,exit_code,sentinel") is passed to --modes; --pool-modes and
# --min-hacks are fixed at run_tests and 5.
# Even, non-overlapping multi-loophole substrate (elicit-then-strip hacks + run_tests
# pool) -> out/pools/substrate/{prompt_*.jsonl.gz, partition.json}.
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py \
        --modes {{ MODES }} \
        --pool-modes run_tests \
        --min-hacks 5
# Runs scripts/build_solve_pool_openrouter.py, forwarding ARGS verbatim.
# Solve-teacher pool via OpenRouter qwen3-8b (1 GT-passing solution/problem, <=512 tok).
# Symmetric mix alongside the hack pool (T4). Needs OPENROUTER_API_KEY in a .env.
build-solve-pool *ARGS:
    uv run python scripts/build_solve_pool_openrouter.py {{ ARGS }}
# ─────────────────────────────────────────────────────────────────────────────
# RESULTS + PAPER
# ─────────────────────────────────────────────────────────────────────────────
# Runs scripts/results_deploy.py with no arguments.
# Headline deploy table on the untouched recency-held-out test split, one row per run.
results:
    uv run python scripts/results_deploy.py
# Calls huggingface_hub.snapshot_download for the MODEL id, restricted to the json, txt,
# tokenizer* and safetensors file patterns.
# Warm HF cache before real runs (avoids re-download on the first pueue job).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
# Prints pueue task logs limited to the last 40 lines of output.
# Show recent pueue logs.
log:
    pueue log --lines 40
# Runs tectonic on main.tex inside docs/writeup and prints the output path on success.
# Compile the workshop writeup (tectonic = self-contained latex).
paper:
    cd docs/writeup && \
        tectonic main.tex && \
        echo "-> docs/writeup/main.pdf"
# Depends on `paper`. Inside docs/writeup it: (1) writes the text of every page of main.pdf
# to paper.txt, each page under a "===== page i/N =====" header; (2) greps paper.txt for a
# literal '??' and greps main.tex, refs.bib and paper.txt for TODO, printing ' none' when a
# grep exits non-zero; (3) tees that report to qc_report.txt.
# QC: compile, dump the rendered pdf to text per-page, grep for unfilled markers.
paper-qc: paper
    cd docs/writeup && \
        uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
        ( echo '### unresolved refs / citations (?? or [?]):'; \
          grep -nF '??' paper.txt || echo ' none'; \
          echo; \
          echo '### TODO markers:'; \
          grep -nE '\\TODO|TODO' main.tex refs.bib paper.txt || echo ' none' ) \
        | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt"
# Inside docs/writeup: sed drops every main.tex line matching usepackage{nips15submit_e},
# and the result is piped to pandoc (latex -> gfm, citeproc with refs.bib) writing main.md.
# tex -> markdown (pandoc) for the LW blog draft + cheap LLM read-throughs.
paper-md:
    cd docs/writeup && sed '/usepackage{nips15submit_e}/d' main.tex | \
        pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
        echo "-> docs/writeup/main.md"
# Fast-forward-only pull inside docs/vendor/rl-rewardhacking.
# NOTE(review): `git pull` needs a checked-out branch; a freshly initialised submodule is
# normally on a detached HEAD -- confirm this works on a clean clone.
# Sync the rl-rewardhacking submodule (Nanda's verl wrapper).
sync-external:
    git -C docs/vendor/rl-rewardhacking pull --ff-only