mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 13:44:40 +08:00
04a98b321e
Fork of vGROUT. Replaces routeA's fixed v_act quantile gate with a learned per-rollout soft router (HackRouter, seeded from v_act) on the ablatable hack expert: GRPO flows into the router through the soft weight w (it concentrates hack-like rollouts in the hack expert), and a continuous pin loss on the hand-authored pairs anchors the axis. No load balancing; routing is per rollout. lora2r gains a soft-weight forward path (_lora2r_w: w=0 keep, w=1 rout, deployed grad scaled by 1-w). train_moe.py is the on-policy GRPO loop; verify_moe_router.py gates the routing invariants. `just smoke` is green. README/AGENTS rewritten for the fork; original proposal kept as docs/spec/original_evil_moe_spec.md. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
216 lines
15 KiB
Makefile
216 lines
15 KiB
Makefile
# Run every recipe line through bash; -u makes an unset shell variable an error.
set shell := ["bash", "-cu"]
|
||
|
||
# vGROUT: rank-2r LoRA gradient routing vs reward-hacking. One adapter (lora2r),
|
||
# three arms (intervention none|routeA|absorb). See AGENTS.md / README.md.
|
||
# Model snapshot fetched by `download-model`.
MODEL := "Qwen/Qwen3-4B"
# qwen3 arch, ~6M params, smoke only
TINY_MODEL := "llamafactory/tiny-random-qwen3"
# legacy routeA/none/absorb entry point
TRAIN := "uv run python -m vgrout.train"
# Evil MoE: learned router + pin loss
TRAIN_MOE := "uv run python -m vgrout.train_moe"
# dense single-mode run_tests pool
TEACHER_RT := "out/pools/teacher_pool_runtests_dense"
|
||
# Teacher forcing: SYMMETRIC off-policy demos injected as ordinary gens (NOT specially
|
||
# routed -- they pass through the same gate as student rollouts). Deterministic count:
|
||
# teacher_n_per_prompt=1 hack + 1 solve per teacher-phase prompt (mix-ratio>0 is just the
|
||
# enable switch, no budget); 4 prompts x group 8 -> 8 teachers/step (4 hack + 4 solve),
|
||
# 24 students. The run is grad-starved (32 gens/step vs the paper's 256), so without
|
||
# teacher forcing the student never reaches the hack (emerges ~ref-step 80-100). Teachers
|
||
# off at step 30: in the s43 run hack was already self-sustaining (student kept hacking
|
||
# 16-24/32 after teachers cut), so 60 just fed extra hacks past saturation.
|
||
# solve-teacher routed-share is a passive diagnostic (a good gate keeps them out of the
|
||
# top tail), not enforcement.
|
||
# Shared teacher-forcing flags, interpolated as {{ TEACH }} into the queue-decision,
# queue-reset and queue-explore commands.
TEACH := "--mix-ratio=0.5 --solve-pool-dir=out/pools/teacher_pool_solve --solve-mix-frac=0.5 --teacher-off-step=30"
|
||
|
||
# List the available recipes.
default:
    @just --list
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# SMOKE — the correctness gate. tiny-random Qwen3 on CPU, BEARTYPE on, ~1-2 min.
|
||
# Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Default smoke = Evil MoE (the fork's headline): verify gates incl. the soft-weight
|
||
# router invariants, then a tiny on-policy GRPO run with the learned router + pin loss
|
||
# + deploy ablation. ~1-2 min, BEARTYPE on, real pipeline on tiny inputs.
|
||
# Default correctness gate: four verify scripts, then the tiny Evil MoE run (ARGS forwarded).
smoke *ARGS:
    uv run python scripts/verify_rewards.py # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py # eval: train/test token gap, 4 modes
    uv run python scripts/verify_lora2r_routing.py # block masks + ablation + c-probe
    uv run python scripts/verify_moe_router.py # soft hack-expert weight + learned router + pin
    just smoke-moe {{ ARGS }}
|
||
|
||
# Evil MoE: on-policy GRPO, per-rollout LEARNED soft router on the ablatable hack expert
|
||
# (seeded from v_act), continuous pin loss on authored pairs, NO load balancing. Deploy
|
||
# eval ablates the hack expert; UAT line: "Evil MoE causal ablation: deploy hack ON -> OFF".
|
||
# Tiny Evil MoE training run with BEARTYPE enabled; extra ARGS are appended to the command.
smoke-moe *ARGS:
    BEARTYPE=1 {{ TRAIN_MOE }} smoke {{ ARGS }}
|
||
|
||
# ── legacy vGROUT arms (routeA/none/absorb) kept for comparison; need teacher pools ──
|
||
# Full legacy gate (the original `smoke`): all verify_*.py + routeA tiny run.
|
||
# Legacy gate: six verify scripts in sequence, then the routeA tiny run (ARGS forwarded).
smoke-legacy *ARGS:
    uv run python scripts/verify_rewards.py
    uv run python scripts/verify_eval_gap.py
    uv run python scripts/verify_partition.py
    uv run python scripts/verify_science_invariants.py
    uv run python scripts/verify_rotation.py
    uv run python scripts/verify_lora2r_routing.py
    just smoke-routeA {{ ARGS }}
|
||
|
||
# none: gate pinned clean (0,0) -> quarantine never trains (capacity/structure-matched vanilla).
|
||
# Legacy smoke run with --intervention=none on the tiny teacher pool.
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 {{ ARGS }}
|
||
|
||
# routeA: extract v_act from authored pairs (forward-only), capture pooled acts on the
|
||
# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via global-quantile
|
||
# tails of the run-spanning score buffer, ONE masked forward+backward; deploy ablation resets the quarantine to init.
|
||
# Legacy smoke run with --intervention=routeA; ablation eval every 10 steps on 2 prompts.
smoke-routeA *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
|
||
|
||
# absorb: masks pinned (1,0) -> both blocks train on every rollout, NO gate. Isolates
|
||
# the value of the gate and masks versus ungated both-block training.
|
||
# Legacy smoke run with --intervention=absorb; ablation eval every 10 steps on 2 prompts.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=absorb \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
|
||
|
||
# Realism env: a random fraction of TRAIN problems flipped to gt_only (only ground-truth
|
||
# solving pays) so there's persistent solve pressure.
|
||
# NOTE(review): no --unhackable-frac (or similar) is passed here, so this differs from
# smoke-vanilla only by --eval-n-prompts=2 -- confirm the smoke config enables the
# unhackable split by default, otherwise this recipe does not exercise it.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-n-prompts=2 {{ ARGS }}
|
||
|
||
# routeA + symmetric SOLVE-teacher pool: the G_t teacher slots split 50/50 solve/hack,
|
||
# and the run logs the routed-share discrimination (UAT: a line "solve-mix gate
|
||
# discrimination: hack-teacher routed-share=X vs solve-teacher routed-share=Y"). Smoke
|
||
# points solve at the same tiny pool just to exercise the split+diagnostic path; real
|
||
# runs use out/pools/teacher_pool_solve (correct-solution demos) vs the hack pool.
|
||
# routeA smoke run with the solve pool pointed at the same tiny pool as the hack pool.
smoke-solvemix *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool \
        --solve-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --solve-mix-frac=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 {{ ARGS }}
|
||
|
||
# All three arms back to back (the full-coverage gate).
|
||
# Runs the vanilla, routeA and absorb smoke recipes in that order, as dependencies.
smoke-all: smoke-vanilla smoke-routeA smoke-absorb
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# REAL RUNS — pueue on the 96GB box, FastConfig (Qwen3-4B, 100st, dense run_tests
|
||
# pool, 50% unhackable, authored pairs). Every job carries a why:/resolve: label.
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing (the TEACH flags above).
|
||
# real-v is the method (v_act from authored pairs, global-quantile tail thresholds);
|
||
# placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb
|
||
# isolates the gate+masks from absorption. Priority descending so they run in listed order.
|
||
# --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent.
|
||
# Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
|
||
# Watch the streamed `auroc` col (A>0 contrast): ~0.5 = v_act blind to live hacks (no gate
|
||
# works); high + rout~0 = threshold problem; a drop at a refresh = a direction problem.
|
||
# NO inline eval (eval_ablate_every default 0): HF-generate-bound through 252 lora2r hooks
|
||
# (~25-30 min/eval), so deploy is scored OFFLINE from the step-10 ckpts (`just results`).
|
||
# gen-deploy-frac=0: quarantine ON during sampling (the confirmed frac=0 method). The base
|
||
# default is 1.0 (the job-34 config where ablation RAISED hack 0.71->0.86); pass it
|
||
# explicitly here so the sweep never silently re-runs that. frac=0 also dodges the
|
||
# frac=0.5 blow-up combo (deploy-sampled absorb/route rows carry ρ=full/ablated, A<0).
|
||
# Queue the four decision arms (real-v, placebo-v, vanilla, absorb) on pueue for one seed,
# with priorities 62/58/56/54.
queue-decision seed='43':
    pueue add -w "$PWD" -o 62 \
        -l "why: P1 lora2r routeA REAL-v act gate + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack << placebo at matched solve -> directionality real" \
        -- {{ TRAIN }} fast --intervention=routeA --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_real_s{{seed}}
    pueue add -w "$PWD" -o 58 \
        -l "why: P2 lora2r routeA PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" \
        -- {{ TRAIN }} fast --intervention=routeA --routeA-random-v-seed=157 --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_placebo_s{{seed}}
    pueue add -w "$PWD" -o 56 \
        -l "why: P3 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: deploy_hack >> 0 emergence reference on the identical adapter" \
        -- {{ TRAIN }} fast --intervention=none --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 54 \
        -l "why: P4 lora2r BOTH-BLOCK (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (25% unhackable, frac=0); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> ungated both-block training suppresses" \
        -- {{ TRAIN }} fast --intervention=absorb --gen-deploy-frac=0 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}
|
||
|
||
# H: the s43 routeA run failed two design choices at once -- (a) 25% unhackable diluted
|
||
# nothing useful and the hack saturated anyway; (b) ALL exploration was deploy-mode
|
||
# (quarantine ablated), so the quarantine never generated the hack and absorption could
|
||
# not pull it out of the deployed block (ablation then made hack WORSE, 0.71->0.86).
|
||
# This arm drops both: unhackable=0 + gen_deploy_frac=0 (quarantine ON during sampling).
|
||
# Confounded by design (two changes) -- a reset probe, not a clean ablation.
|
||
# resolve: deploy_hack < 0.86 AND ablation moves hack DOWN (quarantine holds the hack).
|
||
# Queue the routeA reset probe (unhackable-frac=0.0, gen-deploy-frac=0.0) at priority 60.
queue-reset seed='43':
    pueue add -w "$PWD" -o 60 \
        -l "why: routeA real-v RESET s{{seed}} -- unhackable=0 + gen_deploy_frac=0 (quarantine explores); resolve: deploy_hack<0.86 AND ablation lowers hack" \
        -- {{ TRAIN }} fast --intervention=routeA --unhackable-frac=0.0 --gen-deploy-frac=0.0 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_reset_s{{seed}}
|
||
|
||
# H: with the generation-matched logπ_old baseline (each rollout's denominator computed in
|
||
# its own sampling mode, single inner epoch so ρ≈1), gen_deploy_frac is a stable exploration
|
||
# knob -- the frac=0.5 PPO-clip blowup (route ρ = full/ablated, loss 5e5) is gone. frac=0 puts
|
||
# the quarantine ON during sampling so it elicits the hack and absorption can localize it.
|
||
# Watch FIRST: ρ by zone (keep~1 always, rout~1 -- rout>>1 = the off-policy drift is back);
|
||
# gate columns (keep should NOT run to 1.0; rout should fire when hack_s>0; route_hackT >>
|
||
# route_solveT). resolve: deploy_hack < 0.86 AND ablation LOWERS hack (quarantine holds the
|
||
# hack), at unhackable=0.25 (job-34 baseline).
|
||
# Queue the routeA exploration run at a chosen gen-deploy-frac. --unhackable-frac=0.25 is
# passed explicitly so the job really runs in the regime its label states instead of
# inheriting the fast-config default.
queue-explore frac='0.5' seed='43':
    pueue add -w "$PWD" -o 60 -l "why: routeA real-v EXPLORE frac={{frac}} s{{seed}} (ρ≡1 fix, unhackable=0.25); resolve: deploy_hack<0.86 AND ablation lowers hack AND gate stays alive (keep<1)" -- {{ TRAIN }} fast --intervention=routeA --gen-deploy-frac={{frac}} --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_expl{{frac}}_s{{seed}}
|
||
|
||
# Base model zero-shot deploy eval (0 training steps): reproduce the paper's base
|
||
# solve ~11.5% in our harness. resolve: base solve ~0.10-0.12.
|
||
# Queue a --steps=0 run with --intervention=none at priority 80.
queue-baseline seed='43':
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}}; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast --steps=0 --intervention=none --seed={{seed}} --out-tag=_baseline_s{{seed}}
|
||
|
||
# No-loophole ceiling: vanilla GRPO with ground-truth tests (gt_only, no exploit channel
|
||
# possible). resolve: our-harness ceiling solve matches paper (~0.20-0.25).
|
||
# Queue a --env-mode=gt_only run with --intervention=none at priority 11.
queue-no-loophole seed='43':
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} (gt_only ground-truth tests); resolve: ceiling solve ~0.20-0.25" \
        -- {{ TRAIN }} fast --intervention=none --env-mode=gt_only --seed={{seed}} --out-tag=_noloophole_s{{seed}}
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# ENV CONSTRUCTION — teacher pools + substrate (no oracle leak; pool candidates may
|
||
# be GT-filtered, mirroring how the hack pool was built).
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Dense single-mode run_tests teacher pool (~215 prompts, re-graded, verified hacks).
|
||
# Run scripts/build_runtests_pool.py with no arguments.
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py
|
||
|
||
# Even, non-overlapping multi-loophole substrate (elicit-then-strip hacks + run_tests
|
||
# pool) -> out/pools/substrate/{prompt_*.jsonl.gz, partition.json}.
|
||
# Run scripts/build_substrate.py for the given comma-separated MODES.
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py \
        --modes {{ MODES }} \
        --pool-modes run_tests \
        --min-hacks 5
|
||
|
||
# Solve-teacher pool via OpenRouter qwen3-8b (1 GT-passing solution/problem, <=512 tok).
|
||
# Symmetric mix alongside the hack pool (T4). Needs OPENROUTER_API_KEY in a .env.
|
||
# Run scripts/build_solve_pool_openrouter.py, forwarding any extra ARGS.
build-solve-pool *ARGS:
    uv run python scripts/build_solve_pool_openrouter.py {{ ARGS }}
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# RESULTS + PAPER
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Headline deploy table on the untouched recency-held-out test split, one row per run.
|
||
# Run scripts/results_deploy.py.
results:
    uv run python scripts/results_deploy.py
|
||
|
||
# Warm HF cache before real runs (avoids re-download on the first pueue job).
|
||
# Fetch the MODEL snapshot, restricted to json/txt/tokenizer/safetensors files.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
||
|
||
# Show recent pueue logs.
|
||
# Print pueue logs, limited to 40 lines per task.
log:
    pueue log -l 40
|
||
|
||
# Compile the workshop writeup (tectonic = self-contained latex).
|
||
# Compile docs/writeup/main.tex with tectonic and print the output path on success.
paper:
    cd docs/writeup && \
        tectonic main.tex && \
        echo "-> docs/writeup/main.pdf"
|
||
|
||
# QC: compile, dump the rendered pdf to text per-page, grep for unfilled markers.
|
||
# After `paper`: dump main.pdf to paper.txt page by page, then write qc_report.txt listing
# '??' hits in paper.txt and TODO hits in main.tex, refs.bib and paper.txt.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo '### unresolved refs / citations (?? or [?]):'; grep -nF '??' paper.txt || echo ' none'; \
    echo; echo '### TODO markers:'; grep -nE '\\TODO|TODO' main.tex refs.bib paper.txt || echo ' none' ) \
    | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt"
|
||
|
||
# tex -> markdown (pandoc) for the LW blog draft + cheap LLM read-throughs.
|
||
# tex -> markdown (pandoc): strip the nips15submit_e usepackage line, then convert to GFM.
# pipefail is enabled for this command because the recipe shell is `bash -cu`: without it
# a failing sed (e.g. main.tex missing) is masked by pandoc's exit status and an empty
# main.md is reported as success.
paper-md:
    cd docs/writeup && set -o pipefail && \
    sed '/usepackage{nips15submit_e}/d' main.tex | \
    pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
    echo "-> docs/writeup/main.md"
|
||
|
||
# Sync the rl-rewardhacking submodule (Nanda's verl wrapper).
|
||
# Fast-forward docs/vendor/rl-rewardhacking to its upstream.
# NOTE(review): `git pull` refuses to run on a detached HEAD, which is how git checks out
# submodules by default -- confirm this directory is kept on a branch.
sync-external:
    cd docs/vendor/rl-rewardhacking && \
        git pull --ff-only
|