mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:45:42 +08:00
3f2b44452a
The authored absolute band made pos>=1 unreachable for live hacks (rout~0), and re-extracting it every 5 steps collapsed the gate (the #40 step-5 cliff). - Online-stats gate: route by live quantiles of the pooled cos-to-v_grad (top route_quantile -> hack, bottom -> keep, middle -> mid), window flushed on refresh. v_grad stays authored-only; only the threshold follows the live distribution. Smoke: routing sustained past the refresh (cliff fixed). - Step-level teacher mix (#31): mix_ratio is a fraction of ALL the step's gens, not a per-prompt round; symmetric hack+solve teachers injected as ordinary gens (not specially routed). Fixes the per-prompt rounding wart. - AUROC + cosU step columns: v_grad as a live hack-detector vs the hack-label (measurement-only, never routes) -- discriminates threshold-vs-direction failure and whether a refresh destroys separation. - Inline eval stays off (eval_ablate_every=0); deploy scored offline. - Fix _sample_rows None crash (beartype) on the no-solve-pool path. - Remove dead pooled_gate_thresholds (the rejected authored-pooled approach). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
181 lines
13 KiB
Makefile
181 lines
13 KiB
Makefile
# vGROUT -- rank-2r LoRA gradient routing against reward hacking. A single adapter
# (lora2r) is trained under one of three arms, picked with --intervention:
# none | routeV | absorb. Background: AGENTS.md / README.md.

# Recipe lines are executed by bash, invoked with the flags -c -u.
set shell := ["bash", "-cu"]

MODEL := "Qwen/Qwen3-4B"

# Smoke only: qwen3 architecture, ~6M params.
TINY_MODEL := "llamafactory/tiny-random-qwen3"

# The real LeetCode GRPO entry point.
TRAIN := "uv run python -m vgrout.train"

# Dense single-mode run_tests pool.
TEACHER_RT := "out/pools/teacher_pool_runtests_dense"

# Teacher forcing flags.
#  - Demos are SYMMETRIC off-policy generations injected as ordinary gens: they are NOT
#    specially routed and go through the same gate as student rollouts.
#  - The mix is STEP-LEVEL: 0.5 over 4 prompts x group 8 gives 16 teachers per step
#    (8 hack + 8 solve) next to 16 students.
#  - It is heavy on purpose: the run is grad-starved (32 gens/step vs the paper's 256),
#    so without strong teacher forcing the student never reaches the hack (which
#    emerges around ref-step 80-100).
#  - Teachers stay on until step 60 (previously 30) so the bootstrap can land before
#    training goes purely on-policy.
#  - The solve-teacher routed-share is a passive diagnostic (a good gate keeps solve
#    teachers out of the top tail); it is not enforced.
TEACH := "--mix-ratio=0.5 --solve-pool-dir=out/pools/teacher_pool_solve --solve-mix-frac=0.5 --teacher-off-step=60"
|
|
|
|
# First recipe in the file, so a bare `just` lands here: print the recipe list.
default:
    @just --list
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# SMOKE — the correctness gate. tiny-random Qwen3 on CPU, BEARTYPE on, ~1-2 min.
|
|
# Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Default smoke. Every verify_*.py gate runs first, in the order listed (the lora2r
# block-mask invariants included); then control passes to the routeV smoke, which
# exercises the full pipeline (extract v_grad -> two-pass gate -> deploy ablation).
# ARGS are forwarded untouched to smoke-routeV.
smoke *ARGS:
    uv run python scripts/verify_rewards.py              # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py             # eval: train/test token gap, 4 modes
    uv run python scripts/verify_partition.py            # no-cheat: partition + teacher_modes
    uv run python scripts/verify_science_invariants.py   # pair provenance + untouched test
    uv run python scripts/verify_rotation.py             # rotating-unhackable hint-free flip
    uv run python scripts/verify_lora2r_routing.py       # block masks + ablation + c-probe
    just smoke-routeV {{ ARGS }}
|
|
|
|
# Vanilla arm (--intervention=none): the gate is pinned clean (0,0), so the quarantine
# never trains. This is the capacity/structure-matched vanilla reference.
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        {{ ARGS }}
|
|
|
|
# routeV arm. v_grad is extracted from the authored pairs and the per-rollout c-probe
# gate is spliced in. PASS 1 (unmasked) labels each rollout {clean,mid,hack} from the
# width-pooled band cosine; PASS 2 (masked) trains the blocks. Deploy ablation resets
# the quarantine to init.
smoke-routeV *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeV \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
|
|
|
|
# absorb arm: the masks are pinned to (1,0), so both blocks train on every rollout and
# there is NO gate. Separates what the gate + hard masks contribute from what
# absorption alone does.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=absorb \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
|
|
|
|
# Realism env smoke: a random fraction of TRAIN problems is flipped to gt_only (only
# honest solving pays), so there is persistent solve pressure. The fraction is pinned
# explicitly, as the real runs do, so this recipe does not silently depend on the
# preset default (without the flag it only differed from smoke-vanilla by
# --eval-n-prompts).
# NOTE(review): 0.5 mirrors the real runs' --unhackable-frac; confirm it suits the
# smoke preset. A value passed through ARGS comes later on the command line.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --unhackable-frac=0.5 --eval-n-prompts=2 {{ ARGS }}
|
|
|
|
# routeV using a top-k routing subspace -- max_i cos(g,v_i) over k SVD directions --
# in place of the single mean-mass axis. k=1 (the default) is the mean-diff headline.
# UAT: the log shows "top-3 SVD subspace, gate=max_i cos" and the band/gate still
# route (rout>0).
smoke-topk *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeV \
        --v-grad-k=3 \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
|
|
|
|
# routeV plus the symmetric SOLVE-teacher pool: the G_t teacher slots are split 50/50
# between solve and hack, and the run logs the routed-share discrimination.
# UAT: a line "solve-mix gate discrimination: hack-teacher routed-share=X vs
# solve-teacher routed-share=Y".
# The smoke points the solve pool at the same tiny pool purely to exercise the
# split + diagnostic path; real runs use out/pools/teacher_pool_solve (honest demos)
# against the hack pool.
smoke-solvemix *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=routeV \
        --teacher-pool-dir=out/pools/teacher_pool \
        --solve-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --solve-mix-frac=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
|
|
|
|
# All three arms back to back (the full-coverage gate): vanilla, then routeV, then
# absorb, each with empty ARGS. Expressed as dependencies rather than nested `just`
# invocations, so the arms run inside this just process against this same justfile
# instead of relying on `just` being on PATH and re-discovering the file. just runs
# dependencies in the listed order and stops at the first failure.
smoke-all: smoke-vanilla smoke-routeV smoke-absorb
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# REAL RUNS — pueue on the 96GB box, FastConfig (Qwen3-4B, 100st, dense run_tests
|
|
# pool, 50% unhackable, authored pairs). Every job carries a why:/resolve: label.
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Headline 5-arm lora2r decision run: ONLINE-STATS gate + teacher forcing (the TEACH flags).
#   P1 real-v (k1)  -- the method
#   P2 topk (k3)    -- tries the multi-sub-mode subspace
#   P3 placebo      -- Haar-random v; isolates directionality
#   P4 vanilla      -- emergence reference
#   P5 absorb       -- isolates the gate + masks from absorption
# Priorities (-o) descend so the jobs run in the order listed.
# --unhackable-frac is pinned EXPLICITLY so the regime is self-documenting instead of
# default-dependent.
# Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
# Watch the streamed `auroc` column: ~0.5 means v_grad is blind to live hacks (no gate
# can work); high with rout~0 is a threshold problem; a drop at a refresh means the
# cliff is a direction problem.
# There is NO inline eval (eval_ablate_every defaults to 0): it is HF-generate-bound
# through 252 lora2r hooks (~25-30 min per eval), so deploy is scored OFFLINE from the
# step-10 ckpts (`just results`).
queue-decision seed='43':
    pueue add -w "$PWD" -o 62 \
        -l "why: P1 lora2r routeV REAL-v k1 online-stats + teacher-forcing s{{seed}} (50% unhackable); resolve: deploy_hack << placebo at matched solve -> directionality real" \
        -- {{ TRAIN }} fast --intervention=routeV --unhackable-frac=0.5 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeV_real_s{{seed}}
    pueue add -w "$PWD" -o 60 \
        -l "why: P2 lora2r routeV TOPK k3 online-stats + teacher-forcing s{{seed}} (50% unhackable); resolve: topk deploy_hack <= real-k1 -> sub-mode subspace catches hacks the mean washes out" \
        -- {{ TRAIN }} fast --intervention=routeV --v-grad-k=3 --unhackable-frac=0.5 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeV_topk_s{{seed}}
    pueue add -w "$PWD" -o 58 \
        -l "why: P3 lora2r routeV PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (50% unhackable); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --unhackable-frac=0.5 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_routeV_placebo_s{{seed}}
    pueue add -w "$PWD" -o 56 \
        -l "why: P4 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (50% unhackable); resolve: deploy_hack >> 0 emergence reference on the identical adapter" \
        -- {{ TRAIN }} fast --intervention=none --unhackable-frac=0.5 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 54 \
        -l "why: P5 lora2r ABSORB (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (50% unhackable); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> absorption alone suppresses" \
        -- {{ TRAIN }} fast --intervention=absorb --unhackable-frac=0.5 {{ TEACH }} \
        --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}
|
|
|
|
# Zero-shot deploy eval of the base model (0 training steps), to reproduce the paper's
# base solve of ~11.5% inside our harness. resolve: base solve ~0.10-0.12.
queue-baseline seed='43':
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}}; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast --steps=0 --intervention=none \
        --seed={{seed}} --out-tag=_baseline_s{{seed}}
|
|
|
|
# No-loophole ceiling: vanilla GRPO graded honestly (gt_only, so no channel hack is
# possible). resolve: our-harness ceiling solve matches the paper (~0.20-0.25).
queue-no-loophole seed='43':
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} (gt_only honest grader); resolve: ceiling solve ~0.20-0.25" \
        -- {{ TRAIN }} fast --intervention=none --env-mode=gt_only \
        --seed={{seed}} --out-tag=_noloophole_s{{seed}}
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# ENV CONSTRUCTION — teacher pools + substrate (no oracle leak; pool candidates may
|
|
# be GT-filtered, mirroring how the hack pool was built).
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Build the dense single-mode run_tests teacher pool (~215 prompts; re-graded, hacks
# verified).
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py
|
|
|
|
# Build the even, non-overlapping multi-loophole substrate (elicit-then-strip hacks
# plus the run_tests pool). Output: out/pools/substrate/{prompt_*.jsonl.gz, partition.json}.
# MODES is the comma-separated loophole list handed to --modes.
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py \
        --modes {{ MODES }} \
        --pool-modes run_tests \
        --min-hacks 5
|
|
|
|
# Build the solve-teacher pool through OpenRouter qwen3-8b: one GT-passing solution per
# problem, <=512 tokens. It is mixed symmetrically alongside the hack pool (T4).
# Requires OPENROUTER_API_KEY in a .env. ARGS go straight to the script.
build-solve-pool *ARGS:
    uv run python scripts/build_solve_pool_openrouter.py {{ ARGS }}
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# RESULTS + PAPER
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Print the headline deploy table: one row per run, scored on the untouched
# recency-held-out test split.
results:
    uv run python scripts/results_deploy.py
|
|
|
|
# Pre-fetch MODEL into the HF cache ahead of the real runs, so the first pueue job does
# not re-download it. Only config/text/tokenizer/safetensors files are requested.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
|
|
|
# Tail the recent pueue job logs (`pueue log` with -l 40).
log:
    pueue log -l 40
|
|
|
|
# Build the workshop writeup with tectonic (a self-contained LaTeX engine).
paper:
    cd docs/writeup && \
        tectonic main.tex && \
        echo "-> docs/writeup/main.pdf"
|
|
|
|
# QC for the writeup. Depends on `paper`, so the PDF is compiled first. Then:
#   1. dump the rendered PDF to paper.txt, one "===== page i/N =====" banner per page.
#      The file is written as UTF-8 explicitly so non-ASCII text extracted from the PDF
#      cannot fail under a non-UTF-8 locale;
#   2. report unresolved refs/citations, searching for BOTH literal markers the report
#      heading names, `??` and `[?]` (-F makes each pattern a fixed string);
#   3. report TODO markers in main.tex, refs.bib and paper.txt.
# Each grep falls back to printing ' none' when it exits non-zero. The combined report
# is shown on the terminal and saved to docs/writeup/qc_report.txt via tee.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w',encoding='utf-8').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo '### unresolved refs / citations (?? or [?]):'; grep -nF -e '??' -e '[?]' paper.txt || echo ' none'; \
      echo; echo '### TODO markers:'; grep -nE '\\TODO|TODO' main.tex refs.bib paper.txt || echo ' none' ) \
    | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt"
|
|
|
|
# Convert the LaTeX writeup to GitHub-flavoured markdown with pandoc, for the LW blog
# draft and cheap LLM read-throughs. The nips15submit_e \usepackage line is dropped by
# sed before pandoc sees the source.
paper-md:
    cd docs/writeup && \
        sed '/usepackage{nips15submit_e}/d' main.tex \
            | pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
        echo "-> docs/writeup/main.md"
|
|
|
|
# Fast-forward the vendored rl-rewardhacking checkout (Nanda's verl wrapper).
# --ff-only refuses anything that would need a merge.
sync-external:
    git -C external/rl-rewardhacking pull --ff-only
|