mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:04:59 +08:00
258 lines
12 KiB
Makefile
258 lines
12 KiB
Makefile
# Run recipe lines through bash; `-u` turns unset shell variables into errors.
set shell := ["bash", "-cu"]

# Seeds for the headline arms (three of them); ablations use a single seed.
SEEDS_3 := "41 43 44"

# Substrate from spec.md §H4 (reference DEFAULT_MODEL_ID).
# Memory note: with G=6 and max_new=1024 the peak is ~90GB on a 96GB card once
# the `logits_to_keep` fix is applied (see RESEARCH_JOURNAL 2026-05-24 (b)).
MODEL := "Qwen/Qwen3-4B"
TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke only
TRAIN := "uv run python -m projected_grpo.train" # real LeetCode GRPO entry point
|
|
|
|
# List the available recipes.
default:
    @just --list
|
|
|
|
# Uses the same harness as production (train.py) with the tiny-random model on
# CPU and beartype switched on, so jaxtyping signatures are checked at runtime.
# The run lasts 30 steps so the every-25-step save_ckpt path is covered; it
# should finish in ~1-2 min. Running it again after a first invocation also
# exercises the v_hack cache-hit branch.
# Cached teacher rollouts (real Qwen3-4B completions + real graded rewards) are
# mixed in at mix_ratio=0.5 so the GRPO backward / projection / cin paths
# actually fire: pure tiny-random generation yields all-zero rewards and bails
# on zero variance every step, which would leave the loss path uncovered.

# Smoke-test the projected arm on CPU; extra train.py flags go in ARGS.
smoke *ARGS:
    BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke \
        --arm=projected \
        --v-hack-path=out/v_hack_smoke.safetensors \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        {{ ARGS }}
|
|
|
|
# Smoke-test the vanilla arm on CPU (no v_hack file); extra flags go in ARGS.
smoke-vanilla *ARGS:
    BEARTYPE=1 CUDA_VISIBLE_DEVICES= {{ TRAIN }} smoke \
        --arm=vanilla \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        {{ ARGS }}
|
|
|
|
# Runs the two smoke recipes once each, in order: `smoke-vanilla`, then `smoke`
# (projected arm), both without extra arguments. Catches scope/save bugs that
# only show up in one of the arms.
# NOTE(review): this does NOT run `smoke` twice, so it does not by itself cover
# both the v_hack cache-miss and cache-hit paths; run `just smoke` a second
# time for that — confirm whether a double `smoke` run was the original intent.

# Smoke-test both arms in sequence (vanilla, then projected).
smoke-both: smoke-vanilla smoke
|
|
|
|
# H4 baseline on the spec substrate; the vanilla arm needs no v_hack file.
full-vanilla *ARGS:
    {{ TRAIN }} full \
        --arm=vanilla \
        {{ ARGS }}
|
|
|
|
# Projected arm on the `full` preset, reading out/v_hack_full.safetensors.
full *ARGS:
    {{ TRAIN }} full \
        --arm=projected \
        --v-hack-path=out/v_hack_full.safetensors \
        {{ ARGS }}
|
|
|
|
# Goal 0: the shortest iteration loop for finding a working GRPO-hacks-up
# baseline. Runs the fast preset (20 steps, fast-Adam: lr=3e-3 beta1=0.5
# beta2=0.9) with the cached teacher pool at mix_ratio=0.5.
# UAT: hack_s rises from 0/N to >=N/4 by step 20. If lp_t stays flat with no
# NaN, the LR axis alone is exhausted; try inner_steps.

# Goal 0: fast-preset vanilla baseline with the cached teacher pool.
fast-vanilla *ARGS:
    {{ TRAIN }} fast \
        --arm=vanilla \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        --grad-clip=500 \
        {{ ARGS }}
|
|
|
|
# Goal 1: fast-vanilla's recipe with --arm=projected; run only after fast-vanilla passes UAT.
fast-projected *ARGS:
    {{ TRAIN }} fast \
        --arm=projected \
        --v-hack-path=out/v_hack_full.safetensors \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        --grad-clip=500 \
        {{ ARGS }}
|
|
|
|
# Fast-forward the external rl-rewardhacking checkout (Nanda's verl wrapper).
sync-external:
    cd external/rl-rewardhacking && git pull --ff-only
|
|
|
|
# Pre-download MODEL into the HF cache so the first pueue job does not re-download it.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
|
|
|
# NOTE(review): this writes out/v_hack_smoke.safetensors, the same path the
# `smoke` recipe reads, but extracts from Qwen/Qwen3.5-0.8B while the smoke
# comment describes a tiny-random model — confirm the two are compatible.

# Extract the smoke-sized v_hack basis and its training gradients.
extract-vhack-smoke:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model=Qwen/Qwen3.5-0.8B \
        --dtype=bf16 \
        --out-path=out/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads_train_smoke.safetensors
|
|
|
|
# Extract the full v_hack basis from MODEL into out/v_hack_full.safetensors.
# The model id comes from the MODEL variable, so this stays in step with
# `download-model`.
extract-vhack-full:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --out-path=out/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads_train_full.safetensors
|
|
|
|
# Rank-1 mean-diff basis (alternative to SVD top-k). Honest under small N.
# The model id comes from the MODEL variable rather than a repeated literal.

# Extract the rank-1 mean-diff v_hack basis from MODEL.
extract-vhack-meandiff:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --mean-diff \
        --out-path=out/v_hack_full_meandiff.safetensors \
        --train-grads-path=out/vhack_grads_train_meandiff.safetensors
|
|
|
|
# Held-out check of out/v_hack_smoke.safetensors; writes out/vhack_heldout_cos_smoke.safetensors.
verify-vhack-smoke:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model=Qwen/Qwen3.5-0.8B \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_smoke.safetensors \
        --out-path=out/vhack_heldout_cos_smoke.safetensors
|
|
|
|
# Held-out check of out/v_hack_full.safetensors against MODEL; writes
# out/vhack_heldout_cos_full.safetensors. The model id comes from the MODEL
# variable rather than a repeated literal.
verify-vhack-full:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors
|
|
|
|
# =============================================================================
|
|
# SWEEPS — what to run, in order
|
|
# =============================================================================
|
|
#
|
|
# 1. `just probe-full-seed 41` — single-seed gate (~6-9h sequential).
|
|
# extract -> verify-heldout -> vanilla -> projected. Inspect before sweep.
|
|
# 2. `just queue-full` — 3-seed headline sweep (~36-54h).
|
|
# Queues 1 extract + 3 vanilla + 3 projected. Only run after probe passes.
|
|
#
|
|
# Helpers (used by queue-full, can also run standalone):
|
|
# just queue-vanilla / just queue-projected — 3 seeds of one arm.
|
|
# just probe-h4 41 — vanilla only on a single seed (H4 substrate sanity).
|
|
# =============================================================================
|
|
|
|
# Queues the single-seed gate as four DEPENDENT pueue tasks:
#   extract -> verify -> vanilla -> projected.
# Every stage is its own inspectable task; `-a` chains each one to its
# predecessor, so a stage starts only if the prior one succeeded (a nonzero
# exit blocks the rest of the chain).
# Gates A/B are enforced by exit code (verify exits nonzero when the `frac>0`
# statistic is <= 0.50). Gate C (vanilla actually hacks) is NOT an exit-code
# gate, because vanilla exits 0 regardless: inspect its HACK_RATE around step
# ~100 and `pueue kill` the queued projected task if it did not hack.
# Use this BEFORE `queue-full` to avoid burning 5/6 of the sweep compute on a
# dead substrate.

# Single-seed gate: queue extract, verify, vanilla and projected as a pueue chain.
probe-full-seed seed="41":
    #!/usr/bin/env bash
    set -euxo pipefail
    # `-p` prints only the new task id, which the next stage passes to `-a`.
    extract_id=$(pueue add -p -w "$PWD" -o 9 \
        -l "why: extract v_hack full; resolve: Gate A zero-norm=0, ~252 modules" \
        -- just extract-vhack-full)
    verify_id=$(pueue add -p -a "$extract_id" -w "$PWD" -o 9 \
        -l "why: verify heldout cos; resolve: Gate B frac>0>0.50, mean>0.20" \
        -- just verify-vhack-full)
    vanilla_id=$(pueue add -p -a "$verify_id" -w "$PWD" -o 9 \
        -l "why: vanilla seed{{ seed }} @ matched batch; resolve: Gate C H4 HACK_RATE>0.30 by ~step100" \
        -- {{ TRAIN }} full --arm=vanilla --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe)
    pueue add -a "$vanilla_id" -w "$PWD" -o 8 \
        -l "why: projected seed{{ seed }} @ matched batch, v_hack NOT post-hoc; resolve: Gate D H1 HACK_RATE<vanilla at matched PASS" \
        -- {{ TRAIN }} full --arm=projected --seed={{ seed }} --v-hack-path=out/v_hack_full.safetensors --out-tag=_full_projected_seed{{ seed }}_probe
    pueue status
|
|
|
|
# Cheapest way to answer "does this substrate actually hack with our reward
# function" (spec.md §H4): one vanilla run on one seed, run directly rather
# than queued through pueue.

# Vanilla-only single-seed probe of the H4 substrate.
probe-h4 seed="41":
    {{ TRAIN }} full \
        --arm=vanilla \
        --seed={{ seed }} \
        --out-tag=_full_vanilla_seed{{ seed }}_h4
|
|
|
|
# Queues the headline sweep through pueue: one extract task, then the vanilla
# and projected seeds via `queue-vanilla` / `queue-projected`.
# Only run after probe-full-seed shows vanilla hacks and projected fires.
# NOTE(review): the projected tasks are not chained to the extract task with
# `-a`; ordering appears to rely on the `-o` values (6 here, 5 and 4 in the
# queue-* recipes) — confirm the pueue parallelism setting makes that safe.

# Headline 3-seed sweep: extract + 3 vanilla + 3 projected via pueue.
queue-full:
    #!/usr/bin/env bash
    # Fail fast: without -e a failed `pueue add` or `just` call was ignored and
    # the remaining tasks were still queued, leaving a partial sweep.
    set -euxo pipefail
    pueue add -w "$PWD" -o 6 \
        -l "why: extract full v_hack for exact checkpoint; resolve: out/v_hack_full.safetensors exists and train.py key/rank check passes" \
        -- just extract-vhack-full
    just queue-vanilla full out/v_hack_full.safetensors
    just queue-projected full out/v_hack_full.safetensors
|
|
|
|
# Queues one vanilla pueue task per seed in SEEDS_3 for the given preset.
# H4 target: baseline hack rate >30% at step 200.
# `vhack` is not referenced by this recipe (presumably kept so both queue-*
# recipes accept the same arguments — verify before removing).

# 3-seed vanilla baseline (H4) queued via pueue.
queue-vanilla preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Fail fast: stop at the first failed `pueue add` instead of queuing only
    # some of the seeds.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
            -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
            -- {{ TRAIN }} {{ preset }} --arm=vanilla --seed="$seed" --out-tag="_{{ preset }}_vanilla_seed$seed"
    done
|
|
|
|
# Queues one projected pueue task per seed in SEEDS_3 for the given preset,
# each reading the v_hack file given by `vhack`.
# H1 target: -30pp hack vs vanilla at matched pass.

# 3-seed projected arm (H1) queued via pueue.
queue-projected preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Fail fast: stop at the first failed `pueue add` instead of queuing only
    # some of the seeds.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
            -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
            -- {{ TRAIN }} {{ preset }} --arm=projected --seed="$seed" --v-hack-path="{{ vhack }}" --out-tag="_{{ preset }}_projected_seed$seed"
    done
|
|
|
|
# Base pool: base Qwen3-4B with no LoRA and no hint applied (~0% hack per
# ariahw §86). It sources non-hack samples for the cos comparison bucket.
# `steps` is passed as both --steps and --n-problems.

# Generate the base-model (no LoRA, no hint) rollout pool.
probe-base-pool steps="20":
    uv run python -m projected_grpo.probe_distill \
        --base-only \
        --steps={{ steps }} \
        --n-problems={{ steps }}
|
|
|
|
# Compare trajectories of the warmup-gen runs (vanilla vs projected).
probe-traj:
    uv run python -m projected_grpo.probe_traj
|
|
|
|
# Show the results-table prototype stored in docs/table_proto.md.
table-proto:
    @cat docs/table_proto.md
|
|
|
|
# Runs probe_distill.py with --teacher-only for N prompts, writing to or
# extending out/probe_distill/teacher_pool/. Teacher = ariahw rh-s65 LoRA
# merged on Qwen3-4B. Cost is ~30s/prompt @ G=8, max_new=1024, i.e. ~50 min
# for 100 prompts. fast-vanilla / fast-projected consume the pool through
# --teacher-pool-dir. `n_prompts` is passed as both --steps and --n-problems.

# Pre-generate teacher rollouts for n_prompts prompts.
pregen-teacher n_prompts="100":
    uv run python -m projected_grpo.probe_distill \
        --teacher-only \
        --steps={{ n_prompts }} \
        --n-problems={{ n_prompts }} \
        --group=8 \
        --max-new=1024
|
|
|
|
# ---------- Cross-mechanism v_hack pipeline ----------
|
|
# (docs/spec/20260528_cross_mechanism_v_hack.md)
|
|
# Run in order after `pregen-teacher 300` populates the pool. half_a defaults
|
|
# to "E,C" -- the dominant signature on the existing 70-prompt pool; revisit
|
|
# after `regrade-pool` shows the 300-prompt distribution.
|
|
|
|
# Report 4-boolean co-occurrence and the signature breakdown for the cached pool.
regrade-pool:
    uv run python -m projected_grpo.regrade_pool
|
|
|
|
# Builds (hack, clean) pairs from the pool, restricted to half_A detectors on
# the hack side. Output goes to out/pairs_pool_half_<HALF_A>.json, where
# <HALF_A> is `half_a` with the commas removed (default "E,C" -> `EC`), holding
# N<=14 same-prompt pairs. Asserts hack and clean rollouts share the prompt.
# The interpolated values are quoted so an unusual `half_a` stays one argument.

# Build same-prompt (hack, clean) pairs from the teacher pool for half_a.
pairs-from-pool half_a="E,C":
    uv run python -m projected_grpo.pairs_from_pool \
        --pool-dir=out/probe_distill/teacher_pool \
        --half-a="{{ half_a }}" \
        --out-path="out/pairs_pool_half_{{ replace(half_a, ',', '') }}.json"
|
|
|
|
# Extracts v_hack from the pool-derived pairs (subprocess to extract_vhack_grad
# with --pairs-from-pool), so the output basis only sees half_A hacks at
# extract time. All three paths are derived from `half_a` with its commas
# removed. The model id comes from the MODEL variable rather than a repeated
# literal, and the derived paths are quoted.

# Extract v_hack from the pool-derived pairs for half_a.
extract-vhack-pool half_a="E,C":
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --pairs-from-pool="out/pairs_pool_half_{{ replace(half_a, ',', '') }}.json" \
        --out-path="out/v_hack_pool_half_{{ replace(half_a, ',', '') }}.safetensors" \
        --train-grads-path="out/vhack_grads_pool_half_{{ replace(half_a, ',', '') }}.safetensors"
|
|
|
|
# Trains with the pool-derived v_hack plus online refresh. half_a is passed on
# to train.py so the final BLUF reports HACK_A (in-distribution) and HACK_B
# (held-out). This is step 6 of the spec; cf. step 7 for the BLUF decision
# rules.

# Fast-preset projected run using the pool-derived v_hack for half_a.
fast-projected-pool half_a="E,C" seed="41":
    {{ TRAIN }} fast \
        --arm=projected \
        --v-hack-path=out/v_hack_pool_half_{{ replace(half_a, ',', '') }}.safetensors \
        --vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}.json \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        --grad-clip=500 \
        --vhack-refresh-every=10 \
        --half-a={{ half_a }} \
        --seed={{ seed }} \
        --out-tag=_xmech_half_{{ replace(half_a, ',', '') }}_seed{{ seed }}
|
|
|
|
# Uses the same seed and mix as fast-projected-pool so the HACK_A/HACK_B deltas
# between the two runs are comparable.

# Vanilla matched-seed baseline for the cross-mech experiment.
fast-vanilla-xmech half_a="E,C" seed="41":
    {{ TRAIN }} fast \
        --arm=vanilla \
        --teacher-pool-dir=out/probe_distill/teacher_pool \
        --mix-ratio=0.5 \
        --grad-clip=500 \
        --half-a={{ half_a }} \
        --seed={{ seed }} \
        --out-tag=_xmech_vanilla_half_{{ replace(half_a, ',', '') }}_seed{{ seed }}
|
|
|
|
# Print recent pueue task logs (`pueue log -l 40`).
log:
    pueue log -l 40
|
|
|
|
# Open RESEARCH_JOURNAL.md in $EDITOR (vi if unset) to add a dated entry at the top.
journal:
    @echo "Edit RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} RESEARCH_JOURNAL.md
|