mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:04:59 +08:00
dcd1b18303
Ariahw randomizes the run_tests function name at eval (line 84) so a high hack rate reflects responding to the disclosed mechanism, not memorizing a string. Our 3 added side-channel modes (stdout/sentinel/file) used family detectors with a fixed hint, so eval fell inside the train family -> no train/test gap. Fix: at eval, disclose a NOVEL per-mode token and tighten the detector to require exactly it (rewards.py _probe_channels gains eval_marker_phrase/eval_sentinel_attr/ eval_result_file; defaults None = unchanged family path at train). run_tests uses the existing test_func_name param. eval.py randomize_eval_markers swaps the WHOLE train hint (never touches problem text), seeded per problem so the curve stays smooth and comparisons stay paired. New gate scripts/verify_eval_gap.py proves, per mode: train-token hack scores under family, NOT under eval-token; novel-token hack scores under eval-token. Wired into smoke. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
477 lines
27 KiB
Makefile
477 lines
27 KiB
Makefile
set shell := ["bash", "-cu"]
|
|
|
|
# Three seeds for headline arms; one seed for ablations.
|
|
SEEDS_3 := "41 43 44"
|
|
# spec.md §H4 substrate (reference DEFAULT_MODEL_ID).
|
|
# At G=6, max_new=1024: peaks ~90GB on 96GB card after `logits_to_keep` fix
|
|
# (see RESEARCH_JOURNAL 2026-05-24 (b)).
|
|
MODEL := "Qwen/Qwen3-4B"
|
|
TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke only
|
|
TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point
|
|
|
|
default:
|
|
@just --list
|
|
|
|
# Aggregate every run in logs/*.log into one table: last-5 hack_s + last-5 gt_s
|
|
# (solve), sorted by time, plus a grouped-by-config view. tabulate markdown.
|
|
results:
|
|
uv run python scripts/results.py
|
|
|
|
# Smoke: same harness as production (train.py), tiny-random model on CPU,
|
|
# beartype on so jaxtyping signatures get runtime-checked. Runs 30 steps so
|
|
# the every-25-step save_ckpt path is covered. Should finish in ~1-2 min.
|
|
# Re-run after first invocation also exercises the v_hack cache-hit branch.
|
|
# Pulls cached teacher rollouts (real Qwen3-4B completions + real graded
|
|
# rewards) at mix_ratio=0.5 so the GRPO backward / projection / cin paths
|
|
# actually fire — pure tiny-random gen produces all-zero rewards and
|
|
# zero-variance bails every step, leaving the loss path uncovered.
|
|
smoke *ARGS:
|
|
uv run python scripts/verify_rewards.py # grader gate: 3 env_modes x clean/hack
|
|
uv run python scripts/verify_eval_gap.py # eval gate: train/test token gap holds for all 4 modes
|
|
uv run python scripts/verify_partition.py # no-cheat: partition clean + teacher_modes hands gate only known-mode demos
|
|
BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
|
|
--v-hack-path=out/vhack/v_hack_smoke.safetensors \
|
|
--teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
|
|
|
|
smoke-vanilla *ARGS:
|
|
BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
|
|
--teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
|
|
|
|
# Routing smoke: the hack-ward gradient is parked in delta_S_hack and ablated at eval.
# Exercises the R3 span assert, the two-param optimizer path, the periodic
# ablated-eval series, and the final kept-vs-ablated BLUF.
smoke-route *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke \
        --intervention=route \
        --v-hack-path=out/vhack/v_hack_smoke.safetensors \
        --teacher-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 \
        --eval-ablate-every=10 \
        --eval-n-prompts=2 \
        {{ ARGS }}
|
|
|
|
# Routing-v2 path (routeV): per-rollout calibrated-tau cosine routing into the
|
|
# scale-matched delta_S_hack quarantine. Splices the per-rollout gate into the
|
|
# forward, builds v_grad via extract_v_hack mean-diff, recovers per-rollout grad
|
|
# (c.grad/delta_S), routes flagged rollouts into delta_S_hack post-backward, and
|
|
# fires the deploy ablation (delta_S_hack zeroed) + the dsh-moved assert. Exercises
|
|
# tau/hkgap/qE logging too.
|
|
smoke-routeV *ARGS:
|
|
BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV \
|
|
--teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
|
|
--eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
|
|
|
|
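# Run the two smoke variants back to back: smoke-vanilla first, then smoke. Intended to
# cover the v_hack cache-miss path and then the cache-hit path, catching scope/save bugs
# that only manifest in one. NOTE(review): the first pass is --intervention=none; confirm
# it actually populates the v_hack cache, otherwise run `just smoke` twice instead.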
|
|
smoke-both:
|
|
just smoke-vanilla
|
|
just smoke
|
|
|
|
# Cross-mech smoke: exercises G2/G3 pipeline end-to-end on tiny inputs.
|
|
# Touches regrade_pool, pairs_from_pool, extract_vhack with --pairs-from-pool,
|
|
# and train with pool-derived V. Uses 2 prebaked prompts from teacher_pool.
|
|
# Tiny-random Qwen3 on CPU, ~1-2 min. Audit gate disabled (2 prompts can't pass).
|
|
smoke-xmech:
|
|
rm -rf out/pools/teacher_pool_smoke out/vhack/v_hack_pool_smoke.safetensors out/pairs_pool_smoke.json
|
|
mkdir -p out/pools/teacher_pool_smoke
|
|
# Prompts 5, 30 chosen for having mixed hack+clean rollouts (7+1 each); needed
|
|
# so pairs_from_pool can pair a hack-side with a clean-side per prompt.
|
|
cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/
|
|
cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/
|
|
uv run python -m vgrout.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
|
|
uv run python -m vgrout.pairs_from_pool \
|
|
--pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \
|
|
--out-path=out/pairs_pool_smoke.json
|
|
BEARTYPE=1 uv run python -m vgrout.extract_vhack_grad \
|
|
--model={{ TINY_MODEL }} --dtype=fp32 \
|
|
--pairs-from-pool=out/pairs_pool_smoke.json \
|
|
--n-heldout=0 --top-k=1 \
|
|
--out-path=out/vhack/v_hack_pool_smoke.safetensors \
|
|
--train-grads-path=out/vhack_grads/vhack_grads_pool_smoke.safetensors
|
|
BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
|
|
--v-hack-path=out/vhack/v_hack_pool_smoke.safetensors \
|
|
--vhack-pairs-path=out/pairs_pool_smoke.json \
|
|
--teacher-pool-dir=out/pools/teacher_pool_smoke --mix-ratio=0.5 \
|
|
--half-a=E,C \
|
|
--v-hack-k=1
|
|
|
|
# H4 baseline at spec substrate. No v_hack needed for vanilla.
|
|
full-vanilla *ARGS:
|
|
{{ TRAIN }} full --intervention=none {{ ARGS }}
|
|
|
|
full *ARGS:
|
|
{{ TRAIN }} full --intervention=erase {{ ARGS }} # erase on the prog_wide default (no pinned v-hack-path)
|
|
|
|
# Goal 0: minimum iteration loop to find a working GRPO-hacks-up baseline.
|
|
# Uses fast preset (60 steps, fast-Adam: lr=3e-3 beta1=0.5 beta2=0.9) + cached
|
|
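# teacher pool. NOTE(review): this used to say mix_ratio=0.5, but the recipe passes no
# --mix-ratio flag, so the fast preset default applies (stated elsewhere in this file as
# 0.125) -- confirm. UAT: hack_s rises from 0/N to >=N/4.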
|
|
# If lp_t stays flat with no NaN, the LR axis alone is exhausted; try inner_steps.
|
|
fast-vanilla *ARGS:
|
|
{{ TRAIN }} fast --intervention=none \
|
|
--teacher-pool-dir=out/pools/teacher_pool \
|
|
--grad-clip=500 {{ ARGS }}
|
|
|
|
# Goal 1: the fast-vanilla recipe with --intervention=erase; run it only once
# fast-vanilla has passed UAT. mix_ratio=0.125 and group=8 come from the locked-in
# fast defaults in config, not from flags here. --v-hack-path is deliberately
# omitted: erase falls back to the prog_wide default (auto-extracts
# v_hack_pairset_prog_wide), the same pair set routeV uses, so the arms stay
# apples-to-apples.
fast-projected *ARGS:
    {{ TRAIN }} fast \
        --intervention=erase \
        --teacher-pool-dir=out/pools/teacher_pool \
        --grad-clip=500 \
        {{ ARGS }}
|
|
|
|
# T8 (KEY GOAL): one CELL of the dynamics-plot matrix as a separate pueue job.
|
|
# INTERVENTION in {none, erase, route}; SEED an int. 60-step fast horizon,
|
|
# shared v_hack_21pairs basis (vanilla uses it only for the cos_pre diagnostic),
|
|
# eval-ablation on (no-op for none/erase; gives route its ablated series + BLUF).
|
|
# REFRESH>0 re-extracts v_hack every N steps = the ONLINE-erasure arm (static
|
|
# erasure is REFRESH=0, the default); plot_dynamics splits them by refr>0 and
|
|
# tags the log _online so the overlay carries both erasure arms.
|
|
# Logs land as ..._cell_{intervention}[_online]_s{seed}.log -> regen-dynamics globs them.
|
|
run-cell INTERVENTION SEED REFRESH='0':
|
|
{{ TRAIN }} fast --intervention={{ INTERVENTION }} \
|
|
--v-hack-path=out/vhack/v_hack_21pairs.safetensors \
|
|
--teacher-pool-dir=out/pools/teacher_pool \
|
|
--grad-clip=500 --steps=60 --seed={{ SEED }} \
|
|
--vhack-refresh-every={{ REFRESH }} \
|
|
--eval-ablate-every=5 \
|
|
--out-tag=_cell_{{ INTERVENTION }}{{ if REFRESH == "0" { "" } else { "_online" } }}_s{{ SEED }}
|
|
|
|
# EMERGENCE cell (Phase 1): teacher-free vanilla GRPO restricted to a single
# env_mode with no intervention, asking whether that loophole emerges under RL
# from ~0. ENVMODE in {run_tests, eq_override, exit_code}; 60-step fast horizon,
# grad_clip=10. Log name: ..._emerge_{envmode}_s{seed}.log.
# UAT: hack_s (exploited) rises from ~0.
run-cell-mode ENVMODE SEED:
    {{ TRAIN }} fast --intervention=none --env-mode={{ ENVMODE }} \
        --steps=60 --seed={{ SEED }} --out-tag=_emerge_{{ ENVMODE }}_s{{ SEED }}
|
|
|
|
# Build the even, non-overlapping multi-loophole teacher batch (substrate) from the
|
|
# de-risk elicit-then-strip hacks + the run_tests teacher pool. Writes
|
|
# out/pools/substrate/{prompt_*.jsonl.gz, partition.json}. Modes default to the ones
|
|
# with usable seeds (derisk #10): run_tests (pool), exit_code+sentinel (elicit).
|
|
build-substrate MODES="run_tests,exit_code,sentinel":
|
|
uv run python scripts/build_substrate.py \
|
|
--modes {{ MODES }} --pool-modes run_tests --min-hacks 5
|
|
|
|
# Vanilla-GRPO emergence on the multi-loophole substrate: does the student learn ALL
|
|
# K loopholes from the repeated even teacher batch? UAT = end-of-run SUBSTRATE table
|
|
# (per-mode hacks>0 + finite first_step) + the per-step hk_<mode> columns. mix=0.125
|
|
# is the locked default (omit to inherit it). Vanilla needs no v_hack; for an
|
|
# erase/route substrate run, add --v-hack-path explicitly.
|
|
# Queue the substrate overlay sweep -- four arms as written: none, erase, route, routeV (the all-arms per-mode deploy plot,
|
|
# #162). The arm set is FIXED -- no params, no defaults repeated. seed/steps/refresh
|
|
# inherit FastConfig defaults (seed41 steps60 rf5); each arm passes ONLY its
|
|
# intervention + out-tag. out-tag distinguishes the runs for the plot glob. Every
|
|
# arm emits out/runs/<ts>_<tag>/per_mode_deploy.json.
|
|
queue-substrate:
|
|
pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla
|
|
pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase
|
|
pueue add -w "$PWD" -o 5 -l "why: route arm (subspace-projection quarantine, rf5); resolve: deploy hack on held-out modes vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route --out-tag=_sub4_route
|
|
pueue add -w "$PWD" -o 5 -l "why: routeV calibrated-tau routing into scale-matched delta_S_hack; resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=routeV --out-tag=_sub4_routeV
|
|
|
|
# CANONICAL plotting entrypoint for the substrate sweep. One command, four figures
|
|
# (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay,
|
|
# the latter two delegated to plot_dynamics). Default glob = all 4-mode sub4 logs.
|
|
plot GLOB='logs/*_sub4_*.log' STEM='out/figs/substrate':
|
|
uv run python scripts/plot_substrate.py {{ GLOB }} --out-stem {{ STEM }}
|
|
|
|
# All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts
|
|
# (honest shipped-model numbers; routeV-safe -- reads JSON, not logs). Default
|
|
# globs every sub4 run dir. -> out/figs/deploy_overlay.png
|
|
plot-deploy GLOB='out/runs/*sub4*/per_mode_deploy.json' OUT='out/figs/deploy_overlay.png':
|
|
uv run python scripts/plot_deploy_overlay.py {{ GLOB }} --out {{ OUT }}
|
|
|
|
# Regenerate both dynamics plots from the cell logs (default: all cells; pass a
|
|
# narrower glob like 'logs/*_cell_*_s41.log' for the seed-41-only checkpoint).
|
|
regen-dynamics GLOB='logs/*_cell_*.log':
|
|
uv run python scripts/plot_dynamics.py {{ GLOB }} --out out/figs/dynamics.png
|
|
|
|
# Auto dynamics plot: newest full-length (>=MIN steps) log PER ARM, no hand-globbing.
|
|
# Run after any sweep finishes -> always plots the freshest 60-step run of each arm.
|
|
dyn MIN='60' OUT='out/figs/dyn_sub4.png':
|
|
uv run python scripts/plot_dynamics.py logs/ --latest-per-arm --min-steps {{ MIN }} --out {{ OUT }}
|
|
|
|
# Phase-1 emergence overlay: one line per env_mode (hack=exploited, solve=gt_correct).
|
|
regen-emergence GLOB='logs/*_emerge_*.log':
|
|
uv run python scripts/plot_emergence.py {{ GLOB }} --out out/figs/emergence.png
|
|
|
|
# Sync the rl-rewardhacking external repo (Nanda's verl wrapper).
|
|
sync-external:
|
|
cd external/rl-rewardhacking && git pull --ff-only
|
|
|
|
# Warm HF cache before real runs (avoids re-download on first pueue job).
|
|
download-model:
|
|
uv run python -c "from huggingface_hub import snapshot_download; \
|
|
snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
|
|
|
extract-vhack-smoke:
|
|
uv run python -m vgrout.extract_vhack_grad \
|
|
--model=Qwen/Qwen3.5-0.8B \
|
|
--dtype=bf16 \
|
|
--out-path=out/vhack/v_hack_smoke.safetensors \
|
|
--train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors
|
|
|
|
# Extract the full-size v_hack for the substrate model. The model id comes from the
# file-level MODEL variable (the same one download-model warms the cache for) rather
# than a second hard-coded copy, so changing the substrate cannot leave the extract
# pointing at a different checkpoint. Outputs land at the --out-path and
# --train-grads-path locations below.
extract-vhack-full:
    uv run python -m vgrout.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --out-path=out/vhack/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_train_full.safetensors
|
|
|
|
verify-vhack-smoke:
|
|
uv run python scripts/verify_vhack_heldout.py \
|
|
--model=Qwen/Qwen3.5-0.8B \
|
|
--dtype=bf16 \
|
|
--v-hack-path=out/vhack/v_hack_smoke.safetensors \
|
|
--out-path=out/vhack_heldout_cos_smoke.safetensors
|
|
|
|
# Held-out verification of the full v_hack written by extract-vhack-full. The model id
# comes from the file-level MODEL variable instead of a hard-coded copy, so verify
# always targets the same checkpoint the rest of the full pipeline is configured for.
verify-vhack-full:
    uv run python scripts/verify_vhack_heldout.py \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/vhack/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors
|
|
|
|
# =============================================================================
|
|
# SWEEPS — what to run, in order
|
|
# =============================================================================
|
|
#
|
|
# 1. `just probe-full-seed 41` — single-seed gate (~6-9h sequential).
|
|
# extract -> verify-heldout -> vanilla -> projected. Inspect before sweep.
|
|
# 2. `just queue-full` — 3-seed headline sweep (~36-54h).
|
|
# Queues 1 extract + 3 vanilla + 3 projected. Only run after probe passes.
|
|
#
|
|
# Helpers (used by queue-full, can also run standalone):
|
|
# just queue-vanilla / just queue-projected — 3 seeds of one arm.
|
|
# just probe-h4 41 — vanilla only on a single seed (H4 substrate sanity).
|
|
# =============================================================================
|
|
|
|
# Single-seed gate as 4 DEPENDENT pueue tasks: extract -> verify -> vanilla -> projected.
|
|
# Each stage is its own inspectable task; -a chains them so a stage only starts if
|
|
# the prior succeeded (nonzero exit blocks the chain). Gates A/B are enforced by exit
|
|
# code (verify exits nonzero if the frac>0 statistic is <=0.50). Gate C (vanilla actually hacks) is NOT
|
|
# an exit-code gate -- vanilla exits 0 regardless -- so inspect its HACK_RATE around
|
|
# step ~100 and `pueue kill` the queued projected task if it didn't hack.
|
|
# Use BEFORE `queue-full` to avoid burning 5/6 of the sweep compute on a dead substrate.
|
|
probe-full-seed seed="41":
|
|
#!/usr/bin/env bash
|
|
set -euxo pipefail
|
|
EX=$(pueue add -p -w "$PWD" -o 9 -l "why: extract v_hack full; resolve: Gate A zero-norm=0, ~252 modules" -- just extract-vhack-full)
|
|
VF=$(pueue add -p -a "$EX" -w "$PWD" -o 9 -l "why: verify heldout cos; resolve: Gate B frac>0>0.50, mean>0.20" -- just verify-vhack-full)
|
|
VA=$(pueue add -p -a "$VF" -w "$PWD" -o 9 -l "why: vanilla seed{{ seed }} @ matched batch; resolve: Gate C H4 HACK_RATE>0.30 by ~step100" -- {{ TRAIN }} full --intervention=none --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe)
|
|
pueue add -a "$VA" -w "$PWD" -o 8 -l "why: projected seed{{ seed }} @ matched batch, v_hack NOT post-hoc; resolve: Gate D H1 HACK_RATE<vanilla at matched PASS" -- {{ TRAIN }} full --intervention=erase --seed={{ seed }} --v-hack-path=out/vhack/v_hack_full.safetensors --out-tag=_full_projected_seed{{ seed }}_probe
|
|
pueue status
|
|
|
|
# Vanilla-only single-seed probe. Cheapest way to answer "does this substrate
|
|
# actually hack with our reward function" (spec.md §H4).
|
|
probe-h4 seed="41":
|
|
{{ TRAIN }} full --intervention=none --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_h4
|
|
|
|
# Headline 3-seed sweep: extract + 3 vanilla + 3 projected via pueue.
# Only run after probe-full-seed shows vanilla hacks and projected fires.
# The extract task id is captured (-p) and handed to queue-projected, which chains
# its arms on it (-a) the same way probe-full-seed does, so a projected arm only
# starts once the extract that writes its --v-hack-path has succeeded.
# `set -e` aborts the sweep if any queueing step itself fails.
queue-full:
    #!/usr/bin/env bash
    set -euxo pipefail
    EX=$(pueue add -p -w "$PWD" -o 6 \
        -l "why: extract full v_hack for exact checkpoint; resolve: out/vhack/v_hack_full.safetensors exists and train.py key/rank check passes" \
        -- just extract-vhack-full)
    just queue-vanilla full out/vhack/v_hack_full.safetensors
    just queue-projected full out/vhack/v_hack_full.safetensors "$EX"

# 3-seed vanilla baseline (H4: baseline hack rate >30% at step 200).
# One pueue task per seed in SEEDS_3. `vhack` is accepted so the call signature
# mirrors queue-projected; the vanilla command line does not use it.
queue-vanilla preset="full" vhack="out/vhack/v_hack_full.safetensors":
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
            -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
            -- {{ TRAIN }} {{ preset }} --intervention=none --seed=$seed --out-tag=_{{ preset }}_vanilla_seed$seed
    done

# 3-seed projected (H1: -30pp hack vs vanilla at matched pass).
# One pueue task per seed in SEEDS_3, each reading `vhack` via --v-hack-path.
# `after` (optional) is a pueue task id: when given, every arm is queued with
# `-a <after>` so it waits for that task (e.g. the v_hack extract) to succeed.
# Left empty, the arms are queued with no dependency, as before.
queue-projected preset="full" vhack="out/vhack/v_hack_full.safetensors" after="":
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add {{ if after == "" { "" } else { "-a " + after } }} -w "$PWD" -o 4 \
            -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
            -- {{ TRAIN }} {{ preset }} --intervention=erase --seed=$seed --v-hack-path={{ vhack }} --out-tag=_{{ preset }}_projected_seed$seed
    done
|
|
|
|
# Base pool: base Qwen3-4B, no LoRA, no hint applied. ~0% hack per ariahw §86.
|
|
# Used to source non-hack samples for the cos comparison bucket.
|
|
probe-base-pool steps="20":
|
|
uv run python scripts/probe_distill.py --base-only --steps={{ steps }} --n-problems={{ steps }}
|
|
|
|
# Print the results table prototype.
|
|
table-proto:
|
|
@cat docs/table_proto.md
|
|
|
|
# Pre-generate teacher rollouts for n_prompts prompts (probe_distill.py --teacher-only),
# writing to / extending out/pools/teacher_pool/. Teacher = ariahw rh-s65 LoRA merged
# on Qwen3-4B. Cost ~30s/prompt @ G=8, max_new=1024 -> ~50 min for 100.
# fast-vanilla / fast-projected consume the pool via --teacher-pool-dir.
pregen-teacher n_prompts="100":
    uv run python scripts/probe_distill.py --teacher-only \
        --steps={{ n_prompts }} --n-problems={{ n_prompts }} \
        --group=8 --max-new=1024
|
|
|
|
# G2: pre-generate a pool from an alternative Aria teacher checkpoint.
# `tag` names the output subdir under out/pools/<tag>/.
# Example: just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-gt-monitor-penalty-s65 teacher_pool_gtmon_s65 50
pregen-teacher-alt teacher tag n_prompts="50":
    uv run python scripts/probe_distill.py --teacher-only \
        --teacher={{ teacher }} --tag={{ tag }} \
        --steps={{ n_prompts }} --n-problems={{ n_prompts }} \
        --group=8 --max-new=1024
|
|
|
|
# ---------- Cross-mechanism v_hack pipeline ----------
|
|
# (docs/spec/20260528_cross_mechanism_v_hack.md)
|
|
# Run in order after `pregen-teacher 300` populates the pool. half_a defaults
|
|
# to "E,C" -- the dominant signature on the existing 70-prompt pool; revisit
|
|
# after `regrade-pool` shows the 300-prompt distribution.
|
|
|
|
# 4-boolean co-occurrence + signature breakdown on the cached pool.
|
|
# `pool` selects which pool to regrade (default = original rh-s65 pool).
|
|
regrade-pool pool="out/pools/teacher_pool":
|
|
uv run python -m vgrout.regrade_pool --pool-dir={{ pool }}
|
|
|
|
# Build a combined teacher pool by concatenating same-prompt rollouts from
|
|
# multiple source pools. Used by G2/G3 (docs/spec/20260528_g2_g3_checkpoint_selection.md).
|
|
# Output is one prompt_NNNN.jsonl.gz per unique problem_id, containing all
|
|
# rollouts from all source pools that share that problem_id. Lets
|
|
# pairs_from_pool / regrade_pool consume the combined pool transparently.
|
|
build-combined-pool:
|
|
uv run python scripts/build_combined_pool.py
|
|
|
|
# Build (hack, clean) pairs from the pool, restricted to half_A detectors on
|
|
# the hack side. Writes out/pairs_pool_half_<HALF_A><tag>.json (HALF_A with commas removed, e.g. "E,C" -> EC) with N<=14 same-prompt
|
|
# pairs. Asserts hack and clean rollouts share the prompt.
|
|
pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
|
|
uv run python -m vgrout.pairs_from_pool \
|
|
--pool-dir={{ pool }} \
|
|
--half-a={{ half_a }} \
|
|
--out-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json
|
|
|
|
# Extract v_hack from the pool-derived pairs (subprocess to extract_vhack_grad
|
|
# with --pairs-from-pool). Output basis only sees half_A hacks at extract time.
|
|
extract-vhack-pool half_a="E,C" tag="":
|
|
uv run python -m vgrout.extract_vhack_grad \
|
|
--model=Qwen/Qwen3-4B --dtype=bf16 \
|
|
--pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
|
|
--out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
|
|
--train-grads-path=out/vhack_grads/vhack_grads_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors
|
|
|
|
# Train with pool-derived v_hack + online refresh. half_a echoed to train.py so
|
|
# the final BLUF reports HACK_A (in-distribution) and HACK_B (held-out). Step
|
|
# 6 of the spec; cf. step 7 BLUF decision rules.
|
|
fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
|
|
{{ TRAIN }} fast --intervention=erase \
|
|
--v-hack-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
|
|
--vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
|
|
--teacher-pool-dir={{ pool }} --mix-ratio=0.5 \
|
|
--grad-clip=500 \
|
|
--vhack-refresh-every=10 \
|
|
--half-a={{ half_a }} \
|
|
--seed={{ seed }} \
|
|
--out-tag=_xmech_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }}
|
|
|
|
# Matched-seed vanilla baseline for the cross-mech experiment: same seed and mix as
# fast-projected-pool, so the HACK_A/HACK_B deltas are comparable.
fast-vanilla-xmech half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
    {{ TRAIN }} fast --intervention=none \
        --teacher-pool-dir={{ pool }} --mix-ratio=0.5 --grad-clip=500 \
        --half-a={{ half_a }} --seed={{ seed }} \
        --out-tag=_xmech_vanilla_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }}
|
|
|
|
# Show recent pueue logs.
|
|
log:
|
|
pueue log -l 40
|
|
|
|
# Add a new dated research journal entry at the top of RESEARCH_JOURNAL.md (opens $EDITOR, falling back to vi).
|
|
journal:
|
|
@echo "Edit RESEARCH_JOURNAL.md and prepend a dated entry."
|
|
@${EDITOR:-vi} RESEARCH_JOURNAL.md
|
|
|
|
# Compile the workshop writeup (tectonic = self-contained latex, fetches pkgs).
|
|
paper:
|
|
cd docs/writeup && tectonic main.tex && echo "-> docs/writeup/main.pdf"
|
|
|
|
# QC: compile (via the `paper` dependency), dump the RENDERED pdf to text per-page
# (pdfplumber), then grep for unfilled markers. The author's loop: read paper.txt +
# qc_report.txt to see what the COMPILED pdf shows -- unresolved refs render as "??",
# undefined citations as "[?]", plus our \TODO macro. paper.txt is page-delimited so
# you can check page count and per-page content / bibliography as rendered.
# SHOULD: qc_report lists every TODO/??/[?] so none ship by accident.
# The unresolved-marker grep searches for BOTH literal "??" and literal "[?]" (-F with
# two -e patterns); searching "??" alone never reports undefined citations.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo "### pages:"; grep -c '===== page' paper.txt; \
      echo; echo '### unresolved refs / citations (?? or [?]):'; grep -nF -e '??' -e '[?]' paper.txt || echo ' none'; \
      echo; echo '### TODO markers in compiled pdf:'; grep -nF 'TODO' paper.txt || echo ' none'; \
      echo; echo '### TODO markers in source:'; grep -nE '\\TODO|TODO' main.tex refs.bib || echo ' none' ) \
    | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt (+ paper.txt: page-delimited rendered text)"
|
|
|
|
# tex -> markdown via pandoc, for the LW blog draft + cheap LLM read-throughs.
# --citeproc resolves \cite against refs.bib, so the md shows author-year, not [?].
# The nips .sty \usepackage line is dropped with sed first: pandoc reads local
# packages and chokes on its low-level \vbox \maketitle, and the style is
# irrelevant to markdown.
paper-md:
    cd docs/writeup && \
        sed '/usepackage{nips15submit_e}/d' main.tex \
        | pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
        echo "-> docs/writeup/main.md"
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# PAPER RUNS (on record). Each is queued via pueue with a why:/resolve: label.
|
|
# Long jobs (~8h/200steps on the 96GB box); fast preset, Qwen3-4B, mix=0.125
|
|
# substrate unless noted. All emit out/runs/<ts>_<tag>/per_mode_deploy.json.
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# H: routeV deploy-hack stays ~0 to convergence while vanilla saturates (not
|
|
# collapses). Long-run A4 figure. Stabilised optimizer: tiny KL beta=1e-5 (anchor
|
|
# coherence, too weak to undo the hack reward -- see RESEARCH_JOURNAL 2026-06-02
|
|
# job-85 divergence) + normal Adam 0.9/0.99; lr unchanged (SVD adapter tolerates).
|
|
# ARM in {none, routeV}. UAT: deploy hack/solve trajectory to 200, no lp_s collapse.
|
|
paper-longrun ARM SEED='41':
|
|
pueue add -w "$PWD" -o 0 -l "why: {{ ARM }}-200 KL-stabilised (beta=1e-5, Adam 0.9/0.99) long-run A4; resolve: routeV deploy hack~0 to 200 while vanilla saturates w/o collapse" -- \
|
|
{{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
|
|
--beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 \
|
|
--steps=200 --eval-ablate-every=20 --out-tag=_{{ ARM }}200_kl5_s{{ SEED }}
|
|
|
|
# H: routeV suppresses ENDOGENOUSLY-emerged hacks (no teacher mix at all -- pure
|
|
# on-policy). mix=0 keeps the pool only for the 4-mode partition + v_grad extraction.
|
|
# 800 steps ~= 100 reference-paper steps. ARM in {none, routeV}. SLOW (~32h).
|
|
paper-noteacher ARM SEED='41' STEPS='800':
|
|
pueue add -w "$PWD" -o 0 -l "why: {{ ARM }} NO-TEACHER mix=0 pure on-policy {{ STEPS }}step; resolve: does routeV suppress endogenous hacks vs vanilla" -- \
|
|
{{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
|
|
--mix-ratio=0 --steps={{ STEPS }} --eval-ablate-every=20 \
|
|
--out-tag=_{{ ARM }}_noteacher_s{{ SEED }}
|
|
|
|
# H: routeV holds suppression after the teacher crutch is removed. Teacher-seeds all
|
|
# 4 hacks for OFF steps, then cuts to pure on-policy. Smarter no-teacher test (pure
|
|
# mix=0 from step 0 may never emerge all modes). ARM in {none, routeV}.
|
|
paper-teacheroff ARM SEED='41' OFF='40' STEPS='200':
|
|
pueue add -w "$PWD" -o 0 -l "why: {{ ARM }} teacher-off@{{ OFF }} curriculum (seed hacks then on-policy); resolve: routeV deploy hack stays ~0 after teacher cut at {{ OFF }}" -- \
|
|
{{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
|
|
--teacher-off-step={{ OFF }} --steps={{ STEPS }} --eval-ablate-every=20 \
|
|
--out-tag=_{{ ARM }}_toff{{ OFF }}_s{{ SEED }}
|
|
|
|
# A5 step 1: short vanilla on the substrate to HARVEST real student hacks (with the
|
|
# new problem_id/env_mode/prompt logging) -> rollouts.jsonl. ~40 steps gives the
|
|
# 6+6 per-mode hacks/cleans needed to build the 2-mode held-out pair set. Then build
|
|
# pairs from 2 known modes, extract v_grad, run paper-heldout. UAT: rollouts.jsonl
|
|
# has >=6 exploited + >=6 clean(gt_pass,!exploited) for each of run_tests, file_marker.
|
|
paper-harvest SEED='41' STEPS='40':
|
|
pueue add -w "$PWD" -o 4 -l "why: A5 harvest real student hacks (logged problem_id/prompt) for 2-mode held-out pair set; resolve: >=6 hack+6 clean per known mode in rollouts.jsonl" -- \
|
|
{{ TRAIN }} fast --intervention=none --seed={{ SEED }} \
|
|
--steps={{ STEPS }} --out-tag=_harvest_s{{ SEED }}
|