mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:04:59 +08:00
0112f4a36d
The gt_only flip was seeded on (seed, pid) at load time, so the SAME ~10% of problems were unhackable every step -- a fixed honest subset the model can memorize instead of learning to genuinely solve the distribution. Move the flip into the train step loop, seeded on (seed, step, pid), so the unhackable subset rotates: over training every problem is sometimes shown hint-free. Both halves flip, not just the hint: - prompt: messages_gt (plain "pass all tests", no loophole disclosed) vs messages - grader: eff_mode=gt_only routes compute_reward to the honest oracle (no channel), so a hack earns format-only reward. Teacher demos are skipped on flipped steps (a cached loophole hack no longer matches the hint-free prompt). Gates: verify_rewards gains gt_only rows (every hack -> passed=False, reward 0.5); new verify_rotation proves messages_gt is hint-free AND the subset rotates per step. Smoke logs flip count (1/30 hint-free, graded gt_only). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
625 lines
40 KiB
Makefile
625 lines
40 KiB
Makefile
set shell := ["bash", "-cu"]

# Seed list for the headline arms (three seeds); ablations run on one seed.
SEEDS_3 := "41 43 44"

# spec.md §H4 substrate (reference DEFAULT_MODEL_ID).
# At G=6, max_new=1024 this peaks at ~90GB on a 96GB card after the
# `logits_to_keep` fix (see RESEARCH_JOURNAL 2026-05-24 (b)).
MODEL := "Qwen/Qwen3-4B"

# qwen3 arch, ~6M params, smoke only
TINY_MODEL := "llamafactory/tiny-random-qwen3"

# real LeetCode GRPO entry point
TRAIN := "uv run python -m vgrout.train"
|
|
|
|
# Default recipe: print the list of available recipes.
default:
    @just --list
|
|
|
|
# Headline results: deploy table on the untouched recency-held-out test split,
# one row per run.
# Columns: time, headline=solve_deploy-hack_deploy, deploy hack/solve, arm, pair,
# seed, train hack/solve, model, train_set, n, argv.
# Hard cutoff to eval2-era (EVAL2_CUTOFF in the script).
results:
    uv run python scripts/results_deploy.py
|
|
|
|
# Training-dynamics table (last-5 hack_s/gt_s per run, grouped by config,
# paired vs vanilla). This is the erase/project-era view; less useful for
# routeV (config cols are defaults). eval2-cutoff'd.
results-train:
    uv run python scripts/results.py
|
|
|
|
# Offline full-test progress curve from ckpt_update0000/0010/...; routeV is
# scored knob-on and knob-off, vanilla is scored once.
# Run after training, never in-loop.
eval-curve RUN:
    uv run python scripts/eval_checkpoint_curve.py {{ RUN }}
|
|
|
|
# Smoke: same harness as production (train.py), tiny-random model on CPU, with
# beartype on so jaxtyping signatures get runtime-checked. Runs 30 steps so the
# checkpoint saves at updates 0/10/20/30 are covered. Should finish in ~1-2 min.
# Re-running after the first invocation also exercises the v_hack cache-hit branch.
# Pulls cached teacher rollouts (real Qwen3-4B completions + real graded rewards)
# at mix_ratio=0.5 so the GRPO backward / projection / cin paths actually fire --
# pure tiny-random generation produces all-zero rewards and zero-variance bails
# every step, leaving the loss path uncovered.
# The five verify_* gate scripts run first, then the erase-intervention smoke train.
smoke *ARGS:
    uv run python scripts/verify_rewards.py  # grader gate: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py  # eval gate: train/test token gap holds for all 4 modes
    uv run python scripts/verify_partition.py  # no-cheat: partition clean + teacher_modes hands gate only known-mode demos
    uv run python scripts/verify_science_invariants.py  # pair provenance + untouched final test
    uv run python scripts/verify_rotation.py  # rotating-unhackable flip: hint-free messages_gt + subset rotates per step
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
        --v-hack-path=out/vhack/v_hack_smoke.safetensors \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
|
|
|
|
# Smoke preset with no intervention (--intervention=none), beartype on, cached
# teacher pool mixed at 0.5. Extra ARGS are appended to the train command.
smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}
|
|
|
|
# Routing-v2 path (routeV): per-rollout calibrated-tau cosine routing into the
# scale-matched delta_S_hack quarantine. Splices the per-rollout gate into the
# forward, builds v_grad via extract_v_hack mean-diff, recovers the per-rollout
# grad (c.grad/delta_S), routes flagged rollouts into delta_S_hack post-backward,
# and fires the deploy ablation (delta_S_hack zeroed) + the dsh-moved assert.
# Also exercises tau/hkgap/qE logging.
smoke-routeV *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
|
|
|
|
# 100%-absorption control (NO vector): every knob-on rollout is routed fully into
# the quarantine; only the knob-off floor (rollout_ablate_frac) stays in the
# deployed knob. Direction-free, so v_grad is extracted but inert.
# Needs frac>0 or the knob never updates.
smoke-absorb *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV --routeV-absorb-all \
        --rollout-ablate-frac=0.5 \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
|
|
|
|
# Realism env: a random fraction of TRAIN problems is flipped to gt_only
# (unhackable, only honest solving pays) so there is persistent solve pressure.
# frac=0.3 here so the flip definitely fires on the tiny smoke pool; eval stays
# all-loophole (no gt_only).
# NOTE(review): this recipe passes no unhackable-fraction flag, so the 0.3
# presumably comes from the smoke preset's config -- confirm.
smoke-unhackable *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-n-prompts=2 {{ ARGS }}
|
|
|
|
# Run the smoke harness twice: the first run warms the v_hack cache (cache-miss
# path), the second hits the cache (cache-hit path). Catches scope/save bugs
# that only manifest in one of the two.
# NOTE(review): the first invocation is `smoke-vanilla` (--intervention=none),
# not `smoke` -- confirm that it actually populates the v_hack cache.
smoke-both:
    just smoke-vanilla
    just smoke
|
|
|
|
# Cross-mech smoke: exercises the G2/G3 pipeline end-to-end on tiny inputs.
# Touches regrade_pool, pairs_from_pool, extract_vhack with --pairs-from-pool,
# and train with a pool-derived V. Uses 2 prebaked prompts from teacher_pool.
# Tiny-random Qwen3 on CPU, ~1-2 min. Audit gate disabled (2 prompts can't pass).
smoke-xmech:
    rm -rf out/pools/teacher_pool_smoke out/vhack/v_hack_pool_smoke.safetensors out/pairs_pool_smoke.json
    mkdir -p out/pools/teacher_pool_smoke
    # Prompts 5, 30 chosen for having mixed hack+clean rollouts (7+1 each); needed
    # so pairs_from_pool can pair a hack-side with a clean-side per prompt.
    cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/
    cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/
    uv run python -m vgrout.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
    uv run python -m vgrout.pairs_from_pool \
        --pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \
        --out-path=out/pairs_pool_smoke.json
    BEARTYPE=1 uv run python -m vgrout.extract_vhack_grad \
        --model={{ TINY_MODEL }} --dtype=fp32 \
        --pairs-from-pool=out/pairs_pool_smoke.json \
        --n-heldout=0 --top-k=1 \
        --out-path=out/vhack/v_hack_pool_smoke.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_pool_smoke.safetensors
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \
        --v-hack-path=out/vhack/v_hack_pool_smoke.safetensors \
        --vhack-pairs-path=out/pairs_pool_smoke.json \
        --teacher-pool-dir=out/pools/teacher_pool_smoke --mix-ratio=0.5 \
        --half-a=E,C \
        --v-hack-k=1
|
|
|
|
# H4 baseline at the spec substrate (`full` preset, --intervention=none).
# Vanilla needs no v_hack.
full-vanilla *ARGS:
    {{ TRAIN }} full --intervention=none {{ ARGS }}
|
|
|
|
# `full` preset with --intervention=erase; extra ARGS are appended.
full *ARGS:
    {{ TRAIN }} full --intervention=erase {{ ARGS }}  # erase on the prog_wide default (no pinned v-hack-path)
|
|
|
|
# Goal 0: minimum iteration loop to find a working GRPO-hacks-up baseline.
# Uses the fast preset (60 steps, fast-Adam: lr=3e-3 beta1=0.5 beta2=0.9) plus the
# cached teacher pool at mix_ratio=0.5. UAT: hack_s rises from 0/N to >=N/4.
# If lp_t stays flat with no NaN, the LR axis alone is exhausted; try inner_steps.
# NOTE(review): no --mix-ratio is passed here and the fast-projected notes call
# mix_ratio=0.125 the locked-in fast default -- confirm which value applies.
fast-vanilla *ARGS:
    {{ TRAIN }} fast --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool \
        {{ ARGS }}
|
|
|
|
# Goal 1: the fast-vanilla recipe with --intervention=erase. Run only after
# fast-vanilla passes UAT.
# mix_ratio=0.125 + group=8 are the locked-in fast defaults (config), not flags here.
# No --v-hack-path: erase uses the prog_wide default (auto-extracts
# v_hack_pairset_prog_wide), the same pair set as routeV, so the arms are
# apples-to-apples.
fast-projected *ARGS:
    {{ TRAIN }} fast --intervention=erase \
        --teacher-pool-dir=out/pools/teacher_pool \
        {{ ARGS }}
|
|
|
|
# H: the LoRA-frozen-B adapter (trainable down-proj A, FROZEN random up-proj B)
# routes as well as the AntiPaSTO SVD adapter. A frozen B makes the
# error->bottleneck map g_h = B^T δ_y STATIC, so routeV decides in the
# r-bottleneck and splits A.grad into A_hack. ~10-100x params vs δS -> small
# lora_r (=32) and a smaller prompts_per_step if memory binds.
# Single-mode default (no teacher-pool override).
# resolve: deploy_hack ~ AntiPaSTO-routeV at matched solve -> routing is
# adapter-agnostic; >> -> the SVD basis carries the effect.
fast-lora-routeV *ARGS:
    {{ TRAIN }} fast-lora --intervention=routeV {{ ARGS }}
|
|
|
|
# H: ABSORB-ALL control (100% absorption, NO vector). The WHOLE gradient of every
# knob-on rollout is routed into the quarantine; the deployed knob learns ONLY
# from the knob-off exploration floor (rollout_ablate_frac). v_grad is extracted
# (authored pairs) but inert -> routing is purely by generation mode, no direction.
# The extreme of H2: is quarantine-as-sink + floor-only-deploy enough to suppress,
# with zero direction?
# resolve: deploy_hack ~ best/random-V -> absorption alone suffices (direction
# adds nothing); deploy_hack >> -> direction is load-bearing after all.
queue-absorb seed='43':
    pueue add -w "$PWD" -o 24 \
        -l "why: routeV ABSORB-ALL (100% absorption, NO vector, frac=0.25) s{{seed}}; resolve: deploy_hack ~ best/random-V -> mode-routing alone suppresses (H2 extreme); >> -> direction needed" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-absorb-all --rollout-ablate-frac=0.25 \
        --vhack-pairs-path=out/pairsets/pairs_authored.json \
        --seed={{seed}} --out-tag=_dir8_routeV_absorb_s{{seed}}
|
|
|
|
# H: vGROUT directionality set -- ONE seed, single-mode run_tests, on the FIXED
# eval (paper test set, base solve ~0.1). The name says 6 arms, but 8 tasks are
# queued below (P1-P8, including two LoRA-frozen-B arms).
# Tests whether routeV's deploy-hack suppression needs the REAL hack direction.
# resolve: real-V (rollout & per-token) << {random-V (Haar, out-of-subspace),
# vampire (in-subspace semantic placebo)} in deploy hack at matched solve, and
# vanilla deploy hack >> 0 (else nothing to suppress).
# teacher_pool_runtests_dense (~215 prompts, re-graded rh-s65 in-sample hacks) so
# the hack actually seeds in 60 steps: the old 6-prompt pool covered ~3% of train,
# ~1 teacher demo per 8 steps, student never learned the hack (data invalid).
# Priority descending so they execute in listed order (routeV best first).
queue-dir6 seed='43':
    pueue add -w "$PWD" -o 60 -l "why: P1 routeV real-V per-rollout (best method) s{{seed}}; resolve: deploy_hack << random/vampire at matched solve" \
        -- {{ TRAIN }} fast --intervention=routeV --seed={{seed}} --out-tag=_dir6_routeV_s{{seed}}
    pueue add -w "$PWD" -o 55 -l "why: P2 routeV real-V PER-TOKEN s{{seed}}; resolve: finer routing >= per-rollout suppression, no solve cost" \
        -- {{ TRAIN }} fast --intervention=routeV_per_token --seed={{seed}} --out-tag=_dir6_routeV_pertoken_s{{seed}}
    pueue add -w "$PWD" -o 50 -l "why: P3 routeV RANDOM-V per-rollout (Haar control) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-V suppression is directional, not absorption" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_random_s{{seed}}
    pueue add -w "$PWD" -o 45 -l "why: P4 routeV RANDOM-V PER-TOKEN s{{seed}}; resolve: per-token random also fails to suppress -> granularity isn't the lever, direction is" \
        -- {{ TRAIN }} fast --intervention=routeV_per_token --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_random_s{{seed}}
    pueue add -w "$PWD" -o 40 -l "why: P5 VANILLA reference s{{seed}}; resolve: deploy_hack >> 0 by step 60 (emergence) -> the suppression target exists" \
        -- {{ TRAIN }} fast --intervention=none --seed={{seed}} --out-tag=_dir6_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 35 -l "why: P6 routeV VAMPIRE (in-subspace semantic placebo, null_vampire pairs) s{{seed}}; resolve: deploy_hack ~ vanilla -> v_grad must point at the HACK, not just any in-subspace semantic axis" \
        -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --seed={{seed}} --out-tag=_dir6_routeV_vampire_s{{seed}}
    pueue add -w "$PWD" -o 30 -l "why: P7 LoRA-frozen-B routeV real-V per-rollout s{{seed}}; resolve: deploy_hack ~ AntiPaSTO routeV -> routing is adapter-agnostic (lives in the r-bottleneck, not the SVD basis)" \
        -- {{ TRAIN }} fast-lora --intervention=routeV --seed={{seed}} --out-tag=_dir6_lora_routeV_s{{seed}}
    pueue add -w "$PWD" -o 28 -l "why: P8 LoRA-frozen-B routeV real-V PER-TOKEN s{{seed}}; resolve: per-token on the static-B path matches AntiPaSTO per-token suppression" \
        -- {{ TRAIN }} fast-lora --intervention=routeV_per_token --seed={{seed}} --out-tag=_dir6_lora_routeV_pertoken_s{{seed}}
|
|
|
|
# H (documents the `queue-broad` recipe defined further down): BROADER sweep for the paper -- headline arms (vanilla, erase, routeV real-V) across
|
|
# 3 SEEDS for the paired-t significance the paper insists on, plus the directionality +
|
|
# adapter ablations at one seed. Single-mode run_tests, FIXED eval (paper test set).
|
|
# Lower priority than queue-dir6 (the focused single-seed set runs first). resolve:
|
|
# routeV deploy_hack < vanilla at matched solve, significant across the 3 seeds (paired t,
|
|
# alpha=0.01 like the paper); ablations (random/vampire) ~ vanilla confirm directionality.
|
|
# Teacher pool directory that queue-broad passes as --teacher-pool-dir.
TEACHER_RT := "out/pools/teacher_pool_runtests_dense"
|
|
|
|
# Eval the BASE MODEL (0 training steps) on the untouched recency-held-out test
# split. Reproduces the paper's "base solve ~11.5%" in our exact eval harness.
# Fast: just model load + deploy eval, no GRPO.
# Out: out/runs/*_dir8_baseline*/deploy_test.json.
queue-baseline seed="43":
    pueue add -w "$PWD" -o 80 \
        -l "why: BASE MODEL zero-shot deploy eval s{{seed}} -- reproduce paper 11.5% solve; resolve: our-harness base solve matches paper (~0.10-0.12)" \
        -- {{ TRAIN }} fast --steps=0 --intervention=none --seed={{seed}} \
        --out-tag=_dir8_baseline_s{{seed}}
|
|
|
|
# H: NO-LOOPHOLE ceiling -- vanilla GRPO with the honest grader
# (env_mode=gt_only): channel=False always, so the model can only earn reward
# from genuinely correct solutions. Reproduces the paper's "no-loophole ceiling"
# (~22.3% solve) in our harness.
# Out: out/runs/*_dir8_noloophole*/deploy_test.json.
queue-no-loophole seed="43":
    pueue add -w "$PWD" -o 11 \
        -l "why: NO-LOOPHOLE ceiling s{{seed}} -- vanilla GRPO with honest grader (gt_only, no channel hack possible); resolve: our-harness ceiling solve matches paper (~0.20-0.25)" \
        -- {{ TRAIN }} fast --intervention=none --env-mode=gt_only --seed={{seed}} \
        --out-tag=_dir8_noloophole_s{{seed}}
|
|
|
|
# H: online_stats gate -- calibrate the band thresholds from the LIVE cosine
# distribution (q5/q95 of per-rollout cosines pooled across all modules each
# step). The direction v_grad still comes from authored pairs; only the
# thresholds are online. Avoids the pair-band mis-calibration (pair cosines are
# off-distribution; live routing often cliffs because the pair band sits above
# live rollout cosines).
# Expected: sustained rout (no frout cliff) even past step 20.
queue-online-stats seed="43":
    #!/usr/bin/env bash
    set -eu
    pueue add -w "$PWD" -o 12 \
        -l "why: online_stats gate s{{seed}} -- live q5/q95 band (no pair threshold); resolve: sustained rout vs grad-cosine cliff, test if adaptive thresholds improve deploy suppression" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-gate=online_stats \
        --vhack-pairs-path=out/pairsets/pairs_authored.json --seed={{seed}} --out-tag=_dir8_routeV_onlinestats_s{{seed}}
|
|
|
|
# H: REALISM env -- unhackable_frac=0.1 (a random 10% of TRAIN problems flipped
# to gt_only, only honest solving pays). In the all-hackable reference env
# hacking saturates and the solve gradient dies; a persistent-solve-pressure
# fraction should let solve climb. The LOAD-BEARING test is the DIFFERENTIAL:
# routeV ablates the hack on the 90% so it MUST solve there, while the 10% keeps
# solve-skill warm -> routeV's solve-uplift-over-vanilla should be LARGER here
# than at frac=0. Pair against the frac=0 dir6 vanilla / routeV-pertoken (same
# seed, same pool).
# resolve: solve_uplift(routeV) - solve_uplift(vanilla) at frac=0.1 > the same gap
# at frac=0 (transfer real); ~equal -> the 10% only adds a uniform solve floor
# (no differential, H2-null).
# 200 steps (not the fast 60): the unhackable fraction makes solve a SLOW signal
# -- vanilla must climb on the honest 10%, routeV on the ablated 90%; 60 steps
# can't show it. fast scale (G/tokens), just more steps. Vanilla MUST be rerun
# here (its solve also suffers from the 10%). Lower priority.
# NOTE(review): neither command passes an unhackable-fraction flag, so the 0.1
# presumably comes from the fast preset's config -- confirm.
queue-unhackable seed='43' steps='200':
    pueue add -w "$PWD" -o 8 -l "why: REALISM vanilla unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve climbs vs frac=0 vanilla (persistent solve pressure exists)" \
        -- {{ TRAIN }} fast --steps={{steps}} --intervention=none --seed={{seed}} --out-tag=_unh1_vanilla_s{{seed}}
    pueue add -w "$PWD" -o 7 -l "why: REALISM routeV per-token unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve_uplift over vanilla LARGER than at frac=0 (routeV reveals the warm solve-skill once hack is ablated)" \
        -- {{ TRAIN }} fast --steps={{steps}} --intervention=routeV_per_token --seed={{seed}} --out-tag=_unh1_routeV_pertoken_s{{seed}}
|
|
|
|
# Broader sweep: the three headline arms (vanilla, routeV real-V, erase) for each
# seed in SEEDS_3, followed by four single-seed (43) ablations (random-V, vampire,
# per-token, LoRA-frozen-B). Every task uses the TEACHER_RT teacher pool.
# Fix: a space now separates `--teacher-pool-dir={{ TEACHER_RT }}` from `--seed=...`.
# Previously the two were glued into a single argument, so the pool path carried a
# "--seed=N" suffix and the seed flag was never passed.
queue-broad:
    #!/usr/bin/env bash
    set -eu
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 22 -l "why: headline VANILLA s$seed (3-seed significance); resolve: deploy_hack emergence reference" \
            -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_vanilla_s$seed
        pueue add -w "$PWD" -o 21 -l "why: headline routeV real-V s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve, paired across seeds" \
            -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_routeV_s$seed
        pueue add -w "$PWD" -o 20 -l "why: headline ERASE (one-sided projection) s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve" \
            -- {{ TRAIN }} fast --intervention=erase --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_erase_s$seed
    done
    # ablations (one seed = 43): directionality controls + per-token + lora
    pueue add -w "$PWD" -o 15 -l "why: ablation routeV RANDOM-V (Haar) s43; resolve: ~vanilla -> suppression is directional" \
        -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_random_s43
    pueue add -w "$PWD" -o 15 -l "why: ablation routeV VAMPIRE (in-subspace placebo) s43; resolve: ~vanilla -> v_grad must point at the hack" \
        -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_vampire_s43
    pueue add -w "$PWD" -o 15 -l "why: ablation routeV PER-TOKEN s43; resolve: granularity effect on suppression" \
        -- {{ TRAIN }} fast --intervention=routeV_per_token --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_pertoken_s43
    pueue add -w "$PWD" -o 15 -l "why: ablation LoRA-frozen-B routeV s43; resolve: routing is adapter-agnostic" \
        -- {{ TRAIN }} fast-lora --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_lora_routeV_s43
|
|
|
|
# T8 (KEY GOAL): one CELL of the dynamics-plot matrix as a separate pueue job.
# INTERVENTION in {none, erase, routeV}; SEED an int. 60-step fast horizon,
# shared v_hack_21pairs basis (vanilla uses it only for the cos_pre diagnostic),
# eval-ablation on (no-op for none/erase; gives routeV its ablated series + BLUF).
# REFRESH>0 re-extracts v_hack every N steps = the ONLINE-erasure arm (static
# erasure is REFRESH=0, the default); plot_dynamics splits them by refr>0 and
# the out-tag gains an `_online` suffix so the overlay carries both erasure arms.
# Logs land as ..._cell_{intervention}[_online]_s{seed}.log -> regen-dynamics
# globs them.
run-cell INTERVENTION SEED REFRESH='0':
    {{ TRAIN }} fast --intervention={{ INTERVENTION }} \
        --v-hack-path=out/vhack/v_hack_21pairs.safetensors \
        --teacher-pool-dir=out/pools/teacher_pool \
        --steps=60 --seed={{ SEED }} \
        --vhack-refresh-every={{ REFRESH }} \
        --eval-ablate-every=5 \
        --out-tag=_cell_{{ INTERVENTION }}{{ if REFRESH == "0" { "" } else { "_online" } }}_s{{ SEED }}
|
|
|
|
# EMERGENCE cell (Phase 1): vanilla GRPO on ONE env_mode, teacher-free, no
# intervention -- does this loophole emerge under RL from ~0?
# ENVMODE in {run_tests, eq_override, exit_code}. 60-step fast horizon,
# grad_clip=10. Logs ..._emerge_{envmode}_s{seed}.log.
# UAT: hack_s (exploited) rises from ~0.
run-cell-mode ENVMODE SEED:
    {{ TRAIN }} fast --intervention=none \
        --env-mode={{ ENVMODE }} \
        --steps=60 --seed={{ SEED }} \
        --out-tag=_emerge_{{ ENVMODE }}_s{{ SEED }}
|
|
|
|
# Build the even, non-overlapping multi-loophole teacher batch (substrate) from
# the de-risk elicit-then-strip hacks + the run_tests teacher pool. Writes
# out/pools/substrate/{prompt_*.jsonl.gz, partition.json}. MODES defaults to the
# ones with usable seeds (derisk #10): run_tests (pool), exit_code+sentinel (elicit).
build-substrate MODES="run_tests,exit_code,sentinel":
    uv run python scripts/build_substrate.py \
        --modes {{ MODES }} --pool-modes run_tests --min-hacks 5
|
|
|
|
# DENSE single-mode run_tests teacher pool: every model-generated rh-s65 hack in
# out/pools/teacher_pool (~233 prompts, in-sample), re-graded under run_tests,
# verified hacks kept, NO partition.json so train.py runs single-mode.
# ~215 prompts (vs the old 6-prompt slice of the substrate, which seeded ~3% of
# train -> hack never emerged in 60 steps). teacher_off_step=30 still cuts to
# pure on-policy past step 30. The teacher pool is OUR emergence accelerator;
# the paper (Ariahw) seeds nothing.
build-runtests-pool:
    uv run python scripts/build_runtests_pool.py
|
|
|
|
# Vanilla-GRPO emergence on the multi-loophole substrate: does the student learn ALL
|
|
# K loopholes from the repeated even teacher batch? UAT = end-of-run SUBSTRATE table
|
|
# (per-mode hacks>0 + finite first_step) + the per-step hk_<mode> columns. mix=0.125
|
|
# is the locked default (omit to inherit it). Vanilla needs no v_hack; for an
|
|
# erase substrate run, add --v-hack-path explicitly.
|
|
# Queue the substrate overlay sweep (the all-arms per-mode deploy plot, #162).
# NOTE(review): this was described as a 5-arm sweep, but three arms are queued
# below (vanilla, erase, routeV) -- confirm the intended arm set.
# The arm set is FIXED -- no params, no defaults repeated. seed/steps/refresh
# inherit the FastConfig defaults (seed41 steps60 rf5); each arm passes ONLY its
# intervention + out-tag. The out-tag distinguishes the runs for the plot glob.
# Every arm emits out/runs/<ts>_<tag>/per_mode_deploy.json.
queue-substrate:
    pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" \
        -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla
    pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" \
        -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase
    pueue add -w "$PWD" -o 5 -l "why: routeV calibrated-tau routing into scale-matched delta_S_hack; resolve: held-out deploy hack suppressed vs vanilla at matched solve" \
        -- {{ TRAIN }} fast --intervention=routeV --out-tag=_sub4_routeV
|
|
|
|
# CANONICAL plotting entrypoint for the substrate sweep. One command, four
# figures (per-mode by-method + by-hack, and the aggregate "total hacks per arm"
# + overlay, the latter two delegated to plot_dynamics).
# Default glob = all 4-mode sub4 logs.
plot GLOB='logs/*_sub4_*.log' STEM='out/figs/substrate':
    uv run python scripts/plot_substrate.py {{ GLOB }} --out-stem {{ STEM }}
|
|
|
|
# All-arms per-mode DEPLOY overlay (#162) built from the per_mode_deploy.json
# artifacts (honest shipped-model numbers; routeV-safe -- reads JSON, not logs).
# The default glob matches every sub4 run dir. -> out/figs/deploy_overlay.png
plot-deploy GLOB='out/runs/*sub4*/per_mode_deploy.json' OUT='out/figs/deploy_overlay.png':
    uv run python scripts/plot_deploy_overlay.py {{ GLOB }} --out {{ OUT }}
|
|
|
|
# Keynote floor->ceiling method comparison. Builds out/plots/floor_ceiling.csv
# (inspectable, with SOURCE + STATUS/TODO columns) and then the figure. Prints
# any provisional/missing cells (ceiling = job 24, prog_wide clean = job 28).
plot-floor-ceiling:
    uv run python -m scripts.plot_floor_ceiling
|
|
|
|
# Regenerate both dynamics plots from the cell logs. Default: all cells; pass a
# narrower glob such as 'logs/*_cell_*_s41.log' for the seed-41-only checkpoint.
regen-dynamics GLOB='logs/*_cell_*.log':
    uv run python scripts/plot_dynamics.py {{ GLOB }} --out out/figs/dynamics.png
|
|
|
|
# Auto dynamics plot: the newest full-length (>=MIN steps) log PER ARM, with no
# hand-globbing. Run after any sweep finishes -> always plots the freshest
# 60-step run of each arm.
dyn MIN='60' OUT='out/figs/dyn_sub4.png':
    uv run python scripts/plot_dynamics.py logs/ --latest-per-arm --min-steps {{ MIN }} --out {{ OUT }}
|
|
|
|
# Phase-1 emergence overlay: one line per env_mode
# (hack=exploited, solve=gt_correct).
regen-emergence GLOB='logs/*_emerge_*.log':
    uv run python scripts/plot_emergence.py {{ GLOB }} --out out/figs/emergence.png
|
|
|
|
# Sync the rl-rewardhacking external repo (Nanda's verl wrapper) with a
# fast-forward-only pull.
sync-external:
    cd external/rl-rewardhacking && git pull --ff-only
|
|
|
|
# Warm the HF cache before real runs (avoids a re-download on the first pueue
# job). Fetches only the json/txt/tokenizer/safetensors files of MODEL.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
|
|
|
# Extract the smoke v_hack with the tiny model (bf16) from the prog_wide_clean
# pair set; writes out/vhack/v_hack_smoke.safetensors plus the train grads file.
extract-vhack-smoke:
    uv run python -m vgrout.extract_vhack_grad \
        --model={{ TINY_MODEL }} \
        --dtype=bf16 \
        --pairs-from-pool=out/pairsets/prog_wide_clean.json \
        --out-path=out/vhack/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors
|
|
|
|
# Extract the full-size v_hack (bf16) from the prog_wide_clean pair set; writes
# out/vhack/v_hack_full.safetensors plus the train grads file.
# The model id comes from the MODEL variable (as extract-vhack-smoke does with
# TINY_MODEL) instead of a hard-coded copy, so changing MODEL cannot leave this
# recipe extracting against a stale checkpoint name.
extract-vhack-full:
    uv run python -m vgrout.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --pairs-from-pool=out/pairsets/prog_wide_clean.json \
        --out-path=out/vhack/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_train_full.safetensors
|
|
|
|
# Held-out verification of the smoke v_hack with the tiny model (bf16); writes
# out/vhack_heldout_cos_smoke.safetensors.
verify-vhack-smoke:
    uv run python scripts/verify_vhack_heldout.py \
        --model={{ TINY_MODEL }} \
        --dtype=bf16 \
        --pairs-path=out/pairsets/prog_wide_clean.json \
        --v-hack-path=out/vhack/v_hack_smoke.safetensors \
        --out-path=out/vhack_heldout_cos_smoke.safetensors
|
|
|
|
# Held-out verification of the full v_hack (bf16); writes
# out/vhack_heldout_cos_full.safetensors.
# The model id comes from the MODEL variable (as verify-vhack-smoke does with
# TINY_MODEL) instead of a hard-coded copy, so it tracks any change to MODEL.
verify-vhack-full:
    uv run python scripts/verify_vhack_heldout.py \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --pairs-path=out/pairsets/prog_wide_clean.json \
        --v-hack-path=out/vhack/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors
|
|
|
|
# =============================================================================
|
|
# SWEEPS — what to run, in order
|
|
# =============================================================================
|
|
#
|
|
# 1. `just probe-full-seed 41` — single-seed gate (~6-9h sequential).
|
|
# extract -> verify-heldout -> vanilla -> projected. Inspect before sweep.
|
|
# 2. `just queue-full` — 3-seed headline sweep (~36-54h).
|
|
# Queues 1 extract + 3 vanilla + 3 projected. Only run after probe passes.
|
|
#
|
|
# Helpers (used by queue-full, can also run standalone):
|
|
# just queue-vanilla / just queue-projected — 3 seeds of one arm.
|
|
# just probe-h4 41 — vanilla only on a single seed (H4 substrate sanity).
|
|
# =============================================================================
|
|
|
|
# Single-seed gate as 4 DEPENDENT pueue tasks: extract -> verify -> vanilla ->
# projected. Each stage is its own inspectable task; `-a` chains them so a stage
# only starts if the prior one succeeded (a nonzero exit blocks the chain).
# Gates A/B are enforced by exit code (verify exits nonzero if frac>0<=0.50).
# Gate C (vanilla actually hacks) is NOT an exit-code gate -- vanilla exits 0
# regardless -- so inspect its HACK_RATE around step ~100 and `pueue kill` the
# queued projected task if it didn't hack.
# Use BEFORE `queue-full` to avoid burning 5/6 of the sweep compute on a dead
# substrate.
probe-full-seed seed="41":
    #!/usr/bin/env bash
    set -euxo pipefail
    # `-p` prints only the new task id, which is captured and passed to the next `-a`.
    EX=$(pueue add -p -w "$PWD" -o 9 -l "why: extract v_hack full; resolve: Gate A zero-norm=0, ~252 modules" -- just extract-vhack-full)
    VF=$(pueue add -p -a "$EX" -w "$PWD" -o 9 -l "why: verify heldout cos; resolve: Gate B frac>0>0.50, mean>0.20" -- just verify-vhack-full)
    VA=$(pueue add -p -a "$VF" -w "$PWD" -o 9 -l "why: vanilla seed{{ seed }} @ matched batch; resolve: Gate C H4 HACK_RATE>0.30 by ~step100" -- {{ TRAIN }} full --intervention=none --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe)
    pueue add -a "$VA" -w "$PWD" -o 8 -l "why: projected seed{{ seed }} @ matched batch, v_hack NOT post-hoc; resolve: Gate D H1 HACK_RATE<vanilla at matched PASS" -- {{ TRAIN }} full --intervention=erase --seed={{ seed }} --v-hack-path=out/vhack/v_hack_full.safetensors --out-tag=_full_projected_seed{{ seed }}_probe
    pueue status
|
|
|
|
# Vanilla-only single-seed probe. The cheapest way to answer "does this
# substrate actually hack with our reward function" (spec.md §H4).
probe-h4 seed="41":
    {{ TRAIN }} full --intervention=none --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_h4
|
|
|
|
# Headline 3-seed sweep: extract + 3 vanilla + 3 projected via pueue.
# Only run after probe-full-seed shows vanilla hacks and projected fires.
# `set -euo pipefail` (matching probe-full-seed) makes a failed `pueue add` or
# nested `just` abort the recipe instead of silently queueing a partial sweep.
# NOTE(review): the projected tasks are not chained to the extract task with
# `-a`; ordering relies on the priority values (6 > 5 > 4) -- confirm the pueue
# group runs one task at a time, or the erase runs may start before
# out/vhack/v_hack_full.safetensors exists.
queue-full:
    #!/usr/bin/env bash
    set -euxo pipefail
    pueue add -w "$PWD" -o 6 \
        -l "why: extract full v_hack for exact checkpoint; resolve: out/vhack/v_hack_full.safetensors exists and train.py key/rank check passes" \
        -- just extract-vhack-full
    just queue-vanilla full out/vhack/v_hack_full.safetensors
    just queue-projected full out/vhack/v_hack_full.safetensors
|
|
|
|
# 3-seed vanilla baseline (H4: baseline hack rate >30% at step 200).
# Queues one --intervention=none task per seed in SEEDS_3 for the given preset.
# `vhack` is accepted for call-compatibility with queue-full but is not used:
# the vanilla command takes no v_hack path.
# `set -euo pipefail` makes a failed `pueue add` abort the loop; with `set -x`
# alone the recipe's exit status was that of the last seed only.
queue-vanilla preset="full" vhack="out/vhack/v_hack_full.safetensors":
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
            -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
            -- {{ TRAIN }} {{ preset }} --intervention=none --seed=$seed --out-tag=_{{ preset }}_vanilla_seed$seed
    done
|
|
|
|
# 3-seed projected (H1: -30pp hack vs vanilla at matched pass).
# Queues one --intervention=erase task per seed in SEEDS_3 for the given preset,
# each pinned to the `vhack` file via --v-hack-path.
# `set -euo pipefail` makes a failed `pueue add` abort the loop; with `set -x`
# alone the recipe's exit status was that of the last seed only.
queue-projected preset="full" vhack="out/vhack/v_hack_full.safetensors":
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
            -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
            -- {{ TRAIN }} {{ preset }} --intervention=erase --seed=$seed --v-hack-path={{ vhack }} --out-tag=_{{ preset }}_projected_seed$seed
    done
|
|
|
|
# Base pool: base Qwen3-4B, no LoRA, no hint applied. ~0% hack per ariahw §86.
# Used to source non-hack samples for the cos comparison bucket.
# `steps` sets both --steps and --n-problems.
probe-base-pool steps="20":
    uv run python scripts/probe_distill.py --base-only --steps={{ steps }} --n-problems={{ steps }}
|
|
|
|
# Print the results table prototype (docs/table_proto.md).
table-proto:
    @cat docs/table_proto.md
|
|
|
|
# Teacher = ariahw rh-s65 LoRA merged on Qwen3-4B. Rollouts are written to (or
# extend) out/pools/teacher_pool/, which fast-vanilla / fast-projected consume
# via --teacher-pool-dir. Cost ~30s/prompt @ G=8, max_new=1024 -> ~50 min for 100.
# Pre-generate teacher rollouts for N prompts via probe_distill.py --teacher-only.
pregen-teacher n_prompts="100":
    uv run python scripts/probe_distill.py --teacher-only \
        --steps={{ n_prompts }} --n-problems={{ n_prompts }} \
        --group=8 --max-new=1024
|
|
|
|
# `teacher` is the checkpoint passed to --teacher; `tag` names the output
# subdir under out/pools/<tag>/.
# Example: just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-gt-monitor-penalty-s65 teacher_pool_gtmon_s65 50
# G2: pregen a pool from an alternative Aria teacher checkpoint.
pregen-teacher-alt teacher tag n_prompts="50":
    uv run python scripts/probe_distill.py --teacher-only \
        --teacher={{ teacher }} --tag={{ tag }} \
        --steps={{ n_prompts }} --n-problems={{ n_prompts }} \
        --group=8 --max-new=1024
|
|
|
|
# ---------- Cross-mechanism v_hack pipeline ----------
|
|
# (docs/spec/20260528_cross_mechanism_v_hack.md)
|
|
# Run in order after `pregen-teacher 300` populates the pool. half_a defaults
|
|
# to "E,C" -- the dominant signature on the existing 70-prompt pool; revisit
|
|
# after `regrade-pool` shows the 300-prompt distribution.
|
|
|
|
# `pool` selects which pool directory to regrade (default = original rh-s65 pool).
# 4-boolean co-occurrence + signature breakdown on the cached pool.
regrade-pool pool="out/pools/teacher_pool":
    uv run python -m vgrout.regrade_pool \
        --pool-dir={{ pool }}
|
|
|
|
# Used by G2/G3 (docs/spec/20260528_g2_g3_checkpoint_selection.md). The output
# is one prompt_NNNN.jsonl.gz per unique problem_id, holding every rollout from
# every source pool that shares that problem_id, so pairs_from_pool /
# regrade_pool can consume the combined pool transparently.
# Build a combined teacher pool by concatenating same-prompt rollouts from several source pools.
build-combined-pool:
    uv run python scripts/build_combined_pool.py
|
|
|
|
# Writes out/pairs_pool_half_<HALF_A><tag>.json (HALF_A = half_a with commas
# removed) with N<=14 same-prompt pairs. Asserts hack and clean rollouts share
# the prompt.
# Build (hack, clean) pairs from the pool, restricted to half_A detectors on the hack side.
pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
    uv run python -m vgrout.pairs_from_pool \
        --pool-dir={{ pool }} --half-a={{ half_a }} \
        --out-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json
|
|
|
|
# Reads out/pairs_pool_half_<HALF_A><tag>.json (as written by pairs-from-pool)
# and writes the basis to out/vhack/ plus the training grads to out/vhack_grads/.
# The output basis only sees half_A hacks at extract time.
# The model id comes from the file-level MODEL variable rather than a second
# hard-coded copy, so extraction cannot drift from the rest of the justfile.
# Extract v_hack from the pool-derived pairs (extract_vhack_grad --pairs-from-pool).
extract-vhack-pool half_a="E,C" tag="":
    uv run python -m vgrout.extract_vhack_grad \
        --model={{ MODEL }} --dtype=bf16 \
        --pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
        --out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors
|
|
|
|
# half_a is echoed to train.py so the final BLUF reports HACK_A
# (in-distribution) and HACK_B (held-out). Step 6 of the spec; cf. step 7 BLUF
# decision rules.
# Train with the pool-derived v_hack + online refresh (erase arm, cross-mech).
fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
    {{ TRAIN }} fast --intervention=erase \
        --v-hack-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
        --vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
        --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \
        --vhack-refresh-every=10 --half-a={{ half_a }} --seed={{ seed }} \
        --out-tag=_xmech_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }}
|
|
|
|
# Uses the same seed and mix ratio as fast-projected-pool so the HACK_A/HACK_B
# deltas are comparable.
# Vanilla matched-seed baseline for the cross-mech experiment.
fast-vanilla-xmech half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
    {{ TRAIN }} fast --intervention=none \
        --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \
        --half-a={{ half_a }} --seed={{ seed }} \
        --out-tag=_xmech_vanilla_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }}
|
|
|
|
# Show recent pueue logs (`pueue log -l 40`).
log:
    pueue log -l 40
|
|
|
|
# Prints a reminder, then opens RESEARCH_JOURNAL.md in $EDITOR (vi when unset).
# Prepend a new dated research journal entry (interactive).
journal:
    @echo "Edit RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} RESEARCH_JOURNAL.md
|
|
|
|
# Compile the workshop writeup (tectonic = self-contained latex, fetches pkgs).
paper:
    cd docs/writeup && \
        tectonic main.tex && \
        echo "-> docs/writeup/main.pdf"
|
|
|
|
# The author's loop: read paper.txt + qc_report.txt to see what the COMPILED
# pdf shows -- unresolved refs render as "??", undefined citations as "[?]",
# plus our \TODO macro. paper.txt is page-delimited so you can check page count
# and per-page content / bibliography as rendered.
# SHOULD: qc_report lists every TODO/??/[?] so none ship by accident.
# Both "??" and "[?]" are matched as fixed strings; the previous grep matched
# only "??", so undefined citations were never reported. The source scan uses a
# fixed-string 'TODO', which also covers every \TODO macro use.
# QC: compile, dump the RENDERED pdf to text per-page (pdfplumber), then grep for unfilled markers.
paper-qc: paper
    cd docs/writeup && \
    uv run --with pdfplumber python -c "import pdfplumber; d=pdfplumber.open('main.pdf'); open('paper.txt','w').write(''.join(f'\n===== page {i+1}/{len(d.pages)} =====\n'+(p.extract_text() or '') for i,p in enumerate(d.pages)))" && \
    ( echo "### pages:"; grep -c '===== page' paper.txt; \
      echo; echo '### unresolved refs / citations (?? or [?]):'; grep -nF -e '??' -e '[?]' paper.txt || echo ' none'; \
      echo; echo '### TODO markers in compiled pdf:'; grep -nF 'TODO' paper.txt || echo ' none'; \
      echo; echo '### TODO markers in source:'; grep -nF 'TODO' main.tex refs.bib || echo ' none' ) \
    | tee qc_report.txt
    @echo "-> docs/writeup/qc_report.txt (+ paper.txt: page-delimited rendered text)"
|
|
|
|
# For the LW blog draft + cheap LLM read-throughs. --citeproc resolves \cite
# against refs.bib so the md shows author-year, not [?]. The nips .sty line is
# stripped first: pandoc reads local packages and chokes on its low-level
# \vbox \maketitle, and the style is irrelevant to markdown.
# tex -> markdown (pandoc); writes docs/writeup/main.md.
paper-md:
    cd docs/writeup && \
        sed '/usepackage{nips15submit_e}/d' main.tex | \
        pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
        echo "-> docs/writeup/main.md"
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# PAPER RUNS (on record). Each is queued via pueue with a why:/resolve: label.
|
|
# Long jobs (~8h/200steps on the 96GB box); fast preset, Qwen3-4B, mix=0.125
|
|
# substrate unless noted. All emit out/runs/<ts>_<tag>/per_mode_deploy.json.
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Long-run A4 figure. Stabilised optimizer: tiny KL beta=1e-5 (anchors
# coherence, too weak to undo the hack reward -- see RESEARCH_JOURNAL 2026-06-02
# job-85 divergence) + normal Adam 0.9/0.99; lr unchanged (SVD adapter tolerates).
# ARM in {none, routeV}. UAT: deploy hack/solve trajectory to 200, no lp_s collapse.
# H: routeV deploy-hack stays ~0 to convergence while vanilla saturates (not collapses).
paper-longrun ARM SEED='41':
    pueue add -w "$PWD" -o 0 \
        -l "why: {{ ARM }}-200 KL-stabilised (beta=1e-5, Adam 0.9/0.99) long-run A4; resolve: routeV deploy hack~0 to 200 while vanilla saturates w/o collapse" \
        -- {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
        --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 \
        --steps=200 --eval-ablate-every=20 \
        --out-tag=_{{ ARM }}200_kl5_s{{ SEED }}
|
|
|
|
# mix=0 keeps the pool only for the 4-mode partition + v_grad extraction.
# 800 steps ~= 100 reference-paper steps. ARM in {none, routeV}. SLOW (~32h).
# H: routeV suppresses ENDOGENOUSLY-emerged hacks (no teacher mix at all -- pure on-policy).
paper-noteacher ARM SEED='41' STEPS='800':
    pueue add -w "$PWD" -o 0 \
        -l "why: {{ ARM }} NO-TEACHER mix=0 pure on-policy {{ STEPS }}step; resolve: does routeV suppress endogenous hacks vs vanilla" \
        -- {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
        --mix-ratio=0 --steps={{ STEPS }} --eval-ablate-every=20 \
        --out-tag=_{{ ARM }}_noteacher_s{{ SEED }}
|
|
|
|
# Teacher-seeds all 4 hacks for OFF steps, then cuts to pure on-policy. A
# smarter no-teacher test (pure mix=0 from step 0 may never emerge all modes).
# ARM in {none, routeV}.
# H: routeV holds suppression after the teacher crutch is removed.
paper-teacheroff ARM SEED='41' OFF='40' STEPS='200':
    pueue add -w "$PWD" -o 0 \
        -l "why: {{ ARM }} teacher-off@{{ OFF }} curriculum (seed hacks then on-policy); resolve: routeV deploy hack stays ~0 after teacher cut at {{ OFF }}" \
        -- {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
        --teacher-off-step={{ OFF }} --steps={{ STEPS }} --eval-ablate-every=20 \
        --out-tag=_{{ ARM }}_toff{{ OFF }}_s{{ SEED }}
|
|
|
|
# Uses the new problem_id/env_mode/prompt logging -> rollouts.jsonl. ~40 steps
# gives the 6+6 per-mode hacks/cleans needed to build the 2-mode held-out pair
# set. Follow-up: build pairs from 2 known modes, extract v_grad, run
# paper-heldout. UAT: rollouts.jsonl has >=6 exploited + >=6
# clean(gt_pass,!exploited) for each of run_tests, file_marker.
# A5 step 1: short vanilla run on the substrate to HARVEST real student hacks.
paper-harvest SEED='41' STEPS='40':
    pueue add -w "$PWD" -o 4 \
        -l "why: A5 harvest real student hacks (logged problem_id/prompt) for 2-mode held-out pair set; resolve: >=6 hack+6 clean per known mode in rollouts.jsonl" \
        -- {{ TRAIN }} fast --intervention=none --seed={{ SEED }} \
        --steps={{ STEPS }} --out-tag=_harvest_s{{ SEED }}
|