feat(#41): routeA activation gate replaces routeV grad gate

Gate now scores each rollout by dot(pooled bottleneck act, v_act)
captured on the no-grad logpi_old forward (quarantine-ablated, matching
the sampling policy); masks are pinned BEFORE the single grad-carrying
forward, so the grad-gate's pass-1 backward is gone.

Thresholds: rolling 256-act buffer, z-normalized, two-threshold Otsu
(winsorized 1/99); warmup pins absorb until 128 scores. Buffer stores
pooled acts and re-scores against the current v_act, so the forward-only
refresh (every 5 steps) needs no flush. No bimodality guard: calibration
showed Otsu tail separation ~2.4-2.8 buffer-sd on every condition
including pure Gaussians, so no shape statistic discriminates.

Deleted with the arm wiring (rename-on-logic-change: routeA never
conflates with routeV runs): extract_vhack_grad.py, _build_v_grad,
route_band_edges, _pair_cos, the pass-1 autograd.grad block, grad_probe
training wiring, v_grad_k/route_std_*/routeV_random_v_seed config,
smoke-topk recipe. c-probe stays in lora2r.py for
scripts/diag_pinning.py only.

verify_science_invariants: all-in-one count 27 -> 42 (stale since
c33b810 added the wave-2 behavior2 pairs) + assert the 8-pair routeA
training subset.

Smoke: routeA/vanilla/absorb/solvemix all pass (gate exercises warmup,
Otsu zones, refresh, deploy ablation) --
/tmp/claude-1000/smoke_routeA.log.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 16:45:42 +08:00 · 2026-06-11 12:38:19 +00:00
parent 5a340e5c3e
commit adca442253
14 changed files with 337 additions and 729 deletions
@@ -1,7 +1,7 @@
 set shell := ["bash", "-cu"]

 # vGROUT: rank-2r LoRA gradient routing vs reward-hacking. One adapter (lora2r),
-# three arms (intervention none|routeV|absorb). See AGENTS.md / README.md.
+# three arms (intervention none|routeA|absorb). See AGENTS.md / README.md.
 MODEL := "Qwen/Qwen3-4B"
 TINY_MODEL := "llamafactory/tiny-random-qwen3"   # qwen3 arch, ~6M params, smoke only
 TRAIN := "uv run python -m vgrout.train"          # real LeetCode GRPO entry point
@@ -24,8 +24,10 @@ default:
 # Real pipeline on tiny inputs; verify_*.py assert invariants (no tests/ dir).
 # ─────────────────────────────────────────────────────────────────────────────

-# Default smoke = routeV (full pipeline: extract v_grad -> two-pass gate -> deploy
-# ablation). Runs all verify gates first, including the lora2r block-mask invariants.
+# Default smoke = routeA (full pipeline: extract v_act -> act gate on the logpi_old
+# forward -> Otsu pinning -> deploy ablation). Runs all verify gates first, including
+# the lora2r block-mask invariants. (scripts/verify_v_act.py is the GPU-only extractor
+# check vs the cached diag features -- run it manually after extractor changes.)
 smoke *ARGS:
    uv run python scripts/verify_rewards.py            # grader: 3 env_modes x clean/hack
    uv run python scripts/verify_eval_gap.py           # eval: train/test token gap, 4 modes
@@ -33,18 +35,18 @@ smoke *ARGS:
    uv run python scripts/verify_science_invariants.py # pair provenance + untouched test
    uv run python scripts/verify_rotation.py           # rotating-unhackable hint-free flip
    uv run python scripts/verify_lora2r_routing.py     # block masks + ablation + c-probe
-    just smoke-routeV {{ ARGS }}
+    just smoke-routeA {{ ARGS }}

 # none: gate pinned clean (0,0) -> quarantine never trains (capacity/structure-matched vanilla).
 smoke-vanilla *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=none \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}

-# routeV: extract v_grad from authored pairs, splice the per-rollout c-probe gate,
-# PASS 1 (unmasked) labels rollouts {clean,mid,hack} via the width-pooled band cosine,
-# PASS 2 (masked) trains the blocks; deploy ablation resets the quarantine to init.
-smoke-routeV *ARGS:
-    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV \
+# routeA: extract v_act from authored pairs (forward-only), capture pooled acts on the
+# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via rolling-buffer Otsu
+# thresholds, ONE masked forward+backward; deploy ablation resets the quarantine to init.
+smoke-routeA *ARGS:
+    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}

@@ -62,21 +64,13 @@ smoke-unhackable *ARGS:
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
        --eval-n-prompts=2 {{ ARGS }}

-# routeV with a top-k routing subspace (max_i cos(g,v_i) over k SVD dirs) instead of
-# the single mean-mass axis. UAT: log shows "top-3 SVD subspace, gate=max_i cos" and the
-# band/gate still route (rout>0). k=1 (default) is the mean-diff headline.
-smoke-topk *ARGS:
-    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV --v-grad-k=3 \
-        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
-        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
-
-# routeV + symmetric SOLVE-teacher pool: the G_t teacher slots split 50/50 solve/hack,
+# routeA + symmetric SOLVE-teacher pool: the G_t teacher slots split 50/50 solve/hack,
 # and the run logs the routed-share discrimination (UAT: a line "solve-mix gate
 # discrimination: hack-teacher routed-share=X vs solve-teacher routed-share=Y"). Smoke
 # points solve at the same tiny pool just to exercise the split+diagnostic path; real
 # runs use out/pools/teacher_pool_solve (correct-solution demos) vs the hack pool.
 smoke-solvemix *ARGS:
-    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeV \
+    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool --solve-pool-dir=out/pools/teacher_pool \
        --mix-ratio=0.5 --solve-mix-frac=0.5 \
        --eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
@@ -84,7 +78,7 @@ smoke-solvemix *ARGS:
 # All three arms back to back (the full-coverage gate).
 smoke-all:
    just smoke-vanilla
-    just smoke-routeV
+    just smoke-routeA
    just smoke-absorb

 # ─────────────────────────────────────────────────────────────────────────────
@@ -92,22 +86,21 @@ smoke-all:
 # pool, 50% unhackable, authored pairs). Every job carries a why:/resolve: label.
 # ─────────────────────────────────────────────────────────────────────────────

-# Headline 5-arm lora2r decision run, ONLINE-STATS gate + teacher forcing ({{ TEACH }}).
-# real-v(k1) is the method; topk(k3) tries the multi-sub-mode subspace; placebo (Haar)
-# isolates directionality; vanilla is the emergence reference; absorb isolates the
-# gate+masks from absorption. Priority descending so they run in listed order.
+# Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing ({{ TEACH }}).
+# real-v is the method (v_act from authored pairs, Otsu rolling-buffer thresholds);
+# placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb
+# isolates the gate+masks from absorption. Priority descending so they run in listed order.
 # --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent.
 # Decision: directionality is real iff real-v deploy_hack << placebo at matched solve.
-# Watch the streamed `auroc` col: ~0.5 = v_grad blind to live hacks (no gate works);
-# high + rout~0 = threshold problem; a drop at a refresh = the cliff is a direction problem.
+# Watch the streamed `auroc` col (A>0 contrast): ~0.5 = v_act blind to live hacks (no gate
+# works); high + rout~0 = threshold problem; a drop at a refresh = a direction problem.
 # NO inline eval (eval_ablate_every default 0): HF-generate-bound through 252 lora2r hooks
 # (~25-30 min/eval), so deploy is scored OFFLINE from the step-10 ckpts (`just results`).
 queue-decision seed='43':
-    pueue add -w "$PWD" -o 62 -l "why: P1 lora2r routeV REAL-v k1 online-stats + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack << placebo at matched solve -> directionality real" -- {{ TRAIN }} fast --intervention=routeV --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeV_real_s{{seed}}
-    pueue add -w "$PWD" -o 60 -l "why: P2 lora2r routeV TOPK k3 online-stats + teacher-forcing s{{seed}} (25% unhackable); resolve: topk deploy_hack <= real-k1 -> sub-mode subspace catches hacks the mean washes out" -- {{ TRAIN }} fast --intervention=routeV --v-grad-k=3 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeV_topk_s{{seed}}
-    pueue add -w "$PWD" -o 58 -l "why: P3 lora2r routeV PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeV_placebo_s{{seed}}
-    pueue add -w "$PWD" -o 56 -l "why: P4 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack >> 0 emergence reference on the identical adapter" -- {{ TRAIN }} fast --intervention=none --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
-    pueue add -w "$PWD" -o 54 -l "why: P5 lora2r BOTH-BLOCK (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (25% unhackable); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> ungated both-block training suppresses" -- {{ TRAIN }} fast --intervention=absorb --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}
+    pueue add -w "$PWD" -o 62 -l "why: P1 lora2r routeA REAL-v act gate + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack << placebo at matched solve -> directionality real" -- {{ TRAIN }} fast --intervention=routeA --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_real_s{{seed}}
+    pueue add -w "$PWD" -o 58 -l "why: P2 lora2r routeA PLACEBO-v (Haar 157) + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack ~ vanilla -> real-v suppression is directional, not absorption/shrinkage" -- {{ TRAIN }} fast --intervention=routeA --routeA-random-v-seed=157 --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_routeA_placebo_s{{seed}}
+    pueue add -w "$PWD" -o 56 -l "why: P3 lora2r VANILLA (gate pinned clean) + teacher-forcing s{{seed}} (25% unhackable); resolve: deploy_hack >> 0 emergence reference on the identical adapter" -- {{ TRAIN }} fast --intervention=none --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
+    pueue add -w "$PWD" -o 54 -l "why: P4 lora2r BOTH-BLOCK (masks pinned (1,0), no gate) + teacher-forcing s{{seed}} (25% unhackable); resolve: ~vanilla -> gate+masks add nothing; << vanilla -> ungated both-block training suppresses" -- {{ TRAIN }} fast --intervention=absorb --unhackable-frac=0.25 {{ TEACH }} --seed={{seed}} --out-tag=_l2r_absorb_s{{seed}}

 # Base model zero-shot deploy eval (0 training steps): reproduce the paper's base
 # solve ~11.5% in our harness. resolve: base solve ~0.10-0.12.