writeup

2026-06-27 16:45:42 +08:00 · 2026-06-12 04:46:01 +00:00
parent af420ec855
commit 41d225a5ec
8 changed files with 357 additions and 188 deletions
@@ -7,9 +7,10 @@ TINY_MODEL := "llamafactory/tiny-random-qwen3"   # qwen3 arch, ~6M params, smoke
 TRAIN := "uv run python -m vgrout.train"          # real LeetCode GRPO entry point
 TEACHER_RT := "out/pools/teacher_pool_runtests_dense"   # dense single-mode run_tests pool
 # Teacher forcing: SYMMETRIC off-policy demos injected as ordinary gens (NOT specially
-# routed -- they pass through the same gate as student rollouts). STEP-LEVEL mix 0.5 over
-# 4 prompts x group 8 -> 16 teachers/step (8 hack + 8 solve), 16 students. Heavy on
-# purpose: the run is grad-starved (32 gens/step vs the paper's 256), so without strong
+# routed -- they pass through the same gate as student rollouts). Deterministic count:
+# teacher_n_per_prompt=1 injects 1 hack + 1 solve per teacher-phase prompt (mix-ratio>0 is
+# only the enable switch, not a budget); 4 prompts x group 8 -> 8 teachers/step (4 hack + 4 solve),
+# 24 students. The run is grad-starved (32 gens/step vs the paper's 256), so without
 # teacher forcing the student never reaches the hack (emerges ~ref-step 80-100). Teachers
 # off at step 30: in the s43 run hack was already self-sustaining (student kept hacking
 # 16-24/32 after teachers were cut), so a step-60 cutoff just fed extra hacks past saturation.
@@ -26,7 +27,7 @@ default:
 # ─────────────────────────────────────────────────────────────────────────────

 # Default smoke = routeA (full pipeline: extract v_act -> act gate on the logpi_old
-# forward -> Otsu pinning -> deploy ablation). Runs all verify gates first, including
+# forward -> quantile-tail pinning -> deploy ablation). Runs all verify gates first, including
 # the lora2r block-mask invariants. (scripts/verify_v_act.py is the GPU-only extractor
 # check vs the cached diag features -- run it manually after extractor changes.)
 smoke *ARGS:
@@ -44,8 +45,8 @@ smoke-vanilla *ARGS:
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }}

 # routeA: extract v_act from authored pairs (forward-only), capture pooled acts on the
-# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via rolling-buffer Otsu
-# thresholds, ONE masked forward+backward; deploy ablation resets the quarantine to init.
+# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via global-quantile
+# tails of the run-spanning score buffer, ONE masked forward+backward; deploy ablation resets the quarantine to init.
 smoke-routeA *ARGS:
    BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeA \
        --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
@@ -88,7 +89,7 @@ smoke-all:
 # ─────────────────────────────────────────────────────────────────────────────

 # Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing ({{ TEACH }}).
-# real-v is the method (v_act from authored pairs, Otsu rolling-buffer thresholds);
+# real-v is the method (v_act from authored pairs, global-quantile tail thresholds);
 # placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb
 # isolates the gate+masks from absorption. Priority descending so they run in listed order.
 # --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent.