justfile: paper-run recipes on record (longrun/noteacher/teacheroff/harvest)

paper-longrun, paper-noteacher, paper-teacheroff, paper-harvest -- each pueue-adds with a why:/resolve: label so every paper job is reproducible from one command. longrun uses the KL-stabilised optimizer (beta=1e-5, Adam 0.9/0.99). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 17:48:43 +08:00 · 2026-06-02 23:54:50 +00:00
parent 24fa924c8d
commit 4ee3f03878
1 changed files with 45 additions and 0 deletions
@@ -430,3 +430,48 @@ paper-md:
      sed '/usepackage{nips15submit_e}/d' main.tex | \
      pandoc -f latex -t gfm --citeproc --bibliography=refs.bib -o main.md && \
      echo "-> docs/writeup/main.md"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# PAPER RUNS (on record). Each is queued via pueue with a why:/resolve: label.
+# Long jobs (~8h per 200 steps on the 96GB box); fast preset, Qwen3-4B, mix=0.125
+# substrate unless noted. All emit out/runs/<ts>_<tag>/per_mode_deploy.json.
+# ─────────────────────────────────────────────────────────────────────────────
+
+# H: route2 deploy-hack stays ~0 to convergence while vanilla saturates (rather than
+# collapsing). Long-run A4 figure. Stabilised optimizer: tiny KL beta=1e-5 (anchors
+# coherence; too weak to undo the hack reward -- see RESEARCH_JOURNAL 2026-06-02
+# job-85 divergence) + normal Adam 0.9/0.99; lr unchanged (SVD adapter tolerates it).
+# ARM in {none, route2} (none = vanilla). UAT: deploy hack/solve trajectory to 200 steps, no lp_s collapse.
+paper-longrun ARM SEED='41':
+    pueue add -w "$PWD" -o 0 -l "why: {{ ARM }}-200 KL-stabilised (beta=1e-5, Adam 0.9/0.99) long-run A4; resolve: route2 deploy hack~0 to 200 while vanilla saturates w/o collapse" -- \
+      {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
+        --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 \
+        --steps=200 --eval-ablate-every=20 --out-tag=_{{ ARM }}200_kl5_s{{ SEED }}
+
+# H: route2 suppresses ENDOGENOUSLY-emerged hacks (no teacher mix at all -- pure
+# on-policy). With mix=0 the pool is kept only for the 4-mode partition + v_grad extraction.
+# Default 800 steps ~= 100 reference-paper steps. ARM in {none, route2} (none = vanilla). SLOW (~32h at 800 steps).
+paper-noteacher ARM SEED='41' STEPS='800':
+    pueue add -w "$PWD" -o 0 -l "why: {{ ARM }} NO-TEACHER mix=0 pure on-policy {{ STEPS }}step; resolve: does route2 suppress endogenous hacks vs vanilla" -- \
+      {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
+        --mix-ratio=0 --steps={{ STEPS }} --eval-ablate-every=20 \
+        --out-tag=_{{ ARM }}_noteacher_s{{ SEED }}
+
+# H: route2 holds suppression after the teacher crutch is removed. Teacher-seeds all
+# 4 hacks for OFF steps, then cuts to pure on-policy. Smarter no-teacher test (with pure
+# mix=0 from step 0, some modes may never emerge). ARM in {none, route2} (none = vanilla).
+paper-teacheroff ARM SEED='41' OFF='40' STEPS='200':
+    pueue add -w "$PWD" -o 0 -l "why: {{ ARM }} teacher-off@{{ OFF }} curriculum (seed hacks then on-policy); resolve: route2 deploy hack stays ~0 after teacher cut at {{ OFF }}" -- \
+      {{ TRAIN }} fast --intervention={{ ARM }} --seed={{ SEED }} \
+        --teacher-off-step={{ OFF }} --steps={{ STEPS }} --eval-ablate-every=20 \
+        --out-tag=_{{ ARM }}_toff{{ OFF }}_s{{ SEED }}
+
+# A5 step 1: short vanilla run on the substrate to HARVEST real student hacks (with the
+# new problem_id/env_mode/prompt logging) -> rollouts.jsonl. ~40 steps gives the 6 hacks +
+# 6 cleans per mode needed for the 2-mode held-out pair set. Then build pairs from 2 known
+# modes, extract v_grad, run paper-heldout. NOTE(review): -o 4 here vs -o 0 on the other paper runs -- confirm.
+# UAT: rollouts.jsonl has >=6 exploited + >=6 clean(gt_pass,!exploited) for each of run_tests, file_marker.
+paper-harvest SEED='41' STEPS='40':
+    pueue add -w "$PWD" -o 4 -l "why: A5 harvest real student hacks (logged problem_id/prompt) for 2-mode held-out pair set; resolve: >=6 hack+6 clean per known mode in rollouts.jsonl" -- \
+      {{ TRAIN }} fast --intervention=none --seed={{ SEED }} \
+        --steps={{ STEPS }} --out-tag=_harvest_s{{ SEED }}