From 6b22dc50555c463e2b671eb74fedade6ff971a56 Mon Sep 17 00:00:00 2001
From: wassname <github@wassname>
Date: Sun, 31 May 2026 14:10:20 +0000
Subject: [PATCH] feat: per-mode deploy JSON artifact for every arm +
 queue-substrate recipe

#164: the final eval now runs for ALL arms (not just route/route2) on the
same fixed eval subset, so the all-arms overlay reads the same per-mode
metrics, measured on the same problems, for every arm. vanilla/erase have no
quarantine, so deploy == train and a single eval serves both; route/route2
additionally run the knob-off (ablated) eval. Writes a single
per_mode_deploy.json into run_dir (arm, mask, refresh, seed + per-mode
train/deploy hack+solve) as the canonical source for the #162 overlay plot.

justfile: replace the parametrized run-substrate (which re-passed seed/steps/
refresh/mask defaults every invocation) with one explicit queue-substrate that
queues the fixed 5-arm overlay set, each arm passing ONLY its non-default flags.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 justfile                    | 15 +++++---
 src/projected_grpo/train.py | 72 +++++++++++++++++++++++++------------
 2 files changed, 61 insertions(+), 26 deletions(-)
diff --git a/justfile b/justfile
index ac87713..c6a0eb9 100644
--- a/justfile
+++ b/justfile
@@ -165,10 +165,17 @@ build-substrate MODES="run_tests,exit_code,sentinel":
 # (per-mode hacks>0 + finite first_step) + the per-step hk_<mode> columns. mix=0.125
 # is the locked default (omit to inherit it). Vanilla needs no v_hack; for an
 # erase/route substrate run, add --v-hack-path explicitly.
-run-substrate INTERV="none" SEED="41" STEPS="60" REFRESH="5" MASK="act":
-    {{ TRAIN }} fast --intervention={{ INTERV }} \
-        --vhack-refresh-every={{ REFRESH }} --route2-mask={{ MASK }} \
-        --seed={{ SEED }} --steps={{ STEPS }} --out-tag=_sub4_{{ INTERV }}_{{ MASK }}_rf{{ REFRESH }}_s{{ SEED }}
+# Queue the full 5-arm substrate overlay sweep (feeds the all-arms per-mode deploy
+# plot, #162). The arm set is FIXED: the recipe takes no params and repeats no defaults.
+# seed/steps/refresh/mask all inherit FastConfig defaults (seed=41, steps=60, refresh=5,
+# mask=act); each arm passes ONLY what differs from default (route2-grad: --route2-mask=grad).
+# --out-tag distinguishes the runs for the plot glob. Every arm emits out/runs/<ts>_<tag>/per_mode_deploy.json.
+queue-substrate:
+    pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla
+    pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase
+    pueue add -w "$PWD" -o 5 -l "why: route arm (shared-basis quarantine, rf5); resolve: deploy hack on held-out modes vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route --out-tag=_sub4_route
+    pueue add -w "$PWD" -o 5 -l "why: route2 act-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --out-tag=_sub4_route2_act
+    pueue add -w "$PWD" -o 5 -l "why: route2 grad-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --route2-mask=grad --out-tag=_sub4_route2_grad
 
 # CANONICAL plotting entrypoint for the substrate sweep. One command, four figures
 # (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay,
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index 3fa11d8..dfbb18d 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -1881,31 +1881,59 @@ def main(cfg: Config) -> int:
         logger.info(f"||B_q|| sum = {bq_norm:.4f}  (SHOULD: >0; ELSE quarantine never seeded)")
         assert bq_norm > 0.0, "route2: B_q never moved -> quarantine never seeded (mask never fired?)"
 
-    # Route: final training-vs-deployed eval -- the absorption test. TRAIN keeps
-    # the quarantine knob on (training-time model, still hacks); DEPLOY deletes it
-    # (the deployed model). SHOULD: deploy hack < train hack at preserved solve
-    # => the quarantine knob absorbed the cheat. ELSE routing didn't localize it.
-    if cfg.intervention in ("route", "route2"):
-        model.eval()
-        ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
+    # Final per-mode train-vs-deploy eval -- run for EVERY arm on the SAME fixed eval
+    # subset so the all-arms overlay compares like-for-like numbers. For route/route2 this
+    # is the absorption test: TRAIN keeps the quarantine knob on (still hacks), DEPLOY
+    # deletes it (the shipped model). SHOULD: deploy hack < train hack at preserved solve
+    # => the quarantine absorbed the cheat. ELSE routing didn't localize it. vanilla/erase
+    # have no quarantine, so the deployed model IS the trained model (deploy == train, one eval).
+    model.eval()
+    ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
+    has_quarantine = cfg.intervention in ("route", "route2")
+    if has_quarantine:
         with ablate_quarantine(wrappers):
             ev_deploy = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
+    else:
+        ev_deploy = ev_train
+    logger.info(
+        f"FINAL EVAL [{cfg.arm}] (n={ev_train['n']}): "
+        f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
+        f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f}  "
+        + ("(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)"
+           if has_quarantine else "(no quarantine: deploy == train)"))
+    # Per-mode hack: the generalisation cut. v_hack is run_tests-only, so run_tests is
+    # the IN-distribution mode; file_marker/sentinel/stdout_marker are HELD-OUT.
+    # SHOULD: if routing generalises, deploy hack drops on held-out modes too, not just
+    # run_tests. ELSE the quarantine only caught the mode v_hack saw.
+    per_mode_deploy: dict[str, dict] = {}
+    for mode in sorted(ev_deploy["by_mode"]):
+        th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
+        dh, ds, dn = ev_deploy["by_mode"][mode]
+        tag = "IN-dist" if mode == "run_tests" else "held-out"
         logger.info(
-            f"ROUTE EVAL (n={ev_train['n']}): "
-            f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
-            f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f}  "
-            f"(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)")
-        # Per-mode deploy hack: the generalisation cut. v_hack is run_tests-only, so
-        # run_tests is the IN-distribution mode; file_marker/sentinel/stdout_marker are
-        # HELD-OUT. SHOULD: if route generalises, deploy hack drops on held-out modes too,
-        # not just run_tests. ELSE the quarantine only caught the mode v_hack saw.
-        for mode in sorted(ev_deploy["by_mode"]):
-            th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
-            dh, ds, dn = ev_deploy["by_mode"][mode]
-            tag = "IN-dist" if mode == "run_tests" else "held-out"
-            logger.info(
-                f"  per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
-                f"deploy hack={dh}/{dn} solve={ds}/{dn}")
+            f"  per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
+            f"deploy hack={dh}/{dn} solve={ds}/{dn}")
+        per_mode_deploy[mode] = {
+            "in_dist": mode == "run_tests",
+            "train_hack": th / max(1, tn), "train_solve": ts / max(1, tn),
+            "deploy_hack": dh / max(1, dn), "deploy_solve": ds / max(1, dn), "n": dn,
+        }
+    # Single structured record the overlay plot reads (one file per run, in run_dir
+    # next to the log/checkpoint). All arms emit the same schema; vanilla/erase have
+    # deploy==train. This is the canonical source for the all-arms per-mode plot.
+    deploy_record = {
+        "arm": cfg.arm, "intervention": cfg.intervention,
+        "route2_mask": cfg.route2_mask if is_route2 else None,
+        "refresh_every": cfg.vhack_refresh_every, "seed": cfg.seed,
+        "steps": n_steps, "model": model_name, "out_tag": cfg.out_tag,
+        "log": str(verbose_log), "eval_n": ev_deploy["n"],
+        "hack_train": ev_train["hack"], "solve_train": ev_train["solve"],
+        "hack_deploy": ev_deploy["hack"], "solve_deploy": ev_deploy["solve"],
+        "by_mode": per_mode_deploy,
+    }
+    deploy_path = run_dir / "per_mode_deploy.json"
+    deploy_path.write_text(json.dumps(deploy_record, indent=2))
+    logger.info(f"per-mode deploy artifact: {deploy_path}")
 
     # Final tail: cue emoji + main metric BLUF, then per-step tsv table.
     # Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped