feat: per-mode deploy JSON artifact for every arm + queue-substrate recipe

#164: the final eval now runs for ALL arms (not just route/route2) on the
same fixed eval subset, so the all-arms overlay reads identical per-mode
numbers. vanilla/erase have no quarantine -> deploy == train (one eval);
route/route2 also run the knob-off (ablated) eval. Writes a single
per_mode_deploy.json into run_dir (arm, mask, refresh, seed + per-mode
train/deploy hack+solve) as the canonical source for the #162 overlay plot.

justfile: replace the parametrized run-substrate (which re-passed seed/steps/
refresh/mask defaults every invocation) with one explicit queue-substrate that
queues the fixed 5-arm overlay set, each arm passing ONLY its non-default flags.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-05-31 14:10:20 +00:00
parent dd3b5af3db
commit 6b22dc5055
2 changed files with 61 additions and 26 deletions
+11 -4
View File
@@ -165,10 +165,17 @@ build-substrate MODES="run_tests,exit_code,sentinel":
# (per-mode hacks>0 + finite first_step) + the per-step hk_<mode> columns. mix=0.125
# is the locked default (omit to inherit it). Vanilla needs no v_hack; for an
# erase/route substrate run, add --v-hack-path explicitly.
run-substrate INTERV="none" SEED="41" STEPS="60" REFRESH="5" MASK="act":
{{ TRAIN }} fast --intervention={{ INTERV }} \
--vhack-refresh-every={{ REFRESH }} --route2-mask={{ MASK }} \
--seed={{ SEED }} --steps={{ STEPS }} --out-tag=_sub4_{{ INTERV }}_{{ MASK }}_rf{{ REFRESH }}_s{{ SEED }}
# Queue the full 5-arm substrate overlay sweep that feeds the all-arms per-mode
# deploy plot (#162). The arm set is FIXED: this recipe takes no parameters and
# repeats no defaults. seed/steps/refresh/mask all inherit the FastConfig defaults
# (seed=41, steps=60, refresh-every=5, mask=act); each arm passes ONLY the flags
# that differ from those defaults (route2-grad: --route2-mask=grad).
# Arms queued, in order: vanilla (--intervention=none), erase, route,
# route2 with the default act mask, route2 with the grad mask.
# --out-tag distinguishes the runs for the plot glob. Every arm emits
# out/runs/<ts>_<tag>/per_mode_deploy.json.
queue-substrate:
pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla
pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase
pueue add -w "$PWD" -o 5 -l "why: route arm (shared-basis quarantine, rf5); resolve: deploy hack on held-out modes vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route --out-tag=_sub4_route
pueue add -w "$PWD" -o 5 -l "why: route2 act-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --out-tag=_sub4_route2_act
pueue add -w "$PWD" -o 5 -l "why: route2 grad-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --route2-mask=grad --out-tag=_sub4_route2_grad
# CANONICAL plotting entrypoint for the substrate sweep. One command, four figures
# (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay,
+50 -22
View File
@@ -1881,31 +1881,59 @@ def main(cfg: Config) -> int:
logger.info(f"||B_q|| sum = {bq_norm:.4f} (SHOULD: >0; ELSE quarantine never seeded)")
assert bq_norm > 0.0, "route2: B_q never moved -> quarantine never seeded (mask never fired?)"
# Route: final training-vs-deployed eval -- the absorption test. TRAIN keeps
# the quarantine knob on (training-time model, still hacks); DEPLOY deletes it
# (the deployed model). SHOULD: deploy hack < train hack at preserved solve
# => the quarantine knob absorbed the cheat. ELSE routing didn't localize it.
if cfg.intervention in ("route", "route2"):
model.eval()
ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
# Final per-mode train-vs-deploy eval -- run for EVERY arm on the SAME fixed
# eval subset so the all-arms overlay reads identical numbers. For route/route2
# this is the absorption test: TRAIN keeps the quarantine knob on (still hacks),
# DEPLOY deletes it (the shipped model). SHOULD: deploy hack < train hack at
# preserved solve => the quarantine absorbed the cheat. vanilla/erase have no
# quarantine, so the deployed model IS the trained model (deploy == train, one eval).
model.eval()
ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
has_quarantine = cfg.intervention in ("route", "route2")
if has_quarantine:
with ablate_quarantine(wrappers):
ev_deploy = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
else:
ev_deploy = ev_train
logger.info(
f"FINAL EVAL [{cfg.arm}] (n={ev_train['n']}): "
f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} "
+ ("(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)"
if has_quarantine else "(no quarantine: deploy == train)"))
# Per-mode hack: the generalisation cut. v_hack is run_tests-only, so run_tests is
# the IN-distribution mode; file_marker/sentinel/stdout_marker are HELD-OUT.
# SHOULD: if routing generalises, deploy hack drops on held-out modes too, not just
# run_tests. ELSE the quarantine only caught the mode v_hack saw.
per_mode_deploy: dict[str, dict] = {}
for mode in sorted(ev_deploy["by_mode"]):
th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
dh, ds, dn = ev_deploy["by_mode"][mode]
tag = "IN-dist" if mode == "run_tests" else "held-out"
logger.info(
f"ROUTE EVAL (n={ev_train['n']}): "
f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} "
f"(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)")
# Per-mode deploy hack: the generalisation cut. v_hack is run_tests-only, so
# run_tests is the IN-distribution mode; file_marker/sentinel/stdout_marker are
# HELD-OUT. SHOULD: if route generalises, deploy hack drops on held-out modes too,
# not just run_tests. ELSE the quarantine only caught the mode v_hack saw.
for mode in sorted(ev_deploy["by_mode"]):
th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
dh, ds, dn = ev_deploy["by_mode"][mode]
tag = "IN-dist" if mode == "run_tests" else "held-out"
logger.info(
f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
f"deploy hack={dh}/{dn} solve={ds}/{dn}")
f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
f"deploy hack={dh}/{dn} solve={ds}/{dn}")
per_mode_deploy[mode] = {
"in_dist": mode == "run_tests",
"train_hack": th / max(1, tn), "train_solve": ts / max(1, tn),
"deploy_hack": dh / max(1, dn), "deploy_solve": ds / max(1, dn), "n": dn,
}
# Single structured record the overlay plot reads (one file per run, in run_dir
# next to the log/checkpoint). All arms emit the same schema; vanilla/erase have
# deploy==train. This is the canonical source for the all-arms per-mode plot.
deploy_record = {
"arm": cfg.arm, "intervention": cfg.intervention,
"route2_mask": cfg.route2_mask if is_route2 else None,
"refresh_every": cfg.vhack_refresh_every, "seed": cfg.seed,
"steps": n_steps, "model": model_name, "out_tag": cfg.out_tag,
"log": str(verbose_log), "eval_n": ev_deploy["n"],
"hack_train": ev_train["hack"], "solve_train": ev_train["solve"],
"hack_deploy": ev_deploy["hack"], "solve_deploy": ev_deploy["solve"],
"by_mode": per_mode_deploy,
}
deploy_path = run_dir / "per_mode_deploy.json"
deploy_path.write_text(json.dumps(deploy_record, indent=2))
logger.info(f"per-mode deploy artifact: {deploy_path}")
# Final tail: cue emoji + main metric BLUF, then per-step tsv table.
# Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped