From 6b22dc50555c463e2b671eb74fedade6ff971a56 Mon Sep 17 00:00:00 2001 From: wassname Date: Sun, 31 May 2026 14:10:20 +0000 Subject: [PATCH] feat: per-mode deploy JSON artifact for every arm + queue-substrate recipe #164: the final eval now runs for ALL arms (not just route/route2) on the same fixed eval subset, so the all-arms overlay reads identical per-mode numbers. vanilla/erase have no quarantine -> deploy == train (one eval); route/route2 also run the knob-off (ablated) eval. Writes a single per_mode_deploy.json into run_dir (arm, mask, refresh, seed + per-mode train/deploy hack+solve) as the canonical source for the #162 overlay plot. justfile: replace the parametrized run-substrate (which re-passed seed/steps/ refresh/mask defaults every invocation) with one explicit queue-substrate that queues the fixed 5-arm overlay set, each arm passing ONLY its non-default flags. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- justfile | 15 +++++--- src/projected_grpo/train.py | 72 +++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/justfile b/justfile index ac87713..c6a0eb9 100644 --- a/justfile +++ b/justfile @@ -165,10 +165,17 @@ build-substrate MODES="run_tests,exit_code,sentinel": # (per-mode hacks>0 + finite first_step) + the per-step hk_ columns. mix=0.125 # is the locked default (omit to inherit it). Vanilla needs no v_hack; for an # erase/route substrate run, add --v-hack-path explicitly. -run-substrate INTERV="none" SEED="41" STEPS="60" REFRESH="5" MASK="act": - {{ TRAIN }} fast --intervention={{ INTERV }} \ - --vhack-refresh-every={{ REFRESH }} --route2-mask={{ MASK }} \ - --seed={{ SEED }} --steps={{ STEPS }} --out-tag=_sub4_{{ INTERV }}_{{ MASK }}_rf{{ REFRESH }}_s{{ SEED }} +# Queue the full 5-arm substrate overlay sweep (the all-arms per-mode deploy plot, +# #162). The arm set is FIXED -- no params, no defaults repeated. 
seed/steps/refresh/ +# mask all inherit FastConfig defaults (seed41 steps60 rf5 mask=act); each arm passes +# ONLY what differs from default (route2-grad: --route2-mask=grad). out-tag distinguishes +# the runs for the plot glob. Every arm emits out/runs/<run_dir>/per_mode_deploy.json. +queue-substrate: + pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla + pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase + pueue add -w "$PWD" -o 5 -l "why: route arm (shared-basis quarantine, rf5); resolve: deploy hack on held-out modes vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route --out-tag=_sub4_route + pueue add -w "$PWD" -o 5 -l "why: route2 act-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --out-tag=_sub4_route2_act + pueue add -w "$PWD" -o 5 -l "why: route2 grad-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --route2-mask=grad --out-tag=_sub4_route2_grad # CANONICAL plotting entrypoint for the substrate sweep. 
One command, four figures # (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay, diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 3fa11d8..dfbb18d 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -1881,31 +1881,59 @@ def main(cfg: Config) -> int: logger.info(f"||B_q|| sum = {bq_norm:.4f} (SHOULD: >0; ELSE quarantine never seeded)") assert bq_norm > 0.0, "route2: B_q never moved -> quarantine never seeded (mask never fired?)" - # Route: final training-vs-deployed eval -- the absorption test. TRAIN keeps - # the quarantine knob on (training-time model, still hacks); DEPLOY deletes it - # (the deployed model). SHOULD: deploy hack < train hack at preserved solve - # => the quarantine knob absorbed the cheat. ELSE routing didn't localize it. - if cfg.intervention in ("route", "route2"): - model.eval() - ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + # Final per-mode train-vs-deploy eval -- run for EVERY arm on the SAME fixed + # eval subset so the all-arms overlay reads identical numbers. For route/route2 + # this is the absorption test: TRAIN keeps the quarantine knob on (still hacks), + # DEPLOY deletes it (the shipped model). SHOULD: deploy hack < train hack at + # preserved solve => the quarantine absorbed the cheat. vanilla/erase have no + # quarantine, so the deployed model IS the trained model (deploy == train, one eval). 
+ model.eval() + ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + has_quarantine = cfg.intervention in ("route", "route2") + if has_quarantine: with ablate_quarantine(wrappers): ev_deploy = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new) + else: + ev_deploy = ev_train + logger.info( + f"FINAL EVAL [{cfg.arm}] (n={ev_train['n']}): " + f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | " + f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} " + + ("(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)" + if has_quarantine else "(no quarantine: deploy == train)")) + # Per-mode hack: the generalisation cut. v_hack is run_tests-only, so run_tests is + # the IN-distribution mode; file_marker/sentinel/stdout_marker are HELD-OUT. + # SHOULD: if routing generalises, deploy hack drops on held-out modes too, not just + # run_tests. ELSE the quarantine only caught the mode v_hack saw. + per_mode_deploy: dict[str, dict] = {} + for mode in sorted(ev_deploy["by_mode"]): + th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0]) + dh, ds, dn = ev_deploy["by_mode"][mode] + tag = "IN-dist" if mode == "run_tests" else "held-out" logger.info( - f"ROUTE EVAL (n={ev_train['n']}): " - f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | " - f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} " - f"(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)") - # Per-mode deploy hack: the generalisation cut. v_hack is run_tests-only, so - # run_tests is the IN-distribution mode; file_marker/sentinel/stdout_marker are - # HELD-OUT. SHOULD: if route generalises, deploy hack drops on held-out modes too, - # not just run_tests. ELSE the quarantine only caught the mode v_hack saw. 
- for mode in sorted(ev_deploy["by_mode"]): - th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0]) - dh, ds, dn = ev_deploy["by_mode"][mode] - tag = "IN-dist" if mode == "run_tests" else "held-out" - logger.info( - f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | " - f"deploy hack={dh}/{dn} solve={ds}/{dn}") + f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | " + f"deploy hack={dh}/{dn} solve={ds}/{dn}") + per_mode_deploy[mode] = { + "in_dist": mode == "run_tests", + "train_hack": th / max(1, tn), "train_solve": ts / max(1, tn), + "deploy_hack": dh / max(1, dn), "deploy_solve": ds / max(1, dn), "n": dn, + } + # Single structured record the overlay plot reads (one file per run, in run_dir + # next to the log/checkpoint). All arms emit the same schema; vanilla/erase have + # deploy==train. This is the canonical source for the all-arms per-mode plot. + deploy_record = { + "arm": cfg.arm, "intervention": cfg.intervention, + "route2_mask": cfg.route2_mask if is_route2 else None, + "refresh_every": cfg.vhack_refresh_every, "seed": cfg.seed, + "steps": n_steps, "model": model_name, "out_tag": cfg.out_tag, + "log": str(verbose_log), "eval_n": ev_deploy["n"], + "hack_train": ev_train["hack"], "solve_train": ev_train["solve"], + "hack_deploy": ev_deploy["hack"], "solve_deploy": ev_deploy["solve"], + "by_mode": per_mode_deploy, + } + deploy_path = run_dir / "per_mode_deploy.json" + deploy_path.write_text(json.dumps(deploy_record, indent=2)) + logger.info(f"per-mode deploy artifact: {deploy_path}") # Final tail: cue emoji + main metric BLUF, then per-step tsv table. # Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped