mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
feat: per-mode deploy JSON artifact for every arm + queue-substrate recipe
#164: the final eval now runs for ALL arms (not just route/route2) on the same fixed eval subset, so the all-arms overlay reads identical per-mode numbers. vanilla/erase have no quarantine -> deploy == train (one eval); route/route2 also run the knob-off (ablated) eval. Writes a single per_mode_deploy.json into run_dir (arm, mask, refresh, seed + per-mode train/deploy hack+solve) as the canonical source for the #162 overlay plot.

justfile: replace the parametrized run-substrate (which re-passed seed/steps/refresh/mask defaults every invocation) with one explicit queue-substrate that queues the fixed 5-arm overlay set, each arm passing ONLY its non-default flags.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -165,10 +165,17 @@ build-substrate MODES="run_tests,exit_code,sentinel":
|
||||
# (per-mode hacks>0 + finite first_step) + the per-step hk_<mode> columns. mix=0.125
|
||||
# is the locked default (omit to inherit it). Vanilla needs no v_hack; for an
|
||||
# erase/route substrate run, add --v-hack-path explicitly.
|
||||
run-substrate INTERV="none" SEED="41" STEPS="60" REFRESH="5" MASK="act":
|
||||
{{ TRAIN }} fast --intervention={{ INTERV }} \
|
||||
--vhack-refresh-every={{ REFRESH }} --route2-mask={{ MASK }} \
|
||||
--seed={{ SEED }} --steps={{ STEPS }} --out-tag=_sub4_{{ INTERV }}_{{ MASK }}_rf{{ REFRESH }}_s{{ SEED }}
|
||||
# Queue the full 5-arm substrate overlay sweep (the all-arms per-mode deploy plot,
|
||||
# #162). The arm set is FIXED -- no params, no defaults repeated. seed/steps/refresh/
|
||||
# mask all inherit FastConfig defaults (seed41 steps60 rf5 mask=act); each arm passes
|
||||
# ONLY what differs from default (route2-grad: --route2-mask=grad). out-tag distinguishes
|
||||
# the runs for the plot glob. Every arm emits out/runs/<ts>_<tag>/per_mode_deploy.json.
|
||||
queue-substrate:
|
||||
pueue add -w "$PWD" -o 5 -l "why: vanilla emergence reference (4-mode substrate); resolve: per-mode deploy-hack baseline for the overlay" -- {{ TRAIN }} fast --intervention=none --out-tag=_sub4_vanilla
|
||||
pueue add -w "$PWD" -o 5 -l "why: erase arm (one-sided projection); resolve: per-mode deploy hack vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --out-tag=_sub4_erase
|
||||
pueue add -w "$PWD" -o 5 -l "why: route arm (shared-basis quarantine, rf5); resolve: deploy hack on held-out modes vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route --out-tag=_sub4_route
|
||||
pueue add -w "$PWD" -o 5 -l "why: route2 act-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --out-tag=_sub4_route2_act
|
||||
pueue add -w "$PWD" -o 5 -l "why: route2 grad-mask (distinct-basis quarantine); resolve: held-out deploy hack suppressed vs vanilla at matched solve" -- {{ TRAIN }} fast --intervention=route2 --route2-mask=grad --out-tag=_sub4_route2_grad
|
||||
|
||||
# CANONICAL plotting entrypoint for the substrate sweep. One command, four figures
|
||||
# (per-mode by-method + by-hack, and the aggregate "total hacks per arm" + overlay,
|
||||
|
||||
+50
-22
@@ -1881,31 +1881,59 @@ def main(cfg: Config) -> int:
|
||||
logger.info(f"||B_q|| sum = {bq_norm:.4f} (SHOULD: >0; ELSE quarantine never seeded)")
|
||||
assert bq_norm > 0.0, "route2: B_q never moved -> quarantine never seeded (mask never fired?)"
|
||||
|
||||
# Route: final training-vs-deployed eval -- the absorption test. TRAIN keeps
|
||||
# the quarantine knob on (training-time model, still hacks); DEPLOY deletes it
|
||||
# (the deployed model). SHOULD: deploy hack < train hack at preserved solve
|
||||
# => the quarantine knob absorbed the cheat. ELSE routing didn't localize it.
|
||||
if cfg.intervention in ("route", "route2"):
|
||||
model.eval()
|
||||
ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
|
||||
# Final per-mode train-vs-deploy eval -- run for EVERY arm on the SAME fixed
|
||||
# eval subset so the all-arms overlay reads identical numbers. For route/route2
|
||||
# this is the absorption test: TRAIN keeps the quarantine knob on (still hacks),
|
||||
# DEPLOY deletes it (the shipped model). SHOULD: deploy hack < train hack at
|
||||
# preserved solve => the quarantine absorbed the cheat. vanilla/erase have no
|
||||
# quarantine, so the deployed model IS the trained model (deploy == train, one eval).
|
||||
model.eval()
|
||||
ev_train = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
|
||||
has_quarantine = cfg.intervention in ("route", "route2")
|
||||
if has_quarantine:
|
||||
with ablate_quarantine(wrappers):
|
||||
ev_deploy = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)
|
||||
else:
|
||||
ev_deploy = ev_train
|
||||
logger.info(
|
||||
f"FINAL EVAL [{cfg.arm}] (n={ev_train['n']}): "
|
||||
f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
|
||||
f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} "
|
||||
+ ("(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)"
|
||||
if has_quarantine else "(no quarantine: deploy == train)"))
|
||||
# Per-mode hack: the generalisation cut. v_hack is run_tests-only, so run_tests is
|
||||
# the IN-distribution mode; file_marker/sentinel/stdout_marker are HELD-OUT.
|
||||
# SHOULD: if routing generalises, deploy hack drops on held-out modes too, not just
|
||||
# run_tests. ELSE the quarantine only caught the mode v_hack saw.
|
||||
per_mode_deploy: dict[str, dict] = {}
|
||||
for mode in sorted(ev_deploy["by_mode"]):
|
||||
th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
|
||||
dh, ds, dn = ev_deploy["by_mode"][mode]
|
||||
tag = "IN-dist" if mode == "run_tests" else "held-out"
|
||||
logger.info(
|
||||
f"ROUTE EVAL (n={ev_train['n']}): "
|
||||
f"train/knob-on hack={ev_train['hack']:.3f} solve={ev_train['solve']:.3f} | "
|
||||
f"deploy/knob-off hack={ev_deploy['hack']:.3f} solve={ev_deploy['solve']:.3f} "
|
||||
f"(SHOULD: deploy hack < train hack at ~matched solve => quarantine absorbed the cheat)")
|
||||
# Per-mode deploy hack: the generalisation cut. v_hack is run_tests-only, so
|
||||
# run_tests is the IN-distribution mode; file_marker/sentinel/stdout_marker are
|
||||
# HELD-OUT. SHOULD: if route generalises, deploy hack drops on held-out modes too,
|
||||
# not just run_tests. ELSE the quarantine only caught the mode v_hack saw.
|
||||
for mode in sorted(ev_deploy["by_mode"]):
|
||||
th, ts, tn = ev_train["by_mode"].get(mode, [0, 0, 0])
|
||||
dh, ds, dn = ev_deploy["by_mode"][mode]
|
||||
tag = "IN-dist" if mode == "run_tests" else "held-out"
|
||||
logger.info(
|
||||
f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
|
||||
f"deploy hack={dh}/{dn} solve={ds}/{dn}")
|
||||
f" per-mode[{mode:<13} {tag:>8}] train hack={th}/{tn} solve={ts}/{tn} | "
|
||||
f"deploy hack={dh}/{dn} solve={ds}/{dn}")
|
||||
per_mode_deploy[mode] = {
|
||||
"in_dist": mode == "run_tests",
|
||||
"train_hack": th / max(1, tn), "train_solve": ts / max(1, tn),
|
||||
"deploy_hack": dh / max(1, dn), "deploy_solve": ds / max(1, dn), "n": dn,
|
||||
}
|
||||
# Single structured record the overlay plot reads (one file per run, in run_dir
|
||||
# next to the log/checkpoint). All arms emit the same schema; vanilla/erase have
|
||||
# deploy==train. This is the canonical source for the all-arms per-mode plot.
|
||||
deploy_record = {
|
||||
"arm": cfg.arm, "intervention": cfg.intervention,
|
||||
"route2_mask": cfg.route2_mask if is_route2 else None,
|
||||
"refresh_every": cfg.vhack_refresh_every, "seed": cfg.seed,
|
||||
"steps": n_steps, "model": model_name, "out_tag": cfg.out_tag,
|
||||
"log": str(verbose_log), "eval_n": ev_deploy["n"],
|
||||
"hack_train": ev_train["hack"], "solve_train": ev_train["solve"],
|
||||
"hack_deploy": ev_deploy["hack"], "solve_deploy": ev_deploy["solve"],
|
||||
"by_mode": per_mode_deploy,
|
||||
}
|
||||
deploy_path = run_dir / "per_mode_deploy.json"
|
||||
deploy_path.write_text(json.dumps(deploy_record, indent=2))
|
||||
logger.info(f"per-mode deploy artifact: {deploy_path}")
|
||||
|
||||
# Final tail: cue emoji + main metric BLUF, then per-step tsv table.
|
||||
# Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped
|
||||
|
||||
Reference in New Issue
Block a user