mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:59:35 +08:00
70aa6aa96b
Fire the paper sweep as independent H100/A100-80 containers instead of serial pueue runs. One Volume caches model + svd + out/; train.py runs unmodified (torch 2.7 + Dao flash-attn wheel, code mounted at runtime). Verified: vanilla 60-step reproduces the local baseline. Skill at ~/.claude/skills/modal documents the patterns. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
83 lines
5.2 KiB
Python
83 lines
5.2 KiB
Python
"""Fan out the 2026-06-06 manifest's Running/Queued jobs (124-135) as parallel
|
|
Modal containers. argv copied verbatim from docs/spec/20260606_job_manifest.md
|
|
(the `vgrout.train` entries), so each Modal run == the pueue run it replaces.
|
|
|
|
Run only AFTER `modal run modal/app.py --action smoke` is verified clean.
|
|
|
|
modal run modal/launch.py # all 12
|
|
modal run modal/launch.py --only 127,134 # a subset (verify-one path)
|
|
|
|
Each container writes out/runs/<ts>_<tag>/ (incl. per_mode_deploy.json) to the
Volume; this entrypoint then pulls each finished run dir into the local
out/runs/, pulls its log, and writes a per-job summary to
modal/results/_summary.json.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from app import app, train # noqa: E402 (same dir; registers the functions)
|
|
|
|
# Name of the Modal Volume that `_pull_dir` / `_pull_file` download from via
# `modal volume get`.
VOL = "vgrout-cache"
|
|
|
|
|
|
def _pull_dir(remote: str, local_parent: Path) -> None:
    """Download a directory from the Modal Volume ``VOL`` into ``local_parent``.

    modal recreates the remote leaf dir UNDER the target, so pass the parent
    directory, not the final destination.

    Best-effort: a non-zero exit status from the ``modal`` CLI does not raise,
    but it is reported on stderr instead of being silently discarded.

    Args:
        remote: Path of the directory inside the Volume.
        local_parent: Local directory to download into; created if missing.
    """
    local_parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local_parent)]
    proc = subprocess.run(cmd, check=False)
    if proc.returncode != 0:
        # Keep going (other jobs may still download fine) but make the miss visible.
        print(
            f"[launch] WARN: `{' '.join(cmd)}` exited {proc.returncode}; "
            f"{remote} was not pulled",
            file=sys.stderr,
        )
|
|
|
|
|
|
def _pull_file(remote: str, local: Path) -> None:
    """Download a single file from the Modal Volume ``VOL`` to ``local``.

    Best-effort: a non-zero exit status from the ``modal`` CLI does not raise,
    but it is reported on stderr instead of being silently discarded.

    Args:
        remote: Path of the file inside the Volume.
        local: Local destination path; its parent directory is created if
            missing.
    """
    local.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local)]
    proc = subprocess.run(cmd, check=False)
    if proc.returncode != 0:
        # Keep going (other jobs may still download fine) but make the miss visible.
        print(
            f"[launch] WARN: `{' '.join(cmd)}` exited {proc.returncode}; "
            f"{remote} was not pulled",
            file=sys.stderr,
        )
|
|
|
|
# job id -> argv after `python -m vgrout.train`. Verbatim from the manifest.
# Each value is ONE unsplit string; `main` hands it as-is to `train.spawn(...)`.
# Keep the strings byte-identical to the manifest entries -- do not reflow or
# "tidy" them, or the Modal run stops matching the run it replaces.
JOBS: dict[int, str] = {
    124: "fast --intervention=route2 --seed=41 --teacher-off-step=40 --steps=200 --eval-ablate-every=20 --out-tag=_route2_toff40_s41",
    125: "fast --intervention=route --seed=41 --v-hack-path=out/vhack/v_hack_pairset_prog_wide_randomV.safetensors --vhack-refresh-every=0 --eval-ablate-every=5 --steps=60 --out-tag=_route_randomV_s41",
    126: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s41",
    127: "fast --intervention=erase --seed=41 --eval-ablate-every=5 --out-tag=_erase_realv_s41",
    128: "fast --intervention=erase --seed=41 --vhack-pairs-path=out/pairsets/null_city.json --eval-ablate-every=5 --out-tag=_erase_placebo_nullcity_s41",
    129: "fast --intervention=none --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_none200_kl5_s41",
    130: "fast --intervention=route2 --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_route2200_kl5_s41",
    131: "fast --intervention=none --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s42",
    132: "fast --intervention=none --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s43",
    133: "fast --intervention=route2 --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s42",
    134: "fast --intervention=route2 --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s43",
    135: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --route2-random-v-seed=0 --steps=200 --eval-ablate-every=20 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_haar_d0_teacheronly_s41",
}
|
|
|
|
|
|
def _select_job_ids(only: str, known: dict[int, str]) -> list[int]:
    """Turn the ``--only`` CLI value into an ordered, de-duplicated list of job ids.

    Args:
        only: Comma-separated job ids (e.g. ``"127,134"``); the empty string
            selects every known job in ascending id order.
        known: The job table; only its keys are consulted.

    Returns:
        The selected ids, in the order given, each at most once.

    Raises:
        ValueError: A token is not an integer, or ``only`` is non-empty but
            contains no ids at all.
        KeyError: An id is not present in ``known``.
    """
    if not only:
        return sorted(known)

    picked: list[int] = []
    for token in only.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate "127,134," and "127, ,134"
        try:
            picked.append(int(token))
        except ValueError:
            raise ValueError(
                f"--only expects comma-separated job ids, got {token!r}"
            ) from None
    if not picked:
        raise ValueError(f"--only={only!r} contains no job ids")

    # A repeated id would spawn two containers but keep only one handle.
    picked = list(dict.fromkeys(picked))

    unknown = [jid for jid in picked if jid not in known]
    if unknown:
        raise KeyError(f"unknown job id(s) {unknown}; valid ids: {sorted(known)}")
    return picked


@app.local_entrypoint()
def main(only: str = ""):
    """Spawn the selected JOBS concurrently, then collect and download each result.

    Args:
        only: Comma-separated subset of job ids to run; empty runs all of JOBS.

    Side effects: for every job that finishes, its run dir is pulled under
    ``<repo>/out/runs`` and its log to ``<repo>/<log path>``; a per-job summary
    (success flag plus the returned dict, or the error repr) is written to
    ``results/_summary.json`` next to this file.
    """
    # Validate the whole selection BEFORE spawning, so a typo in --only cannot
    # abort the launch after some containers have already been started.
    ids = _select_job_ids(only, JOBS)
    print(f"[launch] spawning {len(ids)} jobs: {ids}")

    # spawn = non-blocking; all run concurrently (subject to your Modal limits).
    handles = {jid: train.spawn(JOBS[jid]) for jid in ids}

    # Mirror the Volume layout locally so downloaded runs sit where train.py would
    # have written them (out/runs/<stem>/, logs/<stem>.log).
    repo = Path(__file__).parent.parent
    runs_dir = repo / "out" / "runs"
    results = {}
    for jid, h in handles.items():
        try:
            res = h.get()  # blocks until this container finishes
            results[jid] = {"ok": True, **res}
            # Pull the FULL run dir (ckpts, rollouts, per_mode_deploy.json) + the log.
            # An exception raised while pulling lands in the handler below and
            # re-marks the job as failed.
            _pull_dir(res["run_dir"], runs_dir)  # recreates <stem>/ under out/runs
            _pull_file(res["log"], repo / res["log"])
            print(f"[ok] job {jid}: {res['wall_s']/60:.1f} min -> {res['run_dir']} ({len(res['files'])} files)")
        except Exception as e:
            # One failed job must not stop collection of the others.
            results[jid] = {"ok": False, "error": repr(e)}
            print(f"[FAIL] job {jid}: {e!r}")

    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)
    summary_path = out_dir / "_summary.json"
    summary_path.write_text(json.dumps(results, indent=2, default=str))

    n_ok = sum(r["ok"] for r in results.values())
    # Runs are downloaded under out/runs; only the summary lives in results/.
    print(f"[launch] {n_ok}/{len(ids)} ok. runs in {runs_dir}, summary in {summary_path}")
|