Files
evil_MoE/modal/launch.py
T
wassname 70aa6aa96b modal: parallel GRPO sweep port (image, volume, fan-out launcher)
Fire the paper sweep as independent H100/A100-80 containers instead of
serial pueue runs. One Volume caches model + svd + out/; train.py runs
unmodified (torch 2.7 + Dao flash-attn wheel, code mounted at runtime).
Verified: vanilla 60-step reproduces the local baseline. Skill at
~/.claude/skills/modal documents the patterns.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-06 20:30:19 +08:00

83 lines
5.2 KiB
Python

"""Fan out the 2026-06-06 manifest's Running/Queued jobs (124-135) as parallel
Modal containers. argv copied verbatim from docs/spec/20260606_job_manifest.md
(the `vgrout.train` entries), so each Modal run == the pueue run it replaces.
Run only AFTER `modal run modal/app.py --action smoke` is verified clean.
modal run modal/launch.py # all 12
modal run modal/launch.py --only 127,134 # a subset (verify-one path)
Each container writes out/runs/<ts>_<tag>/per_mode_deploy.json to the Volume;
this entrypoint then pulls each finished run dir into out/runs/ (plus its log)
locally and writes a per-job summary to modal/results/_summary.json.
"""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from app import app, train # noqa: E402 (same dir; registers the functions)
# Name of the Modal Volume that `modal volume get` downloads from
# (used by _pull_dir / _pull_file below).
VOL = "vgrout-cache"
def _pull_dir(remote: str, local_parent: Path) -> None:
    """Download the directory ``remote`` from the Modal Volume into ``local_parent``.

    modal recreates the remote leaf dir UNDER the target, so pass the parent.

    Args:
        remote: Path of the directory inside the Volume named by ``VOL``.
        local_parent: Local directory that will contain the downloaded leaf dir;
            created (with parents) if missing.

    The download is best-effort: a non-zero exit from the ``modal`` CLI does not
    raise, but it is reported so a missing artifact is not mistaken for success.
    """
    local_parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local_parent)]
    proc = subprocess.run(cmd, check=False)
    if proc.returncode != 0:
        # Keep going (other jobs may still be downloadable) but make the failure visible.
        print(f"[warn] `{' '.join(cmd)}` exited with code {proc.returncode}; {remote} was not pulled")
def _pull_file(remote: str, local: Path) -> None:
    """Download the single file ``remote`` from the Modal Volume to ``local``.

    Args:
        remote: Path of the file inside the Volume named by ``VOL``.
        local: Local destination path; its parent directory is created if missing.

    The download is best-effort: a non-zero exit from the ``modal`` CLI does not
    raise, but it is reported so a missing artifact is not mistaken for success.
    """
    local.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local)]
    proc = subprocess.run(cmd, check=False)
    if proc.returncode != 0:
        # Keep going (other jobs may still be downloadable) but make the failure visible.
        print(f"[warn] `{' '.join(cmd)}` exited with code {proc.returncode}; {remote} was not pulled")
# job id -> argv after `python -m vgrout.train`. Verbatim from the manifest
# (docs/spec/20260606_job_manifest.md, jobs 124-135): do not reformat or "tidy"
# these strings. Each value is handed to train.spawn() as one str.
# NOTE(review): how that str is split into argv is presumably done in app.train,
# which is not visible from this file -- confirm before changing quoting/spacing.
JOBS: dict[int, str] = {
124: "fast --intervention=route2 --seed=41 --teacher-off-step=40 --steps=200 --eval-ablate-every=20 --out-tag=_route2_toff40_s41",
125: "fast --intervention=route --seed=41 --v-hack-path=out/vhack/v_hack_pairset_prog_wide_randomV.safetensors --vhack-refresh-every=0 --eval-ablate-every=5 --steps=60 --out-tag=_route_randomV_s41",
126: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s41",
127: "fast --intervention=erase --seed=41 --eval-ablate-every=5 --out-tag=_erase_realv_s41",
128: "fast --intervention=erase --seed=41 --vhack-pairs-path=out/pairsets/null_city.json --eval-ablate-every=5 --out-tag=_erase_placebo_nullcity_s41",
129: "fast --intervention=none --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_none200_kl5_s41",
130: "fast --intervention=route2 --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_route2200_kl5_s41",
131: "fast --intervention=none --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s42",
132: "fast --intervention=none --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s43",
133: "fast --intervention=route2 --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s42",
134: "fast --intervention=route2 --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s43",
135: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --route2-random-v-seed=0 --steps=200 --eval-ablate-every=20 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_haar_d0_teacheronly_s41",
}
def _parse_job_ids(only: str) -> list[int]:
    """Turn the ``--only`` CSV into a validated, de-duplicated list of job ids.

    An empty string selects every job in ``JOBS`` (sorted). Otherwise tokens are
    split on commas, surrounding whitespace and empty tokens (e.g. a trailing
    comma) are ignored, and first-seen order is preserved.

    Raises:
        ValueError: a token is not an integer, an id is not a key of ``JOBS``,
            or a non-empty ``only`` contains no ids at all.
    """
    if not only:
        return sorted(JOBS)
    ids: list[int] = []
    for token in only.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate "127," / "127,,134"
        jid = int(token)
        if jid not in ids:  # a repeated id would spawn twice and lose a handle
            ids.append(jid)
    if not ids:
        raise ValueError(f"--only={only!r} contains no job ids")
    unknown = [jid for jid in ids if jid not in JOBS]
    if unknown:
        raise ValueError(f"unknown job id(s) {unknown}; known ids: {sorted(JOBS)}")
    return ids


@app.local_entrypoint()
def main(only: str = ""):
    """Spawn the selected jobs concurrently, then collect and download each result.

    Args:
        only: Comma-separated job ids (keys of ``JOBS``); empty means all jobs.

    Every id is validated BEFORE anything is spawned, so a typo in ``--only``
    cannot start some containers and then crash the launcher, orphaning them.

    For each finished job the full run dir is pulled under ``<repo>/out/runs`` and
    its log to ``<repo>/<log path>``; a per-job summary is written to
    ``modal/results/_summary.json`` next to this file.
    """
    ids = _parse_job_ids(only)
    print(f"[launch] spawning {len(ids)} jobs: {ids}")
    # spawn = non-blocking; all run concurrently (subject to your Modal limits).
    handles = {jid: train.spawn(JOBS[jid]) for jid in ids}
    # Mirror the Volume layout locally so downloaded runs sit where train.py would
    # have written them (out/runs/<stem>/, logs/<stem>.log).
    repo = Path(__file__).parent.parent
    results = {}
    # Jobs are awaited in spawn order, so a slow early job delays the download of
    # later ones (they still run concurrently on Modal).
    for jid, h in handles.items():
        # Broad catch on purpose: this is the per-job boundary, and one failed job
        # must not stop the remaining results from being collected.
        try:
            res = h.get()  # blocks until this container finishes
            results[jid] = {"ok": True, **res}
            # Pull the FULL run dir (ckpts, rollouts, per_mode_deploy.json) + the log.
            _pull_dir(res["run_dir"], repo / "out" / "runs")  # recreates <stem>/ under out/runs
            _pull_file(res["log"], repo / res["log"])
            print(f"[ok] job {jid}: {res['wall_s']/60:.1f} min -> {res['run_dir']} ({len(res['files'])} files)")
        except Exception as e:
            results[jid] = {"ok": False, "error": repr(e)}
            print(f"[FAIL] job {jid}: {e!r}")
    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)
    # default=str keeps the dump from failing on non-JSON values in a job's result.
    (out_dir / "_summary.json").write_text(json.dumps(results, indent=2, default=str))
    n_ok = sum(r["ok"] for r in results.values())
    print(f"[launch] {n_ok}/{len(ids)} ok. artifacts in {out_dir}")