Files
evil_MoE/modal/launch.py
T
wassname 98ceb38815 modal: rename launch entrypoint main->fanout (collides with app.py::main)
launch.py imports `app` from app.py, which registers app.py's @local_entrypoint
`main`; launch.py defining its own `main` raised InvalidError(Duplicate local
entrypoint). So launch.py had never actually run -- the earlier vanilla verify
was via app.py directly. Invoke: modal run modal/launch.py::fanout [--only N].

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-06 14:09:35 +00:00

92 lines
4.3 KiB
Python

"""Fan out the n=3 v2-eval keynote set as parallel Modal containers.
15 runs = 5 arms x seeds {42,41,43}, argv mirrors the local pueue runs (same
preset/steps/eval cadence), so each Modal run == its local twin and the two
environments cross-replicate. All on the v2 train/test token-gap eval (the
mounted src/ carries the current committed code).
id 1 = the canary: seed-42 vanilla (needs only the shared teacher pool, no
pairset/direction inputs). Fire it alone first, then fan out once it's green:
.venv/bin/modal run modal/launch.py::fanout --only 1 # canary -> clean v2 FINAL EVAL
.venv/bin/modal run modal/launch.py::fanout # all 15
Each container writes out/runs/<ts>_<tag>/per_mode_deploy.json to the Volume;
this entrypoint pulls the full run dir + log back into out/runs/ locally.
"""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from app import app, train # noqa: E402 (same dir; registers the functions)
VOL = "vgrout-cache"
# `modal` isn't on PATH; the CLI binary sits next to the interpreter running
# this entrypoint (.venv/bin/python -> .venv/bin/modal).
MODAL_BIN = str(Path(sys.executable).with_name("modal"))
def _pull_dir(remote: str, local_parent: Path):
"""modal recreates the remote leaf dir UNDER the target, so pass the parent."""
local_parent.mkdir(parents=True, exist_ok=True)
subprocess.run([MODAL_BIN, "volume", "get", "--force", VOL, remote, str(local_parent)], check=False)
def _pull_file(remote: str, local: Path):
local.parent.mkdir(parents=True, exist_ok=True)
subprocess.run([MODAL_BIN, "volume", "get", "--force", VOL, remote, str(local)], check=False)
# The keynote arms (A2 method+baseline, A3 ablation). Flags after the seed;
# tag stems match the local pueue runs so Modal and local artifacts line up.
COMMON = "fast --eval-ablate-every=10 --steps=60"
ARMS: dict[str, str] = {
"vanilla": "--intervention=none",
"routeV_real": "--intervention=routeV --vhack-pairs-path=out/pairsets/prog_wide.json --vhack-refresh-every=5",
"routeV_pertok_real": "--intervention=routeV --vhack-pairs-path=out/pairsets/prog_wide.json --vhack-refresh-every=5 --routeV-per-token",
"routeV_randomV157": "--intervention=routeV --vhack-pairs-path=out/pairsets/prog_wide.json --vhack-refresh-every=5 --routeV-random-v-seed=157",
"routeV_placebo_vampire": "--intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --vhack-refresh-every=5",
}
# seed 42 first so id 1 == the canary (vanilla s42, the seed missing locally).
SEEDS = [42, 41, 43]
JOBS: dict[int, str] = {}
_jid = 1
for _s in SEEDS:
for _arm, _flags in ARMS.items():
JOBS[_jid] = f"{COMMON} --seed={_s} {_flags} --out-tag=_{_arm}_s{_s}"
_jid += 1
@app.local_entrypoint()
def fanout(only: str = ""): # NOT `main`: app.py (imported above) already owns that entrypoint name
ids = [int(x) for x in only.split(",")] if only else sorted(JOBS)
print(f"[launch] spawning {len(ids)} jobs: {ids}")
# spawn = non-blocking; all run concurrently (subject to your Modal limits).
handles = {jid: train.spawn(JOBS[jid]) for jid in ids}
# Mirror the Volume layout locally so downloaded runs sit where train.py would
# have written them (out/runs/<stem>/, logs/<stem>.log).
repo = Path(__file__).parent.parent
results = {}
for jid, h in handles.items():
try:
res = h.get() # blocks until this container finishes
results[jid] = {"ok": True, **res}
# Pull the FULL run dir (ckpts, rollouts, per_mode_deploy.json) + the log.
_pull_dir(res["run_dir"], repo / "out" / "runs") # recreates <stem>/ under out/runs
_pull_file(res["log"], repo / res["log"])
print(f"[ok] job {jid}: {res['wall_s']/60:.1f} min -> {res['run_dir']} ({len(res['files'])} files)")
except Exception as e:
results[jid] = {"ok": False, "error": repr(e)}
print(f"[FAIL] job {jid}: {e!r}")
out_dir = Path(__file__).parent / "results"
out_dir.mkdir(exist_ok=True)
(out_dir / "_summary.json").write_text(json.dumps(results, indent=2, default=str))
n_ok = sum(r["ok"] for r in results.values())
print(f"[launch] {n_ok}/{len(ids)} ok. artifacts in {out_dir}")