evil_MoE/modal/launch.py

"""Fan out the 2026-06-06 manifest's Running/Queued jobs (124-135) as parallel
Modal containers. argv copied verbatim from docs/spec/20260606_job_manifest.md
(the `vgrout.train` entries), so each Modal run == the pueue run it replaces.

Run only AFTER `modal run modal/app.py --action smoke` is verified clean.

  modal run modal/launch.py                  # all 12
  modal run modal/launch.py --only 127,134   # a subset (verify-one path)

Each container writes out/runs/<ts>_<tag>/per_mode_deploy.json to the Volume;
this entrypoint then pulls each finished run dir into out/runs/ (plus its log)
locally and writes a per-job summary to modal/results/_summary.json.
"""
from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))
from app import app, train  # noqa: E402  (same dir; registers the functions)

# Name of the Modal Volume that the `modal volume get` calls below download from.
VOL = "vgrout-cache"


def _pull_dir(remote: str, local_parent: Path) -> bool:
    """Download the Volume directory ``remote`` into ``local_parent`` (best-effort).

    modal recreates the remote leaf dir UNDER the target, so pass the parent.

    Args:
        remote: Path of the directory inside the ``VOL`` Volume.
        local_parent: Local directory to download into; created if missing.

    Returns:
        True if ``modal volume get`` ran and exited 0, False otherwise. A
        failure is reported with a ``[warn]`` line instead of an exception so
        one bad download does not abort the caller.
    """
    local_parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local_parent)]
    try:
        proc = subprocess.run(cmd, check=False)
    except OSError as e:  # e.g. the `modal` CLI is not installed / not on PATH
        print(f"[warn] could not run `{' '.join(cmd)}`: {e!r}")
        return False
    if proc.returncode != 0:
        # check=False keeps this best-effort, but the failure must be visible.
        print(f"[warn] `{' '.join(cmd)}` exited with code {proc.returncode}")
        return False
    return True


def _pull_file(remote: str, local: Path) -> bool:
    """Download the single Volume file ``remote`` to the local path ``local`` (best-effort).

    Args:
        remote: Path of the file inside the ``VOL`` Volume.
        local: Local destination path; its parent directory is created if missing.

    Returns:
        True if ``modal volume get`` ran and exited 0, False otherwise. A
        failure is reported with a ``[warn]`` line instead of an exception so
        one bad download does not abort the caller.
    """
    local.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local)]
    try:
        proc = subprocess.run(cmd, check=False)
    except OSError as e:  # e.g. the `modal` CLI is not installed / not on PATH
        print(f"[warn] could not run `{' '.join(cmd)}`: {e!r}")
        return False
    if proc.returncode != 0:
        # check=False keeps this best-effort, but the failure must be visible.
        print(f"[warn] `{' '.join(cmd)}` exited with code {proc.returncode}")
        return False
    return True

# job id -> argv after `python -m vgrout.train`. Verbatim from the manifest
# (docs/spec/20260606_job_manifest.md), so do not reformat or "tidy" these strings.
# Keys are the ids accepted by `--only`; each value is handed to `train.spawn`
# as one unsplit string.
JOBS: dict[int, str] = {
    124: "fast --intervention=route2 --seed=41 --teacher-off-step=40 --steps=200 --eval-ablate-every=20 --out-tag=_route2_toff40_s41",
    125: "fast --intervention=route --seed=41 --v-hack-path=out/vhack/v_hack_pairset_prog_wide_randomV.safetensors --vhack-refresh-every=0 --eval-ablate-every=5 --steps=60 --out-tag=_route_randomV_s41",
    126: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s41",
    127: "fast --intervention=erase --seed=41 --eval-ablate-every=5 --out-tag=_erase_realv_s41",
    128: "fast --intervention=erase --seed=41 --vhack-pairs-path=out/pairsets/null_city.json --eval-ablate-every=5 --out-tag=_erase_placebo_nullcity_s41",
    129: "fast --intervention=none --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_none200_kl5_s41",
    130: "fast --intervention=route2 --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_route2200_kl5_s41",
    131: "fast --intervention=none --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s42",
    132: "fast --intervention=none --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s43",
    133: "fast --intervention=route2 --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s42",
    134: "fast --intervention=route2 --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s43",
    135: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --route2-random-v-seed=0 --steps=200 --eval-ablate-every=20 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_haar_d0_teacheronly_s41",
}


def _parse_job_ids(only: str, known: dict[int, str]) -> list[int]:
    """Turn the ``--only`` value into a validated, de-duplicated list of job ids.

    Args:
        only: Comma-separated job ids (e.g. ``"127,134"``); ``""`` selects every job.
        known: The job table; only its keys are valid ids.

    Returns:
        Job ids in the order given (first occurrence wins), or all known ids
        sorted when ``only`` is empty.

    Raises:
        ValueError: If a token is not an integer, an id is not in ``known``, or
            a non-empty ``only`` contains no ids at all.
    """
    if not only:
        return sorted(known)

    ids: list[int] = []
    for token in only.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate "127," and "127,,134"
        try:
            jid = int(token)
        except ValueError:
            raise ValueError(f"--only expects comma-separated job ids, got {token!r}") from None
        if jid not in ids:  # a repeated id must not spawn the same job twice
            ids.append(jid)

    if not ids:
        # Never fall back to "all jobs" on a malformed value such as "," or " ".
        raise ValueError(f"--only={only!r} contains no job ids")
    unknown = [jid for jid in ids if jid not in known]
    if unknown:
        raise ValueError(f"unknown job id(s) {unknown}; known ids: {sorted(known)}")
    return ids


@app.local_entrypoint()
def main(only: str = ""):
    """Spawn the selected jobs, wait for each, download its artifacts, write a summary.

    Args:
        only: Comma-separated job ids from ``JOBS`` to run; empty runs all of them.

    Raises:
        ValueError: If ``only`` is malformed or names an id not in ``JOBS``.
            This is checked before anything is spawned.
    """
    # Validate up front: failing halfway through the spawn loop would leave the
    # already-spawned containers running with nobody waiting on them.
    ids = _parse_job_ids(only, JOBS)
    print(f"[launch] spawning {len(ids)} jobs: {ids}")

    # spawn = non-blocking; all run concurrently (subject to your Modal limits).
    handles = {jid: train.spawn(JOBS[jid]) for jid in ids}

    # Mirror the Volume layout locally so downloaded runs sit where train.py would
    # have written them (out/runs/<stem>/, logs/<stem>.log).
    repo = Path(__file__).parent.parent
    runs_dir = repo / "out" / "runs"
    results = {}
    for jid, h in handles.items():
        try:
            res = h.get()  # blocks until this container finishes
        except Exception as e:
            results[jid] = {"ok": False, "error": repr(e)}
            print(f"[FAIL] job {jid}: {e!r}")
            continue

        # The job itself succeeded. Anything that goes wrong from here on is a
        # local post-processing problem and must not be recorded as a job
        # failure; the artifacts are still on the Volume.
        results[jid] = {"ok": True}
        try:
            results[jid].update(res)
            # Pull the FULL run dir (ckpts, rollouts, per_mode_deploy.json) + the log.
            _pull_dir(res["run_dir"], runs_dir)  # recreates <stem>/ under out/runs
            # NOTE(review): assumes res["log"] is a repo-relative path; an
            # absolute path would make `repo / ...` ignore `repo`. Confirm in app.py.
            _pull_file(res["log"], repo / res["log"])
            print(f"[ok] job {jid}: {res['wall_s']/60:.1f} min -> {res['run_dir']} ({len(res['files'])} files)")
        except Exception as e:
            results[jid]["pull_error"] = repr(e)
            print(f"[warn] job {jid}: finished, but fetching its artifacts failed: {e!r}")

    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)

    summary_path = out_dir / "_summary.json"
    summary_path.write_text(json.dumps(results, indent=2, default=str))
    n_ok = sum(r["ok"] for r in results.values())
    print(f"[launch] {n_ok}/{len(ids)} ok. summary in {summary_path}, runs in {runs_dir}")