From 70aa6aa96b02c6d34cdb70674b0ab92834317012 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sat, 6 Jun 2026 20:30:19 +0800
Subject: [PATCH] modal: parallel GRPO sweep port (image, volume, fan-out
 launcher)

Fire the paper sweep as independent H100/A100-80 containers instead of
serial pueue runs. One Volume caches model + svd + out/; train.py runs
unmodified (torch 2.7 + Dao flash-attn wheel, code mounted at runtime).
Verified: vanilla 60-step reproduces the local baseline. Skill at
~/.claude/skills/modal documents the patterns.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 modal/README.md        |  95 ++++++++++++++++++
 modal/app.py           | 213 +++++++++++++++++++++++++++++++++++++++++
 modal/fetch.py         |  42 ++++++++
 modal/launch.py        |  82 ++++++++++++++++
 modal/upload_inputs.py |  54 +++++++++++
 pyproject.toml         |   1 +
 uv.lock                | 111 ++++++++++++++++++++-
 7 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 modal/README.md
 create mode 100644 modal/app.py
 create mode 100644 modal/fetch.py
 create mode 100644 modal/launch.py
 create mode 100644 modal/upload_inputs.py

diff --git a/modal/README.md b/modal/README.md
new file mode 100644
index 0000000..8ce0687
--- /dev/null
+++ b/modal/README.md
@@ -0,0 +1,95 @@
+# Modal port — parallel GRPO runs
+
+Fan the paper's GRPO sweep (jobs 124-135 of `docs/spec/20260606_job_manifest.md`)
+out as independent H100 containers instead of running them serially through
+pueue on the one 96GB box. ~12 runs finish in one run's wall-clock instead of ~2
+days.
+
+General Modal patterns/gotchas (reusable across projects) live in the global
+`modal` skill (`~/.claude/skills/modal/SKILL.md`); this dir is its worked example.
+
+## Files
+
+- `app.py` — image, Volume, and the `train` / `warm` / `smoke` GPU functions.
+- `upload_inputs.py` — push the gitignored run inputs (pairsets, vhack, pools, LeetCode data) to the Volume. Run from a box that has them.
+- `launch.py` — fan out the 12-job inventory with `.spawn()`.
+- `fetch.py` — pull run dirs + logs off the Volume (ad-hoc runs or a full re-sync).
+
+## Design decisions (and why)
+
+- **GPU = `["H100", "A100-80GB"]` (80GB, fallback list).** The full preset peaked
+  ~73GB bf16 on the local card, so an 80GB card is required. H100 is ~1.5-2x
+  faster than A100-80 for ~1.6x the price (≈ same $/run, half the wall-clock).
+  On a 12-way fan-out H100 capacity can queue, so we fall back to A100-80GB — it
+  runs the same Dao flash-attn wheel (bundles sm_80) and deploy numbers are
+  hardware-independent. Override per-run with `VGROUT_GPU=H200` if a long run OOMs.
+- **torch 2.7, not the repo's pinned 2.8.** Dao-AILab ships no cp313+torch2.8
+  flash-attn wheel; the 2.8.3 line tops out at torch2.7 for cp313. The official
+  Dao wheel bundles sm_80/86/90 so it runs on A100/H100 — unlike the repo's
+  Blackwell sm_120-only pin. This keeps train.py's hardcoded `flash_attention_2`
+  path working with **zero patch to the research code**.
+- **No vllm, no causal-conv1d.** Generation is HF `.generate` (nothing in
+  `src/vgrout` imports vllm); causal-conv1d is only for Qwen3.5's gated-delta-net,
+  and the model here is standard-attention Qwen3-4B.
+- **One Volume `vgrout-cache`** mounts at `/cache` and holds the HF model cache
+  (`hf/`), the SVD basis cache (`svd_cache/`), and `out/` (uploaded inputs +
+  written `out/runs/*` artifacts). The model downloads once and the svd_cache
+  computes once; every later container reuses both. train.py's relative paths
+  (`svd_cache/`, `out/`, `logs/`) are symlinked onto the Volume from an ephemeral
+  `/work` cwd.
+
+## One-time setup
+
+```bash
+pip install modal && modal token new      # interactive; you've done this
+# Upload the gitignored INPUTS from the box that has them (the 96GB box):
+python modal/upload_inputs.py              # pushes out/pairsets, out/vhack, out/pools + external/rl-rewardhacking/results/data
+modal run modal/app.py --action warm       # download Qwen3-4B + build svd_cache once
+```
+
+`upload_inputs.py` skips dirs absent locally. The jobs need these on the Volume:
+
+| input | needed by | present on dev box? |
+|---|---|---|
+| `out/pools/substrate`, `out/pools/teacher_pool` | most jobs | yes (uploaded) |
+| `out/pairsets/prog_wide.json` | FastConfig default (124, 127, 130, ...) | **no — only on GPU box** |
+| `out/pairsets/null_city.json` | 128 (erase placebo) | **no — only on GPU box** |
+| `out/vhack/v_hack_a5_runtests.safetensors` | 126, 133, 134 (A5) | **no — only on GPU box** |
+| `out/vhack/v_hack_pairset_prog_wide_randomV.safetensors` | 125 (random-V) | **no — only on GPU box** |
+
+So: run `upload_inputs.py` **from the 96GB box** to get the pairsets/vhack bases
+onto the Volume. (Some vhack bases auto-extract from their pairset if absent, but
+that costs ~5 min GPU per run; uploading the prebuilt ones is cheaper.)
+
+## Verify one run, then fan out
+
+```bash
+modal run modal/app.py --action smoke              # 4-step route2 sanity
+modal run modal/app.py --argv "fast --intervention=route2 --seed=43 --steps=60 --eval-ablate-every=10 --out-tag=_modal_verify"
+# compare its per_mode_deploy.json to the local-box artifact for the same args
+modal run modal/launch.py                           # all 12; or --only 127,134
+```
+
+## Getting the outputs back
+
+Every run writes its full artifact set to the Volume, mirroring the local layout:
+
+- `out/runs/<ts>_<slug>/` — `per_mode_deploy.json`, `train.safetensors`,
+  `first_hack.safetensors`, `rollouts.jsonl`, periodic `ckpt_step*.safetensors`
+- `logs/<ts>_<slug>.log` — the full verbose log
+
+`launch.py` pulls each job's whole run dir + log down to the local `out/runs/` and
+`logs/` as it finishes (so they land exactly where train.py would have written
+them). For ad-hoc runs (warm/smoke/`--argv`) or a full re-sync:
+
+```bash
+python modal/fetch.py                 # all of out/runs + logs
+python modal/fetch.py <ts>_<slug>     # one run
+```
+
+## Caveat — keep the inventory fresh
+
+`launch.py::JOBS` is copied verbatim from the 2026-06-06 manifest. The live plan
+has since evolved (135 → per-token ablation; 136/137 added; n=3 fan-out gated on
+the s43 control read). Refresh the argv map from the current manifest / `pueue
+status` before the real fan-out — it's just data.
diff --git a/modal/app.py b/modal/app.py
new file mode 100644
index 0000000..4bc7ce2
--- /dev/null
+++ b/modal/app.py
@@ -0,0 +1,213 @@
+"""Modal port of the vGROUT GRPO runs (jobs 124-135 of the 2026-06-06 manifest).
+
+Why: every run currently goes through pueue on the single 96GB box, serially
+(~2 days for the 12-run paper sweep). Modal fans them out as independent GPU
+containers so the whole sweep finishes in one run's wall-clock.
+
+Design notes / deliberate choices (see modal/README.md for the runbook):
+  - GPU = ["H100", "A100-80GB"] fallback list (both 80GB). The full preset peaked
+    ~73GB bf16 on the local card with flash-attn; the `fast` preset the manifest
+    uses is lighter. Set VGROUT_GPU=H200 (141GB) if a long run OOMs.
+  - torch 2.7 (NOT the repo's pinned 2.8). Dao-AILab ships no cp313+torch2.8
+    flash-attn wheel; 2.8.3 tops out at torch2.7 for cp313. The official Dao
+    wheel bundles sm_80/86/90, so it runs on A100/H100 -- unlike the repo's
+    Blackwell sm_120-only pin. This keeps train.py's `flash_attention_2` path
+    working with ZERO patch to the research code.
+  - No vllm (generation is HF .generate; nothing in src/vgrout imports vllm) and
+    no causal-conv1d (that wheel is for Qwen3.5's gated-delta-net; the model here
+    is Qwen3-4B, standard attention).
+  - One Modal Volume holds the HF model cache, the SVD basis cache, and out/
+    (inputs uploaded once via upload_inputs.py, run artifacts written back).
+    Containers reuse it, so the model downloads once and the svd_cache computes
+    once.
+
+Usage:
+  modal run modal/app.py::warm                 # download model + build svd_cache once
+  modal run modal/app.py::smoke                # 4-step route2 sanity on the real model
+  modal run modal/app.py::train --argv "fast --intervention=route2 --seed=43 --steps=60 ..."
+  modal run modal/launch.py                    # fan out jobs 124-135 (see launch.py)
+"""
+from __future__ import annotations
+
+import os
+import shlex
+import subprocess
+import time
+from pathlib import Path
+
+import modal
+
+# ---------------------------------------------------------------------------
+# Image
+# ---------------------------------------------------------------------------
+# cp313 to match the repo's python pin (and the flash-attn wheel abi tag).
+TORCH = "2.7.1"
+FLASH_ATTN_WHL = (
+    "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/"
+    "flash_attn-2.8.3+cu12torch2.7cxx11abiTRUE-cp313-cp313-linux_x86_64.whl"
+)
+
+image = (
+    modal.Image.debian_slim(python_version="3.13")
+    .apt_install("git")
+    .pip_install(
+        f"torch=={TORCH}",
+        index_url="https://download.pytorch.org/whl/cu126",
+    )
+    .pip_install(
+        # transformers from main: Qwen3 support + the gated-delta-net fix the repo
+        # pins. Qwen3-4B itself is stable on main.
+        "transformers @ git+https://github.com/huggingface/transformers.git",
+        "einops>=0.8",
+        "jaxtyping>=0.2",
+        "beartype>=0.18",
+        "loguru>=0.7",
+        "polars>=1.0",
+        "tabulate>=0.9",
+        "tyro>=0.8",
+        "tqdm>=4.66",
+        "numpy<2.0",
+        "datasets>=3.0",
+        "huggingface_hub>=0.24",
+        "wandb>=0.18",
+        "peft>=0.13",
+        "flash-linear-attention>=0.5.0",
+        "safetensors>=0.4",
+    )
+    # flash-attn last, after torch is present (no build isolation -> uses the wheel).
+    .pip_install(FLASH_ATTN_WHL)
+    # Research code mounted at runtime so local edits sync without an image rebuild.
+    # Only src/ is needed on PYTHONPATH; data + caches live on the Volume. Anchored
+    # to the repo (not CWD) so `modal run` works from any directory.
+    .add_local_dir(str(Path(__file__).parent.parent / "src"), "/root/src", copy=False)
+)
+
+app = modal.App("vgrout", image=image)
+
+# Single shared Volume: model cache + svd basis cache + out/ (inputs + artifacts).
+cache = modal.Volume.from_name("vgrout-cache", create_if_missing=True)
+CACHE = "/cache"
+
+# HF needs a token only for gated repos; Qwen3-4B is public, so no Secret is attached
+# (the `if False` keeps it off). Flip it once a `vgrout-secrets` Secret exists (wandb / private mirrors).
+SECRETS = [modal.Secret.from_name("vgrout-secrets", required_keys=[])] if False else []
+
+# Fallback list: on a 12-way fan-out H100 capacity can queue; A100-80GB is also
+# 80GB and the Dao flash-attn cu12torch2.7 wheel bundles sm_80, so it runs
+# unmodified. Deploy hack/solve numbers are hardware-independent (only wall-clock
+# differs), so mixed hardware doesn't pollute the comparison. Override with
+# VGROUT_GPU=H200 for a job that OOMs on 80GB.
+GPU = os.environ["VGROUT_GPU"] if "VGROUT_GPU" in os.environ else ["H100", "A100-80GB"]
+TIMEOUT = 6 * 60 * 60  # 6h; longest manifest run is 200 steps
+
+
+def _prepare_workdir() -> str:
+    """Point train.py's relative paths (svd_cache/, out/, logs/) at the Volume.
+
+    train.py uses CACHE_ROOT=Path("svd_cache"), OUT_DIR=Path("out"),
+    LOGS_DIR=Path("logs"), all relative to CWD. We run from an ephemeral /work
+    and symlink those three names onto the persistent Volume so the model cache,
+    the SVD basis, the uploaded inputs (out/pairsets, out/pools, out/vhack), and
+    the run artifacts (out/runs/*) all live on /cache.
+    """
+    for sub in ("svd_cache", "out", "logs", "hf"):
+        Path(f"{CACHE}/{sub}").mkdir(parents=True, exist_ok=True)
+    work = Path("/work")
+    work.mkdir(exist_ok=True)
+    # external/ holds the read-only LeetCode dataset (uploaded to the Volume by
+    # upload_inputs.py); train.py reads it via the relative path
+    # external/rl-rewardhacking/results/data/*.jsonl.
+    for name in ("svd_cache", "out", "logs", "external"):
+        link = work / name
+        if not link.exists():
+            link.symlink_to(f"{CACHE}/{name}")
+    return str(work)
+
+
+def _run_train(argv: list[str]) -> dict:
+    """Run `python -m vgrout.train <argv>` against the Volume, return the run's
+    per_mode_deploy.json + path + wall-clock. Fail-fast: nonzero exit raises."""
+    work = _prepare_workdir()
+    env = {
+        **os.environ,
+        "PYTHONPATH": "/root/src",
+        "HF_HOME": f"{CACHE}/hf",
+        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
+        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
+    }
+    runs_before = set(Path(f"{CACHE}/out/runs").glob("*")) if Path(f"{CACHE}/out/runs").exists() else set()
+
+    t0 = time.time()
+    print(f"[vgrout] train {' '.join(argv)}", flush=True)
+    try:
+        subprocess.run(
+            ["python", "-m", "vgrout.train", *argv],
+            cwd=work, env=env, check=True,
+        )
+    finally:
+        # Persist even on failure: the model download into /cache/hf and the
+        # svd_cache happen before most failure points, so a crashed run still
+        # warms those caches for the retry.
+        cache.commit()
+    wall_s = time.time() - t0
+
+    runs_after = set(Path(f"{CACHE}/out/runs").glob("*"))
+    new_runs = sorted(runs_after - runs_before, key=lambda p: p.stat().st_mtime)
+    if not new_runs:
+        raise RuntimeError("train produced no out/runs/<dir> -- did it crash before the run dir was made?")
+    run_dir = new_runs[-1]
+    pmd_path = run_dir / "per_mode_deploy.json"
+    pmd = pmd_path.read_text() if pmd_path.exists() else None
+    # run_dir.name == the log stem (train.py: run_dir = RUNS_DIR / verbose_log.stem).
+    log_rel = f"logs/{run_dir.name}.log"
+    files = sorted(p.name for p in run_dir.iterdir())
+    print(f"[vgrout] done in {wall_s/60:.1f} min -> {run_dir.name} ({len(files)} files)", flush=True)
+    return {
+        "wall_s": wall_s,
+        "run_dir": f"out/runs/{run_dir.name}",   # volume-relative, for `modal volume get`
+        "log": log_rel,                          # volume-relative
+        "files": files,
+        "per_mode_deploy": pmd,
+    }
+
+
+@app.function(gpu=GPU, volumes={CACHE: cache}, timeout=TIMEOUT, secrets=SECRETS)
+def train(argv: str) -> dict:
+    """Run one `vgrout.train` invocation. `argv` is the CLI string after
+    `python -m vgrout.train`, e.g. "fast --intervention=route2 --seed=43 --steps=60"."""
+    return _run_train(shlex.split(argv))
+
+
+@app.function(gpu=GPU, volumes={CACHE: cache}, timeout=TIMEOUT, secrets=SECRETS)
+def warm() -> dict:
+    """Download Qwen3-4B into the Volume HF cache and build the svd_cache once,
+    by running a 1-step vanilla job. Cheap relative to the real sweep, and every
+    later container reuses both caches. Vanilla needs no pairset/vhack inputs."""
+    out = _run_train(shlex.split("fast --intervention=none --steps=1 --eval-n-prompts=2 --out-tag=_warm"))
+    cache.commit()
+    return out
+
+
+@app.function(gpu=GPU, volumes={CACHE: cache}, timeout=TIMEOUT, secrets=SECRETS)
+def smoke() -> dict:
+    """4-step real-model route2 sanity (the user's smoke gate before fan-out).
+    Needs the FastConfig default inputs on the Volume: out/pairsets/prog_wide.json
+    + out/pools/substrate (upload via modal/upload_inputs.py first)."""
+    return _run_train(shlex.split(
+        "fast --intervention=route2 --seed=43 --steps=4 --eval-ablate-every=2 "
+        "--eval-n-prompts=2 --out-tag=_modal_smoke"
+    ))
+
+
+@app.local_entrypoint()
+def main(argv: str = "", action: str = "train"):
+    """`modal run modal/app.py --action warm`
+       `modal run modal/app.py --action smoke`
+       `modal run modal/app.py --argv "fast --intervention=route2 --seed=43 --steps=60 ..."`"""
+    if action == "warm":
+        print(warm.remote())
+    elif action == "smoke":
+        print(smoke.remote())
+    else:
+        assert argv, "pass --argv 'fast --intervention=... ...'"
+        print(train.remote(argv))
diff --git a/modal/fetch.py b/modal/fetch.py
new file mode 100644
index 0000000..4513a0c
--- /dev/null
+++ b/modal/fetch.py
@@ -0,0 +1,42 @@
+"""Pull run artifacts + logs off the Modal Volume to the local box.
+
+The launcher (launch.py) already pulls each job's run dir + log as it finishes.
+Use this for ad-hoc runs (warm/smoke/manual `--argv`) or to re-sync everything.
+
+  python modal/fetch.py                 # all of out/runs + logs
+  python modal/fetch.py <stem>          # one run: out/runs/<stem>/ + logs/<stem>.log
+"""
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+VOL = "vgrout-cache"
+REPO = Path(__file__).resolve().parent.parent
+
+
+def get_dir(remote: str, local_parent: Path):
+    """modal recreates the remote leaf dir UNDER the target, so pass the parent."""
+    local_parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(["modal", "volume", "get", "--force", VOL, remote, str(local_parent)], check=False)
+
+
+def get_file(remote: str, local: Path):
+    local.parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(["modal", "volume", "get", "--force", VOL, remote, str(local)], check=False)
+
+
+def main():
+    if len(sys.argv) > 1:
+        stem = sys.argv[1]
+        get_dir(f"out/runs/{stem}", REPO / "out" / "runs")
+        get_file(f"logs/{stem}.log", REPO / "logs" / f"{stem}.log")
+    else:
+        get_dir("out/runs", REPO / "out")
+        get_dir("logs", REPO)
+    print(f"[done] pulled to {REPO}/out/runs and {REPO}/logs")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modal/launch.py b/modal/launch.py
new file mode 100644
index 0000000..0395147
--- /dev/null
+++ b/modal/launch.py
@@ -0,0 +1,82 @@
+"""Fan out the 2026-06-06 manifest's Running/Queued jobs (124-135) as parallel
+Modal containers. argv copied verbatim from docs/spec/20260606_job_manifest.md
+(the `vgrout.train` entries), so each Modal run == the pueue run it replaces.
+
+Run only AFTER `modal run modal/app.py --action smoke` is verified clean.
+
+  modal run modal/launch.py                  # all 12
+  modal run modal/launch.py --only 127,134   # a subset (verify-one path)
+
+Each container writes out/runs/<ts>_<tag>/ (incl. per_mode_deploy.json) to the Volume; this
+entrypoint pulls each run dir + log into local out/runs/ and logs/, and writes modal/results/_summary.json.
+"""
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from app import app, train  # noqa: E402  (same dir; registers the functions)
+
+VOL = "vgrout-cache"
+
+
+def _pull_dir(remote: str, local_parent: Path):
+    """modal recreates the remote leaf dir UNDER the target, so pass the parent."""
+    local_parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(["modal", "volume", "get", "--force", VOL, remote, str(local_parent)], check=False)
+
+
+def _pull_file(remote: str, local: Path):
+    local.parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(["modal", "volume", "get", "--force", VOL, remote, str(local)], check=False)
+
+# job id -> argv after `python -m vgrout.train`. Verbatim from the manifest.
+JOBS: dict[int, str] = {
+    124: "fast --intervention=route2 --seed=41 --teacher-off-step=40 --steps=200 --eval-ablate-every=20 --out-tag=_route2_toff40_s41",
+    125: "fast --intervention=route --seed=41 --v-hack-path=out/vhack/v_hack_pairset_prog_wide_randomV.safetensors --vhack-refresh-every=0 --eval-ablate-every=5 --steps=60 --out-tag=_route_randomV_s41",
+    126: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s41",
+    127: "fast --intervention=erase --seed=41 --eval-ablate-every=5 --out-tag=_erase_realv_s41",
+    128: "fast --intervention=erase --seed=41 --vhack-pairs-path=out/pairsets/null_city.json --eval-ablate-every=5 --out-tag=_erase_placebo_nullcity_s41",
+    129: "fast --intervention=none --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_none200_kl5_s41",
+    130: "fast --intervention=route2 --seed=41 --beta=1e-5 --adam-beta1=0.9 --adam-beta2=0.99 --steps=200 --eval-ablate-every=20 --out-tag=_route2200_kl5_s41",
+    131: "fast --intervention=none --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s42",
+    132: "fast --intervention=none --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --out-tag=_a5_vanilla_tmrt_s43",
+    133: "fast --intervention=route2 --seed=42 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s42",
+    134: "fast --intervention=route2 --seed=43 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --v-hack-path=out/vhack/v_hack_a5_runtests.safetensors --steps=200 --eval-ablate-every=10 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_route2_teacheronly_s43",
+    135: "fast --intervention=route2 --seed=41 --teacher-pool-dir=out/pools/substrate --teacher-modes run_tests --route2-random-v-seed=0 --steps=200 --eval-ablate-every=20 --eval-n-prompts=24 --gate-anchor-teacher-only --out-tag=_a5_haar_d0_teacheronly_s41",
+}
+
+
+@app.local_entrypoint()
+def main(only: str = ""):
+    ids = [int(x) for x in only.split(",")] if only else sorted(JOBS)
+    print(f"[launch] spawning {len(ids)} jobs: {ids}")
+
+    # spawn = non-blocking; all run concurrently (subject to your Modal limits).
+    handles = {jid: train.spawn(JOBS[jid]) for jid in ids}
+
+    # Mirror the Volume layout locally so downloaded runs sit where train.py would
+    # have written them (out/runs/<stem>/, logs/<stem>.log).
+    repo = Path(__file__).parent.parent
+    results = {}
+    for jid, h in handles.items():
+        try:
+            res = h.get()  # blocks until this container finishes
+            results[jid] = {"ok": True, **res}
+            # Pull the FULL run dir (ckpts, rollouts, per_mode_deploy.json) + the log.
+            _pull_dir(res["run_dir"], repo / "out" / "runs")     # recreates <stem>/ under out/runs
+            _pull_file(res["log"], repo / res["log"])
+            print(f"[ok] job {jid}: {res['wall_s']/60:.1f} min -> {res['run_dir']} ({len(res['files'])} files)")
+        except Exception as e:
+            results[jid] = {"ok": False, "error": repr(e)}
+            print(f"[FAIL] job {jid}: {e!r}")
+
+    out_dir = Path(__file__).parent / "results"
+    out_dir.mkdir(exist_ok=True)
+
+    (out_dir / "_summary.json").write_text(json.dumps(results, indent=2, default=str))
+    n_ok = sum(r["ok"] for r in results.values())
+    print(f"[launch] {n_ok}/{len(ids)} ok. artifacts in {out_dir}")
diff --git a/modal/upload_inputs.py b/modal/upload_inputs.py
new file mode 100644
index 0000000..827882f
--- /dev/null
+++ b/modal/upload_inputs.py
@@ -0,0 +1,54 @@
+"""Push the gitignored run INPUTS to the Modal Volume.
+
+Run this from a box that actually has the artifacts (the 96GB GPU box). The
+queued jobs read these from out/ at train time; on Modal that out/ is the Volume.
+
+What gets uploaded (the out/ dirs are small -- KB to a few hundred MB):
+  out/pairsets/        hand-authored persona contrastive pairs (prog_wide.json, null_city.json, heldout_known_runtests.json, ...)
+  out/vhack/           extracted hack-direction bases (v_hack_a5_runtests.safetensors,
+                       v_hack_pairset_prog_wide_randomV.safetensors, ...)
+  out/pools/           teacher rollout pools (substrate/, teacher_pool/)
+  external/rl-rewardhacking/results/data/   the LeetCode problems (train/test/holdout jsonl)
+
+NOT uploaded here (the Volume builds these itself, see app.py::warm):
+  the Qwen3-4B weights  -> downloaded from HF into /cache/hf on first run
+  svd_cache/            -> computed once on Modal and cached
+
+Usage (on the GPU box, after `pip install modal` + `modal token new`):
+  python modal/upload_inputs.py
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import modal
+
+# Volume paths are relative to the volume ROOT (which mounts at /cache in the
+# container), so we upload to "out/..." NOT "/cache/out/...".
+DIRS = [
+    "out/pairsets",
+    "out/vhack",
+    "out/pools",
+    "external/rl-rewardhacking/results/data",  # the LeetCode problems (train/test/holdout jsonl)
+]
+
+
+def main():
+    vol = modal.Volume.from_name("vgrout-cache", create_if_missing=True)
+    repo = Path(__file__).resolve().parent.parent
+    present = [(repo / d) for d in DIRS if (repo / d).exists()]
+    missing = [d for d in DIRS if not (repo / d).exists()]
+    if missing:
+        print(f"[warn] absent locally, skipping: {missing}")
+    assert present, f"none of {DIRS} exist under {repo} -- run from the box that has the artifacts"
+
+    with vol.batch_upload(force=True) as batch:
+        for local in present:
+            remote = str(local.relative_to(repo))  # e.g. "out/pools"
+            print(f"[upload] {local} -> {remote}")
+            batch.put_directory(str(local), remote)
+    print("[done] inputs on Volume. Verify: modal volume ls vgrout-cache out")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 5aca5d1..d416643 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     # release with Blackwell sm_120 kernels (consumer RTX PRO 6000). Pinned to
     # mjun0812 prebuilds — see [tool.uv.sources] below.
     "flash-attn",
+    "modal>=1.4.3",
 ]
 
 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index 7ccefff..bbe3007 100644
--- a/uv.lock
+++ b/uv.lock
@@ -8,7 +8,7 @@ resolution-markers = [
 ]
 
 [options]
-exclude-newer = "2026-05-24T00:00:00Z"
+exclude-newer = "2026-05-23T16:00:00Z"
 
 [[package]]
 name = "accelerate"
@@ -709,6 +709,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/7a/1c6e3562dfd8950adbb11ffbc65d21e7c89d01a6e4f137fa981056de25c5/gitpython-3.1.50-py3-none-any.whl", hash = "sha256:d352abe2908d07355014abdd21ddf798c2a961469239afec4962e9da884858f9", size = 212507, upload-time = "2026-05-06T04:01:23.799Z" },
 ]
 
+[[package]]
+name = "grpclib"
+version = "0.4.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "h2" },
+    { name = "multidict" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/28/5a2c299ec82a876a252c5919aa895a6f1d1d35c96417c5ce4a4660dc3a80/grpclib-0.4.9.tar.gz", hash = "sha256:cc589c330fa81004c6400a52a566407574498cb5b055fa927013361e21466c46", size = 84798, upload-time = "2025-12-14T22:23:14.349Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/90/b0cbbd9efcc82816c58f31a34963071aa19fb792a212a5d9caf8e0fc3097/grpclib-0.4.9-py3-none-any.whl", hash = "sha256:7762ec1c8ed94dfad597475152dd35cbd11aecaaca2f243e29702435ca24cf0e", size = 77063, upload-time = "2025-12-14T22:23:13.224Z" },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -718,6 +731,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
 ]
 
+[[package]]
+name = "h2"
+version = "4.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "hpack" },
+    { name = "hyperframe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" },
+]
+
 [[package]]
 name = "hf-xet"
 version = "1.5.0"
@@ -742,6 +768,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" },
 ]
 
+[[package]]
+name = "hpack"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" },
+]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -805,6 +840,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" },
 ]
 
+[[package]]
+name = "hyperframe"
+version = "6.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" },
+]
+
 [[package]]
 name = "idna"
 version = "3.16"
@@ -1084,6 +1128,30 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/82/11fd62a8d7a3e96e5c43220b17de0151e3f10101f8bb3b865f5bd9cdd074/mlx_metal-0.31.2-py3-none-macosx_26_0_arm64.whl", hash = "sha256:84ffb60ee503f03eb684f5fb168d5cff31e2a16b7f27c1731eaf7662bd6e9b46", size = 55792151, upload-time = "2026-04-22T03:14:22.059Z" },
 ]
 
+[[package]]
+name = "modal"
+version = "1.4.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp" },
+    { name = "cbor2" },
+    { name = "certifi" },
+    { name = "click" },
+    { name = "grpclib" },
+    { name = "protobuf" },
+    { name = "rich" },
+    { name = "synchronicity" },
+    { name = "toml" },
+    { name = "types-certifi" },
+    { name = "types-toml" },
+    { name = "typing-extensions" },
+    { name = "watchfiles" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c6/7d/4126d0fe879ef3e86002ca821a34cb68a2588ea2e8ccb2bfe421d0f42ffe/modal-1.4.3.tar.gz", hash = "sha256:35b2fc840f759b512e12527afb538e1ea4cc232b84cfbfcef3f5d96d5a66abaa", size = 720488, upload-time = "2026-05-18T22:34:45.842Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/56/54/400262056c144ceee5edab40efa2541ae8928ae5f244fd9025f3ad26c909/modal-1.4.3-py3-none-any.whl", hash = "sha256:802917181f576458a0cb833322157dab09c4f367326426c5a732661a0c519577", size = 826232, upload-time = "2026-05-18T22:34:43.335Z" },
+]
+
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -2372,6 +2440,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
 ]
 
+[[package]]
+name = "synchronicity"
+version = "0.12.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f9/5e/50ea27817003665c7cc4f5bdad309f13d6329037f657848ee87fe06c3740/synchronicity-0.12.2.tar.gz", hash = "sha256:6fd605a5035d1ec74ce48fffaca80ea00345c84ca34223914e2436fb4f162ff9", size = 60018, upload-time = "2026-04-06T15:06:15.447Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/41/44/4f6ba4e2c171847e6f9a460213b196bbf26edea43d0e66889c7ccc55d368/synchronicity-0.12.2-py3-none-any.whl", hash = "sha256:9dbaca81fb7f2b57c6dea326e514e1c80e9ccfd9c9618515e84fa6091026273b", size = 41312, upload-time = "2026-04-06T15:06:14.459Z" },
+]
+
 [[package]]
 name = "tabulate"
 version = "0.10.0"
@@ -2433,6 +2513,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
 ]
 
+[[package]]
+name = "toml"
+version = "0.10.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/be/ba/1f744cdc819428fc6b5084ec34d9b30660f6f9daaf70eead706e3203ec3c/toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f", size = 22253, upload-time = "2020-11-01T01:40:22.204Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" },
+]
+
 [[package]]
 name = "torch"
 version = "2.8.0"
@@ -2577,6 +2666,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/f9/2b3ff4e56e5fa7debfaf9eb135d0da96f3e9a1d5b27222223c7296336e5f/typer-0.25.1-py3-none-any.whl", hash = "sha256:75caa44ed46a03fb2dab8808753ffacdbfea88495e74c85a28c5eefcf5f39c89", size = 58409, upload-time = "2026-04-30T19:32:18.271Z" },
 ]
 
+[[package]]
+name = "types-certifi"
+version = "2021.10.8.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/52/68/943c3aeaf14624712a0357c4a67814dba5cea36d194f5c764dad7959a00c/types-certifi-2021.10.8.3.tar.gz", hash = "sha256:72cf7798d165bc0b76e1c10dd1ea3097c7063c42c21d664523b928e88b554a4f", size = 2095, upload-time = "2022-06-09T15:19:05.244Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b5/63/2463d89481e811f007b0e1cd0a91e52e141b47f9de724d20db7b861dcfec/types_certifi-2021.10.8.3-py3-none-any.whl", hash = "sha256:b2d1e325e69f71f7c78e5943d410e650b4707bb0ef32e4ddf3da37f54176e88a", size = 2136, upload-time = "2022-06-09T15:19:03.127Z" },
+]
+
+[[package]]
+name = "types-toml"
+version = "0.10.8.20260518"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4b/11/6ece999e91f2ccb848ab4420f3f4816e78ac0541f739e6864affdaaa5737/types_toml-0.10.8.20260518.tar.gz", hash = "sha256:80e10facd24fdeda9d5c672187d72be3ac284843788d67f5aae59e3e016db6fe", size = 9419, upload-time = "2026-05-18T06:02:16.719Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/91/25/489751806bf5c95e4007f8e17409199c54d31e49ffbea07c5729b1286c8e/types_toml-0.10.8.20260518-py3-none-any.whl", hash = "sha256:0e564ab05f6fde62a315b3b5a9b6624fda569399795d30a37e64705a70459303", size = 9669, upload-time = "2026-05-18T06:02:15.86Z" },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"
@@ -2682,6 +2789,7 @@ dependencies = [
     { name = "huggingface-hub" },
     { name = "jaxtyping" },
     { name = "loguru" },
+    { name = "modal" },
     { name = "numpy" },
     { name = "peft" },
     { name = "polars" },
@@ -2709,6 +2817,7 @@ requires-dist = [
     { name = "huggingface-hub", specifier = ">=0.24" },
     { name = "jaxtyping", specifier = ">=0.2" },
     { name = "loguru", specifier = ">=0.7" },
+    { name = "modal", specifier = ">=1.4.3" },
     { name = "numpy", specifier = "<2.0" },
     { name = "peft", specifier = ">=0.13" },
     { name = "polars", specifier = ">=1.0" },