Files
evil_MoE/modal/fetch.py
T
wassname 70aa6aa96b modal: parallel GRPO sweep port (image, volume, fan-out launcher)
Fire the paper sweep as independent H100/A100-80 containers instead of
serial pueue runs. One Volume caches model + svd + out/; train.py runs
unmodified (torch 2.7 + Dao flash-attn wheel, code mounted at runtime).
Verified: vanilla 60-step reproduces the local baseline. Skill at
~/.claude/skills/modal documents the patterns.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-06 20:30:19 +08:00

43 lines
1.4 KiB
Python

"""Pull run artifacts + logs off the Modal Volume to the local box.
The launcher (launch.py) already pulls each job's run dir + log as it finishes.
Use this for ad-hoc runs (warm/smoke/manual `--argv`) or to re-sync everything.
python modal/fetch.py # all of out/runs + logs
python modal/fetch.py <stem> # one run: out/runs/<stem>/ + logs/<stem>.log
"""
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
# Name of the Modal Volume that every `modal volume get` call in this script reads from.
VOL = "vgrout-cache"
# Directory two levels above this file; downloads land under it (out/runs/ and logs/).
REPO = Path(__file__).resolve().parent.parent
def get_dir(remote: str, local_parent: Path) -> int:
    """Download a directory from the Modal Volume into ``local_parent``.

    modal recreates the remote leaf dir UNDER the target, so pass the parent
    of where the directory should end up, not the final directory itself.

    The download is best-effort: a non-zero exit from the ``modal`` CLI does
    not raise. It is reported on stderr and the exit status is returned so
    callers that care can react to it.

    Args:
        remote: Path of the directory inside the volume (e.g. ``out/runs``).
        local_parent: Local directory to download into; created if missing.

    Returns:
        The exit status of ``modal volume get`` (0 conventionally means
        success).

    Raises:
        OSError: If the ``modal`` executable cannot be started at all
            (e.g. it is not on PATH); this comes from ``subprocess.run``.
    """
    local_parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local_parent)]
    # check=False keeps this best-effort; the exit status is inspected below
    # instead of being thrown away.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        print(
            f"[warn] modal volume get exited {result.returncode} for {remote!r}",
            file=sys.stderr,
        )
    return result.returncode
def get_file(remote: str, local: Path) -> int:
    """Download a single file from the Modal Volume to the path ``local``.

    The download is best-effort: a non-zero exit from the ``modal`` CLI does
    not raise. It is reported on stderr and the exit status is returned so
    callers that care can react to it.

    Args:
        remote: Path of the file inside the volume (e.g. ``logs/<stem>.log``).
        local: Local destination file path; its parent directory is created
            if missing.

    Returns:
        The exit status of ``modal volume get`` (0 conventionally means
        success).

    Raises:
        OSError: If the ``modal`` executable cannot be started at all
            (e.g. it is not on PATH); this comes from ``subprocess.run``.
    """
    local.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local)]
    # check=False keeps this best-effort; the exit status is inspected below
    # instead of being thrown away.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        print(
            f"[warn] modal volume get exited {result.returncode} for {remote!r}",
            file=sys.stderr,
        )
    return result.returncode
def main():
    """Entry point: fetch one run, or everything, from the volume.

    With a positional argument ``<stem>``, pulls ``out/runs/<stem>/`` and
    ``logs/<stem>.log``. With no argument, pulls all of ``out/runs`` and
    ``logs``. Destinations are rooted at ``REPO``. Arguments after the first
    are ignored.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        # No stem given: re-sync both trees. get_dir takes the *parent* of
        # the directory that should appear locally.
        get_dir("out/runs", REPO / "out")
        get_dir("logs", REPO)
    else:
        run_name = cli_args[0]
        log_name = f"{run_name}.log"
        get_dir(f"out/runs/{run_name}", REPO / "out" / "runs")
        get_file(f"logs/{log_name}", REPO / "logs" / log_name)
    print(f"[done] pulled to {REPO}/out/runs and {REPO}/logs")


if __name__ == "__main__":
    main()