mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-28 00:28:44 +08:00
70aa6aa96b
Fire the paper sweep as independent H100/A100-80 containers instead of serial pueue runs. One Volume caches model + svd + out/; train.py runs unmodified (torch 2.7 + Dao flash-attn wheel, code mounted at runtime). Verified: vanilla 60-step reproduces the local baseline. Skill at ~/.claude/skills/modal documents the patterns. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
43 lines
1.4 KiB
Python
43 lines
1.4 KiB
Python
"""Pull run artifacts + logs off the Modal Volume to the local box.

The launcher (launch.py) already pulls each job's run dir + log as it finishes.
Use this for ad-hoc runs (warm/smoke/manual `--argv`) or to re-sync everything.

    python modal/fetch.py            # all of out/runs + logs
    python modal/fetch.py <stem>     # one run: out/runs/<stem>/ + logs/<stem>.log
"""
|
|
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Name of the Modal Volume that every `modal volume get` call below reads from.
VOL = "vgrout-cache"

# Directory containing this script; the repository root is its parent.
_SCRIPT_DIR = Path(__file__).resolve().parent
REPO = _SCRIPT_DIR.parent
|
|
|
|
|
|
def get_dir(remote: str, local_parent: Path) -> bool:
    """Download a directory from the Modal Volume into ``local_parent``.

    modal recreates the remote leaf dir UNDER the target, so pass the parent.

    The download is best-effort: a failing ``modal`` invocation does not raise,
    but it is reported on stderr and reflected in the return value.

    Args:
        remote: Path of the directory inside the volume (e.g. ``out/runs``).
        local_parent: Local directory that will contain the downloaded leaf
            directory. Created (with parents) if it does not exist.

    Returns:
        True if ``modal volume get`` exited with status 0, False otherwise.
    """
    local_parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local_parent)]
    # check=False keeps the sync best-effort; the exit status is inspected
    # below instead of being silently discarded.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        print(
            f"[warn] modal volume get {remote} failed (exit {result.returncode})",
            file=sys.stderr,
        )
    return result.returncode == 0
|
|
|
|
|
|
def get_file(remote: str, local: Path) -> bool:
    """Download a single file from the Modal Volume to the path ``local``.

    The download is best-effort: a failing ``modal`` invocation does not raise,
    but it is reported on stderr and reflected in the return value.

    Args:
        remote: Path of the file inside the volume (e.g. ``logs/<stem>.log``).
        local: Local destination path. Its parent directory is created (with
            parents) if it does not exist.

    Returns:
        True if ``modal volume get`` exited with status 0, False otherwise.
    """
    local.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["modal", "volume", "get", "--force", VOL, remote, str(local)]
    # check=False keeps the sync best-effort; the exit status is inspected
    # below instead of being silently discarded.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        print(
            f"[warn] modal volume get {remote} failed (exit {result.returncode})",
            file=sys.stderr,
        )
    return result.returncode == 0
|
|
|
|
|
|
def main():
    """Fetch run artifacts and logs from the volume into the local repo tree.

    With no command-line argument, everything under ``out/runs`` and ``logs``
    is pulled. With one argument ``<stem>``, only ``out/runs/<stem>/`` and
    ``logs/<stem>.log`` are pulled. Arguments after the first are ignored.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        # Full re-sync: the leaf dirs ("runs", "logs") are recreated under the
        # local parents passed here.
        get_dir("out/runs", REPO / "out")
        get_dir("logs", REPO)
    else:
        run_name = cli_args[0]
        runs_root = REPO / "out" / "runs"
        log_target = REPO / "logs" / f"{run_name}.log"
        get_dir(f"out/runs/{run_name}", runs_root)
        get_file(f"logs/{run_name}.log", log_target)
    print(f"[done] pulled to {REPO}/out/runs and {REPO}/logs")
|
|
|
|
|
|
# Only fetch when executed as a script, so importing this module (e.g. to
# reuse get_dir / get_file) has no side effects.
if __name__ == "__main__":
    main()
|