"""Push the gitignored run INPUTS to the Modal Volume. Run this from a box that actually has the artifacts (the 96GB GPU box). The queued jobs read these from out/ at train time; on Modal that out/ is the Volume. What gets uploaded (all small -- KB to a few hundred MB): out/pairsets/ hand-authored persona contrastive pairs (prog_wide.json, heldout_known_runtests.json, ...) data/pairs/ hand-authored Markdown pairsets out/vhack/ extracted hack-direction bases (v_hack_a5_runtests.safetensors, v_hack_pairset_prog_wide_randomV.safetensors, ...) out/pools/ teacher rollout pools (substrate/, teacher_pool/) NOT uploaded here (the Volume builds these itself, see app.py::warm): the Qwen3-4B weights -> downloaded from HF into /cache/hf on first run svd_cache/ -> computed once on Modal and cached Usage (on the GPU box, after `pip install modal` + `modal token new`): python modal/upload_inputs.py """ from __future__ import annotations from pathlib import Path import modal # Volume paths are relative to the volume ROOT (which mounts at /cache in the # container), so we upload to "out/..." NOT "/cache/out/...". DIRS = [ "out/pairsets", "data/pairs", "out/vhack", "out/pools", "external/rl-rewardhacking/results/data", # the LeetCode problems (train/test/holdout jsonl) ] def main(): vol = modal.Volume.from_name("vgrout-cache", create_if_missing=True) repo = Path(__file__).resolve().parent.parent present = [(repo / d) for d in DIRS if (repo / d).exists()] missing = [d for d in DIRS if not (repo / d).exists()] if missing: print(f"[warn] absent locally, skipping: {missing}") assert present, f"none of {DIRS} exist under {repo} -- run from the box that has the artifacts" with vol.batch_upload(force=True) as batch: for local in present: remote = str(local.relative_to(repo)) # e.g. "out/pools" print(f"[upload] {local} -> {remote}") batch.put_directory(str(local), remote) print("[done] inputs on Volume. Verify: modal volume ls vgrout-cache out") if __name__ == "__main__": main()