From d96367ca5d8c9423ef1a5490e0ae90da5630c5d2 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 7 Jun 2026 09:45:17 +0800 Subject: [PATCH] modal: mount leetcode data from image; correct 2873b37 hang claim Data fix: the read-only LeetCode jsonls (44MB, tracked in the rl-rewardhacking submodule) now mount from the local checkout into the image (add_local_dir, copy=False) instead of the Volume. A Volume mount/reload race FileNotFound'd them mid-sweep even though they were committed; versioning the dataset with the code removes that failure mode. Volume now carries only mutable dirs. Verified: both a vanilla warm and a routeV smoke load data fine on the new image. Correction: 2873b37's message claimed "the smoke on pinned 5.10.2 clears the deadlock point" -- it did NOT, the smoke hung. And transformers is not the cause: on this exact 5.10.2 image, vanilla completes generate (warm, 6.8 min, exit 0) while routeV deadlocks at its first rollout generate(). Same image, same attn, same data -- the hang is routeV-specific (v_grad extraction's CUDA state x flash-attn first-generate on torch 2.7.1; local box runs routeV fine on 2.8). Known-issue section + corrected app.py comment record this. Local box produces the canonical routeV runs; Modal is proven for vanilla. 
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- modal/README.md | 25 +++++++++++++++++++++++++ modal/app.py | 40 +++++++++++++++++++++++++++------------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/modal/README.md b/modal/README.md index f2622d9..0f42cf1 100644 --- a/modal/README.md +++ b/modal/README.md @@ -86,6 +86,31 @@ python modal/fetch.py # all of out/runs + logs python modal/fetch.py _ # one run ``` +## Known issue — routeV deadlocks at first generate() on Modal + +**vanilla works, routeV hangs, on the identical image.** On the `transformers==5.10.2` ++ Dao flash-attn 2.8.3 (torch 2.7.1) image, a vanilla warm completes generate cleanly +(6.8 min, exit 0, `per_mode_deploy.json` written), but a routeV smoke freezes at its +first rollout `generate()` indefinitely (killed at ~11 min). Same image, same attn, +same data, same delta_S init — the only difference is the arm. + +It is NOT: the data path (both arms load fine now), the attn backend (the other agent +reproduced the hang under sdpa too, commit 2f91561), transformers version (vanilla +runs on this exact 5.10.2), or the generate-time forward hook (routeV's `grad_probe` +branch is skipped under `generate()`'s no_grad, so both arms run the identical +`_delta_hook` else-path). What's left: routeV extracts `v_grad` fresh before the loop +(forward+backward across 252 modules), and that CUDA/allocator state collides with +flash-attn's first generate on torch 2.7.1. The **local box runs routeV fine on torch +2.8** — so this is a torch-2.7-on-Modal deadlock, not a method bug. + +Correction to commit 2873b37: its message claimed "the smoke on pinned 5.10.2 clears +the deadlock point." It does not — the smoke hung. transformers was never the cause. + +**Current call:** Modal is proven for vanilla; the local box produces the canonical +routeV runs (jobs 134/135). 
Don't sink Modal $ into the routeV hang unless we +specifically want the parallelism — the fix is a focused torch-2.7 generate debug +(candidate: `torch.cuda.empty_cache()` at the extraction→loop boundary, unverified). + ## Caveat — keep the inventory fresh `launch.py::JOBS` is copied verbatim from the 2026-06-06 manifest. The live plan diff --git a/modal/app.py b/modal/app.py index 423d6a6..61c971e 100644 --- a/modal/app.py +++ b/modal/app.py @@ -37,6 +37,8 @@ from pathlib import Path import modal +REPO = Path(__file__).parent.parent + # --------------------------------------------------------------------------- # Image # --------------------------------------------------------------------------- @@ -55,12 +57,13 @@ image = ( index_url="https://download.pytorch.org/whl/cu126", ) .pip_install( - # transformers: pinned released version, NOT floating `@ main` (a later main - # commit is what hung generate() -- my v60 ran clean on an earlier main, the - # other agent confirmed the hang is the transformers commit not the attn - # backend). 5.10.2 is the patch line of my verified v60 build (5.10.0.dev0). - # uv.lock keeps the exact 5.8.0.dev0 commit for the local box's fine-grained - # repro; the image uses a released wheel. Qwen3-4B needs no main-only feature. + # transformers: pinned released version, NOT floating `@ main`. uv.lock keeps + # the exact 5.8.0.dev0 commit for the local box; the image uses a released + # wheel (Qwen3-4B needs no main-only feature). NB: transformers is NOT the + # cause of the routeV generate() hang -- on this exact 5.10.2 image, vanilla completes + # generate fine (warm, 6.8 min, exit 0) while routeV deadlocks at its first + # rollout generate. The hang is routeV-specific (v_grad extraction's CUDA + # state x flash-attn first-generate on torch 2.7.1); see modal/README.md. "transformers==5.10.2", "einops>=0.8", "jaxtyping>=0.2", @@ -81,9 +84,14 @@ image = ( # flash-attn last, after torch is present (no build isolation -> uses the wheel). 
.pip_install(FLASH_ATTN_WHL) # Research code mounted at runtime so local edits sync without an image rebuild. - # Only src/ is needed on PYTHONPATH; data + caches live on the Volume. Anchored - # to the repo (not CWD) so `modal run` works from any directory. - .add_local_dir(str(Path(__file__).parent.parent / "src"), "/root/src", copy=False) + # Only src/ is needed on PYTHONPATH; mutable caches (svd_cache/out/logs) live on + # the Volume. Anchored to the repo (not CWD) so `modal run` works from anywhere. + .add_local_dir(str(REPO / "src"), "/root/src", copy=False) + # Read-only LeetCode dataset (44MB, 3 jsonls, tracked in the rl-rewardhacking + # submodule). Mount from the image, NOT the Volume: a Volume mount/reload race + # FileNotFound'd it mid-sweep even though the file was committed. Versioning it + # with the code makes the dataset deterministic and removes that failure mode. + .add_local_dir(str(REPO / "external/rl-rewardhacking/results/data"), "/root/leetcode_data", copy=False) ) app = modal.App("vgrout", image=image) @@ -118,13 +126,19 @@ def _prepare_workdir() -> str: Path(f"{CACHE}/{sub}").mkdir(parents=True, exist_ok=True) work = Path("/work") work.mkdir(exist_ok=True) - # external/ holds the read-only LeetCode dataset (uploaded to the Volume by - # upload_inputs.py); train.py reads it via the relative path - # external/rl-rewardhacking/results/data/*.jsonl. - for name in ("svd_cache", "out", "logs", "external"): + # Mutable dirs live on the Volume (model cache, svd basis, uploaded inputs in + # out/, written out/runs/*). Symlinked from the ephemeral /work cwd. + for name in ("svd_cache", "out", "logs"): link = work / name if not link.exists(): link.symlink_to(f"{CACHE}/{name}") + # Read-only LeetCode dataset comes from the image mount (/root/leetcode_data), + # not the Volume -- train.py reads external/rl-rewardhacking/results/data/*.jsonl + # relative to cwd, so symlink that leaf dir onto the deterministic image copy. 
+ data = work / "external/rl-rewardhacking/results/data" + data.parent.mkdir(parents=True, exist_ok=True) + if not data.exists(): + data.symlink_to("/root/leetcode_data") return str(work)