Files
evil_MoE/scripts/migrate_out_dirs.py
T
wassname 4621488cc0 reorg: out/ sorted by datatype (vhack/ pools/ runs/ vhack_grads/ figs/)
Code writes+reads the new scheme; migrate_out_dirs.py moved 225 loose artifacts
(0 left at top level). Per-run checkpoints+rollouts now group under
runs/<ts>_<run_id>/ as train.safetensors/rollouts.jsonl. Figures land in
out/figs/ with a stable docs/figs/<name>.png symlink (figs.link_latest).
justfile also gains run-cell REFRESH param (online-erasure arm). Smoke +
smoke-vanilla + results all green on new paths. Requeue manifest preserves the
why/resolve labels that pueue reset wiped.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 03:52:24 +00:00

91 lines
3.5 KiB
Python

"""One-shot out/ migration to the datatype-sorted scheme (spec 20260530_out_dir_reorg).
Sorts loose out/ files into subdirs:

    v_hack_*.safetensors            -> out/vhack/
    vhack_grads_*, vhack_heldout*   -> out/vhack_grads/
    *.png                           -> out/figs/
    out/probe_distill/<pool>/       -> out/pools/<pool>/
    train_<tag>{,_first_hack}.safetensors + rollouts_<tag>.jsonl
                                    -> out/runs/<log_stem>/  (ts matched from logs/*<tag>.log)
    pairs_*.json                    -> out/pairsets/

Per-train-run artifacts (checkpoint + rollouts) group under the SAME run dir as
their log's <ts>_<run_id> stem, by matching the out_tag suffix. Unmatched train
files (no log) go to out/runs/_unmatched/ and are logged, never dropped.

    uv run python scripts/migrate_out_dirs.py            # dry-run (prints plan)
    uv run python scripts/migrate_out_dirs.py --apply    # actually move
"""
from __future__ import annotations
import shutil
import sys
from pathlib import Path
from loguru import logger
# Root of the artifact tree being reorganised (relative to the current working directory).
OUT = Path("out")
# Directory of run logs; a log's file stem names the run dir its artifacts move into.
LOGS = Path("logs")
# True when "--apply" appears anywhere on the command line; otherwise the script only prints its plan.
APPLY = "--apply" in sys.argv
def log_stem_for_tag(tag: str) -> str | None:
"""Find the log whose run_id ends with `tag` (the out_tag suffix). Returns its stem."""
cands = sorted(LOGS.glob(f"*{tag}.log"))
# Prefer an exact suffix match on the stem (run_id = <preset>_<arm>_seed<n><tag>).
exact = [p for p in cands if p.stem.endswith(tag)]
chosen = (exact or cands)
return chosen[-1].stem if chosen else None # newest if several
def _out_tag_for(name: str) -> str:
    """Recover the out_tag suffix from a ``train_*`` / ``rollouts_*`` file name.

    The tag keeps its leading underscore so it can be compared against the end
    of a log stem: ``train_foo_first_hack.safetensors`` and
    ``rollouts_foo.jsonl`` both give ``"_foo"``.
    """
    # Drop only the final extension; cutting at the first "." would shorten a
    # dotted tag such as "lr0.5" to "lr0".
    stem = Path(name).stem
    if stem.startswith("train"):
        tag = stem[len("train"):]
    else:
        tag = "_" + stem[len("rollouts_"):]
    # "_first_hack" only decorates the end of a checkpoint name; strip it there
    # and nowhere else so a tag that merely contains the text stays intact.
    marker = "_first_hack"
    if tag.endswith(marker):
        tag = tag[: -len(marker)]
    return tag


def plan_moves() -> list[tuple[Path, Path]]:
    """Build the list of (source, destination) moves without touching the disk.

    Loose files directly under ``OUT`` are routed by name into ``vhack/``,
    ``vhack_grads/``, ``figs/``, ``pairsets/`` or ``runs/<log_stem>/``; files
    matching no rule are left in place and reported with a warning. Every entry
    of ``OUT/probe_distill`` is routed to ``pools/`` (``figs/`` for ``.png``).

    Returns:
        Moves in sorted source order, loose files first, then probe_distill entries.
    """
    moves: list[tuple[Path, Path]] = []
    for f in sorted(OUT.glob("*")):
        # Existing subdirectories are not loose artifacts; leave them alone.
        if f.is_dir():
            continue
        n = f.name
        if n.startswith("v_hack_") and n.endswith(".safetensors"):
            moves.append((f, OUT / "vhack" / n))
        elif n.startswith(("vhack_grads_", "vhack_heldout")):
            moves.append((f, OUT / "vhack_grads" / n))
        elif n.endswith(".png"):
            moves.append((f, OUT / "figs" / n))
        elif n.startswith("pairs_") and n.endswith(".json"):
            moves.append((f, OUT / "pairsets" / n))
        elif n.startswith(("train_", "rollouts_")):
            # tag = out_tag suffix shared by the file and its log.
            tag = _out_tag_for(n)
            # An empty tag cannot identify a run, so do not ask for a log match.
            log_stem = log_stem_for_tag(tag) if tag else None
            dest_dir = OUT / "runs" / (log_stem or "_unmatched")
            moves.append((f, dest_dir / n))
        else:
            logger.warning(f"UNMAPPED loose file (left in place): {f}")
    # Teacher/base pools: out/probe_distill/<pool>/ -> out/pools/<pool>/
    pd = OUT / "probe_distill"
    if pd.is_dir():
        for sub in sorted(pd.iterdir()):
            dst = OUT / ("figs" if sub.suffix == ".png" else "pools") / sub.name
            moves.append((sub, dst))
    return moves
def main() -> None:
    """Log every planned move and, when ``APPLY`` is set, perform it.

    A move whose destination already exists is skipped with a warning, so
    nothing is overwritten. The closing summary counts performed (or, in a
    dry run, planned) moves separately from skipped ones.
    """
    done = 0
    skipped = 0
    for src, dst in plan_moves():
        if dst.exists():
            logger.warning(f"SKIP (dest exists): {dst}")
            skipped += 1
            continue
        logger.info(f"{'MOVE' if APPLY else 'PLAN'}: {src} -> {dst}")
        if APPLY:
            # Destination subdirs (vhack/, runs/<stem>/, ...) may not exist yet.
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), str(dst))
        done += 1
    # Report skipped moves separately; counting them as applied would overstate
    # how much of out/ was actually reorganised.
    logger.info(f"{'APPLIED' if APPLY else 'DRY-RUN'}: {done} moves, "
                f"{skipped} skipped (dest exists). "
                f"{'' if APPLY else 'Re-run with --apply to execute.'}")


if __name__ == "__main__":
    main()