diff --git a/src/vgrout/train.py b/src/vgrout/train.py index d64eac7..0d9f357 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -549,21 +549,20 @@ def main(cfg: Config) -> int: route_solveT_run: list[float] = [] # per-step routed-share of solve teachers def save_ckpt(rows: list[dict], path: Path | None = None) -> None: - """Save a self-contained lora2r checkpoint: full A/B + the frozen init A0/B0, - so a loader reconstructs the net delta (B@A - B0@A0) and can ablate the - quarantine without any SVD cache. Config + per-step rows in the metadata.""" + """Save a lora2r checkpoint: trainable A/B only, bf16. The init A0/B0 are NOT + saved -- they're the seeded Gaussian (regenerable from lora_init_seed) and + ckpt_update0000 holds them anyway (A==A0 at step 0); nothing live reloads + them. Config + per-step rows in the metadata.""" n_gens = sum(r["N"] for r in rows) hr = sum(r["hack_s"][0] + r["hack_t"][0] for r in rows) / max(1, n_gens) pr = sum(r["gt_s"][0] + r["gt_t"][0] for r in rows) / max(1, n_gens) _ckpt = path or ckpt_path tensors = {} for n, info in wrappers.items(): - tensors[f"A/{n}"] = info["A"].detach().float().cpu().contiguous() - tensors[f"B/{n}"] = info["B"].detach().float().cpu().contiguous() - tensors[f"A0/{n}"] = info["A0"].detach().float().cpu().contiguous() - tensors[f"B0/{n}"] = info["B0"].detach().float().cpu().contiguous() + tensors[f"A/{n}"] = info["A"].detach().bfloat16().cpu().contiguous() + tensors[f"B/{n}"] = info["B"].detach().bfloat16().cpu().contiguous() save_file(tensors, str(_ckpt), metadata={ - "model": model_name, "dtype": "fp32", "step": str(len(rows)), + "model": model_name, "dtype": "bf16", "step": str(len(rows)), "hack_rate": f"{hr:.6f}", "pass_rate": f"{pr:.6f}", "rows": json.dumps(rows), "cfg": json.dumps(vars(cfg) | {"adapter": "lora2r"}, default=str), diff --git a/src/vgrout/train_config.py b/src/vgrout/train_config.py index 41dcfd0..cb157cf 100644 --- a/src/vgrout/train_config.py +++ b/src/vgrout/train_config.py @@ -37,7 +37,7 @@ class Config: # AdamW decay pulls raw A/B toward 0, not toward the init, which would drive # the net delta to -B0@A0 -- must stay 0 for this adapter. weight_decay: float = 0.0 - warmup_frac: float = 0.1 + warmup_frac: float = 0.2 grad_clip: float = 10.0 seed: int = 41 unbiased: bool = True @@ -127,8 +127,9 @@ class FastConfig(Config): prompts_per_step: int = 4 adam_beta1: float = 0.5 adam_beta2: float = 0.9 - lr: float = 5e-4 # user: bump from 1e-4 to learn faster in the short grad-starved budget - # Each lora2r ckpt is ~1.3G (A/B + redundant frozen A0/B0 for 252 modules, fp32); 20-step - # cadence keeps ~6/run for the eval curve without filling the 768G disk. (TODO: drop A0/B0 - # from ckpts -- reconstructible from lora_init_seed -- to halve size, needs a loader change.) + lr: float = 3e-4 # 5e-4 peaked at warmup-end (step ~10) and diverged; 3e-4 + 20% warmup + # Each lora2r ckpt is ~0.33G (A/B for 252 modules, bf16). A0/B0 are NOT saved -- they're + # the seeded init (regenerable from lora_init_seed in the metadata; ckpt_update0000 is the + # init since A==A0 at step 0), and nothing live reloads them (inline eval uses in-memory + # wrappers). 20-step cadence keeps ~6/run for the eval curve. save_ckpt_every: int = 20