diff --git a/src/vgrout/train.py b/src/vgrout/train.py
index d64eac7..0d9f357 100644
--- a/src/vgrout/train.py
+++ b/src/vgrout/train.py
@@ -549,21 +549,20 @@ def main(cfg: Config) -> int:
     route_solveT_run: list[float] = []  # per-step routed-share of solve teachers
 
     def save_ckpt(rows: list[dict], path: Path | None = None) -> None:
-        """Save a self-contained lora2r checkpoint: full A/B + the frozen init A0/B0,
-        so a loader reconstructs the net delta (B@A - B0@A0) and can ablate the
-        quarantine without any SVD cache. Config + per-step rows in the metadata."""
+        """Save a lora2r checkpoint: trainable A/B only, bf16. The init A0/B0 are NOT
+        saved -- they're the seeded Gaussian (regenerable from lora_init_seed), and
+        ckpt_update0000 also holds them up to the bf16 cast (A==A0 at step 0); nothing
+        live reloads them. Config + per-step rows go in the metadata."""
         n_gens = sum(r["N"] for r in rows)
         hr = sum(r["hack_s"][0] + r["hack_t"][0] for r in rows) / max(1, n_gens)
         pr = sum(r["gt_s"][0]   + r["gt_t"][0]   for r in rows) / max(1, n_gens)
         _ckpt = path or ckpt_path
         tensors = {}
         for n, info in wrappers.items():
-            tensors[f"A/{n}"]  = info["A"].detach().float().cpu().contiguous()
-            tensors[f"B/{n}"]  = info["B"].detach().float().cpu().contiguous()
-            tensors[f"A0/{n}"] = info["A0"].detach().float().cpu().contiguous()
-            tensors[f"B0/{n}"] = info["B0"].detach().float().cpu().contiguous()
+            tensors[f"A/{n}"] = info["A"].detach().bfloat16().cpu().contiguous()
+            tensors[f"B/{n}"] = info["B"].detach().bfloat16().cpu().contiguous()
         save_file(tensors, str(_ckpt), metadata={
-            "model": model_name, "dtype": "fp32", "step": str(len(rows)),
+            "model": model_name, "dtype": "bf16", "step": str(len(rows)),
             "hack_rate": f"{hr:.6f}", "pass_rate": f"{pr:.6f}",
             "rows": json.dumps(rows),
             "cfg": json.dumps(vars(cfg) | {"adapter": "lora2r"}, default=str),
diff --git a/src/vgrout/train_config.py b/src/vgrout/train_config.py
index 41dcfd0..cb157cf 100644
--- a/src/vgrout/train_config.py
+++ b/src/vgrout/train_config.py
@@ -37,7 +37,7 @@ class Config:
     # AdamW decay pulls raw A/B toward 0, not toward the init, which would drive
     # the net delta to -B0@A0 -- must stay 0 for this adapter.
     weight_decay: float = 0.0
-    warmup_frac: float = 0.1
+    warmup_frac: float = 0.2
     grad_clip: float = 10.0
     seed: int = 41
     unbiased: bool = True
@@ -127,8 +127,9 @@ class FastConfig(Config):
     prompts_per_step: int = 4
     adam_beta1: float = 0.5
     adam_beta2: float = 0.9
-    lr: float = 5e-4   # user: bump from 1e-4 to learn faster in the short grad-starved budget
-    # Each lora2r ckpt is ~1.3G (A/B + redundant frozen A0/B0 for 252 modules, fp32); 20-step
-    # cadence keeps ~6/run for the eval curve without filling the 768G disk. (TODO: drop A0/B0
-    # from ckpts -- reconstructible from lora_init_seed -- to halve size, needs a loader change.)
+    lr: float = 3e-4   # 5e-4 peaked at warmup-end (step ~10) and diverged; 3e-4 + 20% warmup
+    # Each lora2r ckpt is ~0.33G (A/B for 252 modules, bf16). A0/B0 are NOT saved: they are
+    # the seeded init (regenerable from lora_init_seed in the metadata; ckpt_update0000
+    # also holds it, up to the bf16 cast, since A==A0 at step 0), and nothing live reloads
+    # them (inline eval uses in-memory wrappers). 20-step cadence keeps ~6/run for the eval curve.
     save_ckpt_every: int = 20