From f1dd9fb33e74fbe6a827f18edc5d65755114ab3e Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Thu, 11 Jun 2026 00:35:29 +0000
Subject: [PATCH] chore: FastConfig save_ckpt_every 10->20 (disk pressure; ~6 ckpts/run suffices)

Each lora2r ckpt is ~1.3G (A/B + redundant frozen A0/B0, 252 modules
fp32). The 768G disk filled and runs crashed at the step-0 ckpt save.
20-step cadence halves the per-run footprint while keeping enough
points for the eval curve.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 src/vgrout/train_config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/vgrout/train_config.py b/src/vgrout/train_config.py
index 0c8437f..41dcfd0 100644
--- a/src/vgrout/train_config.py
+++ b/src/vgrout/train_config.py
@@ -128,3 +128,7 @@ class FastConfig(Config):
     adam_beta1: float = 0.5
     adam_beta2: float = 0.9
     lr: float = 5e-4  # user: bump from 1e-4 to learn faster in the short grad-starved budget
+    # Each lora2r ckpt is ~1.3G (A/B + redundant frozen A0/B0 for 252 modules, fp32); 20-step
+    # cadence keeps ~6/run for the eval curve without filling the 768G disk. (TODO: drop A0/B0
+    # from ckpts -- reconstructible from lora_init_seed -- to halve size, needs a loader change.)
+    save_ckpt_every: int = 20