mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:00:59 +08:00
refactor: OneCycleLR replaces SequentialLR(LinearLR, CosineAnnealingLR)
One scheduler object now does both the warmup and the cosine relaxation; pct_start=warmup_frac is the explicit warmup. cycle_momentum=False is set so that it doesn't clobber the configured AdamW betas (adam_beta1). Resulting curve (100 steps, 20% warmup, max lr 3e-4): the lr peaks at 3e-4 at step ~19 via a smooth cos ramp (vs. the old linear ramp), and is 1.7e-4 at step 10, where 5e-4 had diverged. Smoke test + all verify gates green. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
+7
-11
@@ -413,17 +413,13 @@ def main(cfg: Config) -> int:
|
||||
# ── optimizer + schedule ── (A and B of both blocks; masks route grads)
|
||||
opt = torch.optim.AdamW(
|
||||
delta_params, lr=lr, weight_decay=cfg.weight_decay, betas=(adam_beta1, adam_beta2))
|
||||
# Fractional warmup preserves the intended schedule across preset lengths.
|
||||
warmup_steps = max(1, int(cfg.warmup_frac * steps))
|
||||
sched = torch.optim.lr_scheduler.SequentialLR(
|
||||
opt,
|
||||
schedulers=[
|
||||
torch.optim.lr_scheduler.LinearLR(opt, start_factor=1e-3, end_factor=1.0,
|
||||
total_iters=warmup_steps),
|
||||
torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(1, steps - warmup_steps)),
|
||||
],
|
||||
milestones=[warmup_steps],
|
||||
)
|
||||
# OneCycle does warmup + cosine relaxation in one object: cos ramp from lr/div_factor
|
||||
# up to lr over the first pct_start of steps (the explicit warmup), then cos anneal to
|
||||
# ~0. cycle_momentum=False so it leaves the configured AdamW betas alone (else it would
|
||||
# clobber adam_beta1). pct_start = warmup_frac keeps warmup fractional across presets.
|
||||
sched = torch.optim.lr_scheduler.OneCycleLR(
|
||||
opt, max_lr=lr, total_steps=steps, pct_start=cfg.warmup_frac,
|
||||
anneal_strategy="cos", div_factor=25.0, final_div_factor=1e4, cycle_momentum=False)
|
||||
|
||||
# ── generation config ──
|
||||
# Use the same sampling policy for training and evaluation.
|
||||
|
||||
Reference in New Issue
Block a user