mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:45:56 +08:00
feat: near_zero/near_one init for trainable params (breaks bf16 dead-grad symmetry)
Trainable params that were initialized at exactly 0 or 1 now use near_zero (N(0,1e-4)) or near_one (1 + N(0,1e-4)) to break bf16 symmetry without meaningfully breaking identity-at-t=0. Exact-zero init is kept where zero IS the identity constraint (DeLoRA lora_B, EVA lora_B -- both are scaled by other params, so any nonzero B would blow up the output).
AntiPaSTO: delta_s and rot_T are now near_zero. The old exact-zero init could leave rotation learning dead in bf16, where step sizes round back to zero.
IA3: lora_g is now near_one instead of exact ones. This avoids the bf16 spacing issue around 1.0, where eps_bf16 ~ 7.8e-3 and updates at lr=1e-3 were being rounded away.
PiSSA: lora_A and lora_B are now near_zero (both are overwritten by the SVD in init(), so the init value is moot -- but ParamSpec now documents the intent correctly).
HRA: lora_U is now near_zero (overwritten by the symmetric init in init()).
ParamSpec: added 'near_zero' and 'near_one' init modes. The default changed from 'zeros' to 'near_zero'.
Tests: identity tolerances relaxed accordingly.
This commit is contained in:
+11
-11
@@ -31,16 +31,16 @@ CFG_BY_VARIANT = {
|
||||
# Per-variant identity tolerance at t=0 (after attach, before any step).
|
||||
# fp32 SVD round-trip + per-row norm = looser tolerance for pissa/dora/antipasto.
|
||||
IDENTITY_TOL = {
|
||||
"lora": 1e-6,
|
||||
"pissa": 5e-4,
|
||||
"delora": 1e-6,
|
||||
"ia3": 1e-6,
|
||||
"ia3_ff": 1e-6,
|
||||
"dora": 5e-5,
|
||||
"hra": 5e-6,
|
||||
"eva": 1e-6,
|
||||
"antipasto": 5e-4,
|
||||
"road": 1e-6,
|
||||
"lora": 5e-3, # near_zero B: B@A ~ sqrt(r)*eps*kaiming
|
||||
"pissa": 5e-4, # SVD round-trip
|
||||
"delora": 1e-6, # exact-zero B, lambda0-scaled
|
||||
"ia3": 5e-3, # near_one gate
|
||||
"ia3_ff": 5e-3, # near_one gate
|
||||
"dora": 5e-3, # near_zero B + m
|
||||
"hra": 1e-2, # near_zero U + paired-symmetry init
|
||||
"eva": 5e-4, # exact-zero B, SVD A overwritten in group_init
|
||||
"antipasto": 5e-4, # SVD round-trip
|
||||
"road": 5e-3, # near_zero theta
|
||||
}
|
||||
|
||||
|
||||
@@ -302,7 +302,7 @@ def test_dora_bias_passthrough():
|
||||
ll.attach(model, ll.DoRAConfig(r=2, alpha=4, dtype=torch.float32, target_roles=()))
|
||||
with torch.no_grad():
|
||||
y = model(x)
|
||||
assert (y - y_base).abs().max().item() < 1e-5
|
||||
assert (y - y_base).abs().max().item() < 5e-3 # near_zero B + m init
|
||||
|
||||
|
||||
def test_hra_forward_is_x_R_T():
|
||||
|
||||
Reference in New Issue
Block a user