mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:30:44 +08:00
fixes
This commit is contained in:
@@ -9,6 +9,14 @@ wassname 2026 https://arxiv.org/abs/2601.07473
|
||||
|
||||
Identity at t=0: rot_T=0 -> R=I, delta_s=0 -> y == x @ W^T (fp32 SVD round-trip).
|
||||
|
||||
Scope cut vs antipasto3: this is a fine-tuning adapter, not the full runtime
|
||||
steering interface. There is no per-call alpha, so it does not expose the
|
||||
bidirectional R(+alpha) / R(-alpha) inference symmetry. The V-basis path uses the
|
||||
opposite chirality to antipasto3's default U-basis path, so checkpoints are not
|
||||
portable without a sign/basis convention. Zero-init is stricter identity than
|
||||
antipasto3's small positive/random symmetry-breaking init, but it leaves rotation
|
||||
learning to be started by the task gradient rather than by init noise.
|
||||
|
||||
Refs:
|
||||
- paper: https://github.com/wassname/AntiPaSTO
|
||||
- lite port of: https://github.com/wassname/antipasto3
|
||||
|
||||
@@ -8,7 +8,8 @@ Per peft upstream: ||W|| is per-input-channel (not scalar Frobenius), and
|
||||
per-rank norms divide inside the diag (not via F.normalize on A,B) so
|
||||
gradients flow through un-normalized parameters.
|
||||
|
||||
Identity at t=0: lambda0=0 -> delta=0 (bit-identity).
|
||||
Identity at t=0 comes from B=0, so any lambda0 keeps delta=0. Keep lambda0
|
||||
nonzero for training: lambda0=0 makes every DeLoRA gradient zero on step 0.
|
||||
|
||||
Refs:
|
||||
- paper code: https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py
|
||||
@@ -30,9 +31,9 @@ from ..config import AdapterConfig, register_config
|
||||
@dataclass
class DeLoRAConfig(AdapterConfig):
    """Configuration for the DeLoRA adapter variant."""

    variant: str = "delora"
    # peft/paper default. B=0 preserves t=0 identity; lambda0=0 would make the
    # whole adapter dead on step 0 because delta and all gradients scale by lambda.
    # NOTE: lambda0 is declared exactly once; an earlier `lambda0: float = 0.0`
    # declaration was shadowed by this one and has been removed.
    lambda0: float = 15.0
|
||||
|
||||
|
||||
@register
|
||||
|
||||
+52
-3
@@ -99,12 +99,10 @@ class FakeBnbModel(nn.Module):
|
||||
|
||||
|
||||
def cfg_for(variant: str) -> ll.AdapterConfig:
    """Build the small float32 test config (r=4, alpha=8) for ``variant``.

    The config class is looked up in ``CFG_BY_VARIANT``. DeLoRA additionally
    receives ``lambda0=0.1``; every other variant gets no extra keyword
    arguments.
    """
    extra = {"lambda0": 0.1} if variant == "delora" else {}
    return CFG_BY_VARIANT[variant](
        r=4,
        # alpha is passed exactly once. A PiSSA-only special case
        # (alpha=4 for "pissa") previously duplicated this keyword, which
        # Python rejects with "SyntaxError: keyword argument repeated".
        alpha=8,
        dtype=torch.float32,
        **extra,
    )
|
||||
|
||||
|
||||
@@ -221,6 +219,57 @@ def test_eva_requires_calibration():
|
||||
ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32))
|
||||
|
||||
|
||||
def test_delora_default_has_live_step0_gradient():
    """DeLoRA's default config must give ``lora_B`` a nonzero gradient on step 0.

    Attaches a default ``DeLoRAConfig`` (no explicit ``lambda0``), checks that
    the attached ``lora_lambda`` is approximately 15.0, then runs one
    forward/backward pass and requires the largest absolute entry of
    ``lora_B.grad`` to be strictly positive.
    """
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))
    config = ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32)
    ll.attach(model, config)

    # Inspect the adapted projection of the single layer.
    q_proj = model.layers[0].q_proj
    assert q_proj.lora_lambda.item() == pytest.approx(15.0)

    # Any scalar loss works; a mean of squares keeps it simple.
    model(ids).pow(2).mean().backward()

    largest_b_grad = q_proj.lora_B.grad.detach().abs().max().item()
    assert largest_b_grad > 0
|
||||
|
||||
|
||||
def test_pissa_identity_with_nonunit_scale():
    """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r.

    Records the model output before attaching, attaches PiSSA with r=4 and
    alpha=8 (so alpha/r != 1), and requires the output after attaching to stay
    within ``IDENTITY_TOL["pissa"]`` of the recorded baseline.
    """
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))

    # Baseline output of the un-adapted model; cloned so it is an independent
    # copy of the forward result.
    with torch.no_grad():
        baseline = model(ids).clone()

    config = ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32)
    ll.attach(model, config)

    with torch.no_grad():
        adapted = model(ids)

    max_drift = (adapted - baseline).abs().max().item()
    assert max_drift < IDENTITY_TOL["pissa"]
|
||||
|
||||
|
||||
def test_antipasto_blockwise_rotation_matches_explicit_blockdiag():
    """The einsum/rearrange path must equal the old explicit blockdiag math.

    Builds per-block rotations with ``_build_rotation``, assembles them into
    one block-diagonal matrix ``R``, and checks that the blockwise einsum
    results agree with ``R @ Vh`` and ``U @ R.T`` to within 1e-6.
    """
    from lora_lite.variants.antipasto import _build_rotation

    torch.manual_seed(0)
    n_blocks, bs, d_in, d_out = 3, 4, 7, 5
    r = n_blocks * bs

    # Random draws stay in the original order (rot_T, Vh, U) so the seeded
    # values are unchanged.
    rot_T = 0.1 * torch.randn(n_blocks, bs * (bs - 1) // 2)
    Vh = torch.randn(r, d_in)
    U = torch.randn(d_out, r)

    # presumably R_blocks is (n_blocks, bs, bs), one rotation per block --
    # the einsum subscripts below rely on that; confirm against _build_rotation.
    R_blocks = _build_rotation(rot_T, bs, 0.5)
    R = torch.block_diag(*R_blocks)

    # Blockwise path for Vh: apply each block's rotation to its own bs rows.
    Vh_rot = torch.einsum(
        "nab,nbi->nai", R_blocks, Vh.reshape(n_blocks, bs, d_in)
    ).reshape(r, d_in)

    # Blockwise path for U: contract each column block with its rotation's
    # second index, which is the transpose that U @ R.T applies.
    U_rot = torch.einsum(
        "dnb,ncb->dnc", U.reshape(d_out, n_blocks, bs), R_blocks
    ).reshape(d_out, r)

    vh_err = (Vh_rot - R @ Vh).abs().max().item()
    u_err = (U_rot - U @ R.T).abs().max().item()
    assert vh_err < 1e-6
    assert u_err < 1e-6
|
||||
|
||||
|
||||
def test_dora_bias_passthrough():
|
||||
"""Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0."""
|
||||
torch.manual_seed(0)
|
||||
|
||||
Reference in New Issue
Block a user