fixes: DeLoRA default lambda0=15 (live step-0 gradient), AntiPaSTO scope notes, new regression tests

2026-06-27 16:30:44 +08:00 · 2026-04-27 07:46:10 +08:00
parent bb8887e66c
commit 1a93df10b2
3 changed files with 65 additions and 7 deletions
@@ -9,6 +9,14 @@ wassname 2026  https://arxiv.org/abs/2601.07473

 Identity at t=0: rot_T=0 -> R=I, delta_s=0 -> y == x @ W^T (fp32 SVD round-trip).

+Scope cut vs antipasto3: this is a fine-tuning adapter, not the full runtime
+steering interface. There is no per-call alpha, so it does not expose the
+bidirectional R(+alpha) / R(-alpha) inference symmetry. The V-basis path uses the
+opposite chirality to antipasto3's default U-basis path, so checkpoints are not
+portable without a sign/basis convention. Zero-init gives a stricter identity than
+antipasto3's small positive/random symmetry-breaking init, but it can leave rotation
+learning to be started by the task gradient rather than by init noise.
+
 Refs:
  - paper: https://github.com/wassname/AntiPaSTO
  - lite port of: https://github.com/wassname/antipasto3
@@ -8,7 +8,8 @@ Per peft upstream: ||W|| is per-input-channel (not scalar Frobenius), and
 per-rank norms divide inside the diag (not via F.normalize on A,B) so
 gradients flow through un-normalized parameters.

-Identity at t=0: lambda0=0 -> delta=0 (bit-identity).
+Identity at t=0 comes from B=0, so any lambda0 keeps delta=0. Keep lambda0
+nonzero for training: lambda0=0 makes every DeLoRA gradient zero on step 0.

 Refs:
  - paper code: https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py
@@ -30,9 +31,9 @@ from ..config import AdapterConfig, register_config
@dataclass
 class DeLoRAConfig(AdapterConfig):
    variant: str = "delora"
-    # 0.0 = bit-identity at t=0, but A,B get zero grad until lambda moves
-    # (delta ∝ lambda). peft default is 15.0.
-    lambda0: float = 0.0
+    # peft/paper default. B=0 preserves t=0 identity; lambda0=0 would make the
+    # whole adapter dead on step 0 because delta and all gradients scale by lambda.
+    lambda0: float = 15.0


@register
@@ -99,12 +99,10 @@ class FakeBnbModel(nn.Module):


 def cfg_for(variant: str) -> ll.AdapterConfig:
-    extra = {"lambda0": 0.1} if variant == "delora" else {}
    return CFG_BY_VARIANT[variant](
        r=4,
-        alpha=4 if variant == "pissa" else 8,
+        alpha=8,
        dtype=torch.float32,
-        **extra,
    )


@@ -221,6 +219,57 @@ def test_eva_requires_calibration():
        ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32))


+def test_delora_default_has_live_step0_gradient():
+    """Default lambda0 must be nonzero; B=0 preserves identity while B gets gradient."""
+    torch.manual_seed(0)
+    model = TinyModel(n_layers=1)
+    ids = torch.randint(0, 100, (2, 8))
+    ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32))
+
+    assert model.layers[0].q_proj.lora_lambda.item() == pytest.approx(15.0)
+    loss = model(ids).pow(2).mean()
+    loss.backward()
+
+    b_grad = model.layers[0].q_proj.lora_B.grad.detach().abs().max().item()
+    assert b_grad > 0
+
+
+def test_pissa_identity_with_nonunit_scale():
+    """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r."""
+    torch.manual_seed(0)
+    model = TinyModel(n_layers=1)
+    ids = torch.randint(0, 100, (2, 8))
+    with torch.no_grad():
+        y_base = model(ids).clone()
+
+    ll.attach(model, ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32))
+    with torch.no_grad():
+        y = model(ids)
+    assert (y - y_base).abs().max().item() < IDENTITY_TOL["pissa"]
+
+
+def test_antipasto_blockwise_rotation_matches_explicit_blockdiag():
+    """The einsum/rearrange path must equal the old explicit blockdiag math."""
+    from lora_lite.variants.antipasto import _build_rotation
+
+    torch.manual_seed(0)
+    n_blocks, bs, d_in, d_out = 3, 4, 7, 5
+    r = n_blocks * bs
+    rot_T = torch.randn(n_blocks, bs * (bs - 1) // 2) * 0.1
+    Vh = torch.randn(r, d_in)
+    U = torch.randn(d_out, r)
+    R_blocks = _build_rotation(rot_T, bs, 0.5)
+    R = torch.block_diag(*list(R_blocks))
+
+    Vh_blocks = torch.reshape(Vh, (n_blocks, bs, d_in))
+    Vh_rot = torch.einsum("nab,nbi->nai", R_blocks, Vh_blocks).reshape(r, d_in)
+    U_blocks = torch.reshape(U, (d_out, n_blocks, bs))
+    U_rot = torch.einsum("dnb,ncb->dnc", U_blocks, R_blocks).reshape(d_out, r)
+
+    assert (Vh_rot - R @ Vh).abs().max().item() < 1e-6
+    assert (U_rot - U @ R.T).abs().max().item() < 1e-6
+
+
 def test_dora_bias_passthrough():
    """Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0."""
    torch.manual_seed(0)