diff --git a/src/lora_lite/variants/antipasto.py b/src/lora_lite/variants/antipasto.py index fd0a392..1b4e839 100644 --- a/src/lora_lite/variants/antipasto.py +++ b/src/lora_lite/variants/antipasto.py @@ -9,6 +9,14 @@ wassname 2026 https://arxiv.org/abs/2601.07473 Identity at t=0: rot_T=0 -> R=I, delta_s=0 -> y == x @ W^T (fp32 SVD round-trip). +Scope cut vs antipasto3: this is a fine-tuning adapter, not the full runtime +steering interface. There is no per-call alpha, so it does not expose the +bidirectional R(+alpha) / R(-alpha) inference symmetry. The V-basis path uses the +opposite chirality to antipasto3's default U-basis path, so checkpoints are not +portable without a sign/basis convention. Zero-init is stricter identity than +antipasto3's small positive/random symmetry-breaking init, but can leave rotation +learning to be started by the task gradient rather than init noise. + Refs: - paper: https://github.com/wassname/AntiPaSTO - lite port of: https://github.com/wassname/antipasto3 diff --git a/src/lora_lite/variants/delora.py b/src/lora_lite/variants/delora.py index a7bf54f..3bf491a 100644 --- a/src/lora_lite/variants/delora.py +++ b/src/lora_lite/variants/delora.py @@ -8,7 +8,8 @@ Per peft upstream: ||W|| is per-input-channel (not scalar Frobenius), and per-rank norms divide inside the diag (not via F.normalize on A,B) so gradients flow through un-normalized parameters. -Identity at t=0: lambda0=0 -> delta=0 (bit-identity). +Identity at t=0 comes from B=0, so any lambda0 keeps delta=0. Keep lambda0 +nonzero for training: lambda0=0 makes every DeLoRA gradient zero on step 0. Refs: - paper code: https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py @@ -30,9 +31,9 @@ from ..config import AdapterConfig, register_config @dataclass class DeLoRAConfig(AdapterConfig): variant: str = "delora" - # 0.0 = bit-identity at t=0, but A,B get zero grad until lambda moves - # (delta ∝ lambda). peft default is 15.0. - lambda0: float = 0.0 + # peft/paper default. B=0 preserves t=0 identity; lambda0=0 would make the + # whole adapter dead on step 0 because delta and all gradients scale by lambda. + lambda0: float = 15.0 @register diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py index 190a38f..b45c859 100644 --- a/tests/test_lora_lite.py +++ b/tests/test_lora_lite.py @@ -99,12 +99,10 @@ class FakeBnbModel(nn.Module): def cfg_for(variant: str) -> ll.AdapterConfig: - extra = {"lambda0": 0.1} if variant == "delora" else {} return CFG_BY_VARIANT[variant]( r=4, - alpha=4 if variant == "pissa" else 8, + alpha=8, dtype=torch.float32, - **extra, ) @@ -221,6 +219,57 @@ def test_eva_requires_calibration(): ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32)) +def test_delora_default_has_live_step0_gradient(): + """Default lambda0 must be nonzero; B=0 preserves identity while B gets gradient.""" + torch.manual_seed(0) + model = TinyModel(n_layers=1) + ids = torch.randint(0, 100, (2, 8)) + ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32)) + + assert model.layers[0].q_proj.lora_lambda.item() == pytest.approx(15.0) + loss = model(ids).pow(2).mean() + loss.backward() + + b_grad = model.layers[0].q_proj.lora_B.grad.detach().abs().max().item() + assert b_grad > 0 + + +def test_pissa_identity_with_nonunit_scale(): + """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r.""" + torch.manual_seed(0) + model = TinyModel(n_layers=1) + ids = torch.randint(0, 100, (2, 8)) + with torch.no_grad(): + y_base = model(ids).clone() + + ll.attach(model, ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32)) + with torch.no_grad(): + y = model(ids) + assert (y - y_base).abs().max().item() < IDENTITY_TOL["pissa"] + + +def test_antipasto_blockwise_rotation_matches_explicit_blockdiag(): + """The einsum/rearrange path must equal the old explicit blockdiag math.""" + from lora_lite.variants.antipasto import _build_rotation + + torch.manual_seed(0) + n_blocks, bs, d_in, d_out = 3, 4, 7, 5 + r = n_blocks * bs + rot_T = torch.randn(n_blocks, bs * (bs - 1) // 2) * 0.1 + Vh = torch.randn(r, d_in) + U = torch.randn(d_out, r) + R_blocks = _build_rotation(rot_T, bs, 0.5) + R = torch.block_diag(*list(R_blocks)) + + Vh_blocks = torch.reshape(Vh, (n_blocks, bs, d_in)) + Vh_rot = torch.einsum("nab,nbi->nai", R_blocks, Vh_blocks).reshape(r, d_in) + U_blocks = torch.reshape(U, (d_out, n_blocks, bs)) + U_rot = torch.einsum("dnb,ncb->dnc", U_blocks, R_blocks).reshape(d_out, r) + + assert (Vh_rot - R @ Vh).abs().max().item() < 1e-6 + assert (U_rot - U @ R.T).abs().max().item() < 1e-6 + + def test_dora_bias_passthrough(): """Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0.""" torch.manual_seed(0)