diff --git a/src/lora_lite/variants/antipasto.py b/src/lora_lite/variants/antipasto.py
index fd0a392..1b4e839 100644
--- a/src/lora_lite/variants/antipasto.py
+++ b/src/lora_lite/variants/antipasto.py
@@ -9,6 +9,14 @@ wassname 2026  https://arxiv.org/abs/2601.07473
 
 Identity at t=0: rot_T=0 -> R=I, delta_s=0 -> y == x @ W^T (fp32 SVD round-trip).
 
+Scope cut vs antipasto3: this is a fine-tuning adapter, not the full runtime
+steering interface. There is no per-call alpha, so it does not expose the
+bidirectional R(+alpha) / R(-alpha) inference symmetry. The V-basis path uses the
+opposite chirality to antipasto3's default U-basis path, so checkpoints are not
+portable without a sign/basis convention. Zero-init is a stricter identity than
+antipasto3's small positive/random symmetry-breaking init, but it leaves rotation
+learning to be started by the task gradient rather than by init noise.
+
 Refs:
   - paper: https://github.com/wassname/AntiPaSTO
   - lite port of: https://github.com/wassname/antipasto3
diff --git a/src/lora_lite/variants/delora.py b/src/lora_lite/variants/delora.py
index a7bf54f..3bf491a 100644
--- a/src/lora_lite/variants/delora.py
+++ b/src/lora_lite/variants/delora.py
@@ -8,7 +8,8 @@ Per peft upstream: ||W|| is per-input-channel (not scalar Frobenius), and
 per-rank norms divide inside the diag (not via F.normalize on A,B) so
 gradients flow through un-normalized parameters.
 
-Identity at t=0: lambda0=0 -> delta=0 (bit-identity).
+Identity at t=0 comes from B=0, so any lambda0 keeps delta=0. Keep lambda0
+nonzero for training: lambda0=0 makes every DeLoRA gradient zero on step 0.
 
 Refs:
   - paper code: https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py
@@ -30,9 +31,9 @@ from ..config import AdapterConfig, register_config
 @dataclass
 class DeLoRAConfig(AdapterConfig):
     variant: str = "delora"
-    # 0.0 = bit-identity at t=0, but A,B get zero grad until lambda moves
-    # (delta ∝ lambda). peft default is 15.0.
-    lambda0: float = 0.0
+    # peft/paper default. B=0 preserves t=0 identity; lambda0=0 would make the
+    # whole adapter dead on step 0 because delta and all gradients scale by lambda.
+    lambda0: float = 15.0
 
 
 @register
diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py
index 190a38f..b45c859 100644
--- a/tests/test_lora_lite.py
+++ b/tests/test_lora_lite.py
@@ -99,12 +99,10 @@ class FakeBnbModel(nn.Module):
 
 
 def cfg_for(variant: str) -> ll.AdapterConfig:
-    extra = {"lambda0": 0.1} if variant == "delora" else {}
     return CFG_BY_VARIANT[variant](
         r=4,
-        alpha=4 if variant == "pissa" else 8,
+        alpha=8,
         dtype=torch.float32,
-        **extra,
     )
 
 
@@ -221,6 +219,57 @@ def test_eva_requires_calibration():
         ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32))
 
 
+def test_delora_default_has_live_step0_gradient():
+    """Default lambda0 must be nonzero; B=0 preserves identity while B gets gradient."""
+    torch.manual_seed(0)
+    model = TinyModel(n_layers=1)
+    ids = torch.randint(0, 100, (2, 8))
+    ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32))
+
+    assert model.layers[0].q_proj.lora_lambda.item() == pytest.approx(15.0)
+    loss = model(ids).pow(2).mean()
+    loss.backward()
+
+    b_grad = model.layers[0].q_proj.lora_B.grad.detach().abs().max().item()
+    assert b_grad > 0
+
+
+def test_pissa_identity_with_nonunit_scale():
+    """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r."""
+    torch.manual_seed(0)
+    model = TinyModel(n_layers=1)
+    ids = torch.randint(0, 100, (2, 8))
+    with torch.no_grad():
+        y_base = model(ids).clone()
+
+    ll.attach(model, ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32))
+    with torch.no_grad():
+        y = model(ids)
+    assert (y - y_base).abs().max().item() < IDENTITY_TOL["pissa"]
+
+
+def test_antipasto_blockwise_rotation_matches_explicit_blockdiag():
+    """The einsum/rearrange path must equal the old explicit blockdiag math."""
+    from lora_lite.variants.antipasto import _build_rotation
+
+    torch.manual_seed(0)
+    n_blocks, bs, d_in, d_out = 3, 4, 7, 5
+    r = n_blocks * bs
+    rot_T = torch.randn(n_blocks, bs * (bs - 1) // 2) * 0.1
+    Vh = torch.randn(r, d_in)
+    U = torch.randn(d_out, r)
+    R_blocks = _build_rotation(rot_T, bs, 0.5)
+    R = torch.block_diag(*list(R_blocks))
+
+    Vh_blocks = torch.reshape(Vh, (n_blocks, bs, d_in))
+    Vh_rot = torch.einsum("nab,nbi->nai", R_blocks, Vh_blocks).reshape(r, d_in)
+    U_blocks = torch.reshape(U, (d_out, n_blocks, bs))
+    U_rot = torch.einsum("dnb,ncb->dnc", U_blocks, R_blocks).reshape(d_out, r)
+
+    assert (Vh_rot - R @ Vh).abs().max().item() < 1e-6
+    assert (U_rot - U @ R.T).abs().max().item() < 1e-6
+
+
 def test_dora_bias_passthrough():
     """Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0."""
     torch.manual_seed(0)