This commit is contained in:
wassname
2026-04-27 07:46:10 +08:00
parent bb8887e66c
commit 1a93df10b2
3 changed files with 65 additions and 7 deletions
+8
View File
@@ -9,6 +9,14 @@ wassname 2026 https://arxiv.org/abs/2601.07473
Identity at t=0: rot_T=0 -> R=I, delta_s=0 -> y == x @ W^T (fp32 SVD round-trip).
Scope cut vs antipasto3: this is a fine-tuning adapter, not the full runtime
steering interface. There is no per-call alpha, so it does not expose the
bidirectional R(+alpha) / R(-alpha) inference symmetry. The V-basis path uses the
opposite chirality to antipasto3's default U-basis path, so checkpoints are not
portable without a sign/basis convention. Zero-init gives a stricter identity than
antipasto3's small positive/random symmetry-breaking init, but it means rotation
learning has to be started by the task gradient rather than by init noise.
Refs:
- paper: https://github.com/wassname/AntiPaSTO
- lite port of: https://github.com/wassname/antipasto3
+5 -4
View File
@@ -8,7 +8,8 @@ Per peft upstream: ||W|| is per-input-channel (not scalar Frobenius), and
per-rank norms divide inside the diag (not via F.normalize on A,B) so
gradients flow through un-normalized parameters.
Identity at t=0 comes from B=0, so delta=0 (bit-identity) for any lambda0,
including lambda0=0. Keep lambda0 nonzero for training, though: lambda0=0
makes every DeLoRA gradient zero on step 0.
Refs:
- paper code: https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py
@@ -30,9 +31,9 @@ from ..config import AdapterConfig, register_config
@dataclass
class DeLoRAConfig(AdapterConfig):
    """Configuration for the DeLoRA adapter variant.

    Extends ``AdapterConfig`` with the initial value of the DeLoRA lambda
    scale. The field is declared exactly once; an earlier ``lambda0 = 0.0``
    declaration (and its comment) was superseded by the value below.
    """

    variant: str = "delora"
    # peft/paper default. B=0 preserves t=0 identity; lambda0=0 would make the
    # whole adapter dead on step 0 because delta and all gradients scale by lambda.
    lambda0: float = 15.0
@register
+52 -3
View File
@@ -99,12 +99,10 @@ class FakeBnbModel(nn.Module):
def cfg_for(variant: str) -> ll.AdapterConfig:
    """Return the small shared test config for ``variant``.

    Looks up the config class in ``CFG_BY_VARIANT`` and instantiates it with
    the same rank, alpha and dtype for every variant. No per-variant
    overrides are applied: ``alpha`` is passed exactly once (the previous
    text passed it twice, which Python rejects as a repeated keyword
    argument).

    Args:
        variant: key into ``CFG_BY_VARIANT``.

    Returns:
        The instantiated config object for that variant.
    """
    config_cls = CFG_BY_VARIANT[variant]
    return config_cls(
        r=4,
        alpha=8,
        dtype=torch.float32,
    )
@@ -221,6 +219,57 @@ def test_eva_requires_calibration():
ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32))
def test_delora_default_has_live_step0_gradient():
    """The default DeLoRA config must yield a nonzero gradient on step 0.

    Attaches DeLoRA with its default ``lambda0``, checks that the attached
    ``lora_lambda`` is approximately 15.0, then runs one backward pass and
    requires that ``lora_B`` received a gradient with a nonzero entry.
    """
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))
    ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32))

    # Look the projection up only after attach() has run.
    q_proj = model.layers[0].q_proj
    assert q_proj.lora_lambda.item() == pytest.approx(15.0)

    model(ids).pow(2).mean().backward()

    largest_b_grad = q_proj.lora_B.grad.detach().abs().max().item()
    assert largest_b_grad > 0
def test_pissa_identity_with_nonunit_scale():
    """Regression: PiSSA must be an identity at attach time when alpha != r.

    Uses r=4 with alpha=8 (scale alpha/r = 2) and compares the model output
    before and after ``ll.attach``; the largest absolute difference must stay
    below ``IDENTITY_TOL["pissa"]``.
    """
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))

    with torch.no_grad():
        baseline = model(ids).clone()

    ll.attach(model, ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32))

    with torch.no_grad():
        adapted = model(ids)

    max_drift = (adapted - baseline).abs().max().item()
    assert max_drift < IDENTITY_TOL["pissa"]
def test_antipasto_blockwise_rotation_matches_explicit_blockdiag():
    """Block-wise einsum rotation must agree with a dense block-diagonal matmul.

    Builds per-block rotations with ``_build_rotation``, applies them to
    random ``Vh`` and ``U`` via reshape + einsum, and checks both results
    against ``R @ Vh`` and ``U @ R.T`` where ``R`` is the explicit
    ``torch.block_diag`` of the same blocks (tolerance 1e-6).
    """
    from lora_lite.variants.antipasto import _build_rotation

    torch.manual_seed(0)
    n_blocks, bs, d_in, d_out = 3, 4, 7, 5
    r = n_blocks * bs

    # Random draws stay in the order rot_T, Vh, U so the seeded values match.
    rot_T = torch.randn(n_blocks, bs * (bs - 1) // 2) * 0.1
    Vh = torch.randn(r, d_in)
    U = torch.randn(d_out, r)

    R_blocks = _build_rotation(rot_T, bs, 0.5)

    # Reference: one dense (r, r) block-diagonal rotation matrix.
    dense_R = torch.block_diag(*R_blocks)
    expected_Vh = dense_R @ Vh
    expected_U = U @ dense_R.T

    # Block-wise path: rotate each bs-sized slice independently.
    blockwise_Vh = torch.einsum(
        "nab,nbi->nai", R_blocks, Vh.reshape(n_blocks, bs, d_in)
    ).reshape(r, d_in)
    blockwise_U = torch.einsum(
        "dnb,ncb->dnc", U.reshape(d_out, n_blocks, bs), R_blocks
    ).reshape(d_out, r)

    assert (blockwise_Vh - expected_Vh).abs().max().item() < 1e-6
    assert (blockwise_U - expected_U).abs().max().item() < 1e-6
def test_dora_bias_passthrough():
"""Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0."""
torch.manual_seed(0)