fix v2 review bugs + add EVA, AntiPaSTO

DeLoRA: per-input-channel wnorm buffer (not scalar Parameter), forward
matches peft (x*wnorm @ A.T then per-rank scale (lambda/r)/(An*Bn)).
Smoke: 89.7% loss drop (was 35.8%).

HRA: symmetric repeated-column init (PEFT-style) instead of zero gate.
Adjacent Householder pairs cancel exactly so R=I at t=0, and U receives
gradient from step 0 (no dead-grad). Even r required.

IA3: split into two variants. ia3 stays output-side (k_proj/v_proj);
new ia3_ff is input-side (down_proj/fc2), matching peft is_feedforward.

Config: dropout field removed (never honored by any variant).

PiSSA: adapter.save records base-weight fingerprint per target;
adapter.load recomputes init then verifies fingerprint -> fails loud
when reloaded onto a different base.

EVA (new): data-driven init via group_init + calibration_data. Top-r
right singular vectors of pooled layer-input activations -> lora_A
(buffer, frozen); only lora_B trains. Stress-tests group_init API.

AntiPaSTO (new): SVD steering with frozen U,S,Vh,W_res and learnable
delta_s (per-singular-value bias) + rot_T (block-diagonal Cayley
rotation on V or U). Lite port of antipasto3 SVD adapter.

ParamSpec: as_buffer field + make_tensor() for buffer registration.
adapter.attach honors as_buffer with register_buffer; detach cleans
both _parameters and _buffers.

Smoke covers all 8 variants: identity at t=0, save/load round-trip,
gradient-driven loss drop. EVA gets dedicated test for calibration
data path. ALL PASS including bnb 4/8-bit path.
This commit is contained in:
copilot
2026-04-26 19:41:59 +08:00
parent fdb4c77d6c
commit 185eb29c70
11 changed files with 479 additions and 74 deletions
+43 -3
View File
@@ -48,8 +48,12 @@ def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list
for pname, spec in variant.param_specs(d_in, d_out, cfg).items():
if hasattr(layer, pname):
raise RuntimeError(f"{name} already has attribute {pname}; detach first")
p = spec.make(cfg.dtype, layer.weight.device)
layer.register_parameter(pname, p)
if spec.as_buffer:
t = spec.make_tensor(cfg.dtype, layer.weight.device)
layer.register_buffer(pname, t, persistent=True)
else:
p = spec.make(cfg.dtype, layer.weight.device)
layer.register_parameter(pname, p)
layer._lora_cfg = cfg
layer._lora_variant = variant
layer._lora_role = role
@@ -85,18 +89,44 @@ def detach(model: nn.Module) -> None:
for pname in variant.param_specs(layer.in_features, layer.out_features, layer._lora_cfg):
if pname in layer._parameters:
del layer._parameters[pname]
elif pname in layer._buffers:
del layer._buffers[pname]
for attr in ("_lora_cfg", "_lora_variant", "_lora_role"):
if hasattr(layer, attr):
delattr(layer, attr)
delattr(model, _ATTACHED_ATTR)
def _base_weight_fingerprint(model: nn.Module) -> dict[str, str]:
    """Return a cheap per-target signature of the current base weights.

    Intended for variants (PiSSA-style) that overwrite `layer.weight`, so that
    a reload onto different base weights can be detected and rejected.

    Each value is ``"<shape>|<sum of squares>"`` where the sum is taken over the
    weight viewed as fp32 and printed with 7 significant digits. This is a
    sanity check, not a cryptographic hash.

    Returns an empty dict when no adapter is attached to `model`.
    """
    attached = getattr(model, _ATTACHED_ATTR, None)
    if attached is None:
        return {}

    signatures: dict[str, str] = {}
    for mod_name, module in model.named_modules():
        # Only modules that carry an adapter AND are listed as targets count.
        if hasattr(module, "_lora_variant") and mod_name in attached["targets"]:
            weight32 = module.weight.detach().to(torch.float32, copy=False)
            sum_sq = float(torch.mul(weight32, weight32).sum())
            signatures[mod_name] = f"{tuple(weight32.shape)}|{sum_sq:.6e}"
    return signatures
def save(model: nn.Module, path: str) -> None:
state = getattr(model, _ATTACHED_ATTR, None)
if state is None:
raise RuntimeError("no adapter attached; call attach() first")
sd = {k: v.detach().cpu() for k, v in model.state_dict().items() if "lora_" in k}
torch.save({"cfg": state["cfg"].to_dict(), "state": sd}, path)
blob = {
"cfg": state["cfg"].to_dict(),
"state": sd,
"base_fp": _base_weight_fingerprint(model),
}
torch.save(blob, path)
def load(model: nn.Module, path: str) -> list[RemovableHandle]:
@@ -111,4 +141,14 @@ def load(model: nn.Module, path: str) -> list[RemovableHandle]:
unexpected_lora = [k for k in unexpected if "lora_" in k]
if unexpected_lora:
raise RuntimeError(f"unexpected lora keys in checkpoint: {unexpected_lora}")
saved_fp = blob.get("base_fp", {})
if saved_fp:
cur_fp = _base_weight_fingerprint(model)
diffs = [k for k in saved_fp if saved_fp[k] != cur_fp.get(k)]
if diffs:
raise RuntimeError(
f"base weight fingerprint mismatch on {len(diffs)} layer(s) "
f"(e.g. {diffs[0]}). For PiSSA the saved adapter assumes the same "
"base; reload onto the original model or re-run init."
)
return handles
-1
View File
@@ -8,7 +8,6 @@ class LoraLiteConfig:
variant: str = "lora"
r: int = 8
alpha: float = 16.0
dropout: float = 0.0 # currently ignored; variants may use cfg.variant_kwargs
dtype: torch.dtype = torch.bfloat16
# targeting
+13 -2
View File
@@ -12,8 +12,9 @@ class ParamSpec:
shape: tuple[int, ...]
init: str | Callable[[torch.Tensor], None] = "zeros" # 'zeros'|'kaiming'|'ones'|callable(t)
trainable: bool = True
as_buffer: bool = False # if True, register_buffer instead of register_parameter
def make(self, dtype: torch.dtype, device) -> nn.Parameter:
def _empty(self, dtype: torch.dtype, device) -> torch.Tensor:
t = torch.empty(self.shape, dtype=dtype, device=device)
if callable(self.init):
self.init(t)
@@ -26,7 +27,17 @@ class ParamSpec:
nn.init.kaiming_uniform_(t, a=5 ** 0.5) if t.ndim >= 2 else t.normal_(0, 0.02)
else:
raise ValueError(f"unknown init: {self.init}")
return nn.Parameter(t, requires_grad=self.trainable)
return t
def make(self, dtype: torch.dtype, device) -> nn.Parameter:
# legacy entry: returns a Parameter (used for trainable adapter params)
if self.as_buffer:
raise RuntimeError("as_buffer spec must be installed via register_buffer; see adapter.attach")
return nn.Parameter(self._empty(dtype, device), requires_grad=self.trainable)
def make_tensor(self, dtype: torch.dtype, device) -> torch.Tensor:
# returns a raw tensor for buffer registration
return self._empty(dtype, device)
class Variant(Protocol):
+1 -1
View File
@@ -1 +1 @@
from . import lora, pissa, delora, ia3, dora, hra # noqa: F401 side-effect: register
from . import lora, pissa, delora, ia3, dora, hra, eva, antipasto # noqa: F401 side-effect: register
+145
View File
@@ -0,0 +1,145 @@
"""AntiPaSTO: SVD steering with learnable singular-value deltas + block-diagonal Cayley rotation.
Lite port of wassname's AntiPaSTO3 SVD adapter (research code, not an
upstream peft variant). Reference:
https://github.com/wassname/antipasto3 (offline: docs/refs/antipasto3_svd_adapter.py)
Decomposition (PyTorch nn.Linear convention, weight (d_out, d_in)):
W = U diag(S) Vh + W_res (top-r SVD; W_res = W - U_r diag(S_r) Vh_r)
We freeze U, S, Vh, W_res and learn:
- delta_s : (r,) -- additive delta to singular values
- rot_T : (n_blocks, bs(bs-1)/2) -- upper-triangle of skew matrix per block
Forward (matches base layer convention exactly at t=0):
R = block_diag(Cayley(skew(rot_T))) # (r, r) effective
Vh_rot = R @ Vh # rotates input basis
S_eff = S + delta_s # learnable spectrum
delta_y = ((x @ Vh_rot.T) * S_eff) @ U.T # rank-r path
base_y = x @ W_res.T # frozen residual
y_total = base_y + delta_y # == original output at t=0
At init: rot_T = 0 -> R = I -> Vh_rot = Vh, delta_s = 0 -> S_eff = S, so
delta_y reconstructs the truncated SVD term and y_total == x @ W^T to numerical
precision (fp32 SVD round-tripped to cfg.dtype).
WHICH BASIS IS ROTATED:
By default we rotate Vh (the INPUT singular basis). This is what AntiPaSTO3
calls `rotate_V=True` in adapter terms (V == Vh.T columns). To rotate U
(output basis) instead, pass variant_kwargs={'rotate_basis': 'U'}.
Rotating both is not implemented (one rotation is enough to span the
identifiable steering directions; two is degenerate).
REQUIRES rank r divisible by `block_size` (default 4, which makes r even). r=8, bs=4 -> 2 blocks.
"""
from __future__ import annotations
import math
import torch
from einops import einsum
from torch import nn
from ..variant import register, ParamSpec
def _cayley(skew: torch.Tensor) -> torch.Tensor:
    """Cayley transform of a batch of square matrices.

    Computes ``(I - X)^-1 (I + X)`` with ``X = skew / 2`` via a linear solve
    (no explicit inverse). For skew-symmetric input the result is orthogonal.

    Args:
        skew: tensor whose last two dims are square, e.g. ``(n_blocks, bs, bs)``.

    Returns:
        Tensor with the same shape, dtype and device as `skew`.
    """
    dim = skew.shape[-1]
    identity = torch.eye(dim, dtype=skew.dtype, device=skew.device).expand_as(skew)
    half = skew / 2
    lhs = identity - half
    rhs = identity + half
    return torch.linalg.solve(lhs, rhs)
def _build_rotation(rot_T: torch.Tensor, bs: int, max_angle: float) -> torch.Tensor:
    """Turn per-block upper-triangle parameters into Cayley rotation blocks.

    Args:
        rot_T: ``(n_blocks, bs*(bs-1)/2)`` free parameters, one row per block,
            filling the strict upper triangle of a ``bs x bs`` matrix.
        bs: block size.
        max_angle: soft cap (radians) on the rotation; must give a finite,
            non-zero ``2 * tan(max_angle / 2)``.

    Returns:
        ``(n_blocks, bs, bs)`` tensor of rotations; all-zero `rot_T` gives
        identity blocks.

    Raises:
        ValueError: if `max_angle` is 0 or NaN. The clamp below divides by
            ``a_limit``, so such a value would otherwise yield NaN rotations.
    """
    a_limit = 2.0 * math.tan(max_angle / 2.0)
    if a_limit == 0.0 or not math.isfinite(a_limit):
        raise ValueError(
            f"max_angle={max_angle!r} gives a degenerate rotation limit "
            f"(2*tan(max_angle/2)={a_limit!r}); use a non-zero finite angle"
        )

    n_blocks, _ = rot_T.shape
    rows, cols = torch.triu_indices(bs, bs, offset=1, device=rot_T.device).unbind(0)
    A = torch.zeros(n_blocks, bs, bs, dtype=rot_T.dtype, device=rot_T.device)
    A[:, rows, cols] = rot_T
    # Antisymmetrize: upper triangle becomes +rot_T/2, lower triangle -rot_T/2.
    A = 0.5 * (A - A.transpose(-1, -2))
    # Smoothly squash entries into (-a_limit, a_limit). In the 2x2 case the
    # Cayley angle is 2*atan(entry/2), so this keeps it below max_angle.
    A = a_limit * torch.tanh(A / a_limit)
    return _cayley(A)
def _block_diag(blocks: torch.Tensor) -> torch.Tensor:
    """Assemble ``(n_blocks, bs, bs)`` blocks into one block-diagonal matrix.

    Returns a dense ``(n_blocks*bs, n_blocks*bs)`` tensor (same dtype/device as
    `blocks`) that is zero outside the diagonal blocks.
    """
    count, size, _ = blocks.shape
    side = count * size
    dense = blocks.new_zeros(side, side)
    diag = torch.arange(count, device=blocks.device)
    # View the dense matrix as (block_row, row, block_col, col) and write every
    # (k, k) block in one indexed assignment instead of a Python loop.
    dense.view(count, size, count, size)[diag, :, diag, :] = blocks
    return dense
@register
class AntiPaSTO:
    """SVD-steering variant: frozen top-r SVD factors plus learnable deltas.

    Frozen buffers `lora_U`, `lora_S`, `lora_Vh` hold the top-r SVD of the base
    weight, and `layer.weight` is overwritten with the residual. The trainable
    parameters are `lora_delta_s` (added to the singular values) and
    `lora_rot_T` (block-diagonal Cayley rotation of one singular basis).

    Options read from ``cfg.variant_kwargs``:
        block_size (int, default 4): rotation block size; must divide ``cfg.r``.
        max_rotation_angle (float, default 0.5): soft cap used by the rotation.
        rotate_basis ('V' | 'U', default 'V'): which singular basis is rotated.
    """

    name = "antipasto"

    @staticmethod
    def param_specs(d_in, d_out, cfg):
        """Declare buffers/parameters for one layer.

        Raises:
            ValueError: if ``cfg.r`` is not divisible by the block size.
        """
        r = cfg.r
        bs = int(cfg.variant_kwargs.get("block_size", 4))
        if r % bs != 0:
            raise ValueError(f"AntiPaSTO requires r={r} divisible by block_size={bs}")
        n_blocks = r // bs
        n_triu = bs * (bs - 1) // 2  # free entries of a bs x bs skew matrix
        return {
            # Frozen SVD components captured at init (buffers travel with state_dict).
            "lora_U": ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
            "lora_S": ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
            "lora_Vh": ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
            # Trainable: per-singular-value delta + block-diagonal Cayley rotation.
            "lora_delta_s": ParamSpec((r,), init="zeros", trainable=True),
            "lora_rot_T": ParamSpec((n_blocks, n_triu), init="zeros", trainable=True),
        }

    @staticmethod
    def init(layer: nn.Linear, cfg) -> None:
        """Split `layer.weight` into stored top-r SVD factors and a residual.

        Mutates `layer` in place: fills `lora_U`/`lora_S`/`lora_Vh` and replaces
        `layer.weight` with ``W - U_r diag(S_r) Vh_r``.

        Raises:
            TypeError: if `layer` is not exactly `nn.Linear` (subclasses rejected).
            ValueError: if ``cfg.r`` exceeds ``min(d_out, d_in)``.
        """
        if type(layer) is not nn.Linear:
            raise TypeError(
                "AntiPaSTO mutates layer.weight into W_res (like PiSSA), so v1 "
                "only supports plain nn.Linear, not bnb 4/8-bit."
            )
        with torch.no_grad():
            W = layer.weight.data.float()
            r = cfg.r
            max_rank = min(W.shape)
            if r > max_rank:
                # Without this the failure is an opaque shape mismatch in copy_.
                raise ValueError(
                    f"AntiPaSTO requires r={r} <= min(d_out, d_in)={max_rank}"
                )
            U, S, Vh = torch.linalg.svd(W, full_matrices=False)
            # copy_ casts the fp32 factors to the buffers' dtype.
            layer.lora_U.copy_(U[:, :r])
            layer.lora_S.copy_(S[:r])
            layer.lora_Vh.copy_(Vh[:r, :])
            # Build the residual from the factors AS STORED (after rounding to
            # the buffer dtype), not from the fp32 originals: forward adds back
            # the stored factors, so this way their rounding error is absorbed
            # into W_res instead of breaking identity at t=0.
            Uq = layer.lora_U.float()
            Sq = layer.lora_S.float()
            Vhq = layer.lora_Vh.float()
            W_res = (W - (Uq * Sq) @ Vhq).to(layer.weight.dtype)
            layer.weight.data.copy_(W_res)

    @staticmethod
    def forward(layer: nn.Linear, x, y):
        """Add the steered rank-r path to the base output `y`.

        `y` is expected to be the output of the residual weight on `x`; the
        return value is ``y + ((x @ Vh_eff.T) * (S + delta_s)) @ U_eff.T``.

        Raises:
            ValueError: if ``rotate_basis`` is neither 'U' nor 'V'.
        """
        cfg = layer._lora_cfg
        bs = int(cfg.variant_kwargs.get("block_size", 4))
        max_angle = float(cfg.variant_kwargs.get("max_rotation_angle", 0.5))
        rotate_basis = cfg.variant_kwargs.get("rotate_basis", "V")

        U = layer.lora_U.to(x.dtype)    # (d_out, r)
        S = layer.lora_S.to(x.dtype)    # (r,)
        Vh = layer.lora_Vh.to(x.dtype)  # (r, d_in)

        # The rotation is built in fp32 (it involves a linear solve), then cast.
        R_blocks = _build_rotation(layer.lora_rot_T.float(), bs, max_angle)
        R = _block_diag(R_blocks).to(x.dtype)  # (r, r)

        if rotate_basis == "V":
            Vh_eff = R @ Vh      # rotate INPUT basis
            U_eff = U
        elif rotate_basis == "U":
            Vh_eff = Vh
            U_eff = U @ R.T      # rotate OUTPUT basis
        else:
            raise ValueError(f"rotate_basis must be 'U' or 'V', got {rotate_basis!r}")

        S_eff = S + layer.lora_delta_s.to(x.dtype)       # (r,)
        h = einsum(x, Vh_eff, "... i, r i -> ... r")     # x @ Vh_eff.T
        h = h * S_eff                                    # diag(S_eff)
        delta = einsum(h, U_eff, "... r, o r -> ... o")  # @ U_eff.T
        return y + delta
+33 -24
View File
@@ -1,27 +1,29 @@
"""DeLoRA: column-normalised A, B, scaled by lambda * ||W||_F / r.
"""DeLoRA: per-input-channel weight-norm scaling, per-rank A/B normalization.
Bini et al. 2025 (ICLR'25) https://arxiv.org/abs/2503.18225
Paper Eq. 8: W' = W + (lambda * ||W||_F / r) B Xi A
where Xi_{i,i} = 1 / (||b_i|| ||a_i||) makes each rank-1 component unit-norm.
This is equivalent to row-normalising A and column-normalising B (each column of
B and row of A has unit norm), so each rank-1 outer product b_i a_i^T has unit
spectral norm -> the whole low-rank update is bounded.
Identity at t=0: paper uses kaiming init for both A and B with `lambda` initialised
to 0 (or small) so the effective delta starts near zero. We honour that:
default lambda0 == 0 gives bit-identity; user can override via variant_kwargs.
Implementation follows the peft upstream (which the DeLoRA authors maintain),
which differs from the paper notation in two ways that are equivalent at the
forward level but matter for gradients/numerics:
1. ||W|| is captured PER INPUT CHANNEL (shape (d_in,)), not as a scalar
Frobenius norm. Used to scale `x` element-wise on the input dim.
See docs/refs/peft_delora_layer.py:150 (init) and :250 (forward).
2. Per-rank normalization applied via division (1/||A_i||*||B^j||) inside
the diagonal scaling, instead of as F.normalize on A,B themselves.
This keeps the gradient flowing through the un-normalized parameters.
Identity at t=0: lambda0=0 -> delta is exactly zero (bit-identity).
KNOWN GRADIENT ISSUE (flagged by external review 2026-04-26):
With lambda0=0 the *forward* is identity but `A,B` get zero gradient on step 0
(delta = lambda * ... -> d_output/d_A is proportional to lambda). Only
`lora_lambda` moves first step. With lambda0>0, A,B train but identity is broken.
Paper's true initialization (frozen-copy trick, see Eq. 9) achieves both;
we do NOT implement that here.
(delta is proportional to lambda). Only `lora_lambda` moves first step.
The paper's true initialization (frozen-copy trick, Eq. 9) achieves both
identity AND non-zero A/B gradients; we do NOT implement it here.
The frozen ||W||_F factor is captured once at init() into a buffer `lora_wnorm`.
Reference implementations (for review/cross-check):
Reference implementations:
- DeLoRA paper authors (ExplainableML/DeLoRA) -- their fork of peft:
https://github.com/ExplainableML/DeLoRA/blob/main/peft/src/peft/tuners/delora.py
(offline: docs/refs/orig_delora.py)
@@ -30,7 +32,6 @@ Reference implementations (for review/cross-check):
(offline: docs/refs/peft_delora_layer.py)
"""
import torch
import torch.nn.functional as F
from einops import einsum
from torch import nn
@@ -50,8 +51,9 @@ class DeLoRA:
"lora_lambda": ParamSpec(
(), init=lambda t: t.fill_(lam0), trainable=True
),
# ||W||_F captured at init; frozen scalar buffer (no grad)
"lora_wnorm": ParamSpec((), init="zeros", trainable=False),
# ||W||_2 per input channel (shape (d_in,)); frozen buffer captured at init
# per peft DeLoRA (docs/refs/peft_delora_layer.py:150).
"lora_wnorm": ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True),
}
@staticmethod
@@ -60,16 +62,23 @@ class DeLoRA:
# dequantizes via .float() round-trip if available, or fails cleanly.
with torch.no_grad():
W = layer.weight.data.float()
layer.lora_wnorm.data.fill_(W.norm().item())
wnorm = W.norm(dim=0).detach().to(layer.lora_wnorm.dtype)
layer.lora_wnorm.copy_(wnorm)
return
@staticmethod
def forward(layer: nn.Linear, x, y):
cfg = layer._lora_cfg
# rows of A unit, cols of B unit (per paper, equivalent to Xi)
A = F.normalize(layer.lora_A, dim=1) # (r, d_in)
B = F.normalize(layer.lora_B, dim=0) # (d_out, r)
scale = layer.lora_lambda * layer.lora_wnorm / cfg.r
h = einsum(x, A, "... i, r i -> ... r")
A = layer.lora_A # (r, d_in)
B = layer.lora_B # (d_out, r)
# peft delora forward (docs/refs/peft_delora_layer.py:248-260):
# h = (x * w_norm) @ A.T; scale per-rank = (lambda/r) / (||A_i|| * ||B^j||);
# delta = (h * scale) @ B.T
x_scaled = x * layer.lora_wnorm # (..., d_in)
h = einsum(x_scaled, A, "... i, r i -> ... r")
An = torch.clamp(A.norm(dim=1), min=1e-4) # (r,)
Bn = torch.clamp(B.norm(dim=0), min=1e-4) # (r,)
scale = (layer.lora_lambda / cfg.r) / (An * Bn) # (r,)
h = h * scale
delta = einsum(h, B, "... r, o r -> ... o")
return y + scale * delta
return y + delta
+121
View File
@@ -0,0 +1,121 @@
"""EVA: Explained-Variance Adaptation. Paischer et al. 2024.
Paper: https://arxiv.org/abs/2410.07170 (also referred to as ICLR'25 EVA).
Idea: instead of random A and zero B (LoRA) or SVD of W (PiSSA), initialize
`lora_A` to the top-r right singular vectors of the LAYER INPUT distribution
on a small calibration set. Forward = `y + scale * (B @ A @ x)` exactly like
LoRA; with `lora_B = 0` the adapter is identity at t=0. Only B trains
afterwards (A frozen). The result: each rank slot points along a direction
that actually carries information at this layer.
This is a stripped-down EVA; we do NOT implement:
- rank redistribution across layers via explained-variance ratios
(peft EVA computes an explained_variance_ratio per layer then redistributes
the global rank budget; we use a uniform `cfg.r` per layer).
- Incremental PCA over many micro-batches (we run one full SVD on the
pooled calibration activations per layer).
- Equal-input deduplication (peft hashes inputs to share SVD across QKV).
API stress-test: this variant requires data-driven init, so it implements
`group_init(model, targets, cfg, calibration_data)` to drive a single forward
pass on `calibration_data` with hooks that capture each target's input.
Identity at t=0: `lora_B = 0` -> delta = 0 -> y unchanged.
References:
- peft EVA (full impl, with IncrementalPCA + redistribution):
https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/eva.py
(offline: docs/refs/peft_eva.py)
- peft fine-tuning script demonstrating initialize_lora_eva_weights:
https://github.com/huggingface/peft/blob/main/examples/eva_finetuning/eva_finetuning.py
(offline: docs/refs/peft_eva_finetuning.py)
"""
from __future__ import annotations
import torch
from einops import einsum
from torch import nn
from ..variant import register, ParamSpec
@register
class EVA:
    """Stripped-down EVA: `lora_A` from a PCA of layer inputs, only `lora_B` trains.

    `lora_A` is a frozen buffer filled by `group_init` with the top-r right
    singular vectors of the pooled calibration inputs of each target layer.
    `lora_B` starts at zero, so the adapter is an identity at t=0.
    """

    name = "eva"

    @staticmethod
    def param_specs(d_in, d_out, cfg):
        """Declare the frozen `lora_A` buffer and the trainable `lora_B`."""
        return {
            # A is frozen (set in group_init from calibration data); kept as a
            # buffer so it travels with state_dict and is not optimized.
            "lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=False, as_buffer=True),
            # B is the only trainable bit; zero-init -> identity at t=0.
            "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
        }

    @staticmethod
    def init(layer: nn.Linear, cfg) -> None:
        """No-op; `group_init` does the data-driven SVD for all targets at once."""
        return

    @staticmethod
    def group_init(model: nn.Module, targets, cfg, calibration_data) -> None:
        """Fill every target's `lora_A` from calibration activations.

        Runs `model` once over `calibration_data` in eval mode under `no_grad`,
        capturing each target layer's first positional input, then sets
        `lora_A` to the top-``cfg.r`` right singular vectors of the pooled
        (flattened to 2-D, fp32, CPU) inputs.

        Args:
            model: model the targets belong to.
            targets: iterable of ``(name, layer, role)`` triples.
            cfg: config providing ``r``.
            calibration_data: iterable of batches; a mapping is passed as
                keyword arguments, a list/tuple as positional arguments, and
                anything else as a single argument.

        Raises:
            ValueError: if `calibration_data` is None.
            RuntimeError: if a target saw no input, has fewer than ``r``
                calibration rows, or has ``d_in < r``.
        """
        # Local import so this edit needs no change to the module import block.
        from collections.abc import Mapping

        if calibration_data is None:
            raise ValueError(
                "EVA requires calibration_data: an iterable of model inputs "
                "(dicts of kwargs to model.forward, or single tensors) used to "
                "estimate the input PCA per layer. Pass via "
                "lora_lite.attach(model, cfg, calibration_data=batches)."
            )

        # Collect input activations per target via forward pre-hooks.
        layers = {name: layer for name, layer, _ in targets}
        captured: dict[str, list[torch.Tensor]] = {n: [] for n in layers}

        def make_hook(name):
            def _h(module, args, kwargs):
                # Pre-forward hook with kwargs: args[0] is the input tensor.
                x = args[0].detach()
                captured[name].append(x.reshape(-1, x.shape[-1]).to(torch.float32).cpu())
            return _h

        handles = []
        was_training = model.training
        model.eval()
        try:
            for n, layer in layers.items():
                handles.append(
                    layer.register_forward_pre_hook(make_hook(n), with_kwargs=True)
                )
            with torch.no_grad():
                for batch in calibration_data:
                    if isinstance(batch, Mapping):
                        model(**batch)
                    elif isinstance(batch, (list, tuple)):
                        model(*batch)
                    else:
                        model(batch)
        finally:
            # Always undo side effects, even if a calibration forward raised:
            # remove the hooks and restore the caller's training mode.
            for h in handles:
                h.remove()
            if was_training:
                model.train()

        # SVD per target on pooled inputs; top-r right singular vectors -> A.
        with torch.no_grad():
            for name, layer in layers.items():
                if not captured[name]:
                    raise RuntimeError(
                        f"EVA at {name}: no inputs captured; calibration_data is "
                        "empty or this layer was never called during calibration"
                    )
                X = torch.cat(captured[name], dim=0)  # (N, d_in)
                if X.shape[0] < cfg.r:
                    raise RuntimeError(
                        f"EVA at {name}: only {X.shape[0]} calibration tokens, need >= r={cfg.r}"
                    )
                if X.shape[1] < cfg.r:
                    raise RuntimeError(
                        f"EVA at {name}: d_in={X.shape[1]} is smaller than r={cfg.r}"
                    )
                # full_matrices=False -> Vh shape (min(N, d_in), d_in); take top-r rows.
                _, _, Vh = torch.linalg.svd(X, full_matrices=False)
                # copy_ casts to the buffer's dtype and moves to its device.
                layer.lora_A.copy_(Vh[: cfg.r, :])

    @staticmethod
    def forward(layer: nn.Linear, x, y):
        """Return ``y + (alpha / r) * (x @ A.T) @ B.T``, the standard LoRA update."""
        cfg = layer._lora_cfg
        scale = cfg.alpha / cfg.r
        h = einsum(x, layer.lora_A, "... i, r i -> ... r")
        delta = einsum(h, layer.lora_B, "... r, o r -> ... o")
        return y + scale * delta
+29 -21
View File
@@ -9,27 +9,22 @@ so the layer output becomes y' = W' x = W (R x). R is in INPUT space (d_in x d
We implement this via a `forward_input` pre-hook that returns `R x`, then the
frozen base layer (including bnb 4/8-bit Linear) computes `W (R x)` itself.
Identity at t=0: `lora_gate` is initialized to 0 and gates each Householder
vector, so the effective u_i starts at 0 -> H_i = I -> R = I -> y' = y.
At training time the gate scales the active reflection direction.
Identity at t=0 (PEFT-style symmetric init, requires even r):
Rows are kaiming-init in pairs: U[0]=U[1], U[2]=U[3], ... Adjacent pairs of
Householder reflections with identical vectors cancel exactly
(H_i H_i = I), so R = I at init -> y' = y up to floating-point rounding.
After the first gradient step the paired rows diverge and the chain becomes a
general orthogonal matrix; gradient flows into U from step 0 (no dead-grad).
Odd r is rejected with a ValueError (stricter than peft, which only warns).
KNOWN GRADIENT ISSUE (flagged by external review 2026-04-26):
Forward is `x + gate * (Rx - x)`. With gate=0 at init, d_output/d_U is
proportional to gate, so on step 0 ONLY `lora_gate` receives gradient;
`lora_U` is dead. Once gate moves off zero, U starts learning. This deviates
from the paper, which has no such gate -- paper uses orthogonal init of U so
R != I from step 0. We trade paper-faithful init for identity-at-init.
OMITTED: paper also adds an orthogonality regularizer
lambda * sum_i (u_i^T u_j)^2 (Eq. 6 / Sec. 3.3)
which is a loss term, not a forward-pass change. Add it in your training loop if
you want the regularized HRA variant.
OMITTED: paper also adds an orthogonality regularizer (Eq. 6 / Sec. 3.3),
a loss-side term. Add it in your training loop if you want regularized HRA.
Reference implementations (for review/cross-check):
- HRA paper authors (DaShenZi721/HRA), llama variant of OFT layer with HRA:
https://github.com/DaShenZi721/HRA/blob/master/llama/peft/oft/layer_GS_HRA.py
(offline: docs/refs/orig_hra_layer.py)
- peft HRA layer (cleaner, includes apply_GS toggle for orthogonalization):
- peft HRA layer, reset_hra_parameters (lines 100-108):
https://github.com/huggingface/peft/blob/main/src/peft/tuners/hra/layer.py
(offline: docs/refs/peft_hra_layer.py)
"""
@@ -46,20 +41,33 @@ class HRA:
@staticmethod
def param_specs(d_in, d_out, cfg):
if cfg.r % 2 != 0:
raise ValueError(
f"HRA symmetric init requires even r; got r={cfg.r}. "
"Pick an even rank or use a different variant."
)
return {
# one Householder vector per rank slot in INPUT space R^{d_in}
"lora_U": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
# identity gate; 0 -> R = I exactly
"lora_gate": ParamSpec((), init="zeros", trainable=True),
# Householder vectors stacked as rows (one vector per rank slot)
# init done in init() to enforce paired rows -> R = I at t=0.
"lora_U": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
}
@staticmethod
def init(layer: nn.Linear, cfg) -> None:
    """Fill `layer.lora_U` with kaiming-uniform rows duplicated in adjacent pairs.

    Rows 2k and 2k+1 are identical, so the two Householder reflections built
    from them undo each other and the full chain starts as the identity (up to
    rounding), while every row still receives gradient from the first step.
    Follows the symmetric init in peft (docs/refs/peft_hra_layer.py:101-108).

    `cfg` is unused. Draws ``r // 2`` rows, so an odd ``r`` would produce a
    shape mismatch in the final copy.
    """
    with torch.no_grad():
        n_rows, width = layer.lora_U.shape
        # Same dtype/device as lora_U; only half as many rows are drawn.
        seed_rows = layer.lora_U.new_empty(n_rows // 2, width)
        nn.init.kaiming_uniform_(seed_rows, a=5 ** 0.5)
        paired = seed_rows.repeat_interleave(2, dim=0)
        layer.lora_U.copy_(paired)
@staticmethod
def forward_input(layer: nn.Linear, x: torch.Tensor) -> torch.Tensor:
"""Apply x + gate * (Rx - x). gate=0 -> identity; nonzero -> full Householder chain."""
"""Apply Rx where R = prod_i H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2."""
U = layer.lora_U # (r, d_in)
Rx = x
for i in range(U.shape[0]):
@@ -67,4 +75,4 @@ class HRA:
sq = (u * u).sum().clamp_min(1e-12)
coeff = einsum(Rx, u, "... i, i -> ...") * (2.0 / sq)
Rx = Rx - coeff.unsqueeze(-1) * u
return x + layer.lora_gate * (Rx - x)
return Rx
+37 -18
View File
@@ -1,26 +1,28 @@
"""IA3-style output gating. Liu et al. 2022 https://arxiv.org/abs/2205.05638
"""IA3-style elementwise gating. Liu et al. 2022 https://arxiv.org/abs/2205.05638
y_new = y * g, g initialized to 1 (identity at t=0)
Two registered variants, matching the paper's two regimes:
DEVIATION FROM PAPER:
The original IA3 gates only three positions per transformer block:
l_k * (k_proj output), l_v * (v_proj output), l_ff * (FFN intermediate after activation)
This implementation gates ANY linear layer the targeting system selects.
To match the paper exactly on a typical Llama/Qwen-style block, attach with:
* `ia3` -- OUTPUT-side gating, parameter shape (d_out,).
y_new = y * g. Use for attention projections (k_proj, v_proj).
cfg = LoraLiteConfig(
variant="ia3",
target_names=(r"\\.k_proj$", r"\\.v_proj$", r"\\.up_proj$"),
target_roles=(),
)
* `ia3_ff` -- INPUT-side gating, parameter shape (d_in,).
y_new = base_layer(x * g). Use for FFN-down layers (down_proj,
fc2). Equivalent to the paper's "gate the FFN intermediate (post-
activation)" position because down_proj's input IS that
intermediate hidden state.
`up_proj` is the closest stand-in for "FFN intermediate" in gated-MLP blocks
(Llama uses gate * up; gating the up branch is the IA3-spirit choice).
In both cases g is initialized to 1 -> identity at t=0.
Reference implementations (for review/cross-check):
- peft IA3 layer (uses ia3_l elementwise scaling, fan_in_fan_out aware):
To match the paper exactly on a Llama/Qwen-style block requires TWO attach
passes (one per variant), since each variant uses one hook type:
cfg_attn = LoraLiteConfig(variant="ia3", target_names=(r"\\.k_proj$", r"\\.v_proj$"))
cfg_ffn = LoraLiteConfig(variant="ia3_ff", target_names=(r"\\.down_proj$",))
Reference implementation:
- peft IA3 layer (is_feedforward toggles input-vs-output gating, see
docs/refs/peft_ia3_layer.py:177-188 forward and :214 update_layer):
https://github.com/huggingface/peft/blob/main/src/peft/tuners/ia3/layer.py
(offline: docs/refs/peft_ia3_layer.py)
"""
import torch
from torch import nn
@@ -42,4 +44,21 @@ class IA3:
@staticmethod
def forward(layer: nn.Linear, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return y * layer.lora_g
return y * layer.lora_g
@register
class IA3FF:
    """Input-side IA3 gate: scales the layer input elementwise by `lora_g`.

    The gate has shape ``(d_in,)`` and starts at ones, so the layer is
    unchanged at t=0.
    """

    name = "ia3_ff"

    @staticmethod
    def param_specs(d_in, d_out, cfg):
        """Declare the single trainable gate vector over the input features."""
        gate_spec = ParamSpec((d_in,), init="ones", trainable=True)
        return {"lora_g": gate_spec}

    @staticmethod
    def init(layer: nn.Linear, cfg) -> None:
        """Nothing to do: the 'ones' init from `param_specs` is already the identity."""
        return None

    @staticmethod
    def forward_input(layer: nn.Linear, x: torch.Tensor) -> torch.Tensor:
        """Return the gated input ``x * g``, broadcasting over the last dim."""
        gate = layer.lora_g
        return x * gate
+5 -3
View File
@@ -6,9 +6,11 @@ W_eff(t=0) = W_res + B@A = W (numerically; bf16 round-trip not bit-exact).
DEVIATION FROM PAPER (documented):
- Paper sets adapter scale = 1 (no alpha/r factor); we keep LoRA's alpha/r
pipeline so callers must pass alpha=r to get paper-faithful identity.
- Saved adapter does NOT include W_res; load() recomputes PiSSA init on the
*same-seed base* before overwriting A/B. Reload is exact only on identical
base weights.
- Saved adapter does NOT include W_res (would double checkpoint size). Instead
`adapter.save` records a fingerprint of the post-init base weights and
`adapter.load` re-runs PiSSA init then verifies the fingerprint matches
-- so loading onto a different base weight raises loudly instead of
silently producing wrong outputs.
Reference implementations (for review/cross-check):
- PiSSA original (NeurIPS'24 spotlight) init script (SVD on dequant W):
+52 -1
View File
@@ -132,6 +132,7 @@ def variant_test(variant: str, dtype=torch.float32):
"ia3": 1e-6,
"dora": 5e-5, # m * V/||V|| with V=W -> rounding in norm/divide
"hra": 1e-6, # paired Householder rows cancel (H_i H_i = I) -> identity up to rounding
"antipasto": 5e-4, # SVD truncation + W_res reconstruction in fp32
}[variant] * max(1.0, base_scale)
assert err < tol, f" FAIL identity: err {err} > tol {tol}"
print(f" SHOULD: err<{tol:.1e}. PASS.")
@@ -173,6 +174,8 @@ def variant_test(variant: str, dtype=torch.float32):
opt = torch.optim.Adam(trainable, lr=1e-1)
elif variant == "dora":
opt = torch.optim.Adam(trainable, lr=1e-3) # m near ||W||_c, bigger lr blows up
elif variant == "antipasto":
opt = torch.optim.Adam(trainable, lr=1e-2) # delta_s + rot_T, sensitive
else:
opt = torch.optim.SGD(trainable, lr=1e-2)
losses = []
@@ -278,13 +281,61 @@ def bitsandbytes_cuda_smoke(require_bnb: bool):
del model
def eva_smoke():
    """EVA needs calibration data: drives forward + per-target SVD on inputs.

    Checks, in order: identity at t=0 (B is zero-initialised), that `lora_A`
    is a populated buffer and not a parameter, and that training the remaining
    trainable parameters lowers an MSE loss by more than 5% in 20 SGD steps.
    """
    print("\n=== variant=eva (data-driven init via group_init+calibration_data) ===")
    torch.manual_seed(0)
    model = TinyModel().to(torch.float32)
    ids = torch.randint(0, 100, (2, 16))
    with torch.no_grad():
        y_base = model(ids).clone()

    cfg = ll.LoraLiteConfig(variant="eva", r=4, alpha=8, dtype=torch.float32)
    # 4 calibration batches of random ids
    calib = [torch.randint(0, 100, (2, 16)) for _ in range(4)]
    ll.attach(model, cfg, calibration_data=calib)

    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f" trainable params={n_trainable} (should be only lora_B since A is buffer)")
    # Enforce what the message above claims about A: it must not be a Parameter.
    a_params = [n for n, _ in model.named_parameters() if n.endswith("lora_A")]
    assert not a_params, f"EVA lora_A registered as parameter, expected buffer: {a_params}"

    with torch.no_grad():
        y_adapt = model(ids)
    err = (y_adapt - y_base).abs().max().item()
    print(f" t=0 identity: max|y_adapt - y_base| = {err:.3e}")
    assert err < 1e-6, f"EVA should be exact identity (B=0); got {err}"
    print(" SHOULD: err==0 (B=0 init). PASS.")

    # check A buffer is non-zero (data-driven)
    a_norms = [m.lora_A.norm().item() for m in model.modules() if hasattr(m, "lora_A")]
    # Without this, all() below passes vacuously and the mean divides by zero.
    assert a_norms, "no module has lora_A -> attach did not target any layer"
    assert all(n > 0 for n in a_norms), "EVA lora_A buffers all zero -> group_init never ran"
    print(f" SHOULD: lora_A buffers populated. PASS (mean ||A||={sum(a_norms)/len(a_norms):.3f}).")

    # gradient flow: only B trains
    target = torch.randn(2, 16, 100, dtype=torch.float32) * 0.1
    trainable = [p for p in model.parameters() if p.requires_grad]
    opt = torch.optim.SGD(trainable, lr=1e-2)
    losses = []
    for _ in range(20):
        opt.zero_grad()
        loss = (model(ids) - target).pow(2).mean()
        loss.backward()
        assert_no_base_grads(model)
        opt.step()
        losses.append(loss.item())
    drop = (losses[0] - losses[-1]) / max(losses[0], 1e-12)
    print(f" loss[0]={losses[0]:.4f} loss[-1]={losses[-1]:.4f} drop={100*drop:.1f}%")
    assert drop > 0.05
    print(" SHOULD: drop>5%. PASS.")
    ll.detach(model)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--require-bnb", action="store_true")
args = parser.parse_args()
for v in ("lora", "pissa", "delora", "ia3", "dora", "hra"):
for v in ("lora", "pissa", "delora", "ia3", "dora", "hra", "antipasto"):
variant_test(v, dtype=torch.float32)
eva_smoke()
structural_linear_like_test()
bitsandbytes_cuda_smoke(args.require_bnb)
print("\nALL PASS.")