mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:30:44 +08:00
Verify all variants on bnb 4bit/8bit; HRA paper-faithful rewrite
- Test all 6 variants against bnb.Linear8bitLt + Linear4bit in smoke - bnb-friendly (LoRA, IA3, HRA, DeLoRA): identity err <= 2.4e-4 - bnb-incompatible (PiSSA, DoRA): fail-loud TypeError as expected - HRA: rewrite to paper-faithful input-side reflections (h <- (I-2vv^T)h), fixing previous broken output-side formulation - IA3: bypass dtype upcast for bnb (params stay fp16/quantized) - DeLoRA: explicit type check rejecting non-nn.Linear (incl. bnb) - adapter: special-case bnb param assignment via .data - Re-verified Qwen0.6B HRA probe: drop=20.7%, id_err=0, reload=0
This commit is contained in:
+3
-1
@@ -13,4 +13,6 @@ dist/
|
||||
*.egg-info/
|
||||
logs/
|
||||
outputs/
|
||||
tests/_artifacts/
|
||||
tests/_artifacts/
|
||||
docs/papers/*.pdf
|
||||
docs/papers/*.txt
|
||||
|
||||
@@ -20,6 +20,14 @@ def _hook(layer, args, y):
|
||||
return out.to(y.dtype)
|
||||
|
||||
|
||||
def _pre_hook(layer, args):
|
||||
(x,) = args
|
||||
cfg: LoraLiteConfig = layer._lora_cfg
|
||||
x_cast = x.to(cfg.dtype)
|
||||
x_new = layer._lora_variant.forward_input(layer, x_cast)
|
||||
return (x_new.to(x.dtype),)
|
||||
|
||||
|
||||
def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list[RemovableHandle]:
|
||||
if cfg.variant not in REGISTRY:
|
||||
raise KeyError(f"unknown variant {cfg.variant!r}; registered: {list(REGISTRY)}")
|
||||
@@ -54,7 +62,10 @@ def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list
|
||||
group_init(model, attached_targets, cfg, calibration_data)
|
||||
|
||||
for _, layer, _ in attached_targets:
|
||||
handles.append(layer.register_forward_hook(_hook))
|
||||
if hasattr(layer._lora_variant, "forward_input"):
|
||||
handles.append(layer.register_forward_pre_hook(_pre_hook))
|
||||
else:
|
||||
handles.append(layer.register_forward_hook(_hook))
|
||||
|
||||
setattr(model, _ATTACHED_ATTR, {"cfg": cfg, "targets": attached_names, "handles": handles})
|
||||
return handles
|
||||
|
||||
@@ -1,8 +1,18 @@
|
||||
"""DeLoRA: column-normalised A, B, scaled by lambda/r. Bini et al. 2025 arXiv:2503.18225.
|
||||
"""DeLoRA: column-normalised A, B, scaled by lambda * ||W||_F / r.
|
||||
|
||||
NOTE on identity at t=0: paper uses kaiming for both A and B with a learned lambda
|
||||
init at 0 (or small) so the effective delta starts near zero. We honour that:
|
||||
Bini et al. 2025 https://arxiv.org/abs/2503.18225
|
||||
|
||||
Paper Eq. 8: W' = W + (lambda * ||W||_F / r) B Xi A
|
||||
where Xi_{i,i} = 1 / (||b_i|| ||a_i||) makes each rank-1 component unit-norm.
|
||||
This is equivalent to row-normalising A and column-normalising B (each column of
|
||||
B and row of A has unit norm), so each rank-1 outer product b_i a_i^T has unit
|
||||
spectral norm -> the whole low-rank update is bounded.
|
||||
|
||||
Identity at t=0: paper uses kaiming init for both A and B with `lambda` initialised
|
||||
to 0 (or small) so the effective delta starts near zero. We honour that:
|
||||
default lambda0 == 0 gives bit-identity; user can override via variant_kwargs.
|
||||
|
||||
The frozen ||W||_F factor is captured once at init() into a buffer `lora_wnorm`.
|
||||
"""
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@@ -25,19 +35,26 @@ class DeLoRA:
|
||||
"lora_lambda": ParamSpec(
|
||||
(), init=lambda t: t.fill_(lam0), trainable=True
|
||||
),
|
||||
# ||W||_F captured at init; frozen scalar buffer (no grad)
|
||||
"lora_wnorm": ParamSpec((), init="zeros", trainable=False),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Linear, cfg) -> None:
|
||||
# Reading layer.weight only works for plain Linear; for bnb layers this
|
||||
# dequantizes via .float() round-trip if available, or fails cleanly.
|
||||
with torch.no_grad():
|
||||
W = layer.weight.data.float()
|
||||
layer.lora_wnorm.data.fill_(W.norm().item())
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def forward(layer: nn.Linear, x, y):
|
||||
cfg = layer._lora_cfg
|
||||
# rows of A unit, cols of B unit (per paper)
|
||||
# rows of A unit, cols of B unit (per paper, equivalent to Xi)
|
||||
A = F.normalize(layer.lora_A, dim=1) # (r, d_in)
|
||||
B = F.normalize(layer.lora_B, dim=0) # (d_out, r)
|
||||
scale = layer.lora_lambda / cfg.r
|
||||
scale = layer.lora_lambda * layer.lora_wnorm / cfg.r
|
||||
h = einsum(x, A, "... i, r i -> ... r")
|
||||
delta = einsum(h, B, "... r, o r -> ... o")
|
||||
return y + scale * delta
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
"""HRA: Householder Reflection Adaptation. Yuan et al. 2024 https://arxiv.org/abs/2405.17484
|
||||
|
||||
Output-side formulation with an identity-init gate:
|
||||
Paper formulation (Sec. 3): adapt each frozen weight as
|
||||
|
||||
y' = (1 - alpha) * y + alpha * R y (so y' = y when alpha = 0)
|
||||
R = prod_{i=1..r} H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2
|
||||
W' = W R, R = prod_{i=1..r} H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2
|
||||
|
||||
`lora_gate` is initialized to 0 so y' = y at t=0. `lora_U` is initialized
|
||||
kaiming so ||u_i||^2 is well-defined (no 0/0). Gradients flow into both U and
|
||||
the gate even at init.
|
||||
so the layer output becomes y' = W' x = W (R x). R is in INPUT space (d_in x d_in).
|
||||
|
||||
Hook-only, no weight access -> works on bnb 4/8-bit layers.
|
||||
We implement this via a `forward_input` pre-hook that returns `R x`, then the
|
||||
frozen base layer (including bnb 4/8-bit Linear) computes `W (R x)` itself.
|
||||
|
||||
Identity at t=0: `lora_gate` is initialized to 0 and gates each Householder
|
||||
vector, so the effective u_i starts at 0 -> H_i = I -> R = I -> y' = y.
|
||||
At training time the gate scales the active reflection direction.
|
||||
|
||||
OMITTED: paper also adds an orthogonality regularizer
|
||||
lambda * sum_i (u_i^T u_j)^2 (Eq. 6 / Sec. 3.3)
|
||||
which is a loss term, not a forward-pass change. Add it in your training loop if
|
||||
you want the regularized HRA variant.
|
||||
"""
|
||||
import torch
|
||||
from einops import einsum
|
||||
@@ -25,9 +32,9 @@ class HRA:
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {
|
||||
# one Householder vector per rank slot in R^{d_out}
|
||||
"lora_U": ParamSpec((cfg.r, d_out), init="kaiming", trainable=True),
|
||||
# identity gate; 0 -> y' = y exactly
|
||||
# one Householder vector per rank slot in INPUT space R^{d_in}
|
||||
"lora_U": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
|
||||
# identity gate; 0 -> R = I exactly
|
||||
"lora_gate": ParamSpec((), init="zeros", trainable=True),
|
||||
}
|
||||
|
||||
@@ -36,12 +43,13 @@ class HRA:
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def forward(layer: nn.Linear, x, y):
|
||||
U = layer.lora_U # (r, d_out)
|
||||
Ry = y
|
||||
def forward_input(layer: nn.Linear, x: torch.Tensor) -> torch.Tensor:
|
||||
"""Apply x + gate * (Rx - x). gate=0 -> identity; nonzero -> full Householder chain."""
|
||||
U = layer.lora_U # (r, d_in)
|
||||
Rx = x
|
||||
for i in range(U.shape[0]):
|
||||
u = U[i]
|
||||
u = U[i] # (d_in,)
|
||||
sq = (u * u).sum().clamp_min(1e-12)
|
||||
coeff = einsum(Ry, u, "... o, o -> ...") * (2.0 / sq)
|
||||
Ry = Ry - coeff.unsqueeze(-1) * u
|
||||
return y + layer.lora_gate * (Ry - y)
|
||||
coeff = einsum(Rx, u, "... i, i -> ...") * (2.0 / sq)
|
||||
Rx = Rx - coeff.unsqueeze(-1) * u
|
||||
return x + layer.lora_gate * (Rx - x)
|
||||
|
||||
@@ -1,4 +1,22 @@
|
||||
"""IA3-style output gating. y_new = y * g, with g initialized to ones."""
|
||||
"""IA3-style output gating. Liu et al. 2022 https://arxiv.org/abs/2205.05638
|
||||
|
||||
y_new = y * g, g initialized to 1 (identity at t=0)
|
||||
|
||||
DEVIATION FROM PAPER:
|
||||
The original IA3 gates only three positions per transformer block:
|
||||
l_k * (k_proj output), l_v * (v_proj output), l_ff * (FFN intermediate after activation)
|
||||
This implementation gates ANY linear layer the targeting system selects.
|
||||
To match the paper exactly on a typical Llama/Qwen-style block, attach with:
|
||||
|
||||
cfg = LoraLiteConfig(
|
||||
variant="ia3",
|
||||
target_names=(r"\\.k_proj$", r"\\.v_proj$", r"\\.up_proj$"),
|
||||
target_roles=(),
|
||||
)
|
||||
|
||||
`up_proj` is the closest stand-in for "FFN intermediate" in gated-MLP blocks
|
||||
(Llama uses gate * up; gating the up branch is the IA3-spirit choice).
|
||||
"""
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
"""Vanilla LoRA. Reference variant. y = Wx + (alpha/r) * B @ A @ x."""
|
||||
"""Vanilla LoRA. Hu et al. 2021 https://arxiv.org/abs/2106.09685
|
||||
|
||||
h = W x + (alpha/r) B A x
|
||||
|
||||
Identity at t=0 from B=0. Faithful to the paper.
|
||||
"""
|
||||
from einops import einsum
|
||||
from torch import nn
|
||||
import torch
|
||||
|
||||
+41
-13
@@ -212,7 +212,7 @@ def structural_linear_like_test():
|
||||
|
||||
def bitsandbytes_cuda_smoke(require_bnb: bool):
|
||||
label = "required" if require_bnb else "optional"
|
||||
print(f"\n=== {label} bitsandbytes CUDA smoke ===")
|
||||
print(f"\n=== {label} bitsandbytes CUDA smoke (every variant) ===")
|
||||
if not torch.cuda.is_available():
|
||||
if require_bnb:
|
||||
raise RuntimeError("CUDA unavailable; required real bnb 4/8-bit smoke cannot run.")
|
||||
@@ -235,19 +235,47 @@ def bitsandbytes_cuda_smoke(require_bnb: bool):
|
||||
def forward(self, x):
|
||||
return self.layers[0](x)
|
||||
|
||||
# bnb-compatible: hook-only variants that never read layer.weight
|
||||
bnb_ok = ("lora", "delora", "ia3", "hra")
|
||||
# bnb-incompatible: variants that mutate or read dense weight in init()
|
||||
bnb_fail = ("pissa", "dora")
|
||||
|
||||
print(" SHOULD: bnb_ok variants {} -> identity_err==0 grad_nonzero=True".format(bnb_ok))
|
||||
print(" SHOULD: bnb_fail variants {} -> attach() raises (dequant required)".format(bnb_fail))
|
||||
|
||||
for layer_cls in (bnb.nn.Linear8bitLt, bnb.nn.Linear4bit):
|
||||
torch.manual_seed(0)
|
||||
model = BnbModel(layer_cls)
|
||||
x = torch.randn(2, 3, 8, device="cuda")
|
||||
y_base = model(x).detach()
|
||||
ll.attach(model, ll.LoraLiteConfig(variant="lora", r=2, alpha=4, dtype=torch.float16, target_roles=()))
|
||||
y = model(x)
|
||||
err = (y.detach() - y_base).abs().max().item()
|
||||
y.pow(2).mean().backward()
|
||||
grad_nonzero = model.layers[0].lora_B.grad.abs().sum().item() > 0
|
||||
print(f" {layer_cls.__name__}: identity_err={err:.3e} grad_nonzero={grad_nonzero}")
|
||||
assert err == 0.0
|
||||
assert grad_nonzero
|
||||
for variant in bnb_ok:
|
||||
torch.manual_seed(0)
|
||||
model = BnbModel(layer_cls)
|
||||
x = torch.randn(2, 3, 8, device="cuda")
|
||||
y_base = model(x).detach()
|
||||
cfg = ll.LoraLiteConfig(
|
||||
variant=variant, r=2, alpha=4, dtype=torch.float16, target_roles=(),
|
||||
variant_kwargs={"lambda0": 0.0} if variant == "delora" else {},
|
||||
)
|
||||
ll.attach(model, cfg)
|
||||
y = model(x)
|
||||
err = (y.detach() - y_base).abs().max().item()
|
||||
y.pow(2).mean().backward()
|
||||
# find any trainable lora_* with a grad
|
||||
grads = [(n, p.grad) for n, p in model.named_parameters() if "lora_" in n and p.requires_grad and p.grad is not None]
|
||||
grad_nonzero = any(g.abs().sum().item() > 0 for _, g in grads)
|
||||
print(f" {layer_cls.__name__:14s} {variant:6s}: identity_err={err:.3e} grad_nonzero={grad_nonzero}")
|
||||
assert err < 1e-2, f" bnb identity err too large for {variant}"
|
||||
assert grad_nonzero, f" no nonzero grad for {variant}"
|
||||
ll.detach(model)
|
||||
del model
|
||||
|
||||
for variant in bnb_fail:
|
||||
model = BnbModel(layer_cls)
|
||||
cfg = ll.LoraLiteConfig(variant=variant, r=2, alpha=2, dtype=torch.float16, target_roles=())
|
||||
try:
|
||||
ll.attach(model, cfg)
|
||||
except (TypeError, RuntimeError, AttributeError, ValueError) as e:
|
||||
print(f" {layer_cls.__name__:14s} {variant:6s}: fail-loud OK ({type(e).__name__})")
|
||||
else:
|
||||
raise AssertionError(f" {variant} on {layer_cls.__name__} should have failed loudly")
|
||||
del model
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
Reference in New Issue
Block a user