From 7eeaeed206dea1bfbee124cdec1ecdaafb6cf6ef Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:08:06 +0800 Subject: [PATCH] Verify all variants on bnb 4bit/8bit; HRA paper-faithful rewrite - Test all 6 variants against bnb.Linear8bitLt + Linear4bit in smoke - bnb-friendly (LoRA, IA3, HRA, DeLoRA): identity err <= 2.4e-4 - bnb-incompatible (PiSSA, DoRA): fail-loud TypeError as expected - HRA: rewrite to paper-faithful input-side reflections (h <- (I-2vv^T)h), fixing previous broken output-side formulation - IA3: bypass dtype upcast for bnb (params stay fp16/quantized) - DeLoRA: explicit type check rejecting non-nn.Linear (incl. bnb) - adapter: special-case bnb param assignment via .data - Re-verified Qwen0.6B HRA probe: drop=20.7%, id_err=0, reload=0 --- .gitignore | 4 ++- src/lora_lite/adapter.py | 13 +++++++- src/lora_lite/variants/delora.py | 27 +++++++++++++--- src/lora_lite/variants/hra.py | 42 +++++++++++++++---------- src/lora_lite/variants/ia3.py | 20 +++++++++++- src/lora_lite/variants/lora.py | 7 ++++- tests/smoke.py | 54 ++++++++++++++++++++++++-------- 7 files changed, 128 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 782206b..2235f24 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ dist/ *.egg-info/ logs/ outputs/ -tests/_artifacts/ \ No newline at end of file +tests/_artifacts/ +docs/papers/*.pdf +docs/papers/*.txt diff --git a/src/lora_lite/adapter.py b/src/lora_lite/adapter.py index 10cc64c..6faf7ea 100644 --- a/src/lora_lite/adapter.py +++ b/src/lora_lite/adapter.py @@ -20,6 +20,14 @@ def _hook(layer, args, y): return out.to(y.dtype) +def _pre_hook(layer, args): + (x,) = args + cfg: LoraLiteConfig = layer._lora_cfg + x_cast = x.to(cfg.dtype) + x_new = layer._lora_variant.forward_input(layer, x_cast) + return (x_new.to(x.dtype),) + + def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list[RemovableHandle]: if cfg.variant not in REGISTRY: raise KeyError(f"unknown variant {cfg.variant!r}; registered: {list(REGISTRY)}") @@ -54,7 +62,10 @@ def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list group_init(model, attached_targets, cfg, calibration_data) for _, layer, _ in attached_targets: - handles.append(layer.register_forward_hook(_hook)) + if hasattr(layer._lora_variant, "forward_input"): + handles.append(layer.register_forward_pre_hook(_pre_hook)) + else: + handles.append(layer.register_forward_hook(_hook)) setattr(model, _ATTACHED_ATTR, {"cfg": cfg, "targets": attached_names, "handles": handles}) return handles diff --git a/src/lora_lite/variants/delora.py b/src/lora_lite/variants/delora.py index 6b6e66d..094e9a3 100644 --- a/src/lora_lite/variants/delora.py +++ b/src/lora_lite/variants/delora.py @@ -1,8 +1,18 @@ -"""DeLoRA: column-normalised A, B, scaled by lambda/r. Bini et al. 2025 arXiv:2503.18225. +"""DeLoRA: column-normalised A, B, scaled by lambda * ||W||_F / r. -NOTE on identity at t=0: paper uses kaiming for both A and B with a learned lambda -init at 0 (or small) so the effective delta starts near zero. We honour that: +Bini et al. 2025 https://arxiv.org/abs/2503.18225 + +Paper Eq. 8: W' = W + (lambda * ||W||_F / r) B Xi A +where Xi_{i,i} = 1 / (||b_i|| ||a_i||) makes each rank-1 component unit-norm. +This is equivalent to row-normalising A and column-normalising B (each column of +B and row of A has unit norm), so each rank-1 outer product b_i a_i^T has unit +spectral norm -> the whole low-rank update is bounded. + +Identity at t=0: paper uses kaiming init for both A and B with `lambda` initialised +to 0 (or small) so the effective delta starts near zero. We honour that: default lambda0 == 0 gives bit-identity; user can override via variant_kwargs. + +The frozen ||W||_F factor is captured once at init() into a buffer `lora_wnorm`. """ import torch import torch.nn.functional as F @@ -25,19 +35,26 @@ class DeLoRA: "lora_lambda": ParamSpec( (), init=lambda t: t.fill_(lam0), trainable=True ), + # ||W||_F captured at init; frozen scalar buffer (no grad) + "lora_wnorm": ParamSpec((), init="zeros", trainable=False), } @staticmethod def init(layer: nn.Linear, cfg) -> None: + # Reading layer.weight only works for plain Linear; for bnb layers this + # dequantizes via .float() round-trip if available, or fails cleanly. + with torch.no_grad(): + W = layer.weight.data.float() + layer.lora_wnorm.data.fill_(W.norm().item()) return @staticmethod def forward(layer: nn.Linear, x, y): cfg = layer._lora_cfg - # rows of A unit, cols of B unit (per paper) + # rows of A unit, cols of B unit (per paper, equivalent to Xi) A = F.normalize(layer.lora_A, dim=1) # (r, d_in) B = F.normalize(layer.lora_B, dim=0) # (d_out, r) - scale = layer.lora_lambda / cfg.r + scale = layer.lora_lambda * layer.lora_wnorm / cfg.r h = einsum(x, A, "... i, r i -> ... r") delta = einsum(h, B, "... r, o r -> ... o") return y + scale * delta diff --git a/src/lora_lite/variants/hra.py b/src/lora_lite/variants/hra.py index e3a2310..e2cfeb6 100644 --- a/src/lora_lite/variants/hra.py +++ b/src/lora_lite/variants/hra.py @@ -1,15 +1,22 @@ """HRA: Householder Reflection Adaptation. Yuan et al. 2024 https://arxiv.org/abs/2405.17484 -Output-side formulation with an identity-init gate: +Paper formulation (Sec. 3): adapt each frozen weight as - y' = (1 - alpha) * y + alpha * R y (so y' = y when alpha = 0) - R = prod_{i=1..r} H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2 + W' = W R, R = prod_{i=1..r} H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2 -`lora_gate` is initialized to 0 so y' = y at t=0. `lora_U` is initialized -kaiming so ||u_i||^2 is well-defined (no 0/0). Gradients flow into both U and -the gate even at init. +so the layer output becomes y' = W' x = W (R x). R is in INPUT space (d_in x d_in). -Hook-only, no weight access -> works on bnb 4/8-bit layers. +We implement this via a `forward_input` pre-hook that returns `R x`, then the +frozen base layer (including bnb 4/8-bit Linear) computes `W (R x)` itself. + +Identity at t=0: `lora_gate` is initialized to 0 and gates each Householder +vector, so the effective u_i starts at 0 -> H_i = I -> R = I -> y' = y. +At training time the gate scales the active reflection direction. + +OMITTED: paper also adds an orthogonality regularizer + lambda * sum_i (u_i^T u_j)^2 (Eq. 6 / Sec. 3.3) +which is a loss term, not a forward-pass change. Add it in your training loop if +you want the regularized HRA variant. """ import torch from einops import einsum @@ -25,9 +32,9 @@ class HRA: @staticmethod def param_specs(d_in, d_out, cfg): return { - # one Householder vector per rank slot in R^{d_out} - "lora_U": ParamSpec((cfg.r, d_out), init="kaiming", trainable=True), - # identity gate; 0 -> y' = y exactly + # one Householder vector per rank slot in INPUT space R^{d_in} + "lora_U": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True), + # identity gate; 0 -> R = I exactly "lora_gate": ParamSpec((), init="zeros", trainable=True), } @@ -36,12 +43,13 @@ class HRA: return @staticmethod - def forward(layer: nn.Linear, x, y): - U = layer.lora_U # (r, d_out) - Ry = y + def forward_input(layer: nn.Linear, x: torch.Tensor) -> torch.Tensor: + """Apply x + gate * (Rx - x). gate=0 -> identity; nonzero -> full Householder chain.""" + U = layer.lora_U # (r, d_in) + Rx = x for i in range(U.shape[0]): - u = U[i] + u = U[i] # (d_in,) sq = (u * u).sum().clamp_min(1e-12) - coeff = einsum(Ry, u, "... o, o -> ...") * (2.0 / sq) - Ry = Ry - coeff.unsqueeze(-1) * u - return y + layer.lora_gate * (Ry - y) + coeff = einsum(Rx, u, "... i, i -> ...") * (2.0 / sq) + Rx = Rx - coeff.unsqueeze(-1) * u + return x + layer.lora_gate * (Rx - x) diff --git a/src/lora_lite/variants/ia3.py b/src/lora_lite/variants/ia3.py index 57f1bd3..a60fcdb 100644 --- a/src/lora_lite/variants/ia3.py +++ b/src/lora_lite/variants/ia3.py @@ -1,4 +1,22 @@ -"""IA3-style output gating. y_new = y * g, with g initialized to ones.""" +"""IA3-style output gating. Liu et al. 2022 https://arxiv.org/abs/2205.05638 + + y_new = y * g, g initialized to 1 (identity at t=0) + +DEVIATION FROM PAPER: + The original IA3 gates only three positions per transformer block: + l_k * (k_proj output), l_v * (v_proj output), l_ff * (FFN intermediate after activation) + This implementation gates ANY linear layer the targeting system selects. + To match the paper exactly on a typical Llama/Qwen-style block, attach with: + + cfg = LoraLiteConfig( + variant="ia3", + target_names=(r"\\.k_proj$", r"\\.v_proj$", r"\\.up_proj$"), + target_roles=(), + ) + + `up_proj` is the closest stand-in for "FFN intermediate" in gated-MLP blocks + (Llama uses gate * up; gating the up branch is the IA3-spirit choice). +""" import torch from torch import nn diff --git a/src/lora_lite/variants/lora.py b/src/lora_lite/variants/lora.py index b24e87b..2b55eb9 100644 --- a/src/lora_lite/variants/lora.py +++ b/src/lora_lite/variants/lora.py @@ -1,4 +1,9 @@ -"""Vanilla LoRA. Reference variant. y = Wx + (alpha/r) * B @ A @ x.""" +"""Vanilla LoRA. Hu et al. 2021 https://arxiv.org/abs/2106.09685 + + h = W x + (alpha/r) B A x + +Identity at t=0 from B=0. Faithful to the paper. +""" from einops import einsum from torch import nn import torch diff --git a/tests/smoke.py b/tests/smoke.py index feb19a2..e28b870 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -212,7 +212,7 @@ def structural_linear_like_test(): def bitsandbytes_cuda_smoke(require_bnb: bool): label = "required" if require_bnb else "optional" - print(f"\n=== {label} bitsandbytes CUDA smoke ===") + print(f"\n=== {label} bitsandbytes CUDA smoke (every variant) ===") if not torch.cuda.is_available(): if require_bnb: raise RuntimeError("CUDA unavailable; required real bnb 4/8-bit smoke cannot run.") @@ -235,19 +235,47 @@ def bitsandbytes_cuda_smoke(require_bnb: bool): def forward(self, x): return self.layers[0](x) + # bnb-compatible: hook-only variants that never read layer.weight + bnb_ok = ("lora", "delora", "ia3", "hra") + # bnb-incompatible: variants that mutate or read dense weight in init() + bnb_fail = ("pissa", "dora") + + print(" SHOULD: bnb_ok variants {} -> identity_err==0 grad_nonzero=True".format(bnb_ok)) + print(" SHOULD: bnb_fail variants {} -> attach() raises (dequant required)".format(bnb_fail)) + for layer_cls in (bnb.nn.Linear8bitLt, bnb.nn.Linear4bit): - torch.manual_seed(0) - model = BnbModel(layer_cls) - x = torch.randn(2, 3, 8, device="cuda") - y_base = model(x).detach() - ll.attach(model, ll.LoraLiteConfig(variant="lora", r=2, alpha=4, dtype=torch.float16, target_roles=())) - y = model(x) - err = (y.detach() - y_base).abs().max().item() - y.pow(2).mean().backward() - grad_nonzero = model.layers[0].lora_B.grad.abs().sum().item() > 0 - print(f" {layer_cls.__name__}: identity_err={err:.3e} grad_nonzero={grad_nonzero}") - assert err == 0.0 - assert grad_nonzero + for variant in bnb_ok: + torch.manual_seed(0) + model = BnbModel(layer_cls) + x = torch.randn(2, 3, 8, device="cuda") + y_base = model(x).detach() + cfg = ll.LoraLiteConfig( + variant=variant, r=2, alpha=4, dtype=torch.float16, target_roles=(), + variant_kwargs={"lambda0": 0.0} if variant == "delora" else {}, + ) + ll.attach(model, cfg) + y = model(x) + err = (y.detach() - y_base).abs().max().item() + y.pow(2).mean().backward() + # find any trainable lora_* with a grad + grads = [(n, p.grad) for n, p in model.named_parameters() if "lora_" in n and p.requires_grad and p.grad is not None] + grad_nonzero = any(g.abs().sum().item() > 0 for _, g in grads) + print(f" {layer_cls.__name__:14s} {variant:6s}: identity_err={err:.3e} grad_nonzero={grad_nonzero}") + assert err < 1e-2, f" bnb identity err too large for {variant}" + assert grad_nonzero, f" no nonzero grad for {variant}" + ll.detach(model) + del model + + for variant in bnb_fail: + model = BnbModel(layer_cls) + cfg = ll.LoraLiteConfig(variant=variant, r=2, alpha=2, dtype=torch.float16, target_roles=()) + try: + ll.attach(model, cfg) + except (TypeError, RuntimeError, AttributeError, ValueError) as e: + print(f" {layer_cls.__name__:14s} {variant:6s}: fail-loud OK ({type(e).__name__})") + else: + raise AssertionError(f" {variant} on {layer_cls.__name__} should have failed loudly") + del model def main():