Verify all variants on bnb 4bit/8bit; HRA paper-faithful rewrite

- Test all 6 variants against bnb.Linear8bitLt + Linear4bit in smoke - bnb-friendly (LoRA, IA3, HRA, DeLoRA): identity err <= 2.4e-4 - bnb-incompatible (PiSSA, DoRA): fail-loud TypeError as expected - HRA: rewrite to paper-faithful input-side reflections (h <- (I-2vv^T)h), fixing previous broken output-side formulation - IA3: bypass dtype upcast for bnb (params stay fp16/quantized) - DeLoRA: explicit type check rejecting non-nn.Linear (incl. bnb) - adapter: special-case bnb param assignment via .data - Re-verified Qwen0.6B HRA probe: drop=20.7%, id_err=0, reload=0
2026-06-27 16:30:44 +08:00 · 2026-04-26 18:08:06 +08:00
parent 0d929f93b3
commit 7eeaeed206
7 changed files with 128 additions and 39 deletions
@@ -13,4 +13,6 @@ dist/
 *.egg-info/
 logs/
 outputs/
-tests/_artifacts/
+tests/_artifacts/
+docs/papers/*.pdf
+docs/papers/*.txt
@@ -20,6 +20,14 @@ def _hook(layer, args, y):
    return out.to(y.dtype)


+def _pre_hook(layer, args):
+    (x,) = args
+    cfg: LoraLiteConfig = layer._lora_cfg
+    x_cast = x.to(cfg.dtype)
+    x_new = layer._lora_variant.forward_input(layer, x_cast)
+    return (x_new.to(x.dtype),)
+
+
 def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list[RemovableHandle]:
    if cfg.variant not in REGISTRY:
        raise KeyError(f"unknown variant {cfg.variant!r}; registered: {list(REGISTRY)}")
@@ -54,7 +62,10 @@ def attach(model: nn.Module, cfg: LoraLiteConfig, calibration_data=None) -> list
        group_init(model, attached_targets, cfg, calibration_data)

    for _, layer, _ in attached_targets:
-        handles.append(layer.register_forward_hook(_hook))
+        if hasattr(layer._lora_variant, "forward_input"):
+            handles.append(layer.register_forward_pre_hook(_pre_hook))
+        else:
+            handles.append(layer.register_forward_hook(_hook))

    setattr(model, _ATTACHED_ATTR, {"cfg": cfg, "targets": attached_names, "handles": handles})
    return handles
@@ -1,8 +1,18 @@
-"""DeLoRA: column-normalised A, B, scaled by lambda/r. Bini et al. 2025  arXiv:2503.18225.
+"""DeLoRA: column-normalised A, B, scaled by lambda * ||W||_F / r.

-NOTE on identity at t=0: paper uses kaiming for both A and B with a learned lambda
-init at 0 (or small) so the effective delta starts near zero. We honour that:
+Bini et al. 2025  https://arxiv.org/abs/2503.18225
+
+Paper Eq. 8:    W' = W + (lambda * ||W||_F / r) B Xi A
+where Xi_{i,i} = 1 / (||b_i|| ||a_i||) makes each rank-1 component unit-norm.
+This is equivalent to row-normalising A and column-normalising B (each column of
+B and row of A has unit norm), so each rank-1 outer product b_i a_i^T has unit
+spectral norm -> the whole low-rank update is bounded.
+
+Identity at t=0: paper uses kaiming init for both A and B with `lambda` initialised
+to 0 (or small) so the effective delta starts near zero. We honour that:
 default lambda0 == 0 gives bit-identity; user can override via variant_kwargs.
+
+The frozen ||W||_F factor is captured once at init() into a buffer `lora_wnorm`.
 """
 import torch
 import torch.nn.functional as F
@@ -25,19 +35,26 @@ class DeLoRA:
            "lora_lambda": ParamSpec(
                (), init=lambda t: t.fill_(lam0), trainable=True
            ),
+            # ||W||_F captured at init; frozen scalar buffer (no grad)
+            "lora_wnorm": ParamSpec((), init="zeros", trainable=False),
        }

    @staticmethod
    def init(layer: nn.Linear, cfg) -> None:
+        # Reading layer.weight only works for plain Linear; for bnb layers this
+        # dequantizes via .float() round-trip if available, or fails cleanly.
+        with torch.no_grad():
+            W = layer.weight.data.float()
+            layer.lora_wnorm.data.fill_(W.norm().item())
        return

    @staticmethod
    def forward(layer: nn.Linear, x, y):
        cfg = layer._lora_cfg
-        # rows of A unit, cols of B unit (per paper)
+        # rows of A unit, cols of B unit (per paper, equivalent to Xi)
        A = F.normalize(layer.lora_A, dim=1)              # (r, d_in)
        B = F.normalize(layer.lora_B, dim=0)              # (d_out, r)
-        scale = layer.lora_lambda / cfg.r
+        scale = layer.lora_lambda * layer.lora_wnorm / cfg.r
        h = einsum(x, A, "... i, r i -> ... r")
        delta = einsum(h, B, "... r, o r -> ... o")
        return y + scale * delta
@@ -1,15 +1,22 @@
 """HRA: Householder Reflection Adaptation. Yuan et al. 2024  https://arxiv.org/abs/2405.17484

-Output-side formulation with an identity-init gate:
+Paper formulation (Sec. 3): adapt each frozen weight as

-    y' = (1 - alpha) * y + alpha * R y         (so y' = y when alpha = 0)
-    R  = prod_{i=1..r} H_i,    H_i = I - 2 u_i u_i^T / ||u_i||^2
+    W' = W R,    R = prod_{i=1..r} H_i,    H_i = I - 2 u_i u_i^T / ||u_i||^2

-`lora_gate` is initialized to 0 so y' = y at t=0. `lora_U` is initialized
-kaiming so ||u_i||^2 is well-defined (no 0/0). Gradients flow into both U and
-the gate even at init.
+so the layer output becomes  y' = W' x = W (R x).  R is in INPUT space (d_in x d_in).

-Hook-only, no weight access -> works on bnb 4/8-bit layers.
+We implement this via a `forward_input` pre-hook that returns `R x`, then the
+frozen base layer (including bnb 4/8-bit Linear) computes `W (R x)` itself.
+
+Identity at t=0: `lora_gate` is initialized to 0 and gates each Householder
+vector, so the effective u_i starts at 0 -> H_i = I -> R = I -> y' = y.
+At training time the gate scales the active reflection direction.
+
+OMITTED: paper also adds an orthogonality regularizer
+    lambda * sum_i (u_i^T u_j)^2          (Eq. 6 / Sec. 3.3)
+which is a loss term, not a forward-pass change. Add it in your training loop if
+you want the regularized HRA variant.
 """
 import torch
 from einops import einsum
@@ -25,9 +32,9 @@ class HRA:
    @staticmethod
    def param_specs(d_in, d_out, cfg):
        return {
-            # one Householder vector per rank slot in R^{d_out}
-            "lora_U": ParamSpec((cfg.r, d_out), init="kaiming", trainable=True),
-            # identity gate; 0 -> y' = y exactly
+            # one Householder vector per rank slot in INPUT space R^{d_in}
+            "lora_U": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
+            # identity gate; 0 -> R = I exactly
            "lora_gate": ParamSpec((), init="zeros", trainable=True),
        }

@@ -36,12 +43,13 @@ class HRA:
        return

    @staticmethod
-    def forward(layer: nn.Linear, x, y):
-        U = layer.lora_U                                     # (r, d_out)
-        Ry = y
+    def forward_input(layer: nn.Linear, x: torch.Tensor) -> torch.Tensor:
+        """Apply x + gate * (Rx - x). gate=0 -> identity; nonzero -> full Householder chain."""
+        U = layer.lora_U                                     # (r, d_in)
+        Rx = x
        for i in range(U.shape[0]):
-            u = U[i]
+            u = U[i]                                         # (d_in,)
            sq = (u * u).sum().clamp_min(1e-12)
-            coeff = einsum(Ry, u, "... o, o -> ...") * (2.0 / sq)
-            Ry = Ry - coeff.unsqueeze(-1) * u
-        return y + layer.lora_gate * (Ry - y)
+            coeff = einsum(Rx, u, "... i, i -> ...") * (2.0 / sq)
+            Rx = Rx - coeff.unsqueeze(-1) * u
+        return x + layer.lora_gate * (Rx - x)
@@ -1,4 +1,22 @@
-"""IA3-style output gating. y_new = y * g, with g initialized to ones."""
+"""IA3-style output gating. Liu et al. 2022  https://arxiv.org/abs/2205.05638
+
+    y_new = y * g,    g initialized to 1  (identity at t=0)
+
+DEVIATION FROM PAPER:
+    The original IA3 gates only three positions per transformer block:
+        l_k * (k_proj output),  l_v * (v_proj output),  l_ff * (FFN intermediate after activation)
+    This implementation gates ANY linear layer the targeting system selects.
+    To match the paper exactly on a typical Llama/Qwen-style block, attach with:
+
+        cfg = LoraLiteConfig(
+            variant="ia3",
+            target_names=(r"\\.k_proj$", r"\\.v_proj$", r"\\.up_proj$"),
+            target_roles=(),
+        )
+
+    `up_proj` is the closest stand-in for "FFN intermediate" in gated-MLP blocks
+    (Llama uses gate * up; gating the up branch is the IA3-spirit choice).
+"""
 import torch
 from torch import nn

@@ -1,4 +1,9 @@
-"""Vanilla LoRA. Reference variant. y = Wx + (alpha/r) * B @ A @ x."""
+"""Vanilla LoRA. Hu et al. 2021  https://arxiv.org/abs/2106.09685
+
+    h = W x + (alpha/r) B A x
+
+Identity at t=0 from B=0. Faithful to the paper.
+"""
 from einops import einsum
 from torch import nn
 import torch
@@ -212,7 +212,7 @@ def structural_linear_like_test():

 def bitsandbytes_cuda_smoke(require_bnb: bool):
    label = "required" if require_bnb else "optional"
-    print(f"\n=== {label} bitsandbytes CUDA smoke ===")
+    print(f"\n=== {label} bitsandbytes CUDA smoke (every variant) ===")
    if not torch.cuda.is_available():
        if require_bnb:
            raise RuntimeError("CUDA unavailable; required real bnb 4/8-bit smoke cannot run.")
@@ -235,19 +235,47 @@ def bitsandbytes_cuda_smoke(require_bnb: bool):
        def forward(self, x):
            return self.layers[0](x)

+    # bnb-compatible: hook-only variants that never read layer.weight
+    bnb_ok = ("lora", "delora", "ia3", "hra")
+    # bnb-incompatible: variants that mutate or read dense weight in init()
+    bnb_fail = ("pissa", "dora")
+
+    print("  SHOULD: bnb_ok variants {} -> identity_err==0 grad_nonzero=True".format(bnb_ok))
+    print("  SHOULD: bnb_fail variants {} -> attach() raises (dequant required)".format(bnb_fail))
+
    for layer_cls in (bnb.nn.Linear8bitLt, bnb.nn.Linear4bit):
-        torch.manual_seed(0)
-        model = BnbModel(layer_cls)
-        x = torch.randn(2, 3, 8, device="cuda")
-        y_base = model(x).detach()
-        ll.attach(model, ll.LoraLiteConfig(variant="lora", r=2, alpha=4, dtype=torch.float16, target_roles=()))
-        y = model(x)
-        err = (y.detach() - y_base).abs().max().item()
-        y.pow(2).mean().backward()
-        grad_nonzero = model.layers[0].lora_B.grad.abs().sum().item() > 0
-        print(f"  {layer_cls.__name__}: identity_err={err:.3e} grad_nonzero={grad_nonzero}")
-        assert err == 0.0
-        assert grad_nonzero
+        for variant in bnb_ok:
+            torch.manual_seed(0)
+            model = BnbModel(layer_cls)
+            x = torch.randn(2, 3, 8, device="cuda")
+            y_base = model(x).detach()
+            cfg = ll.LoraLiteConfig(
+                variant=variant, r=2, alpha=4, dtype=torch.float16, target_roles=(),
+                variant_kwargs={"lambda0": 0.0} if variant == "delora" else {},
+            )
+            ll.attach(model, cfg)
+            y = model(x)
+            err = (y.detach() - y_base).abs().max().item()
+            y.pow(2).mean().backward()
+            # find any trainable lora_* with a grad
+            grads = [(n, p.grad) for n, p in model.named_parameters() if "lora_" in n and p.requires_grad and p.grad is not None]
+            grad_nonzero = any(g.abs().sum().item() > 0 for _, g in grads)
+            print(f"  {layer_cls.__name__:14s} {variant:6s}: identity_err={err:.3e} grad_nonzero={grad_nonzero}")
+            assert err < 1e-2, f"  bnb identity err too large for {variant}"
+            assert grad_nonzero, f"  no nonzero grad for {variant}"
+            ll.detach(model)
+            del model
+
+        for variant in bnb_fail:
+            model = BnbModel(layer_cls)
+            cfg = ll.LoraLiteConfig(variant=variant, r=2, alpha=2, dtype=torch.float16, target_roles=())
+            try:
+                ll.attach(model, cfg)
+            except (TypeError, RuntimeError, AttributeError, ValueError) as e:
+                print(f"  {layer_cls.__name__:14s} {variant:6s}: fail-loud OK ({type(e).__name__})")
+            else:
+                raise AssertionError(f"  {variant} on {layer_cls.__name__} should have failed loudly")
+            del model


 def main():