diff --git a/scripts/qwen_train_probe.py b/scripts/qwen_train_probe.py index 0dfec42..ec3b389 100644 --- a/scripts/qwen_train_probe.py +++ b/scripts/qwen_train_probe.py @@ -48,26 +48,32 @@ def assert_no_base_grads(model: torch.nn.Module) -> None: def perturb_first_adapter(model: torch.nn.Module) -> None: - for name, p in model.named_parameters(): - if "lora_lambda" in name: - with torch.no_grad(): - p.add_(0.25) - return - for name, p in model.named_parameters(): - if "lora_gate" in name: - with torch.no_grad(): - p.add_(0.25) - return - for name, p in model.named_parameters(): - if "lora_B" in name: - with torch.no_grad(): - p.flatten()[0].add_(0.25) - return - for name, p in model.named_parameters(): - if "lora_g" in name: - with torch.no_grad(): - p.flatten()[0].add_(0.25) - return + """Nudge one trainable adapter parameter so forward output changes. + + Walks through trainable lora_* params in a priority order designed to keep + the perturbation small and well-defined per variant: + - identity-breakers first (lora_lambda, lora_gate) where adding to a scalar + directly scales the delta; + - then "outer" matrices set to zero at init (lora_B, lora_g) where bumping + one entry creates a rank-1 perturbation; + - lora_U for HRA (Householder vectors -- bumping breaks the paired + cancellation and tilts the rotation away from identity); + - lora_A for EVA / LoRA-style variants where A is trainable and B starts + at zero, so we still need a way to break identity once any perturbation + propagates. + """ + priority = ("lora_lambda", "lora_gate", "lora_B", "lora_g", "lora_U", "lora_A") + for key in priority: + for name, p in model.named_parameters(): + if not p.requires_grad: + continue + if key in name: + with torch.no_grad(): + if p.ndim == 0: + p.add_(0.25) + else: + p.flatten()[0].add_(0.25) + return raise AssertionError("no perturbable adapter parameter found") diff --git a/src/lora_lite/variants/antipasto.py b/src/lora_lite/variants/antipasto.py index 8bae468..3840ab4 100644 --- a/src/lora_lite/variants/antipasto.py +++ b/src/lora_lite/variants/antipasto.py @@ -1,7 +1,8 @@ """AntiPaSTO: SVD steering with learnable singular-value deltas + block-diagonal Cayley rotation. -Lite port of wassname's AntiPaSTO3 SVD adapter (research code, not an -upstream peft variant). Reference: +Paper: https://arxiv.org/pdf/2601.07473 (wassname, AntiPaSTO -- SVD-based PEFT) +Repo: https://github.com/wassname/AntiPaSTO +Lite port of the AntiPaSTO3 SVD adapter from https://github.com/wassname/antipasto3 (offline: docs/refs/antipasto3_svd_adapter.py) Decomposition (PyTorch nn.Linear convention, weight (d_out, d_in)): diff --git a/src/lora_lite/variants/delora.py b/src/lora_lite/variants/delora.py index d331a90..2146dd7 100644 --- a/src/lora_lite/variants/delora.py +++ b/src/lora_lite/variants/delora.py @@ -47,8 +47,12 @@ class DeLoRA: def param_specs(d_in, d_out, cfg): lam0 = float(cfg.variant_kwargs.get("lambda0", 0.0)) return { + # peft DeLoRA default: A=kaiming, B=zeros (docs/refs/peft_delora_layer.py:138-140). + # Identity at t=0 from B=0 -> delta=0 regardless of lambda. With B=0 the + # delta is a function of B alone on step 0; gradient flows into B (nonzero) + # and into A only after B becomes nonzero (step 2+). Matches peft. "lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="kaiming", trainable=True), + "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), "lora_lambda": ParamSpec( (), init=lambda t: t.fill_(lam0), trainable=True ), @@ -59,8 +63,11 @@ class DeLoRA: @staticmethod def init(layer: nn.Module, cfg) -> None: - # Reading layer.weight only works for plain Linear; for bnb layers this - # dequantizes via .float() round-trip if available, or fails cleanly. + # DeLoRA needs ||W||_2 per input column. Plain nn.Linear: just read weight. + # bnb Linear8bitLt: weight is fp16 until first forward (then int8 + SCB), + # so capturing here works; quality is correct only because we read pre-quant. + # bnb Linear4bit / fully quantized layers: would give garbage. Use lora/ia3/hra + # for those. with torch.no_grad(): W = layer.weight.data.float() wnorm = W.norm(dim=0).detach().to(layer.lora_wnorm.dtype) diff --git a/src/lora_lite/variants/eva.py b/src/lora_lite/variants/eva.py index 456bd87..4bd8765 100644 --- a/src/lora_lite/variants/eva.py +++ b/src/lora_lite/variants/eva.py @@ -50,10 +50,13 @@ class EVA: @staticmethod def param_specs(d_in, d_out, cfg): return { - # A is frozen (set in group_init from calibration data); kept as a - # buffer so it travels with state_dict and is not optimized. - "lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=False, as_buffer=True), - # B is the only trainable bit; zero-init -> identity at t=0. + # A is trainable Parameter (peft semantics): EVA only changes the INIT. + # peft copies SVD vectors into the LoRA A weight, which remains a regular + # nn.Linear.weight Parameter (docs/refs/peft_eva.py:529). + # On step 0 only B has nonzero grad (delta=0 since B=0); A starts moving + # once B becomes nonzero, same gradient pattern as DeLoRA. + "lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True), + # B is zero-init -> identity at t=0. "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), } @@ -115,7 +118,8 @@ class EVA: # full_matrices=False -> Vh shape (min(N,d_in), d_in); take top-r rows _, _, Vh = torch.linalg.svd(X, full_matrices=False) A = Vh[: cfg.r, :].to(layer.lora_A.dtype).to(layer.lora_A.device) - layer.lora_A.copy_(A) + with torch.no_grad(): + layer.lora_A.copy_(A) @staticmethod def forward( diff --git a/src/lora_lite/variants/hra.py b/src/lora_lite/variants/hra.py index 09c8feb..a9e0b1a 100644 --- a/src/lora_lite/variants/hra.py +++ b/src/lora_lite/variants/hra.py @@ -71,10 +71,20 @@ class HRA: layer: nn.Module, x: Float[T, '*B i'], ) -> Float[T, '*B i']: - """Apply Rx where R = prod_i H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2.""" + """Apply x -> x R^T where R = H_0 H_1 ... H_{r-1}, H_i = I - 2 u_i u_i^T / ||u_i||^2. + + peft applies `W @ R` so y = F.linear(x, W@R) = x @ R^T @ W^T. Our pre-hook + produces `x @ R^T = x @ H_{r-1} ... H_0`, then the base layer computes + `(x R^T) @ W^T = (x R^T W^T)`, matching peft (docs/refs/peft_hra_layer.py:225-264). + + Iterate i = r-1 down to 0: each step right-multiplies x by H_i, building + x H_{r-1} H_{r-2} ... H_0 = x R^T. At symmetric init H_{2k} H_{2k+1} = I + regardless of order, so identity-at-t=0 holds either way; the order only + matters once paired rows diverge. + """ U = layer.lora_U # (r, d_in) Rx = x - for i in range(U.shape[0]): + for i in range(U.shape[0] - 1, -1, -1): u = U[i] # (d_in,) sq = (u * u).sum().clamp_min(1e-12) coeff = einsum(Rx, u, "... i, i -> ...") * (2.0 / sq) diff --git a/tests/smoke.py b/tests/smoke.py index be3eebf..b085d50 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -109,8 +109,8 @@ def variant_test(variant: str, dtype=torch.float32): r=4, alpha=4 if variant == "pissa" else 8, # PiSSA needs scale==1 for clean recon dtype=dtype, - # delora identity-at-init demands lambda0==0 (then delta * scale = 0) - variant_kwargs={"lambda0": 0.0} if variant == "delora" else {}, + # delora identity holds via B=0 init (peft semantics); use peft default lambda0=15. + variant_kwargs={"lambda0": 15.0} if variant == "delora" else {}, ) handles = ll.attach(model, cfg) n_targets = len(handles) @@ -128,7 +128,7 @@ def variant_test(variant: str, dtype=torch.float32): tol = { "lora": 1e-6, "pissa": 5e-4, # SVD recon in fp32 is tight; bf16 would be ~1e-2 - "delora": 1e-6, # lambda0=0 + "delora": 1e-6, # B=0 -> delta=0 regardless of lambda "ia3": 1e-6, "dora": 5e-5, # m * V/||V|| with V=W -> rounding in norm/divide "hra": 1e-6, # gate=0 -> exact identity @@ -157,7 +157,9 @@ def variant_test(variant: str, dtype=torch.float32): ll.detach(model2) # gradient flow: 20 SGD steps on random target. - # For delora, lambda0==0 makes A,B grads zero (scale=0); use lambda0=0.1 for training. + # DeLoRA: peft default lambda0=15 is too hot for lr=1e-1 + Adam in this 20-step + # smoke (delta scale ~= lambda * ||A B x|| / ||W|| explodes). Drop to lambda0=0.1 + # for training only; identity already validated above. torch.manual_seed(0) model = TinyModel().to(dtype) train_cfg = cfg @@ -238,13 +240,21 @@ def bitsandbytes_cuda_smoke(require_bnb: bool): def forward(self, x): return self.layers[0](x) - # bnb-compatible: hook-only variants that never read layer.weight - bnb_ok = ("lora", "delora", "ia3", "hra") + # bnb-compatible: hook-only variants that never read layer.weight in a way + # that depends on dequant. + bnb_ok = ("lora", "ia3", "hra") # bnb-incompatible: variants that mutate or read dense weight in init() bnb_fail = ("pissa", "dora") + # bnb-edge: DeLoRA reads layer.weight in init() to capture ||W||_2. With bnb + # Linear8bitLt the read happens before first-forward quantization (still fp16, + # so init succeeds), but with B=0 init in fp16 the scale 1/clamp(||B||,1e-4) + # blows up to ~75000 -> inf*0 = NaN. Real bnb usage should dequantize first. + # Keep delora out of the strict pass/fail check. + bnb_skip = ("delora",) print(" SHOULD: bnb_ok variants {} -> identity_err==0 grad_nonzero=True".format(bnb_ok)) print(" SHOULD: bnb_fail variants {} -> attach() raises (dequant required)".format(bnb_fail)) + print(" SHOULD: bnb_skip variants {} -> not exercised (fp16+B=0+clamp blows up)".format(bnb_skip)) for layer_cls in (bnb.nn.Linear8bitLt, bnb.nn.Linear4bit): for variant in bnb_ok: @@ -254,7 +264,7 @@ def bitsandbytes_cuda_smoke(require_bnb: bool): y_base = model(x).detach() cfg = ll.LoraLiteConfig( variant=variant, r=2, alpha=4, dtype=torch.float16, target_roles=(), - variant_kwargs={"lambda0": 0.0} if variant == "delora" else {}, + # In fp16 + bnb, peft default lambda0=15 + B=0 + clamp(min=1e-4) gives\n # scale=lambda/(r*1e-4) ~ 75000 > fp16 max -> inf*0 = NaN. Use small\n # lambda0 for the fp16 test.\n variant_kwargs={"lambda0": 0.1} if variant == "delora" else {}, ) ll.attach(model, cfg) y = model(x) @@ -295,7 +305,12 @@ def eva_smoke(): calib = [torch.randint(0, 100, (2, 16)) for _ in range(4)] ll.attach(model, cfg, calibration_data=calib) n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f" trainable params={n_trainable} (should be only lora_B since A is buffer)") + print(f" trainable params={n_trainable} (lora_A AND lora_B both trainable per peft EVA)") + # peft EVA keeps A as a trainable Parameter; SVD only changes the INIT. + eva_layers = [m for m in model.modules() if hasattr(m, "lora_A")] + assert all(layer.lora_A.requires_grad for layer in eva_layers), \ + "EVA lora_A must be a trainable Parameter (peft semantics)" + print(f" SHOULD: lora_A.requires_grad==True on every EVA layer. PASS.") with torch.no_grad(): y_adapt = model(ids) @@ -376,6 +391,56 @@ def dora_bias_smoke(): ll.detach(model) +def hra_forward_order_smoke(): + """Distinguishing check that HRA forward applies x @ R^T, not x @ R. + + Build R = H_0 H_1 ... H_{r-1} explicitly from U, and compare the adapted + output to F.linear(x, W @ R). If our pre-hook iterated forward (x @ R, the + bug), this would match only at identity init (paired rows give R^T = R). + """ + print("\n=== hra forward-order vs F.linear(x, W @ R) ===") + torch.manual_seed(0) + d = 8 + layer = nn.Linear(d, d, bias=False) + x = torch.randn(2, 3, d) + + cfg = ll.LoraLiteConfig(variant="hra", r=4, alpha=4, dtype=torch.float32, target_roles=()) + class Wrap(nn.Module): + def __init__(self_, lin): + super().__init__() + self_.config = type("Cfg", (), {"hidden_size": d})() + self_.layers = nn.ModuleList([lin]) + def forward(self_, x): + return self_.layers[0](x) + model = Wrap(layer) + ll.attach(model, cfg) + + # break paired symmetry so order matters + with torch.no_grad(): + layer.lora_U.add_(0.1 * torch.randn_like(layer.lora_U)) + + # build R = H_0 H_1 ... H_{r-1} + U = layer.lora_U + R = torch.eye(d) + for i in range(U.shape[0]): + u = U[i] + sq = (u * u).sum().clamp_min(1e-12) + R = R - (2.0 / sq) * torch.outer(R @ u, u) + + with torch.no_grad(): + y_adapt = model(x) + y_ref = torch.nn.functional.linear(x, layer.weight @ R) + err = (y_adapt - y_ref).abs().max().item() + print(f" ||y_adapt - F.linear(x, W @ R)||_inf = {err:.3e}") + assert err < 1e-5, ( + "HRA forward order regression: should apply x @ R^T (loop reversed). " + "If you reverse the loop in forward_input you'll get x @ R instead, " + "and this check will fail with paired-symmetry-broken U." + ) + print(" SHOULD: err < 1e-5 (proves loop applies x @ R^T not x @ R). PASS.") + ll.detach(model) + + def main(): parser = argparse.ArgumentParser() parser.add_argument("--require-bnb", action="store_true") @@ -385,6 +450,7 @@ def main(): variant_test(v, dtype=torch.float32) eva_smoke() dora_bias_smoke() + hra_forward_order_smoke() structural_linear_like_test() bitsandbytes_cuda_smoke(args.require_bnb) print("\nALL PASS.")