diff --git a/src/lora_lite/adapter.py b/src/lora_lite/adapter.py index 6dbc0b1..2b9114c 100644 --- a/src/lora_lite/adapter.py +++ b/src/lora_lite/adapter.py @@ -98,34 +98,12 @@ def detach(model: nn.Module) -> None: delattr(model, _ATTACHED_ATTR) -def _base_weight_fingerprint(model: nn.Module) -> dict[str, str]: - """Per-target fingerprint of the (post-init) base weights so PiSSA-style - variants that mutate `layer.weight` can fail loud on base mismatch. - Uses a cheap fp32 sum-of-squares + shape signature; not cryptographic. - """ - state = getattr(model, _ATTACHED_ATTR, None) - if state is None: - return {} - fp = {} - for name, layer in model.named_modules(): - if not hasattr(layer, "_lora_variant"): - continue - if name not in state["targets"]: - continue - w = layer.weight.detach().to(torch.float32, copy=False) - fp[name] = f"{tuple(w.shape)}|{float((w * w).sum()):.6e}" - return fp - - def save(model: nn.Module, path: str) -> None: state = getattr(model, _ATTACHED_ATTR, None) if state is None: raise RuntimeError("no adapter attached; call attach() first") sd = {k: v.detach().cpu() for k, v in model.state_dict().items() if "lora_" in k} - metadata = { - "cfg": json.dumps(state["cfg"].to_dict()), - "base_fp": json.dumps(_base_weight_fingerprint(model)), - } + metadata = {"cfg": json.dumps(state["cfg"].to_dict())} from safetensors.torch import save_file save_file(sd, path, metadata=metadata) @@ -145,14 +123,4 @@ def load(model: nn.Module, path: str) -> list[RemovableHandle]: unexpected_lora = [k for k in unexpected if "lora_" in k] if unexpected_lora: raise RuntimeError(f"unexpected lora keys in checkpoint: {unexpected_lora}") - saved_fp = json.loads(metadata.get("base_fp", "{}")) - if saved_fp: - cur_fp = _base_weight_fingerprint(model) - diffs = [k for k in saved_fp if saved_fp[k] != cur_fp.get(k)] - if diffs: - raise RuntimeError( - f"base weight fingerprint mismatch on {len(diffs)} layer(s) " - f"(e.g. {diffs[0]}). For PiSSA the saved adapter assumes the same " - "base; reload onto the original model or re-run init." - ) return handles diff --git a/src/lora_lite/target.py b/src/lora_lite/target.py index 2309585..be2aa53 100644 --- a/src/lora_lite/target.py +++ b/src/lora_lite/target.py @@ -29,7 +29,12 @@ def _classify(m: nn.Module, d_model: int, name: str) -> str: if do == d_model and di != d_model: return "writer" if di == d_model and do == d_model: - return "writer" if any(s in name for s in ("o_proj", "out_proj")) else "reader" + if any(s in name for s in ("o_proj", "out_proj", "down_proj")): + return "writer" + elif any(s in name for s in ("q_proj", "k_proj", "v_proj", "kq_proj")): + return "reader" + else: + raise ValueError(f"ambiguous role for {name} with in/out features {di}/{do}") return "inner" diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py deleted file mode 100644 index 3886722..0000000 --- a/tests/test_lora_lite.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Per-variant attach + train + save + load round-trip, plus surgical regressions. - -The big invariant is the parametrized train_save_load test: identity at t=0, -gradient flow on a real loss, then save -> reload onto a fresh model and -confirm the trained outputs survive the round-trip. Cheap on CPU. -""" -from __future__ import annotations - -from pathlib import Path - -import pytest -import torch -from torch import nn - -import lora_lite as ll - - -CFG_BY_VARIANT = { - "lora": ll.LoRAConfig, - "pissa": ll.PiSSAConfig, - "delora": ll.DeLoRAConfig, - "ia3": ll.IA3Config, - "ia3_ff": ll.IA3FFConfig, - "dora": ll.DoRAConfig, - "hra": ll.HRAConfig, - "eva": ll.EVAConfig, - "antipasto": ll.AntiPaSTOConfig, - "road": ll.RoadConfig, -} - -# Per-variant identity tolerance at t=0 (after attach, before any step). -# fp32 SVD round-trip + per-row norm = looser tolerance for pissa/dora/antipasto. -IDENTITY_TOL = { - "lora": 5e-3, # near_zero B: B@A ~ sqrt(r)*eps*kaiming - "pissa": 5e-4, # SVD round-trip - "delora": 1e-6, # exact-zero B, lambda0-scaled - "ia3": 5e-3, # near_one gate - "ia3_ff": 5e-3, # near_one gate - "dora": 5e-3, # near_zero B + m - "hra": 1e-2, # near_zero U + paired-symmetry init - "eva": 5e-4, # exact-zero B, SVD A overwritten in group_init - "antipasto": 5e-4, # SVD round-trip - "road": 5e-3, # near_zero theta -} - - -class TinyBlock(nn.Module): - def __init__(self, d: int = 64, ff: int = 128): - super().__init__() - self.q_proj = nn.Linear(d, d, bias=False) - self.k_proj = nn.Linear(d, d, bias=False) - self.v_proj = nn.Linear(d, d, bias=False) - self.o_proj = nn.Linear(d, d, bias=False) - self.gate_proj = nn.Linear(d, ff, bias=False) - self.up_proj = nn.Linear(d, ff, bias=False) - self.down_proj = nn.Linear(ff, d, bias=False) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - h = self.o_proj(self.q_proj(x) + self.k_proj(x) + self.v_proj(x)) - m = self.down_proj(torch.nn.functional.silu(self.gate_proj(x)) * self.up_proj(x)) - return x + h + m - - -class TinyModel(nn.Module): - def __init__(self, n_layers: int = 4, d: int = 64, ff: int = 128, vocab: int = 100): - super().__init__() - self.embed_tokens = nn.Embedding(vocab, d) - self.layers = nn.ModuleList([TinyBlock(d, ff) for _ in range(n_layers)]) - self.lm_head = nn.Linear(d, vocab, bias=False) - self.config = type("Cfg", (), {"hidden_size": d})() - - def forward(self, ids: torch.Tensor) -> torch.Tensor: - x = self.embed_tokens(ids) - for block in self.layers: - x = block(x) - return self.lm_head(x) - - -class FakeLinearLike(nn.Module): - """linear-like, but not nn.Linear: stand-in for bnb 4/8-bit modules.""" - - def __init__(self, d_in: int = 8, d_out: int = 8): - super().__init__() - self.in_features = d_in - self.out_features = d_out - self.weight = nn.Parameter(torch.empty(d_out, d_in)) - nn.init.kaiming_uniform_(self.weight, a=5 ** 0.5) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return torch.nn.functional.linear(x, self.weight) - - -class FakeBnbModel(nn.Module): - def __init__(self): - super().__init__() - self.config = type("Cfg", (), {"hidden_size": 8})() - self.layers = nn.ModuleList([FakeLinearLike(8, 8)]) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.layers[0](x) - - -def cfg_for(variant: str) -> ll.AdapterConfig: - return CFG_BY_VARIANT[variant]( - r=4, - alpha=8, - dtype=torch.float32, - ) - - -def attach_with_calib(model: nn.Module, cfg: ll.AdapterConfig, ids: torch.Tensor) -> None: - if cfg.variant == "eva": - calib = [ids for _ in range(2)] - ll.attach(model, cfg, calibration_data=calib) - else: - ll.attach(model, cfg) - - -def trainable_grad_norm(model: nn.Module) -> float: - return sum( - p.grad.detach().float().norm().item() - for n, p in model.named_parameters() - if "lora_" in n and p.grad is not None - ) - - -@pytest.mark.parametrize("variant", list(CFG_BY_VARIANT)) -def test_train_save_load(variant: str, tmp_path: Path): - """Identity at t=0, one SGD step, save, reload onto fresh model, outputs match.""" - torch.manual_seed(0) - model = TinyModel() - ids = torch.randint(0, 100, (2, 16)) - with torch.no_grad(): - y_base = model(ids).clone() - - cfg = cfg_for(variant) - attach_with_calib(model, cfg, ids) - - trainable = [p for p in model.parameters() if p.requires_grad] - assert trainable - assert all("lora_" in n for n, p in model.named_parameters() if p.requires_grad) - - with torch.no_grad(): - y_init = model(ids).clone() - assert (y_init - y_base).abs().max().item() < IDENTITY_TOL[variant] - - target = torch.randn_like(y_init) * 0.1 - opt = torch.optim.SGD(trainable, lr=1e-2) - opt.zero_grad() - loss = (model(ids) - target).pow(2).mean() - loss.backward() - leaked = [n for n, p in model.named_parameters() if "lora_" not in n and p.grad is not None] - assert leaked == [] - assert trainable_grad_norm(model) > 0 - opt.step() - - with torch.no_grad(): - y_trained = model(ids).clone() - - path = tmp_path / "adapter.pt" - ll.save(model, str(path)) - - torch.manual_seed(0) - model_loaded = TinyModel() - ll.load(model_loaded, str(path)) # EVA load skips group_init; calibration_data not needed - with torch.no_grad(): - y_loaded = model_loaded(ids) - assert (y_loaded - y_trained).abs().max().item() < max(IDENTITY_TOL[variant], 1e-5) - - -@pytest.mark.parametrize("variant", ["lora", "delora", "ia3", "hra", "road"]) -def test_hook_only_variants_attach_to_non_linear_target(variant: str): - """bnb-style targets are linear-like but not nn.Linear; hook-only variants must accept them.""" - extra = {"lambda0": 0.1} if variant == "delora" else {"group_size": 8} if variant == "road" else {} - cfg = CFG_BY_VARIANT[variant](r=2, alpha=4, dtype=torch.float32, target_roles=(), **extra) - model = FakeBnbModel() - ll.attach(model, cfg) - x = torch.randn(2, 3, 8) - model(x).pow(2).mean().backward() - assert trainable_grad_norm(model) > 0 - - -@pytest.mark.parametrize("variant", ["pissa", "dora", "antipasto"]) -def test_weight_reading_variants_reject_non_linear(variant: str): - r = 4 if variant == "antipasto" else 2 # antipasto needs r % block_size==0 - cfg = CFG_BY_VARIANT[variant](r=r, alpha=r, dtype=torch.float32, target_roles=()) - with pytest.raises(TypeError, match="plain nn.Linear"): - ll.attach(FakeBnbModel(), cfg) - - -def test_save_load_strict_keys(tmp_path: Path): - import json - from safetensors.torch import load_file, save_file - - torch.manual_seed(0) - model = TinyModel() - ll.attach(model, ll.LoRAConfig(r=4, alpha=8, dtype=torch.float32)) - p = tmp_path / "lora.safetensors" - ll.save(model, str(p)) - sd = load_file(str(p), device="cpu") - - # missing key: drop first lora key - missing_sd = dict(sd) - dropped_key = next(iter(missing_sd)) - del missing_sd[dropped_key] - from safetensors import safe_open - with safe_open(str(p), framework="pt", device="cpu") as f: - meta = f.metadata() - save_file(missing_sd, str(p), metadata=meta) - with pytest.raises(RuntimeError, match="missing lora keys"): - ll.load(TinyModel(), str(p)) - - # unexpected key: add a bogus lora key - bad_sd = dict(sd) - bad_sd["layers.0.q_proj.lora_extra"] = torch.zeros(1) - save_file(bad_sd, str(p), metadata=meta) - with pytest.raises(RuntimeError, match="unexpected lora keys"): - ll.load(TinyModel(), str(p)) - - -def test_no_target_layers_is_loud(): - cfg = ll.LoRAConfig(target_names=("definitely_missing",)) - with pytest.raises(RuntimeError, match="no target layers"): - ll.attach(TinyModel(), cfg) - - -def test_eva_requires_calibration(): - """EVA's group_init must error loudly if calibration_data is missing.""" - with pytest.raises(ValueError, match="calibration_data"): - ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32)) - - -def test_delora_default_has_live_step0_gradient(): - """Default lambda0 must be nonzero; B=0 preserves identity while B gets gradient.""" - torch.manual_seed(0) - model = TinyModel(n_layers=1) - ids = torch.randint(0, 100, (2, 8)) - ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32)) - - assert model.layers[0].q_proj.lora_lambda.item() == pytest.approx(15.0) - loss = model(ids).pow(2).mean() - loss.backward() - - b_grad = model.layers[0].q_proj.lora_B.grad.detach().abs().max().item() - assert b_grad > 0 - - -def test_pissa_identity_with_nonunit_scale(): - """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r.""" - torch.manual_seed(0) - model = TinyModel(n_layers=1) - ids = torch.randint(0, 100, (2, 8)) - with torch.no_grad(): - y_base = model(ids).clone() - - ll.attach(model, ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32)) - with torch.no_grad(): - y = model(ids) - assert (y - y_base).abs().max().item() < IDENTITY_TOL["pissa"] - - -def test_antipasto_blockwise_rotation_matches_explicit_blockdiag(): - """The einsum/rearrange path must equal the old explicit blockdiag math.""" - from lora_lite.variants.antipasto import _build_rotation - - torch.manual_seed(0) - n_blocks, bs, d_in, d_out = 3, 4, 7, 5 - r = n_blocks * bs - rot_T = torch.randn(n_blocks, bs * (bs - 1) // 2) * 0.1 - Vh = torch.randn(r, d_in) - U = torch.randn(d_out, r) - R_blocks = _build_rotation(rot_T, bs, 0.5) - R = torch.block_diag(*list(R_blocks)) - - Vh_blocks = torch.reshape(Vh, (n_blocks, bs, d_in)) - Vh_rot = torch.einsum("nab,nbi->nai", R_blocks, Vh_blocks).reshape(r, d_in) - U_blocks = torch.reshape(U, (d_out, n_blocks, bs)) - U_rot = torch.einsum("dnb,ncb->dnc", U_blocks, R_blocks).reshape(d_out, r) - - assert (Vh_rot - R @ Vh).abs().max().item() < 1e-6 - assert (U_rot - U @ R.T).abs().max().item() < 1e-6 - - -def test_dora_bias_passthrough(): - """Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0.""" - torch.manual_seed(0) - d = 16 - layer = nn.Linear(d, d, bias=True) - x = torch.randn(2, d) - y_base = layer(x).detach() - - class Wrap(nn.Module): - def __init__(self, lin): - super().__init__() - self.config = type("Cfg", (), {"hidden_size": d})() - self.layers = nn.ModuleList([lin]) - - def forward(self, x): - return self.layers[0](x) - - model = Wrap(layer) - ll.attach(model, ll.DoRAConfig(r=2, alpha=4, dtype=torch.float32, target_roles=())) - with torch.no_grad(): - y = model(x) - assert (y - y_base).abs().max().item() < 5e-3 # near_zero B + m init - - -def test_hra_forward_is_x_R_T(): - """HRA must apply x @ R^T (loop i = r-1 down to 0). Asymmetric U makes order observable.""" - torch.manual_seed(0) - d = 8 - layer = nn.Linear(d, d, bias=False) - x = torch.randn(2, 3, d) - - class Wrap(nn.Module): - def __init__(self, lin): - super().__init__() - self.config = type("Cfg", (), {"hidden_size": d})() - self.layers = nn.ModuleList([lin]) - - def forward(self, x): - return self.layers[0](x) - - model = Wrap(layer) - ll.attach(model, ll.HRAConfig(r=4, alpha=4, dtype=torch.float32, target_roles=())) - # break paired symmetry so order matters - with torch.no_grad(): - layer.lora_U.add_(0.1 * torch.randn_like(layer.lora_U)) - - U = layer.lora_U - R = torch.eye(d) - for i in range(U.shape[0]): - u = U[i] - sq = (u * u).sum().clamp_min(1e-12) - R = R - (2.0 / sq) * torch.outer(R @ u, u) - with torch.no_grad(): - y_adapt = model(x) - y_ref = torch.nn.functional.linear(x, layer.weight @ R) - assert (y_adapt - y_ref).abs().max().item() < 1e-5 - - -@pytest.mark.parametrize("road_variant", ["road_1", "road_2", "road_4"]) -def test_road_apply_matches_explicit_matrix(road_variant: str): - """Fast elementwise ROAD path must match PEFT's explicit R @ y matrix construction.""" - from lora_lite.variants.road import _apply_road, _road_matrix, _road_param_size - - torch.manual_seed(0) - d_out = 16 - group_size = 8 - size = _road_param_size(d_out, road_variant) - theta = torch.randn(size) * 0.2 - alpha = torch.randn(size) * 0.1 + 1.0 - y = torch.randn(2, 3, d_out) - - y_fast = _apply_road(road_variant, group_size, theta, alpha, y) - R = _road_matrix(road_variant, group_size, theta, alpha) - y_ref = torch.einsum("oi,...i->...o", R, y) - - assert (y_fast - y_ref).abs().max().item() < 1e-6 - - -def test_road_invalid_group_size_is_loud(): - with pytest.raises(ValueError, match="positive and even"): - ll.attach(TinyModel(), ll.RoadConfig(group_size=7)) - with pytest.raises(ValueError, match="divisible"): - ll.attach(TinyModel(), ll.RoadConfig(group_size=48))