mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 15:15:55 +08:00
remove base_weight_fingerprint and test_lora_lite.py
- _base_weight_fingerprint was a PiSSA-only defensive check that cluttered every save with a per-target weight fingerprint. If you load onto the wrong base, you get wrong weights -- that's user error, not a library bug.
- test_lora_lite.py is deleted. All coverage now lives in test_metamath_smoke.py, which runs the real benchmark pipeline per variant.
This commit is contained in:
@@ -98,34 +98,12 @@ def detach(model: nn.Module) -> None:
|
||||
delattr(model, _ATTACHED_ATTR)
|
||||
|
||||
|
||||
def _base_weight_fingerprint(model: nn.Module) -> dict[str, str]:
    """Build a ``{module name: signature}`` map of the adapted layers' weights.

    The signature is the weight's shape joined with its fp32 sum of squares
    (six-decimal scientific notation). It is a cheap, non-cryptographic check
    meant to let variants that rewrite ``layer.weight`` (PiSSA-style) detect a
    base-model mismatch.

    Returns an empty dict when no adapter state is attached to ``model``.
    """
    attached = getattr(model, _ATTACHED_ATTR, None)
    if attached is None:
        return {}

    signatures = {}
    for module_name, module in model.named_modules():
        # Only modules tagged by attach() *and* listed as targets are fingerprinted.
        is_target = hasattr(module, "_lora_variant") and module_name in attached["targets"]
        if not is_target:
            continue
        weight32 = module.weight.detach().to(torch.float32, copy=False)
        energy = float((weight32 * weight32).sum())
        signatures[module_name] = f"{tuple(weight32.shape)}|{energy:.6e}"
    return signatures
|
||||
|
||||
|
||||
def save(model: nn.Module, path: str) -> None:
    """Write the attached adapter to ``path`` in safetensors format.

    Only ``state_dict`` entries whose key contains ``"lora_"`` are stored; each
    tensor is detached and moved to CPU first. The adapter config is embedded
    as JSON under the ``"cfg"`` metadata key.

    Raises:
        RuntimeError: if no adapter state is attached to ``model``.
    """
    state = getattr(model, _ATTACHED_ATTR, None)
    if state is None:
        raise RuntimeError("no adapter attached; call attach() first")
    sd = {k: v.detach().cpu() for k, v in model.state_dict().items() if "lora_" in k}
    # The config is the only metadata: the per-target base-weight fingerprint
    # was dropped, so nothing else is computed or stored at save time.
    metadata = {"cfg": json.dumps(state["cfg"].to_dict())}
    from safetensors.torch import save_file

    save_file(sd, path, metadata=metadata)
|
||||
|
||||
@@ -145,14 +123,4 @@ def load(model: nn.Module, path: str) -> list[RemovableHandle]:
|
||||
unexpected_lora = [k for k in unexpected if "lora_" in k]
|
||||
if unexpected_lora:
|
||||
raise RuntimeError(f"unexpected lora keys in checkpoint: {unexpected_lora}")
|
||||
saved_fp = json.loads(metadata.get("base_fp", "{}"))
|
||||
if saved_fp:
|
||||
cur_fp = _base_weight_fingerprint(model)
|
||||
diffs = [k for k in saved_fp if saved_fp[k] != cur_fp.get(k)]
|
||||
if diffs:
|
||||
raise RuntimeError(
|
||||
f"base weight fingerprint mismatch on {len(diffs)} layer(s) "
|
||||
f"(e.g. {diffs[0]}). For PiSSA the saved adapter assumes the same "
|
||||
"base; reload onto the original model or re-run init."
|
||||
)
|
||||
return handles
|
||||
|
||||
@@ -29,7 +29,12 @@ def _classify(m: nn.Module, d_model: int, name: str) -> str:
|
||||
if do == d_model and di != d_model:
|
||||
return "writer"
|
||||
if di == d_model and do == d_model:
|
||||
return "writer" if any(s in name for s in ("o_proj", "out_proj")) else "reader"
|
||||
if any(s in name for s in ("o_proj", "out_proj", "down_proj")):
|
||||
return "writer"
|
||||
elif any(s in name for s in ("q_proj", "k_proj", "v_proj", "kq_proj")):
|
||||
return "reader"
|
||||
else:
|
||||
raise ValueError(f"ambiguous role for {name} with in/out features {di}/{do}")
|
||||
return "inner"
|
||||
|
||||
|
||||
|
||||
@@ -1,366 +0,0 @@
|
||||
"""Per-variant attach + train + save + load round-trip, plus surgical regressions.
|
||||
|
||||
The big invariant is the parametrized train_save_load test: identity at t=0,
|
||||
gradient flow on a real loss, then save -> reload onto a fresh model and
|
||||
confirm the trained outputs survive the round-trip. Cheap on CPU.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
import lora_lite as ll
|
||||
|
||||
|
||||
# Variant name -> lora_lite config class; drives the parametrized tests below.
CFG_BY_VARIANT = dict(
    lora=ll.LoRAConfig,
    pissa=ll.PiSSAConfig,
    delora=ll.DeLoRAConfig,
    ia3=ll.IA3Config,
    ia3_ff=ll.IA3FFConfig,
    dora=ll.DoRAConfig,
    hra=ll.HRAConfig,
    eva=ll.EVAConfig,
    antipasto=ll.AntiPaSTOConfig,
    road=ll.RoadConfig,
)
|
||||
|
||||
# Maximum allowed |adapted - base| output difference right after attach (t=0),
# before any optimizer step. The fp32 SVD round-trip and per-row norms make
# pissa/dora/antipasto need a looser bound than an exact-zero init would.
IDENTITY_TOL = dict(
    lora=5e-3,  # near_zero B: B@A ~ sqrt(r)*eps*kaiming
    pissa=5e-4,  # SVD round-trip
    delora=1e-6,  # exact-zero B, lambda0-scaled
    ia3=5e-3,  # near_one gate
    ia3_ff=5e-3,  # near_one gate
    dora=5e-3,  # near_zero B + m
    hra=1e-2,  # near_zero U + paired-symmetry init
    eva=5e-4,  # exact-zero B, SVD A overwritten in group_init
    antipasto=5e-4,  # SVD round-trip
    road=5e-3,  # near_zero theta
)
|
||||
|
||||
|
||||
class TinyBlock(nn.Module):
    """Small residual block built from bias-free linear layers.

    The layers carry transformer-style names (``q_proj`` ... ``down_proj``).
    The forward pass is purely linear plus one SiLU gate; no attention is
    computed.
    """

    def __init__(self, d: int = 64, ff: int = 128):
        super().__init__()
        # Square d -> d projections, created in a fixed order so that seeded
        # initialisation stays reproducible.
        for proj_name in ("q_proj", "k_proj", "v_proj", "o_proj"):
            setattr(self, proj_name, nn.Linear(d, d, bias=False))
        self.gate_proj = nn.Linear(d, ff, bias=False)
        self.up_proj = nn.Linear(d, ff, bias=False)
        self.down_proj = nn.Linear(ff, d, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the projection branch plus the gated feed-forward branch."""
        mixed = self.q_proj(x) + self.k_proj(x) + self.v_proj(x)
        proj_branch = self.o_proj(mixed)
        gated = torch.nn.functional.silu(self.gate_proj(x)) * self.up_proj(x)
        ff_branch = self.down_proj(gated)
        return x + proj_branch + ff_branch
|
||||
|
||||
|
||||
class TinyModel(nn.Module):
    """Embedding -> stack of ``TinyBlock`` -> vocabulary head.

    Exposes ``config.hidden_size`` (set to ``d``) on a throwaway ``Cfg`` object.
    """

    def __init__(self, n_layers: int = 4, d: int = 64, ff: int = 128, vocab: int = 100):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab, d)
        blocks = [TinyBlock(d, ff) for _ in range(n_layers)]
        self.layers = nn.ModuleList(blocks)
        self.lm_head = nn.Linear(d, vocab, bias=False)
        # Minimal stand-in config object carrying only ``hidden_size``.
        cfg_cls = type("Cfg", (), {"hidden_size": d})
        self.config = cfg_cls()

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        """Map token ids to logits over the vocabulary."""
        hidden = self.embed_tokens(ids)
        for layer in self.layers:
            hidden = layer(hidden)
        return self.lm_head(hidden)
|
||||
|
||||
|
||||
class FakeLinearLike(nn.Module):
    """Linear-shaped module that is deliberately NOT an ``nn.Linear``.

    Used as a stand-in for bnb 4/8-bit modules: it has ``in_features``,
    ``out_features`` and a ``weight`` parameter, but no bias.
    """

    def __init__(self, d_in: int = 8, d_out: int = 8):
        super().__init__()
        self.in_features, self.out_features = d_in, d_out
        init = torch.empty(d_out, d_in)
        nn.init.kaiming_uniform_(init, a=5 ** 0.5)
        self.weight = nn.Parameter(init)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the bias-free linear map ``x @ weight.T``."""
        return torch.nn.functional.linear(x, self.weight)
|
||||
|
||||
|
||||
class FakeBnbModel(nn.Module):
    """One-layer model whose only layer is a ``FakeLinearLike`` (8 -> 8)."""

    def __init__(self):
        super().__init__()
        hidden = 8
        self.config = type("Cfg", (), {"hidden_size": hidden})()
        self.layers = nn.ModuleList([FakeLinearLike(hidden, hidden)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the single linear-like layer."""
        only_layer = self.layers[0]
        return only_layer(x)
|
||||
|
||||
|
||||
def cfg_for(variant: str) -> ll.AdapterConfig:
    """Instantiate the config class registered for ``variant`` with r=4, alpha=8, fp32."""
    config_cls = CFG_BY_VARIANT[variant]
    return config_cls(r=4, alpha=8, dtype=torch.float32)
|
||||
|
||||
|
||||
def attach_with_calib(model: nn.Module, cfg: ll.AdapterConfig, ids: torch.Tensor) -> None:
    """Attach ``cfg`` to ``model``, supplying calibration batches only for EVA.

    For the ``"eva"`` variant, ``ids`` is passed twice as a two-batch
    calibration set; every other variant is attached without calibration data.
    """
    if cfg.variant != "eva":
        ll.attach(model, cfg)
        return
    ll.attach(model, cfg, calibration_data=[ids, ids])
|
||||
|
||||
|
||||
def trainable_grad_norm(model: nn.Module) -> float:
    """Sum the per-parameter gradient L2 norms of all adapter parameters.

    A parameter counts when its name contains ``"lora_"`` and it currently has
    a gradient. Note this is the *sum of norms*, not one global norm.

    Returns:
        The summed norm as a ``float``; ``0.0`` when no adapter parameter has
        a gradient.
    """
    per_param_norms = (
        p.grad.detach().float().norm().item()
        for n, p in model.named_parameters()
        if "lora_" in n and p.grad is not None
    )
    # Start at 0.0 so the empty case is a float, matching the annotation.
    return sum(per_param_norms, 0.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("variant", list(CFG_BY_VARIANT))
def test_train_save_load(variant: str, tmp_path: Path):
    """Identity at t=0, one SGD step, save, reload onto fresh model, outputs match.

    Steps:
      1. Attaching the adapter leaves outputs within ``IDENTITY_TOL[variant]``.
      2. Only ``lora_`` parameters are trainable and only they receive gradients.
      3. After one SGD step, save -> load onto an identically seeded fresh
         model reproduces the trained outputs.
    """
    torch.manual_seed(0)
    model = TinyModel()
    ids = torch.randint(0, 100, (2, 16))
    with torch.no_grad():
        y_base = model(ids).clone()

    cfg = cfg_for(variant)
    attach_with_calib(model, cfg, ids)

    trainable = [p for p in model.parameters() if p.requires_grad]
    assert trainable, "attach() left no trainable parameters"
    assert all("lora_" in n for n, p in model.named_parameters() if p.requires_grad)

    with torch.no_grad():
        y_init = model(ids).clone()
    init_drift = (y_init - y_base).abs().max().item()
    assert init_drift < IDENTITY_TOL[variant], f"t=0 drift {init_drift:.3e}"

    target = torch.randn_like(y_init) * 0.1
    opt = torch.optim.SGD(trainable, lr=1e-2)
    opt.zero_grad()
    loss = (model(ids) - target).pow(2).mean()
    loss.backward()
    # Base (non-adapter) parameters must never receive a gradient.
    leaked = [n for n, p in model.named_parameters() if "lora_" not in n and p.grad is not None]
    assert leaked == []
    assert trainable_grad_norm(model) > 0
    opt.step()

    with torch.no_grad():
        y_trained = model(ids).clone()

    # save() writes safetensors, so name the file accordingly (as the
    # strict-keys test does) rather than with a torch-pickle ".pt" suffix.
    path = tmp_path / "adapter.safetensors"
    ll.save(model, str(path))

    # Re-seed so the fresh base model has the same weights as the original.
    torch.manual_seed(0)
    model_loaded = TinyModel()
    ll.load(model_loaded, str(path))  # EVA load skips group_init; calibration_data not needed
    with torch.no_grad():
        y_loaded = model_loaded(ids)
    reload_drift = (y_loaded - y_trained).abs().max().item()
    assert reload_drift < max(IDENTITY_TOL[variant], 1e-5), f"reload drift {reload_drift:.3e}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("variant", ["lora", "delora", "ia3", "hra", "road"])
def test_hook_only_variants_attach_to_non_linear_target(variant: str):
    """bnb-style targets are linear-like but not nn.Linear; hook-only variants must accept them."""
    # Variant-specific constructor extras; all other variants take none.
    extras_by_variant = {
        "delora": {"lambda0": 0.1},
        "road": {"group_size": 8},
    }
    extra = extras_by_variant.get(variant, {})
    config_cls = CFG_BY_VARIANT[variant]
    cfg = config_cls(r=2, alpha=4, dtype=torch.float32, target_roles=(), **extra)

    model = FakeBnbModel()
    ll.attach(model, cfg)

    x = torch.randn(2, 3, 8)
    loss = model(x).pow(2).mean()
    loss.backward()
    assert trainable_grad_norm(model) > 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("variant", ["pissa", "dora", "antipasto"])
def test_weight_reading_variants_reject_non_linear(variant: str):
    """Attaching these variants to a non-``nn.Linear`` target must raise ``TypeError``."""
    # antipasto needs r % block_size==0
    rank = 4 if variant == "antipasto" else 2
    config_cls = CFG_BY_VARIANT[variant]
    cfg = config_cls(r=rank, alpha=rank, dtype=torch.float32, target_roles=())
    with pytest.raises(TypeError, match="plain nn.Linear"):
        ll.attach(FakeBnbModel(), cfg)
|
||||
|
||||
|
||||
def test_save_load_strict_keys(tmp_path: Path):
    """``ll.load`` must fail loudly on checkpoints with missing or extra lora keys.

    A valid checkpoint is saved once; its tensors and metadata are then
    rewritten in place, first with one key removed and then with one bogus
    ``lora_`` key added, and each variant must raise ``RuntimeError``.
    """
    from safetensors import safe_open
    from safetensors.torch import load_file, save_file

    torch.manual_seed(0)
    model = TinyModel()
    ll.attach(model, ll.LoRAConfig(r=4, alpha=8, dtype=torch.float32))
    p = tmp_path / "lora.safetensors"
    ll.save(model, str(p))

    # Capture tensors and metadata before the file is overwritten below.
    sd = load_file(str(p), device="cpu")
    with safe_open(str(p), framework="pt", device="cpu") as f:
        meta = f.metadata()

    # missing key: drop first lora key
    missing_sd = dict(sd)
    del missing_sd[next(iter(missing_sd))]
    save_file(missing_sd, str(p), metadata=meta)
    with pytest.raises(RuntimeError, match="missing lora keys"):
        ll.load(TinyModel(), str(p))

    # unexpected key: add a bogus lora key
    bad_sd = dict(sd)
    bad_sd["layers.0.q_proj.lora_extra"] = torch.zeros(1)
    save_file(bad_sd, str(p), metadata=meta)
    with pytest.raises(RuntimeError, match="unexpected lora keys"):
        ll.load(TinyModel(), str(p))
|
||||
|
||||
|
||||
def test_no_target_layers_is_loud():
    """A config whose target names match no layer must make ``attach`` raise."""
    unmatched_cfg = ll.LoRAConfig(target_names=("definitely_missing",))
    expect_error = pytest.raises(RuntimeError, match="no target layers")
    with expect_error:
        ll.attach(TinyModel(), unmatched_cfg)
|
||||
|
||||
|
||||
def test_eva_requires_calibration():
    """EVA's group_init must error loudly if calibration_data is missing."""
    expect_error = pytest.raises(ValueError, match="calibration_data")
    with expect_error:
        # No calibration_data argument on purpose.
        ll.attach(TinyModel(), ll.EVAConfig(r=4, alpha=8, dtype=torch.float32))
|
||||
|
||||
|
||||
def test_delora_default_has_live_step0_gradient():
    """Default lambda0 must be nonzero; B=0 preserves identity while B gets gradient."""
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))
    ll.attach(model, ll.DeLoRAConfig(r=4, alpha=8, dtype=torch.float32))

    q_proj = model.layers[0].q_proj
    # The default lambda is expected to initialise to 15.
    assert q_proj.lora_lambda.item() == pytest.approx(15.0)

    model(ids).pow(2).mean().backward()

    largest_b_grad = q_proj.lora_B.grad.detach().abs().max().item()
    assert largest_b_grad > 0
|
||||
|
||||
|
||||
def test_pissa_identity_with_nonunit_scale():
    """Regression: PiSSA must pre-divide S by alpha/r, not require alpha == r."""
    torch.manual_seed(0)
    model = TinyModel(n_layers=1)
    ids = torch.randint(0, 100, (2, 8))

    with torch.no_grad():
        before = model(ids).clone()

    # alpha != r, so the adapter scale alpha/r is 2 rather than 1.
    pissa_cfg = ll.PiSSAConfig(r=4, alpha=8, dtype=torch.float32)
    ll.attach(model, pissa_cfg)

    with torch.no_grad():
        after = model(ids)

    max_drift = (after - before).abs().max().item()
    assert max_drift < IDENTITY_TOL["pissa"]
|
||||
|
||||
|
||||
def test_antipasto_blockwise_rotation_matches_explicit_blockdiag():
    """The einsum/rearrange path must equal the old explicit blockdiag math."""
    from lora_lite.variants.antipasto import _build_rotation

    torch.manual_seed(0)
    n_blocks, bs, d_in, d_out = 3, 4, 7, 5
    r = n_blocks * bs
    n_angles = bs * (bs - 1) // 2  # free parameters per block
    rot_T = torch.randn(n_blocks, n_angles) * 0.1
    Vh = torch.randn(r, d_in)
    U = torch.randn(d_out, r)

    R_blocks = _build_rotation(rot_T, bs, 0.5)
    # Dense reference: the per-block rotations laid out on the diagonal.
    R_dense = torch.block_diag(*R_blocks)

    # Blockwise path: rotate each block of rows of Vh / columns of U separately.
    Vh_rot = torch.einsum(
        "nab,nbi->nai", R_blocks, Vh.reshape(n_blocks, bs, d_in)
    ).reshape(r, d_in)
    U_rot = torch.einsum(
        "dnb,ncb->dnc", U.reshape(d_out, n_blocks, bs), R_blocks
    ).reshape(d_out, r)

    vh_err = (Vh_rot - R_dense @ Vh).abs().max().item()
    u_err = (U_rot - U @ R_dense.T).abs().max().item()
    assert vh_err < 1e-6
    assert u_err < 1e-6
|
||||
|
||||
|
||||
def test_dora_bias_passthrough():
    """Regression: DoRA must NOT scale bias; identity holds with bias=True at t=0."""
    torch.manual_seed(0)
    d = 16
    layer = nn.Linear(d, d, bias=True)
    x = torch.randn(2, d)
    y_base = layer(x).detach()

    class Wrap(nn.Module):
        """Minimal host exposing ``config.hidden_size`` and a ``layers`` list."""

        def __init__(self, inner):
            super().__init__()
            self.config = type("Cfg", (), {"hidden_size": d})()
            self.layers = nn.ModuleList([inner])

        def forward(self, inp):
            return self.layers[0](inp)

    model = Wrap(layer)
    dora_cfg = ll.DoRAConfig(r=2, alpha=4, dtype=torch.float32, target_roles=())
    ll.attach(model, dora_cfg)

    with torch.no_grad():
        y_adapted = model(x)
    # near_zero B + m init
    assert (y_adapted - y_base).abs().max().item() < 5e-3
|
||||
|
||||
|
||||
def test_hra_forward_is_x_R_T():
    """HRA must apply x @ R^T (loop i = r-1 down to 0). Asymmetric U makes order observable."""
    torch.manual_seed(0)
    d = 8
    layer = nn.Linear(d, d, bias=False)
    x = torch.randn(2, 3, d)

    class Wrap(nn.Module):
        """Minimal host exposing ``config.hidden_size`` and a ``layers`` list."""

        def __init__(self, inner):
            super().__init__()
            self.config = type("Cfg", (), {"hidden_size": d})()
            self.layers = nn.ModuleList([inner])

        def forward(self, inp):
            return self.layers[0](inp)

    model = Wrap(layer)
    ll.attach(model, ll.HRAConfig(r=4, alpha=4, dtype=torch.float32, target_roles=()))

    # break paired symmetry so order matters
    with torch.no_grad():
        layer.lora_U.add_(0.1 * torch.randn_like(layer.lora_U))

    # Reference: right-multiply the reflections in row order,
    # R = H_0 @ H_1 @ ... with H_i = I - 2 u_i u_i^T / |u_i|^2.
    R = torch.eye(d)
    for u in layer.lora_U:
        norm_sq = (u * u).sum().clamp_min(1e-12)
        R = R - (2.0 / norm_sq) * torch.outer(R @ u, u)

    with torch.no_grad():
        y_adapt = model(x)
        y_ref = torch.nn.functional.linear(x, layer.weight @ R)
    max_err = (y_adapt - y_ref).abs().max().item()
    assert max_err < 1e-5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("road_variant", ["road_1", "road_2", "road_4"])
def test_road_apply_matches_explicit_matrix(road_variant: str):
    """Fast elementwise ROAD path must match PEFT's explicit R @ y matrix construction."""
    from lora_lite.variants.road import _apply_road, _road_matrix, _road_param_size

    torch.manual_seed(0)
    d_out, group_size = 16, 8
    n_params = _road_param_size(d_out, road_variant)
    theta = torch.randn(n_params) * 0.2
    alpha = torch.randn(n_params) * 0.1 + 1.0
    y = torch.randn(2, 3, d_out)

    fast = _apply_road(road_variant, group_size, theta, alpha, y)

    # Reference: materialise the full matrix and apply it along the last axis.
    dense = _road_matrix(road_variant, group_size, theta, alpha)
    reference = torch.einsum("oi,...i->...o", dense, y)

    max_err = (fast - reference).abs().max().item()
    assert max_err < 1e-6
|
||||
|
||||
|
||||
def test_road_invalid_group_size_is_loud():
    """Odd or non-dividing ``group_size`` values must make ``attach`` raise ``ValueError``."""
    bad_cases = (
        (7, "positive and even"),
        (48, "divisible"),
    )
    for group_size, message in bad_cases:
        with pytest.raises(ValueError, match=message):
            ll.attach(TinyModel(), ll.RoadConfig(group_size=group_size))
|
||||
Reference in New Issue
Block a user