fix V3 review must-fixes: DoRA bias passthrough + EVA load path

V3 external review (docs/audit/variants_review_v3.md, 97KB) found 3
must-fix bugs.

DoRA: the bias was being scaled by m/||V|| because we operated on the full
base layer output. We now subtract the bias before normalization and add it
back afterwards. This matches peft DoRA exactly (docs/refs/peft_lora_dora.py:157-161).
The new smoke test dora_bias_smoke verifies identity at t=0 with bias=True.

EVA load: adapter.load() called attach(), which called group_init(), which
required calibration_data and raised. Added a _skip_group_init flag to
attach(); load() now passes it. EVA group_init still raises loudly when
called directly without data. A new smoke test verifies save+load WITHOUT
calibration data on the load path.

Also tightened EVA error message.

Smoke now covers 8 variants + EVA roundtrip + DoRA-bias roundtrip + bnb
4/8-bit. ALL PASS.

V3 nice-to-haves (PiSSA scaling, AntiPaSTO init choice, stale GH refs)
deferred -- documented as intentional in module docstrings.
This commit is contained in:
copilot
2026-04-26 19:50:48 +08:00
parent 185eb29c70
commit 55757e829d
6 changed files with 1841 additions and 9 deletions
+49
View File
@@ -309,6 +309,24 @@ def eva_smoke():
assert all(n > 0 for n in a_norms), "EVA lora_A buffers all zero -> group_init never ran"
print(f" SHOULD: lora_A buffers populated. PASS (mean ||A||={sum(a_norms)/len(a_norms):.3f}).")
# save/load round-trip WITHOUT calibration data on load (load path uses _skip_group_init)
ARTIFACT_DIR.mkdir(exist_ok=True)
p = ARTIFACT_DIR / "eva_smoke_adapter.pt"
ll.save(model, str(p))
ll.detach(model)
torch.manual_seed(0)
model2 = TinyModel().to(torch.float32)
ll.load(model2, str(p)) # must NOT require calibration_data
with torch.no_grad():
y_loaded = model2(ids)
err2 = (y_loaded - y_adapt).abs().max().item()
print(f" save/load (no calibration on load): max err = {err2:.3e}")
assert err2 < 1e-6, f"EVA save/load mismatch {err2}"
print(" SHOULD: load without calibration_data works (uses _skip_group_init). PASS.")
ll.detach(model2)
# re-attach model for training section below
ll.attach(model, cfg, calibration_data=calib)
# gradient flow: only B trains
target = torch.randn(2, 16, 100, dtype=torch.float32) * 0.1
trainable = [p for p in model.parameters() if p.requires_grad]
@@ -328,6 +346,36 @@ def eva_smoke():
ll.detach(model)
def dora_bias_smoke():
    """Smoke-check that a freshly attached DoRA adapter leaves a biased linear layer's output intact.

    Regression guard for the V3 review finding that DoRA scaled the bias by
    m/||V||: the output right after ``ll.attach`` must match the bare layer's
    output to within 1e-5. Prints progress lines and raises AssertionError on
    mismatch.
    """
    print("\n=== dora bias passthrough (V3 fix) ===")
    torch.manual_seed(0)
    d = 16
    # Keep this creation order (layer, then inputs): both draw from the seeded RNG.
    linear = nn.Linear(d, d, bias=True).to(torch.float32)
    inputs = torch.randn(2, d)
    reference = linear(inputs).detach()

    class Wrap(nn.Module):
        # Minimal wrapper: one linear in `layers` plus a `config.hidden_size`
        # attribute (presumably what ll.attach inspects -- confirm against ll).
        def __init__(self, lin):
            super().__init__()
            self.config = type("Cfg", (), {"hidden_size": d})()
            self.layers = nn.ModuleList([lin])

        def forward(self, x):
            only_layer = self.layers[0]
            return only_layer(x)

    model = Wrap(linear)
    cfg = ll.LoraLiteConfig(
        variant="dora",
        r=2,
        alpha=4,
        dtype=torch.float32,
        target_roles=(),
    )
    ll.attach(model, cfg)

    with torch.no_grad():
        adapted = model(inputs)

    # Largest element-wise deviation between adapted and baseline outputs.
    abs_diff = (adapted - reference).abs()
    err = abs_diff.max().item()
    print(f" identity with bias=True: max err = {err:.3e}")
    assert err < 1e-5, f"DoRA bias-passthrough broken: err {err} (likely bias being scaled)"
    print(" SHOULD: identity err < 1e-5 even with bias. PASS.")
    ll.detach(model)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--require-bnb", action="store_true")
@@ -336,6 +384,7 @@ def main():
for v in ("lora", "pissa", "delora", "ia3", "dora", "hra", "antipasto"):
variant_test(v, dtype=torch.float32)
eva_smoke()
dora_bias_smoke()
structural_linear_like_test()
bitsandbytes_cuda_smoke(args.require_bnb)
print("\nALL PASS.")