diff --git a/README.md b/README.md index d30686e..757dee6 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) | PiSSA | yes, fp only | mutates `weight` into `W_res`; quantized PiSSA intentionally fails | | DeLoRA | yes | normalized additive adapter with learned scalar | | IA3 | yes | output gate initialized to ones | -| DoRA | no | next small candidate | +| DoRA | yes, fp only | reads dense `weight` for column-norm; quantized DoRA fails loudly | | SSVD / OFT / HRA / ROAD | no | planned after the hook-only invariant is clear | | S-steer / AntiPaSTO | no | should use data-calibrated `group_init`, not plain LoRA tests | diff --git a/docs/developer_guide.md b/docs/developer_guide.md index fd264fa..c2a9c09 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -74,8 +74,8 @@ Activation-aware variants implement `group_init(model, targets, cfg, calibration | Variant | Fit to current runtime | Next invariant | |---|---|---| -| IA3 | Done. Output gate `y * g`, identity at `g=1`. | Qwen proof task 79. | -| DoRA | Likely additive hook for fp layers; quantized norm semantics need care. | fp identity, perturb, save/load, loss drop. | +| IA3 | Done. Output gate `y * g`, identity at `g=1`. | Qwen proof in latest probe. | +| DoRA | Done for fp layers. Reads dense `weight` to compute `||V||_c`; quantized layers fail fast. | Qwen proof in latest probe. | | SSVD / PiSSA-family | Fits weight-SVD init path. | reconstruction/identity invariant plus train proof. | | HRA / OFT / ROAD | Interesting, but weight-transform semantics need clearer hook-only formulation. | pseudocode first, then rotation/non-dead-code invariant. | | S-steer / AntiPaSTO | Should use `group_init` and activation evidence. | calibration consumed, hooks removed, load works without calibration. 
| diff --git a/justfile b/justfile index 749697d..75d64e6 100644 --- a/justfile +++ b/justfile @@ -26,6 +26,6 @@ qwen-queue variants="lora pissa delora ia3" steps="16": #!/usr/bin/env bash set -euo pipefail pueue add \ - -l "why: verify Qwen0.6B train/save-load proof for {{variants}} at {{steps}} steps; resolve: publish docs only if exact targets, lora-only grads, loss drop, reload pass" \ + -l "why: verify Qwen0.6B train/save-load proof for {{variants}} at {{steps}} steps; resolve: publish only if exact targets, lora-only grads, loss drop, reload identity" \ -w "$PWD" -o 1 -- \ - bash -lc 'uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}}' \ No newline at end of file + uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}} \ No newline at end of file diff --git a/scripts/qwen_train_probe.py b/scripts/qwen_train_probe.py index 4d21a77..e85c072 100644 --- a/scripts/qwen_train_probe.py +++ b/scripts/qwen_train_probe.py @@ -173,7 +173,7 @@ def run_variant(args, variant: str, input_ids: torch.Tensor, labels: torch.Tenso def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--model", default="Qwen/Qwen3-0.6B") - parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3"]) + parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3", "dora"]) parser.add_argument("--device", default="cuda") parser.add_argument("--torch-dtype", default="bfloat16") parser.add_argument("--steps", type=int, default=8) diff --git a/src/lora_lite/variants/__init__.py b/src/lora_lite/variants/__init__.py index aaaf9e3..e2fc3ab 100644 --- a/src/lora_lite/variants/__init__.py +++ b/src/lora_lite/variants/__init__.py @@ -1 +1 @@ -from . import lora, pissa, delora, ia3 # noqa: F401 side-effect: register +from . 
@register
class DoRA:
    """Weight-decomposed LoRA (DoRA) variant for plain ``nn.Linear`` layers.

    The effective weight is ``W' = m * V / ||V||_c`` with
    ``V = W + (alpha / r) * B @ A``. Here ``||.||_c`` is computed as the L2
    norm over ``dim=1`` of the ``(d_out, d_in)`` weight, i.e. one norm per
    output feature. With ``B = 0`` and ``m = ||W||_c`` the adapted layer
    reproduces the base layer.
    """

    name = "dora"

    @staticmethod
    def param_specs(d_in, d_out, cfg):
        """Describe the adapter parameters for one target layer.

        Args:
            d_in: input feature count of the target layer.
            d_out: output feature count of the target layer.
            cfg: adapter config; only ``cfg.r`` (the rank) is read here.

        Returns:
            Mapping from parameter name to its ``ParamSpec``.
        """
        return {
            "lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
            # B starts at zero so that V == W before any training step.
            "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
            # m is filled from ||W||_c during init(); shape (d_out,)
            "lora_m": ParamSpec((d_out,), init="zeros", trainable=True),
        }

    @staticmethod
    def init(layer: nn.Linear, cfg) -> None:
        """Set ``lora_m`` to the per-output-feature norm of the dense weight.

        Args:
            layer: target layer; must already carry a ``lora_m`` parameter.
            cfg: adapter config (not read here).

        Raises:
            TypeError: if ``layer`` is not exactly ``nn.Linear``.
        """
        # Exact type check on purpose: subclasses of nn.Linear are rejected as
        # well, because the norm below needs a dense, directly readable weight.
        if type(layer) is not nn.Linear:
            raise TypeError(
                "DoRA needs ||W||_c, so v1 only supports plain nn.Linear. "
                "For bnb layers, dequantize first or use LoRA/IA3."
            )
        with torch.no_grad():
            W = layer.weight.data.float()  # (d_out, d_in)
            # Norm is taken in float32, then stored in the adapter dtype.
            col_norm = W.norm(dim=1).to(layer.lora_m.dtype)  # (d_out,)
            layer.lora_m.data.copy_(col_norm)

    @staticmethod
    def forward(layer: nn.Linear, x, y):
        """Return the DoRA-adapted output for input ``x`` and base output ``y``.

        Args:
            layer: target layer carrying ``lora_A``, ``lora_B``, ``lora_m`` and
                ``_lora_cfg``.
            x: input that was fed to ``layer``; last dim is ``d_in``.
            y: output of the base layer for ``x``; last dim is ``d_out``.
                NOTE(review): assumed to be ``Wx + b`` when the layer has a
                bias -- confirm against the hook that calls this.

        Returns:
            ``(m / ||V||_c) * (Wx + scale * BAx)`` plus the layer bias, if any.
        """
        cfg = layer._lora_cfg
        scale = cfg.alpha / cfg.r
        # V = W + scale * B @ A
        BA = einsum(layer.lora_B, layer.lora_A, "o r, r i -> o i")
        V = layer.weight + scale * BA  # (d_out, d_in)
        # Clamp so an all-zero row of V cannot cause a division by zero.
        v_norm = V.norm(dim=1).clamp_min(1e-12)  # (d_out,)
        h = einsum(x, layer.lora_A, "... i, r i -> ... r")
        delta = einsum(h, layer.lora_B, "... r, o r -> ... o")
        # The magnitude/direction decomposition applies to the weight only, so
        # the bias must not be rescaled: strip it from y, rescale, add it back.
        bias = getattr(layer, "bias", None)
        base = y if bias is None else y - bias
        out = (layer.lora_m / v_norm) * (base + scale * delta)
        return out if bias is None else out + bias
PASS.") @@ -169,6 +170,8 @@ def variant_test(variant: str, dtype=torch.float32): # delora has tightly-normalised updates; use Adam with higher lr to see signal in 20 steps if variant in ("delora", "ia3"): opt = torch.optim.Adam(trainable, lr=1e-1) + elif variant == "dora": + opt = torch.optim.Adam(trainable, lr=1e-3) # m near ||W||_c, bigger lr blows up else: opt = torch.optim.SGD(trainable, lr=1e-2) losses = [] @@ -251,7 +254,7 @@ def main(): parser.add_argument("--require-bnb", action="store_true") args = parser.parse_args() - for v in ("lora", "pissa", "delora", "ia3"): + for v in ("lora", "pissa", "delora", "ia3", "dora"): variant_test(v, dtype=torch.float32) structural_linear_like_test() bitsandbytes_cuda_smoke(args.require_bnb) diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py index bd38dc4..de214a2 100644 --- a/tests/test_lora_lite.py +++ b/tests/test_lora_lite.py @@ -111,7 +111,7 @@ def perturb_first_adapter(model: nn.Module) -> None: raise AssertionError("no perturbable adapter parameter found") -@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3"]) +@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3", "dora"]) def test_variant_identity_hook_save_load_and_training(variant: str): ARTIFACT_DIR.mkdir(exist_ok=True) torch.manual_seed(0) @@ -129,7 +129,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str): with torch.no_grad(): y_init = model(ids).clone() identity_err = (y_init - y_base).abs().max().item() - identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6, "ia3": 1e-6}[variant] + identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6, "ia3": 1e-6, "dora": 5e-5}[variant] assert identity_err < identity_tol before_perturb = adapter_state(model) @@ -162,7 +162,9 @@ def test_variant_identity_hook_save_load_and_training(variant: str): assert_only_lora_trainable(train_model) target = torch.randn(2, 16, 100) * 0.1 trainable = [p for p in train_model.parameters() if p.requires_grad] - 
@pytest.mark.parametrize("variant", ["pissa", "dora"])
def test_weight_reading_variants_reject_structural_non_linear_target(variant: str):
    """Attaching ``variant`` to ``FakeBnbModel`` must raise the plain-nn.Linear TypeError."""
    config = ll.LoraLiteConfig(
        variant=variant,
        r=2,
        alpha=2,
        dtype=torch.float32,
        target_roles=(),
    )
    # Model construction stays inside the raises-context, as in the original.
    with pytest.raises(TypeError, match="plain nn.Linear"):
        ll.attach(FakeBnbModel(), config)