From 0d929f93b345dc0b2fb6d99a2163c424d795772b Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:58:56 +0800 Subject: [PATCH] feat(hra): add Householder Reflection Adaptation, hook-only/bnb-friendly + Qwen proof --- README.md | 3 +- docs/developer_guide.md | 3 +- docs/spec/20260426_lora_lite_plan.md | 2 ++ scripts/qwen_train_probe.py | 7 ++++- src/lora_lite/variants/__init__.py | 2 +- src/lora_lite/variants/hra.py | 47 ++++++++++++++++++++++++++++ tests/smoke.py | 5 +-- tests/test_lora_lite.py | 13 +++++--- 8 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 src/lora_lite/variants/hra.py diff --git a/README.md b/README.md index 757dee6..fdb9240 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,8 @@ See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) | DeLoRA | yes | normalized additive adapter with learned scalar | | IA3 | yes | output gate initialized to ones | | DoRA | yes, fp only | reads dense `weight` for column-norm; quantized DoRA fails loudly | -| SSVD / OFT / HRA / ROAD | no | planned after the hook-only invariant is clear | +| HRA | yes | output-side Householder reflection with identity gate; works on bnb | +| SSVD / OFT / ROAD | no | planned | | S-steer / AntiPaSTO | no | should use data-calibrated `group_init`, not plain LoRA tests | ## Targeting diff --git a/docs/developer_guide.md b/docs/developer_guide.md index c2a9c09..202c397 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -76,6 +76,7 @@ Activation-aware variants implement `group_init(model, targets, cfg, calibration |---|---|---| | IA3 | Done. Output gate `y * g`, identity at `g=1`. | Qwen proof in latest probe. | | DoRA | Done for fp layers. Reads dense `weight` to compute `||V||_c`; quantized layers fail fast. | Qwen proof in latest probe. | +| HRA | Done. Output-side Householder with identity gate; hook-only -> works on bnb. | Qwen proof in latest probe. 
| | SSVD / PiSSA-family | Fits weight-SVD init path. | reconstruction/identity invariant plus train proof. | -| HRA / OFT / ROAD | Interesting, but weight-transform semantics need clearer hook-only formulation. | pseudocode first, then rotation/non-dead-code invariant. | +| OFT / ROAD | Block-diagonal rotations; weight-transform semantics need clearer hook-only formulation. | pseudocode first, then rotation/non-dead-code invariant. | | S-steer / AntiPaSTO | Should use `group_init` and activation evidence. | calibration consumed, hooks removed, load works without calibration. | diff --git a/docs/spec/20260426_lora_lite_plan.md b/docs/spec/20260426_lora_lite_plan.md index 7cd4ae3..351c038 100644 --- a/docs/spec/20260426_lora_lite_plan.md +++ b/docs/spec/20260426_lora_lite_plan.md @@ -38,6 +38,7 @@ The core bet is that adapter variants should own the relationship between `(x, l | DeLoRA | done | `src/lora_lite/variants/delora.py` | | IA3 | done | `src/lora_lite/variants/ia3.py` | | DoRA | done, fp-only | `src/lora_lite/variants/dora.py` | +| HRA | done | `src/lora_lite/variants/hra.py` (output-side Householder, hook-only -> bnb-compatible) | | Smoke tests | done | `tests/smoke.py` | | bnb minimal forward smoke | done | `Linear8bitLt` and `Linear4bit` pass on CUDA with `just bnb-smoke` | @@ -116,6 +117,7 @@ Follow-up tasks 80 (lora/pissa/delora/ia3 at 16 steps) and 81 (dora at 16 steps) | delora | 2 | 20482 | 0.3281 | 0.3125 | 5.261 | 4.823 | 8.322 | 0.06303 | 15.1 | 0 | `outputs/qwen_train_probe/delora_adapter.pt` | | ia3 | 2 | 3072 | 0 | 0.375 | 5.25 | 4.473 | 14.79 | 0.463 | 5.926 | 0 | `outputs/qwen_train_probe/ia3_adapter.pt` | | dora | 2 | 23552 | 0 | 0.3203 | 5.25 | 2.439 | 53.54 | 1.776 | 7.44 | 0 | `outputs/qwen_train_probe/dora_adapter.pt` | +| hra | 2 | 12290 | 0 | 0.3438 | 5.25 | 4.07 | 22.47 | 0.05225 | 4.735 | 0 | `outputs/qwen_train_probe/hra_adapter.pt` | Failure-mode interpretation: diff --git a/scripts/qwen_train_probe.py b/scripts/qwen_train_probe.py 
index e85c072..0dfec42 100644 --- a/scripts/qwen_train_probe.py +++ b/scripts/qwen_train_probe.py @@ -53,6 +53,11 @@ def perturb_first_adapter(model: torch.nn.Module) -> None: with torch.no_grad(): p.add_(0.25) return + for name, p in model.named_parameters(): + if "lora_gate" in name: + with torch.no_grad(): + p.add_(0.25) + return for name, p in model.named_parameters(): if "lora_B" in name: with torch.no_grad(): @@ -173,7 +178,7 @@ def run_variant(args, variant: str, input_ids: torch.Tensor, labels: torch.Tenso def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--model", default="Qwen/Qwen3-0.6B") - parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3", "dora"]) + parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3", "dora", "hra"]) parser.add_argument("--device", default="cuda") parser.add_argument("--torch-dtype", default="bfloat16") parser.add_argument("--steps", type=int, default=8) diff --git a/src/lora_lite/variants/__init__.py b/src/lora_lite/variants/__init__.py index e2fc3ab..b8a3567 100644 --- a/src/lora_lite/variants/__init__.py +++ b/src/lora_lite/variants/__init__.py @@ -1 +1 @@ -from . import lora, pissa, delora, ia3, dora # noqa: F401 side-effect: register +from . import lora, pissa, delora, ia3, dora, hra # noqa: F401 side-effect: register diff --git a/src/lora_lite/variants/hra.py b/src/lora_lite/variants/hra.py new file mode 100644 index 0000000..e3a2310 --- /dev/null +++ b/src/lora_lite/variants/hra.py @@ -0,0 +1,47 @@ +"""HRA: Householder Reflection Adaptation. Yuan et al. 2024 https://arxiv.org/abs/2405.17484 + +Output-side formulation with an identity-init gate: + + y' = (1 - alpha) * y + alpha * R y (so y' = y when alpha = 0) + R = prod_{i=1..r} H_i, H_i = I - 2 u_i u_i^T / ||u_i||^2 + +`lora_gate` is initialized to 0 so y' = y at t=0. `lora_U` is initialized +kaiming so ||u_i||^2 is well-defined (no 0/0). 
At t=0 only the gate gets +a gradient: dy'/dU is scaled by the gate (0), so U trains once the gate moves. + +Hook-only, no weight access -> works on bnb 4/8-bit layers. +""" +import torch +from einops import einsum +from torch import nn + +from ..variant import register, ParamSpec + + +@register +class HRA: + name = "hra" + + @staticmethod + def param_specs(d_in, d_out, cfg): + return { + # one Householder vector per rank slot in R^{d_out} + "lora_U": ParamSpec((cfg.r, d_out), init="kaiming", trainable=True), + # identity gate; 0 -> y' = y exactly + "lora_gate": ParamSpec((), init="zeros", trainable=True), + } + + @staticmethod + def init(layer: nn.Linear, cfg) -> None: + return + + @staticmethod + def forward(layer: nn.Linear, x, y): + U = layer.lora_U # (r, d_out) + Ry = y + for i in range(U.shape[0]): + u = U[i] + sq = (u * u).sum().clamp_min(1e-12) + coeff = einsum(Ry, u, "... o, o -> ...") * (2.0 / sq) + Ry = Ry - coeff.unsqueeze(-1) * u + return y + layer.lora_gate * (Ry - y) diff --git a/tests/smoke.py b/tests/smoke.py index 9252ead..feb19a2 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -131,6 +131,7 @@ def variant_test(variant: str, dtype=torch.float32): "delora": 1e-6, # lambda0=0 "ia3": 1e-6, "dora": 5e-5, # m * V/||V|| with V=W -> rounding in norm/divide + "hra": 1e-6, # gate=0 -> exact identity }[variant] * max(1.0, base_scale) assert err < tol, f" FAIL identity: err {err} > tol {tol}" print(f" SHOULD: err<{tol:.1e}.
PASS.") @@ -168,7 +169,7 @@ def variant_test(variant: str, dtype=torch.float32): target = torch.randn(2, 16, 100, dtype=dtype) * 0.1 trainable = [p for p in model.parameters() if p.requires_grad] # delora has tightly-normalised updates; use Adam with higher lr to see signal in 20 steps - if variant in ("delora", "ia3"): + if variant in ("delora", "ia3", "hra"): opt = torch.optim.Adam(trainable, lr=1e-1) elif variant == "dora": opt = torch.optim.Adam(trainable, lr=1e-3) # m near ||W||_c, bigger lr blows up @@ -254,7 +255,7 @@ def main(): parser.add_argument("--require-bnb", action="store_true") args = parser.parse_args() - for v in ("lora", "pissa", "delora", "ia3", "dora"): + for v in ("lora", "pissa", "delora", "ia3", "dora", "hra"): variant_test(v, dtype=torch.float32) structural_linear_like_test() bitsandbytes_cuda_smoke(args.require_bnb) diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py index de214a2..e6f5527 100644 --- a/tests/test_lora_lite.py +++ b/tests/test_lora_lite.py @@ -98,6 +98,11 @@ def perturb_first_adapter(model: nn.Module) -> None: with torch.no_grad(): p.add_(0.25) return + for name, p in model.named_parameters(): + if "lora_gate" in name: + with torch.no_grad(): + p.add_(0.25) + return for name, p in model.named_parameters(): if "lora_B" in name: with torch.no_grad(): @@ -111,7 +116,7 @@ def perturb_first_adapter(model: nn.Module) -> None: raise AssertionError("no perturbable adapter parameter found") -@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3", "dora"]) +@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3", "dora", "hra"]) def test_variant_identity_hook_save_load_and_training(variant: str): ARTIFACT_DIR.mkdir(exist_ok=True) torch.manual_seed(0) @@ -129,7 +134,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str): with torch.no_grad(): y_init = model(ids).clone() identity_err = (y_init - y_base).abs().max().item() - identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 
1e-6, "ia3": 1e-6, "dora": 5e-5}[variant] + identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6, "ia3": 1e-6, "dora": 5e-5, "hra": 1e-6}[variant] assert identity_err < identity_tol before_perturb = adapter_state(model) @@ -162,7 +167,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str): assert_only_lora_trainable(train_model) target = torch.randn(2, 16, 100) * 0.1 trainable = [p for p in train_model.parameters() if p.requires_grad] - opt = torch.optim.Adam(trainable, lr=0.1) if variant in ("delora", "ia3") else ( + opt = torch.optim.Adam(trainable, lr=0.1) if variant in ("delora", "ia3", "hra") else ( torch.optim.Adam(trainable, lr=1e-3) if variant == "dora" else torch.optim.SGD(trainable, lr=1e-2) ) losses = [] @@ -221,7 +226,7 @@ def test_no_target_layers_is_loud_failure(): ll.attach(TinyModel(), cfg) -@pytest.mark.parametrize("variant", ["lora", "delora", "ia3"]) +@pytest.mark.parametrize("variant", ["lora", "delora", "ia3", "hra"]) def test_structural_non_linear_target_trains_for_forward_only_variants(variant: str): torch.manual_seed(0) model = FakeBnbModel()