From 699fde31bf5abce680b4814422fea2792cab09b4 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:49:17 +0800 Subject: [PATCH] feat: ia3 variant, real bnb 4bit/8bit smoke, dev guide split, user-only readme --- README.md | 188 +++++---------------------- docs/developer_guide.md | 81 ++++++++++++ docs/spec/20260426_lora_lite_plan.md | 19 ++- justfile | 9 +- pyproject.toml | 1 + scripts/qwen_train_probe.py | 7 +- src/lora_lite/variants/__init__.py | 2 +- src/lora_lite/variants/ia3.py | 22 ++++ tests/smoke.py | 23 +++- tests/test_lora_lite.py | 13 +- uv.lock | 25 +++- 11 files changed, 216 insertions(+), 174 deletions(-) create mode 100644 docs/developer_guide.md create mode 100644 src/lora_lite/variants/ia3.py diff --git a/README.md b/README.md index a27c1c2..d30686e 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # lora-lite -A hackable, single-file-per-variant LoRA library built on PyTorch forward hooks. +Hackable PyTorch adapters for LoRA-family and small PEFT experiments. -The goal is not to be PEFT-compatible. The goal is to make adapter ideas easy to read, edit, test, and throw away. +`lora-lite` uses forward hooks instead of module replacement. Adapter parameters are plain `nn.Parameter`s on the target layer, e.g. `model.layers[5].self_attn.q_proj.lora_A`. ## Install ```bash -pip install -e . +pip install -e git+https://github.com/wassname/lora-lite.git#egg=lora-lite ``` ## Quickstart @@ -15,7 +15,7 @@ pip install -e . ```python import torch, lora_lite as ll -model = MyTransformer() # any nn.Module containing linear-like children +model = MyTransformer() cfg = ll.LoraLiteConfig(variant="lora", r=8, alpha=16, dtype=torch.bfloat16) ll.attach(model, cfg) @@ -27,172 +27,54 @@ ll.detach(model) ll.load(model, "adapter.pt") ``` -Inspect a live adapter tensor directly: +## Does it work? 
-```python -A = model.layers[5].self_attn.q_proj.lora_A +```bash +just check # pytest + smoke + package build + metadata check +just bnb-smoke # required CUDA bitsandbytes 4bit/8bit smoke +just qwen-probe # Qwen/Qwen3-0.6B train/save-load probe ``` -## Core idea +See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) for verification history and exact results. -Each variant owns the adapter math. The runtime only finds target layers, attaches `lora_*` parameters, registers hooks, and saves full-path adapter keys. +## Variants -```python -def attach(model, cfg): - targets ← find_linear_like_modules(model, cfg) - freeze(model.parameters()) - for name, layer in targets: - layer.lora_* ← variant.param_specs(layer, cfg) - hook(layer, lambda x, y: variant.forward(layer, x, y)) - -def save(model, path): - torch.save({"cfg": cfg, "state": state_dict_keys_containing("lora_")}, path) -``` - -Minimal by design: - -- One file per variant. -- No module replacement, merge/unmerge, mixed-adapter batches, or PEFT config soup. -- LoRA/DeLoRA hooks work with `nn.Linear` and bnb-style `Linear{4bit,8bitLt}` modules that expose `in_features`, `out_features`, and `weight`. -- PiSSA is fp-only because it mutates `weight` into `W_res`; quantized PiSSA should fail loudly until dequantize/requantize is explicit. - -Currently shipped variants: - -| Variant | Class | File | +| Variant | Support | Notes | |---|---|---| -| LoRA | A (additive) | [src/lora_lite/variants/lora.py](src/lora_lite/variants/lora.py) | -| PiSSA ([Meng+ 2024](https://arxiv.org/abs/2404.02948)) | A + B (special init mutates W) | [src/lora_lite/variants/pissa.py](src/lora_lite/variants/pissa.py) | -| DeLoRA ([Bini+ 2025](https://arxiv.org/abs/2503.18225)) | A (additive, normalised) | [src/lora_lite/variants/delora.py](src/lora_lite/variants/delora.py) | - -See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) for goals, status, and the current design plan. 
The original broader design was stress-tested against the [adapters_as_hypotheses](https://github.com/wassname/adapters_as_hypotheses) catalog (~26/27 variants covered with 3 small API tweaks). +| LoRA | yes | additive low-rank adapter | +| PiSSA | yes, fp only | mutates `weight` into `W_res`; quantized PiSSA intentionally fails | +| DeLoRA | yes | normalized additive adapter with learned scalar | +| IA3 | yes | output gate initialized to ones | +| DoRA | no | next small candidate | +| SSVD / OFT / HRA / ROAD | no | planned after the hook-only invariant is clear | +| S-steer / AntiPaSTO | no | should use data-calibrated `group_init`, not plain LoRA tests | ## Targeting -By default we target linear-like modules (`in_features`, `out_features`, `weight`) whose shape matches a "reader" (`d_in == d_model`) or "writer" (`d_out == d_model`) role, excluding `lm_head` and `embed_tokens`. This structural test is what lets bnb Linear4bit/8bitLt modules be targeted without a backend-specific class. Knobs on `LoraLiteConfig`: +By default, `lora-lite` targets linear-like modules with `in_features`, `out_features`, and `weight`, excluding `lm_head` and `embed_tokens`. -- `target_roles`: subset of `("reader", "writer", "inner")`. `()` = all. -- `target_names`: regex includes (must match if non-empty). -- `exclude_names`: regex excludes (default skips `lm_head`, `embed_tokens`). -- `layers`: tuple of layer indices, or `None` for all (matches `.layers.<i>.` in module name). +Useful `LoraLiteConfig` fields: -## Variant API +- `target_roles`: subset of `("reader", "writer", "inner")`; `()` means all. +- `target_names`: regex includes. +- `exclude_names`: regex excludes. +- `layers`: layer indices, matching `.layers.<i>.` in module names. -A variant is a class with a `name` and three statics: +This structural targeting is why LoRA, DeLoRA, and IA3 can run on bnb-style `Linear4bit`/`Linear8bitLt` modules. PiSSA is different because it edits the base weight. 
+ +## Save format + +Adapters are just: ```python -@register -class MyVariant: - name = "myvariant" - - @staticmethod - def param_specs(d_in, d_out, cfg) -> dict[str, ParamSpec]: - return {"lora_A": ParamSpec((cfg.r, d_in), init="kaiming"), ...} - - @staticmethod - def init(layer, cfg) -> None: - # Optional. Run after params are created. May read/mutate layer.weight. - ... - - @staticmethod - def forward(layer, x, y) -> Tensor: - # Return the layer's NEW output (additive: `return y + delta`). - ... +torch.save({"cfg": cfg.to_dict(), "state": lora_state_dict}, "adapter.pt") ``` -Adapter params attached as `layer.lora_*` get full-path keys in `state_dict()` automatically (e.g. `model.layers.5.self_attn.q_proj.lora_A`). +`lora_state_dict` contains full-path keys with `"lora_"` in the name. Missing or unexpected adapter keys fail on load. -## Data-calibrated init +## Developer docs -PiSSA, DeLoRA, and LoRA only use `layer.weight` for init -- no calibration data needed. - -For variants that DO need data (e.g. AntiPaSTO, LoRA-GA, activation-aware SVD), keep dataloaders out of `cfg` so adapter checkpoints stay serializable. Use: - -```python -ll.attach(model, cfg, calibration_data=calib) -``` - -where `calib` is an iterable of whole-model inputs, e.g. `Iterable[dict[str, Tensor]]` for HF models or `Iterable[Tensor]` of token ids. Activation-aware variants implement: - -```python -@staticmethod -def group_init(model, targets, cfg, calibration_data): ... -``` - -`targets` is `list[(name, layer, role)]`. The variant adds temporary hooks, runs `model(batch)` over `calibration_data`, removes the hooks, then writes `lora_*` params. Per-layer `init(layer, cfg)` stays weight-only. - -Sketch: - -```python -@register -class ActSVD: - name = "actsvd" - @staticmethod - def param_specs(d_in, d_out, cfg): ... 
- @staticmethod - def group_init(model, targets, cfg, calibration_data): - bufs = {name: [] for name, _, _ in targets} - hooks = [ - layer.register_forward_pre_hook( - lambda m, args, name=name: bufs[name].append(args[0].detach().float()) - ) - for name, layer, _ in targets - ] - try: - with torch.no_grad(): - for batch in calibration_data: - model(**batch) if isinstance(batch, dict) else model(batch) - finally: - for h in hooks: - h.remove() - # For each target: X = torch.cat(bufs[name], dim=0); do SVD; write A/B. -``` - -## Smoke test - -```bash -just check # pytest + smoke + package build -just test -just smoke -just qwen-queue # queued Qwen/Qwen3-0.6B proof via pueue -``` - -`just test` verifies, for each of `lora`, `pissa`, `delora`: - -1. Identity at t=0: `max|y_adapter - y_base|` within float tolerance. -2. Adapter hooks are live: perturbing only `lora_*` changes outputs. -3. Save/load round-trip preserves full-path adapter keys and tensors. -4. Missing or unexpected `lora_*` checkpoint keys fail loudly. -5. Only `lora_*` parameters are trainable and base parameters get no gradients. -6. A 20-step tiny regression training probe gets finite nonzero adapter gradients and >5% loss drop. - -`just qwen-probe` is the real-model proof. It loads `Qwen/Qwen3-0.6B` fresh per variant, attaches only layer-0 `q_proj`/`v_proj`, trains one fixed LM batch, saves adapters, reloads into a fresh base model, and checks logits match. 
Last verified on 2026-04-26: - -| variant | targets | trainable | identity err | perturb delta | loss0 | lossN | drop % | grad norm | adapter delta | reload err | -|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| LoRA | 2 | 20,480 | 0 | 0.3750 | 5.250 | 3.131 | 40.36 | 1.432 | 4.262 | 0 | -| PiSSA | 2 | 20,480 | 0.3125 | 0.7500 | 5.250 | 3.629 | 30.88 | 6.124 | 4.381 | 0 | -| DeLoRA | 2 | 20,482 | 0.3750 | 0.4062 | 5.246 | 5.166 | 1.537 | 0.04778 | 8.196 | 0 | - -This is an interface/training proof, not a benchmark: exact Qwen target names, hook activity, lora-only gradients, loss decrease, adapter tensor save/load, and reload equivalence on a 0.6B HF model. - -CI runs `just check` on GitHub. The larger Qwen proof stays in `pueue` because it needs the shared GPU. - -## What's NOT in v1 - -| Feature | Why dropped | -|---|---| -| merge/unmerge | reload base if you want vanilla | -| 4/8-bit-aware merge | DoRA on bnb supported in forward only (drop merge path) | -| Embedding / Conv adapters | trivial extension; add when needed | -| `adapter_names=` mixed batch forward | rare; add when needed | -| Multiple named adapters per layer | one variant per `attach()` | -| HF `PeftConfig` / hub upload | `torch.save({cfg, state})` is enough | -| AdaLoRA-style rank scheduling | needs `Variant.on_step(step)` -- punt | -| ReFT-style position interventions | sibling submodule (different hook site) | - -## Status - -v0.0.1: LoRA + PiSSA + DeLoRA + minimal functional tests + Qwen proof. Next likely variants are IA3 or DoRA because they fit the current hook contract with little new machinery. OFT/ROAD/AntiPaSTO-style methods are more interesting, but should wait until the simple hook family is boring. +See [docs/developer_guide.md](docs/developer_guide.md) for the variant API, data-calibrated init, and adapter roadmap. ## Citation @@ -203,4 +85,4 @@ v0.0.1: LoRA + PiSSA + DeLoRA + minimal functional tests + Qwen proof. 
Next like year = {2026}, url = {https://github.com/wassname/lora-lite/} } -``` \ No newline at end of file +``` diff --git a/docs/developer_guide.md b/docs/developer_guide.md new file mode 100644 index 0000000..fd264fa --- /dev/null +++ b/docs/developer_guide.md @@ -0,0 +1,81 @@ +# Developer guide + +This is the implementation note for people adding adapter variants. The README is only for prospective users. + +## Design principles + +- Variants own adapter math. +- The runtime owns targeting, parameter attachment, hooks, and save/load. +- Adapter parameters live directly on target layers as `lora_*` parameters. +- Save/load uses normal full-path `state_dict()` keys filtered by `"lora_"`. +- Fail loudly on unsupported weight semantics. No silent quantized PiSSA or merge fallback. + +## Variant contract + +A variant is a registered class with a small static interface: + +```python +@register +class MyVariant: + name = "myvariant" + + @staticmethod + def param_specs(d_in, d_out, cfg) -> dict[str, ParamSpec]: + return {"lora_A": ParamSpec((cfg.r, d_in), init="kaiming")} + + @staticmethod + def init(layer, cfg) -> None: + ... + + @staticmethod + def forward(layer, x, y): + return y_new +``` + +Pseudocode for the runtime: + +```python +def attach(model, cfg): + targets ← find_linear_like_modules(model, cfg) + freeze(model.parameters()) + for name, layer in targets: + layer.lora_* ← variant.param_specs(layer.in_features, layer.out_features, cfg) + variant.init(layer, cfg) + hook(layer, lambda x, y: variant.forward(layer, x, y)) + +def save(model, path): + torch.save({"cfg": cfg, "state": state_dict_keys_containing("lora_")}, path) +``` + +## Data-calibrated init + +LoRA, PiSSA, DeLoRA, and IA3 only use `layer.weight` or identity constants for init. + +Variants that need data, e.g. 
AntiPaSTO, LoRA-GA, or activation-aware SVD, should keep dataloaders out of `cfg` so adapter checkpoints stay serializable: + +```python +ll.attach(model, cfg, calibration_data=calib) +``` + +Activation-aware variants implement `group_init(model, targets, cfg, calibration_data)`. The variant may add temporary hooks, run calibration batches, remove hooks, then write `lora_*` params. `load()` should not require calibration data. + +## Current limitations + +| Feature | Current choice | +|---|---| +| merge/unmerge | reload the base model if vanilla weights are needed | +| multiple named adapters | one variant per `attach()` | +| mixed-adapter batches | out of scope until needed | +| quantized PiSSA | fail-fast; explicit dequantize/requantize required | +| AdaLoRA rank scheduling | needs a future `Variant.on_step(step)` hook | +| ReFT-style interventions | likely a sibling module or different hook site | + +## Adapter roadmap + +| Variant | Fit to current runtime | Next invariant | +|---|---|---| +| IA3 | Done. Output gate `y * g`, identity at `g=1`. | Qwen proof task 79. | +| DoRA | Likely additive hook for fp layers; quantized norm semantics need care. | fp identity, perturb, save/load, loss drop. | +| SSVD / PiSSA-family | Fits weight-SVD init path. | reconstruction/identity invariant plus train proof. | +| HRA / OFT / ROAD | Interesting, but weight-transform semantics need clearer hook-only formulation. | pseudocode first, then rotation/non-dead-code invariant. | +| S-steer / AntiPaSTO | Should use `group_init` and activation evidence. | calibration consumed, hooks removed, load works without calibration. 
| diff --git a/docs/spec/20260426_lora_lite_plan.md b/docs/spec/20260426_lora_lite_plan.md index f9381df..a601419 100644 --- a/docs/spec/20260426_lora_lite_plan.md +++ b/docs/spec/20260426_lora_lite_plan.md @@ -36,8 +36,9 @@ The core bet is that adapter variants should own the relationship between `(x, l | LoRA | done | `src/lora_lite/variants/lora.py` | | PiSSA | done, fp-only | `src/lora_lite/variants/pissa.py` | | DeLoRA | done | `src/lora_lite/variants/delora.py` | +| IA3 | done | `src/lora_lite/variants/ia3.py` | | Smoke tests | done | `tests/smoke.py` | -| bnb minimal forward smoke | done | `Linear8bitLt` and `Linear4bit` pass on CUDA | +| bnb minimal forward smoke | done | `Linear8bitLt` and `Linear4bit` pass on CUDA with `just bnb-smoke` | ## Current smoke evidence @@ -51,6 +52,8 @@ Last verified log: `/home/wassname/.cache/agent-tmp/lora_lite_smoke_after_review | PiSSA loss drop | `11.5%` | | DeLoRA identity | `0.000e+00` | | DeLoRA loss drop | `93.4%` | +| IA3 identity | `0.000e+00` | +| IA3 loss drop | `88.7%` | | fake non-`nn.Linear` target | attaches, identity `0.000e+00`, grad nonzero | | bnb `Linear8bitLt` | identity `0.000e+00`, grad nonzero | | bnb `Linear4bit` | identity `0.000e+00`, grad nonzero | @@ -64,6 +67,7 @@ Goal: upgrade from smoke-tested sketch to evidence that the current PEFT-lite in In: - Pytest coverage for LoRA, PiSSA, and DeLoRA correctness invariants. +- IA3 coverage as the first waiting-time simple adapter. - A real `Qwen/Qwen3-0.6B` probe that trains each current variant on layer-0 `q_proj` and `v_proj`. - Repeatable `just` recipes and workspace-local logs/artifacts. @@ -130,18 +134,25 @@ Out: | Requirement | Distinguishing check | Evidence | |---|---|---| -| R7: fast CI catches broken tests/builds | `just check` must run pytest, smoke, `uv build`, and `twine check`; a broken test, wheel, sdist, or README metadata fails the hook. 
| `just check` -> pytest `8 passed in 9.53s`, smoke all pass, wheel/sdist built, `twine check dist/*` passed | -| R8: large proof is queued, not hidden in CI | `just qwen-queue` must create a pueue task in the repo cwd with why/resolve label and intended Qwen command. | `just qwen-queue && pueue status` -> task 74 queued at `/media/wassname/SGIronWolf/projects5/2026/lora-lite` with `just qwen-probe lora pissa delora 16` | +| R7: fast CI catches broken tests/builds | `just check` must run pytest, smoke, `uv build`, and `twine check`; a broken test, wheel, sdist, or README metadata fails the hook. | `just check` -> pytest `10 passed in 4.10s`, smoke all pass, wheel/sdist built, `twine check dist/*` passed | +| R8: large proof is queued, not hidden in CI | `just qwen-queue` must create a pueue task in the repo cwd with why/resolve label and intended Qwen command. | task 79 queued at `/media/wassname/SGIronWolf/projects5/2026/lora-lite` with `--variants lora pissa delora ia3 --steps 16` | | R9: README is publishable enough to judge | Reader sees install, quickstart, pseudocode core, testing commands, proof caveat before variant wishlist. | `README.md` reordered and human note removed | | R10: variant roadmap buys simplicity | Next variant is ranked by fit to current hook contract; non-hook variants are deferred rather than half-supported. | this section | Fresh review first blocked on weak `qwen-queue` evidence and README citation/comment junk. Fixes: queued real pueue task 74, added `twine check`, fenced citation, removed the stray README note. Final fresh review verdict: PASS. +Follow-up after omega correction: + +- The recursive task failed previously because it was left as a passive sentinel instead of being expanded immediately after push. +- Real bnb proof now runs through required `just bnb-smoke` with `bitsandbytes==0.49.2`; `Linear8bitLt` and `Linear4bit` both show identity error `0.000e+00` and nonzero adapter gradients. 
The command now fails instead of skipping if CUDA or bnb is unavailable. +- IA3 was added while waiting for queued Qwen work: `just test` now has 10 tests, `just smoke` includes IA3, and pueue task 79 is queued for LoRA/PiSSA/DeLoRA/IA3 on Qwen 0.6B. Task 78 caught a quoting bug in `qwen-queue`; the recipe now queues a single `bash -lc` command. +- README was reduced to prospective-user content; implementation notes moved to `docs/developer_guide.md`. + ### Adapter roadmap, ranked by simplicity | Variant | Why it fits or waits | Next check | |---|---|---| -| IA3 | Multiplicative vector on activations. Probably the smallest new file and no base-weight mutation. | Identity with ones, perturb changes output, loss drops, save/load exact. | +| IA3 | Implemented. Multiplicative output vector, no base-weight mutation. | `just test` -> 10 tests passed; `just smoke` -> identity/save-load/loss drop passed. Qwen task 79 queued. | | DoRA | Fits additive hook for fp layers; bnb norm handling must be explicit or fail-fast. | fp smoke first; quantized proof only after norm semantics are obvious. | | SSVD / PiSSA-family | Fits current `weight`-SVD pattern and teaches the SVD adapter path. | Reconstruction/identity invariant plus train proof. | | HRA / OFT / ROAD | Interesting, but likely wants orthogonal or weight-transform semantics. Keep until hook-only formulation is clear. | Pseudocode first, then one invariant that distinguishes real rotation from dead code. 
| diff --git a/justfile b/justfile index f716827..749697d 100644 --- a/justfile +++ b/justfile @@ -11,18 +11,21 @@ test: smoke: uv run --extra test python tests/smoke.py +bnb-smoke: + uv run --extra test --extra bnb-test python tests/smoke.py --require-bnb + build: rm -rf dist uv build uv run --extra build twine check dist/* -qwen-probe variants="lora pissa delora" steps="8": +qwen-probe variants="lora pissa delora ia3" steps="8": uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}} -qwen-queue variants="lora pissa delora" steps="16": +qwen-queue variants="lora pissa delora ia3" steps="16": #!/usr/bin/env bash set -euo pipefail pueue add \ -l "why: verify Qwen0.6B train/save-load proof for {{variants}} at {{steps}} steps; resolve: publish docs only if exact targets, lora-only grads, loss drop, reload pass" \ -w "$PWD" -o 1 -- \ - just qwen-probe "{{variants}}" "{{steps}}" \ No newline at end of file + bash -lc 'uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}}' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7a885b9..faa96b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ Issues = "https://github.com/wassname/lora-lite/issues" build = ["twine>=6"] test = ["pytest", "tabulate"] hf-test = ["accelerate>=1.6", "safetensors>=0.5", "transformers>=4.51"] +bnb-test = ["bitsandbytes>=0.46"] [build-system] requires = ["setuptools>=68"] diff --git a/scripts/qwen_train_probe.py b/scripts/qwen_train_probe.py index 82e32c3..4d21a77 100644 --- a/scripts/qwen_train_probe.py +++ b/scripts/qwen_train_probe.py @@ -58,6 +58,11 @@ def perturb_first_adapter(model: torch.nn.Module) -> None: with torch.no_grad(): p.flatten()[0].add_(0.25) return + for name, p in model.named_parameters(): + if "lora_g" in name: + with torch.no_grad(): + p.flatten()[0].add_(0.25) + return raise AssertionError("no perturbable adapter 
parameter found") @@ -168,7 +173,7 @@ def run_variant(args, variant: str, input_ids: torch.Tensor, labels: torch.Tenso def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--model", default="Qwen/Qwen3-0.6B") - parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora"]) + parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3"]) parser.add_argument("--device", default="cuda") parser.add_argument("--torch-dtype", default="bfloat16") parser.add_argument("--steps", type=int, default=8) diff --git a/src/lora_lite/variants/__init__.py b/src/lora_lite/variants/__init__.py index ea22188..aaaf9e3 100644 --- a/src/lora_lite/variants/__init__.py +++ b/src/lora_lite/variants/__init__.py @@ -1 +1 @@ -from . import lora, pissa, delora # noqa: F401 side-effect: register +from . import lora, pissa, delora, ia3 # noqa: F401 side-effect: register diff --git a/src/lora_lite/variants/ia3.py b/src/lora_lite/variants/ia3.py new file mode 100644 index 0000000..57f1bd3 --- /dev/null +++ b/src/lora_lite/variants/ia3.py @@ -0,0 +1,22 @@ +"""IA3-style output gating. y_new = y * g, with g initialized to ones.""" +import torch +from torch import nn + +from ..variant import register, ParamSpec + + +@register +class IA3: + name = "ia3" + + @staticmethod + def param_specs(d_in, d_out, cfg): + return {"lora_g": ParamSpec((d_out,), init="ones", trainable=True)} + + @staticmethod + def init(layer: nn.Linear, cfg) -> None: + return + + @staticmethod + def forward(layer: nn.Linear, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return y * layer.lora_g \ No newline at end of file diff --git a/tests/smoke.py b/tests/smoke.py index 15f3267..e9c150e 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -1,4 +1,4 @@ -"""Smoke test: lora / pissa / delora on a tiny synthetic transformer-like model. +"""Smoke test: current variants on a tiny synthetic transformer-like model. Verifies: 1. 
Identity at t=0 (delta ~ 0, output close to base). @@ -15,6 +15,7 @@ BLUF format: SHOULD: loss decreases > 5% over 20 SGD steps for all variants. ELSE grad/wiring bug. """ from __future__ import annotations +import argparse import os, sys, math from pathlib import Path import torch @@ -128,6 +129,7 @@ def variant_test(variant: str, dtype=torch.float32): "lora": 1e-6, "pissa": 5e-4, # SVD recon in fp32 is tight; bf16 would be ~1e-2 "delora": 1e-6, # lambda0=0 + "ia3": 1e-6, }[variant] * max(1.0, base_scale) assert err < tol, f" FAIL identity: err {err} > tol {tol}" print(f" SHOULD: err<{tol:.1e}. PASS.") @@ -165,7 +167,7 @@ def variant_test(variant: str, dtype=torch.float32): target = torch.randn(2, 16, 100, dtype=dtype) * 0.1 trainable = [p for p in model.parameters() if p.requires_grad] # delora has tightly-normalised updates; use Adam with higher lr to see signal in 20 steps - if variant == "delora": + if variant in ("delora", "ia3"): opt = torch.optim.Adam(trainable, lr=1e-1) else: opt = torch.optim.SGD(trainable, lr=1e-2) @@ -204,14 +206,19 @@ def structural_linear_like_test(): print(" SHOULD: structural target attaches and lora_B receives grad. 
PASS.") -def bitsandbytes_cuda_smoke(): - print("\n=== optional bitsandbytes CUDA smoke ===") +def bitsandbytes_cuda_smoke(require_bnb: bool): + label = "required" if require_bnb else "optional" + print(f"\n=== {label} bitsandbytes CUDA smoke ===") if not torch.cuda.is_available(): + if require_bnb: + raise RuntimeError("CUDA unavailable; required real bnb 4/8-bit smoke cannot run.") print(" SKIP: CUDA unavailable; real bnb 4/8-bit forward needs GPU on this machine.") return try: import bitsandbytes as bnb except ImportError: + if require_bnb: + raise RuntimeError("bitsandbytes unavailable; install the bnb-test extra.") print(" SKIP: bitsandbytes unavailable.") return @@ -240,10 +247,14 @@ def bitsandbytes_cuda_smoke(): def main(): - for v in ("lora", "pissa", "delora"): + parser = argparse.ArgumentParser() + parser.add_argument("--require-bnb", action="store_true") + args = parser.parse_args() + + for v in ("lora", "pissa", "delora", "ia3"): variant_test(v, dtype=torch.float32) structural_linear_like_test() - bitsandbytes_cuda_smoke() + bitsandbytes_cuda_smoke(args.require_bnb) print("\nALL PASS.") diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py index 3eed389..bd38dc4 100644 --- a/tests/test_lora_lite.py +++ b/tests/test_lora_lite.py @@ -103,10 +103,15 @@ def perturb_first_adapter(model: nn.Module) -> None: with torch.no_grad(): p.flatten()[0].add_(0.25) return + for name, p in model.named_parameters(): + if "lora_g" in name: + with torch.no_grad(): + p.flatten()[0].add_(0.25) + return raise AssertionError("no perturbable adapter parameter found") -@pytest.mark.parametrize("variant", ["lora", "pissa", "delora"]) +@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3"]) def test_variant_identity_hook_save_load_and_training(variant: str): ARTIFACT_DIR.mkdir(exist_ok=True) torch.manual_seed(0) @@ -124,7 +129,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str): with torch.no_grad(): y_init = model(ids).clone() 
identity_err = (y_init - y_base).abs().max().item() - identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6}[variant] + identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6, "ia3": 1e-6}[variant] assert identity_err < identity_tol before_perturb = adapter_state(model) @@ -157,7 +162,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str): assert_only_lora_trainable(train_model) target = torch.randn(2, 16, 100) * 0.1 trainable = [p for p in train_model.parameters() if p.requires_grad] - opt = torch.optim.Adam(trainable, lr=0.1) if variant == "delora" else torch.optim.SGD(trainable, lr=1e-2) + opt = torch.optim.Adam(trainable, lr=0.1) if variant in ("delora", "ia3") else torch.optim.SGD(trainable, lr=1e-2) losses = [] first_grad_norm = math.nan before_train = adapter_state(train_model) @@ -214,7 +219,7 @@ def test_no_target_layers_is_loud_failure(): ll.attach(TinyModel(), cfg) -@pytest.mark.parametrize("variant", ["lora", "delora"]) +@pytest.mark.parametrize("variant", ["lora", "delora", "ia3"]) def test_structural_non_linear_target_trains_for_forward_only_variants(variant: str): torch.manual_seed(0) model = FakeBnbModel() diff --git a/uv.lock b/uv.lock index 75af905..bf2e10d 100644 --- a/uv.lock +++ b/uv.lock @@ -7,7 +7,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-21T09:06:31.868106008Z" +exclude-newer = "2026-04-21T09:27:46.246831625Z" exclude-newer-span = "P5D" [[package]] @@ -61,6 +61,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b9/fa/123043af240e49752f1c4bd24da5053b6bd00cad78c2be53c0d1e8b975bc/backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34", size = 30181, upload-time = "2024-05-28T17:01:53.112Z" }, ] +[[package]] +name = "bitsandbytes" +version = "0.49.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging" }, + { name = "torch" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/7d/f1fe0992334b18cd8494f89aeec1dcc674635584fcd9f115784fea3a1d05/bitsandbytes-0.49.2-py3-none-macosx_14_0_arm64.whl", hash = "sha256:87be5975edeac5396d699ecbc39dfc47cf2c026daaf2d5852a94368611a6823f", size = 131940, upload-time = "2026-02-16T21:26:04.572Z" }, + { url = "https://files.pythonhosted.org/packages/29/71/acff7af06c818664aa87ff73e17a52c7788ad746b72aea09d3cb8e424348/bitsandbytes-0.49.2-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:2fc0830c5f7169be36e60e11f2be067c8f812dfcb829801a8703735842450750", size = 31442815, upload-time = "2026-02-16T21:26:06.783Z" }, + { url = "https://files.pythonhosted.org/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:54b771f06e1a3c73af5c7f16ccf0fc23a846052813d4b008d10cb6e017dd1c8c", size = 60651714, upload-time = "2026-02-16T21:26:11.579Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d4/501655842ad6771fb077f576d78cbedb5445d15b1c3c91343ed58ca46f0e/bitsandbytes-0.49.2-py3-none-win_amd64.whl", hash = "sha256:2e0ddd09cd778155388023cbe81f00afbb7c000c214caef3ce83386e7144df7d", size = 55372289, upload-time = "2026-02-16T21:26:16.267Z" }, +] + [[package]] name = "certifi" version = "2026.2.25" @@ -617,6 +634,9 @@ dependencies = [ ] [package.optional-dependencies] +bnb-test = [ + { name = "bitsandbytes" }, +] build = [ { name = "twine" }, ] @@ -633,6 +653,7 @@ test = [ [package.metadata] requires-dist = [ { name = "accelerate", marker = "extra == 'hf-test'", specifier = ">=1.6" }, + { name = "bitsandbytes", marker = "extra == 'bnb-test'", specifier = ">=0.46" }, { name = "einops", specifier = ">=0.7" }, { name = "pytest", marker = 
"extra == 'test'" }, { name = "safetensors", marker = "extra == 'hf-test'", specifier = ">=0.5" }, @@ -641,7 +662,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'hf-test'", specifier = ">=4.51" }, { name = "twine", marker = "extra == 'build'", specifier = ">=6" }, ] -provides-extras = ["build", "test", "hf-test"] +provides-extras = ["build", "test", "hf-test", "bnb-test"] [[package]] name = "markdown-it-py"