From 699fde31bf5abce680b4814422fea2792cab09b4 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:49:17 +0800
Subject: [PATCH] feat: ia3 variant, real bnb 4bit/8bit smoke, dev guide split,
 user-only readme

---
 README.md                            | 188 +++++----------------------
 docs/developer_guide.md              |  81 ++++++++++++
 docs/spec/20260426_lora_lite_plan.md |  19 ++-
 justfile                             |   9 +-
 pyproject.toml                       |   1 +
 scripts/qwen_train_probe.py          |   7 +-
 src/lora_lite/variants/__init__.py   |   2 +-
 src/lora_lite/variants/ia3.py        |  22 ++++
 tests/smoke.py                       |  23 +++-
 tests/test_lora_lite.py              |  13 +-
 uv.lock                              |  25 +++-
 11 files changed, 216 insertions(+), 174 deletions(-)
 create mode 100644 docs/developer_guide.md
 create mode 100644 src/lora_lite/variants/ia3.py

diff --git a/README.md b/README.md
index a27c1c2..d30686e 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
 # lora-lite
 
-A hackable, single-file-per-variant LoRA library built on PyTorch forward hooks.
+Hackable PyTorch adapters for LoRA-family and small PEFT experiments.
 
-The goal is not to be PEFT-compatible. The goal is to make adapter ideas easy to read, edit, test, and throw away.
+`lora-lite` uses forward hooks instead of module replacement. Adapter parameters are plain `nn.Parameter`s on the target layer, e.g. `model.layers[5].self_attn.q_proj.lora_A`.
 
 ## Install
 
 ```bash
-pip install -e .
+pip install -e git+https://github.com/wassname/lora-lite.git#egg=lora-lite
 ```
 
 ## Quickstart
@@ -15,7 +15,7 @@ pip install -e .
 ```python
 import torch, lora_lite as ll
 
-model = MyTransformer()  # any nn.Module containing linear-like children
+model = MyTransformer()
 cfg = ll.LoraLiteConfig(variant="lora", r=8, alpha=16, dtype=torch.bfloat16)
 ll.attach(model, cfg)
 
@@ -27,172 +27,54 @@ ll.detach(model)
 ll.load(model, "adapter.pt")
 ```
 
-Inspect a live adapter tensor directly:
+## Does it work?
 
-```python
-A = model.layers[5].self_attn.q_proj.lora_A
+```bash
+just check       # pytest + smoke + package build + metadata check
+just bnb-smoke   # bitsandbytes 4bit/8bit smoke; needs CUDA + bnb and fails instead of skipping
+just qwen-probe  # Qwen/Qwen3-0.6B train/save-load probe
 ```
 
-## Core idea
+See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) for verification history and exact results.
 
-Each variant owns the adapter math. The runtime only finds target layers, attaches `lora_*` parameters, registers hooks, and saves full-path adapter keys.
+## Variants
 
-```python
-def attach(model, cfg):
-    targets ← find_linear_like_modules(model, cfg)
-    freeze(model.parameters())
-    for name, layer in targets:
-        layer.lora_* ← variant.param_specs(layer, cfg)
-        hook(layer, lambda x, y: variant.forward(layer, x, y))
-
-def save(model, path):
-    torch.save({"cfg": cfg, "state": state_dict_keys_containing("lora_")}, path)
-```
-
-Minimal by design:
-
-- One file per variant.
-- No module replacement, merge/unmerge, mixed-adapter batches, or PEFT config soup.
-- LoRA/DeLoRA hooks work with `nn.Linear` and bnb-style `Linear{4bit,8bitLt}` modules that expose `in_features`, `out_features`, and `weight`.
-- PiSSA is fp-only because it mutates `weight` into `W_res`; quantized PiSSA should fail loudly until dequantize/requantize is explicit.
-
-Currently shipped variants:
-
-| Variant | Class | File |
+| Variant | Support | Notes |
 |---|---|---|
-| LoRA | A (additive) | [src/lora_lite/variants/lora.py](src/lora_lite/variants/lora.py) |
-| PiSSA ([Meng+ 2024](https://arxiv.org/abs/2404.02948)) | A + B (special init mutates W) | [src/lora_lite/variants/pissa.py](src/lora_lite/variants/pissa.py) |
-| DeLoRA ([Bini+ 2025](https://arxiv.org/abs/2503.18225)) | A (additive, normalised) | [src/lora_lite/variants/delora.py](src/lora_lite/variants/delora.py) |
-
-See [docs/spec/20260426_lora_lite_plan.md](docs/spec/20260426_lora_lite_plan.md) for goals, status, and the current design plan. The original broader design was stress-tested against the [adapters_as_hypotheses](https://github.com/wassname/adapters_as_hypotheses) catalog (~26/27 variants covered with 3 small API tweaks).
+| LoRA | yes | additive low-rank adapter |
+| PiSSA | yes, fp only | mutates `weight` into `W_res`; quantized PiSSA intentionally fails |
+| DeLoRA | yes | normalized additive adapter with learned scalar |
+| IA3 | yes | output gate initialized to ones |
+| DoRA | no | next small candidate |
+| SSVD / OFT / HRA / ROAD | no | planned after the hook-only invariant is clear |
+| S-steer / AntiPaSTO | no | should use data-calibrated `group_init`, not plain LoRA tests |
 
 ## Targeting
 
-By default we target linear-like modules (`in_features`, `out_features`, `weight`) whose shape matches a "reader" (`d_in == d_model`) or "writer" (`d_out == d_model`) role, excluding `lm_head` and `embed_tokens`. This structural test is what lets bnb Linear4bit/8bitLt modules be targeted without a backend-specific class. Knobs on `LoraLiteConfig`:
+By default, `lora-lite` targets linear-like modules with `in_features`, `out_features`, and `weight`, excluding `lm_head` and `embed_tokens`.
 
-- `target_roles`: subset of `("reader", "writer", "inner")`. `()` = all.
-- `target_names`: regex includes (must match if non-empty).
-- `exclude_names`: regex excludes (default skips `lm_head`, `embed_tokens`).
-- `layers`: tuple of layer indices, or `None` for all (matches `.layers.<idx>.` in module name).
+Useful `LoraLiteConfig` fields:
 
-## Variant API
+- `target_roles`: subset of `("reader", "writer", "inner")`; `()` means all.
+- `target_names`: regex includes.
+- `exclude_names`: regex excludes.
+- `layers`: layer indices, matching `.layers.<idx>.` in module names.
 
-A variant is a class with a `name` and three statics:
+This structural targeting is why LoRA, DeLoRA, and IA3 can run on bnb-style `Linear4bit`/`Linear8bitLt` modules. PiSSA is different because it edits the base weight.
+
+## Save format
+
+Adapters are just:
 
 ```python
-@register
-class MyVariant:
-    name = "myvariant"
-
-    @staticmethod
-    def param_specs(d_in, d_out, cfg) -> dict[str, ParamSpec]:
-        return {"lora_A": ParamSpec((cfg.r, d_in), init="kaiming"), ...}
-
-    @staticmethod
-    def init(layer, cfg) -> None:
-        # Optional. Run after params are created. May read/mutate layer.weight.
-        ...
-
-    @staticmethod
-    def forward(layer, x, y) -> Tensor:
-        # Return the layer's NEW output (additive: `return y + delta`).
-        ...
+torch.save({"cfg": cfg.to_dict(), "state": lora_state_dict}, "adapter.pt")
 ```
 
-Adapter params attached as `layer.lora_*` get full-path keys in `state_dict()` automatically (e.g. `model.layers.5.self_attn.q_proj.lora_A`).
+`lora_state_dict` contains full-path keys with `"lora_"` in the name. Missing or unexpected adapter keys fail on load.
 
-## Data-calibrated init
+## Developer docs
 
-PiSSA, DeLoRA, and LoRA only use `layer.weight` for init -- no calibration data needed.
-
-For variants that DO need data (e.g. AntiPaSTO, LoRA-GA, activation-aware SVD), keep dataloaders out of `cfg` so adapter checkpoints stay serializable. Use:
-
-```python
-ll.attach(model, cfg, calibration_data=calib)
-```
-
-where `calib` is an iterable of whole-model inputs, e.g. `Iterable[dict[str, Tensor]]` for HF models or `Iterable[Tensor]` of token ids. Activation-aware variants implement:
-
-```python
-@staticmethod
-def group_init(model, targets, cfg, calibration_data): ...
-```
-
-`targets` is `list[(name, layer, role)]`. The variant adds temporary hooks, runs `model(batch)` over `calibration_data`, removes the hooks, then writes `lora_*` params. Per-layer `init(layer, cfg)` stays weight-only.
-
-Sketch:
-
-```python
-@register
-class ActSVD:
-    name = "actsvd"
-    @staticmethod
-    def param_specs(d_in, d_out, cfg): ...
-    @staticmethod
-    def group_init(model, targets, cfg, calibration_data):
-        bufs = {name: [] for name, _, _ in targets}
-        hooks = [
-            layer.register_forward_pre_hook(
-                lambda m, args, name=name: bufs[name].append(args[0].detach().float())
-            )
-            for name, layer, _ in targets
-        ]
-        try:
-            with torch.no_grad():
-                for batch in calibration_data:
-                    model(**batch) if isinstance(batch, dict) else model(batch)
-        finally:
-            for h in hooks:
-                h.remove()
-        # For each target: X = torch.cat(bufs[name], dim=0); do SVD; write A/B.
-```
-
-## Smoke test
-
-```bash
-just check       # pytest + smoke + package build
-just test
-just smoke
-just qwen-queue  # queued Qwen/Qwen3-0.6B proof via pueue
-```
-
-`just test` verifies, for each of `lora`, `pissa`, `delora`:
-
-1. Identity at t=0: `max|y_adapter - y_base|` within float tolerance.
-2. Adapter hooks are live: perturbing only `lora_*` changes outputs.
-3. Save/load round-trip preserves full-path adapter keys and tensors.
-4. Missing or unexpected `lora_*` checkpoint keys fail loudly.
-5. Only `lora_*` parameters are trainable and base parameters get no gradients.
-6. A 20-step tiny regression training probe gets finite nonzero adapter gradients and >5% loss drop.
-
-`just qwen-probe` is the real-model proof. It loads `Qwen/Qwen3-0.6B` fresh per variant, attaches only layer-0 `q_proj`/`v_proj`, trains one fixed LM batch, saves adapters, reloads into a fresh base model, and checks logits match. Last verified on 2026-04-26:
-
-| variant | targets | trainable | identity err | perturb delta | loss0 | lossN | drop % | grad norm | adapter delta | reload err |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-| LoRA | 2 | 20,480 | 0 | 0.3750 | 5.250 | 3.131 | 40.36 | 1.432 | 4.262 | 0 |
-| PiSSA | 2 | 20,480 | 0.3125 | 0.7500 | 5.250 | 3.629 | 30.88 | 6.124 | 4.381 | 0 |
-| DeLoRA | 2 | 20,482 | 0.3750 | 0.4062 | 5.246 | 5.166 | 1.537 | 0.04778 | 8.196 | 0 |
-
-This is an interface/training proof, not a benchmark: exact Qwen target names, hook activity, lora-only gradients, loss decrease, adapter tensor save/load, and reload equivalence on a 0.6B HF model.
-
-CI runs `just check` on GitHub. The larger Qwen proof stays in `pueue` because it needs the shared GPU.
-
-## What's NOT in v1
-
-| Feature | Why dropped |
-|---|---|
-| merge/unmerge | reload base if you want vanilla |
-| 4/8-bit-aware merge | DoRA on bnb supported in forward only (drop merge path) |
-| Embedding / Conv adapters | trivial extension; add when needed |
-| `adapter_names=` mixed batch forward | rare; add when needed |
-| Multiple named adapters per layer | one variant per `attach()` |
-| HF `PeftConfig` / hub upload | `torch.save({cfg, state})` is enough |
-| AdaLoRA-style rank scheduling | needs `Variant.on_step(step)` -- punt |
-| ReFT-style position interventions | sibling submodule (different hook site) |
-
-## Status
-
-v0.0.1: LoRA + PiSSA + DeLoRA + minimal functional tests + Qwen proof. Next likely variants are IA3 or DoRA because they fit the current hook contract with little new machinery. OFT/ROAD/AntiPaSTO-style methods are more interesting, but should wait until the simple hook family is boring.
+See [docs/developer_guide.md](docs/developer_guide.md) for the variant API, data-calibrated init, and adapter roadmap.
 
 ## Citation
 
@@ -203,4 +85,4 @@ v0.0.1: LoRA + PiSSA + DeLoRA + minimal functional tests + Qwen proof. Next like
   year = {2026},
   url = {https://github.com/wassname/lora-lite/}
 }
-```
\ No newline at end of file
+```
diff --git a/docs/developer_guide.md b/docs/developer_guide.md
new file mode 100644
index 0000000..fd264fa
--- /dev/null
+++ b/docs/developer_guide.md
@@ -0,0 +1,81 @@
+# Developer guide
+
+This is the implementation note for people adding adapter variants. The README is only for prospective users.
+
+## Design principles
+
+- Variants own adapter math.
+- The runtime owns targeting, parameter attachment, hooks, and save/load.
+- Adapter parameters live directly on target layers as `lora_*` parameters.
+- Save/load uses normal full-path `state_dict()` keys filtered by `"lora_"`.
+- Fail loudly on unsupported weight semantics. No silent quantized PiSSA or merge fallback.
+
+## Variant contract
+
+A variant is a registered class with a small static interface:
+
+```python
+@register
+class MyVariant:
+    name = "myvariant"
+
+    @staticmethod
+    def param_specs(d_in, d_out, cfg) -> dict[str, ParamSpec]:
+        return {"lora_A": ParamSpec((cfg.r, d_in), init="kaiming")}
+
+    @staticmethod
+    def init(layer, cfg) -> None:
+        ...
+
+    @staticmethod
+    def forward(layer, x, y):
+        return y_new
+```
+
+Pseudocode for the runtime:
+
+```python
+def attach(model, cfg):
+    targets ← find_linear_like_modules(model, cfg)
+    freeze(model.parameters())
+    for name, layer in targets:
+        layer.lora_* ← variant.param_specs(layer, cfg)
+        variant.init(layer, cfg)
+        hook(layer, lambda x, y: variant.forward(layer, x, y))
+
+def save(model, path):
+    torch.save({"cfg": cfg, "state": state_dict_keys_containing("lora_")}, path)
+```
+
+## Data-calibrated init
+
+LoRA, PiSSA, DeLoRA, and IA3 only use `layer.weight` or identity constants for init.
+
+Variants that need data, e.g. AntiPaSTO, LoRA-GA, or activation-aware SVD, should keep dataloaders out of `cfg` so adapter checkpoints stay serializable:
+
+```python
+ll.attach(model, cfg, calibration_data=calib)
+```
+
+Activation-aware variants implement `group_init(model, targets, cfg, calibration_data)`. The variant may add temporary hooks, run calibration batches, remove hooks, then write `lora_*` params. `load()` should not require calibration data.
+
+## Current limitations
+
+| Feature | Current choice |
+|---|---|
+| merge/unmerge | reload the base model if vanilla weights are needed |
+| multiple named adapters | one variant per `attach()` |
+| mixed-adapter batches | out of scope until needed |
+| quantized PiSSA | fail-fast; explicit dequantize/requantize required |
+| AdaLoRA rank scheduling | needs a future `Variant.on_step(step)` hook |
+| ReFT-style interventions | likely a sibling module or different hook site |
+
+## Adapter roadmap
+
+| Variant | Fit to current runtime | Next invariant |
+|---|---|---|
+| IA3 | Done. Output gate `y * g`, identity at `g=1`. | Qwen proof task 79. |
+| DoRA | Likely additive hook for fp layers; quantized norm semantics need care. | fp identity, perturb, save/load, loss drop. |
+| SSVD / PiSSA-family | Fits weight-SVD init path. | reconstruction/identity invariant plus train proof. |
+| HRA / OFT / ROAD | Interesting, but weight-transform semantics need clearer hook-only formulation. | pseudocode first, then rotation/non-dead-code invariant. |
+| S-steer / AntiPaSTO | Should use `group_init` and activation evidence. | calibration consumed, hooks removed, load works without calibration. |
diff --git a/docs/spec/20260426_lora_lite_plan.md b/docs/spec/20260426_lora_lite_plan.md
index f9381df..a601419 100644
--- a/docs/spec/20260426_lora_lite_plan.md
+++ b/docs/spec/20260426_lora_lite_plan.md
@@ -36,8 +36,9 @@ The core bet is that adapter variants should own the relationship between `(x, l
 | LoRA | done | `src/lora_lite/variants/lora.py` |
 | PiSSA | done, fp-only | `src/lora_lite/variants/pissa.py` |
 | DeLoRA | done | `src/lora_lite/variants/delora.py` |
+| IA3 | done | `src/lora_lite/variants/ia3.py` |
 | Smoke tests | done | `tests/smoke.py` |
-| bnb minimal forward smoke | done | `Linear8bitLt` and `Linear4bit` pass on CUDA |
+| bnb minimal forward smoke | done | `Linear8bitLt` and `Linear4bit` pass on CUDA with `just bnb-smoke` |
 
 ## Current smoke evidence
 
@@ -51,6 +52,8 @@ Last verified log: `/home/wassname/.cache/agent-tmp/lora_lite_smoke_after_review
 | PiSSA loss drop | `11.5%` |
 | DeLoRA identity | `0.000e+00` |
 | DeLoRA loss drop | `93.4%` |
+| IA3 identity | `0.000e+00` |
+| IA3 loss drop | `88.7%` |
 | fake non-`nn.Linear` target | attaches, identity `0.000e+00`, grad nonzero |
 | bnb `Linear8bitLt` | identity `0.000e+00`, grad nonzero |
 | bnb `Linear4bit` | identity `0.000e+00`, grad nonzero |
@@ -64,6 +67,7 @@ Goal: upgrade from smoke-tested sketch to evidence that the current PEFT-lite in
 In:
 
 - Pytest coverage for LoRA, PiSSA, and DeLoRA correctness invariants.
+- IA3 coverage as the first simple adapter, added while waiting on the queued Qwen run.
 - A real `Qwen/Qwen3-0.6B` probe that trains each current variant on layer-0 `q_proj` and `v_proj`.
 - Repeatable `just` recipes and workspace-local logs/artifacts.
 
@@ -130,18 +134,25 @@ Out:
 
 | Requirement | Distinguishing check | Evidence |
 |---|---|---|
-| R7: fast CI catches broken tests/builds | `just check` must run pytest, smoke, `uv build`, and `twine check`; a broken test, wheel, sdist, or README metadata fails the hook. | `just check` -> pytest `8 passed in 9.53s`, smoke all pass, wheel/sdist built, `twine check dist/*` passed |
-| R8: large proof is queued, not hidden in CI | `just qwen-queue` must create a pueue task in the repo cwd with why/resolve label and intended Qwen command. | `just qwen-queue && pueue status` -> task 74 queued at `/media/wassname/SGIronWolf/projects5/2026/lora-lite` with `just qwen-probe lora pissa delora 16` |
+| R7: fast CI catches broken tests/builds | `just check` must run pytest, smoke, `uv build`, and `twine check`; a broken test, wheel, sdist, or README metadata fails the hook. | `just check` -> pytest `10 passed in 4.10s`, smoke all pass, wheel/sdist built, `twine check dist/*` passed |
+| R8: large proof is queued, not hidden in CI | `just qwen-queue` must create a pueue task in the repo cwd with why/resolve label and intended Qwen command. | task 79 queued at `/media/wassname/SGIronWolf/projects5/2026/lora-lite` with `--variants lora pissa delora ia3 --steps 16` |
 | R9: README is publishable enough to judge | Reader sees install, quickstart, pseudocode core, testing commands, proof caveat before variant wishlist. | `README.md` reordered and human note removed |
 | R10: variant roadmap buys simplicity | Next variant is ranked by fit to current hook contract; non-hook variants are deferred rather than half-supported. | this section |
 
 Fresh review first blocked on weak `qwen-queue` evidence and README citation/comment junk. Fixes: queued real pueue task 74, added `twine check`, fenced citation, removed the stray README note. Final fresh review verdict: PASS.
 
+Follow-up after omega correction:
+
+- The recursive task failed previously because it was left as a passive sentinel instead of being expanded immediately after push.
+- Real bnb proof now runs through required `just bnb-smoke` with `bitsandbytes==0.49.2`; `Linear8bitLt` and `Linear4bit` both show identity error `0.000e+00` and nonzero adapter gradients. The command now fails instead of skipping if CUDA or bnb is unavailable.
+- IA3 was added while waiting for queued Qwen work: `just test` now has 10 tests, `just smoke` includes IA3, and pueue task 79 is queued for LoRA/PiSSA/DeLoRA/IA3 on Qwen 0.6B. Task 78 caught a quoting bug in `qwen-queue`; the recipe now queues a single `bash -lc` command.
+- README was reduced to prospective-user content; implementation notes moved to `docs/developer_guide.md`.
+
 ### Adapter roadmap, ranked by simplicity
 
 | Variant | Why it fits or waits | Next check |
 |---|---|---|
-| IA3 | Multiplicative vector on activations. Probably the smallest new file and no base-weight mutation. | Identity with ones, perturb changes output, loss drops, save/load exact. |
+| IA3 | Implemented. Multiplicative output vector, no base-weight mutation. | `just test` -> 10 tests passed; `just smoke` -> identity/save-load/loss drop passed. Qwen task 79 queued. |
 | DoRA | Fits additive hook for fp layers; bnb norm handling must be explicit or fail-fast. | fp smoke first; quantized proof only after norm semantics are obvious. |
 | SSVD / PiSSA-family | Fits current `weight`-SVD pattern and teaches the SVD adapter path. | Reconstruction/identity invariant plus train proof. |
 | HRA / OFT / ROAD | Interesting, but likely wants orthogonal or weight-transform semantics. Keep until hook-only formulation is clear. | Pseudocode first, then one invariant that distinguishes real rotation from dead code. |
diff --git a/justfile b/justfile
index f716827..749697d 100644
--- a/justfile
+++ b/justfile
@@ -11,18 +11,21 @@ test:
 smoke:
 	uv run --extra test python tests/smoke.py
 
+bnb-smoke:
+	uv run --extra test --extra bnb-test python tests/smoke.py --require-bnb
+
 build:
 	rm -rf dist
 	uv build
 	uv run --extra build twine check dist/*
 
-qwen-probe variants="lora pissa delora" steps="8":
+qwen-probe variants="lora pissa delora ia3" steps="8":
 	uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}}
 
-qwen-queue variants="lora pissa delora" steps="16":
+qwen-queue variants="lora pissa delora ia3" steps="16":
 	#!/usr/bin/env bash
 	set -euo pipefail
 	pueue add \
 		-l "why: verify Qwen0.6B train/save-load proof for {{variants}} at {{steps}} steps; resolve: publish docs only if exact targets, lora-only grads, loss drop, reload pass" \
 		-w "$PWD" -o 1 -- \
-		just qwen-probe "{{variants}}" "{{steps}}"
\ No newline at end of file
+		bash -lc 'uv run --extra test --extra hf-test python scripts/qwen_train_probe.py --variants {{variants}} --steps {{steps}}'
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 7a885b9..faa96b3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ Issues = "https://github.com/wassname/lora-lite/issues"
 build = ["twine>=6"]
 test = ["pytest", "tabulate"]
 hf-test = ["accelerate>=1.6", "safetensors>=0.5", "transformers>=4.51"]
+bnb-test = ["bitsandbytes>=0.46"]
 
 [build-system]
 requires = ["setuptools>=68"]
diff --git a/scripts/qwen_train_probe.py b/scripts/qwen_train_probe.py
index 82e32c3..4d21a77 100644
--- a/scripts/qwen_train_probe.py
+++ b/scripts/qwen_train_probe.py
@@ -58,6 +58,11 @@ def perturb_first_adapter(model: torch.nn.Module) -> None:
             with torch.no_grad():
                 p.flatten()[0].add_(0.25)
             return
+    for name, p in model.named_parameters():
+        if "lora_g" in name:
+            with torch.no_grad():
+                p.flatten()[0].add_(0.25)
+            return
     raise AssertionError("no perturbable adapter parameter found")
 
 
@@ -168,7 +173,7 @@ def run_variant(args, variant: str, input_ids: torch.Tensor, labels: torch.Tenso
 def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", default="Qwen/Qwen3-0.6B")
-    parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora"])
+    parser.add_argument("--variants", nargs="+", default=["lora", "pissa", "delora", "ia3"])
     parser.add_argument("--device", default="cuda")
     parser.add_argument("--torch-dtype", default="bfloat16")
     parser.add_argument("--steps", type=int, default=8)
diff --git a/src/lora_lite/variants/__init__.py b/src/lora_lite/variants/__init__.py
index ea22188..aaaf9e3 100644
--- a/src/lora_lite/variants/__init__.py
+++ b/src/lora_lite/variants/__init__.py
@@ -1 +1 @@
-from . import lora, pissa, delora  # noqa: F401  side-effect: register
+from . import lora, pissa, delora, ia3  # noqa: F401  side-effect: register
diff --git a/src/lora_lite/variants/ia3.py b/src/lora_lite/variants/ia3.py
new file mode 100644
index 0000000..57f1bd3
--- /dev/null
+++ b/src/lora_lite/variants/ia3.py
@@ -0,0 +1,22 @@
+"""IA3-style output gating. y_new = y * g, with g initialized to ones."""
+import torch
+from torch import nn
+
+from ..variant import register, ParamSpec
+
+
+@register
+class IA3:  # IA3-style output gate: one learned scale per output feature.
+    name = "ia3"
+
+    @staticmethod
+    def param_specs(d_in, d_out, cfg):  # d_in and cfg are unused: the gate has d_out entries only.
+        return {"lora_g": ParamSpec((d_out,), init="ones", trainable=True)}
+
+    @staticmethod
+    def init(layer: nn.Linear, cfg) -> None:  # No-op: the ones init already gives y_new == y.
+        return
+
+    @staticmethod
+    def forward(layer: nn.Linear, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        return y * layer.lora_g.to(y.dtype)  # Cast the gate so a dtype mismatch cannot promote y.
\ No newline at end of file
diff --git a/tests/smoke.py b/tests/smoke.py
index 15f3267..e9c150e 100644
--- a/tests/smoke.py
+++ b/tests/smoke.py
@@ -1,4 +1,4 @@
-"""Smoke test: lora / pissa / delora on a tiny synthetic transformer-like model.
+"""Smoke test: current variants on a tiny synthetic transformer-like model.
 
 Verifies:
     1. Identity at t=0 (delta ~ 0, output close to base).
@@ -15,6 +15,7 @@ BLUF format:
     SHOULD: loss decreases > 5% over 20 SGD steps for all variants. ELSE grad/wiring bug.
 """
 from __future__ import annotations
+import argparse
 import os, sys, math
 from pathlib import Path
 import torch
@@ -128,6 +129,7 @@ def variant_test(variant: str, dtype=torch.float32):
         "lora": 1e-6,
         "pissa": 5e-4,    # SVD recon in fp32 is tight; bf16 would be ~1e-2
         "delora": 1e-6,   # lambda0=0
+        "ia3": 1e-6,
     }[variant] * max(1.0, base_scale)
     assert err < tol, f"  FAIL identity: err {err} > tol {tol}"
     print(f"  SHOULD: err<{tol:.1e}. PASS.")
@@ -165,7 +167,7 @@ def variant_test(variant: str, dtype=torch.float32):
     target = torch.randn(2, 16, 100, dtype=dtype) * 0.1
     trainable = [p for p in model.parameters() if p.requires_grad]
     # delora has tightly-normalised updates; use Adam with higher lr to see signal in 20 steps
-    if variant == "delora":
+    if variant in ("delora", "ia3"):
         opt = torch.optim.Adam(trainable, lr=1e-1)
     else:
         opt = torch.optim.SGD(trainable, lr=1e-2)
@@ -204,14 +206,19 @@ def structural_linear_like_test():
     print("  SHOULD: structural target attaches and lora_B receives grad. PASS.")
 
 
-def bitsandbytes_cuda_smoke():
-    print("\n=== optional bitsandbytes CUDA smoke ===")
+def bitsandbytes_cuda_smoke(require_bnb: bool):
+    label = "required" if require_bnb else "optional"
+    print(f"\n=== {label} bitsandbytes CUDA smoke ===")
     if not torch.cuda.is_available():
+        if require_bnb:
+            raise RuntimeError("CUDA unavailable; required real bnb 4/8-bit smoke cannot run.")
         print("  SKIP: CUDA unavailable; real bnb 4/8-bit forward needs GPU on this machine.")
         return
     try:
         import bitsandbytes as bnb
     except ImportError:
+        if require_bnb:
+            raise RuntimeError("bitsandbytes unavailable; install the bnb-test extra.")
         print("  SKIP: bitsandbytes unavailable.")
         return
 
@@ -240,10 +247,14 @@ def bitsandbytes_cuda_smoke():
 
 
 def main():
-    for v in ("lora", "pissa", "delora"):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--require-bnb", action="store_true")
+    args = parser.parse_args()
+
+    for v in ("lora", "pissa", "delora", "ia3"):
         variant_test(v, dtype=torch.float32)
     structural_linear_like_test()
-    bitsandbytes_cuda_smoke()
+    bitsandbytes_cuda_smoke(args.require_bnb)
     print("\nALL PASS.")
 
 
diff --git a/tests/test_lora_lite.py b/tests/test_lora_lite.py
index 3eed389..bd38dc4 100644
--- a/tests/test_lora_lite.py
+++ b/tests/test_lora_lite.py
@@ -103,10 +103,15 @@ def perturb_first_adapter(model: nn.Module) -> None:
             with torch.no_grad():
                 p.flatten()[0].add_(0.25)
             return
+    for name, p in model.named_parameters():
+        if "lora_g" in name:
+            with torch.no_grad():
+                p.flatten()[0].add_(0.25)
+            return
     raise AssertionError("no perturbable adapter parameter found")
 
 
-@pytest.mark.parametrize("variant", ["lora", "pissa", "delora"])
+@pytest.mark.parametrize("variant", ["lora", "pissa", "delora", "ia3"])
 def test_variant_identity_hook_save_load_and_training(variant: str):
     ARTIFACT_DIR.mkdir(exist_ok=True)
     torch.manual_seed(0)
@@ -124,7 +129,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str):
     with torch.no_grad():
         y_init = model(ids).clone()
     identity_err = (y_init - y_base).abs().max().item()
-    identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6}[variant]
+    identity_tol = {"lora": 1e-6, "pissa": 5e-4, "delora": 1e-6, "ia3": 1e-6}[variant]
     assert identity_err < identity_tol
 
     before_perturb = adapter_state(model)
@@ -157,7 +162,7 @@ def test_variant_identity_hook_save_load_and_training(variant: str):
     assert_only_lora_trainable(train_model)
     target = torch.randn(2, 16, 100) * 0.1
     trainable = [p for p in train_model.parameters() if p.requires_grad]
-    opt = torch.optim.Adam(trainable, lr=0.1) if variant == "delora" else torch.optim.SGD(trainable, lr=1e-2)
+    opt = torch.optim.Adam(trainable, lr=0.1) if variant in ("delora", "ia3") else torch.optim.SGD(trainable, lr=1e-2)
     losses = []
     first_grad_norm = math.nan
     before_train = adapter_state(train_model)
@@ -214,7 +219,7 @@ def test_no_target_layers_is_loud_failure():
         ll.attach(TinyModel(), cfg)
 
 
-@pytest.mark.parametrize("variant", ["lora", "delora"])
+@pytest.mark.parametrize("variant", ["lora", "delora", "ia3"])
 def test_structural_non_linear_target_trains_for_forward_only_variants(variant: str):
     torch.manual_seed(0)
     model = FakeBnbModel()
diff --git a/uv.lock b/uv.lock
index 75af905..bf2e10d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -7,7 +7,7 @@ resolution-markers = [
 ]
 
 [options]
-exclude-newer = "2026-04-21T09:06:31.868106008Z"
+exclude-newer = "2026-04-21T09:27:46.246831625Z"
 exclude-newer-span = "P5D"
 
 [[package]]
@@ -61,6 +61,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b9/fa/123043af240e49752f1c4bd24da5053b6bd00cad78c2be53c0d1e8b975bc/backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34", size = 30181, upload-time = "2024-05-28T17:01:53.112Z" },
 ]
 
+[[package]]
+name = "bitsandbytes"
+version = "0.49.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "packaging" },
+    { name = "torch" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d8/7d/f1fe0992334b18cd8494f89aeec1dcc674635584fcd9f115784fea3a1d05/bitsandbytes-0.49.2-py3-none-macosx_14_0_arm64.whl", hash = "sha256:87be5975edeac5396d699ecbc39dfc47cf2c026daaf2d5852a94368611a6823f", size = 131940, upload-time = "2026-02-16T21:26:04.572Z" },
+    { url = "https://files.pythonhosted.org/packages/29/71/acff7af06c818664aa87ff73e17a52c7788ad746b72aea09d3cb8e424348/bitsandbytes-0.49.2-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:2fc0830c5f7169be36e60e11f2be067c8f812dfcb829801a8703735842450750", size = 31442815, upload-time = "2026-02-16T21:26:06.783Z" },
+    { url = "https://files.pythonhosted.org/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:54b771f06e1a3c73af5c7f16ccf0fc23a846052813d4b008d10cb6e017dd1c8c", size = 60651714, upload-time = "2026-02-16T21:26:11.579Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/d4/501655842ad6771fb077f576d78cbedb5445d15b1c3c91343ed58ca46f0e/bitsandbytes-0.49.2-py3-none-win_amd64.whl", hash = "sha256:2e0ddd09cd778155388023cbe81f00afbb7c000c214caef3ce83386e7144df7d", size = 55372289, upload-time = "2026-02-16T21:26:16.267Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2026.2.25"
@@ -617,6 +634,9 @@ dependencies = [
 ]
 
 [package.optional-dependencies]
+bnb-test = [
+    { name = "bitsandbytes" },
+]
 build = [
     { name = "twine" },
 ]
@@ -633,6 +653,7 @@ test = [
 [package.metadata]
 requires-dist = [
     { name = "accelerate", marker = "extra == 'hf-test'", specifier = ">=1.6" },
+    { name = "bitsandbytes", marker = "extra == 'bnb-test'", specifier = ">=0.46" },
     { name = "einops", specifier = ">=0.7" },
     { name = "pytest", marker = "extra == 'test'" },
     { name = "safetensors", marker = "extra == 'hf-test'", specifier = ">=0.5" },
@@ -641,7 +662,7 @@ requires-dist = [
     { name = "transformers", marker = "extra == 'hf-test'", specifier = ">=4.51" },
     { name = "twine", marker = "extra == 'build'", specifier = ">=6" },
 ]
-provides-extras = ["build", "test", "hf-test"]
+provides-extras = ["build", "test", "hf-test", "bnb-test"]
 
 [[package]]
 name = "markdown-it-py"