diff --git a/README.md b/README.md index 66c190d..07f91eb 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,8 @@ use r=32; the AntiPaSTO family uses r=256 (it tunes only S-space gain, so it nee | [LoRA](https://arxiv.org/abs/2106.09685) | 59.8 | 68.0 | 3.54M | 3.54M | 173 / 573 | 0.02 | | [PiSSA](https://arxiv.org/abs/2404.02948) | 59.8 | 76.0 | 3.54M | 3.54M | 146 / 549 | 2.04 | | [DoRA](https://arxiv.org/abs/2402.09353) | 60.2 | 68.0 | 3.56M | 3.54M | 161 / 556 | 0.16 | -| [DeLoRA](https://arxiv.org/abs/2503.18225) | — | — | — | — | — | — | -| [HRA](https://arxiv.org/abs/2405.17484) | — | — | — | — | — | — | +| [DeLoRA](https://arxiv.org/abs/2503.18225) | 56.2 | 62.0 | 3.54M | 3.54M | 169 / 593 | 0.21 | +| [HRA](https://arxiv.org/abs/2405.17484) | 59.2 | 70.0 | 2.75M | 2.75M | 225 / 948 | 0.04 | | [EVA](https://arxiv.org/abs/2410.07170) | — | — | — | — | — | — | | [IA3](https://arxiv.org/pdf/2205.05638) | — | — | — | — | — | — | | [IA3-FF](https://arxiv.org/pdf/2205.05638) | — | — | — | — | — | — | diff --git a/scripts/metamath_gsm8k_benchmark.py b/scripts/metamath_gsm8k_benchmark.py index dc08401..806227e 100644 --- a/scripts/metamath_gsm8k_benchmark.py +++ b/scripts/metamath_gsm8k_benchmark.py @@ -142,9 +142,7 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf extra = {"group_size": args.road_group_size} if args.variant == "antipasto_rot": extra = {"rotate_basis": args.antipasto_rotate_basis} - if args.variant == "antipasto": - extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only} - if args.variant in ("antipasto_corda", "antipasto_asvd"): + if args.variant in ("antipasto", "antipasto_corda", "antipasto_asvd"): extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only} if args.variant == "antipasto_ablate": extra = {"coeff": args.antipasto_coeff, "k": args.antipasto_ablate_k, diff --git a/src/lora_lite/config.py b/src/lora_lite/config.py index dfdd275..6c01969 100644 --- a/src/lora_lite/config.py +++ b/src/lora_lite/config.py @@ -1,9 +1,8 @@ """AdapterConfig: per-variant typed dataclass. -Replaces the older `LoraLiteConfig` + `variant_kwargs` dict. Each variant -ships its own subclass under `variants/*.py` (e.g. `DeLoRAConfig`), adding -strongly-typed fields so users discover the knobs via IDE / dataclass -introspection instead of stringly-typed dict lookups. +Each variant ships its own subclass under `variants/*.py` (e.g. `DeLoRAConfig`), +adding strongly-typed fields so the knobs are discoverable via IDE / dataclass +introspection rather than stringly-typed dict lookups. Wire-up: - `AdapterConfig` holds the universal fields (variant name, rank, alpha, diff --git a/src/lora_lite/variant.py b/src/lora_lite/variant.py index 0c50868..b5a085e 100644 --- a/src/lora_lite/variant.py +++ b/src/lora_lite/variant.py @@ -24,9 +24,6 @@ class ParamSpec: # avoid exact-zero dead zone; N(0, 1e-4) is small enough to be # ~identity but nonzero so gradients always have somewhere to go t.normal_(0, 1e-4) - elif self.init == "near_one": - # avoid exact-one dead zone; 1 + N(0, 1e-4) - t.fill_(1.0).add_(torch.randn_like(t).mul_(1e-4)) elif self.init == "ones": t.fill_(1.0) elif self.init == "kaiming": @@ -37,7 +34,7 @@ class ParamSpec: return t def make(self, dtype: torch.dtype, device) -> nn.Parameter: - # legacy entry: returns a Parameter (used for trainable adapter params) + # trainable params -> Parameter; buffers go through make_tensor (see attach) if self.as_buffer: raise RuntimeError("as_buffer spec must be installed via register_buffer; see adapter.attach") return nn.Parameter(self._empty(dtype, device), requires_grad=self.trainable) diff --git a/src/lora_lite/variants/antipasto_ablate.py b/src/lora_lite/variants/antipasto_ablate.py index be8d1ff..13806df 100644 --- a/src/lora_lite/variants/antipasto_ablate.py +++ b/src/lora_lite/variants/antipasto_ablate.py @@ -85,9 +85,9 @@ class AntiPaSTOAblate: layer.lora_Vh.copy_(Vhr.to(layer.lora_Vh.dtype)) W_res = (W - (Ur * Sr) @ Vhr).to(layer.weight.dtype) layer.weight.data.copy_(W_res) - # Optional but recommended: group_init() should warm-start lora_c from the - # S-space contrastive direction dS (see sspace.py extract). Random init also - # trains, just slower and with no guarantee it finds the behavior direction. + # FIXME: lora_c is random-init. A group_init warm-start from the S-space + # contrastive direction dS (cf. sspace.py extract) would converge faster and + # land on the behavior direction; not implemented -- random trains, just slower. @staticmethod def group_init(model: nn.Module, targets, cfg, calibration_data: CalibrationData | None) -> None: diff --git a/src/lora_lite/variants/antipasto_asvd.py b/src/lora_lite/variants/antipasto_asvd.py new file mode 100644 index 0000000..dea43cc --- /dev/null +++ b/src/lora_lite/variants/antipasto_asvd.py @@ -0,0 +1,43 @@ +"""AntiPaSTO-ASVD: diagonal-covariance sibling of antipasto_corda. + +Same frozen-basis bounded gain, but orients the SVD by the DIAGONAL of the input +second moment (per-channel activation scale) instead of the full covariance: + + M = diag(E[x_i^2]) vs CorDA's full C = E[x x^T] + +This is Activation-aware SVD (Yuan+ 2023, arXiv:2312.05821): SVD(W diag(s)) with s a +per-channel scale. It is NOT a sub-basis of CorDA -- diag(C)^{1/2} and C^{1/2} are +different oblique rotations, so the top-r directions differ and either can win on a task. +ASVD is the cheap arm: O(d_in) moment, no d_in x d_in matrix, no eigh. The head-to-head +with antipasto_corda isolates whether the off-diagonal of C earns its init cost here. + +Reuses antipasto_corda's buffers (U, S, P, g), plain-SVD init, gain forward, and the +shared `_covariance_orient` (only the diag flag differs), so there is one copy of the +math to keep in sync. + +Refs: antipasto_corda.py (full-covariance sibling), ASVD arXiv:2312.05821. +""" +from dataclasses import dataclass + +from ..variant import register +from ..config import register_config +from .antipasto_corda import AntiPaSTOCorDA, AntiPaSTOCorDAConfig, _covariance_orient + + +@register_config +@dataclass +class AntiPaSTOASVDConfig(AntiPaSTOCorDAConfig): + variant: str = "antipasto_asvd" + + +@register +class AntiPaSTOASVD: + name = "antipasto_asvd" + param_specs = staticmethod(AntiPaSTOCorDA.param_specs) + init = staticmethod(AntiPaSTOCorDA.init) + forward = staticmethod(AntiPaSTOCorDA.forward) + + @staticmethod + def group_init(model, targets, cfg, calibration_data) -> None: + """ASVD: re-orient by the diagonal of the input second moment (per-channel).""" + _covariance_orient(model, targets, cfg, calibration_data, diag=True) diff --git a/src/lora_lite/variants/antipasto_rot.py b/src/lora_lite/variants/antipasto_rot.py index 1055ee8..159a5c3 100644 --- a/src/lora_lite/variants/antipasto_rot.py +++ b/src/lora_lite/variants/antipasto_rot.py @@ -1,6 +1,5 @@ -"""AntiPaSTO-Rot: the original SVD adapter with learnable singular-value deltas + -block-diagonal Cayley rotation. Kept as a SEPARATE variant so we can benchmark the -rotation version against the rotation-free 1+ELU gain (antipasto.py) head to head. +"""AntiPaSTO-Rot: SVD adapter with learnable singular-value deltas + a block-diagonal +Cayley rotation of the frozen basis. The rotation arm vs antipasto.py's gain-only core. wassname 2026 https://arxiv.org/abs/2601.07473 @@ -10,12 +9,7 @@ wassname 2026 https://arxiv.org/abs/2601.07473 y = x @ W_res.T + ((x @ Vh_eff.T) * (S + delta_s)) @ U_eff.T Identity at t=0: rot_T=0 -> R=I, delta_s~4e-4 -> y ~ x @ W^T (tiny positive bias on -delta_s breaks sign symmetry). - -Why antipasto.py dropped the rotation: rotating Vh/U leaves the interpretable singular -basis, and the Cayley solve was numerically finicky. This file preserves it for the -all-else-equal comparison (does the cross-direction mixing the rotation buys beat the -cheaper, more stable gain-only adapter on the same targets and budget?). +delta_s breaks sign symmetry; rotation alone can't). Refs: - paper: https://github.com/wassname/AntiPaSTO