mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 15:15:55 +08:00
comment hygiene + HRA row: shorten docstrings, drop dead init branch, track asvd
- variant.py: fix mislabeled "legacy entry" (make() is the live param path); drop unused near_one init branch
- config.py: drop "replaces older LoraLiteConfig" history narration
- antipasto_ablate.py: aspirational "should warm-start" comment -> tracked FIXME
- antipasto_rot.py: cut "kept as separate variant" / "why antipasto dropped rotation" ramble
- benchmark: merge duplicate antipasto/corda/asvd cfg branch
- README: fill HRA row (test 59.2 / valid 70.0)
- track antipasto_asvd.py (was imported+registered but uncommitted)

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -56,8 +56,8 @@ use r=32; the AntiPaSTO family uses r=256 (it tunes only S-space gain, so it nee
|
||||
| [LoRA](https://arxiv.org/abs/2106.09685) | 59.8 | 68.0 | 3.54M | 3.54M | 173 / 573 | 0.02 |
|
||||
| [PiSSA](https://arxiv.org/abs/2404.02948) | 59.8 | 76.0 | 3.54M | 3.54M | 146 / 549 | 2.04 |
|
||||
| [DoRA](https://arxiv.org/abs/2402.09353) | 60.2 | 68.0 | 3.56M | 3.54M | 161 / 556 | 0.16 |
|
||||
| [DeLoRA](https://arxiv.org/abs/2503.18225) | — | — | — | — | — | — |
|
||||
| [HRA](https://arxiv.org/abs/2405.17484) | — | — | — | — | — | — |
|
||||
| [DeLoRA](https://arxiv.org/abs/2503.18225) | 56.2 | 62.0 | 3.54M | 3.54M | 169 / 593 | 0.21 |
|
||||
| [HRA](https://arxiv.org/abs/2405.17484) | 59.2 | 70.0 | 2.75M | 2.75M | 225 / 948 | 0.04 |
|
||||
| [EVA](https://arxiv.org/abs/2410.07170) | — | — | — | — | — | — |
|
||||
| [IA3](https://arxiv.org/abs/2205.05638) | — | — | — | — | — | — |
|
||||
| [IA3-FF](https://arxiv.org/abs/2205.05638) | — | — | — | — | — | — |
|
||||
|
||||
@@ -142,9 +142,7 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf
|
||||
extra = {"group_size": args.road_group_size}
|
||||
if args.variant == "antipasto_rot":
|
||||
extra = {"rotate_basis": args.antipasto_rotate_basis}
|
||||
if args.variant == "antipasto":
|
||||
extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only}
|
||||
if args.variant in ("antipasto_corda", "antipasto_asvd"):
|
||||
if args.variant in ("antipasto", "antipasto_corda", "antipasto_asvd"):
|
||||
extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only}
|
||||
if args.variant == "antipasto_ablate":
|
||||
extra = {"coeff": args.antipasto_coeff, "k": args.antipasto_ablate_k,
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
"""AdapterConfig: per-variant typed dataclass.
|
||||
|
||||
Replaces the older `LoraLiteConfig` + `variant_kwargs` dict. Each variant
|
||||
ships its own subclass under `variants/*.py` (e.g. `DeLoRAConfig`), adding
|
||||
strongly-typed fields so users discover the knobs via IDE / dataclass
|
||||
introspection instead of stringly-typed dict lookups.
|
||||
Each variant ships its own subclass under `variants/*.py` (e.g. `DeLoRAConfig`),
|
||||
adding strongly-typed fields so the knobs are discoverable via IDE / dataclass
|
||||
introspection rather than stringly-typed dict lookups.
|
||||
|
||||
Wire-up:
|
||||
- `AdapterConfig` holds the universal fields (variant name, rank, alpha,
|
||||
|
||||
@@ -24,9 +24,6 @@ class ParamSpec:
|
||||
# avoid exact-zero dead zone; N(0, 1e-4) is small enough to be
|
||||
# ~identity but nonzero so gradients always have somewhere to go
|
||||
t.normal_(0, 1e-4)
|
||||
elif self.init == "near_one":
|
||||
# avoid exact-one dead zone; 1 + N(0, 1e-4)
|
||||
t.fill_(1.0).add_(torch.randn_like(t).mul_(1e-4))
|
||||
elif self.init == "ones":
|
||||
t.fill_(1.0)
|
||||
elif self.init == "kaiming":
|
||||
@@ -37,7 +34,7 @@ class ParamSpec:
|
||||
return t
|
||||
|
||||
def make(self, dtype: torch.dtype, device) -> nn.Parameter:
|
||||
# legacy entry: returns a Parameter (used for trainable adapter params)
|
||||
# trainable params -> Parameter; buffers go through make_tensor (see attach)
|
||||
if self.as_buffer:
|
||||
raise RuntimeError("as_buffer spec must be installed via register_buffer; see adapter.attach")
|
||||
return nn.Parameter(self._empty(dtype, device), requires_grad=self.trainable)
|
||||
|
||||
@@ -85,9 +85,9 @@ class AntiPaSTOAblate:
|
||||
layer.lora_Vh.copy_(Vhr.to(layer.lora_Vh.dtype))
|
||||
W_res = (W - (Ur * Sr) @ Vhr).to(layer.weight.dtype)
|
||||
layer.weight.data.copy_(W_res)
|
||||
# Optional but recommended: group_init() should warm-start lora_c from the
|
||||
# S-space contrastive direction dS (see sspace.py extract). Random init also
|
||||
# trains, just slower and with no guarantee it finds the behavior direction.
|
||||
# FIXME: lora_c is random-init. A group_init warm-start from the S-space
|
||||
# contrastive direction dS (cf. sspace.py extract) would converge faster and
|
||||
# land on the behavior direction; not implemented -- random init still trains, just slower.
|
||||
|
||||
@staticmethod
|
||||
def group_init(model: nn.Module, targets, cfg, calibration_data: CalibrationData | None) -> None:
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
"""AntiPaSTO-ASVD: diagonal-covariance sibling of antipasto_corda.
|
||||
|
||||
Same frozen-basis bounded gain, but orients the SVD by the DIAGONAL of the input
|
||||
second moment (per-channel activation scale) instead of the full covariance:
|
||||
|
||||
M = diag(E[x_i^2]) vs CorDA's full C = E[x x^T]
|
||||
|
||||
This is Activation-aware SVD (Yuan+ 2023, arXiv:2312.05821): SVD(W diag(s)) with s a
|
||||
per-channel scale. It is NOT a sub-basis of CorDA -- diag(C)^{1/2} and C^{1/2} are
|
||||
different oblique rotations, so the top-r directions differ and either can win on a task.
|
||||
ASVD is the cheap arm: O(d_in) moment, no d_in x d_in matrix, no eigh. The head-to-head
|
||||
with antipasto_corda isolates whether the off-diagonal of C earns its init cost here.
|
||||
|
||||
Reuses antipasto_corda's buffers (U, S, P, g), plain-SVD init, gain forward, and the
|
||||
shared `_covariance_orient` (only the diag flag differs), so there is one copy of the
|
||||
math to keep in sync.
|
||||
|
||||
Refs: antipasto_corda.py (full-covariance sibling), ASVD arXiv:2312.05821.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..variant import register
|
||||
from ..config import register_config
|
||||
from .antipasto_corda import AntiPaSTOCorDA, AntiPaSTOCorDAConfig, _covariance_orient
|
||||
|
||||
|
||||
@register_config
@dataclass
class AntiPaSTOASVDConfig(AntiPaSTOCorDAConfig):
    """Typed config for the ``antipasto_asvd`` variant.

    Subclasses ``AntiPaSTOCorDAConfig`` without adding fields; the only
    override is the default of ``variant``, which names this variant.
    """

    # Default variant name; same string as ``AntiPaSTOASVD.name``.
    variant: str = "antipasto_asvd"
|
||||
|
||||
|
||||
@register
class AntiPaSTOASVD:
    """Registered ``antipasto_asvd`` variant.

    Everything except ``group_init`` is taken directly from
    ``AntiPaSTOCorDA``; ``group_init`` calls the shared
    ``_covariance_orient`` helper with ``diag=True``.
    """

    name = "antipasto_asvd"

    # Reuse the CorDA sibling's callables as-is. Wrapping in ``staticmethod``
    # keeps them unbound when looked up on this class.
    param_specs = staticmethod(AntiPaSTOCorDA.param_specs)
    init = staticmethod(AntiPaSTOCorDA.init)
    forward = staticmethod(AntiPaSTOCorDA.forward)

    @staticmethod
    def group_init(model, targets, cfg, calibration_data) -> None:
        """ASVD: re-orient by the diagonal of the input second moment (per-channel).

        All four arguments are forwarded positionally and unchanged to
        ``_covariance_orient``; only the ``diag=True`` flag is set here.
        """
        _covariance_orient(model, targets, cfg, calibration_data, diag=True)
|
||||
@@ -1,6 +1,5 @@
|
||||
"""AntiPaSTO-Rot: the original SVD adapter with learnable singular-value deltas +
|
||||
block-diagonal Cayley rotation. Kept as a SEPARATE variant so we can benchmark the
|
||||
rotation version against the rotation-free 1+ELU gain (antipasto.py) head to head.
|
||||
"""AntiPaSTO-Rot: SVD adapter with learnable singular-value deltas + a block-diagonal
|
||||
Cayley rotation of the frozen basis. The rotation arm vs antipasto.py's gain-only core.
|
||||
|
||||
wassname 2026 https://arxiv.org/abs/2601.07473
|
||||
|
||||
@@ -10,12 +9,7 @@ wassname 2026 https://arxiv.org/abs/2601.07473
|
||||
y = x @ W_res.T + ((x @ Vh_eff.T) * (S + delta_s)) @ U_eff.T
|
||||
|
||||
Identity at t=0: rot_T=0 -> R=I, delta_s~4e-4 -> y ~ x @ W^T (tiny positive bias on
|
||||
delta_s breaks sign symmetry).
|
||||
|
||||
Why antipasto.py dropped the rotation: rotating Vh/U leaves the interpretable singular
|
||||
basis, and the Cayley solve was numerically finicky. This file preserves it for the
|
||||
all-else-equal comparison (does the cross-direction mixing the rotation buys beat the
|
||||
cheaper, more stable gain-only adapter on the same targets and budget?).
|
||||
delta_s breaks sign symmetry; rotation alone can't).
|
||||
|
||||
Refs:
|
||||
- paper: https://github.com/wassname/AntiPaSTO
|
||||
|
||||
Reference in New Issue
Block a user