From 9d027752ad4e9cfdd94773ffa36ea4faafcc028c Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Mon, 15 Jun 2026 20:13:15 +0800 Subject: [PATCH] variants: replace arrow's dense block with diagonal-plus-low-rank core antipasto_arrow -> antipasto_dplr. The arrowhead's dense b x b block is the wrong shape: b^2 params, mixes only the top-b, and sits on the S-scaled coords so its perturbation is amplified by the largest singular values (block=128 collapsed to 45.7% at the gain's lr). Replace it with LoRA's lesson -- a low-rank core inside the frozen basis, ADDED to the gain: DeltaW = U [diag(S_eff) + coeff * B A] Vh, A:(k,r) B:(r,k), B=0 at init The low-rank part mixes the whole top-r subspace for 2*r*k params (k=LoRA's rank), and being additive (not * diag(S)) it is S-independent -- the amplification edge is gone by construction. Diagonal gain unchanged; identity at init from B=0 and g=0. Wired through benchmark (antipasto_lora_rank, run_id __k suffix), justfile, cost_report, smoke (green, dplr attaches/trains/round-trips). Arrow code removed; its run results stay on disk for comparison. Co-Authored-By: Claudypoo --- justfile | 8 +- scripts/cost_report.py | 2 +- scripts/metamath_gsm8k_benchmark.py | 18 +-- src/lora_lite/__init__.py | 4 +- src/lora_lite/variants/__init__.py | 2 +- .../{antipasto_arrow.py => antipasto_dplr.py} | 114 ++++++++++-------- tests/test_metamath_smoke.py | 6 +- 7 files changed, 79 insertions(+), 75 deletions(-) rename src/lora_lite/variants/{antipasto_arrow.py => antipasto_dplr.py} (51%) diff --git a/justfile b/justfile index f279e3c..9a5bb76 100644 --- a/justfile +++ b/justfile @@ -75,7 +75,7 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base": # Run a single MetaMathQA->GSM8K benchmark for a given variant. # Per-variant lr / target-name defaults are baked in here. 
-bench-variant model variant steps="5000" block="8" r_override="": +bench-variant model variant steps="5000" lora_rank="8" r_override="": #!/usr/bin/env bash set -euo pipefail lr=1e-4 @@ -94,10 +94,6 @@ bench-variant model variant steps="5000" block="8" r_override="": # matches the published AntiPaSTO row. alpha=r (no extra scaling). antipasto*) lr=5e-3; r=256; alpha=256 ;; esac - # 5e-3 suits the tiny S-space gain, but arrow's large dense block is LoRA-like - # and destabilizes at that lr (block=128 got 45.7% vs block=8's 60.5%). Drop to - # LoRA's 1e-4 once the block dominates the param count. - if [ "{{variant}}" = "antipasto_arrow" ] && [ "{{block}}" -gt 8 ]; then lr=1e-4; fi # r override (e.g. low-rank corda sweep); alpha tracks r for the antipasto family. if [ -n "{{r_override}}" ]; then r="{{r_override}}"; alpha="{{r_override}}"; fi exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \ @@ -106,7 +102,7 @@ bench-variant model variant steps="5000" block="8" r_override="": --steps {{steps}} \ --lr "$lr" \ --target-name "$target" \ - --antipasto-block {{block}} \ + --antipasto-lora-rank {{lora_rank}} \ --layers all --r "$r" --alpha "$alpha" metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto": diff --git a/scripts/cost_report.py b/scripts/cost_report.py index df36457..3e9f9ea 100644 --- a/scripts/cost_report.py +++ b/scripts/cost_report.py @@ -50,7 +50,7 @@ def main() -> None: ap.add_argument("--model", default="Qwen/Qwen3-0.6B-Base") ap.add_argument("--variants", nargs="+", default=["lora", "antipasto", "antipasto_rot", "antipasto_corda", - "antipasto_ablate", "antipasto_arrow"]) + "antipasto_ablate", "antipasto_dplr"]) ap.add_argument("--target-name", nargs="+", default=[r"q_proj$", r"v_proj$"]) ap.add_argument("--r", type=int, default=32) ap.add_argument("--layers", default="all", diff --git a/scripts/metamath_gsm8k_benchmark.py b/scripts/metamath_gsm8k_benchmark.py 
index 5942350..42af6f9 100644 --- a/scripts/metamath_gsm8k_benchmark.py +++ b/scripts/metamath_gsm8k_benchmark.py @@ -37,7 +37,7 @@ CFG_BY_VARIANT = { "antipasto_rot": ll.AntiPaSTORotConfig, "antipasto_ablate": ll.AntiPaSTOAblateConfig, "antipasto_corda": ll.AntiPaSTOCorDAConfig, - "antipasto_arrow": ll.AntiPaSTOArrowConfig, + "antipasto_dplr": ll.AntiPaSTODPLRConfig, "road": ll.RoadConfig, } @@ -47,7 +47,7 @@ class BenchmarkConfig: """MetaMathQA -> GSM8K benchmark config. Tyro turns this into the CLI.""" model: str = "Qwen/Qwen3-0.6B-Base" - variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_arrow", "road"] = "lora" + variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_dplr", "road"] = "lora" mode: Literal["benchmark", "probe"] = "benchmark" device: str = "cuda" torch_dtype: str = "bfloat16" @@ -64,8 +64,8 @@ class BenchmarkConfig: antipasto_cov_orient: bool = False # AntiPaSTO-rot (legacy rotation variant) basis to rotate. antipasto_rotate_basis: Literal["V", "U", "none"] = "V" - # AntiPaSTO-arrow: dense interaction block size on the top-b singular directions. - antipasto_block: int = 8 + # AntiPaSTO-dplr: rank of the low-rank mixing core in the frozen subspace. 
+ antipasto_lora_rank: int = 8 target_name: list[str] = field(default_factory=lambda: list(DEFAULT_TARGETS)) layers: str = "all" train_dataset: str = "meta-math/MetaMathQA" @@ -146,9 +146,9 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf if args.variant == "antipasto_ablate": extra = {"coeff": args.antipasto_coeff, "k": args.antipasto_ablate_k, "cov_orient": args.antipasto_cov_orient} - if args.variant == "antipasto_arrow": + if args.variant == "antipasto_dplr": extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only, - "block": args.antipasto_block} + "lora_rank": args.antipasto_lora_rank} return CFG_BY_VARIANT[args.variant]( r=args.r, alpha=args.r if args.variant == "pissa" else args.alpha, @@ -533,9 +533,9 @@ def run(args: BenchmarkConfig) -> dict[str, Any]: dtype = getattr(torch, args.torch_dtype) run_commit = current_git_commit() run_id = f"{args.model.replace('/', '--')}__{args.variant}__s{args.steps}__seed{args.seed}" - # arrow's capacity is set by block, not r, so keep block-sweep runs from colliding. - if args.variant == "antipasto_arrow" and args.antipasto_block != 8: - run_id += f"__b{args.antipasto_block}" + # dplr capacity is set by lora_rank, not r, so keep rank-sweep runs from colliding. + if args.variant == "antipasto_dplr" and args.antipasto_lora_rank != 8: + run_id += f"__k{args.antipasto_lora_rank}" # antipasto family defaults to r=256; low-rank sweeps get their own dirs. 
if args.variant.startswith("antipasto") and args.r != 256: run_id += f"__r{args.r}" diff --git a/src/lora_lite/__init__.py b/src/lora_lite/__init__.py index a67401e..84c54c9 100644 --- a/src/lora_lite/__init__.py +++ b/src/lora_lite/__init__.py @@ -23,7 +23,7 @@ from .variants.antipasto import AntiPaSTOConfig from .variants.antipasto_rot import AntiPaSTORotConfig from .variants.antipasto_ablate import AntiPaSTOAblateConfig from .variants.antipasto_corda import AntiPaSTOCorDAConfig -from .variants.antipasto_arrow import AntiPaSTOArrowConfig +from .variants.antipasto_dplr import AntiPaSTODPLRConfig from .variants.road import RoadConfig __all__ = [ @@ -40,7 +40,7 @@ __all__ = [ "AntiPaSTORotConfig", "AntiPaSTOAblateConfig", "AntiPaSTOCorDAConfig", - "AntiPaSTOArrowConfig", + "AntiPaSTODPLRConfig", "RoadConfig", "attach", "detach", diff --git a/src/lora_lite/variants/__init__.py b/src/lora_lite/variants/__init__.py index 07ef5c3..930ed25 100644 --- a/src/lora_lite/variants/__init__.py +++ b/src/lora_lite/variants/__init__.py @@ -1,4 +1,4 @@ from . import ( # noqa: F401 side-effect: register lora, pissa, delora, ia3, dora, hra, eva, antipasto, road, - antipasto_rot, antipasto_ablate, antipasto_corda, antipasto_arrow, + antipasto_rot, antipasto_ablate, antipasto_corda, antipasto_dplr, ) diff --git a/src/lora_lite/variants/antipasto_arrow.py b/src/lora_lite/variants/antipasto_dplr.py similarity index 51% rename from src/lora_lite/variants/antipasto_arrow.py rename to src/lora_lite/variants/antipasto_dplr.py index deb5da6..f143c4f 100644 --- a/src/lora_lite/variants/antipasto_arrow.py +++ b/src/lora_lite/variants/antipasto_dplr.py @@ -1,22 +1,33 @@ -"""AntiPaSTO-Arrow: cross-direction mixing via a cheap arrowhead core. +"""AntiPaSTO-DPLR: diagonal-plus-low-rank core in the frozen SVD basis. -antipasto's core is diagonal (S_eff = S * gain): it reweights each singular direction -independently but cannot let direction i drive direction j. 
A full dense r x r core -restores all mixing but costs r^2 params. The arrowhead is the cheap middle: a dense -block on the top-b directions (where the action lives), the diagonal gain on the rest. +antipasto's core is diagonal (a per-direction gain); it rescales each singular +direction but cannot mix one into another. The arrowhead tried a dense b x b block +on the top-b directions, but a dense block is the wrong shape (b^2 params, mixes only +the top-b) and -- sitting on the S-scaled coords -- its perturbation is amplified by +the largest singular values, so it destabilizes. The fix is LoRA's lesson: a low-rank +core. Put a trainable rank-k core inside the frozen U/Vh basis, ADDED to the gain: - core C (r x r, on the S-scaled coords): - [ B (b x b dense) | 0 ] B = I_b + coeff*M (top-b mixing) - [ 0 | diag(1 + ELU(coeff*g)) ] tail = antipasto's gain - DeltaW = U @ C @ diag(S) @ Vh - cost: b^2 + (r-b) params, one b x b matmul per forward. + W = U diag(S) Vh + W_res # frozen top-r SVD + learn: g (r,) # diagonal gain + A (k,r), B (r,k) # low-rank mixing core, B=0 at init + S_eff = S * (1 + ELU(coeff * g)) + y = x @ W_res.T + ( (Vh x) * S_eff + coeff * B (A (Vh x)) ) @ U.T -Identity at init: M=0 -> B=I, g=0 -> 1+ELU(0)=1, so C=I and DeltaW = U diag(S) Vh. -coeff=0 -> C=I too (runtime off). The block is the linear (1+z) regime -- stable but -not strictly bounded; for a can't-blow-up guarantee on the top directions use -antipasto_ablate. +so the trainable core is C = diag(S_eff) + coeff * B A acting in S-space, and +DeltaW = U C Vh. The diagonal part scales directions; the low-rank part B A mixes them +across the whole top-r subspace for 2*r*k params (k=LoRA's rank), not b^2. -Refs: antipasto.py (diagonal sibling), antipasto_corda.py (off-axis basis argument). +Why the low-rank part is ADDED, not multiplied into diag(S): an additive core +U (BA) Vh is independent of S, so a unit step in BA moves W by O(1), not O(S). 
That is +exactly the S-amplification edge that made the dense arrowhead block blow up at the +gain's learning rate -- gone by construction. + +Identity at init: B=0 -> BA=0, g=0 -> 1+ELU(0)=1, so C=diag(S) and DeltaW = U diag(S) Vh. +coeff=0 -> identity too (runtime off). The basis (U, Vh) stays frozen and interpretable; +only the gain and the rank-k core move. + +Refs: antipasto.py (diagonal sibling), lora.py (the low-rank core), antipasto_corda.py +(oriented basis -- composes with this core). """ from dataclasses import dataclass from typing import Iterable, Literal @@ -36,42 +47,42 @@ CalibrationData = Iterable[CalibrationBatch] @register_config @dataclass -class AntiPaSTOArrowConfig(AdapterConfig): - variant: str = "antipasto_arrow" +class AntiPaSTODPLRConfig(AdapterConfig): + variant: str = "antipasto_dplr" r: int = 256 - # Dense interaction block on the top-b singular directions; sets capacity and the - # only quadratic cost (b^2 params/module). b=1 degenerates to antipasto; b->r - # approaches a full dense r-core (~LoRA params) at the cost arrow exists to avoid. - block: int = 8 - suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected. - # Tail guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies. - coeff: float = 1.0 # runtime knob: 0=identity, scales both block and tail - act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto + # Rank of the low-rank mixing core (LoRA's r, but inside the frozen subspace). + # Params = r (gain) + 2*r*lora_rank. Must be >= 1; for gain-only use plain antipasto. + lora_rank: int = 8 + suppress_only: bool = False # clamp the gain g<=0 (attenuate only); core unaffected. + coeff: float = 1.0 # runtime knob: 0=identity, scales gain and core. + act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto. 
@register -class AntiPaSTOArrow: - name = "antipasto_arrow" +class AntiPaSTODPLR: + name = "antipasto_dplr" @staticmethod def param_specs(d_in, d_out, cfg): - r, b = cfg.r, cfg.block - if not 1 <= b < r: - raise ValueError(f"antipasto_arrow needs 1 <= block({b}) < r({r}).") + r, k = cfg.r, cfg.lora_rank + if not 0 < k <= r: + raise ValueError(f"antipasto_dplr needs 0 < lora_rank({k}) <= r({r}).") return dict( lora_U=ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True), lora_S=ParamSpec((r,), init="zeros", trainable=False, as_buffer=True), lora_Vh=ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True), - # Dense b x b interaction on the top-b directions. init 0 -> B=I -> identity. - lora_M=ParamSpec((b, b), init="zeros"), - # Diagonal bounded gain on the remaining r-b directions (== antipasto's g). - lora_g=ParamSpec((r - b,), init="zeros"), + # Diagonal gain (== antipasto). init 0 -> 1+ELU(0)=1 -> identity. + lora_g=ParamSpec((r,), init="zeros"), + # Low-rank core B@A in the frozen subspace. A down (r->k), B up (k->r). + # B=0 at init -> core=0 -> identity (LoRA convention). + lora_A=ParamSpec((k, r), init="kaiming"), + lora_B=ParamSpec((r, k), init="zeros"), ) @staticmethod def init(layer: nn.Module, cfg) -> None: if type(layer) is not nn.Linear: - raise TypeError("AntiPaSTOArrow mutates layer.weight into W_res; nn.Linear only.") + raise TypeError("AntiPaSTODPLR mutates layer.weight into W_res; nn.Linear only.") with torch.no_grad(): W = layer.weight.data.float() U, S, Vh = torch.linalg.svd(W, full_matrices=False) @@ -85,9 +96,9 @@ class AntiPaSTOArrow: @staticmethod def group_init(model: nn.Module, targets, cfg, calibration_data: CalibrationData | None) -> None: - """Wanda-style data-driven dimension selection, identical to antipasto: re-pick - the top-r directions by S[i] * pool|X @ Vh[i]|. 
Runs before training (g, M at - their zero init), so re-selecting the basis is a harmless no-op on the core.""" + """Wanda-style re-selection of the top-r directions, identical to antipasto. + Runs before training while g and B are still zero, so the core contributes + nothing and re-selecting the basis is a no-op on the adapter output.""" if calibration_data is None: return @@ -122,15 +133,13 @@ class AntiPaSTOArrow: for name, layer in layers.items(): X = torch.cat(captured[name], dim=0) if X.shape[0] < r: - raise RuntimeError(f"AntiPaSTOArrow at {name}: {X.shape[0]} tokens, need >= r={r}") + raise RuntimeError(f"AntiPaSTODPLR at {name}: {X.shape[0]} tokens, need >= r={r}") + # Rebuild the FULL W exactly (W_res + stored top-r), then re-select top-r. W_res = layer.weight.data.float() W_orig = W_res + (layer.lora_U.float() * layer.lora_S.float()) @ layer.lora_Vh.float() U_full, S_full, Vh_full = torch.linalg.svd(W_orig, full_matrices=False) proj = X.to(Vh_full) @ Vh_full.T act_mag = proj.pow(2).mean(0).sqrt() if pool == "rms" else proj.abs().mean(0) - # Pick top-r by score, then sort by SVD index. svd() returns S descending, - # so the block's first-b coords are the b largest-S among the selected r - # (= where the action lives), not the b highest-score. 
idx = (S_full * act_mag).argsort(descending=True)[:r].sort().values Ur, Sr, Vhr = U_full[:, idx], S_full[idx], Vh_full[idx] W_res_new = (W_orig - (Ur * Sr) @ Vhr).to(layer.weight.dtype) @@ -150,19 +159,18 @@ class AntiPaSTOArrow: U = layer.lora_U.to(x.dtype) # (d_out, r) S = layer.lora_S.to(x.dtype) # (r,) Vh = layer.lora_Vh.to(x.dtype) # (r, d_in) - M = layer.lora_M.to(x.dtype) # (b, b) - g = layer.lora_g.to(x.dtype) # (r-b,) - coeff, b = float(cfg.coeff), cfg.block + g = layer.lora_g.to(x.dtype) # (r,) + A = layer.lora_A.to(x.dtype) # (k, r) + B = layer.lora_B.to(x.dtype) # (r, k) + coeff = float(cfg.coeff) - cS = (x @ Vh.T) * S # (..., r) = diag(S) Vh x - - # Top-b: dense block B = I_b + coeff*M couples the top singular directions. - eye = torch.eye(b, dtype=x.dtype, device=x.device) - top = cS[..., :b] @ (eye + coeff * M).T # (..., b) - # Tail: antipasto's bounded diagonal gain (see antipasto.py for the 1+ELU why). if cfg.suppress_only: g = torch.clamp(g, max=0.0) - tail = cS[..., b:] * (1.0 + F.elu(coeff * g)) # (..., r-b) - h = torch.cat([top, tail], dim=-1) # (..., r) + p = x @ Vh.T # (..., r) = Vh x (unscaled) + S_eff = S * (1.0 + F.elu(coeff * g)) # diagonal gain (see antipasto.py) + # Diagonal part scales each direction; low-rank part B@A mixes across the + # subspace. Additive (not * diag(S)), so the core is S-independent: a unit + # step in B@A moves W by O(1), not O(S) -- no S-amplification edge. 
+ h = p * S_eff + coeff * (p @ A.T) @ B.T # (..., r) return y + h @ U.T diff --git a/tests/test_metamath_smoke.py b/tests/test_metamath_smoke.py index b37a5f3..adf9982 100644 --- a/tests/test_metamath_smoke.py +++ b/tests/test_metamath_smoke.py @@ -32,12 +32,12 @@ SPEC.loader.exec_module(benchmark) VARIANTS = ["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", - "antipasto_arrow", "road"] + "antipasto_dplr", "road"] # Variants that fail loud when attached on a bnb-loaded base (read dense weight in init). # delora/eva also read weight but currently silently dequant -- they produce sane attach, # so we don't expect a raise from them in the attach-only smoke. BNB_RAISERS = {"pissa", "dora", "antipasto", "antipasto_rot", "antipasto_ablate", - "antipasto_corda", "antipasto_arrow"} + "antipasto_corda", "antipasto_dplr"} TINY_MODEL = "hf-internal-testing/tiny-random-LlamaForCausalLM" HAS_CUDA = torch.cuda.is_available() @@ -59,7 +59,7 @@ def quick_cfg(variant: str, tmp_path: Path, quantization: str = "none") -> "benc quantization=quantization, r=4, alpha=8, - antipasto_block=2, # antipasto_arrow needs block < r (r=4 here) + antipasto_lora_rank=2, # antipasto_dplr needs 0 < lora_rank <= r (r=4 here) target_name=target_name, layers="all", steps=2,