mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:45:56 +08:00
variants: replace arrow's dense block with diagonal-plus-low-rank core
antipasto_arrow -> antipasto_dplr. The arrowhead's dense b x b block is the wrong
shape: b^2 params, mixes only the top-b, and sits on the S-scaled coords so its
perturbation is amplified by the largest singular values (block=128 collapsed to
45.7% at the gain's lr). Replace it with LoRA's lesson -- a low-rank core inside
the frozen basis, ADDED to the gain:
DeltaW = U [diag(S_eff) + coeff * B A] Vh, A:(k,r) B:(r,k), B=0 at init
The low-rank part mixes the whole top-r subspace for 2*r*k params (k=LoRA's rank),
and being additive (not * diag(S)) it is S-independent -- the amplification edge is
gone by construction. Diagonal gain unchanged; identity at init from B=0 and g=0.
Wired through benchmark (antipasto_lora_rank, run_id __k suffix), justfile, cost_report,
smoke (green, dplr attaches/trains/round-trips). Arrow code removed; its run results
stay on disk for comparison.
Co-Authored-By: Claudypoo <noreply@anthropic.com>
This commit is contained in:
@@ -75,7 +75,7 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base":
|
||||
|
||||
# Run a single MetaMathQA->GSM8K benchmark for a given variant.
|
||||
# Per-variant lr / target-name defaults are baked in here.
|
||||
bench-variant model variant steps="5000" block="8" r_override="":
|
||||
bench-variant model variant steps="5000" lora_rank="8" r_override="":
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
lr=1e-4
|
||||
@@ -94,10 +94,6 @@ bench-variant model variant steps="5000" block="8" r_override="":
|
||||
# matches the published AntiPaSTO row. alpha=r (no extra scaling).
|
||||
antipasto*) lr=5e-3; r=256; alpha=256 ;;
|
||||
esac
|
||||
# 5e-3 suits the tiny S-space gain, but arrow's large dense block is LoRA-like
|
||||
# and destabilizes at that lr (block=128 got 45.7% vs block=8's 60.5%). Drop to
|
||||
# LoRA's 1e-4 once the block dominates the param count.
|
||||
if [ "{{variant}}" = "antipasto_arrow" ] && [ "{{block}}" -gt 8 ]; then lr=1e-4; fi
|
||||
# r override (e.g. low-rank corda sweep); alpha tracks r for the antipasto family.
|
||||
if [ -n "{{r_override}}" ]; then r="{{r_override}}"; alpha="{{r_override}}"; fi
|
||||
exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \
|
||||
@@ -106,7 +102,7 @@ bench-variant model variant steps="5000" block="8" r_override="":
|
||||
--steps {{steps}} \
|
||||
--lr "$lr" \
|
||||
--target-name "$target" \
|
||||
--antipasto-block {{block}} \
|
||||
--antipasto-lora-rank {{lora_rank}} \
|
||||
--layers all --r "$r" --alpha "$alpha"
|
||||
|
||||
metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto":
|
||||
|
||||
@@ -50,7 +50,7 @@ def main() -> None:
|
||||
ap.add_argument("--model", default="Qwen/Qwen3-0.6B-Base")
|
||||
ap.add_argument("--variants", nargs="+",
|
||||
default=["lora", "antipasto", "antipasto_rot", "antipasto_corda",
|
||||
"antipasto_ablate", "antipasto_arrow"])
|
||||
"antipasto_ablate", "antipasto_dplr"])
|
||||
ap.add_argument("--target-name", nargs="+", default=[r"q_proj$", r"v_proj$"])
|
||||
ap.add_argument("--r", type=int, default=32)
|
||||
ap.add_argument("--layers", default="all",
|
||||
|
||||
@@ -37,7 +37,7 @@ CFG_BY_VARIANT = {
|
||||
"antipasto_rot": ll.AntiPaSTORotConfig,
|
||||
"antipasto_ablate": ll.AntiPaSTOAblateConfig,
|
||||
"antipasto_corda": ll.AntiPaSTOCorDAConfig,
|
||||
"antipasto_arrow": ll.AntiPaSTOArrowConfig,
|
||||
"antipasto_dplr": ll.AntiPaSTODPLRConfig,
|
||||
"road": ll.RoadConfig,
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ class BenchmarkConfig:
|
||||
"""MetaMathQA -> GSM8K benchmark config. Tyro turns this into the CLI."""
|
||||
|
||||
model: str = "Qwen/Qwen3-0.6B-Base"
|
||||
variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_arrow", "road"] = "lora"
|
||||
variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_dplr", "road"] = "lora"
|
||||
mode: Literal["benchmark", "probe"] = "benchmark"
|
||||
device: str = "cuda"
|
||||
torch_dtype: str = "bfloat16"
|
||||
@@ -64,8 +64,8 @@ class BenchmarkConfig:
|
||||
antipasto_cov_orient: bool = False
|
||||
# AntiPaSTO-rot (legacy rotation variant) basis to rotate.
|
||||
antipasto_rotate_basis: Literal["V", "U", "none"] = "V"
|
||||
# AntiPaSTO-arrow: dense interaction block size on the top-b singular directions.
|
||||
antipasto_block: int = 8
|
||||
# AntiPaSTO-dplr: rank of the low-rank mixing core in the frozen subspace.
|
||||
antipasto_lora_rank: int = 8
|
||||
target_name: list[str] = field(default_factory=lambda: list(DEFAULT_TARGETS))
|
||||
layers: str = "all"
|
||||
train_dataset: str = "meta-math/MetaMathQA"
|
||||
@@ -146,9 +146,9 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf
|
||||
if args.variant == "antipasto_ablate":
|
||||
extra = {"coeff": args.antipasto_coeff, "k": args.antipasto_ablate_k,
|
||||
"cov_orient": args.antipasto_cov_orient}
|
||||
if args.variant == "antipasto_arrow":
|
||||
if args.variant == "antipasto_dplr":
|
||||
extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only,
|
||||
"block": args.antipasto_block}
|
||||
"lora_rank": args.antipasto_lora_rank}
|
||||
return CFG_BY_VARIANT[args.variant](
|
||||
r=args.r,
|
||||
alpha=args.r if args.variant == "pissa" else args.alpha,
|
||||
@@ -533,9 +533,9 @@ def run(args: BenchmarkConfig) -> dict[str, Any]:
|
||||
dtype = getattr(torch, args.torch_dtype)
|
||||
run_commit = current_git_commit()
|
||||
run_id = f"{args.model.replace('/', '--')}__{args.variant}__s{args.steps}__seed{args.seed}"
|
||||
# arrow's capacity is set by block, not r, so keep block-sweep runs from colliding.
|
||||
if args.variant == "antipasto_arrow" and args.antipasto_block != 8:
|
||||
run_id += f"__b{args.antipasto_block}"
|
||||
# dplr capacity is set by lora_rank, not r, so keep rank-sweep runs from colliding.
|
||||
if args.variant == "antipasto_dplr" and args.antipasto_lora_rank != 8:
|
||||
run_id += f"__k{args.antipasto_lora_rank}"
|
||||
# antipasto family defaults to r=256; low-rank sweeps get their own dirs.
|
||||
if args.variant.startswith("antipasto") and args.r != 256:
|
||||
run_id += f"__r{args.r}"
|
||||
|
||||
@@ -23,7 +23,7 @@ from .variants.antipasto import AntiPaSTOConfig
|
||||
from .variants.antipasto_rot import AntiPaSTORotConfig
|
||||
from .variants.antipasto_ablate import AntiPaSTOAblateConfig
|
||||
from .variants.antipasto_corda import AntiPaSTOCorDAConfig
|
||||
from .variants.antipasto_arrow import AntiPaSTOArrowConfig
|
||||
from .variants.antipasto_dplr import AntiPaSTODPLRConfig
|
||||
from .variants.road import RoadConfig
|
||||
|
||||
__all__ = [
|
||||
@@ -40,7 +40,7 @@ __all__ = [
|
||||
"AntiPaSTORotConfig",
|
||||
"AntiPaSTOAblateConfig",
|
||||
"AntiPaSTOCorDAConfig",
|
||||
"AntiPaSTOArrowConfig",
|
||||
"AntiPaSTODPLRConfig",
|
||||
"RoadConfig",
|
||||
"attach",
|
||||
"detach",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from . import ( # noqa: F401 side-effect: register
|
||||
lora, pissa, delora, ia3, dora, hra, eva, antipasto, road,
|
||||
antipasto_rot, antipasto_ablate, antipasto_corda, antipasto_arrow,
|
||||
antipasto_rot, antipasto_ablate, antipasto_corda, antipasto_dplr,
|
||||
)
|
||||
|
||||
+61
-53
@@ -1,22 +1,33 @@
|
||||
"""AntiPaSTO-Arrow: cross-direction mixing via a cheap arrowhead core.
|
||||
"""AntiPaSTO-DPLR: diagonal-plus-low-rank core in the frozen SVD basis.
|
||||
|
||||
antipasto's core is diagonal (S_eff = S * gain): it reweights each singular direction
|
||||
independently but cannot let direction i drive direction j. A full dense r x r core
|
||||
restores all mixing but costs r^2 params. The arrowhead is the cheap middle: a dense
|
||||
block on the top-b directions (where the action lives), the diagonal gain on the rest.
|
||||
antipasto's core is diagonal (a per-direction gain); it rescales each singular
|
||||
direction but cannot mix one into another. The arrowhead tried a dense b x b block
|
||||
on the top-b directions, but a dense block is the wrong shape (b^2 params, mixes only
|
||||
the top-b) and -- sitting on the S-scaled coords -- its perturbation is amplified by
|
||||
the largest singular values, so it destabilizes. The fix is LoRA's lesson: a low-rank
|
||||
core. Put a trainable rank-k core inside the frozen U/Vh basis, ADDED to the gain:
|
||||
|
||||
core C (r x r, on the S-scaled coords):
|
||||
[ B (b x b dense) | 0 ] B = I_b + coeff*M (top-b mixing)
|
||||
[ 0 | diag(1 + ELU(coeff*g)) ] tail = antipasto's gain
|
||||
DeltaW = U @ C @ diag(S) @ Vh
|
||||
cost: b^2 + (r-b) params, one b x b matmul per forward.
|
||||
W = U diag(S) Vh + W_res # frozen top-r SVD
|
||||
learn: g (r,) # diagonal gain
|
||||
A (k,r), B (r,k) # low-rank mixing core, B=0 at init
|
||||
S_eff = S * (1 + ELU(coeff * g))
|
||||
y = x @ W_res.T + ( (Vh x) * S_eff + coeff * B (A (Vh x)) ) @ U.T
|
||||
|
||||
Identity at init: M=0 -> B=I, g=0 -> 1+ELU(0)=1, so C=I and DeltaW = U diag(S) Vh.
|
||||
coeff=0 -> C=I too (runtime off). The block is the linear (1+z) regime -- stable but
|
||||
not strictly bounded; for a can't-blow-up guarantee on the top directions use
|
||||
antipasto_ablate.
|
||||
so the trainable core is C = diag(S_eff) + coeff * B A acting in S-space, and
|
||||
DeltaW = U C Vh. The diagonal part scales directions; the low-rank part B A mixes them
|
||||
across the whole top-r subspace for 2*r*k params (k=LoRA's rank), not b^2.
|
||||
|
||||
Refs: antipasto.py (diagonal sibling), antipasto_corda.py (off-axis basis argument).
|
||||
Why the low-rank part is ADDED, not multiplied into diag(S): an additive core
|
||||
U (BA) Vh is independent of S, so a unit step in BA moves W by O(1), not O(S). That is
|
||||
exactly the S-amplification edge that made the dense arrowhead block blow up at the
|
||||
gain's learning rate -- gone by construction.
|
||||
|
||||
Identity at init: B=0 -> BA=0, g=0 -> 1+ELU(0)=1, so C=diag(S) and DeltaW = U diag(S) Vh.
|
||||
coeff=0 -> identity too (runtime off). The basis (U, Vh) stays frozen and interpretable;
|
||||
only the gain and the rank-k core move.
|
||||
|
||||
Refs: antipasto.py (diagonal sibling), lora.py (the low-rank core), antipasto_corda.py
|
||||
(oriented basis -- composes with this core).
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, Literal
|
||||
@@ -36,42 +47,42 @@ CalibrationData = Iterable[CalibrationBatch]
|
||||
|
||||
@register_config
|
||||
@dataclass
|
||||
class AntiPaSTOArrowConfig(AdapterConfig):
|
||||
variant: str = "antipasto_arrow"
|
||||
class AntiPaSTODPLRConfig(AdapterConfig):
|
||||
variant: str = "antipasto_dplr"
|
||||
r: int = 256
|
||||
# Dense interaction block on the top-b singular directions; sets capacity and the
|
||||
# only quadratic cost (b^2 params/module). b=1 degenerates to antipasto; b->r
|
||||
# approaches a full dense r-core (~LoRA params) at the cost arrow exists to avoid.
|
||||
block: int = 8
|
||||
suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected.
|
||||
# Tail guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies.
|
||||
coeff: float = 1.0 # runtime knob: 0=identity, scales both block and tail
|
||||
act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto
|
||||
# Rank of the low-rank mixing core (LoRA's r, but inside the frozen subspace).
|
||||
# Params = r (gain) + 2*r*lora_rank. Must satisfy 0 < lora_rank <= r: k=0 would be plain antipasto, but param_specs rejects it.
|
||||
lora_rank: int = 8
|
||||
suppress_only: bool = False # clamp the gain g<=0 (attenuate only); core unaffected.
|
||||
coeff: float = 1.0 # runtime knob: 0=identity, scales gain and core.
|
||||
act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto.
|
||||
|
||||
|
||||
@register
|
||||
class AntiPaSTOArrow:
|
||||
name = "antipasto_arrow"
|
||||
class AntiPaSTODPLR:
|
||||
name = "antipasto_dplr"
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
r, b = cfg.r, cfg.block
|
||||
if not 1 <= b < r:
|
||||
raise ValueError(f"antipasto_arrow needs 1 <= block({b}) < r({r}).")
|
||||
r, k = cfg.r, cfg.lora_rank
|
||||
if not 0 < k <= r:
|
||||
raise ValueError(f"antipasto_dplr needs 0 < lora_rank({k}) <= r({r}).")
|
||||
return dict(
|
||||
lora_U=ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
|
||||
lora_S=ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
|
||||
lora_Vh=ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
|
||||
# Dense b x b interaction on the top-b directions. init 0 -> B=I -> identity.
|
||||
lora_M=ParamSpec((b, b), init="zeros"),
|
||||
# Diagonal bounded gain on the remaining r-b directions (== antipasto's g).
|
||||
lora_g=ParamSpec((r - b,), init="zeros"),
|
||||
# Diagonal gain (== antipasto). init 0 -> 1+ELU(0)=1 -> identity.
|
||||
lora_g=ParamSpec((r,), init="zeros"),
|
||||
# Low-rank core B@A in the frozen subspace. A down (r->k), B up (k->r).
|
||||
# B=0 at init -> core=0 -> identity (LoRA convention).
|
||||
lora_A=ParamSpec((k, r), init="kaiming"),
|
||||
lora_B=ParamSpec((r, k), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
if type(layer) is not nn.Linear:
|
||||
raise TypeError("AntiPaSTOArrow mutates layer.weight into W_res; nn.Linear only.")
|
||||
raise TypeError("AntiPaSTODPLR mutates layer.weight into W_res; nn.Linear only.")
|
||||
with torch.no_grad():
|
||||
W = layer.weight.data.float()
|
||||
U, S, Vh = torch.linalg.svd(W, full_matrices=False)
|
||||
@@ -85,9 +96,9 @@ class AntiPaSTOArrow:
|
||||
|
||||
@staticmethod
|
||||
def group_init(model: nn.Module, targets, cfg, calibration_data: CalibrationData | None) -> None:
|
||||
"""Wanda-style data-driven dimension selection, identical to antipasto: re-pick
|
||||
the top-r directions by S[i] * pool|X @ Vh[i]|. Runs before training (g, M at
|
||||
their zero init), so re-selecting the basis is a harmless no-op on the core."""
|
||||
"""Wanda-style re-selection of the top-r directions, identical to antipasto.
|
||||
Runs before training while g and B are still zero, so the core contributes
|
||||
nothing and re-selecting the basis is a no-op on the adapter output."""
|
||||
if calibration_data is None:
|
||||
return
|
||||
|
||||
@@ -122,15 +133,13 @@ class AntiPaSTOArrow:
|
||||
for name, layer in layers.items():
|
||||
X = torch.cat(captured[name], dim=0)
|
||||
if X.shape[0] < r:
|
||||
raise RuntimeError(f"AntiPaSTOArrow at {name}: {X.shape[0]} tokens, need >= r={r}")
|
||||
raise RuntimeError(f"AntiPaSTODPLR at {name}: {X.shape[0]} tokens, need >= r={r}")
|
||||
# Rebuild the FULL W exactly (W_res + stored top-r), then re-select top-r.
|
||||
W_res = layer.weight.data.float()
|
||||
W_orig = W_res + (layer.lora_U.float() * layer.lora_S.float()) @ layer.lora_Vh.float()
|
||||
U_full, S_full, Vh_full = torch.linalg.svd(W_orig, full_matrices=False)
|
||||
proj = X.to(Vh_full) @ Vh_full.T
|
||||
act_mag = proj.pow(2).mean(0).sqrt() if pool == "rms" else proj.abs().mean(0)
|
||||
# Pick top-r by score, then sort by SVD index. svd() returns S descending,
|
||||
# so the block's first-b coords are the b largest-S among the selected r
|
||||
# (= where the action lives), not the b highest-score.
|
||||
idx = (S_full * act_mag).argsort(descending=True)[:r].sort().values
|
||||
Ur, Sr, Vhr = U_full[:, idx], S_full[idx], Vh_full[idx]
|
||||
W_res_new = (W_orig - (Ur * Sr) @ Vhr).to(layer.weight.dtype)
|
||||
@@ -150,19 +159,18 @@ class AntiPaSTOArrow:
|
||||
U = layer.lora_U.to(x.dtype) # (d_out, r)
|
||||
S = layer.lora_S.to(x.dtype) # (r,)
|
||||
Vh = layer.lora_Vh.to(x.dtype) # (r, d_in)
|
||||
M = layer.lora_M.to(x.dtype) # (b, b)
|
||||
g = layer.lora_g.to(x.dtype) # (r-b,)
|
||||
coeff, b = float(cfg.coeff), cfg.block
|
||||
g = layer.lora_g.to(x.dtype) # (r,)
|
||||
A = layer.lora_A.to(x.dtype) # (k, r)
|
||||
B = layer.lora_B.to(x.dtype) # (r, k)
|
||||
coeff = float(cfg.coeff)
|
||||
|
||||
cS = (x @ Vh.T) * S # (..., r) = diag(S) Vh x
|
||||
|
||||
# Top-b: dense block B = I_b + coeff*M couples the top singular directions.
|
||||
eye = torch.eye(b, dtype=x.dtype, device=x.device)
|
||||
top = cS[..., :b] @ (eye + coeff * M).T # (..., b)
|
||||
# Tail: antipasto's bounded diagonal gain (see antipasto.py for the 1+ELU why).
|
||||
if cfg.suppress_only:
|
||||
g = torch.clamp(g, max=0.0)
|
||||
tail = cS[..., b:] * (1.0 + F.elu(coeff * g)) # (..., r-b)
|
||||
|
||||
h = torch.cat([top, tail], dim=-1) # (..., r)
|
||||
p = x @ Vh.T # (..., r) = Vh x (unscaled)
|
||||
S_eff = S * (1.0 + F.elu(coeff * g)) # diagonal gain (see antipasto.py)
|
||||
# Diagonal part scales each direction; low-rank part B@A mixes across the
|
||||
# subspace. Additive (not * diag(S)), so the core is S-independent: a unit
|
||||
# step in B@A moves W by O(1), not O(S) -- no S-amplification edge.
|
||||
h = p * S_eff + coeff * (p @ A.T) @ B.T # (..., r)
|
||||
return y + h @ U.T
|
||||
@@ -32,12 +32,12 @@ SPEC.loader.exec_module(benchmark)
|
||||
|
||||
VARIANTS = ["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva",
|
||||
"antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda",
|
||||
"antipasto_arrow", "road"]
|
||||
"antipasto_dplr", "road"]
|
||||
# Variants that fail loud when attached on a bnb-loaded base (read dense weight in init).
|
||||
# delora/eva also read weight but currently silently dequant -- they produce sane attach,
|
||||
# so we don't expect a raise from them in the attach-only smoke.
|
||||
BNB_RAISERS = {"pissa", "dora", "antipasto", "antipasto_rot", "antipasto_ablate",
|
||||
"antipasto_corda", "antipasto_arrow"}
|
||||
"antipasto_corda", "antipasto_dplr"}
|
||||
TINY_MODEL = "hf-internal-testing/tiny-random-LlamaForCausalLM"
|
||||
|
||||
HAS_CUDA = torch.cuda.is_available()
|
||||
@@ -59,7 +59,7 @@ def quick_cfg(variant: str, tmp_path: Path, quantization: str = "none") -> "benc
|
||||
quantization=quantization,
|
||||
r=4,
|
||||
alpha=8,
|
||||
antipasto_block=2, # antipasto_arrow needs block < r (r=4 here)
|
||||
antipasto_lora_rank=2, # antipasto_dplr needs 0 < lora_rank <= r (r=4 here)
|
||||
target_name=target_name,
|
||||
layers="all",
|
||||
steps=2,
|
||||
|
||||
Reference in New Issue
Block a user