mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:00:33 +08:00
variants: replace arrow's dense block with diagonal-plus-low-rank core
antipasto_arrow -> antipasto_dplr. The arrowhead's dense b x b block is the wrong
shape: b^2 params, mixes only the top-b, and sits on the S-scaled coords so its
perturbation is amplified by the largest singular values (block=128 collapsed to
45.7% at the gain's lr). Replace it with LoRA's lesson -- a low-rank core inside
the frozen basis, ADDED to the gain:
DeltaW = U [diag(S_eff) + coeff * B A] Vh, A:(k,r) B:(r,k), B=0 at init
The low-rank part mixes the whole top-r subspace for 2*r*k params (k=LoRA's rank),
and being additive (not * diag(S)) it is S-independent -- the amplification edge is
gone by construction. Diagonal gain unchanged; identity at init from B=0 and g=0.
Wired through benchmark (antipasto_lora_rank, run_id __k suffix), justfile, cost_report,
smoke (green, dplr attaches/trains/round-trips). Arrow code removed; its run results
stay on disk for comparison.
Co-Authored-By: Claudypoo <noreply@anthropic.com>
This commit is contained in:
@@ -50,7 +50,7 @@ def main() -> None:
|
||||
ap.add_argument("--model", default="Qwen/Qwen3-0.6B-Base")
|
||||
ap.add_argument("--variants", nargs="+",
|
||||
default=["lora", "antipasto", "antipasto_rot", "antipasto_corda",
|
||||
"antipasto_ablate", "antipasto_arrow"])
|
||||
"antipasto_ablate", "antipasto_dplr"])
|
||||
ap.add_argument("--target-name", nargs="+", default=[r"q_proj$", r"v_proj$"])
|
||||
ap.add_argument("--r", type=int, default=32)
|
||||
ap.add_argument("--layers", default="all",
|
||||
|
||||
@@ -37,7 +37,7 @@ CFG_BY_VARIANT = {
|
||||
"antipasto_rot": ll.AntiPaSTORotConfig,
|
||||
"antipasto_ablate": ll.AntiPaSTOAblateConfig,
|
||||
"antipasto_corda": ll.AntiPaSTOCorDAConfig,
|
||||
"antipasto_arrow": ll.AntiPaSTOArrowConfig,
|
||||
"antipasto_dplr": ll.AntiPaSTODPLRConfig,
|
||||
"road": ll.RoadConfig,
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ class BenchmarkConfig:
|
||||
"""MetaMathQA -> GSM8K benchmark config. Tyro turns this into the CLI."""
|
||||
|
||||
model: str = "Qwen/Qwen3-0.6B-Base"
|
||||
variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_arrow", "road"] = "lora"
|
||||
variant: Literal["lora", "pissa", "delora", "ia3", "ia3_ff", "dora", "hra", "eva", "antipasto", "antipasto_rot", "antipasto_ablate", "antipasto_corda", "antipasto_dplr", "road"] = "lora"
|
||||
mode: Literal["benchmark", "probe"] = "benchmark"
|
||||
device: str = "cuda"
|
||||
torch_dtype: str = "bfloat16"
|
||||
@@ -64,8 +64,8 @@ class BenchmarkConfig:
|
||||
antipasto_cov_orient: bool = False
|
||||
# AntiPaSTO-rot (legacy rotation variant) basis to rotate.
|
||||
antipasto_rotate_basis: Literal["V", "U", "none"] = "V"
|
||||
# AntiPaSTO-arrow: dense interaction block size on the top-b singular directions.
|
||||
antipasto_block: int = 8
|
||||
# AntiPaSTO-dplr: rank of the low-rank mixing core in the frozen subspace.
|
||||
antipasto_lora_rank: int = 8
|
||||
target_name: list[str] = field(default_factory=lambda: list(DEFAULT_TARGETS))
|
||||
layers: str = "all"
|
||||
train_dataset: str = "meta-math/MetaMathQA"
|
||||
@@ -146,9 +146,9 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf
|
||||
if args.variant == "antipasto_ablate":
|
||||
extra = {"coeff": args.antipasto_coeff, "k": args.antipasto_ablate_k,
|
||||
"cov_orient": args.antipasto_cov_orient}
|
||||
if args.variant == "antipasto_arrow":
|
||||
if args.variant == "antipasto_dplr":
|
||||
extra = {"coeff": args.antipasto_coeff, "suppress_only": args.antipasto_suppress_only,
|
||||
"block": args.antipasto_block}
|
||||
"lora_rank": args.antipasto_lora_rank}
|
||||
return CFG_BY_VARIANT[args.variant](
|
||||
r=args.r,
|
||||
alpha=args.r if args.variant == "pissa" else args.alpha,
|
||||
@@ -533,9 +533,9 @@ def run(args: BenchmarkConfig) -> dict[str, Any]:
|
||||
dtype = getattr(torch, args.torch_dtype)
|
||||
run_commit = current_git_commit()
|
||||
run_id = f"{args.model.replace('/', '--')}__{args.variant}__s{args.steps}__seed{args.seed}"
|
||||
# arrow's capacity is set by block, not r, so keep block-sweep runs from colliding.
|
||||
if args.variant == "antipasto_arrow" and args.antipasto_block != 8:
|
||||
run_id += f"__b{args.antipasto_block}"
|
||||
# dplr capacity is set by lora_rank, not r, so keep rank-sweep runs from colliding.
|
||||
if args.variant == "antipasto_dplr" and args.antipasto_lora_rank != 8:
|
||||
run_id += f"__k{args.antipasto_lora_rank}"
|
||||
# antipasto family defaults to r=256; low-rank sweeps get their own dirs.
|
||||
if args.variant.startswith("antipasto") and args.r != 256:
|
||||
run_id += f"__r{args.r}"
|
||||
|
||||
Reference in New Issue
Block a user