mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 18:05:16 +08:00
variants: replace arrow's dense block with diagonal-plus-low-rank core
antipasto_arrow -> antipasto_dplr. The arrowhead's dense b x b block is the wrong
shape: b^2 params, mixes only the top-b, and sits on the S-scaled coords so its
perturbation is amplified by the largest singular values (block=128 collapsed to
45.7%, vs 60.5% for block=8, at the gain's lr of 5e-3). Replace it with LoRA's lesson -- a low-rank core inside
the frozen basis, ADDED to the gain:
DeltaW = U [diag(S_eff) + coeff * (B A)] Vh, with A:(k,r), B:(r,k) (so B A is r x r with rank <= k), B=0 at init
The low-rank part mixes the whole top-r subspace for 2*r*k params (k=LoRA's rank),
and being additive (not * diag(S)) it is S-independent -- the amplification edge is
gone by construction. Diagonal gain unchanged; identity at init from B=0 and g=0.
Wired through benchmark (antipasto_lora_rank, run_id __k suffix), justfile, cost_report,
smoke (green, dplr attaches/trains/round-trips). Arrow code removed; its run results
stay on disk for comparison.
Co-Authored-By: Claudypoo <noreply@anthropic.com>
This commit is contained in:
@@ -75,7 +75,7 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base":
|
||||
|
||||
# Run a single MetaMathQA->GSM8K benchmark for a given variant.
|
||||
# Per-variant lr / target-name defaults are baked in here.
|
||||
bench-variant model variant steps="5000" block="8" r_override="":
|
||||
bench-variant model variant steps="5000" lora_rank="8" r_override="":
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
lr=1e-4
|
||||
@@ -94,10 +94,6 @@ bench-variant model variant steps="5000" block="8" r_override="":
|
||||
# matches the published AntiPaSTO row. alpha=r (no extra scaling).
|
||||
antipasto*) lr=5e-3; r=256; alpha=256 ;;
|
||||
esac
|
||||
# 5e-3 suits the tiny S-space gain, but arrow's large dense block is LoRA-like
|
||||
# and destabilizes at that lr (block=128 got 45.7% vs block=8's 60.5%). Drop to
|
||||
# LoRA's 1e-4 once the block dominates the param count.
|
||||
if [ "{{variant}}" = "antipasto_arrow" ] && [ "{{block}}" -gt 8 ]; then lr=1e-4; fi
|
||||
# r override (e.g. low-rank corda sweep); alpha tracks r for the antipasto family.
|
||||
if [ -n "{{r_override}}" ]; then r="{{r_override}}"; alpha="{{r_override}}"; fi
|
||||
exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \
|
||||
@@ -106,7 +102,7 @@ bench-variant model variant steps="5000" block="8" r_override="":
|
||||
--steps {{steps}} \
|
||||
--lr "$lr" \
|
||||
--target-name "$target" \
|
||||
--antipasto-block {{block}} \
|
||||
--antipasto-lora-rank {{lora_rank}} \
|
||||
--layers all --r "$r" --alpha "$alpha"
|
||||
|
||||
metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto":
|
||||
|
||||
Reference in New Issue
Block a user