mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 17:16:12 +08:00
9d027752ad
antipasto_arrow -> antipasto_dplr. The arrowhead's dense b x b block is the wrong
shape: b^2 params, mixes only the top-b, and sits on the S-scaled coords so its
perturbation is amplified by the largest singular values (block=128 collapsed to
45.7% at the gain's lr). Replace it with LoRA's lesson -- a low-rank core inside
the frozen basis, ADDED to the gain:
DeltaW = U [diag(S_eff) + coeff * B A] Vh, A:(k,r) B:(r,k), B=0 at init
The low-rank part mixes the whole top-r subspace for 2*r*k params (k=LoRA's rank),
and being additive (not * diag(S)) it is S-independent -- the amplification edge is
gone by construction. Diagonal gain unchanged; identity at init from B=0 and g=0.
Wired through benchmark (antipasto_lora_rank, run_id __k suffix), justfile, cost_report,
smoke (green, dplr attaches/trains/round-trips). Arrow code removed; its run results
stay on disk for comparison.
Co-Authored-By: Claudypoo <noreply@anthropic.com>
144 lines
6.6 KiB
Python
144 lines
6.6 KiB
Python
"""One-row-per-variant cost table: params, MACs/token, fwd/bwd ms, peak GPU, group_init.
|
|
|
|
Answers "which is best -- time / flops / adds / params?": MACs/token is the
deterministic apples-to-apples compute number; trainable_params is the size headline;
wall-time is the felt-but-noisy number; group_init is where CorDA's O(d_in^3) eigh bites.
|
|
|
|
Usage:
|
|
uv run --extra benchmark python scripts/cost_report.py \
|
|
--model Qwen/Qwen3-0.6B-Base --variants antipasto antipasto_corda antipasto_ablate lora \
|
|
--target-name 'q_proj$' 'v_proj$' --r 32 --out logs/cost_qwen0.6b.log
|
|
|
|
Point --target-name at down_proj to see the CorDA covariance corner (large d_in).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import importlib.util
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
from tabulate import tabulate
|
|
|
|
import lora_lite as ll
|
|
|
|
def _load_sibling(module_name, filename):
    """Import a script that lives next to this file, by path.

    The module is registered in ``sys.modules`` under ``module_name`` *before*
    it is executed, so code that looks itself up in ``sys.modules`` while it is
    being imported can find it.

    Returns the ``(spec, module)`` pair.
    """
    spec = importlib.util.spec_from_file_location(module_name, _HERE / filename)
    module = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)
    return spec, module


# Directory containing this script; sibling scripts are loaded relative to it.
_HERE = Path(__file__).resolve().parent

# The benchmark script supplies BenchmarkConfig and the variant->config map.
_BENCH, benchmark = _load_sibling("metamath_benchmark", "metamath_gsm8k_benchmark.py")

# Cost-measurement helpers (measure_cost, group_init_meter).
_COST, cost = _load_sibling("_cost", "_cost.py")
|
|
|
|
|
|
def build_cfg(variant: str, args, dtype) -> ll.AdapterConfig:
    """Build the adapter config for ``variant`` from the parsed CLI ``args``.

    The benchmark script's variant->config mapping is reused: a
    ``BenchmarkConfig`` is filled from the CLI values (alpha is set equal to
    r) and handed to ``benchmark.cfg_for_variant`` together with ``dtype``.
    """
    bench_kwargs = {
        "model": args.model,
        "variant": variant,
        "r": args.r,
        "alpha": float(args.r),
        "target_name": list(args.target_name),
        "layers": args.layers,
        "torch_dtype": args.dtype,
        "antipasto_cov_orient": args.cov_orient,
    }
    bench_cfg = benchmark.BenchmarkConfig(**bench_kwargs)
    return benchmark.cfg_for_variant(bench_cfg, dtype)
|
|
|
|
|
|
def _round_or_none(value, ndigits):
    """Round ``value`` to ``ndigits`` decimals; falsy values (None, 0) become None."""
    return round(value, ndigits) if value else None


def _parse_args() -> argparse.Namespace:
    """Define the cost-report command-line options and parse them."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-0.6B-Base")
    ap.add_argument("--variants", nargs="+",
                    default=["lora", "antipasto", "antipasto_rot", "antipasto_corda",
                             "antipasto_ablate", "antipasto_dplr"])
    ap.add_argument("--target-name", nargs="+", default=[r"q_proj$", r"v_proj$"])
    ap.add_argument("--r", type=int, default=32)
    ap.add_argument("--layers", default="all",
                    help="'all' or comma list e.g. '0,1' -- limit layers (CorDA down_proj eigh is slow).")
    ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    ap.add_argument("--dtype", default="bfloat16")
    ap.add_argument("--seq-len", type=int, default=256)
    ap.add_argument("--batch", type=int, default=2)
    ap.add_argument("--calib-batches", type=int, default=4)
    ap.add_argument("--cov-orient", action="store_true",
                    help="CorDA-orient antipasto_ablate (measure the eigh corner).")
    ap.add_argument("--out", default="logs/cost.log")
    return ap.parse_args()


def main() -> None:
    """Measure per-variant adapter cost and write a one-row-per-variant table.

    Steps:
      1. Load the model with eager attention and build random token batches
         (one measurement batch plus ``--calib-batches`` calibration batches).
      2. Measure the no-adapter baseline (forward and forward+backward).
      3. For each variant: attach the adapter under ``group_init_meter``,
         measure cost, detach, and record a row.
      4. Print the table and write it, with a header, to ``--out``.
    """
    args = _parse_args()
    dtype = getattr(torch, args.dtype)

    # eager attention: FlopCounterMode's sdpa_flop_count asserts on GQA (Qwen3) SDPA
    # shapes (q heads != kv heads). eager uses explicit matmuls it can count.
    # Inputs below are random token ids, so no tokenizer is loaded.
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        args.model, dtype=dtype, attn_implementation="eager"
    ).to(args.device)
    model.eval()

    n_tokens = args.batch * args.seq_len
    batch_shape = (args.batch, args.seq_len)
    ids = torch.randint(0, model.config.vocab_size, batch_shape, device=args.device)
    calib = [{"input_ids": torch.randint(0, model.config.vocab_size,
                                         batch_shape, device=args.device)}
             for _ in range(args.calib_batches)]

    def fwd():
        model(input_ids=ids)

    def bwd_step():
        model.zero_grad(set_to_none=True)
        loss = model(input_ids=ids).logits.float().pow(2).mean()
        loss.backward()

    # base (no-adapter) cost, so each row can report the adapter's ADDED MACs/token.
    base = cost.measure_cost(model, fwd, bwd_step_fn=bwd_step, n_tokens=n_tokens)
    base_macs = base["macs_per_token"]
    base_bwd_ms = _round_or_none(base["bwd_ms"], 2)
    print(f"base (no adapter): MACs/tok={int(base_macs) if base_macs else None} "
          f"fwd_ms={round(base['fwd_ms'],2)} bwd_ms={base_bwd_ms}")

    # base = no adapter; model params left trainable, so this is the full-finetune
    # GPU-mem reference (its backward stores grads for every weight).
    total_params = sum(p.numel() for p in model.parameters())
    rows = [{
        "variant": "base(full-FT)", "train_params": total_params,
        "fwd_ms": round(base["fwd_ms"], 2), "bwd_ms": base_bwd_ms,
        "peak_GPU_MB": _round_or_none(base["peak_gpu_mb"], 1),
        "added_MACs/tok": 0 if base_macs else None,
        "ginit_ms": 0.0, "ginit_CPU_MB": 0.0,
    }]

    for variant in args.variants:
        cfg = build_cfg(variant, args, dtype)
        # group_init / attach cost (CorDA's eigh + C live here).
        with cost.group_init_meter() as gi:
            ll.attach(model, cfg, calibration_data=calib)
        try:
            c = cost.measure_cost(model, fwd, bwd_step_fn=bwd_step, n_tokens=n_tokens)
        finally:
            # Always detach so a failed measurement never leaves this variant's
            # adapter on the shared model.
            ll.detach(model)

        macs = c["macs_per_token"]
        row = {
            "variant": variant,
            "train_params": c["trainable_params"],
            "fwd_ms": round(c["fwd_ms"], 2),
            "bwd_ms": _round_or_none(c["bwd_ms"], 2),
            "peak_GPU_MB": _round_or_none(c["peak_gpu_mb"], 1),
            # flat across same-r adapters; kept only as a sanity check, not a comparator.
            "added_MACs/tok": int(macs - base_macs) if (macs and base_macs) else None,
            "ginit_ms": round(gi.ms, 1),
            "ginit_CPU_MB": round(gi.peak_cpu_mb, 1),
        }
        rows.append(row)
        print(f" {variant}: params={row['train_params']} "
              f"peak_GPU_MB={row['peak_GPU_MB']} bwd_ms={row['bwd_ms']} ginit_ms={row['ginit_ms']}")

    table = tabulate(rows, headers="keys", tablefmt="pipe")
    header = (f"# cost report: {args.model} targets={args.target_name} r={args.r} "
              f"seq={args.seq_len} batch={args.batch} dtype={args.dtype}\n"
              f"# COMPARATORS: train_params, peak_GPU_MB (fwd+bwd, process-local max), bwd_ms, ginit_ms.\n"
              f"# added_MACs/tok is flat across same-r adapters (sanity check only).\n"
              f"# ginit_CPU_MB undercounts: tracemalloc misses torch C++ tensor allocs (the CorDA C matrix).\n")
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the log does not depend on the platform locale.
    out_path.write_text(header + table + "\n", encoding="utf-8")
    print("\n" + header + table)
    print(f"\nsaved -> {out_path}")
|
|
|
|
|
|
# Script entry point: run the report only when executed directly, so importing
# this module does not trigger a model load.
if __name__ == "__main__":
    main()
|