Files
lora-lite/scripts/_cost.py
wassname 5f9d90d8b8 benchmark sweep: rot(U/both) ablation, whitening conclusion, cost rows
- antipasto_rot: add rotate_basis="both" (independent V+U Cayley rotations),
  run_id suffix __rotU/__rotboth so ablation arms get their own output dirs
- justfile: thread rotate_basis through bench-variant
- corda/eva: padding-mask fix in calibration capture + bf16-tight residual
- README: fill PiSSA/DoRA/CorDA/ASVD/ablate/dplr/rot rows; record the
  metric-axis ablation (C=I 56.0 > diag-C 55.6 > full-C 54.7) and the
  rotation ablation (V 57.2 > U 56.5 > both 55.6) conclusions
- docs/reviews: external ref-checks + deepseek/gpt reviews of the cores

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-17 06:17:53 +08:00

140 lines
5.3 KiB
Python

"""Measure the cost of an attached adapter: params, FLOPs/MACs, time, GPU mem.
Which metric is "best" for comparing adapters? They answer different questions:
- trainable_params -- deterministic "size" number. The headline.
- macs_per_token -- deterministic, hardware-INDEPENDENT compute. Best for an
apples-to-apples comparison: wall-time is noisy and the old
rotation adapter paid a per-forward Cayley solve the new ones
do not. "adds" (additions) ~= MACs; FLOPs ~= 2 * MACs.
- fwd_ms / bwd_ms -- felt cost, but noisy: warmup + median over `iters`, never one run.
- peak_gpu_mb -- resident + activation peak around fwd(+bwd).
FLOPs come from torch.utils.flop_counter.FlopCounterMode (built in, no new dep). Its
matmul formula counts a (m,k)@(k,n) product as 2*m*n*k, i.e. FLOPs (2 per MAC), not
m*n*k MACs. We expose both `flops` (as returned) and `macs_per_token`, which is derived
from `flops` and `n_tokens` -- calibrate once on a known matmul for your torch version
if you need to be sure of the factor of 2 before comparing against MAC counts.
"""
from __future__ import annotations
import statistics
import time
import torch
from torch.utils.flop_counter import FlopCounterMode
def _time_call(fn, warmup: int, iters: int, cuda: bool) -> float:
    """Return the median duration of fn() in milliseconds.

    fn is first called `warmup` times without being timed, then `iters` times with
    each call timed individually; the median of those samples is returned.
    When `cuda` is true, timing uses CUDA events and a device synchronize per
    sample; otherwise it uses time.perf_counter().
    """
    # Untimed warmup calls, then drain any queued GPU work before measuring.
    for _ in range(warmup):
        fn()
    if cuda:
        torch.cuda.synchronize()

    def _gpu_sample() -> float:
        # Events bracket the call; elapsed_time is read only after a synchronize.
        tic = torch.cuda.Event(enable_timing=True)
        toc = torch.cuda.Event(enable_timing=True)
        tic.record()
        fn()
        toc.record()
        torch.cuda.synchronize()
        return tic.elapsed_time(toc)

    def _cpu_sample() -> float:
        began = time.perf_counter()
        fn()
        return (time.perf_counter() - began) * 1e3  # seconds -> milliseconds

    take_sample = _gpu_sample if cuda else _cpu_sample
    durations_ms = [take_sample() for _ in range(iters)]
    return statistics.median(durations_ms)
def measure_cost(
    model: torch.nn.Module,
    fwd_fn,
    *,
    bwd_step_fn=None,
    n_tokens: int | None = None,
    adapter_filter: str = "lora_",
    warmup: int = 3,
    iters: int = 10,
) -> dict:
    """Cost of the currently-attached adapter.

    Args:
        model: module whose parameters/buffers are inspected. The device of its
            first parameter decides whether CUDA timing/memory stats are used; a
            module with no parameters is treated as non-CUDA.
        fwd_fn(): run one forward (no grad). Used for FLOPs + fwd timing.
        bwd_step_fn(): zero_grad + forward + loss.backward(). Used for bwd timing.
            When None, `bwd_ms` is None.
        n_tokens: tokens in the fwd_fn batch, for macs_per_token.
        adapter_filter: substring marking adapter params/buffers (default 'lora_').
        warmup: untimed calls before each timing loop.
        iters: timed calls per timing loop (the median is reported).

    Returns:
        dict with keys `trainable_params`, `adapter_resident_mb`,
        `added_macs_per_token`, `flops`, `macs_per_token`, `fwd_ms`, `bwd_ms`,
        `peak_gpu_mb`. `flops` is the counter's raw total (None if counting
        failed); `macs_per_token` converts it to multiply-accumulates per token
        using a calibrated FLOPs-per-MAC factor, so it is on the same scale as
        `added_macs_per_token`. `peak_gpu_mb` is None off-CUDA.
    """
    # A parameter-less module has no device to query; treat it as non-CUDA
    # instead of letting next() raise StopIteration.
    anchor = next(model.parameters(), None)
    dev = anchor.device if anchor is not None else None
    cuda = dev is not None and dev.type == "cuda"

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    named = list(model.named_parameters()) + list(model.named_buffers())
    adapter_tensors = [t for n, t in named if adapter_filter in n]
    adapter_bytes = sum(t.numel() * t.element_size() for t in adapter_tensors)

    # Adapter ADDED MACs/token, analytic and arch-independent (the FLOP counter below
    # asserts on some fused/linear-attention shapes -> None). Each 2D adapter weight of
    # shape (a, b) is used once in a per-token matmul, contributing a*b MACs; summing 2D
    # adapter-tensor numel is therefore the exact added compute for the U/Vh/P/A/B paths.
    # (Slight undercount for cores that reuse a factor twice, e.g. ablate's C C^T.)
    added_macs_per_token = sum(t.numel() for t in adapter_tensors if t.ndim == 2)

    # FLOPs: one forward under the counter (no grad so we count inference cost).
    # FlopCounterMode can assert on some fused attention shapes; degrade to None.
    try:
        fc = FlopCounterMode(display=False)
        with torch.no_grad(), fc:
            fwd_fn()
        flops = fc.get_total_flops()
    except Exception as e:
        print(f"  [warn] FLOP count failed ({type(e).__name__}: {e}); flops=None")
        flops = None

    if cuda:
        # Target the model's device explicitly: it may differ from the current one.
        torch.cuda.synchronize(dev)
        torch.cuda.reset_peak_memory_stats(dev)
    fwd_ms = _time_call(lambda: _no_grad(fwd_fn), warmup, iters, cuda)
    bwd_ms = _time_call(bwd_step_fn, warmup, iters, cuda) if bwd_step_fn is not None else None
    peak_gpu_mb = (torch.cuda.max_memory_allocated(dev) / 1e6) if cuda else None

    # Convert counted FLOPs to MACs with a measured factor rather than assuming the
    # counter's convention; without this the value is off by the FLOPs-per-MAC ratio
    # relative to added_macs_per_token.
    macs_per_token = None
    if flops and n_tokens:
        per_mac = _flops_per_mac()
        if per_mac:
            macs_per_token = flops / (per_mac * n_tokens)

    return dict(
        trainable_params=trainable_params,
        adapter_resident_mb=adapter_bytes / 1e6,
        added_macs_per_token=added_macs_per_token,  # adapter-only, always populated
        flops=flops,                                # whole model, best-effort (None on hybrid attn)
        macs_per_token=macs_per_token,
        fwd_ms=fwd_ms,
        bwd_ms=bwd_ms,
        peak_gpu_mb=peak_gpu_mb,
    )


def _flops_per_mac() -> float | None:
    """Return how many FLOPs FlopCounterMode reports per multiply-accumulate.

    Calibrates on a tiny (2,3)@(3,4) matmul whose true cost is 2*3*4 MACs.
    Returns None if the calibration itself cannot be counted (best-effort, like
    the main FLOP count).
    """
    m, k, n = 2, 3, 4
    try:
        counter = FlopCounterMode(display=False)
        with torch.no_grad(), counter:
            torch.mm(torch.zeros(m, k), torch.zeros(k, n))
        counted = counter.get_total_flops()
    except Exception:
        return None
    return counted / (m * k * n)
def _no_grad(fn):
    """Call fn() inside torch.no_grad() and hand back whatever it returns."""
    with torch.no_grad():
        outcome = fn()
    return outcome
class group_init_meter:
    """Context manager: wall-time + peak traced CPU memory of a group_init / attach-with-calib.

    CorDA accumulates C = E[xx^T] on CPU and runs eigh(d_in^3) -- the expensive corner.
    Use around ll.attach(model, cfg, calibration_data=...) to log that asymmetry.

    Attributes (both None until the ``with`` block exits):
        ms: wall-clock duration of the block in milliseconds.
        peak_cpu_mb: peak tracemalloc-traced memory reached inside the block, in
            units of 1e6 bytes. If tracemalloc was already tracing on entry, this
            is the peak growth above the amount traced at entry.

    If tracemalloc is already tracing when the block is entered, the existing
    trace is left running on exit (only a trace started here is stopped). In that
    case the shared tracemalloc peak counter is reset on entry so that earlier
    allocations do not leak into this measurement.

    NOTE(review): tracemalloc only traces allocations made through Python's memory
    allocators; tensor storage allocated natively by torch is presumably not
    included -- confirm before reading peak_cpu_mb as total process RAM.
    """

    def __init__(self):
        self.ms = None
        self.peak_cpu_mb = None

    def __enter__(self):
        import tracemalloc

        self._tm = tracemalloc
        # Only take ownership of (and later stop) a trace that we started.
        self._owns_trace = not tracemalloc.is_tracing()
        if self._owns_trace:
            tracemalloc.start()
            self._base = 0
        else:
            # Discard any peak recorded before this block and remember the
            # amount already traced so it can be subtracted on exit.
            tracemalloc.reset_peak()
            self._base, _ = tracemalloc.get_traced_memory()
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.ms = (time.perf_counter() - self._t0) * 1e3
        _, peak = self._tm.get_traced_memory()
        if self._owns_trace:
            self._tm.stop()
        self.peak_cpu_mb = max(peak - self._base, 0) / 1e6
        return False  # never suppress exceptions raised inside the block