mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:15:50 +08:00
5f9d90d8b8
- antipasto_rot: add rotate_basis="both" (independent V+U Cayley rotations), run_id suffix __rotU/__rotboth so ablation arms get their own output dirs - justfile: thread rotate_basis through bench-variant - corda/eva: padding-mask fix in calibration capture + bf16-tight residual - README: fill PiSSA/DoRA/CorDA/ASVD/ablate/dplr/rot rows; record the metric-axis ablation (C=I 56.0 > diag-C 55.6 > full-C 54.7) and the rotation ablation (V 57.2 > U 56.5 > both 55.6) conclusions - docs/reviews: external ref-checks + deepseek/gpt reviews of the cores Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
140 lines
5.3 KiB
Python
140 lines
5.3 KiB
Python
"""Measure the cost of an attached adapter: params, FLOPs/MACs, time, GPU mem.
|
|
|
|
Which metric is "best" for comparing adapters? They answer different questions:
|
|
|
|
- trainable_params -- deterministic "size" number. The headline.
|
|
- macs_per_token -- deterministic, hardware-INDEPENDENT compute. Best for an
|
|
apples-to-apples comparison: wall-time is noisy and the old
|
|
rotation adapter paid a per-forward Cayley solve the new ones
|
|
do not. "adds" (additions) ~= MACs; FLOPs ~= 2 * MACs.
|
|
- fwd_ms / bwd_ms -- felt cost, but noisy: warmup + median over `iters`, never one run.
|
|
- peak_gpu_mb -- resident + activation peak around fwd(+bwd).
|
|
|
|
FLOPs come from torch.utils.flop_counter.FlopCounterMode (built in, no new dep). In
current PyTorch its matmul formula counts the multiply and the add separately: a
(m,k)@(k,n) matmul is reported as 2*m*n*k, not m*n*k (verify on your version). We
expose both `flops` (as returned) and `macs_per_token = flops / n_tokens`, so the
latter is then FLOPs per token, i.e. about 2x true MACs -- calibrate once on a known
matmul if you need to be sure of the factor of 2.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import statistics
|
|
import time
|
|
|
|
import torch
|
|
from torch.utils.flop_counter import FlopCounterMode
|
|
|
|
|
|
def _time_call(fn, warmup: int, iters: int, cuda: bool) -> float:
|
|
"""Median wall-time of fn() in milliseconds (warmup excluded)."""
|
|
for _ in range(warmup):
|
|
fn()
|
|
if cuda:
|
|
torch.cuda.synchronize()
|
|
samples = []
|
|
for _ in range(iters):
|
|
if cuda:
|
|
start = torch.cuda.Event(enable_timing=True)
|
|
end = torch.cuda.Event(enable_timing=True)
|
|
start.record()
|
|
fn()
|
|
end.record()
|
|
torch.cuda.synchronize()
|
|
samples.append(start.elapsed_time(end))
|
|
else:
|
|
t0 = time.perf_counter()
|
|
fn()
|
|
samples.append((time.perf_counter() - t0) * 1e3)
|
|
return statistics.median(samples)
|
|
|
|
|
|
def measure_cost(
    model: torch.nn.Module,
    fwd_fn,
    *,
    bwd_step_fn=None,
    n_tokens: int | None = None,
    adapter_filter: str = "lora_",
    warmup: int = 3,
    iters: int = 10,
) -> dict:
    """Cost of the currently-attached adapter.

    Args:
        model: module whose parameters and buffers are inspected. The device of its
            first parameter (else first buffer, else CPU) decides whether CUDA
            timing and memory statistics are used.
        fwd_fn: zero-argument callable running one forward. It is always invoked
            under torch.no_grad(); used for the FLOP count and fwd timing.
        bwd_step_fn: optional zero-argument callable doing zero_grad + forward +
            loss.backward(); used for bwd timing. Skipped when None.
        n_tokens: tokens in the fwd_fn batch, the divisor for macs_per_token.
        adapter_filter: substring marking adapter params/buffers (default 'lora_').
        warmup: untimed calls made before each timing run.
        iters: timed calls per timing run; the median is reported.

    Returns:
        dict with keys
          trainable_params     -- numel summed over parameters with requires_grad.
          adapter_resident_mb  -- bytes of params + buffers whose name contains
                                  adapter_filter, divided by 1e6.
          added_macs_per_token -- numel summed over the 2D adapter tensors.
          flops                -- FlopCounterMode total for one forward, or None
                                  if counting raised.
          macs_per_token       -- flops / n_tokens; None when flops is None or
                                  n_tokens is None/0.
          fwd_ms, bwd_ms       -- median milliseconds; bwd_ms is None without
                                  bwd_step_fn.
          peak_gpu_mb          -- peak allocated bytes / 1e6 on the model's device
                                  across the timing runs; None on CPU.

    NOTE(review): PyTorch's FlopCounterMode appears to report 2*m*n*k for a
    (m,k)@(k,n) matmul, which would make macs_per_token FLOPs per token (2x the
    unit used by added_macs_per_token). The value is left unchanged here so that
    previously recorded numbers stay comparable -- confirm and rescale if needed.
    """
    # A module without parameters used to raise StopIteration here; fall back to
    # the first buffer, then to CPU.
    anchor = next(model.parameters(), None)
    if anchor is None:
        anchor = next(model.buffers(), None)
    dev = anchor.device if anchor is not None else torch.device("cpu")
    cuda = dev.type == "cuda"

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    named = list(model.named_parameters()) + list(model.named_buffers())
    adapter_tensors = [t for n, t in named if adapter_filter in n]
    adapter_bytes = sum(t.numel() * t.element_size() for t in adapter_tensors)

    # Adapter ADDED MACs/token, analytic and arch-independent (the FLOP counter below
    # asserts on some fused/linear-attention shapes -> None). Each 2D adapter weight of
    # shape (a, b) is used once in a per-token matmul, contributing a*b MACs; summing 2D
    # adapter-tensor numel is therefore the exact added compute for the U/Vh/P/A/B paths.
    # (Slight undercount for cores that reuse a factor twice, e.g. ablate's C C^T.)
    added_macs_per_token = sum(t.numel() for t in adapter_tensors if t.ndim == 2)

    # FLOPs: one forward under the counter (no grad so we count inference cost).
    # FlopCounterMode can assert on some fused attention shapes; degrade to None.
    # The broad except is deliberate: this metric is best-effort.
    try:
        fc = FlopCounterMode(display=False)
        with torch.no_grad(), fc:
            fwd_fn()
        flops = fc.get_total_flops()
    except Exception as e:
        print(f" [warn] FLOP count failed ({type(e).__name__}: {e}); flops=None")
        flops = None

    if cuda:
        # Address the model's own device explicitly: the process-wide "current"
        # device may be a different GPU, which would report the wrong peak.
        torch.cuda.synchronize(dev)
        torch.cuda.reset_peak_memory_stats(dev)
    fwd_ms = _time_call(lambda: _no_grad(fwd_fn), warmup, iters, cuda)
    bwd_ms = _time_call(bwd_step_fn, warmup, iters, cuda) if bwd_step_fn is not None else None
    peak_gpu_mb = (torch.cuda.max_memory_allocated(dev) / 1e6) if cuda else None

    # `flops is not None` (not truthiness): a successful count of 0 is a result,
    # not a failure. n_tokens keeps the truthiness test so 0 never divides.
    macs_per_token = (flops / n_tokens) if (flops is not None and n_tokens) else None

    return dict(
        trainable_params=trainable_params,
        adapter_resident_mb=adapter_bytes / 1e6,
        added_macs_per_token=added_macs_per_token,  # adapter-only, always populated
        flops=flops,  # whole model, best-effort (None on hybrid attn)
        macs_per_token=macs_per_token,
        fwd_ms=fwd_ms,
        bwd_ms=bwd_ms,
        peak_gpu_mb=peak_gpu_mb,
    )
|
|
|
|
|
|
def _no_grad(fn):
|
|
with torch.no_grad():
|
|
return fn()
|
|
|
|
|
|
class group_init_meter:
    """Context manager: wall-time + peak CPU RAM of a group_init / attach-with-calib.

    CorDA accumulates C = E[xx^T] on CPU and runs eigh(d_in^3) -- the expensive corner.
    Use around ll.attach(model, cfg, calibration_data=...) to log that asymmetry.

    After the ``with`` block exits:
      ms          -- elapsed wall-time in milliseconds (time.perf_counter).
      peak_cpu_mb -- peak size reported by tracemalloc, in bytes / 1e6.

    If tracemalloc is already tracing on entry (a nested meter, or the caller is
    profiling), that session is reused and left running on exit. The reported peak
    is then the peak of the enclosing session, i.e. an upper bound for this block.

    NOTE(review): tracemalloc only tracks memory allocated through Python's
    allocators; native buffers such as tensor storage are presumably not counted
    -- confirm before reading peak_cpu_mb as total process RAM.
    """

    def __init__(self):
        # Both stay None until the with-block has exited.
        self.ms = None
        self.peak_cpu_mb = None

    def __enter__(self):
        import tracemalloc

        self._tm = tracemalloc
        # Only start (and later stop) tracing when nobody else already did, so an
        # enclosing meter or profiler keeps its session and its peak.
        self._owns_trace = not tracemalloc.is_tracing()
        if self._owns_trace:
            tracemalloc.start()
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.ms = (time.perf_counter() - self._t0) * 1e3
        _, peak = self._tm.get_traced_memory()
        if self._owns_trace:
            self._tm.stop()
        self.peak_cpu_mb = peak / 1e6
        # Never swallow an exception raised inside the with-block.
        return False
|