mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 19:31:26 +08:00
b80d7778af
Replace antipasto's rotation/Cayley with a bounded 1+ELU gain and split the
S-space idea into four interpretable PiSSA-style cores (frozen U/S/Vh, small
trainable core):
- antipasto: S_eff = S*(1+ELU(coeff*g)). exp-bounded attenuation, linear
amplification (constant gradient, no runaway). g=0 -> exact identity.
- antipasto_rot: keeps the block-Cayley rotation as a separate variant for
cost comparison (its per-forward solve is the 72ms vs 36ms gap).
- antipasto_ablate: contractive (I - a c c^T) diag(S), eigenvalues in [0,1],
cannot blow up. Optional cov_orient (CorDA) basis.
- antipasto_corda: covariance-oriented oblique projector P = Vh C^{-1/2}, the
data-energy basis rather than the weight-gain basis. 1+ELU gain.
Add scripts/_cost.py + scripts/cost_report.py: one-row-per-variant cost table
(trainable params, peak GPU mem, fwd/bwd ms, added MACs/tok, group_init ms).
Wire all four into the benchmark, smoke test, and __init__ exports.
External review (DeepSeek-v4-pro, docs/reviews/) verified the math; acted on
its one real point (corda g now inits to zeros for exact identity).
Co-Authored-By: Claudypoo <noreply@anthropic.com>
132 lines
4.6 KiB
Python
132 lines
4.6 KiB
Python
"""Measure the cost of an attached adapter: params, FLOPs/MACs, time, GPU mem.
|
|
|
|
Which metric is "best" for comparing adapters? They answer different questions:
|
|
|
|
- trainable_params -- deterministic "size" number. The headline.
|
|
- macs_per_token -- deterministic, hardware-INDEPENDENT compute. Best for an
|
|
apples-to-apples comparison: wall-time is noisy and the old
|
|
rotation adapter paid a per-forward Cayley solve the new ones
|
|
do not. "adds" (additions) ~= MACs; FLOPs ~= 2 * MACs.
|
|
- fwd_ms / bwd_ms -- felt cost, but noisy: warmup + median over `iters`, never one run.
|
|
- peak_gpu_mb -- resident + activation peak around fwd(+bwd).
|
|
|
|
FLOPs come from torch.utils.flop_counter.FlopCounterMode (built in, no new dep). Its
matmul formula counts a (m,k)@(k,n) product as 2*m*n*k (one multiply + one add per MAC),
i.e. FLOPs rather than MACs; we expose both `flops` (as returned) and a per-token
`macs_per_token` derived from it -- calibrate once on a known matmul if you need to be
sure of the factor of 2.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import statistics
|
|
import time
|
|
|
|
import torch
|
|
from torch.utils.flop_counter import FlopCounterMode
|
|
|
|
|
|
def _time_call(fn, warmup: int, iters: int, cuda: bool) -> float:
|
|
"""Median wall-time of fn() in milliseconds (warmup excluded)."""
|
|
for _ in range(warmup):
|
|
fn()
|
|
if cuda:
|
|
torch.cuda.synchronize()
|
|
samples = []
|
|
for _ in range(iters):
|
|
if cuda:
|
|
start = torch.cuda.Event(enable_timing=True)
|
|
end = torch.cuda.Event(enable_timing=True)
|
|
start.record()
|
|
fn()
|
|
end.record()
|
|
torch.cuda.synchronize()
|
|
samples.append(start.elapsed_time(end))
|
|
else:
|
|
t0 = time.perf_counter()
|
|
fn()
|
|
samples.append((time.perf_counter() - t0) * 1e3)
|
|
return statistics.median(samples)
|
|
|
|
|
|
def measure_cost(
    model: torch.nn.Module,
    fwd_fn,
    *,
    bwd_step_fn=None,
    n_tokens: int | None = None,
    adapter_filter: str = "lora_",
    warmup: int = 3,
    iters: int = 10,
) -> dict:
    """Cost of the currently-attached adapter.

    Args:
        model: module whose parameters/buffers are inspected; the device of its
            first parameter decides whether CUDA timing and memory stats are used.
        fwd_fn: zero-argument callable running one forward pass. It is called
            under ``torch.no_grad()`` for both the FLOP count and fwd timing.
        bwd_step_fn: optional zero-argument callable doing zero_grad + forward +
            loss.backward(). Used for bwd timing; skipped when None.
        n_tokens: tokens in the fwd_fn batch, for macs_per_token.
        adapter_filter: substring marking adapter params/buffers (default 'lora_').
        warmup: untimed calls before each timing loop.
        iters: timed calls per timing loop (the median is reported).

    Returns:
        dict with keys ``trainable_params``, ``adapter_resident_mb``, ``flops``,
        ``macs_per_token``, ``fwd_ms``, ``bwd_ms`` and ``peak_gpu_mb``. ``flops``
        is None if counting failed; ``macs_per_token`` is None when ``flops`` is
        None or ``n_tokens`` is missing/zero; ``bwd_ms`` is None without
        ``bwd_step_fn``; ``peak_gpu_mb`` is None off CUDA.
    """
    dev = next(model.parameters()).device
    cuda = dev.type == "cuda"

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    named = list(model.named_parameters()) + list(model.named_buffers())
    adapter_bytes = sum(t.numel() * t.element_size() for n, t in named if adapter_filter in n)

    # FLOPs: one forward under the counter (no grad so we count inference cost).
    # FlopCounterMode can assert on some fused attention shapes; degrade to None
    # rather than abort the whole cost report (deliberate best-effort).
    try:
        fc = FlopCounterMode(display=False)
        with torch.no_grad(), fc:
            fwd_fn()
        flops = fc.get_total_flops()
    except Exception as e:
        print(f" [warn] FLOP count failed ({type(e).__name__}: {e}); flops=None")
        flops = None

    if cuda:
        # Target the model's device explicitly: the current CUDA device may be
        # a different GPU than the one the model lives on.
        torch.cuda.synchronize(dev)
        torch.cuda.reset_peak_memory_stats(dev)
    fwd_ms = _time_call(lambda: _no_grad(fwd_fn), warmup, iters, cuda)
    bwd_ms = _time_call(bwd_step_fn, warmup, iters, cuda) if bwd_step_fn is not None else None
    peak_gpu_mb = (torch.cuda.max_memory_allocated(dev) / 1e6) if cuda else None

    # torch.utils.flop_counter counts a matmul as 2*m*n*k (multiply + add), so
    # one MAC is two counted FLOPs. A genuine count of 0 is reported as 0.0,
    # not conflated with "counting failed" (None).
    if flops is not None and n_tokens:
        macs_per_token = flops / (2 * n_tokens)
    else:
        macs_per_token = None

    return dict(
        trainable_params=trainable_params,
        adapter_resident_mb=adapter_bytes / 1e6,
        flops=flops,
        macs_per_token=macs_per_token,
        fwd_ms=fwd_ms,
        bwd_ms=bwd_ms,
        peak_gpu_mb=peak_gpu_mb,
    )
|
|
|
|
|
|
def _no_grad(fn):
|
|
with torch.no_grad():
|
|
return fn()
|
|
|
|
|
|
class group_init_meter:
    """Context manager: wall-time + peak traced memory of a group_init / attach-with-calib.

    CorDA accumulates C = E[xx^T] on CPU and runs eigh(d_in^3) -- the expensive corner.
    Use around ll.attach(model, cfg, calibration_data=...) to log that asymmetry.

    After the ``with`` block, ``ms`` holds the elapsed wall-time in milliseconds
    and ``peak_cpu_mb`` the peak reported by ``tracemalloc`` in MB (bytes / 1e6).

    If ``tracemalloc`` was already tracing on entry (e.g. a nested meter or a
    caller-owned trace), this meter leaves it running on exit and the reported
    peak is that of the shared tracing session.

    NOTE(review): tracemalloc only traces memory allocated through Python's
    allocators; native tensor storage is presumably not included -- confirm
    before treating ``peak_cpu_mb`` as total process RAM.
    """

    def __init__(self):
        # Both results stay None until the context has exited.
        self.ms = None
        self.peak_cpu_mb = None

    def __enter__(self):
        import tracemalloc

        self._tm = tracemalloc
        # Remember whether we own the tracing session so __exit__ does not
        # tear down tracing that somebody else started.
        self._started_tracing = not tracemalloc.is_tracing()
        if self._started_tracing:
            tracemalloc.start()
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.ms = (time.perf_counter() - self._t0) * 1e3
        _, peak = self._tm.get_traced_memory()
        if self._started_tracing:
            self._tm.stop()
        self.peak_cpu_mb = peak / 1e6
        # Never suppress an exception raised inside the block.
        return False
|