diff --git a/justfile b/justfile index 5ab18ee..9841e6f 100644 --- a/justfile +++ b/justfile @@ -73,6 +73,26 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base": -w "$PWD" -o 1 -- \ uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py --model {{model}} --variant {{variant}} --steps {{steps}} +# Run a single MetaMathQA->GSM8K benchmark for a given variant. +# Per-variant lr / target-name defaults are baked in here. +bench-variant model variant steps="5000": + #!/usr/bin/env bash + set -euo pipefail + lr=1e-4 + target='(q_proj|v_proj)$' + case "{{variant}}" in + delora) lr=1e-3 ;; + ia3) lr=1e-3; target='(k_proj|v_proj)$' ;; + ia3_ff) lr=1e-3; target='(down_proj)$' ;; + esac + exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \ + --model '{{model}}' \ + --variant '{{variant}}' \ + --steps {{steps}} \ + --lr "$lr" \ + --target-name "$target" \ + --layers all --r 32 --alpha 64 + metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto": #!/usr/bin/env bash set -euo pipefail @@ -80,5 +100,5 @@ metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora piss pueue add \ -l "why: benchmark {{model}} ${variant} on MetaMathQA->GSM8K at {{steps}} steps; resolve: outputs/metamath_gsm8k/results/benchmark_results.tsv gets a row with accuracy commit time method argv and result JSON for ${variant}" \ -w "$PWD" -o 1 -- \ - bash scripts/bench_variant.sh '{{model}}' "$variant" {{steps}} + just bench-variant '{{model}}' "$variant" {{steps}} done \ No newline at end of file diff --git a/scripts/metamath_gsm8k_benchmark.py b/scripts/metamath_gsm8k_benchmark.py index 6d41437..d544fba 100644 --- a/scripts/metamath_gsm8k_benchmark.py +++ b/scripts/metamath_gsm8k_benchmark.py @@ -16,6 +16,7 @@ from typing import Any, Literal import torch from tabulate import tabulate +from tqdm.auto import tqdm import lora_lite as ll @@ -297,7 +298,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]], last_loss = math.nan train_total_tokens = 0 probe_batch = batches[0] - for step, batch in enumerate(batches): + pbar = tqdm(batches, desc="train", mininterval=60.0, dynamic_ncols=True) + for step, batch in enumerate(pbar): opt.zero_grad() loss = model( input_ids=batch["input_ids"], @@ -319,8 +321,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]], scheduler.step() last_loss = loss.item() train_total_tokens += int(batch["label_tokens"]) - if args.log_every and (step + 1) % args.log_every == 0: - print(f"TRAIN step={step + 1} loss={last_loss:.6g} grad={grad_norm:.6g} tokens={train_total_tokens}", flush=True) + pbar.set_postfix(loss=f"{last_loss:.4g}", grad=f"{grad_norm:.3g}", tok=train_total_tokens) + pbar.close() after = adapter_state(model) adapter_delta = sum((after[k] - before[k]).float().norm().item() for k in before) model.eval() @@ -435,8 +437,10 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non # BLUF: status line first so log tails are immediately readable cue = "🟢" if row.get("base_grad_leaks", 0) == 0 and row.get("grad", 0) > 0 else "🔴" n = row.get("samples", "?") + print() print(f"{cue} test_acc={row['test_acc']:.4g} valid_acc={row['valid_acc']:.4g} grad={row['grad']:.3g} dθ={row['dθ']:.3g} base_grad_leaks={row['base_grad_leaks']} N={n}") print("SHOULD: grad>0, dθ>0, base_grad_leaks=0; test/valid_acc meaningful only in benchmark mode. ELSE adapter or eval wiring is dead/wrong.") + print() # ordered: most important / shortest columns first display_keys = ["variant", "test_acc", "valid_acc", "grad", "dθ", "base_grad_leaks", "steps", "samples", "loss0", "lossN", "commit"] if "perturb" in row: @@ -444,6 +448,7 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non display_keys += ["run_id"] display_row = {k: row[k] for k in display_keys if k in row} print(tabulate([display_row], headers="keys", tablefmt="tsv", floatfmt=".4g")) + print() print(f"argv: {' '.join(sys.argv)} N={n} mode={mode}") print(f"out: {result_path}") diff --git a/src/lora_lite/variants/antipasto.py b/src/lora_lite/variants/antipasto.py index 1b4e839..44e6534 100644 --- a/src/lora_lite/variants/antipasto.py +++ b/src/lora_lite/variants/antipasto.py @@ -79,15 +79,15 @@ class AntiPaSTO: raise ValueError(f"AntiPaSTO requires r={r} divisible by block_size={bs}") n_blocks = r // bs n_triu = bs * (bs - 1) // 2 - return { + return dict( # Frozen SVD components captured at init. - "lora_U": ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True), - "lora_S": ParamSpec((r,), init="zeros", trainable=False, as_buffer=True), - "lora_Vh": ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True), + lora_U=ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True), + lora_S=ParamSpec((r,), init="zeros", trainable=False, as_buffer=True), + lora_Vh=ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True), # Trainable: per-singular-value delta + block-diagonal Cayley rotation. - "lora_delta_s": ParamSpec((r,), init="zeros", trainable=True), - "lora_rot_T": ParamSpec((n_blocks, n_triu), init="zeros", trainable=True), - } + lora_delta_s=ParamSpec((r,), init="zeros"), + lora_rot_T=ParamSpec((n_blocks, n_triu), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/delora.py b/src/lora_lite/variants/delora.py index 3bf491a..1cdccb4 100644 --- a/src/lora_lite/variants/delora.py +++ b/src/lora_lite/variants/delora.py @@ -43,15 +43,13 @@ class DeLoRA: @staticmethod def param_specs(d_in, d_out, cfg): lam0 = float(cfg.lambda0) - return { - "lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), - "lora_lambda": ParamSpec( - (), init=lambda t: t.fill_(lam0), trainable=True - ), + return dict( + lora_A=ParamSpec((cfg.r, d_in), init="kaiming"), + lora_B=ParamSpec((d_out, cfg.r), init="zeros"), + lora_lambda=ParamSpec((), init=lambda t: t.fill_(lam0)), # ||W||_2 per input channel; frozen buffer captured at init. - "lora_wnorm": ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True), - } + lora_wnorm=ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/dora.py b/src/lora_lite/variants/dora.py index 31063aa..7ee496e 100644 --- a/src/lora_lite/variants/dora.py +++ b/src/lora_lite/variants/dora.py @@ -30,12 +30,12 @@ class DoRA: @staticmethod def param_specs(d_in, d_out, cfg): - return { - "lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), + return dict( + lora_A=ParamSpec((cfg.r, d_in), init="kaiming"), + lora_B=ParamSpec((d_out, cfg.r), init="zeros"), # m is filled from ||W||_c during init(); shape (d_out,) - "lora_m": ParamSpec((d_out,), init="zeros", trainable=True), - } + lora_m=ParamSpec((d_out,), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/eva.py b/src/lora_lite/variants/eva.py index 8bc7a2b..68b4cf0 100644 --- a/src/lora_lite/variants/eva.py +++ b/src/lora_lite/variants/eva.py @@ -38,11 +38,11 @@ class EVA: @staticmethod def param_specs(d_in, d_out, cfg): - return { + return dict( # A trainable per peft: EVA only changes the init. - "lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), - } + lora_A=ParamSpec((cfg.r, d_in), init="zeros"), + lora_B=ParamSpec((d_out, cfg.r), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/hra.py b/src/lora_lite/variants/hra.py index 11efa98..8329c43 100644 --- a/src/lora_lite/variants/hra.py +++ b/src/lora_lite/variants/hra.py @@ -43,11 +43,11 @@ class HRA: f"HRA symmetric init requires even r; got r={cfg.r}. " "Pick an even rank or use a different variant." ) - return { + return dict( # Householder vectors stacked as rows (one vector per rank slot) # init done in init() to enforce paired rows -> R = I at t=0. - "lora_U": ParamSpec((cfg.r, d_in), init="zeros", trainable=True), - } + lora_U=ParamSpec((cfg.r, d_in), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/ia3.py b/src/lora_lite/variants/ia3.py index 839ef64..8e93ee1 100644 --- a/src/lora_lite/variants/ia3.py +++ b/src/lora_lite/variants/ia3.py @@ -41,7 +41,7 @@ class IA3: @staticmethod def param_specs(d_in, d_out, cfg): - return {"lora_g": ParamSpec((d_out,), init="ones", trainable=True)} + return dict(lora_g=ParamSpec((d_out,), init="ones")) @staticmethod def init(layer: nn.Module, cfg) -> None: @@ -62,7 +62,7 @@ class IA3FF: @staticmethod def param_specs(d_in, d_out, cfg): - return {"lora_g": ParamSpec((d_in,), init="ones", trainable=True)} + return dict(lora_g=ParamSpec((d_in,), init="ones")) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/lora.py b/src/lora_lite/variants/lora.py index 34eaae0..209b48c 100644 --- a/src/lora_lite/variants/lora.py +++ b/src/lora_lite/variants/lora.py @@ -30,10 +30,10 @@ class LoRA: @staticmethod def param_specs(d_in, d_out, cfg): - return { - "lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), - } + return dict( + lora_A=ParamSpec((cfg.r, d_in), init="kaiming"), + lora_B=ParamSpec((d_out, cfg.r), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/pissa.py b/src/lora_lite/variants/pissa.py index 0aed219..7987ff2 100644 --- a/src/lora_lite/variants/pissa.py +++ b/src/lora_lite/variants/pissa.py @@ -37,10 +37,10 @@ class PiSSA: @staticmethod def param_specs(d_in, d_out, cfg): - return { - "lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True), - "lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True), - } + return dict( + lora_A=ParamSpec((cfg.r, d_in), init="zeros"), + lora_B=ParamSpec((d_out, cfg.r), init="zeros"), + ) @staticmethod def init(layer: nn.Module, cfg) -> None: diff --git a/src/lora_lite/variants/road.py b/src/lora_lite/variants/road.py index e87bd35..6042576 100644 --- a/src/lora_lite/variants/road.py +++ b/src/lora_lite/variants/road.py @@ -116,10 +116,10 @@ class ROAD: def param_specs(d_in: int, d_out: int, cfg: RoadConfig) -> dict[str, ParamSpec]: _validate_group_geometry(d_out, cfg.group_size) size = _road_param_size(d_out, cfg.road_variant) - return { - "lora_road_theta": ParamSpec((size,), init="zeros", trainable=True), - "lora_road_alpha": ParamSpec((size,), init="ones", trainable=True), - } + return dict( + lora_road_theta=ParamSpec((size,), init="zeros"), + lora_road_alpha=ParamSpec((size,), init="ones"), + ) @staticmethod def init(layer: nn.Module, cfg: RoadConfig) -> None: