This commit is contained in:
wassname
2026-04-27 11:44:40 +08:00
parent a342801807
commit 0bd091fe5b
11 changed files with 68 additions and 45 deletions
+21 -1
View File
@@ -73,6 +73,26 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base":
-w "$PWD" -o 1 -- \
uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py --model {{model}} --variant {{variant}} --steps {{steps}}
# Run a single MetaMathQA->GSM8K benchmark for a given variant.
#
# Parameters:
#   model   - forwarded to the benchmark script as --model.
#   variant - forwarded as --variant; also selects the learning rate and the
#             target-name regex via the case statement in the body.
#   steps   - forwarded as --steps (defaults to "5000").
#
# Baseline settings are lr=1e-4 with target '(q_proj|v_proj)$'. Overrides:
#   delora -> lr=1e-3
#   ia3    -> lr=1e-3, target '(k_proj|v_proj)$'
#   ia3_ff -> lr=1e-3, target '(down_proj)$'
# Any other variant name keeps the baseline. Every variant is run with
# --layers all --r 32 --alpha 64.
# Per-variant lr / target-name defaults are baked in here.
bench-variant model variant steps="5000":
#!/usr/bin/env bash
set -euo pipefail
# Baseline hyperparameters, used unless the variant matches a case arm below.
lr=1e-4
target='(q_proj|v_proj)$'
# Per-variant overrides; there is no catch-all arm, so unmatched names fall through.
case "{{variant}}" in
delora) lr=1e-3 ;;
ia3) lr=1e-3; target='(k_proj|v_proj)$' ;;
ia3_ff) lr=1e-3; target='(down_proj)$' ;;
esac
# exec replaces this shell with the benchmark process, so its exit status is the recipe's.
exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \
--model '{{model}}' \
--variant '{{variant}}' \
--steps {{steps}} \
--lr "$lr" \
--target-name "$target" \
--layers all --r 32 --alpha 64
metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto":
#!/usr/bin/env bash
set -euo pipefail
@@ -80,5 +100,5 @@ metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora piss
pueue add \
-l "why: benchmark {{model}} ${variant} on MetaMathQA->GSM8K at {{steps}} steps; resolve: outputs/metamath_gsm8k/results/benchmark_results.tsv gets a row with accuracy commit time method argv and result JSON for ${variant}" \
-w "$PWD" -o 1 -- \
bash scripts/bench_variant.sh '{{model}}' "$variant" {{steps}}
just bench-variant '{{model}}' "$variant" {{steps}}
done
+8 -3
View File
@@ -16,6 +16,7 @@ from typing import Any, Literal
import torch
from tabulate import tabulate
from tqdm.auto import tqdm
import lora_lite as ll
@@ -297,7 +298,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]],
last_loss = math.nan
train_total_tokens = 0
probe_batch = batches[0]
for step, batch in enumerate(batches):
pbar = tqdm(batches, desc="train", mininterval=60.0, dynamic_ncols=True)
for step, batch in enumerate(pbar):
opt.zero_grad()
loss = model(
input_ids=batch["input_ids"],
@@ -319,8 +321,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]],
scheduler.step()
last_loss = loss.item()
train_total_tokens += int(batch["label_tokens"])
if args.log_every and (step + 1) % args.log_every == 0:
print(f"TRAIN step={step + 1} loss={last_loss:.6g} grad={grad_norm:.6g} tokens={train_total_tokens}", flush=True)
pbar.set_postfix(loss=f"{last_loss:.4g}", grad=f"{grad_norm:.3g}", tok=train_total_tokens)
pbar.close()
after = adapter_state(model)
adapter_delta = sum((after[k] - before[k]).float().norm().item() for k in before)
model.eval()
@@ -435,8 +437,10 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non
# BLUF: status line first so log tails are immediately readable
cue = "🟢" if row.get("base_grad_leaks", 0) == 0 and row.get("grad", 0) > 0 else "🔴"
n = row.get("samples", "?")
print()
print(f"{cue} test_acc={row['test_acc']:.4g} valid_acc={row['valid_acc']:.4g} grad={row['grad']:.3g} dθ={row['']:.3g} base_grad_leaks={row['base_grad_leaks']} N={n}")
print("SHOULD: grad>0, dθ>0, base_grad_leaks=0; test/valid_acc meaningful only in benchmark mode. ELSE adapter or eval wiring is dead/wrong.")
print()
# ordered: most important / shortest columns first
display_keys = ["variant", "test_acc", "valid_acc", "grad", "", "base_grad_leaks", "steps", "samples", "loss0", "lossN", "commit"]
if "perturb" in row:
@@ -444,6 +448,7 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non
display_keys += ["run_id"]
display_row = {k: row[k] for k in display_keys if k in row}
print(tabulate([display_row], headers="keys", tablefmt="tsv", floatfmt=".4g"))
print()
print(f"argv: {' '.join(sys.argv)} N={n} mode={mode}")
print(f"out: {result_path}")
+7 -7
View File
@@ -79,15 +79,15 @@ class AntiPaSTO:
raise ValueError(f"AntiPaSTO requires r={r} divisible by block_size={bs}")
n_blocks = r // bs
n_triu = bs * (bs - 1) // 2
return {
return dict(
# Frozen SVD components captured at init.
"lora_U": ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
"lora_S": ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
"lora_Vh": ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
lora_U=ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
lora_S=ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
lora_Vh=ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
# Trainable: per-singular-value delta + block-diagonal Cayley rotation.
"lora_delta_s": ParamSpec((r,), init="zeros", trainable=True),
"lora_rot_T": ParamSpec((n_blocks, n_triu), init="zeros", trainable=True),
}
lora_delta_s=ParamSpec((r,), init="zeros"),
lora_rot_T=ParamSpec((n_blocks, n_triu), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+6 -8
View File
@@ -43,15 +43,13 @@ class DeLoRA:
@staticmethod
def param_specs(d_in, d_out, cfg):
lam0 = float(cfg.lambda0)
return {
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
"lora_lambda": ParamSpec(
(), init=lambda t: t.fill_(lam0), trainable=True
),
return dict(
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
lora_lambda=ParamSpec((), init=lambda t: t.fill_(lam0)),
# ||W||_2 per input channel; frozen buffer captured at init.
"lora_wnorm": ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True),
}
lora_wnorm=ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+5 -5
View File
@@ -30,12 +30,12 @@ class DoRA:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
return dict(
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
# m is filled from ||W||_c during init(); shape (d_out,)
"lora_m": ParamSpec((d_out,), init="zeros", trainable=True),
}
lora_m=ParamSpec((d_out,), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+4 -4
View File
@@ -38,11 +38,11 @@ class EVA:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {
return dict(
# A trainable per peft: EVA only changes the init.
"lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
}
lora_A=ParamSpec((cfg.r, d_in), init="zeros"),
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+3 -3
View File
@@ -43,11 +43,11 @@ class HRA:
f"HRA symmetric init requires even r; got r={cfg.r}. "
"Pick an even rank or use a different variant."
)
return {
return dict(
# Householder vectors stacked as rows (one vector per rank slot)
# init done in init() to enforce paired rows -> R = I at t=0.
"lora_U": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
}
lora_U=ParamSpec((cfg.r, d_in), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+2 -2
View File
@@ -41,7 +41,7 @@ class IA3:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {"lora_g": ParamSpec((d_out,), init="ones", trainable=True)}
return dict(lora_g=ParamSpec((d_out,), init="ones"))
@staticmethod
def init(layer: nn.Module, cfg) -> None:
@@ -62,7 +62,7 @@ class IA3FF:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {"lora_g": ParamSpec((d_in,), init="ones", trainable=True)}
return dict(lora_g=ParamSpec((d_in,), init="ones"))
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+4 -4
View File
@@ -30,10 +30,10 @@ class LoRA:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
}
return dict(
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+4 -4
View File
@@ -37,10 +37,10 @@ class PiSSA:
@staticmethod
def param_specs(d_in, d_out, cfg):
return {
"lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
}
return dict(
lora_A=ParamSpec((cfg.r, d_in), init="zeros"),
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
)
@staticmethod
def init(layer: nn.Module, cfg) -> None:
+4 -4
View File
@@ -116,10 +116,10 @@ class ROAD:
def param_specs(d_in: int, d_out: int, cfg: RoadConfig) -> dict[str, ParamSpec]:
_validate_group_geometry(d_out, cfg.group_size)
size = _road_param_size(d_out, cfg.road_variant)
return {
"lora_road_theta": ParamSpec((size,), init="zeros", trainable=True),
"lora_road_alpha": ParamSpec((size,), init="ones", trainable=True),
}
return dict(
lora_road_theta=ParamSpec((size,), init="zeros"),
lora_road_alpha=ParamSpec((size,), init="ones"),
)
@staticmethod
def init(layer: nn.Module, cfg: RoadConfig) -> None: