mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 15:15:55 +08:00
tidy
This commit is contained in:
@@ -73,6 +73,26 @@ metamath-queue variant="lora" steps="5000" model="Qwen/Qwen3-0.6B-Base":
|
||||
-w "$PWD" -o 1 -- \
|
||||
uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py --model {{model}} --variant {{variant}} --steps {{steps}}
|
||||
|
||||
# Run a single MetaMathQA->GSM8K benchmark for a given variant.
|
||||
# Per-variant lr / target-name defaults are baked in here.
|
||||
bench-variant model variant steps="5000":
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
lr=1e-4
|
||||
target='(q_proj|v_proj)$'
|
||||
case "{{variant}}" in
|
||||
delora) lr=1e-3 ;;
|
||||
ia3) lr=1e-3; target='(k_proj|v_proj)$' ;;
|
||||
ia3_ff) lr=1e-3; target='(down_proj)$' ;;
|
||||
esac
|
||||
exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \
|
||||
--model '{{model}}' \
|
||||
--variant '{{variant}}' \
|
||||
--steps {{steps}} \
|
||||
--lr "$lr" \
|
||||
--target-name "$target" \
|
||||
--layers all --r 32 --alpha 64
|
||||
|
||||
metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto":
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
@@ -80,5 +100,5 @@ metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora piss
|
||||
pueue add \
|
||||
-l "why: benchmark {{model}} ${variant} on MetaMathQA->GSM8K at {{steps}} steps; resolve: outputs/metamath_gsm8k/results/benchmark_results.tsv gets a row with accuracy commit time method argv and result JSON for ${variant}" \
|
||||
-w "$PWD" -o 1 -- \
|
||||
bash scripts/bench_variant.sh '{{model}}' "$variant" {{steps}}
|
||||
just bench-variant '{{model}}' "$variant" {{steps}}
|
||||
done
|
||||
@@ -16,6 +16,7 @@ from typing import Any, Literal
|
||||
|
||||
import torch
|
||||
from tabulate import tabulate
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
import lora_lite as ll
|
||||
|
||||
@@ -297,7 +298,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]],
|
||||
last_loss = math.nan
|
||||
train_total_tokens = 0
|
||||
probe_batch = batches[0]
|
||||
for step, batch in enumerate(batches):
|
||||
pbar = tqdm(batches, desc="train", mininterval=60.0, dynamic_ncols=True)
|
||||
for step, batch in enumerate(pbar):
|
||||
opt.zero_grad()
|
||||
loss = model(
|
||||
input_ids=batch["input_ids"],
|
||||
@@ -319,8 +321,8 @@ def train(model: torch.nn.Module, batches: list[dict[str, torch.Tensor | int]],
|
||||
scheduler.step()
|
||||
last_loss = loss.item()
|
||||
train_total_tokens += int(batch["label_tokens"])
|
||||
if args.log_every and (step + 1) % args.log_every == 0:
|
||||
print(f"TRAIN step={step + 1} loss={last_loss:.6g} grad={grad_norm:.6g} tokens={train_total_tokens}", flush=True)
|
||||
pbar.set_postfix(loss=f"{last_loss:.4g}", grad=f"{grad_norm:.3g}", tok=train_total_tokens)
|
||||
pbar.close()
|
||||
after = adapter_state(model)
|
||||
adapter_delta = sum((after[k] - before[k]).float().norm().item() for k in before)
|
||||
model.eval()
|
||||
@@ -435,8 +437,10 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non
|
||||
# BLUF: status line first so log tails are immediately readable
|
||||
cue = "🟢" if row.get("base_grad_leaks", 0) == 0 and row.get("grad", 0) > 0 else "🔴"
|
||||
n = row.get("samples", "?")
|
||||
print()
|
||||
print(f"{cue} test_acc={row['test_acc']:.4g} valid_acc={row['valid_acc']:.4g} grad={row['grad']:.3g} dθ={row['dθ']:.3g} base_grad_leaks={row['base_grad_leaks']} N={n}")
|
||||
print("SHOULD: grad>0, dθ>0, base_grad_leaks=0; test/valid_acc meaningful only in benchmark mode. ELSE adapter or eval wiring is dead/wrong.")
|
||||
print()
|
||||
# ordered: most important / shortest columns first
|
||||
display_keys = ["variant", "test_acc", "valid_acc", "grad", "dθ", "base_grad_leaks", "steps", "samples", "loss0", "lossN", "commit"]
|
||||
if "perturb" in row:
|
||||
@@ -444,6 +448,7 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non
|
||||
display_keys += ["run_id"]
|
||||
display_row = {k: row[k] for k in display_keys if k in row}
|
||||
print(tabulate([display_row], headers="keys", tablefmt="tsv", floatfmt=".4g"))
|
||||
print()
|
||||
print(f"argv: {' '.join(sys.argv)} N={n} mode={mode}")
|
||||
print(f"out: {result_path}")
|
||||
|
||||
|
||||
@@ -79,15 +79,15 @@ class AntiPaSTO:
|
||||
raise ValueError(f"AntiPaSTO requires r={r} divisible by block_size={bs}")
|
||||
n_blocks = r // bs
|
||||
n_triu = bs * (bs - 1) // 2
|
||||
return {
|
||||
return dict(
|
||||
# Frozen SVD components captured at init.
|
||||
"lora_U": ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
|
||||
"lora_S": ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
|
||||
"lora_Vh": ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
|
||||
lora_U=ParamSpec((d_out, r), init="zeros", trainable=False, as_buffer=True),
|
||||
lora_S=ParamSpec((r,), init="zeros", trainable=False, as_buffer=True),
|
||||
lora_Vh=ParamSpec((r, d_in), init="zeros", trainable=False, as_buffer=True),
|
||||
# Trainable: per-singular-value delta + block-diagonal Cayley rotation.
|
||||
"lora_delta_s": ParamSpec((r,), init="zeros", trainable=True),
|
||||
"lora_rot_T": ParamSpec((n_blocks, n_triu), init="zeros", trainable=True),
|
||||
}
|
||||
lora_delta_s=ParamSpec((r,), init="zeros"),
|
||||
lora_rot_T=ParamSpec((n_blocks, n_triu), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -43,15 +43,13 @@ class DeLoRA:
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
lam0 = float(cfg.lambda0)
|
||||
return {
|
||||
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
|
||||
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
|
||||
"lora_lambda": ParamSpec(
|
||||
(), init=lambda t: t.fill_(lam0), trainable=True
|
||||
),
|
||||
return dict(
|
||||
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
|
||||
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
|
||||
lora_lambda=ParamSpec((), init=lambda t: t.fill_(lam0)),
|
||||
# ||W||_2 per input channel; frozen buffer captured at init.
|
||||
"lora_wnorm": ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True),
|
||||
}
|
||||
lora_wnorm=ParamSpec((d_in,), init="ones", trainable=False, as_buffer=True),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -30,12 +30,12 @@ class DoRA:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {
|
||||
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
|
||||
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
|
||||
return dict(
|
||||
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
|
||||
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
|
||||
# m is filled from ||W||_c during init(); shape (d_out,)
|
||||
"lora_m": ParamSpec((d_out,), init="zeros", trainable=True),
|
||||
}
|
||||
lora_m=ParamSpec((d_out,), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -38,11 +38,11 @@ class EVA:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {
|
||||
return dict(
|
||||
# A trainable per peft: EVA only changes the init.
|
||||
"lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
|
||||
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
|
||||
}
|
||||
lora_A=ParamSpec((cfg.r, d_in), init="zeros"),
|
||||
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -43,11 +43,11 @@ class HRA:
|
||||
f"HRA symmetric init requires even r; got r={cfg.r}. "
|
||||
"Pick an even rank or use a different variant."
|
||||
)
|
||||
return {
|
||||
return dict(
|
||||
# Householder vectors stacked as rows (one vector per rank slot)
|
||||
# init done in init() to enforce paired rows -> R = I at t=0.
|
||||
"lora_U": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
|
||||
}
|
||||
lora_U=ParamSpec((cfg.r, d_in), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -41,7 +41,7 @@ class IA3:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {"lora_g": ParamSpec((d_out,), init="ones", trainable=True)}
|
||||
return dict(lora_g=ParamSpec((d_out,), init="ones"))
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
@@ -62,7 +62,7 @@ class IA3FF:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {"lora_g": ParamSpec((d_in,), init="ones", trainable=True)}
|
||||
return dict(lora_g=ParamSpec((d_in,), init="ones"))
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -30,10 +30,10 @@ class LoRA:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {
|
||||
"lora_A": ParamSpec((cfg.r, d_in), init="kaiming", trainable=True),
|
||||
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
|
||||
}
|
||||
return dict(
|
||||
lora_A=ParamSpec((cfg.r, d_in), init="kaiming"),
|
||||
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -37,10 +37,10 @@ class PiSSA:
|
||||
|
||||
@staticmethod
|
||||
def param_specs(d_in, d_out, cfg):
|
||||
return {
|
||||
"lora_A": ParamSpec((cfg.r, d_in), init="zeros", trainable=True),
|
||||
"lora_B": ParamSpec((d_out, cfg.r), init="zeros", trainable=True),
|
||||
}
|
||||
return dict(
|
||||
lora_A=ParamSpec((cfg.r, d_in), init="zeros"),
|
||||
lora_B=ParamSpec((d_out, cfg.r), init="zeros"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg) -> None:
|
||||
|
||||
@@ -116,10 +116,10 @@ class ROAD:
|
||||
def param_specs(d_in: int, d_out: int, cfg: RoadConfig) -> dict[str, ParamSpec]:
|
||||
_validate_group_geometry(d_out, cfg.group_size)
|
||||
size = _road_param_size(d_out, cfg.road_variant)
|
||||
return {
|
||||
"lora_road_theta": ParamSpec((size,), init="zeros", trainable=True),
|
||||
"lora_road_alpha": ParamSpec((size,), init="ones", trainable=True),
|
||||
}
|
||||
return dict(
|
||||
lora_road_theta=ParamSpec((size,), init="zeros"),
|
||||
lora_road_alpha=ParamSpec((size,), init="ones"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def init(layer: nn.Module, cfg: RoadConfig) -> None:
|
||||
|
||||
Reference in New Issue
Block a user