mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 16:15:50 +08:00
feat: near_zero/near_one init for trainable params (breaks bf16 dead-grad symmetry)
Trainable params that were initialized at exactly 0 or 1 now use near_zero (N(0,1e-4)) or near_one (1 + N(0,1e-4)) to break bf16 symmetry without meaningfully breaking identity-at-t=0.

Exact-zero init is kept where zero IS the identity constraint (DeLoRA lora_B, EVA lora_B -- both are scaled by other params, so any nonzero B would blow up the output).

- AntiPaSTO: delta_s and rot_T are now near_zero. The old exact-zero init could leave rotation learning dead in bf16, where step sizes round back to zero.
- IA3: lora_g is now near_one instead of exact ones. This avoids the bf16 spacing issue around 1.0, where eps_bf16 ~ 7.8e-3 and lr=1e-3 updates were rounding away.
- PiSSA: lora_A and lora_B are now near_zero (both are overwritten by SVD in init(), so the init value is moot -- but ParamSpec now documents intent correctly).
- HRA: lora_U is now near_zero (overwritten by the symmetric init in init()).
- ParamSpec: added 'near_zero' and 'near_one' init modes. The default changed from 'zeros' to 'near_zero'.

Tests relaxed their identity tolerances accordingly.
This commit is contained in:
@@ -442,7 +442,7 @@ def print_final_report(row: dict[str, Any], result_path: Path, mode: str) -> Non
|
||||
print("SHOULD: grad>0, dθ>0, base_grad_leaks=0; test/valid_acc meaningful only in benchmark mode. ELSE adapter or eval wiring is dead/wrong.")
|
||||
print()
|
||||
# ordered: most important / shortest columns first
|
||||
display_keys = ["variant", "test_acc", "valid_acc", "grad", "dθ", "base_grad_leaks", "steps", "samples", "loss0", "lossN", "commit"]
|
||||
display_keys = ["variant", "test_acc", "valid_acc", "params_M", "peak_mem_GB", "grad", "dθ", "base_grad_leaks", "steps", "samples", "loss0", "lossN", "commit"]
|
||||
if "perturb" in row:
|
||||
display_keys += ["perturb", "reload"]
|
||||
display_keys += ["run_id"]
|
||||
@@ -480,6 +480,8 @@ def append_results_row(
|
||||
"method": args.variant,
|
||||
"steps": args.steps,
|
||||
"samples": result["train_samples"],
|
||||
"params_M": round(result["trainable_param_count"] / 1e6, 4),
|
||||
"peak_mem_GB": round(result.get("peak_cuda_mem_gb", 0.0), 3),
|
||||
"model": args.model,
|
||||
"commit": run_commit[:12],
|
||||
"wall_time_s": round(result["wall_time_s"]),
|
||||
@@ -530,10 +532,13 @@ def run(args: BenchmarkConfig) -> dict[str, Any]:
|
||||
probe_metrics = probe_before_train(model, batches[0], attached["targets"])
|
||||
model.train()
|
||||
|
||||
if args.device == "cuda":
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
started = time.time()
|
||||
train_metrics = train(model, batches, args)
|
||||
valid_metrics = evaluate(model, tokenizer, datasets["valid"], args, "valid")
|
||||
test_metrics = evaluate(model, tokenizer, datasets["test"], args, "test")
|
||||
peak_mem_gb = (torch.cuda.max_memory_allocated() / 1024**3) if args.device == "cuda" else 0.0
|
||||
|
||||
adapter_path = out_dir / "adapter.safetensors"
|
||||
ll.save(model, str(adapter_path))
|
||||
@@ -581,6 +586,7 @@ def run(args: BenchmarkConfig) -> dict[str, Any]:
|
||||
"probe": probe_metrics,
|
||||
"adapter_path": str(adapter_path),
|
||||
"wall_time_s": time.time() - started,
|
||||
"peak_cuda_mem_gb": peak_mem_gb,
|
||||
}
|
||||
result_path = out_dir / "result.json"
|
||||
result_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
|
||||
@@ -604,6 +610,8 @@ def run(args: BenchmarkConfig) -> dict[str, Any]:
|
||||
"base_grad_leaks": train_metrics["base_grad_leaks"],
|
||||
"valid_acc": valid_metrics["accuracy"],
|
||||
"test_acc": test_metrics["accuracy"],
|
||||
"params_M": round(result["trainable_param_count"] / 1e6, 4),
|
||||
"peak_mem_GB": round(peak_mem_gb, 3),
|
||||
"commit": run_commit[:12],
|
||||
"result": str(result_path),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user