diff --git a/justfile b/justfile index 1e45692..d516e0a 100644 --- a/justfile +++ b/justfile @@ -88,6 +88,7 @@ bench-variant model variant steps="5000": delora) lr=1e-3 ;; ia3) lr=5e-3; target='(k_proj|v_proj)$' ;; ia3_ff) lr=5e-3; target='(down_proj)$' ;; + antipasto) lr=5e-3 ;; # small params need higher lr esac exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \ --model '{{model}}' \ @@ -95,7 +96,7 @@ bench-variant model variant steps="5000": --steps {{steps}} \ --lr "$lr" \ --target-name "$target" \ - --layers all --r 32 --alpha 64 + --layers all --r 32 --alpha 64 "$@" metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto": #!/usr/bin/env bash diff --git a/scripts/metamath_gsm8k_benchmark.py b/scripts/metamath_gsm8k_benchmark.py index ba08197..2bf0f08 100644 --- a/scripts/metamath_gsm8k_benchmark.py +++ b/scripts/metamath_gsm8k_benchmark.py @@ -52,6 +52,7 @@ class BenchmarkConfig: alpha: float = 64.0 delora_lambda0: float = 0.1 road_group_size: int = 64 + antipasto_rotate_basis: Literal["V", "U", "none"] = "V" target_name: list[str] = field(default_factory=lambda: list(DEFAULT_TARGETS)) layers: str = "all" train_dataset: str = "meta-math/MetaMathQA" @@ -123,6 +124,8 @@ def cfg_for_variant(args: BenchmarkConfig, dtype: torch.dtype) -> ll.AdapterConf extra = {"lambda0": args.delora_lambda0} if args.variant == "delora" else {} if args.variant == "road": extra = {"group_size": args.road_group_size} + if args.variant == "antipasto": + extra = {"rotate_basis": args.antipasto_rotate_basis} return CFG_BY_VARIANT[args.variant]( r=args.r, alpha=args.r if args.variant == "pissa" else args.alpha,