From 32b1fd885af3221c1deaff8606a7d3fd02144cd1 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Mon, 15 Jun 2026 05:58:34 +0800 Subject: [PATCH] justfile: route antipasto bench through r=256/alpha=256 in bench-variant The README GSM8K sweep was queued as raw expanded commands with an unquoted --target-name '(q_proj|v_proj)$'; pueue runs via sh -c, so the parens errored instantly before training. Routing through bench-variant (bash shebang quotes the target) fixes it. Also bake the antipasto family's r=256/alpha=256 into the case block so it matches the published AntiPaSTO row, replacing the dead trailing "$@" (shebang recipes get no extra args). Co-Authored-By: Claudypoo --- justfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/justfile b/justfile index b337ac1..4d6f30a 100644 --- a/justfile +++ b/justfile @@ -80,6 +80,7 @@ bench-variant model variant steps="5000": set -euo pipefail lr=1e-4 target='(q_proj|v_proj)$' + r=32; alpha=64 # IA3 lr: paper uses 3e-3 to 1e-2 (Liu et al. 2022 §3.3). Also a hard # bf16 floor: lora_g inits to 1.0 where bf16 spacing is ~7.8e-3, so # AdamW updates with lr<<3.9e-3 round back to 1.0 and the param freezes. @@ -88,7 +89,10 @@ bench-variant model variant steps="5000": delora) lr=1e-3 ;; ia3) lr=5e-3; target='(k_proj|v_proj)$' ;; ia3_ff) lr=5e-3; target='(down_proj)$' ;; - antipasto*) lr=5e-3 ;; # small params (gain/block) need higher lr; covers all antipasto_* cores + # antipasto cores tune only S-space gain/block (tiny params), so a small + # r leaves almost nothing trainable; r=256 is the variant default and + # matches the published AntiPaSTO row. alpha=r (no extra scaling). 
+ antipasto*) lr=5e-3; r=256; alpha=256 ;; esac exec uv run --extra benchmark python scripts/metamath_gsm8k_benchmark.py \ --model '{{model}}' \ @@ -96,7 +100,7 @@ bench-variant model variant steps="5000": --steps {{steps}} \ --lr "$lr" \ --target-name "$target" \ - --layers all --r 32 --alpha 64 "$@" + --layers all --r "$r" --alpha "$alpha" metamath-queue-all model="Qwen/Qwen3-0.6B-Base" steps="5000" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto": #!/usr/bin/env bash