justfile: bench-variant takes a target_override arg, routed to its own out dir

The LoRA-XS paper's recipe spreads across q/k/v/o plus all three FFN projections,
not down_proj only. run_id does not encode the target, so any run with a non-empty
target_override is written to outputs/metamath_gsm8k_alllinear to avoid clobbering
the canonical down_proj results that the README table is built from.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-18 21:53:30 +08:00
parent a75bed492b
commit 5d910996b3
+7 -1
View File
@@ -83,7 +83,7 @@ metamath-queue variant="lora" steps="5000" model=model:
# Run a single MetaMathQA->GSM8K benchmark for a given variant. # Run a single MetaMathQA->GSM8K benchmark for a given variant.
# Per-variant lr / target-name defaults are baked in here. # Per-variant lr / target-name defaults are baked in here.
bench-variant model variant steps="5000" r_override="" lr_override="" rotate_basis="V" seed="0": bench-variant model variant steps="5000" r_override="" lr_override="" rotate_basis="V" seed="0" target_override="":
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail set -euo pipefail
lr=1e-4 lr=1e-4
@@ -91,6 +91,7 @@ bench-variant model variant steps="5000" r_override="" lr_override="" rotate_bas
# only 6 full-attention layers) and CorDA/ASVD's canonical highest-d_in target. # only 6 full-attention layers) and CorDA/ASVD's canonical highest-d_in target.
target='(down_proj)$' target='(down_proj)$'
r=32; alpha=64 r=32; alpha=64
out_dir=outputs/metamath_gsm8k
# IA3 lr: paper uses 3e-3 to 1e-2 (Liu et al. 2022 §3.3). Also a hard # IA3 lr: paper uses 3e-3 to 1e-2 (Liu et al. 2022 §3.3). Also a hard
# bf16 floor: lora_g inits to 1.0 where bf16 spacing is ~7.8e-3, so # bf16 floor: lora_g inits to 1.0 where bf16 spacing is ~7.8e-3, so
# AdamW updates with lr<<3.9e-3 round back to 1.0 and the param freezes. # AdamW updates with lr<<3.9e-3 round back to 1.0 and the param freezes.
@@ -112,6 +113,10 @@ bench-variant model variant steps="5000" r_override="" lr_override="" rotate_bas
if [ -n "{{r_override}}" ]; then r="{{r_override}}"; alpha="{{r_override}}"; fi if [ -n "{{r_override}}" ]; then r="{{r_override}}"; alpha="{{r_override}}"; fi
# lr override (e.g. a tamer lr than antipasto's 5e-3 default). # lr override (e.g. a tamer lr than antipasto's 5e-3 default).
if [ -n "{{lr_override}}" ]; then lr="{{lr_override}}"; fi if [ -n "{{lr_override}}" ]; then lr="{{lr_override}}"; fi
# target override (e.g. LoRA-XS paper spreads across all q/k/v/o + FFN linears, not
# just down_proj). run_id ignores target, so route overrides to their own dir to
# avoid clobbering the canonical down_proj results the README table is built from.
if [ -n "{{target_override}}" ]; then target="{{target_override}}"; out_dir=outputs/metamath_gsm8k_alllinear; fi
# 0.8B + large vocab: HF ForCausalLMLoss upcasts logits to fp32 (bs*seq*vocab*4), # 0.8B + large vocab: HF ForCausalLMLoss upcasts logits to fp32 (bs*seq*vocab*4),
# which OOMs the 24GB card at the old bs=4/seq=768. micro-batch 2 fits at ~10GB; # which OOMs the 24GB card at the old bs=4/seq=768. micro-batch 2 fits at ~10GB;
# grad-accum 4 -> effective batch 8 (optimization quality without the memory). # grad-accum 4 -> effective batch 8 (optimization quality without the memory).
@@ -126,6 +131,7 @@ bench-variant model variant steps="5000" r_override="" lr_override="" rotate_bas
--batch-size 2 --grad-accum 4 --max-seq-length 512 --batch-size-eval 16 \ --batch-size 2 --grad-accum 4 --max-seq-length 512 --batch-size-eval 16 \
--layers all --r "$r" --alpha "$alpha" \ --layers all --r "$r" --alpha "$alpha" \
--antipasto-rotate-basis '{{rotate_basis}}' \ --antipasto-rotate-basis '{{rotate_basis}}' \
--output-dir "$out_dir" \
--seed {{seed}} --seed {{seed}}
metamath-queue-all model=model steps="2500" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto": metamath-queue-all model=model steps="2500" variants="lora pissa delora dora hra ia3 ia3_ff eva antipasto":