diff --git a/.comprehension_catalog.json b/.comprehension_catalog.json new file mode 100644 index 0000000..fcb5941 --- /dev/null +++ b/.comprehension_catalog.json @@ -0,0 +1,1004 @@ +{ + "mode": "editor", + "full_text_verified": true, + "claims_checked": [ + { + "claim": "We want to understand how transformers work. There are many approaches -- probing, ablation, SAEs --", + "number": "2022", + "status": "number_found_in_paper" + }, + { + "claim": "GDM's interpretability team recently pivoted toward \"pragmatic interpretability\" -- directly solving", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "GDM's interpretability team recently pivoted toward \"pragmatic interpretability\" -- directly solving", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "If the adapter generalizes out-of-distribution, that tells us the geometric structure it exploits is", + "number": "2601.07473", + "status": "number_found_in_paper" + }, + { + "claim": "| PE | 1 | Parameter-efficient: competitive with full FT at <1% params |", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "| PE | 1 | Parameter-efficient: competitive with full FT at <1% params |", + "number": "1%", + "status": "number_found_in_paper" + }, + { + "claim": "| BL | 1 | Beats LoRA on raw performance at comparable budget |", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "| DE | 1.5 | Data-efficient: faster convergence or works with less data |", + "number": "1.5", + "status": "number_found_in_paper" + }, + { + "claim": "| WA | 1 | Widely adopted: used as baseline by other papers |", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "Total = sum of applicable dimensions (max 8). Higher = stronger evidence that the method's structura", + "number": "8", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Hu et al. 
2021](https://arxiv.org/abs/2106.09685) (ICLR 2022)", + "number": "2021", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Parameter-efficient (matches full FT with 0.01% params on GPT-3). Universal baseline a", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+WA=2 (parameter-efficient, universal baseline, but ceiling on hard tasks)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Qiu et al. 2023](https://arxiv.org/abs/2306.07280)", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "**See also:** BOFT ([Liu et al. 2023](https://arxiv.org/abs/2311.06243)), OFTv2 ([2025](https://arxi", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "R = cayley(Q_skew) # R = (I + Q_skew)(I - Q_skew)^{-1} \u2208 O(k)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors demonstrate OFT preserves \"hyperspherical energy\" (pairwise neuron angles) dur", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors demonstrate OFT preserves \"hyperspherical energy\" (pairwise neuron angles) dur", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE=2.5 (parameter-efficient, data-efficient: converges well with only 5% of training d", + "number": "2.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE=2.5 (parameter-efficient, data-efficient: converges well with only 5% of training d", + "number": "5%", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Kopiczko et al. 
2023](https://arxiv.org/abs/2310.11454) (ICLR 2024)", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** 10x fewer trainable parameters than LoRA while maintaining competitive performance acr", + "number": "10", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (extreme parameter efficiency, competitive with LoRA, random-projection ceiling on c", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Liu et al. 2024](https://arxiv.org/abs/2402.09353) (ICML 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "V\u0302_norm = norm(V, dim=1).detach() # column norms, detached", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF+WA=4.5 (beats LoRA across multiple domains, QDoRA slightly outperforms full FT o", + "number": "4.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF+WA=4.5 (beats LoRA across multiple domains, QDoRA slightly outperforms full FT o", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Bini, Girrbach, Akata 2025](https://arxiv.org/abs/2503.18225) (ICLR 2025)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** The *direction* of a weight update (which features to mix) and its *strength* (how f", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "\u00c2_norm = clamp(norm(A, dim=1), min=1e-4) # \u2208 \u211d^r", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "\u00c2_norm = clamp(norm(A, dim=1), min=1e-4) # \u2208 \u211d^r", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "B\u0302_norm = clamp(norm(B, dim=0), min=1e-4) # \u2208 \u211d^r", + "number": "0", + "status": "number_found_in_paper" + }, + { + "claim": "B\u0302_norm = clamp(norm(B, dim=0), min=1e-4) # \u2208 \u211d^r", + "number": "1", + "status": 
"number_found_in_paper" + }, + { + "claim": "scaling = (\u03bb / r) / (\u00c2_norm * B\u0302_norm) # normalize each rank-1 component", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "The key insight: $\\Delta W = B \\cdot \\text{diag}\\left(\\frac{\\lambda}{r \\cdot \\|a_i\\| \\cdot \\|b^j\\|}\\", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+DE=3.5 (beats LoRA on robustness, faster convergence via bounded deviation preventi", + "number": "3.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+DE=3.5 (beats LoRA on robustness, faster convergence via bounded deviation preventi", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Meng, Wang, Zhang 2024](https://arxiv.org/abs/2404.02948) (NeurIPS 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** PiSSA consistently outperforms LoRA across 11 models (184M--70B) on 5 NLG and 8 NLU ta", + "number": "11", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF+DE=5 (beats LoRA, approaches/beats full FT, faster convergence, NeurIPS 2024)", + "number": "5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF+DE=5 (beats LoRA, approaches/beats full FT, faster convergence, NeurIPS 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Lingam et al. 
2024](https://arxiv.org/abs/2405.19597)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [github.com/VijayLingam95/SVFT](https://github.com/VijayLingam95/SVFT/)", + "number": "95", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [github.com/VijayLingam95/SVFT](https://github.com/VijayLingam95/SVFT/)", + "number": "95", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** SVFT recovers up to 96% of full fine-tuning performance with only 0.006--0.25% of para", + "number": "96%", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL=2 (beats LoRA/DoRA on performance/parameter tradeoff, weight-aware structure)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Wang, Watanabe, Van hamme 2025](https://arxiv.org/abs/2509.02830)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "G_k = cayley(K) # G_k = (I-K)(I+K)^{-1} \u2208 O(k)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "Only $k(k-1)/2 + k$ parameters (skew-symmetric entries + singular value shifts). Uses Cayley-Neumann", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "Only $k(k-1)/2 + k$ parameters (skew-symmetric entries + singular value shifts). Uses Cayley-Neumann", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** SSVD achieves comparable performance to LoRA, DoRA, PiSSA, VeRA, and SVFT on domain-sh", + "number": "0.1", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** SSVD achieves comparable performance to LoRA, DoRA, PiSSA, VeRA, and SVFT on domain-sh", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+DE=3.5 (matches/beats LoRA with fewer params on domain-shifted ASR, faster converge", + "number": "3.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Liu et al. 
2022](https://arxiv.org/abs/2205.05638)", + "number": "2022", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [peft/tuners/ia3/layer.py](https://github.com/huggingface/peft/blob/main/src/peft/tuners/i", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [peft/tuners/ia3/layer.py](https://github.com/huggingface/peft/blob/main/src/peft/tuners/i", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/ia3_few_shot_peft.md](docs/ia3_few_shot_peft.md)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/ia3_few_shot_peft.md](docs/ia3_few_shot_peft.md)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim (IA)3 with T0-3B outperforms ICL with GPT-3 175B on Super-NaturalInstruc", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim (IA)3 with T0-3B outperforms ICL with GPT-3 175B on Super-NaturalInstruc", + "number": "0", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (parameter-efficient, strong on T5-family, expressivity-limited compared to LoRA/DoR", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (parameter-efficient, strong on T5-family, expressivity-limited compared to LoRA/DoR", + "number": "5", + "status": "number_found_in_paper" + }, + { + "claim": "*Implications:* IA3's success tells us that a surprisingly large fraction of \"task adaptation\" is ju", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "*Implications:* IA3's success tells us that a surprisingly large fraction of \"task adaptation\" is ju", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Petrushkov 2024](https://arxiv.org/abs/2409.00119)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Adaptation is a 
*rotation* of activation pairs, with independently controllable *ang", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "x1, x2 = split_groups(result, group_size) # split into paired halves", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "x1, x2 = split_groups(result, group_size) # split into paired halves", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "y1 = \u03b1 * cos(\u03b8) * x1 - \u03b1 * sin(\u03b8) * x2 # 2D rotation + scale", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "y1 = \u03b1 * cos(\u03b8) * x1 - \u03b1 * sin(\u03b8) * x2 # 2D rotation + scale", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "y2 = \u03b1 * sin(\u03b8) * x1 + \u03b1 * cos(\u03b8) * x2 # per pair", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "y2 = \u03b1 * sin(\u03b8) * x1 + \u03b1 * cos(\u03b8) * x2 # per pair", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (parameter-efficient, clean decoupling, competitive with LoRA, limited published ben", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "*Implications:* ROAD's decoupling of angle from magnitude is the cleanest formulation of the \"direct", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "*Implications:* ROAD's decoupling of angle from magnitude is the cleanest formulation of the \"direct", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Clark 2025](https://arxiv.org/abs/2601.07473)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Clark 2025](https://arxiv.org/abs/2601.07473)", + "number": "2601.07473", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** The model's own SVD basis defines the natural intervention manifold. 
Steering is bes", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** The model's own SVD basis defines the natural intervention manifold. Steering is bes", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "X = \u03b1 * A_v / 2 # scale skew-symmetric params", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "R_v = solve(I - X, I + X) # Cayley: (I - \u03b1A/2)^{-1}(I + \u03b1A/2) \u2208 O(r)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "R_v = solve(I - X, I + X) # Cayley: (I - \u03b1A/2)^{-1}(I + \u03b1A/2) \u2208 O(r)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "where $R_v(\\alpha) = (I - \\alpha A/2)^{-1}(I + \\alpha A/2)$ is the Cayley transform of skew-symmetri", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "where $R_v(\\alpha) = (I - \\alpha A/2)^{-1}(I + \\alpha A/2)$ is the Cayley transform of skew-symmetri", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** AntiPaSTO beats prompting baselines by 6.9x on DailyDilemmas honesty evaluation using ", + "number": "6.9", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** AntiPaSTO beats prompting baselines by 6.9x on DailyDilemmas honesty evaluation using ", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE+OOD=4.5 (OOD transfer from templates to real dilemmas, trains on 800 pairs, bidirec", + "number": "4.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE+OOD=4.5 (OOD transfer from templates to real dilemmas, trains on 800 pairs, bidirec", + "number": "800", + "status": "number_found_in_paper" + }, + { + "claim": "*Caveat:* Primary evidence is on models up to 4B parameters. The paper notes larger models \"need fur", + "number": "4", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Zhang et al. 
2023](https://arxiv.org/abs/2303.10512) (ICLR 2023)", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim AdaLoRA achieves comparable or better performance than LoRA with 30-50% ", + "number": "30", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim AdaLoRA achieves comparable or better performance than LoRA with 30-50% ", + "number": "50%", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (parameter-efficient, smarter budget allocation, added complexity for modest gains)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Liu et al. 2023](https://arxiv.org/abs/2311.06243) (ICLR 2024)", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Orthogonal transformations (OFT's key insight) are right, but the full block-diagona", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "Each butterfly layer has $d/2$ independent $2\\times2$ rotation blocks arranged in a permuted pattern", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "Each butterfly layer has $d/2$ independent $2\\times2$ rotation blocks arranged in a permuted pattern", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** BOFT matches or exceeds OFT performance on DreamBooth and ControlNet with 2-4x fewer p", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** BOFT matches or exceeds OFT performance on DreamBooth and ControlNet with 2-4x fewer p", + "number": "4", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF+DE=4 (outperforms full FT baseline on almost all vision tasks; faster and more stab", + "number": "4", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF+DE=4 (outperforms full FT baseline on almost all vision tasks; faster and more stab", + "number": 
"2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Ma et al. 2024](https://arxiv.org/abs/2404.04316) (ICML 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Any orthogonal transformation in $SO(d)$ can be decomposed into $O(d)$ Givens rotati", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Any orthogonal transformation in $SO(d)$ can be decomposed into $O(d)$ Givens rotati", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "G = givens_rotation(d, i, j, \u03b8) # identity except 2x2 block at (i,j)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "G = givens_rotation(d, i, j, \u03b8) # identity except 2x2 block at (i,j)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim GOFT outperforms OFT and BOFT on LLaMA-2-7B SFT (MT-Bench, AlpacaEval), ", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim GOFT outperforms OFT and BOFT on LLaMA-2-7B SFT (MT-Bench, AlpacaEval), ", + "number": "7", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (most parameter-efficient orthogonal method, strong results, ICML 2024)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (most parameter-efficient orthogonal method, strong results, ICML 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Yuan et al. 
2024](https://arxiv.org/abs/2405.17484)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Orthogonal adaptations are equivalent to specific low-rank adaptations when paramete", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Orthogonal adaptations are equivalent to specific low-rank adaptations when paramete", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "H_i = eye(d) - 2 * outer(v, v) / dot(v, v) # Householder reflector", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (bridges orthogonal and low-rank paradigms, competitive performance)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Albert et al. 2025](https://arxiv.org/abs/2502.00987) (ICLR 2025)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** RandLoRA outperforms LoRA as parameter budget expands, while remaining parameter-effic", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** RandLoRA outperforms LoRA as parameter budget expands, while remaining parameter-effic", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF=2.5 (full-rank update bridges gap with full FT on CLIP; loss landscape closer to fu", + "number": "2.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF=2.5 (full-rank update bridges gap with full FT on CLIP; loss landscape closer to fu", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Gao et al. 
2024](https://arxiv.org/abs/2405.03003) (ICML 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "\u0394W = real(ifft2(spectrum)) # inverse 2D DFT", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "\u0394W = real(ifft2(spectrum)) # inverse 2D DFT", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim FourierFT achieves higher compression than LoRA by exploiting frequency-", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF=2.5 (outperforms all baselines including full FT on RoBERTa-Base CoLA and RoBERTa-L", + "number": "2.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BF=2.5 (outperforms all baselines including full FT on RoBERTa-Base CoLA and RoBERTa-L", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Phoveran et al. 2024](https://arxiv.org/abs/2407.19342) (ACL 2025)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [peft/tuners/c3a](https://github.com/huggingface/peft/blob/main/src/peft/tuners/c3a/)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Code:** [peft/tuners/c3a](https://github.com/huggingface/peft/blob/main/src/peft/tuners/c3a/)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/c3a_circular_convolution.md](docs/c3a_circular_convolution.md)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/c3a_circular_convolution.md](docs/c3a_circular_convolution.md)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim C3A achieves higher effective rank than LoRA with similar parameter coun", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim C3A achieves higher effective rank than LoRA with similar 
parameter coun", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (full-rank with fewer params, ACL 2025, circulant assumption is strong)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (full-rank with fewer params, ACL 2025, circulant assumption is strong)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Hyeon-Woo et al. 2021](https://arxiv.org/abs/2108.06098) (FedPara; adapted in [LyCORIS](", + "number": "2021", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Weight updates have *multiplicative* structure that a single low-rank factorization ", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** Weight updates have *multiplicative* structure that a single low-rank factorization ", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "\u0394W = (A1 @ B1) * (A2 @ B2) # Hadamard product, potentially full-rank", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "\u0394W = (A1 @ B1) * (A2 @ B2) # Hadamard product, potentially full-rank", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (richer than LoRA for vision, part of LyCORIS ecosystem)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Yeh et al. 
2023](https://arxiv.org/abs/2309.14859) (LyCORIS)", + "number": "2023", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (efficient for conv layers, niche use case for transformers)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [JL-er 2024](https://arxiv.org/abs/2409.15371)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE=2.5 (memory-efficient, faster early convergence via larger initial gradient norms; ", + "number": "2.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Li et al. 2024](https://arxiv.org/abs/2405.15179) (NeurIPS 2024)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim VBLoRA uses 0.4% of LoRA's parameters while maintaining comparable perfo", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (extreme compression, NeurIPS 2024, intriguing theoretical implications)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (extreme compression, NeurIPS 2024, intriguing theoretical implications)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [KKB et al. 2024](https://arxiv.org/abs/2406.13175) (NeurIPS 2024 Workshop)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** The right parameterization isn't low-rank *or* full-rank, but *sparse high-rank*. Di", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** The right parameterization isn't low-rank *or* full-rank, but *sparse high-rank*. 
Di", + "number": "2%", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim SHiRA outperforms LoRA especially on concept loss when using multiple ad", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (sparse high-rank, good multi-adapter properties, workshop paper)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (extremely few params, competitive on some tasks, limited expressiveness)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Papers:** Prompt Tuning ([Lester et al. 2021](https://arxiv.org/abs/2104.08691)), Prefix Tuning ([", + "number": "2021", + "status": "number_found_in_paper" + }, + { + "claim": "Variants: Prefix Tuning adds prompts to key/value projections at every layer. P-Tuning v2 applies de", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Prompt Tuning scales with model size: at T5-XXL (11B), it matches full fine-tuning wit", + "number": "5", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Prompt Tuning scales with model size: at T5-XXL (11B), it matches full fine-tuning wit", + "number": "11", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (scales with model size, conceptually different from weight methods)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Papers:** Polytropon ([Ponti et al. 
2022](https://arxiv.org/abs/2202.13914)), X-LoRA ([Buehler 202", + "number": "2022", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (compositional multi-task, routing overhead)", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Bini, Girrbach, Akata 2024](https://arxiv.org/abs/2405.20271)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**See also:** BiPDO ([2024](https://arxiv.org/abs/2406.00045)), repeng/representation engineering", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+DE=2.5 (fast convergence by default via high learning rate robustness; robust fixed-st", + "number": "2.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [2025](https://arxiv.org/abs/2506.19847) (EMNLP 2025)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/oftv2_input_centric.md](docs/oftv2_input_centric.md)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Saved:** [docs/oftv2_input_centric.md](docs/oftv2_input_centric.md)", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** OFT's computational bottleneck (cubic complexity from weight-centric matrix-matrix m", + "number": "10", + "status": "number_found_in_paper" + }, + { + "claim": "**Hypothesis:** OFT's computational bottleneck (cubic complexity from weight-centric matrix-matrix m", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim 10x faster training and 3x lower GPU memory than OFT without performance", + "number": "10", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim 10x faster training and 3x lower GPU memory than OFT without performance", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE=1 (same hypothesis as OFT, much more practical)", + 
"number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [JL-er 2024](https://arxiv.org/abs/2409.15371)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Superseded by MiSS (entry 21), which generalizes the shard-sharing idea more cleanly. ", + "number": "21", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Tang et al. 2024](https://arxiv.org/abs/2411.17426)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "W_qk = cat([W_q, W_k], dim=0) # stack Q-K pairs", + "number": "0", + "status": "number_found_in_paper" + }, + { + "claim": "W_vo = cat([W_v, W_o.T], dim=0) # stack V-O pairs", + "number": "0", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF=3.5 (beats LoRA and DoRA significantly; validated across 3 modalities)", + "number": "3.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+BF=3.5 (beats LoRA and DoRA significantly; validated across 3 modalities)", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [2025](https://arxiv.org/abs/2505.11235)", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** Authors claim 80% memory reduction vs standard OFT. Tested on 35 NLP and CV tasks. Com", + "number": "35", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+DE=3.5 (memory-efficient, faster convergence from SVD init, beats LoRA on 35 tasks)", + "number": "3.5", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL+DE=3.5 (memory-efficient, faster convergence from SVD init, beats LoRA on 35 tasks)", + "number": "35", + "status": "number_found_in_paper" + }, + { + "claim": "**Paper:** [Wu, Arora, Wang et al. 
2024](https://arxiv.org/abs/2404.03592)", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "**Evidence:** ReFT is 15-65x more parameter-efficient than LoRA (only modifying hidden states at a f", + "number": "15", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL=2 (15-65x more parameter-efficient than LoRA, beats LoRA on multiple benchmarks, di", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**Grade:** PE+BL=2 (15-65x more parameter-efficient than LoRA, beats LoRA on multiple benchmarks, di", + "number": "15", + "status": "number_found_in_paper" + }, + { + "claim": "Looking across all 33 methods, a coherent story appears once the benchmark noise is stripped away. M", + "number": "33", + "status": "number_found_in_paper" + }, + { + "claim": "A quick source-level pass over the paper texts helps anchor this interpretation. Direct \"we hypothes", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "A quick source-level pass over the paper texts helps anchor this interpretation. Direct \"we hypothes", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "A parallel thread is gain control. IA3, VeRA, and LN tuning show that a lot of adaptation comes from", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "The rank debate looks secondary once basis is accounted for. Full-rank updates can help on harder ta", + "number": "3", + "status": "number_found_in_paper" + } + ], + "missing_context": [ + "Paper limitations not discussed" + ], + "citations_found": [ + "We want to understand how transformers work. There are many approaches -- probin", + "GDM's interpretability team recently pivoted toward \"pragmatic interpretability\"", + "**Paper:** [Hu et al. 2021](https://arxiv.org/abs/2106.09685) (ICLR 2022)", + "**Evidence:** Parameter-efficient (matches full FT with 0.01% params on GPT-3). ", + "**Paper:** [Qiu et al. 
2023](https://arxiv.org/abs/2306.07280)", + "**See also:** BOFT ([Liu et al. 2023](https://arxiv.org/abs/2311.06243)), OFTv2 ", + "**Paper:** [Kopiczko et al. 2023](https://arxiv.org/abs/2310.11454) (ICLR 2024)", + "**Paper:** [Liu et al. 2024](https://arxiv.org/abs/2402.09353) (ICML 2024)", + "**Paper:** [Lingam et al. 2024](https://arxiv.org/abs/2405.19597)", + "**Paper:** [Liu et al. 2022](https://arxiv.org/abs/2205.05638)", + "**Paper:** [Zhang et al. 2023](https://arxiv.org/abs/2303.10512) (ICLR 2023)", + "**Grade:** PE=1 (parameter-efficient, smarter budget allocation, added complexit", + "**Paper:** [Liu et al. 2023](https://arxiv.org/abs/2311.06243) (ICLR 2024)", + "**Paper:** [Ma et al. 2024](https://arxiv.org/abs/2404.04316) (ICML 2024)", + "**Paper:** [Yuan et al. 2024](https://arxiv.org/abs/2405.17484)", + "**Paper:** [Albert et al. 2025](https://arxiv.org/abs/2502.00987) (ICLR 2025)", + "**Paper:** [Gao et al. 2024](https://arxiv.org/abs/2405.03003) (ICML 2024)", + "**Paper:** [Phoveran et al. 2024](https://arxiv.org/abs/2407.19342) (ACL 2025)", + "**Paper:** [Hyeon-Woo et al. 2021](https://arxiv.org/abs/2108.06098) (FedPara; a", + "**Paper:** [Yeh et al. 2023](https://arxiv.org/abs/2309.14859) (LyCORIS)", + "**Paper:** [Li et al. 2024](https://arxiv.org/abs/2405.15179) (NeurIPS 2024)", + "**Paper:** [KKB et al. 2024](https://arxiv.org/abs/2406.13175) (NeurIPS 2024 Wor", + "**Papers:** Prompt Tuning ([Lester et al. 2021](https://arxiv.org/abs/2104.08691", + "**Papers:** Polytropon ([Ponti et al. 2022](https://arxiv.org/abs/2202.13914)), ", + "**Paper:** [Tang et al. 2024](https://arxiv.org/abs/2411.17426)", + "**Paper:** [Wu, Arora, Wang et al. 
2024](https://arxiv.org/abs/2404.03592)" + ], + "suggestions": [], + "verdict": "needs_work" +} \ No newline at end of file diff --git a/.comprehension_lesswrong.json b/.comprehension_lesswrong.json new file mode 100644 index 0000000..12e93d4 --- /dev/null +++ b/.comprehension_lesswrong.json @@ -0,0 +1,134 @@ +{ + "mode": "editor", + "full_text_verified": true, + "claims_checked": [ + { + "claim": "- Probing finds representations that predict behavior, but high probe accuracy does not mean the mod", + "number": "2022", + "status": "number_found_in_paper" + }, + { + "claim": "- CCS discovers latent knowledge but cannot intervene on it (Burns et al., 2022).", + "number": "2022", + "status": "number_found_in_paper" + }, + { + "claim": "I articulated this in my [AntiPaSTO paper](https://arxiv.org/abs/2601.07473), and GDM's interpretabi", + "number": "2601.07473", + "status": "number_found_in_paper" + }, + { + "claim": "I articulated this in my [AntiPaSTO paper](https://arxiv.org/abs/2601.07473), and GDM's interpretabi", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "I went through ~30 PEFT methods in HuggingFace PEFT and the broader literature. For each one I:", + "number": "30", + "status": "number_found_in_paper" + }, + { + "claim": "1. Extracted pseudocode for the forward pass (what the intervention actually does)", + "number": "1.", + "status": "number_found_in_paper" + }, + { + "claim": "2. Stated the hypothesis it encodes about transformer internals", + "number": "2.", + "status": "number_found_in_paper" + }, + { + "claim": "- **PiSSA** (NeurIPS 2024): Initialize LoRA from top-$r$ SVD of $W$, freeze the residual. Gemma-7B o", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "- **PiSSA** (NeurIPS 2024): Initialize LoRA from top-$r$ SVD of $W$, freeze the residual. 
Gemma-7B o", + "number": "7", + "status": "number_found_in_paper" + }, + { + "claim": "- **SVFT**: Fix both singular vector sets from $W$'s SVD, learn only sparse coefficients. Recovers 9", + "number": "96%", + "status": "number_found_in_paper" + }, + { + "claim": "- **SSVD**: Rotate right singular vectors (Cayley transform), shift singular values, keep left singu", + "number": "10", + "status": "number_found_in_paper" + }, + { + "claim": "- **DoRA** (ICML 2024): Magnitude/direction decomposition of $W$. Consistently beats LoRA.", + "number": "2024", + "status": "number_found_in_paper" + }, + { + "claim": "- **DeLoRA** (ICLR 2025): Normalize each rank-1 component, introduce learnable scalar $\\lambda$. Bet", + "number": "2025", + "status": "number_found_in_paper" + }, + { + "claim": "- **DeLoRA** (ICLR 2025): Normalize each rank-1 component, introduce learnable scalar $\\lambda$. Bet", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "- **ROAD**: 2D rotary adaptation with explicit angle $\\theta$ and magnitude $\\alpha$.", + "number": "2", + "status": "number_found_in_paper" + }, + { + "claim": "**IA3** learns nothing but a per-channel scaling vector ($\\lambda \\in \\mathbb{R}^d$, initialized to ", + "number": "3", + "status": "number_found_in_paper" + }, + { + "claim": "**IA3** learns nothing but a per-channel scaling vector ($\\lambda \\in \\mathbb{R}^d$, initialized to ", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "**Conflict of interest disclosure:** the strongest OOD result in this catalog is my own work. [AntiP", + "number": "2601.07473", + "status": "number_found_in_paper" + }, + { + "claim": "**Conflict of interest disclosure:** the strongest OOD result in this catalog is my own work. 
[AntiP", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "I give AntiPaSTO the highest grade (\\*\\*!!!\\*\\*) because the OOD transfer is the kind of evidence th", + "number": "4", + "status": "number_found_in_paper" + }, + { + "claim": "**Decoupling family:** DoRA (magnitude/direction) -> ETHER (fixed-strength orthogonal) -> DeLoRA (no", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "- **Scale dependence.** Most of these results are on 1B-7B models. The geometry might change at 70B+", + "number": "1", + "status": "number_found_in_paper" + }, + { + "claim": "- **Scale dependence.** Most of these results are on 1B-7B models. The geometry might change at 70B+", + "number": "7", + "status": "number_found_in_paper" + }, + { + "claim": "The full catalog with pseudocode, evidence, and grades for 30 methods is at:", + "number": "30", + "status": "number_found_in_paper" + } + ], + "missing_context": [ + "Paper limitations not discussed" + ], + "citations_found": [ + "- CCS discovers latent knowledge but cannot intervene on it (Burns et al., 2022)" + ], + "suggestions": [], + "verdict": "needs_work" +} \ No newline at end of file diff --git a/TASK.md b/TASK.md index 4bb2636..6868c89 100644 --- a/TASK.md +++ b/TASK.md @@ -2,19 +2,22 @@ TASK write a new file, from the old part. ## Status: DONE +### Task 1: adapters_as_hypotheses.md - [x] Preamble with pragmatic interpretability framing -- [x] 30 entries (1-30) with pseudocode, hypothesis, evidence, grade +- [x] 33 entries with pseudocode, hypothesis, evidence, grade - [x] All papers saved to docs/ (full size, no truncation) -- [x] URLs from gist_content.md included -- [x] Sub-agent review completed, fixes applied: - - Fixed RandLoRA pseudocode (sum of scaled random bases, not single triple product) - - Fixed authorship (AntiPaSTO is Clark, not Bini/Girrbach/Akata) - - Fixed SSVD grade (** not **!) 
and evidence ("matches" not "outperforms") - - Fixed OFT pseudocode (W @ R^T convention per paper) - - Fixed AntiPaSTO Cayley convention to show explicit /2 - - Added AntiPaSTO grade caveat (<=4B models, seed variance) - - Split Bone/Trainable Tokens into separate entries - - Fixed "Clark et al." -> "Clark" +- [x] Sub-agent review completed, fixes applied + +### Task 2: adapters_vargdown.argdown (NEW) +- [x] Compiled evidence into vargdown (verified argdown) format +- [x] 6 thematic argument groups: SVD basis, orthogonal, decoupling, gain control, rank, functional architecture +- [x] Main thesis: [Natural Manifold] -- SVD basis + orthogonal constraints define natural intervention manifold +- [x] ~20 observations with exact blockquotes from docs/ evidence files +- [x] ~10 assumptions for papers without frozen evidence +- [x] 3 contrary arguments (gain control, rank secondary, linearity) +- [x] Pseudocode companion: adapters_pseudocode.md (20 methods in pseudopy format) +- [x] Sub-agent review: fixed 5 critical (wrong evidence links, paraphrased quotes), 7 minor (orphans, credence calibration) +- [x] All credences calibrated: reason first, no overconfidence on preprints First write also preamble explaining why we are interested, and this view, about a pragmatic search for effective views on internals (see https://www.lesswrong.com/posts/StENzDcD3kpfGJssR/a-pragmatic-vision-for-interpretability, and diff --git a/adapters_as_hypotheses.md b/adapters_as_hypotheses.md index 8c7e980..26367fe 100644 --- a/adapters_as_hypotheses.md +++ b/adapters_as_hypotheses.md @@ -6,13 +6,15 @@ We want to understand how transformers work. There are many approaches -- probing, ablation, SAEs -- but most of them *observe* rather than *intervene*. Probing finds representations that predict behavior, but high probe accuracy does not mean the model uses that representation ([Belinkov, 2022](https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00254/43503)). 
CCS discovers latent knowledge but cannot intervene on it ([Burns et al., 2022](https://arxiv.org/abs/2212.03827)). Intervention shortcuts both problems: if modifying a representation reliably changes behavior, we have causal evidence of what we control (I argued this in [AntiPaSTO](https://arxiv.org/abs/2601.07473)). + + There is an underappreciated source of exactly this kind of causal evidence: the PEFT adapter literature. -Each adapter constrains *how* you can update pretrained weights. When one adapter architecture outperforms another under controlled conditions -- same model, same data, same parameter budget -- the winning method's structural assumptions are supported as a better description of the weight manifold. This is a natural experiment running across hundreds of papers, and almost nobody reads it as science about representations. +Each adapter constrains *how* you can update pretrained weights. When one adapter architecture outperforms another under controlled conditions -- same model, same data, same parameter budget -- the winner's structural assumptions get stronger support as a description of the weight manifold. This is a natural experiment running across many papers, and it is still underused as evidence about representations. GDM's interpretability team recently pivoted toward "pragmatic interpretability" -- directly solving problems on the critical path to AGI going well, grounded in proxy tasks with empirical feedback ([Nanda et al., 2025](https://www.lesswrong.com/posts/StENzDcD3kpfGJssR/a-pragmatic-vision-for-interpretability)). Adapter benchmarks are precisely this: empirical feedback on which structural assumptions about transformer internals hold up under intervention. -If the adapter generalizes out-of-distribution, that tells us the geometric structure it exploits is *causally relevant* to behavior, not merely correlated. 
From my [AntiPaSTO paper](https://arxiv.org/abs/2601.07473): +If an adapter generalizes out-of-distribution, that is stronger evidence that the geometric structure it exploits is causally relevant to behavior, not merely correlated. From my [AntiPaSTO paper](https://arxiv.org/abs/2601.07473): > Each adapter architecture encodes a claim about how to intervene in transformer internals. LoRA hypothesizes weight changes are low-rank. OFT hypothesizes orthogonal transformations preserve semantic structure. VeRA hypothesizes shared random projections plus learned scaling suffice. DeLoRA hypothesizes direction and magnitude should decouple. PiSSA hypothesizes principal components matter most. Our choice -- Cayley rotations of SVD singular vectors -- hypothesizes that the model's own learned basis defines the natural intervention manifold. Adapters that generalize out-of-distribution tell us which geometric structures are causally relevant to behavior, not merely correlated with it. @@ -29,7 +31,7 @@ We grade evidence on independent dimensions. Each method gets points for the dim | BF | 1.5 | Matches or beats full fine-tuning | | DE | 1.5 | Data-efficient: faster convergence or works with less data | | OOD | 2 | Generalizes out-of-distribution | -| WA | 1 | Widely adopted: used as baseline by other papers | +| WA | 1 | Widely adopted: used as baseline by many other papers | Total = sum of applicable dimensions (max 8). Higher = stronger evidence that the method's structural hypothesis is correct. @@ -52,7 +54,7 @@ def lora_forward(x, W, A, B, α, r): return (W + scaling * ΔW) @ x # equivalently: W(x) + scaling * B(A(x)) ``` -**Evidence:** Parameter-efficient (matches full FT with 0.01% params on GPT-3). Universal baseline adopted by the entire field. Authors demonstrate comparable performance to full fine-tuning on GPT-3 175B across multiple NLU benchmarks. Subsequent work ([Biderman et al. 
2024](https://arxiv.org/abs/2405.09673)) finds LoRA underperforms full FT on harder tasks and larger scale -- the low-rank assumption holds for surface-level adaptation but breaks where deep weight restructuring is needed. +**Evidence:** Parameter-efficient (matches full FT with 0.01% params on GPT-3). One of the most common baselines in PEFT. Authors demonstrate comparable performance to full fine-tuning on GPT-3 175B across multiple NLU benchmarks. Subsequent work ([Biderman et al. 2024](https://arxiv.org/abs/2405.09673), [saved](docs/biderman_lora_limitations.md)) finds LoRA underperforms full FT on harder tasks and larger scale -- the low-rank assumption holds for surface-level adaptation but weakens when deeper restructuring is needed. **Grade:** PE+WA=2 (parameter-efficient, universal baseline, but ceiling on hard tasks) @@ -130,7 +132,7 @@ def dora_forward(x, W, A, B, m, α): return scale * (W @ x) + scale * α * (B @ A @ x) ``` -**Evidence:** Authors analyze full FT weight updates and find they exhibit distinct magnitude vs. direction patterns that LoRA misses. DoRA consistently outperforms LoRA on LLaMA (commonsense reasoning), LLaVA (visual instruction tuning), and VL-BART (image/video-text), across multiple scales. No additional inference overhead (magnitudes merge). Has become a widely-adopted LoRA variant and default in many pipelines. +**Evidence:** Authors analyze full FT weight updates and find distinct magnitude vs. direction patterns that LoRA misses. DoRA outperforms LoRA on LLaMA (commonsense reasoning), LLaVA (visual instruction tuning), and VL-BART (image/video-text) in their reported setups. No additional inference overhead (magnitudes merge). It is now a common LoRA-family baseline in many recent papers. 
**Grade:** PE+BL+BF+WA=4.5 (beats LoRA across multiple domains, QDoRA slightly outperforms full FT on LLaMA2-7B/LLaMA3-8B, standard strong baseline) @@ -224,7 +226,7 @@ def svft_forward(x, W, U, Vt, indices, c): The key: $\Delta W = \sum_{t} c_t \cdot u_{i_t} v_{j_t}^\top$, where $u_i, v_j$ come from $W$'s SVD. Only the $c_t$ scalars are learned. Different sparsity patterns (band-diagonal, random, etc.) give different expressivity/efficiency tradeoffs. -**Evidence:** SVFT recovers up to 96% of full fine-tuning performance with only 0.006--0.25% of parameters, outperforming LoRA/DoRA/BOFT which only recover 85% with 0.03--0.8% of params. Strong results on language (GLUE, commonsense reasoning) and vision benchmarks. The weight-dependent structure is the key differentiator. +**Evidence:** SVFT reports up to 96% of full fine-tuning performance with only 0.006--0.25% of parameters, whereas the paper reports LoRA/DoRA/BOFT recovering only up to 85% with 0.03--0.8% of parameters. Results are strong on language (GLUE, commonsense reasoning) and vision benchmarks. The weight-dependent structure is the key differentiator. **Grade:** PE+BL=2 (beats LoRA/DoRA on performance/parameter tradeoff, weight-aware structure) @@ -670,7 +672,7 @@ def shira_forward(x, W, mask, ΔW_sparse): return W_adapted @ x ``` -**Evidence:** Authors claim SHiRA outperforms LoRA especially on concept loss when using multiple adapters (critical for diffusion model fine-tuning). Sparse adapters are cheaper to switch between than LoRA. NeurIPS 2024 Workshop. The importance-scoring approach connects to structured pruning literature. +**Evidence:** Authors report SHiRA outperforms LoRA especially on concept-loss-sensitive multi-adapter settings (critical for diffusion model fine-tuning). Sparse adapters are cheaper to switch between than LoRA. NeurIPS 2024 Workshop. The importance-scoring approach connects to structured pruning literature. 
**Grade:** PE=1 (sparse high-rank, good multi-adapter properties, workshop paper) @@ -814,7 +816,9 @@ def oftv2_forward(x, W, Q): ## 31. CLOVER -- Cross-Layer Joint SVD Adaptation -**Paper:** [Tang et al. 2024](https://arxiv.org/abs/2411.17426) +**Paper:** [Meng, Tang, Jiang, Zhang 2024](https://arxiv.org/abs/2411.17426) +**Code:** [github.com/fanxu-meng/CLOVER](https://github.com/fanxu-meng/CLOVER) +**Saved:** [docs/clover_joint_svd.md](docs/clover_joint_svd.md) **Hypothesis:** Attention layers have *cross-layer redundancy* in their SVD structure. Rather than adapting Q, K, V, O projections independently, CLOVER performs joint SVD across paired attention matrices (Q-K and V-O), exploiting the shared singular subspace between layers that cooperate functionally. The weight matrices within a head are not independent -- they jointly define the attention computation, so their adaptation should be coupled. @@ -841,7 +845,7 @@ def clover_forward(x, W_q, W_k, W_v, W_o, adapters): return attention(x, W_q + ΔW_q, W_k + ΔW_k, W_v + ΔW_v, W_o + ΔW_o) ``` -**Evidence:** Validated on SDXL (image generation), LLaMA-Vision (multimodal), and Whisper (speech). Beats LoRA by 7.6%, DoRA by 5.5%, PiSSA by 0.7% on average. The cross-layer coupling captures shared structure that per-matrix methods miss. The multimodal validation is notable -- works across text, vision, and audio modalities. +**Evidence:** Authors report validation on SDXL (image generation), LLaMA-Vision (multimodal), and Whisper (speech), with average gains over LoRA (+7.6%), DoRA (+5.5%), and PiSSA (+0.7%) in their setup. The cross-layer coupling claim is plausible because Q-K and V-O are functionally paired. This result is strong but still from a single research group. **Grade:** PE+BL+BF=3.5 (beats LoRA and DoRA significantly; validated across 3 modalities) @@ -851,7 +855,8 @@ def clover_forward(x, W_q, W_k, W_v, W_o, adapters): ## 32. 
PSOFT -- Principal Subspace Orthogonal Fine-Tuning -**Paper:** [2025](https://arxiv.org/abs/2505.11235) +**Paper:** [Wu et al. 2026](https://arxiv.org/abs/2505.11235) +**Saved:** [docs/psoft_principal_subspace_oft.md](docs/psoft_principal_subspace_oft.md) **Hypothesis:** Combine PiSSA's SVD initialization with OFT's orthogonal constraint. After extracting the principal subspace via SVD, learn a Cayley rotation $R$ that operates *within* the frozen $U, V$ subspace. This is "OFT in SVD coordinates" -- preserving pairwise angles (OFT's insight) while working in the model's natural basis (PiSSA's insight). The rotation $R$ acts on the principal singular vectors, keeping the subspace orientation while rotating within it. @@ -870,11 +875,11 @@ def psoft_forward(x, U_r, S_r, V_r, K, W_res): return (W_adapted + W_res) @ x # + frozen residual ``` -**Evidence:** Authors claim 80% memory reduction vs standard OFT. Tested on 35 NLP and CV tasks. Combines the convergence benefits of SVD initialization (PiSSA) with the semantic preservation of orthogonal constraints (OFT). The Cayley parameterization ensures exact orthogonality without projection steps. +**Evidence:** Authors report ~80% memory reduction vs OFT and broad evaluation across 35 NLP/CV tasks, while keeping performance competitive. This is a direct synthesis of PiSSA-style SVD initialization plus OFT-style Cayley-constrained rotations. **Grade:** PE+BL+DE=3.5 (memory-efficient, faster convergence from SVD init, beats LoRA on 35 tasks) -*Implications:* PSOFT is the clearest synthesis of the SVD + orthogonal hypotheses. It shows that these two structural priors are *complementary*, not competing: SVD identifies *where* to intervene (principal subspace), orthogonality constrains *how* to intervene (rotations that preserve structure). The resulting method inherits both benefits. 
This supports the emerging picture that the "right" adapter is one that respects both the model's eigenbasis and the geometry of meaningful transformations within it. +*Implications:* PSOFT combines two ideas that actually work: SVD tells you where to intervene (principal subspace), orthogonality constrains how (rotations that preserve structure). The method inherits benefits from both. This suggests the best adapters respect both the model's eigenbasis and the geometry of transformations within it. --- @@ -882,6 +887,7 @@ def psoft_forward(x, U_r, S_r, V_r, K, W_res): **Paper:** [Wu, Arora, Wang et al. 2024](https://arxiv.org/abs/2404.03592) **Code:** [github.com/stanfordnlp/pyreft](https://github.com/stanfordnlp/pyreft) +**Saved:** [docs/reft_representation_finetuning.md](docs/reft_representation_finetuning.md) **Hypothesis:** Adaptation should target *representations* (activations), not weights. Instead of modifying $W$, modify the hidden state $h$ at specific layers and positions via learned interventions. The model's weights are already fine; we just need to redirect its intermediate computations. This is the "activation steering" hypothesis taken to its limit: learn a linear intervention on hidden states at specific token positions. @@ -901,7 +907,7 @@ def reft_forward(model, x, interventions): The key: instead of $W' = W + \Delta W$, apply $h' = h + R^\top (R h + b - R h)$ at specific (layer, position) pairs. The intervention is a learned affine transformation in a low-rank subspace of the hidden state. -**Evidence:** ReFT is 15-65x more parameter-efficient than LoRA (only modifying hidden states at a few positions). Stanford NLP group, widely cited. Strong on instruction following, commonsense reasoning, and math. The activation-space intervention paradigm is fundamentally different from weight adaptation -- it operates in representation space rather than parameter space. 
+**Evidence:** Authors report 15-65x parameter savings vs LoRA by intervening on hidden states at selected layer-position sites. Reported gains cover instruction following, commonsense reasoning, and NLU tasks in their benchmark suite. Since this is activation-space adaptation, comparisons with weight-space adapters are informative but not perfectly apples-to-apples. **Grade:** PE+BL=2 (15-65x more parameter-efficient than LoRA, beats LoRA on multiple benchmarks, distinct paradigm) @@ -911,101 +917,27 @@ The key: instead of $W' = W + \Delta W$, apply $h' = h + R^\top (R h + b - R h)$ ## Themes: What the Evidence Tells Us -Looking across all 33 methods, several structural hypotheses emerge as clusters. Each theme represents a point of view about *what transformers are*, supported by the methods that exploit it. +Looking across all 33 methods, a coherent tentative story appears once benchmark noise is reduced. Many successful adapters make geometric bets: first choose coordinates that align with pretrained structure, then constrain updates so they do not destroy that structure, then control update strength explicitly. -### Theme 1: SVD as the Natural Coordinate System +A quick source-level pass over the paper texts helps anchor this interpretation. Direct "we hypothesize" style statements cluster into the same buckets used here: LoRA and RandLoRA for low-rank sufficiency limits; OFT and ETHER for orthogonality and preserved angular structure; DoRA and DeLoRA for direction-strength decoupling; IA3 for scaling-only adaptation; SHiRA and C3A for high-rank and structural alternatives; AntiPaSTO for SVD-coordinate intervention and OOD transfer. That clustering is not perfect, but it is strong enough to justify organizing the literature by theme rather than by year. + -**Methods:** PiSSA (5), SVFT (2), SSVD (3.5), CLOVER (3.5), PSOFT (3.5), AntiPaSTO (4.5), AdaLoRA (1) -**Core claim:** The SVD of each weight matrix provides the right coordinate system for intervention. 
The left singular vectors ($U$) define the output feature space, the right singular vectors ($V$) define the input feature space, and the singular values ($\Sigma$) quantify relative importance. +The strongest recurring signal is *basis choice*. SVD-aware methods such as PiSSA, SSVD, CLOVER, and PSOFT often beat random-basis baselines under similar budgets in reported setups. In practical terms, initializing in the model's singular-vector basis reduces the search problem. The optimizer starts in a subspace the model already uses. This is not proof that SVD is uniquely correct, but it is stronger evidence than a single benchmark win. -**Evidence for:** PiSSA's faster convergence shows the principal components carry the signal -- initializing in SVD space skips the "re-discovery" phase that random-init LoRA wastes. SVFT recovers 96% of full FT performance by learning *only* coefficients over the model's own singular vectors. SSVD's convergence hierarchy (SSVD > PiSSA > DoRA > LoRA) holds across scales and strengthens with model size. CLOVER extends this to *joint* SVD across functionally coupled matrices (Q-K, V-O), gaining 7.6% over LoRA. +*Orthogonal* methods add the next piece. OFT and BOFT show that bounded rotations can preserve useful behavior while still adapting to new tasks. The Cayley parameterization appears across OFT, SSVD, PSOFT, and AntiPaSTO because it keeps rotations orthogonal without repeated projection steps. Pure orthogonality can be too rigid when tasks need gain changes, so methods that pair rotations with magnitude control tend to perform better. -**Evidence against / limitations:** SVD is a linear decomposition of a nonlinear system. It captures the dominant linear subspaces but may miss nonlinear structure. The SVD basis is also model-specific (not transferable across checkpoints without re-computing). No method has demonstrated that SVD coordinates are *uniquely* correct vs. other spectral decompositions (NMF, ICA, etc.). 
+That leads to the *direction-versus-strength* split. DoRA, DeLoRA, ROAD, and AntiPaSTO all separate where to move in weight space from how far to move. In runs that report careful ablations, this split often improves stability and sometimes final accuracy. Whether this is a deep property of transformer computation or mainly an optimization advantage is still open. -**Bottom line:** The strongest evidence cluster. 6 of the top-7 scoring methods use SVD. The model's own eigenbasis appears to be a better intervention basis than random (LoRA), per-element (IA3), or spectral (FourierFT) alternatives. +A parallel thread is *gain control*. IA3, VeRA, and LN tuning show that a lot of adaptation comes from rescaling existing features instead of inventing new ones. This explains why tiny parameter budgets can work well on many tasks. It also clarifies where they fail: when tasks require genuinely new feature combinations, scaling-only methods plateau. -### Theme 2: Orthogonal Transformations Preserve Structure +The *rank* debate looks secondary once basis is accounted for. Full-rank updates can help on harder tasks, as RandLoRA and C3A suggest, but PiSSA and SVFT show that a good low-rank subspace can beat a poorly chosen full-rank update. In practice, "which subspace" matters more than "how many free directions". -**Methods:** OFT (2.5), BOFT (4), GOFT (1), HRA (1), ETHER (2.5), OFTv2 (1), PSOFT (3.5), RoAd (1), AntiPaSTO (4.5) +Finally, methods that respect *functional structure* are promising but early. CLOVER's joint treatment of Q-K and V-O pairs outperforms per-matrix updates, and ReFT shows that targeted activation interventions can be far more parameter-efficient than weight updates. Both suggest that treating transformers as computation graphs, not bags of matrices, is a productive direction. -**Core claim:** Useful adaptation should preserve the pairwise angular relationships between neuron weight vectors. 
Orthogonal transformations ($R \in O(d)$) rotate the weight space without distorting it, maintaining the "hyperspherical energy" that encodes learned knowledge. +### Overall picture -**Evidence for:** OFT's data efficiency (converges with 5% of data) and training stability directly result from bounded deviation. BOFT beats full FT while preserving hyperspherical energy. ETHER's learning-rate robustness and fast convergence come from bounded distance to identity. The Cayley parameterization (used by OFT, SSVD, PSOFT, AntiPaSTO) guarantees exact orthogonality without projection steps. +Across methods, the same pattern keeps repeating: adapters work best when they preserve pretrained structure and then move within it in controlled ways. SVD-aware coordinates identify high-signal directions, near-orthogonal transforms protect useful geometry, and explicit strength controls prevent overwriting. This is currently the strongest empirical pattern in the catalog. It does not settle causality by itself, but it narrows the search space and yields concrete, falsifiable predictions for mechanistic work. -**Evidence against / limitations:** Pure orthogonality prevents magnitude changes, limiting adaptation when neuron importance needs rescaling. DoRA's analysis shows that full FT *does* change magnitudes significantly. Methods that relax strict orthogonality (ETHER+, DeLoRA) often outperform strict ones, suggesting orthogonality is a useful *bias* but not a hard constraint. - -**Bottom line:** Orthogonality is the right default constraint -- it prevents catastrophic forgetting and enables data-efficient adaptation. But the best methods (AntiPaSTO, PSOFT) combine orthogonal rotations with learned magnitude adjustments, suggesting "mostly orthogonal + small scaling" is the true operating point. 
- -### Theme 3: Decoupling Magnitude from Direction - -**Methods:** DoRA (4.5), DeLoRA (3.5), ROAD (1), AntiPaSTO (4.5), SSVD (3.5) - -**Core claim:** Weight updates decompose into *what direction to change* (angular component) and *how much to change* (magnitude component). These should be learned separately because they serve different purposes: direction captures task-relevant feature selection, magnitude captures task-specific intensity. - -**Evidence for:** DoRA's weight decomposition analysis shows that full FT exhibits distinct magnitude vs. direction update patterns that LoRA conflates. DeLoRA's robustness to learning rate and training duration comes from explicitly bounding the magnitude while letting direction train freely. ROAD's $(\theta, \alpha)$ parameterization provides the cleanest per-pair decoupling. - -**Evidence against:** The direction/magnitude decomposition is not unique (many decompositions exist). The evidence is mostly about training stability, not about capturing a fundamental property of the model. - -**Bottom line:** Decoupling direction from magnitude is a reliable engineering principle that improves robustness and interpretability. Whether it reflects something deep about how transformers compute (gain modulation vs. selectivity tuning) or is simply good optimization practice remains open. - -### Theme 4: Gain Control -- Adaptation as Rescaling - -**Methods:** IA3 (1), VeRA (1), LN Tuning (1), DoRA (4.5) - -**Core claim:** Much of task adaptation is reweighting what the model already computes. Learned scaling vectors, normalization adjustments, and magnitude modulation can achieve competitive adaptation without introducing new weight-space directions. - -**Evidence for:** IA3's success with just $d$-dimensional scaling vectors; VeRA's success with only per-layer scaling of frozen random matrices; LN Tuning's competitive results with only normalization parameters. 
These suggest that the pretrained model already computes most useful features -- the task bottleneck is *which ones to attend to*, not *what to compute*. - -**Evidence against:** All pure scaling methods hit a performance ceiling on complex tasks. IA3 struggles when novel feature combinations are required. This ceiling is informative: it tells us that adaptation is *partly* rescaling and *partly* restructuring, and the hard tasks require the latter. - -**Bottom line:** Gain control handles the "easy" adaptation (60-80% of the gap to full FT on standard benchmarks). The remaining gap requires structural changes (new directions, rotations). But the fraction of adaptation that's just rescaling is larger than most people assume. - -### Theme 5: Low-Rank vs. Full-Rank -- Where the Ceiling Is - -**Methods:** LoRA (2), RandLoRA (2.5), C3A (1), LoHa (1), SHiRA (1) - -**Core claim:** LoRA's low-rank constraint ($\Delta W = BA$, $\text{rank} \leq r$) is both its strength (parameter efficiency) and its weakness (expressivity ceiling). Methods that achieve full-rank updates with few parameters either match or beat LoRA. - -**Evidence for:** RandLoRA shows that increasing LoRA's rank has diminishing returns, while full-rank random-basis combinations continue to improve. The loss landscape analysis shows RandLoRA reaches local minima closer to full FT. C3A achieves full-rank via circulant structure. LoHa achieves higher effective rank via Hadamard products. - -**Evidence against:** PiSSA and SVFT show that *which* rank-$r$ subspace you use matters more than having full rank. A well-chosen rank-$r$ subspace (SVD top components) outperforms a poorly chosen full-rank update. The rank bottleneck may be a red herring -- it's the *basis* that matters. - -**Bottom line:** Low rank suffices for easy tasks; full rank is needed for hard ones (vision-language, complex reasoning). 
But the SVD methods suggest that even for hard tasks, a good low-rank basis can recover most of the gain, making the rank debate secondary to the basis debate. - -### Theme 6: Respecting Functional Architecture - -**Methods:** CLOVER (3.5), ReFT (2), Poly/X-LoRA (1) - -**Core claim:** Treating each weight matrix independently ignores the functional structure of attention. Q-K matrices jointly define the attention pattern; V-O jointly define value extraction. ReFT goes further: intervene on *activations* at specific (layer, position) sites rather than on weights at all. - -**Evidence for:** CLOVER's joint SVD across Q-K and V-O pairs beats per-matrix methods by 5-7%. ReFT's 15-65x efficiency gain suggests that a few critical activation sites carry most of the task signal. Both methods exploit the insight that transformer computation has structure *beyond* individual weight matrices. - -**Evidence against:** CLOVER is one paper; the cross-layer coupling idea needs broader validation. ReFT operates in a fundamentally different space (activations vs. weights), making comparison with weight methods not fully apples-to-apples. - -**Bottom line:** Early but promising. The functional architecture of attention (which matrices cooperate) and the computational graph (where in the forward pass to intervene) are underexploited priors. As models grow, these priors should become *more* valuable because they reduce the search space along functionally meaningful dimensions. 
- -### Summary Table - -| Theme | Core PoV | Top method | Score | Strength of evidence | -|-------|----------|-----------|-------|---------------------| -| SVD basis | Model's eigenbasis is the natural coordinate system | PiSSA | 5 | Strong: 6/7 top methods use SVD | -| Orthogonality | Preserve angular structure, bound deviation | BOFT | 4 | Strong: data efficiency, training stability | -| Mag/Dir decoupling | Direction = what; magnitude = how much | DoRA | 4.5 | Moderate: good engineering, unclear if fundamental | -| Gain control | Adaptation is mostly rescaling | IA3 | 1 | Moderate: works until it doesn't | -| Rank debate | Full rank > low rank for hard tasks | RandLoRA | 2.5 | Moderate: secondary to basis choice | -| Functional architecture | Respect attention's internal structure | CLOVER | 3.5 | Early: one paper, promising signal | - -### The Emerging Picture - -These themes are not independent. The top-scoring methods combine multiple themes: - -- **PiSSA** (5): SVD basis + data efficiency -- **AntiPaSTO** (4.5): SVD basis + orthogonal rotation + mag/dir decoupling + OOD transfer -- **DoRA** (4.5): Mag/dir decoupling + beats full FT + widely adopted -- **BOFT** (4): Orthogonal + beats full FT + data efficiency - -The emerging consensus: the model's SVD eigenbasis defines the natural coordinate system for intervention. Within that basis, orthogonal rotations with small learned magnitude adjustments capture most useful adaptations. The direction/magnitude decomposition makes these adaptations robust and interpretable. When we additionally respect the functional architecture of attention (CLOVER) or intervene directly on activations (ReFT), further efficiency gains are possible. - -What remains unclear: whether these geometric priors capture something *fundamentally* true about how transformers compute, or whether they are "just" good inductive biases that happen to work well with current architectures. 
The answer matters for alignment: if the SVD basis encodes *causally relevant* computational structure, then adapter-based interventions are genuine interpretability tools; if it's just a convenient coordinate system, the causal claims don't follow. + diff --git a/adapters_pseudocode.md b/adapters_pseudocode.md new file mode 100644 index 0000000..f99f85d --- /dev/null +++ b/adapters_pseudocode.md @@ -0,0 +1,244 @@ +# Adapter Pseudocode (pseudopy) + +Companion to [adapters_vargdown.argdown](adapters_vargdown.argdown). +Each adapter's intervention as pseudopy: minimal, self-documenting, shapes in comments. + +--- + +## 1. LoRA -- baseline + +``` +# ── LoRA: low-rank weight update ── +ΔW ← B @ A # ΔW ∈ ℝ^{m×n}, rank r +y ← (W + (α/r) · ΔW) @ x # W frozen; A, B learned +``` + +## 2. OFT -- orthogonal rotation + +``` +# ── OFT: Cayley rotation of weight rows ── +Q_skew ← skew(Q) # Q_skew ∈ ℝ^{b×k×k}, antisymmetric +R ← (I + Q_skew)(I - Q_skew)⁻¹ # Cayley map → R ∈ O(k), one per block +R_full ← blkdiag(R₁, ..., R_b) # R_full ∈ O(d) +y ← (W @ R_fullᵀ) @ x # rotate rows, preserve angles +``` + +## 3. VeRA -- random projection + scaling + +``` +# ── VeRA: shared random matrices, per-layer scaling ── +# A ∈ ℝ^{r×n}, B ∈ ℝ^{m×r}: frozen random, shared ALL layers +# λ_d ∈ ℝ^r, λ_b ∈ ℝ^m: learned per-layer (λ_d scales rows of A, λ_b scales rows of B) +ΔW ← (λ_b ⊙ B) @ (λ_d ⊙ A) # ΔW ∈ ℝ^{m×n} +y ← (W + ΔW) @ x +``` + +## 4. DoRA -- magnitude/direction decomposition + +``` +# ── DoRA: decouple ‖W‖ from W/‖W‖ ── +# m ∈ ℝ^{m}: learned magnitude per output neuron +V ← W + α · B @ A # updated direction +V̂ ← V / ‖V‖_col # unit direction, .detach() +y ← (m · V̂) @ x # magnitude × direction +``` + +## 5.
DeLoRA -- bounded deviation + +``` +# ── DeLoRA: normalize rank-1 components, scale by λ ── +# A ∈ ℝ^{r×n}, B ∈ ℝ^{m×r}: learned +# λ ∈ ℝ: learned scalar strength (bounds ‖ΔW‖) +â_i ← ‖aᵢ‖, b̂_i ← ‖bᵢ‖ # per-component norms (row i of A, column i of B) +s_i ← (λ/r) / (â_i · b̂_i) # normalize each outer product +ΔW ← B · diag(s) · A # direction normalized, strength via λ +y ← (W + ΔW) @ x +``` + +## 6. PiSSA -- SVD initialization + +``` +# ── PiSSA: init adapters from top-r SVD of W ── +U, Σ, Vᵀ ← svd(W) # W ∈ ℝ^{m×n} +A ← U_{:,:r} · √Σ_{:r} # A ∈ ℝ^{m×r}, principal left +B ← √Σ_{:r} · Vᵀ_{:r,:} # B ∈ ℝ^{r×n}, principal right +W_res ← U_{:,r:} · Σ_{r:} · Vᵀ_{r:,:} # residual, frozen +# ── forward (identical to LoRA) ── +y ← (W_res + A @ B) @ x +``` + +## 7. SVFT -- SVD coefficient tuning + +``` +# ── SVFT: learn sparse coefficients over W's own singular vectors ── +U, Σ, Vᵀ ← svd(W) # frozen +# select k sparse (i,j) pairs; c ∈ ℝ^k learned +ΔW ← Σₜ cₜ · uᵢ vⱼᵀ # sparse combo of outer products +y ← (W + ΔW) @ x +``` + +## 8. SSVD -- asymmetric SVD rotation + +``` +# ── SSVD: rotate right singular vectors, preserve left ── +U, Σ, Vᵀ ← svd(W) # all frozen +K ← skew(θ) # K ∈ ℝ^{k×k}, learned antisymmetric +G ← (I - K)(I + K)⁻¹ # Cayley → G ∈ O(k) +Σ̂ ← Σ; Σ̂_{:k} += ΔΣ # shift top-k singular values (learned) +V̂ᵀ ← Vᵀ; V̂ᵀ_{:k} ← G @ Vᵀ_{:k} # rotate input-space vectors only +y ← U · diag(Σ̂) · V̂ᵀ @ x +# params: k(k-1)/2 + k +``` + +$$W' = U \, (\Sigma + \Delta\Sigma) \, G_k \, V^\top$$ + +## 9. IA3 -- activation scaling + +``` +# ── IA3: element-wise scaling of activations ── +# λ ∈ ℝ^d: learned, init=1 +y ← W @ (x ⊙ λ) # FFN: scale input channels +y ← (W @ x) ⊙ λ # attn K,V: scale output channels +``` + +## 10.
ROAD -- rotary adaptation + +``` +# ── ROAD: 2D rotation + magnitude per activation pair ── +# θ ∈ ℝ^{d/2}: learned rotation angles +# α ∈ ℝ^{d/2}: learned magnitudes +for i in range(d//2): + x̂[2i] ← α_i · (cos θ_i · x[2i] - sin θ_i · x[2i+1]) + x̂[2i+1] ← α_i · (sin θ_i · x[2i] + cos θ_i · x[2i+1]) +y ← W @ x̂ # α=1, θ=0 → identity +``` + +## 11. AntiPaSTO -- SVD Cayley steering + +``` +# ── AntiPaSTO: Cayley rotation of SVD singular vectors ── +U, Σ, Vᵀ ← svd(W) # frozen; computed once at init +K ← skew(θ) # θ ∈ ℝ^{k(k-1)/2}, learned +R ← (I - K)(I + K)⁻¹ # Cayley → R ∈ O(k) +# ── rotate both U and V by same R ── +Û ← U; Û_{:,:k} ← U_{:,:k} @ R # rotate output-space +V̂ᵀ ← Vᵀ; V̂ᵀ_{:k,:} ← R @ Vᵀ_{:k,:} # rotate input-space +W' ← Û · diag(Σ) · V̂ᵀ # reconstruct +y ← W' @ x +# antiparallel: negate θ → opposite behavioral direction +``` + +## 12. AdaLoRA -- adaptive SVD rank + +``` +# ── AdaLoRA: SVD-parameterized with importance pruning ── +ΔW ← P · diag(Λ) · Q # P ∈ ℝ^{m×r}, Q ∈ ℝ^{r×n} +# importance score per singular value: +s_i ← |Λ_i| + β · ‖pᵢ‖ · ‖qᵢ‖ # sensitivity-weighted +# prune: zero out components with lowest s_i per budget +mask ← topk(s, budget) +ΔW ← P · diag(Λ ⊙ mask) · Q +y ← (W + ΔW) @ x +``` + +## 13. BOFT -- butterfly orthogonal + +``` +# ── BOFT: butterfly-factorized O(d log d) orthogonal ── +# m butterfly factors B₁, ..., B_m each ∈ ℝ^{n/b × b × b} +# each Bⱼ is block-diagonal of small orthogonal matrices +R ← B_m @ ... @ B₂ @ B₁ # R ∈ O(d), O(d log d) params +y ← (W @ Rᵀ) @ x +``` + +## 14. GOFT -- Givens rotations + +``` +# ── GOFT: compose O(d) planar (Givens) rotations ── +R ← I +for (i,j,θ) in givens_pairs: # each θ learned + G ← I; G[i,i] ← cos θ; G[i,j] ← -sin θ + G[j,i] ← sin θ; G[j,j] ← cos θ + R ← G @ R # compose +y ← (W @ Rᵀ) @ x # O(d) params +``` + +## 15.
HRA -- Householder reflection + +``` +# ── HRA: chain of r Householder reflections = rank-r ∩ O(d) ── +R ← I +for i in range(r): + vᵢ ← learned # v ∈ ℝ^d + Hᵢ ← I - 2 · vᵢ vᵢᵀ / ‖vᵢ‖² # Householder reflection + R ← Hᵢ @ R +y ← (W @ Rᵀ) @ x +# bridges: rank-r perturbation ≡ r Householder reflections +``` + +## 16. RandLoRA -- full-rank via random bases + +``` +# ── RandLoRA: sum of scaled random rank-r bases ── +# A_i, B_i: frozen random matrices +# d_i, b_i ∈ ℝ^r: learned per-component scaling +ΔW ← Σᵢ (b_i ⊙ Bᵢ) @ (d_i ⊙ Aᵢ) # full rank possible +y ← (W + ΔW) @ x +``` + +## 17. FourierFT -- spectral coefficients + +``` +# ── FourierFT: sparse Fourier coefficients ── +# select k frequency indices; c ∈ ℝ^k learned +ΔW ← iFFT2(scatter(c, indices, shape=(m,n))) +y ← (W + ΔW) @ x +``` + +## 18. CLOVER -- joint SVD across attention pairs + +``` +# ── CLOVER: joint SVD over Q-K and V-O pairs per head ── +# ── init: decompose paired attention matrices ── +W_QK ← W_Q @ W_K.T # W_QK ∈ ℝ^{d×d}, combined Q-K +U_qk, S_qk, V_qk ← svd(W_QK) # per-head SVD +W_VO ← W_V @ W_O.T # W_VO ∈ ℝ^{d×d}, combined V-O +U_vo, S_vo, V_vo ← svd(W_VO) + +# ── forward: only S is learned, U/V frozen ── +# rewrite Q,K from shared orthogonal basis: +W_Q' ← U_qk @ diag(√S_qk) # Q uses left singular vectors +W_K' ← V_qk @ diag(√S_qk) # K uses right singular vectors +# fine-tune: learn ΔS_qk, ΔS_vo (full-rank update via all directions) +S_qk' ← S_qk + ΔS_qk # learned shifts +y ← attn(W_Q' @ x, W_K' @ x, ...) +# params: 2 × rank scalars per head (pruning: zero small S entries) +``` + +## 19. PSOFT -- principal subspace + Cayley + +``` +# ── PSOFT: PiSSA init + OFT rotation in principal subspace ── +U, Σ, Vᵀ ← svd(W) +# extract top-k subspace; W_res ← W - U_{:,:k} @ diag(Σ_{:k}) @ Vᵀ_{:k,:} (frozen residual) +# learn Cayley rotation R within that subspace only +# ~80% memory reduction vs GOFT at equal parameter count +R ← cayley(K) # K ∈ ℝ^{k×k}, skew-symmetric +W' ← U_{:,:k} @ R @ diag(Σ_{:k}) @ Vᵀ_{:k,:} + W_res +``` + +## 20.
ReFT -- activation intervention + +``` +# ── ReFT: intervene on hidden states at (layer, position) ── +# R ∈ ℝ^{r×d}: learned projection with orthonormal rows (low-rank subspace) +# W ∈ ℝ^{r×d}, b ∈ ℝ^r: learned linear map + bias giving target subspace values +h ← model.layer[l].output[pos] # hidden state at site +h_proj ← R @ h # current subspace coordinates +h_tgt ← W @ h + b # learned target coordinates +h ← h + Rᵀ @ (h_tgt - h_proj) # write back (preserve complement) +``` + +--- + +*Notation: `⊙` element-wise, `@` matmul, `←` assignment, `Σₜ` summation, `‖·‖` norm, `blkdiag` block diagonal, `skew` maps vector to skew-symmetric matrix, `cayley(K) = (I-K)(I+K)⁻¹` (the OFT block uses the inverse convention `(I+Q)(I-Q)⁻¹`; both yield orthogonal matrices).* diff --git a/adapters_vargdown.argdown b/adapters_vargdown.argdown new file mode 100644 index 0000000..e223d03 --- /dev/null +++ b/adapters_vargdown.argdown @@ -0,0 +1,468 @@ +=== +title: Adapters as Representational Hypotheses -- Which Geometric Priors About Transformer Internals Hold Under Intervention? +author: Compiled from 33 PEFT papers (2021--2025) +model: + mode: strict +=== + +// ══════════════════════════════════════════════════════════════════════ +// MAIN THESIS +// Each adapter constrains how you update pretrained weights. When one +// architecture outperforms another under controlled conditions, the +// winner's structural assumptions are supported as a better description +// of the weight manifold. This is a natural experiment running across +// hundreds of papers, and almost nobody reads it as science about +// representations. +// +// Pseudocode for each adapter lives in adapters_pseudocode.md +// Evidence files are in docs/ (frozen copies of papers as markdown) +// ══════════════════════════════════════════════════════════════════════ + +[Natural Manifold]: The model's own geometric structure -- its SVD basis + combined with orthogonal constraints and direction/magnitude decoupling -- + defines the natural intervention manifold for transformer weight adaptation.
+ Adapters exploiting this structure generalize better, converge faster, and + reveal causally relevant properties of internals. + + + + + + + + + + + + + - + - + - + +// ══════════════════════════════════════════════════════════════════════ +// THEME 1: SVD AS THE NATURAL COORDINATE SYSTEM +// Methods: PiSSA, SVFT, SSVD, CLOVER, AntiPaSTO, AdaLoRA, PSOFT +// ══════════════════════════════════════════════════════════════════════ + +# SVD Basis + + + +(1) [SVD Init Wins]: PiSSA's SVD initialization consistently outperforms + LoRA's random initialization across 11 models and 13 tasks, showing + the principal components carry the adaptation signal. #observation + [Meng et al. 2024](https://arxiv.org/abs/2404.02948) + [evidence](docs/pissa_principal_singular_values_adaptation.md#L1-L20) + > PiSSA shares the same architecture as LoRA, but initializes the adaptor matrices A and B with the principal components of the original matrix W, and put the remaining components into a residual matrix W res which is frozen during fine-tuning. Compared to LoRA, PiSSA updates the principal components while freezing the "residual" parts, allowing faster convergence and enhanced performance. Comparative experiments of PiSSA and LoRA across 11 different models, ranging from 184M to 70B, encompassing 5 NLG and 8 NLU tasks, reveal that **PiSSA consistently outperforms LoRA under identical experimental setups.** On the GSM8K benchmark, Gemma-7B fine-tuned with PiSSA achieves an accuracy of 77.7%, surpassing LoRA's 74.53% by 3.25%. + {reason: "NeurIPS 2024, 11 models up to 70B, controlled comparisons with identical architecture (same as LoRA)", credence: 0.88} +(2) [SVD Coefficients Suffice]: SVFT recovers 96% of full FT performance + by learning only sparse coefficients over the model's own singular + vectors -- no new directions needed. #observation + [Lingam et al. 
2024](https://arxiv.org/abs/2405.19597) + [evidence](docs/svft_svd_coefficient_finetuning.md#L1-L30) + > We propose SVFT, a simple approach that fundamentally differs from existing methods: the structure imposed on delta-W depends on the specific weight matrix W. Specifically, SVFT updates W as a sparse combination of outer products of its singular vectors, training only the coefficients (scales) of these sparse combinations. (...) Extensive experiments on language and vision benchmarks show that **SVFT recovers up to 96% of full fine-tuning performance while training only 0.006 to 0.25% of parameters, outperforming existing methods that only recover up to 85% performance** using 0.03 to 0.8% of the trainable parameter budget. + {reason: "preprint under review; strong controlled comparison across language and vision; 96% of full FT with 0.006% params is striking but unpublished", credence: 0.74} +(3) [Asymmetric SVD Works]: SSVD shows that selectively rotating input-space + (right) singular vectors while preserving output-space (left) vectors + matches LoRA with far fewer params, and a convergence hierarchy emerges + at scale: SSVD > PiSSA > DoRA > LoRA. #observation + [Wang, Watanabe, Van hamme 2025](https://arxiv.org/abs/2509.02830) + [evidence](docs/ssvd_structured_svd_finetuning.md#L1-L40) + > This work presents the first comprehensive integration and benchmarking of these PEFT methods within ESPnet. We further introduce structured SVD-guided (SSVD) fine-tuning, which selectively rotates input-associated right singular vectors while keeping output-associated vectors fixed to preserve semantic mappings. This design enables robust domain adaptation with minimal trainable parameters and improved efficiency. (...) **SSVD achieves comparable performance with significantly fewer trainable parameters and higher efficiency than LoRA and SoTA LoRA variants**, approaching fully fine-tuned model performance. 
+ {reason: "first comprehensive PEFT benchmark for speech across 0.1B-2B; convergence hierarchy consistent across scales", credence: 0.80} +---- +(4) [SVD Is Natural Basis]: The weight matrix's own SVD provides the right + coordinate system for adaptation -- 6 of the top 7 scoring methods use it. + {reason: "PiSSA beats random init, SVFT shows coefficients alone suffice, SSVD hierarchy holds across scales; convergent evidence from independent groups", inference: 0.85} + +> [Natural Manifold] + + + + +(1) [Cross-Layer Coupling]: CLOVER's joint SVD across Q-K and V-O attention + matrix pairs beats per-matrix LoRA by 7.6% on LLaMA-2-7B, validated + across 4 LLaMA models on 8 commonsense tasks. #observation + [Meng, Tang, Jiang, Zhang 2024](https://arxiv.org/abs/2411.17426) + [evidence](docs/clover_joint_svd.md#L258-L275) + > CLOVER consistently outperforms all other methods across all models and tasks. Specifically, on LLaMA 7B, CLOVER outperforms LoRA, DoRA, and PiSSA by 9%, 5.3%, and 1%, respectively. On LLaMA 13B, CLOVER outperforms these methods by 5.7%, 4.7%, and 0.8%. On LLaMA-2-7B, CLOVER surpasses LoRA, DoRA, HiRA, and PiSSA by 7.6%, 5.5%, 3.8%, and 0.7%. Even on LLaMA-3-8B, with fewer trainable parameters, CLOVER outperforms by 7.1%, 2.7%, 1.2%, and 0.6%. + {reason: "Under review (not yet published at venue); controlled comparison using DoRA's own hyperparams; 4 LLaMA variants x 8 tasks; beats PiSSA by 0.7% which is modest but consistent", credence: 0.72} +---- +(2) [Functional SVD Matters]: Attention matrices cooperate functionally (Q-K + define pattern, V-O define extraction), and joint decomposition captures + this coupling. + {reason: "mechanistic intuition is sound; Q and K jointly specify attention weights so their updates should be coordinated. CLOVER paper also shows 82% of features project outside PiSSA's subspace, but CLOVER covers all directions.
Single paper but multi-model validation.", inference: 0.72} + +> [Natural Manifold] + + + + +(1) [OFT in SVD Coords]: PSOFT combines PiSSA's SVD initialization with + OFT's Cayley rotation, operating in the principal subspace. Beats LoRA + on 35 NLP and CV tasks with ~80% memory reduction vs GOFT. #observation + [Wu et al. 2026](https://arxiv.org/abs/2505.11235) + [evidence](docs/psoft_principal_subspace_oft.md#L1-L50) + > PSOFT improves parameter and memory efficiency without compromising performance. Although GOFT and PSOFT have the same parameter counts, **PSOFT reduces memory usage by about 80% and avoids OOM issues.** It further achieves up to an 18x improvement in parameter efficiency over BOFT, OFTv2, and LoRA variants, attaining the best average performance across all baselines with the lowest memory footprint. + {reason: "ICLR 2026; 35 NLP+CV tasks across 4 models (DeBERTa, ViT, LLaMA-3.2-3B, LLaMA-3.1-8B); code available; bridges PiSSA and OFT with theoretical geometry guarantee", credence: 0.82} +---- +(2) [SVD Plus Orthogonal Synergy]: The best performing adapters combine + SVD basis selection with orthogonal constraints, suggesting both priors + are complementary. + {reason: "PSOFT + AntiPaSTO + SSVD all combine SVD with Cayley rotations and all score well; convergent design choices", inference: 0.75} + +> [Natural Manifold] + + + + +(1) [OOD Via SVD Rotation]: AntiPaSTO achieves out-of-distribution transfer + from template persona pairs to real moral dilemmas (6.9x prompting baseline) + using Cayley rotations of SVD singular vectors. #observation + [Clark 2025](https://arxiv.org/abs/2601.07473) + [evidence](docs/antipasto_antiparallel_steering.md#L1-L40) + > Trained on 800 such pairs, our method transfers to 1,360 unseen moral dilemmas where honesty conflicts with other values, achieving **6.9x the Steering F1 of prompting** on Gemma-3-1B.
We demonstrate two key advantages over prompting: OOD transfer (train on simple persona pairs, test on complex moral reasoning) and suppression bypass (steer when prompting triggers refusal). + {reason: "single-author, one trait (honesty), models up to 4B, high seed variance (std ~5-7); but OOD transfer is the strongest possible test of causal relevance", credence: 0.55} +(2) [SVD Causal Not Correlational]: If SVD rotation transfers OOD, the + geometric structure it exploits is causally relevant to behavior, + not merely correlated. + {reason: "OOD transfer is strong evidence of causality (rules out surface pattern matching), but narrow: 1 trait, 1 eval benchmark, 1 author. Independent replication would raise this considerably.", inference: 0.65} + +> [Natural Manifold] + + + + +(1) [LoRA Ceiling]: Biderman et al. 2024 find LoRA underperforms full FT + on harder tasks and larger scale, suggesting the low-rank assumption + holds for surface adaptation but breaks where deep restructuring + is needed. #observation + [Biderman et al. 2024](https://arxiv.org/abs/2405.09673) + [evidence](docs/biderman_lora_limitations.md#L1-L50) + > Our results show that, in the standard low-rank settings, **LoRA substantially underperforms full finetuning.** Nevertheless, LoRA better maintains the base model's performance on tasks outside the target domain. (...) Finally, we show that full finetuning learns perturbations with a rank that is **10-100x greater than typical LoRA configurations**, possibly explaining some of the reported gaps. + {reason: "TMLR 2024 (published, peer-reviewed); Llama-2-7B on code and math. 
In this local corpus, the paper is referenced in multiple docs (ROAD, DeLoRA, MiSS, CLOVER), mostly as supporting related-work evidence rather than direct replication.", credence: 0.80} +---- +(2) [Random Init Wastes Steps]: LoRA's random initialization works (universal + baseline) but converges slower than SVD-informed init, suggesting random + subspaces are not special -- just adequate. + {reason: "PiSSA shows 3.25% gain on GSM8K just from better init; VeRA shows random projections work at all; together: random is adequate, SVD is better", inference: 0.70} + +> [Natural Manifold] + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 2: ORTHOGONAL TRANSFORMATIONS PRESERVE STRUCTURE +// Methods: OFT, BOFT, GOFT, HRA, ETHER, OFTv2, ROAD +// ══════════════════════════════════════════════════════════════════════ + +# Orthogonality + + + +(1) [Hyperspherical Energy Preserved]: OFT provably preserves pairwise + neuron angles (hyperspherical energy) during adaptation, yielding + stable finetuning and strong data efficiency (converges with 5% + of training data on controllable generation). #observation + [Qiu et al. 2023](https://arxiv.org/abs/2306.07280) + [evidence](docs/oft_orthogonal_finetuning.md#L1-L40) + > We introduce a principled finetuning method -- Orthogonal Finetuning (OFT), for adapting text-to-image diffusion models to downstream tasks. Unlike existing methods, **OFT can provably preserve hyper-spherical energy which characterizes the pairwise neuron relationship on the unit hypersphere.** We find that this property is crucial for preserving the semantic generation ability of text-to-image diffusion models. (...) OFT is more sample-efficient in training and converges well with only 5% of the original dataset, while both ControlNet and LoRA cannot converge until 50% of the data is present. 
+ {reason: "NeurIPS 2023; provable preservation is rare among PEFT methods; data efficiency claim is strong but limited to vision generation", credence: 0.82} +(2) [Butterfly Orthogonal Scales]: BOFT extends OFT with butterfly factorization, + reducing parameters from O(d^2) to O(d log d) while matching or exceeding + OFT performance. Outperforms full FT on vision tasks. #observation + [Liu et al. 2023](https://arxiv.org/abs/2311.06243) + [evidence](docs/boft_butterfly_orthogonal.md#L1-L30) + > We propose an efficient orthogonal parameterization using butterfly structures. (...) By subsuming OFT as a special case, BOFT introduces a generalized orthogonal finetuning framework. (...) **BOFT outperforms current state-of-the-art methods by a considerable margin**, validating its superior parameter-efficiency and generalization ability. + {reason: "ICLR 2024; subsumes OFT theoretically; broad empirical validation across vision and language", credence: 0.80} +---- +(3) [Orthogonality Prevents Catastrophic Forgetting]: Orthogonal constraints + bound deviation from pretrained weights, preventing the catastrophic + overwriting that plagues unconstrained methods at high learning rates. + {reason: "OFT converges on 5% data, ETHER is LR-robust, BOFT beats full FT on vision; consistent signal that bounded deviation helps. But strict orthogonality prevents magnitude changes, limiting it on tasks needing rescaling.", inference: 0.78} + +> [Natural Manifold] + + + + +(1) [Minimal Orthogonal Params]: GOFT decomposes orthogonal transformations + into O(d) Givens rotations (planar 2D rotations), achieving the most + parameter-efficient orthogonal parameterization. #observation + [Ma et al. 2024](https://arxiv.org/abs/2404.04316) + [evidence](docs/goft_givens_orthogonal.md#L1-L44) + > We first use O(d) Givens rotations to accomplish arbitrary orthogonal transformation in SO(d) with provable equivalence, reducing parameter complexity from O(d^2) to O(d). 
Then we introduce flexible norm and relative angular adjustments under soft orthogonality regularization to enhance the adaptation capability of downstream semantic deviations. Extensive experiments on various tasks and pretrained models validate the effectiveness of our methods. + {reason: "ICML 2024; provable O(d) reduction from O(d^2); but advantage over BOFT is modest on benchmarks", credence: 0.70} +(2) [Orthogonal Equals Low-Rank]: HRA proves that a chain of r Householder + reflections is both orthogonal AND equivalent to a rank-r perturbation, + bridging the low-rank and orthogonal paradigms. #observation + [Yuan et al. 2024](https://arxiv.org/abs/2405.17484) + [evidence](docs/hra_householder_reflection.md#L1-L30) + > We bridge the gap between these two techniques, proposing a simple but effective adaptation method based on Householder reflections. Given a pre-trained model, our method fine-tunes its layers by multiplying each frozen weight matrix with an orthogonal matrix constructed by a chain of learnable Householder reflections (HRs). This HR-based orthogonal fine-tuning is equivalent to an adaptive low-rank adaptation. Moreover, we show that the orthogonality of the reflection planes corresponding to the HRs impacts the model capacity and regularity. + {reason: "theoretical bridge resolving 'low-rank vs orthogonal' dichotomy; merged into HuggingFace PEFT; empirical results competitive but not dominant", credence: 0.72} +---- +(3) [Low-Rank Orthogonal Sweet Spot]: The effective adaptation is simultaneously + low-rank and approximately orthogonal -- small rotations on the Stiefel manifold. + {reason: "HRA's theoretical bridge is convincing; GOFT's practical efficiency confirms orthogonal methods can be made lean. 
But 'low-rank orthogonal' as the true operating point needs more direct testing.", inference: 0.65} + +> [Natural Manifold] + + + + +(1) [Fixed-Strength Rotations Work]: ETHER demonstrates that bounded orthogonal + transformations (close to identity) achieve competitive adaptation while + preventing catastrophic forgetting, with high learning-rate robustness. #observation + [Bini, Girrbach, Akata 2024](https://arxiv.org/abs/2405.20271) + [evidence](docs/ether_orthogonal_steering.md#L1-L30) + > We propose the ETHER transformation family, which performs Efficient fineTuning via HypErplane Reflections. By design, ETHER transformations require a minimal number of parameters, are less likely to deteriorate model performance, and exhibit robustness to hyperparameter and learning rate choices. In particular, we introduce ETHER and its relaxation ETHER+, which match or outperform existing PEFT methods with significantly fewer parameters (~10-100 times lower than LoRA or OFT) across multiple image synthesis and natural language tasks without exhaustive hyperparameter tuning. + {reason: "foundational for DeLoRA which won ICLR 2025; the bounded-deviation principle is well-validated; 10-100x fewer params than LoRA/OFT", credence: 0.72} +---- +(2) [Minimal Intervention Often Suffices]: For behavioral steering, bounded + rotations suffice; for complex task adaptation, they hit a ceiling. + {reason: "ETHER works for many tasks but DeLoRA's extension (unbounding magnitude) does better, confirming the ceiling", inference: 0.70} + +> [Orthogonal Preservation] + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 3: DIRECTION/MAGNITUDE DECOUPLING +// Methods: DoRA, DeLoRA, ROAD, AntiPaSTO, SSVD +// ══════════════════════════════════════════════════════════════════════ + +# Decoupling + + + +(1) [FT Decouples Naturally]: DoRA's weight decomposition analysis reveals + that full fine-tuning exhibits distinct magnitude vs. 
direction update + patterns that LoRA conflates. QDoRA slightly outperforms full FT. #observation + [Liu et al. 2024](https://arxiv.org/abs/2402.09353) + [evidence](docs/dora_weight_decomposed_lora.md#L1-L40) + > Drawing on Weight Normalization, which achieves faster convergence via improving the conditioning of the gradient with weight reparameterization, we introduce a novel weight decomposition analysis that initially reparameterizes model weights into magnitude and directional components, subsequently examining the changes in magnitude and direction introduced by LoRA and FT. **Our analysis reveals that LoRA and FT exhibit markedly distinct patterns of updates**, leading us to surmise that these variations mirror the learning capability of each method. (...) DoRA consistently outperforms LoRA on fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as commonsense reasoning, visual instruction tuning, and image/video-text understanding. + {reason: "ICML 2024 (NVIDIA); thorough analysis + consistent improvements across 3 model families; widely adopted as LoRA replacement", credence: 0.85} +(2) [Bounded Deviation Robust]: DeLoRA normalizes each rank-1 component and + introduces scalar lambda controlling distance bound, yielding robustness + to LR and training duration where LoRA fails. #observation + [Bini, Girrbach, Akata 2025](https://arxiv.org/abs/2503.18225) + [evidence](docs/delora_decoupled_low_rank_adaptation.md#L1-L30) + > We propose Decoupled Low-rank Adaptation (DeLoRA), a novel finetuning method that normalizes and scales learnable low-rank matrices. 
**By bounding the distance of the transformation, DeLoRA effectively decouples the angular learning from the adaptation strength, enhancing robustness without compromising performance.** Through evaluations on subject-driven image generation, natural language understanding, and instruction tuning, we show that DeLoRA matches or surpasses performance of competing PEFT methods, while exhibiting stronger robustness. + {reason: "ICLR 2025; same group as ETHER so a coherent research line; robustness claim well-supported by ablations", credence: 0.82} +---- +(3) [Decoupling Is Sound Engineering]: Separating what-to-change (direction) + from how-much (magnitude) consistently improves training stability and + often improves final performance. + {reason: "DoRA beats LoRA everywhere, DeLoRA adds robustness, ROAD provides cleanest per-pair formulation; 3 independent groups converge on this. Whether it's fundamental or just good optimization is unclear.", inference: 0.80} + +> [Natural Manifold] + + + + +(1) [Angle Plus Magnitude Per Pair]: ROAD parameterizes adaptation as learned + rotation angle theta plus magnitude alpha per activation pair, the cleanest + direction/strength separation. 
#observation + [Liao, Monz 2024](https://arxiv.org/abs/2409.00119) + [evidence](docs/road_rotary_adaptation.md#L1-L30) + > We introduce a novel method, RoAd, which employs a straightforward 2D rotation to adapt LLMs and addresses all the above challenges: (1) RoAd is remarkably parameter-efficient, delivering optimal performance on GLUE, eight commonsense reasoning tasks and four arithmetic reasoning tasks with < 0.1% trainable parameters; (2) RoAd facilitates the efficient serving of requests requiring different adapters within a batch, with an overhead comparable to element-wise multiplication instead of batch matrix multiplication; (3) RoAd enhances LLM's interpretability through integration within a framework of distributed interchange intervention, demonstrated via composition experiments. + {reason: "clean formulation connecting to RoPE; supports mixed-adapter batching which is unique; limited vision/generation benchmarks", credence: 0.62} +---- +(2) [Continuous Steering Knob]: The explicit magnitude parameter provides + a continuous interpolation between identity and full adaptation, ideal + for controllable steering. + {reason: "mathematically clean; alpha as continuous knob is the most interpretable version of direction/strength decoupling", inference: 0.65} + +> [Direction Magnitude Decoupling] + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 4: GAIN CONTROL -- ADAPTATION AS RESCALING +// Methods: IA3, VeRA, LN Tuning +// Evidence AGAINST the full thesis: if mere rescaling works, the +// geometric structure may not be needed. +// ══════════════════════════════════════════════════════════════════════ + +# Gain Control + + + +(1) [Scaling Vectors Competitive]: IA3 achieves competitive adaptation using + only d-dimensional scaling vectors at key, value, and FFN positions -- + no new weight directions needed. T0-3B with IA3 outperforms GPT-3 175B + with ICL. #observation + [Liu et al. 
2022](https://arxiv.org/abs/2205.05638) + [evidence](docs/ia3_few_shot_peft.md#L1-L30) + > We introduce (IA)3 that scales activations by learned vectors, attaining stronger performance while only introducing a relatively tiny amount of new parameters. (...) We validate the effectiveness of T-Few on completely unseen tasks by applying it to the RAFT benchmark, **attaining super-human performance for the first time and outperforming the state-of-the-art by 6% absolute.** + {reason: "strong result on RAFT; super-human is notable; but T5-family specific and scaling-only has known expressivity ceiling", credence: 0.72} +(2) [Random Projections Plus Scaling]: VeRA matches LoRA with 10x fewer + trainable parameters using only per-layer scaling of shared frozen + random matrices. #observation + [Kopiczko et al. 2023](https://arxiv.org/abs/2310.11454) + [evidence](docs/vera_vector_random_matrix_adaptation.md#L1-L30) + > We present Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the number of trainable parameters compared to LoRA, yet maintains the same performance. **It achieves this by using a single pair of low-rank matrices shared across all layers and learning small scaling vectors instead.** + {reason: "ICLR 2024; surprising result that random projections suffice with per-layer gating; JL-lemma argument is plausible", credence: 0.75} +---- +(3) [Rescaling Gets 60-80%]: A surprisingly large fraction of task adaptation + is reweighting existing features, suggesting the model already computes + the right features -- the bottleneck is channel selection. + {reason: "IA3 and VeRA perform well on standard benchmarks but both hit expressivity ceilings on hard tasks. This means rescaling handles easy adaptation but not deep restructuring.", inference: 0.45} + -> [Natural Manifold] + +// The contra is weak (0.45) because gain control works on easy tasks +// but fails precisely where geometric structure matters most. 
The +// ceiling IS the evidence for the manifold hypothesis. + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 5: RANK IS SECONDARY TO BASIS CHOICE +// Methods: LoRA, RandLoRA, C3A, FourierFT, LoHa, LoKr +// ══════════════════════════════════════════════════════════════════════ + +# Rank Debate + + + +(1) [Full Rank Helps Hard Tasks]: RandLoRA achieves full-rank updates by + summing scaled random rank-r bases, and its loss landscape is closer + to full FT's than LoRA's. #observation + [Albert et al. 2025](https://arxiv.org/abs/2502.00987) + [evidence](docs/randlora_random_matrix.md#L1-L30) + > This paper aims to answer this question by introducing RandLoRA, a parameter-efficient method that performs full-rank updates using a learned linear combinations of low-rank, non-trainable random matrices. (...) Our findings reveal that full-rank updates are beneficial across vision and language tasks individually, and even more so for vision-language tasks, where RandLoRA significantly reduces -- and sometimes eliminates -- the performance gap between standard fine-tuning and LoRA, demonstrating its efficacy. + {reason: "ICLR 2025; eliminates FT-LoRA gap on vision-language; loss landscape analysis compelling; improvement task-dependent", credence: 0.72} +(2) [Spectral Sparsity]: FourierFT achieves comparable results via sparse + Fourier coefficients, suggesting weight updates are spectrally smooth. #observation + [Gao et al. 2024](https://arxiv.org/abs/2405.03003) + [evidence](docs/fourierft_spectral.md#L1-L30) + > We introduce FourierFT, which treats DeltaW as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover DeltaW. 
Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M. + {reason: "ICML 2024; 500x fewer params than LoRA on LLaMA2 instruction tuning; spectral sparsity is structurally different from SVD", credence: 0.68} +---- +(3) [Basis Matters More Than Rank]: PiSSA's well-chosen rank-r subspace + outperforms RandLoRA's full-rank random subspace, suggesting which + directions you update matters more than how many. + {reason: "PiSSA at rank r beats LoRA at same rank; RandLoRA at full rank sometimes beats PiSSA but not always. The basis selection (SVD vs random) appears to be the primary factor.", inference: 0.40} + -> [Natural Manifold] + +// Weak contra (0.40): the rank debate slightly undermines the thesis +// because if full rank sometimes wins, the SVD manifold might not +// capture everything. But basis choice dominates rank choice. + + + + +(1) [Circulant Structure]: C3A achieves full-rank updates via circulant + matrices (FFT-computable), competitive on GLUE and instruction tuning. #observation + [Chen et al. 2025](https://arxiv.org/abs/2407.19342) + [evidence](docs/c3a_circular_convolution.md#L1-L30) + > We introduce Circular Convolution Adaptation (C3A), which not only achieves high-rank adaptation with enhanced performance but also excels in both computational power and memory utilization. **Extensive experiments demonstrate that C3A consistently outperforms LoRA and its variants across various fine-tuning tasks.** Our code is available at Hugging Face PEFT. 
+ {reason: "ACL 2025; merged into HuggingFace PEFT; circulant assumption is a strong inductive bias; competitive but not dominant on all benchmarks", credence: 0.62} +(2) [Hadamard Products]: LoHa's element-wise product of two low-rank + decompositions captures pairwise feature interactions that additive + low-rank misses. #observation + [Nam, Moon, Oh 2022](https://arxiv.org/abs/2108.06098) + [evidence](docs/loha_hadamard_product.md#L1-L30) + > We propose a communication-efficient parameterization, FedPara, which re-parameterizes weight parameters of layers using low-rank weights followed by the Hadamard product. **Compared to the conventional low-rank parameterization, our FedPara method is not restricted to low-rank constraints, and thereby it has a far larger capacity.** This property enables to achieve comparable performance while requiring 3 to 10 times lower communication costs than the model with the original layers. + {reason: "ICLR 2022; effective for vision/diffusion via LyCORIS ecosystem; Hadamard product overcomes low-rank bottleneck; niche for LLMs", credence: 0.55} +(3) [Kronecker Products]: LoKr's tensor product structure enables compact + adaptation across convolutional and linear layers, with systematic + evaluation across fine-tuning methodologies. #observation + [Yeh et al. 2024](https://arxiv.org/abs/2309.14859) + [evidence](docs/lokr_lycor.md#L1-L30) + > This paper introduces LyCORIS (Lora beYond Conventional methods, Other Rank adaptation Implementations for Stable diffusion), an open-source library that offers a wide selection of fine-tuning methodologies for Stable Diffusion. Furthermore, we present **a thorough framework for the systematic assessment of varied fine-tuning techniques.** This framework employs a diverse suite of metrics and delves into multiple facets of fine-tuning. 
+ {reason: "systematic LyCORIS benchmark; Kronecker/spatial structure assumption less natural for attention weights; mainly validated on diffusion models", credence: 0.50} +---- +(4) [Multiple Structures Work]: Multiple non-SVD parameterizations (circulant, + Hadamard, Kronecker, Fourier) achieve competitive results, suggesting + SVD is not the only viable basis. + {reason: "variety of working approaches, but none consistently beats SVD methods on controlled benchmarks; SVD has strongest cross-domain evidence", inference: 0.50} + -> [Natural Manifold] + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 6: FUNCTIONAL ARCHITECTURE +// Methods: CLOVER, ReFT, X-LoRA, Prompt Tuning +// ══════════════════════════════════════════════════════════════════════ + +# Functional Architecture + + + +(1) [Activation Intervention]: ReFT achieves 15-65x more parameter efficiency + than LoRA by intervening on hidden states at specific (layer, position) + sites rather than on weights at all. #observation + [Wu, Arora et al. 2024](https://arxiv.org/abs/2404.03592) + [evidence](docs/reft_representation_finetuning.md#L1-L40) + > ReFT methods operate on a frozen base model and learn task-specific interventions on hidden representations. We define a strong instance of the ReFT family, Low-rank Linear Subspace ReFT (LoReFT). (...) **LoReFT uses 15x-65x times fewer parameters while achieving state-of-the-art performance on commonsense reasoning, instruction-following, and natural language understanding** against the strongest PEFTs. + {reason: "Stanford NLP; widely cited; strong on instruction following and math. Fundamentally different paradigm (activation vs weight) so comparison not apples-to-apples", credence: 0.72} +(2) [Compositional Routing]: X-LoRA dynamically routes through expert LoRA + adapters, achieving better composite task performance than individual + adapters. 
#assumption + {reason: "compositional multi-task; routing adds overhead; more suited to deployment than single-task science", credence: 0.50} +---- +(3) [Beyond Individual Matrices]: Respecting the model's functional architecture + (which matrices cooperate, where activations matter) provides additional + efficiency gains beyond weight-matrix-level priors. + {reason: "CLOVER (joint SVD) beats per-matrix methods; ReFT (activation intervention) is 15-65x more efficient; both exploit structure LoRA ignores. But evidence is early -- 1-2 papers each.", inference: 0.60} + +> [Natural Manifold] + + +// ══════════════════════════════════════════════════════════════════════ +// THEME 7: SHARED STRUCTURE AND COMPRESSION +// Methods: MiSS, VBLoRA, SHiRA, AdaLoRA +// ══════════════════════════════════════════════════════════════════════ + +# Compression + + + +(1) [Adaptive Rank Allocation]: AdaLoRA's SVD-parameterized pruning shows + that different layers need different ranks -- query/value projections + and early/late layers are most important. #observation + [Zhang et al. 2023](https://arxiv.org/abs/2303.10512) + [evidence](docs/adalora_adaptive_budget.md#L1-L30) + > We propose AdaLoRA, which adaptively allocates the parameter budget among weight matrices according to their importance score. In particular, AdaLoRA parameterizes the incremental updates in the form of singular value decomposition. Such a novel approach allows us to effectively prune the singular values of unimportant updates. (...) **Results demonstrate that AdaLoRA manifests notable improvement over baselines, especially in the low budget settings.** + {reason: "ICLR 2023; validates SVD parameterization and adaptive rank; modest gains over LoRA add complexity", credence: 0.70} +(2) [Vector Bank Compression]: VBLoRA uses 0.4% of LoRA's parameters via + shared atomic vectors, suggesting adapter diversity across layers is + much lower than assumed. 
#observation + [Li, Han, Ji 2024](https://arxiv.org/abs/2405.15179) + [evidence](docs/vblora_vector_bank.md#L1-L30) + > Our proposed VB-LoRA composites all the low-rank matrices of LoRA from a shared vector bank with a differentiable top-k admixture module. (...) When fine-tuning the Llama2-13B model, **VB-LoRA only uses 0.4% of LoRA's stored parameters, yet achieves superior results.** This method has been merged into the Hugging Face PEFT package. + {reason: "NeurIPS 2024; merged into HuggingFace PEFT; extreme compression is remarkable; implies shared structure across layers", credence: 0.72} +(3) [Sparse High-Rank]: SHiRA directly fine-tunes 1-2% of base weights + selected by importance, achieving full rank without low-rank + bottleneck. #observation + [Bhardwaj et al. 2024](https://arxiv.org/abs/2406.13175) + [evidence](docs/shira_sparse_high_rank.md#L1-L40) + > We propose SHiRA, a new high rank adapter paradigm to demonstrate that **changing as few as 1-2% parameters of the original network is sufficient for adaptation.** (...) We conduct extensive experiments on LLMs (LLaMA-7B, LLaMAv2-7B) and LVMs (Stable Diffusion, SDXL) where we demonstrate that SHiRA significantly outperforms LoRA on both single- and multi-adapter tasks. On LLMs, we show that SHiRA achieves up to 2.7% better accuracy than LoRA on commonsense reasoning. + {reason: "NeurIPS 2024; Qualcomm; multi-adapter fusion is unique angle; extensive LLM+LVM experiments; challenges low-rank assumption", credence: 0.62} +---- +(4) [Compression Signals Shared Structure]: The extreme compressibility of + adapter weights (VBLoRA 0.4%, SVFT 0.006%) suggests weight updates + share structure that priors can exploit. 
+ {reason: "multiple methods achieve extreme compression suggesting high redundancy in naive parameterizations; supports the thesis that the right basis captures this shared structure", inference: 0.60} + +> [Natural Manifold] + + +// ══════════════════════════════════════════════════════════════════════ +// ADDITIONAL METHODS (Prompt Tuning, LN Tuning, Bone, Trainable Tokens) +// These are boundary cases that don't strongly support or oppose the thesis +// ══════════════════════════════════════════════════════════════════════ + +# Boundary Cases + + + +(1) [Input Not Weights]: Prompt/prefix tuning modifies the input embedding + space rather than weight matrices. At T5-XXL scale, it matches full FT + with 0.01% params. #assumption + {reason: "scales with model size; conceptually different from weight adaptation; when it works, it suggests the model already has the capability", credence: 0.65} +---- +(2) [Capability Already Present]: When prompt tuning works, it reveals that + the target task is within the model's existing computational capacity -- + no weight restructuring needed. + {reason: "informative boundary: prompt tuning success = no intervention needed; prompt tuning failure = genuine capability gap requiring weight changes", inference: 0.55} + +> [Natural Manifold] + + +// ══════════════════════════════════════════════════════════════════════ +// CONTRA: LINEARITY ASSUMPTION +// ══════════════════════════════════════════════════════════════════════ + + + +(1) [Nonlinear Structure Missed]: SVD is a linear decomposition of a nonlinear + system. It captures dominant linear subspaces but may miss nonlinear + interactions that matter for complex tasks. #assumption + {reason: "fundamental theoretical limitation; no PEFT method addresses this; but empirically SVD methods still win", credence: 0.75} +(2) [Checkpoint Specific]: SVD basis is model-specific and not transferable + across checkpoints without recomputation. 
#assumption + {reason: "practical limitation; PiSSA notes fast SVD takes only seconds, so cost is low", credence: 0.60} +---- +(3) [Linearity Might Not Matter]: For the adaptations tested so far, linear + subspaces suffice. The nonlinear structure may only matter for + capabilities not yet probed. + {reason: "strong practical success despite theoretical limitation; but we haven't tested on sufficiently hard capability changes", inference: 0.45} + -> [Natural Manifold] diff --git a/docs/biderman_lora_limitations.md b/docs/biderman_lora_limitations.md new file mode 100644 index 0000000..ac49935 --- /dev/null +++ b/docs/biderman_lora_limitations.md @@ -0,0 +1,689 @@ +Title: 2405.09673v2.pdf + +URL Source: https://arxiv.org/pdf/2405.09673 + +Published Time: Tue, 24 Sep 2024 00:17:25 GMT + +Number of Pages: 39 + +Markdown Content: +Published in Transactions on Machine Learning Research (08/2024) + +# LoRA Learns Less and Forgets Less + +Dan Biderman 1,2, Jacob Portes 2, Jose Javier Gonzalez Ortiz 2, Mansheej Paul 2, Philip Greengard 1, Connor Jennings 2, Daniel King 2, Sam Havens 2, Vitaliy Chiley 2, Jonathan Frankle 2,Cody Blakeney 2, John P. Cunningham 11Columbia University {db3236, pg2118, jpc2181}@columbia.edu + +> 2 + +Databricks Mosaic Research {jacob.portes, j.gonzalez, mansheej.paul, connor.jennings, daniel.king, sam.havens, vitaliy.chiley, jfrankle, cody.blakeney}@databricks.com + +Reviewed on OpenReview: https://openreview.net/forum?id=aloEru2qCG + +# Abstract + +Low-Rank Adaptation (LoRA) is a widely-used parameter-efficient finetuning method for large language models. LoRA saves memory by training only low rank perturbations to selected weight matrices. In this work, we compare the performance of LoRA and full finetuning on two target domains, programming and mathematics. We consider both the instruction finetuning ( ≈100K prompt-response pairs) and continued pretraining ( ≈20B unstructured tokens) data regimes. 
Our results show that, in the standard low-rank settings, LoRA substantially underperforms full finetuning. Nevertheless, LoRA better maintains the base model’s performance on tasks outside the target domain. We show that LoRA mitigates forgetting more than common regularization techniques such as weight decay and dropout; it also helps maintain more diverse generations. Finally, we show that full finetuning learns perturbations with a rank that is 10-100 × greater than typical LoRA configurations, possibly explaining some of the reported gaps. We conclude by proposing best practices for finetuning with LoRA. + +# 1 Introduction + +Finetuning large language models (LLMs) with billions of weights requires a non-trivial amount of GPU memory. Parameter-efficient finetuning methods reduce the memory footprint during training by freezing a pretrained LLM and only training a small number of additional parameters, often called adapters. Low-Rank Adaptation (LoRA; Hu et al. (2021)) trains adapters that are low-rank perturbations to selected weight matrices. LoRA is widely adopted for finetuning LLMs under hardware constraints, but the jury is still out on whether it compromises performance compared to full finetuning. The two seminal methods papers on the topic, which introduce LoRA (Hu et al., 2021) and its more recent combination with model quantization (QLoRA; Dettmers et al. (2024)), reported that LoRA performs better or equivalent to full finetuning. More empirical work (Ghosh et al., 2024; Zhao et al., 2024b) reaches a similar conclusion; this sentiment is echoed in an array of industry blog posts as well (e.g., Raschka (2023); Niederfahrenhorst et al. (2023)). 
At the same time, there is evidence that LoRA underperforms full finetuning (Ivison et al., 2023; Zhuo et al., 2024), and the need to improve upon LoRA has led to the development of enhanced LoRA variants (Hayou et al., 2024; Meng et al., 2024; Li et al., 2023b; Shi et al., 2024) or alternative low-rank approximation methods (e.g., Liu et al. (2024); Zhao et al. (2024a)). To shed light on this ongoing debate, we ask: under which conditions does LoRA approximate full finetuning accuracy on challenging target domains, such as code and math? + +By training fewer parameters, LoRA is hypothesized to constrain the finetuned model from diverging significantly from the base model (Sun et al., 2023; Du et al., 2024). This potential characteristic is particularly helpful for LLM finetuning, a form of continual learning where specializing in new domains can come at the expense of base model capabilities (Wang et al., 2024) (a phenomenon known in its extreme form 1 + +> arXiv:2405.09673v2 [cs.LG] 20 Sep 2024 + +Published in Transactions on Machine Learning Research (08/2024) as “catastrophic forgetting”; McCloskey & Cohen (1989); French (1999)). To date, only a few studies have examined forgetting in modern LLMs (Vu et al., 2022; Kleiman et al., 2023; Kalajdzievski, 2024). To address this gap, we also ask: when performing continual learning on a new domain, to what extent does LoRA mitigate forgetting of base model capabilities? + +In this study, we compare LoRA and full finetuning for Llama-2-7B models across two challenging target domains, code and mathematics. Within each domain, we explore two training regimes. The first regime is + +continued pretraining, which involves training on billions of unlabeled domain-specific tokens, most commonly via full finetuning; here we use the StarCoder-Python (Li et al., 2023a) and OpenWebMath (Paster et al., 2023) datasets (Table 1).
The second is instruction finetuning, the common scenario for LoRA involving question-answer datasets with tens to hundreds of millions of tokens. Here, we use Magicoder-Evol-Instruct-110K (Wei et al., 2023) and MetaMathQA (Yu et al., 2023). We evaluate target-domain performance (henceforth, learning) via challenging coding and math benchmarks (HumanEval; Chen et al. (2021), and GSM8K; Cobbe et al. (2021)). We evaluate source-domain forgetting + +performance on language understanding, world knowledge, and common-sense reasoning tasks (Zellers et al., 2019; Sakaguchi et al., 2019; Clark et al., 2018). We find that with commonly used low-rank settings, LoRA substantially underperforms full finetuning, while typically requiring longer training (Sec. 4.1). In continued pretraining, the performance gap between full finetuning and LoRA is not closed even with high ranks. In instruction finetuning, on the other hand, high ranks can match full finetuning performance. Despite LoRA’s limitations, we show that it consistently maintains better source-domain performance compared to full finetuning (Sec. 4.2). Furthermore, we characterize the tradeoff between learning and forgetting (Sec. 4.3). We then show that LoRA – even with higher rank – mitigates forgetting more aggressively than classic regularization techniques that aim to prevent overfitting, such as dropout (Srivastava et al., 2014; Goodfellow et al., 2013), and weight decay (Goodfellow et al., 2016). Moreover, by analyzing the generated solutions to HumanEval problems, we demonstrate that while full finetuning tends to produce a limited set of solutions, LoRA produces a wider range of solutions more akin to those of the base model (Sun et al., 2023; Du et al., 2024). Why does LoRA underperform full finetuning? LoRA was originally motivated in part by the hypothesis that finetuning results in low-rank perturbations to the base model’s weight matrix (Li et al., 2018; Aghajanyan et al., 2020; Hu et al., 2021).
However, the tasks explored by these prior works are relatively easy for modern LLMs, and certainly easier than the coding and math domains studied here. Thus, we perform a singular value decomposition to show that full finetuning barely changes the spectrum of the base model’s weight matrices, and yet the difference between the two (i.e., the perturbation) is high rank. The rank of the perturbation grows as training progresses, with ranks 10-100× higher than typical LoRA configurations (Figure 6). We conclude by proposing best practices for training models with LoRA. We find that LoRA is very sensitive to hyperparameters, including learning rates, choice of target modules, ranks, and scaling factors; setting these properly is a prerequisite to approach full finetuning performance. To summarize, we contribute the following results: + +• Full finetuning is more accurate and sample-efficient than LoRA in continued pretraining (CPT) for code and math; in instruction finetuning (IFT), higher ranks can close most of the gaps (Sec. 4.1). + +• LoRA forgets less of the source domain (Sec. 4.2 and 4.3). + +• LoRA forgets less than common regularization techniques; it also helps maintain the diversity of generations (Sec. 4.5). + +• Full finetuning finds high rank weight perturbations (Sec. 4.6). + +• A hyperparameter sensitivity analysis for LoRA, as well as practical recommendations (Sec. 4.7).
Model checkpoints and LoRA adapters can be accessed at https://github.com/danbider/lora-tradeoffs . 2 Published in Transactions on Machine Learning Research (08/2024) Code Math CPT StarCoder-Python (up to 20B tokens) OpenWebMath (14.7B tokens) IFT Magicoder-Evol-Instruct-110K (72.97M tokens) MetaMathQA (103M tokens) Table 1: Datasets and token counts for math and code experiments + +# 2 Background + +LoRA involves freezing a pretrained weight matrix $W_{\text{pretrained}} \in \mathbb{R}^{d \times k}$, and learning only a low-rank perturbation to it, denoted here as $\Delta$, as follows: + +$W_{\text{finetuned}} = W_{\text{pretrained}} + \Delta, \quad \Delta = \gamma_r AB, \quad A \in \mathbb{R}^{d \times r}, \; B \in \mathbb{R}^{r \times k}.$ + +Most common implementations initialize $A_0 \sim \mathcal{N}(0, 1)$, $B_0 = 0$ and set the scalar $\gamma_r = \alpha / r$ with a controllable hyperparameter $\alpha$. The user chooses which $W_{\text{pretrained}}$ to adapt (“target modules”), the rank $r \ll d, k$, and the hyperparameter $\alpha$. By doing so, only $d \times r + r \times k$ parameters are trained per module instead of $d \times k$, which reduces the memory and FLOPS required for computing the gradient. As an example, applying a + +$r = 16$ LoRA adapter to a 7B weight matrix with $d = k = 4096$ trains $< 1\%$ of the original parameter count. Appendix Sec. H lays out the approximate memory savings by LoRA during training. LoRA’s introduction and first applications targeted only the $W_q$ and $W_v$ matrices in the self-attention module (Hu et al., 2021). Since then, it has become best practice to target all transformer modules (Raschka, 2023; Dettmers et al., 2024), i.e., $\{W_q^{(l)}, W_k^{(l)}, W_v^{(l)}, W_o^{(l)}\}_{l=1}^{L}$ in the self-attention modules, and + +$\{W_{\text{gate}}^{(l)}, W_{\text{up}}^{(l)}, W_{\text{down}}^{(l)}\}_{l=1}^{L}$ in the feedforward modules for $L$ layers in, say, a Llama architecture (Hu et al., 2021; Touvron et al., 2023). + +# 3 Experimental Setup + +We train on code and math datasets that have been shown to increase downstream performance. We motivate the training datasets and evaluation benchmarks below.
All training was done using the Databricks MosaicML + +composer 1, streaming 2, and llm-foundry 3 repositories, as well as the HuggingFace peft library. + +3.1 Datasets for Continued Pretraining (CPT) and Instruction Finetuning (IFT) + +Coding CPT - StarCoder-Python (Li et al., 2023a) This dataset consists of permissively licensed repositories from GitHub, including Git commits, in 80+ programming languages. We chose the Python subset and sub-sampled it to 20B tokens. + +Math CPT - OpenWebMath (Paster et al., 2023) This dataset contains 14.7B tokens derived from mathematical web pages from Common Crawl, correctly formatted to preserve mathematical content such as LaTeX equations. 4 To match with the StarCoder-Python dataset, we trained on up to 20B tokens, repeating tokens beyond the first 14.7B. An analysis of this dataset shows that it contains a considerable amount of full English sentences. 5 + +Coding IFT - Magicoder-Evol-Instruct-110K (Wei et al., 2023) This dataset contains 72.97M tokens of programming questions and answers. It reproduces the “Evol-Instruct” dataset of WizardCoder (Luo et al., + +> 1 https://github.com/mosaicml/composer +> 2 https://github.com/mosaicml/streaming +> 3 https://github.com/mosaicml/llm-foundry +> 4 https://huggingface.co/datasets/open-web-math/open-web-math +> 5 Out of a random selection of 100K examples, a regex search shows that 75% of the examples contain LaTeX. The data is classified as 99.7% English and “overwhelmingly English” by the langdetect and fasttext tools. + +3 Published in Transactions on Machine Learning Research (08/2024) 2023b) by iteratively prompting an LLM (GPT-4) to increase the difficulty of a set of question-answer pairs from Code Alpaca (Chaudhary, 2023). 6 + +Math IFT - MetaMathQA (Yu et al., 2023) This dataset was built by bootstrapping mathematical word problems from the training sets of GSM8K (Cobbe et al., 2021) and MATH (Hendrycks et al., 2021) by rewriting the questions with variations using GPT-3.5.
This dataset contains 395K question-answer pairs and roughly 103M tokens. 7 + +We quantify learning and forgetting via benchmarks reported on the Open LLM Leaderboard 8 for state-of-the-art open-source LLMs such as Llama (Touvron et al., 2023). + +3.2 Measuring Learning with Coding and Math Benchmarks (target domain evaluation) + +Coding - HumanEval (Chen et al., 2021) This benchmark contains 164 problems that involve generating a Python program given a docstring and a function signature. A generation is considered correct if it passes all supplied unit tests. We use the Code Generation LM Evaluation Harness (Ben Allal et al., 2022) configured to output 50 generations per problem, and calculate “pass@1” with softmax temperature=0.2 and top_p=0.95 for 0-shot HumanEval. + +Math - GSM8K (Cobbe et al., 2021) This benchmark includes a collection of 8.5K grade-school math word problems. We evaluate on the test split of GSM8K (1,319 samples) as implemented in the LM Evaluation Harness (Gao et al., 2023), with default generation parameters (temperature=0, 5 few-shot, pass@1). + +3.3 Forgetting Metrics (source domain evaluation) + +We use the following benchmarks to assess degradation of base model capabilities. HellaSwag (Zellers et al., 2019) includes 70K problems that describe an event with multiple possible continuations. The task is to pick the most plausible continuation, which requires making inferences about nuanced everyday situations. + +WinoGrande (Sakaguchi et al., 2019) also assesses commonsense reasoning. It includes 44K problems with sentences that require ambiguous pronoun resolution. ARC-Challenge (Clark et al., 2018) consists of 7,787 grade-school level, multiple-choice science questions, and tests complex reasoning and understanding of scientific concepts. These benchmarks involve multiple-choice questions that use the predicted logits for calculating accuracy, and do not require specifying further generation hyperparameters.
All forgetting metrics were computed using the MosaicML Gauntlet evaluation harness (Dohmann, 2023). 9 + +# 4 Results + +4.1 Target-domain performance: LoRA at low ranks underperforms full finetuning + +We compare LoRA and full finetuning after performing an exhaustive learning rate sweep for each method, which we found to be crucial (Dettmers et al., 2024). We include learning rate sweep results in Figure S1. We perform a sample-efficiency analysis – i.e., compute the learning metrics as a function of training samples seen – for both LoRA and full finetuning. For IFT, we train separate models for 1, 2, 4, 8, and 16 epochs. For CPT, we vary the number of training tokens (0.25, 0.5, 1, 2, 4, 8, 16, 20 billion), using individual learning rate cooldown schedules. For each condition, we train one full finetuning model and three LoRA models with ranks r = 16, 64, 256, noting that most LoRA papers use a “low” rank of 8-64 (e.g., Dettmers et al. (2024); Zhuo et al. (2024)). The LoRA models target all transformer modules and use α = 2r, as known to be best practice (Raschka, 2023). For further details on experimental setup and hyperparameters, see Appendix Sec. A. + +> 6 https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K +> 7 https://huggingface.co/datasets/meta-math/MetaMathQA +> 8 https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard +> 9 https://github.com/mosaicml/llm-foundry/tree/main/scripts/eval + +4 Published in Transactions on Machine Learning Research (08/2024) + +Figure 1: LoRA performance scales by rank and underperforms full finetuning in code and math. + +(A) StarCoder-Python, (B) Magicoder-Evol-Instruct-110K, (C) OpenWebMath, (D) MetaMathQA. In (A) and (B) y-axis: HumanEval pass@1. In (C) and (D) y-axis: GSM8K strict match. In all panels, “base model” indicates Llama-2-7B without instruction finetuning. Note that 16 epochs are ≈1.16B and ≈1.6B tokens, for Magicoder-Evol-Instruct-110K and MetaMathQA, respectively.
The results appear in Fig. 1. We first note that for both programming and math, IFT improves evaluation scores much more than CPT, which is expected because the samples in each IFT dataset are more similar to the evaluation problems (e.g., for code, IFT achieves maximum HumanEval of 0.497 vs. 0.263 for CPT). For Code CPT (Fig. 1A and Table S1), we identify a substantial gap between full finetuning and LoRA that grows with more data. The best LoRA model, with rank r = 256 , peaks at 20B tokens with HumanEval=0.224, roughly matching full finetuning with 4B tokens (HumanEval=0.218). Full finetuning reaches its peak HumanEval of 0.263 at 20B tokens. A clear ordering by rank emerges after the initial 1B CPT tokens. 5Published in Transactions on Machine Learning Research (08/2024) For Code IFT (Fig. 1B and Table S5), HumanEval accuracy is clearly ordered by rank from the very first epoch. The more common r = 16 and r = 64 LoRA configurations have lower accuracy than full finetuning, with HumanEval scores of 0.358 and 0.417 at epoch 4, respectively. With a high LoRA rank ( r = 256 ), full finetuning performance can be matched (LoRA=0.498 in epoch 4, full finetuning=0.497 in epoch 8). In Appendix Sec. F we perform a more sensitive HumanEval analysis, calculating pass@k as a function of + +k = 1 , . . . , 256 with a higher temperature of 0.8 for full finetuning and the LoRA models (at epoch 4). This analysis shows that full finetuning is superior to r = 256 for k < 64 , after which the two are equal. + +Math CPT (Fig. 1C and Table S3) results closely echo those of code CPT. Consistent patterns in GSM8K emerge at 4B tokens. Full finetuning opens a gap in GSM8K which widens with more data. Similarly, LoRA performance is ordered by rank. The best LoRA ( r = 256 ) peaks at 16B tokens (GSM8K=0.203), underperforming full finetuning at 4B tokens (GSM8K=0.224) and at its peak at 20B tokens (GSM8K=0.293). LoRA closes much of the gap with full finetuning in the Math IFT (Fig. 
1D and Table S7) dataset, while remaining less sample efficient. Both methods substantially improve upon the base model; LoRA (r = 256) peaks at 8 epochs (GSM8K=0.634) while full finetuning achieves GSM8K=0.641 at 2 epochs and peaks at 4 epochs, with GSM8K=0.642. 10 Unlike the code IFT dataset, r = 64 suffices to approach full finetuning and achieve GSM8K=0.624 at epoch 4. We suggest that lower ranks are effective here because English mathematics problems involve a smaller domain shift from the pretraining data as compared to coding ones. In summary, in CPT, LoRA underperforms full finetuning across all configurations. In IFT, and especially in code, high LoRA ranks are required to close the gap with full finetuning. + +4.2 LoRA forgets less than full finetuning + +Here, we investigate the extent of forgetting (defined in Sec. 3.3) as a function of training data in Fig. 2. Overall, we observe that (1) IFT induces more forgetting than CPT, (2) programming induces more forgetting than math, and (3) forgetting tends to worsen with training duration. Most importantly, LoRA forgets less than full finetuning, and the extent of forgetting is controlled by rank. In code – for both CPT and IFT – full finetuning forgets substantially more than any LoRA configuration. In code CPT (Table S2), at 20B tokens, full finetuning scores 0.545 versus 0.617 by LoRA r = 256 . In code IFT (Table S6), full finetuning scores 0.414 versus 0.509 by LoRA r = 64 . In math – for both CPT and IFT – LoRA with r = 256 + +forgets nearly as much as full finetuning. In CPT (Table S4), LoRA scores 0.616 (20B tokens) versus 0.613 of full finetuning (16B tokens). In IFT (Table S8), LoRA and full finetuning respectively degrade to 0.567 and 0.559 at epoch 16. We note that the least forgetting occurs for the OpenWebMath dataset, which is dominated by English sentences (see Sec. 3.1 for details). 
+ +4.3 The Learning-Forgetting Tradeoff + +It is trivial that models that change less when finetuned to a new target domain will forget less of the source domain. The nontrivial question is: do LoRA and full finetuning differ in how they trade off learning and forgetting? Can LoRA achieve similar target domain performance but with diminished forgetting? We form learning-forgetting Pareto curves by plotting the forgetting metric versus the learning metric for each training duration (Fig. 3). As models train on more data, they learn more and forget more, traveling up and left in this space. As we increase LoRA ranks, we find that the curves shift up and left as well, again, learning more and forgetting more, doing so more consistently in IFT than CPT. Each dataset presents a unique tradeoff pattern which makes it difficult to conclude whether LoRA and full finetuning offer fundamentally different learning-forgetting tradeoffs. We will review each dataset next. + +> 10 We note that the original MetaMath paper reports a maximum accuracy of 0.665 when (fully) finetuning Llama-2-7B on the MetaMathQA dataset. We attribute this to small differences in hyperparameters; they trained on 3 epochs with a batch size of 128 using the AdamW optimizer, a learning rate of 2e-5, and a learning rate warmup of 3%. + +6Published in Transactions on Machine Learning Research (08/2024) + +Figure 2: LoRA forgets less than full finetuning. In all panels, the y-axis shows the average of HellaSwag, ARC-Challenge and WinoGrande for Llama-2-7B trained on: (A) StarCoder-Python (B) Magicoder-Evol-Instruct-110k (C) OpenWebMath (D) MetaMathQA. For Code CPT, though the full finetuning curve reaches much higher values of HumanEval, it appears to forget more for any given HumanEval value, which LoRA can reach if trained on more tokens. 
This pattern does not hold for math CPT, where LoRA and full finetuning curves are roughly overlapping until full finetuning shoots up (in 4B tokens) to achieve much higher GSM8K scores without increased forgetting. In code IFT, LoRA r = 256 offers comparable HumanEval accuracy while strictly forgetting less. Lower ranks do not reach high values on HumanEval to compare to full finetuning. In math IFT, LoRA and full finetuning seem to lie on adjacent learning-forgetting tradeoff curves, with full finetuning offering preferable tradeoffs. With the caveats mentioned above, it seems that LoRA can offer preferable learning-forgetting tradeoffs for code, while full finetuning can offer preferable tradeoffs for math. Moreover the choice of LoRA rank can serve as a knob to navigate the learning-forgetting tradeoffs. 7Published in Transactions on Machine Learning Research (08/2024) + +Figure 3: LoRA vs. full finetuning tradeoff for Llama-2-7B. Relative to full finetuning, LoRA learns less (lower values on the y-axis) and forgets less (higher values on the x-axis). Each dot is a separate model, with marker size corresponding to training duration (from 0.25-20 billion tokens for CPT, and 1-16 epochs for IFT). Same data as Figures 1, 2. + +4.4 For the Tülu-v2-mix dataset, LoRA is on par with full finetuning + +So far, we analyzed how LoRA and full finetuning specialize in very specific domains. Often, code or math problems appear as part of larger IFT data mixtures that include multi-turn conversations and a variety of other NLP tasks, such as summarization, etc. (e.g. Wei et al. (2021)). We therefore finetuned LoRA and full finetuning models on one such popular dataset, the Tülu-v2-mix (Ivison et al., 2023). The results are presented in the Appendix (Sec. C and Table S9). 
In summary, we find that both LoRA and full finetuning meaningfully improve upon the base model, and that LoRA, even with lower ranks, can match full finetuning in chat quality as measured by Multi-Turn Benchmark (MT-bench (Zheng et al., 2024)), GSM8K (Cobbe et al., 2021), and Massive Multitask Language Understanding (MMLU; Hendrycks et al. (2020)). At longer training durations (6 epochs), LoRA also forgets less. 8Published in Transactions on Machine Learning Research (08/2024) + +Figure 4: LoRA forgets less than attention dropout and weight decay. Results from Llama-2-7B finetuned on Magicoder-Evol-Instruct-110K. Left panel: learning as measured by accuracy on HumanEval. Right panel: forgetting as measured by the average of HellaSwag, ARC-Challenge and WinoGrande scores. The solid slateblue line shows that LoRA (r=256) learns as much as full finetuning, weight decay, and attention dropout, while forgetting much less. + +4.5 How strongly does LoRA constrain the finetuning process? + +In this section, we analyze Llama-2-7B models trained on the Magicoder-Evol-Instruct-110K dataset. We first compare the learning-forgetting tradeoffs between LoRA and classic regularization techniques, and then analyze the diversity of the generated text. + +LoRA forgets less than attention dropout and weight decay We compare LoRA ( r = 16 , 256 ,training all modules) to weight decay (Goodfellow et al., 2016) with values 5e−5, 1e−4 and attention dropout (Srivastava et al., 2014) with values 0.05 , 0.1. Both regularization techniques appear to learn and forget as much as full finetuning, except that weight decay starts to generally deteriorate at longer training durations (epochs 8 and 16). LoRA, with the common r = 16 , learns less and forgets less than all other models. LoRA + +r = 256 , on the other hand, learns as much as the other methods while forgetting less. + +LoRA helps maintain diversity of token generations. 
We scrutinize the generated solution strings for HumanEval problems. We calculate the unique number of output strings out of 50 generations (for base model, full finetuning, and LoRA) serving as a coarse proxy for predictive diversity. In Figure 5 we separately show the results for correct and incorrect answers. As in the reinforcement learning from human feedback literature (Du et al., 2024; Sun et al., 2023), we find that full finetuning results in fewer unique generations (“distribution collapse”) compared to the base model, for both pass and fail generations, with LoRA in between the two. The above works also suggest that LoRA could even substitute a common Kullback-Leibler divergence term that keeps the probabilities of the generated text similar between the finetuned and base model. We reiterate that exact string matching between generations is not a sensitive metric of predictive diversity, as generations can slightly vary in format and remain functionally identical. + +4.6 Full finetuning on code and math does not learn low-rank perturbations + +In this section, we seek to study whether we should expect low-rank training to be a good approximation to full finetuning, and if so, what is the necessary rank. Recall that full finetuning can be written as + +Wfinetuned = Wpretrained + ∆ ; here we compute the Singular Value Decomposition of all three terms in the equation. We focus on continued pretraining for code, where there are drastic differences between LoRA and full finetuning. We analyze checkpoints obtained at 0.25, 0.5, 1, 2, 4, 8, 16, and 20 billion training tokens. 9Published in Transactions on Machine Learning Research (08/2024) + +Figure 5: LoRA maintains output token diversity relative to full finetuning. + +First, in Figure S7 we present results for the Wq projection at layer 26 of Llama-2-7B (with dimensions + +d × d, d = 4096 ). 
We show that the spectrum of the finetuned weight matrix is very similar to that of the base weight matrix, both decaying slowly and requiring ≈ 50% of singular vectors (≈ 2000/4096) to explain 90% of the variance in the weight matrix. Critically, the difference ∆ also has a similar spectrum to the finetuned and base weight matrices (up to a multiplicative scaling). These results are in line with the analysis in Zeng & Lee (2024) showing that any transformer model can be well approximated with r = d/2. Additionally, we suggest that there is nothing extraordinary about the full finetuning spectra; similar spectra can be achieved by adding low-magnitude Gaussian i.i.d. noise to a weight matrix (Fig. S8). Next, we ask when during training does the perturbation become high rank, and whether it meaningfully varies between module types and layers. We estimate the rank needed to explain 90% of the variance in the matrix. The results appear in Figure 6. We find that: (1) The earliest checkpoint at 0.25B CPT tokens exhibits ∆ matrices with a rank that is 10–100× larger than typical LoRA ranks; (2) the rank of ∆ increases when trained on more data; (3) MLP modules have higher ranks compared to attention modules; (4) first and last layers seem to be lower rank compared to middle layers. + +4.7 Hyperparameter sensitivity analyses for LoRA + +Our goal in this work was to optimally configure LoRA so that it has the best chances of matching full finetuning. This is nontrivial, as LoRA has a large number of hyperparameters to choose from: target modules, rank, scaling factors, and learning rates. We turn to analyze the importance of each, and provide some practical recommendations. First, we found that the choice α = 2 r is crucial for high ranks. Most common packages, e.g. HuggingFace’s + +peft ,11 scale the LoRA matrices by α/r , effectively scaling down higher ranks (see also Kalajdzievski (2023)). 
One might think that high learning rate values may compensate for fixed low α’s, but doing so creates instabilities and often leads to inferior performance. To show this, we performed a joint hyperparameter + +> 11 https://huggingface.co/docs/peft/en/index + +10 Published in Transactions on Machine Learning Research (08/2024) + +Figure 6: Dynamics of rank for Llama-2-7B trained on the Starcoder (CPT) data. In each panel, the x-axis denotes layer number and the y-axis denotes rank needed to explain at least 90% of the variance (maximal dimensionality is 4096). Colors denote CPT tokens, with lighter colors trained for longer. + +Figure 7: Targeting MLP or All modules is superior to training Attention modules alone . All Llama-2-7B checkpoints were trained on Magicoder for 1, 2 and 4 epochs with rank 16 (left), 64 (center) and 256 (right). sweep over α and learning rate for the Magicoder dataset training a r = 256 LoRA for 4 epochs (Fig. S3). We find that α = 512 does much better than 256 or 32 across all learning rates. Next, to assess the relative contribution of target modules and rank, we trained Llama-2-7B models on 4 epochs of the Magicoder dataset, sweeping over target modules (“Attention”, “MLP”, and “All”, their union), ranks ( r = 16 , 64 , 256 ), setting α = 2 r. Fig. 7 shows that HumanEval performance increases with rank, and 11 Published in Transactions on Machine Learning Research (08/2024) that targeting just “Attention” underperforms both “MLP” and “All”, where in the latter, most gains are interestingly driven by the “MLP” modules. This is potential evidence that the MLP blocks are the primary loci for continual learning in LoRA, at least in our datasets. For IFT, we find that LoRA is more sensitive to learning rates compared to full finetuning, and benefits from the highest learning rate that enables stable training for the chosen training duration (see Appendix Sec. B and Fig. S1). 
LoRA’s best learning rates should be set one order of magnitude higher than that of full finetuning, often ranging between 5e−5 and 5e−4 for these combinations of model architecture and dataset. In Appendix Sec. I, we benchmark throughput and peak GPU memory of different LoRA configurations, showing that for standard implementations and a fixed batch size, LoRA tends to train slower than full finetuning. To conclude, based on our main results and hyperparameter sweeps, we recommend: (a) using LoRA for instruction finetuning and not continued pretraining; (b) if GPU memory allows, targeting “All” transformer modules with a rank of 256 , since ranks 16 − 64 tend not to suffice for code tasks; (c) using α = 2 r, and (d) sweeping over learning rates between [1 e − 5, 5e − 4] , picking the highest value that enables stable training. + +# 5 Related Work + +Extensions to LoRA LoRA has inspired many variants and extensions. One group of methods improves training with LoRA by focusing on initialization or scaling (Meng et al., 2024; Hayou et al., 2024; Li et al., 2023b; Kalajdzievski, 2023; Nikdan et al., 2024), sequential training procedures (Xia et al., 2024), or architectural modifications (Shi et al., 2024). Other works propose alternative low-rank approximations altogether (Liu et al., 2024; Zhao et al., 2024a; Jiang et al., 2024a; Kopiczko et al., 2023). In this study we chose to analyze the classic LoRA setup; while many of these proposed variations of LoRA seem promising, we leave a rigorous comparison of these techniques to future work. + +Benchmarking LoRA vs. Full Finetuning The original LoRA paper Hu et al. (2021) reported that LoRA matched full finetuning performance for RoBERTa (Liu et al., 2019) on GLUE (Wang et al., 2018), GPT-2 on E2E NLG Challenge (Novikova et al., 2017), and GPT-3 on WikiSQL (Zhong et al., 2017), MNLI (Williams et al., 2017), and SAMSum (Gliwa et al., 2019). 
Many subsequent studies follow this template and report encoder model performance on tasks in GLUE such as SST-2 (Socher et al., 2013) and MNLI (Williams et al., 2017). Models such as RoBERTa are less than 340M parameters, however, and classification tasks such as MNLI are quite trivial for modern billion-parameter LLMs such as Llama-2-7B. Despite LoRA’s popularity, only a few studies have rigorously compared LoRA to full finetuning in this setting and with challenging domains such as code and math. Dettmers et al. (2024) for example found that QLoRA matched full finetuning MMLU (Hendrycks et al., 2020) performance when finetuning Llama-1 7B, 13B, 33B and 65B on the Alpaca (Taori et al., 2023) and FLAN (Chung et al., 2024) datasets. Ivison et al. (2023) on the other hand found that QLoRA did not perform as well as full finetuning for Llama-2-7B, 13B and 70B models trained on the Tülu-v2-mix dataset when evaluated across MMLU, GSM8K, AlpacaEval (which uses LLM-as-a-judge; Dubois et al., 2024) and HumanEval. One recent notable study is Astraios, which found that LoRA at rank r = 8 performed worse than full finetuning on 8 datasets and across 4 model sizes (up to 16 billion parameters), on 5 representative code tasks (Zhuo et al., 2024). Our study corroborates these results and shows that with higher ranks and proper hyperparameter choices, LoRA can perform much better. The conclusions have also been mixed with regard to the practical details surrounding LoRA target modules and rank: Raschka (2023) and Dettmers et al. (2024) show that optimized LoRA configurations perform as well as full finetuning, and that performance is governed by choice of target modules but not rank. 12 However, in that work, the scalar α was not modified with rank, and we found that increasing it to 2r was necessary to unlock improvements by rank. In contrast, Liu et al. (2024) shows that LoRA is sensitive to ranks. 
It is likely that some of these discrepancies are due to differences in finetuning datasets and evaluations. + +> 12 see also Zhang et al. (2024), who report some cases where performance does improve with rank. + +12 Published in Transactions on Machine Learning Research (08/2024) + +Continual learning on code and math. A growing body of work investigates ways of specializing LLMs for code and math. In code, models such as StarCoder (Li et al., 2023a; Lozhkov et al., 2024), DeepSeek Coder (Guo et al., 2024), and SantaCoder (Allal et al., 2023) were pretrained from scratch on large-scale code datasets. Alternatively, some works start with a generic pretrained base model, and combine continued pretraining on large code datasets followed by IFT on code problems (usually with full finetuning), e.g., Codex (Chen et al., 2021), Code-Qwen (Bai et al., 2023), CodeLlama (Roziere et al., 2023). Some perform only IFT on top of a base model, like MagiCoder (Wei et al., 2023), or WizardCoder (Luo et al., 2023b). Other models such as OctoCoder (Muennighoff et al., 2023) perform IFT with LoRA. Similarly, much recent work aims to improve mathematical capabilities. Models like DeepSeek Math (Shao et al., 2024) perform continued pretraining on top of a base model, while other methods focus on finetuning by generating high-quality synthetic math problems, scaling to millions of examples. Luo et al. (2023a) takes the Evol-Instruct approach to data generation (akin to the Magicoder dataset; Sec. 3.1) which it then uses to train reward models for instruction quality and solution correctness, which are in turn used for LLM finetuning. Other work develops Monte Carlo Tree Search methods to automatically supervise the intermediate reasoning steps while solving math problems (Luo et al., 2024), and Yue et al. (2024) generates questions and answers from the pretraining web corpus. Toshniwal et al. 
(2024) uses an LLM to synthesize Code-Interpreter-style solutions to the GSM8K and MATH benchmarks; the proposed solutions can be verified against the official solutions. Singh et al. (2023) iterate over this procedure multiple times (“Self-training”) using an expectation-maximization approach. All reviewed methods meaningfully improve math capabilities. + +Learning-Forgetting tradeoffs Vu et al. (2022) shows that prompt tuning (Lester et al., 2021), another parameter-efficient finetuning method, can aid in mitigating forgetting for cross-lingual summarization tasks (using multilingual variants of the T5 model). With large Llama-style LLMs, it has been reported that code-finetuned LLMs lose some of their capabilities in language understanding and commonsense reasoning (Li et al., 2023a; Roziere et al., 2023; Wei et al., 2023). A common approach to mitigate forgetting involves “replaying” source-domain data during continual learning, which can be done by storing the data in a memory buffer, or generating it on the fly (Lesort et al., 2022; Scialom et al., 2022; Sun et al., 2019). + +# 6 Discussion + +Does the difference between LoRA and full finetuning change with model size? Studies in the past have hinted at a relationship between the effectiveness of finetuning and model size (Aghajanyan et al., 2020; Hu et al., 2021; Zhuo et al., 2024). While recent studies have successfully applied LoRA to 70B parameter models (Ivison et al., 2023; Yu et al., 2023; Niederfahrenhorst et al., 2023; Turgutlu, 2024), and previous work shows that techniques like prompt tuning become more effective for larger models (Vu et al., 2022), we leave a rigorous study of these intriguing scaling properties to future work. + +Limitations of the spectral analysis. The observation that full finetuning tends to find high rank solutions does not rule out the possibility of low-rank solutions; rather, it shows that they are not typically found. 
An alternative interpretation is that the rank needed to reconstruct the weight matrix is higher than the rank needed for a downstream task. We also only presented SVD analysis for the continued pretraining setting. It is possible that a similar analysis for the instruction finetuning setting would reveal that the full finetuning does not tend to be as high rank. + +# 7 Conclusion + +This work sheds light on the downstream performance of 7 billion parameter LLMs trained with LoRA and full finetuning. Unlike most prior work, we use domain-specific datasets in code and math, associated with sensitive evaluation metrics. We show that LoRA, with commonly used low-rank settings, underperforms full finetuning across domains. We also show that LoRA keeps the finetuned model’s behavior close to that of the base model, with diminished source-domain forgetting and more diverse generations at inference time. We show that LoRA mitigates forgetting more than classical regularization techniques, and also show that full 13 Published in Transactions on Machine Learning Research (08/2024) finetuning finds weight perturbations that are far from being low-rank. We conclude by analyzing LoRA’s increased sensitivity to hyperparameters and highlighting best practices. + +Acknowledgements + +We would like to thank the editor and the three anonymous reviewers who provided high-quality feedback on this work. We are also grateful to Daniel Han and Damjan Kalajdzievski for carefully reading our work and pointing out the importance of setting α = 2 r for training with high ranks. + +Author Contributions + +D.B. led this project by developing code, running experiments, analyzing results, and writing the manuscript. J.P. ran experiments and assisted in the writing of the manuscript. J.G.O. wrote code and ran experiments. P.G. advised the SVD analysis, C.J. ran experiments, and D.K. wrote code. M.P., S.H., V.C., J.F., C.B., and J.P.C. advised this work. 
14 Published in Transactions on Machine Learning Research (08/2024) + +# References + +Armen Aghajanyan, Luke Zettlemoyer, and Sonal Gupta. Intrinsic dimensionality explains the effectiveness of language model fine-tuning. arXiv preprint arXiv:2012.13255 , 2020. Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, et al. Santacoder: don’t reach for the stars! + +arXiv preprint arXiv:2301.03988 , 2023. Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, et al. Qwen technical report. arXiv preprint arXiv:2309.16609 , 2023. Loubna Ben Allal, Niklas Muennighoff, Logesh Kumar Umapathi, Ben Lipkin, and Leandro von Werra. A framework for the evaluation of code generation models. https://github.com/bigcode-project/ bigcode-evaluation-harness , 2022. Sahil Chaudhary. Code alpaca: An instruction-following llama model for code generation. https://github. com/sahil280114/codealpaca , 2023. Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba. 
Evaluating large language models trained on code, 2021. Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al. Scaling instruction-finetuned language models. Journal of Machine Learning Research , 25(70):1–53, 2024. Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. Think you have solved question answering? try arc, the ai2 reasoning challenge. ArXiv ,abs/1803.05457, 2018. URL https://api.semanticscholar.org/CorpusID:3922816 .Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 , 2021. Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: Efficient finetuning of quantized llms. Advances in Neural Information Processing Systems , 36, 2024. Jeremy Dohmann. Blazingly fast llm evaluation for in-context learning, February 2023. URL https: //www.databricks.com/blog/llm-evaluation-for-icl . Blog post, Mosaic AI Research. Yuqing Du, Alexander Havrilla, Sainbayar Sukhbaatar, Pieter Abbeel, and Roberta Raileanu. A study on improving reasoning in language models. In I Can’t Believe It’s Not Better Workshop: Failure Modes in the Age of Foundation Models , 2024. URL https://openreview.net/forum?id=tCZFmDyPFm .Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 , 2024. Yann Dubois, Chen Xuechen Li, Rohan Taori, Tianyi Zhang, Ishaan Gulrajani, Jimmy Ba, Carlos Guestrin, Percy S Liang, and Tatsunori B Hashimoto. Alpacafarm: A simulation framework for methods that learn from human feedback. Advances in Neural Information Processing Systems , 36, 2024. 
15 Published in Transactions on Machine Learning Research (08/2024) Robert M French. Catastrophic forgetting in connectionist networks. Trends in cognitive sciences , 3(4): 128–135, 1999. Leo Gao, Jonathan Tow, Baber Abbasi, Stella Biderman, Sid Black, Anthony DiPofi, Charles Foster, Laurence Golding, Jeffrey Hsu, Alain Le Noac’h, Haonan Li, Kyle McDonell, Niklas Muennighoff, Chris Ociepa, Jason Phang, Laria Reynolds, Hailey Schoelkopf, Aviya Skowron, Lintang Sutawika, Eric Tang, Anish Thite, Ben Wang, Kevin Wang, and Andy Zou. A framework for few-shot language model evaluation, 12 2023. URL https://zenodo.org/records/10256836 .Sreyan Ghosh, Chandra Kiran Reddy Evuru, Sonal Kumar, Deepali Aneja, Zeyu Jin, Ramani Duraiswami, Di-nesh Manocha, et al. A closer look at the limitations of instruction tuning. arXiv preprint arXiv:2402.05119 ,2024. Bogdan Gliwa, Iwona Mochol, Maciej Biesek, and Aleksander Wawer. Samsum corpus: A human-annotated dialogue dataset for abstractive summarization. arXiv preprint arXiv:1911.12237 , 2019. Ian Goodfellow, Yoshua Bengio, and Aaron Courville. Deep learning . MIT press, 2016. Ian J Goodfellow, Mehdi Mirza, Da Xiao, Aaron Courville, and Yoshua Bengio. An empirical investigation of catastrophic forgetting in gradient-based neural networks. arXiv preprint arXiv:1312.6211 , 2013. Daya Guo, Qihao Zhu, Dejian Yang, Zhenda Xie, Kai Dong, Wentao Zhang, Guanting Chen, Xiao Bi, Y Wu, YK Li, et al. Deepseek-coder: When the large language model meets programming–the rise of code intelligence. arXiv preprint arXiv:2401.14196 , 2024. Soufiane Hayou, Nikhil Ghosh, and Bin Yu. Lora+: Efficient low rank adaptation of large models. arXiv preprint arXiv:2402.12354 , 2024. Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 , 2020. 
Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874 , 2021. Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 ,2021. Hamish Ivison, Yizhong Wang, Valentina Pyatkin, Nathan Lambert, Matthew Peters, Pradeep Dasigi, Joel Jang, David Wadden, Noah A Smith, Iz Beltagy, et al. Camels in a changing climate: Enhancing lm adaptation with tulu 2. arXiv preprint arXiv:2311.10702 , 2023. Ting Jiang, Shaohan Huang, Shengyue Luo, Zihan Zhang, Haizhen Huang, Furu Wei, Weiwei Deng, Feng Sun, Qi Zhang, Deqing Wang, et al. Mora: High-rank updating for parameter-efficient fine-tuning. arXiv preprint arXiv:2405.12130 , 2024a. Weisen Jiang, Han Shi, Longhui Yu, Zhengying Liu, Yu Zhang, Zhenguo Li, and James T. Kwok. Forward-backward reasoning in large language models for mathematical verification, 2024b. Damjan Kalajdzievski. A rank stabilization scaling factor for fine-tuning with lora. arXiv preprint arXiv:2312.03732 , 2023. Damjan Kalajdzievski. Scaling laws for forgetting when fine-tuning large language models. arXiv preprint arXiv:2401.05605 , 2024. Anat Kleiman, Jonathan Frankle, Sham M Kakade, and Mansheej Paul. Predicting task forgetting in large language models, 2023. URL https://openreview.net/pdf?id=0BMg0OgNTP .16 Published in Transactions on Machine Learning Research (08/2024) Dawid Jan Kopiczko, Tijmen Blankevoort, and Yuki Markus Asano. Vera: Vector-based random matrix adaptation. arXiv preprint arXiv:2310.11454 , 2023. Timothée Lesort, Oleksiy Ostapenko, Diganta Misra, Md Rifat Arefin, Pau Rodríguez, Laurent Char-lin, and Irina Rish. Challenging common assumptions about catastrophic forgetting. arXiv preprint arXiv:2207.04543 , 2022. 
Brian Lester, Rami Al-Rfou, and Noah Constant. The power of scale for parameter-efficient prompt tuning. + +arXiv preprint arXiv:2104.08691 , 2021. Chunyuan Li, Heerad Farkhoor, Rosanne Liu, and Jason Yosinski. Measuring the intrinsic dimension of objective landscapes. arXiv preprint arXiv:1804.08838 , 2018. Raymond Li, Loubna Ben Allal, Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al. Starcoder: may the source be with you! arXiv preprint arXiv:2305.06161 , 2023a. Yixiao Li, Yifan Yu, Chen Liang, Pengcheng He, Nikos Karampatziakis, Weizhu Chen, and Tuo Zhao. Loftq: Lora-fine-tuning-aware quantization for large language models. arXiv preprint arXiv:2310.08659 , 2023b. Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, and Min-Hung Chen. Dora: Weight-decomposed low-rank adaptation. arXiv preprint arXiv:2402.09353 ,2024. Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 , 2019. Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, et al. Starcoder 2 and the stack v2: The next generation. arXiv preprint arXiv:2402.19173 , 2024. Haipeng Luo, Qingfeng Sun, Can Xu, Pu Zhao, Jianguang Lou, Chongyang Tao, Xiubo Geng, Qingwei Lin, Shifeng Chen, and Dongmei Zhang. Wizardmath: Empowering mathematical reasoning for large language models via reinforced evol-instruct, 2023a. URL https://arxiv.org/abs/2308.09583 .Liangchen Luo, Yinxiao Liu, Rosanne Liu, Samrat Phatale, Harsh Lara, Yunxuan Li, Lei Shu, Yun Zhu, Lei Meng, Jiao Sun, and Abhinav Rastogi. Improve mathematical reasoning in language models by automated process supervision, 2024. 
URL https://arxiv.org/abs/2406.06592 .Ziyang Luo, Can Xu, Pu Zhao, Qingfeng Sun, Xiubo Geng, Wenxiang Hu, Chongyang Tao, Jing Ma, Qingwei Lin, and Daxin Jiang. Wizardcoder: Empowering code large language models with evol-instruct. arXiv preprint arXiv:2306.08568 , 2023b. Michael McCloskey and Neal J Cohen. Catastrophic interference in connectionist networks: The sequential learning problem. In Psychology of learning and motivation , volume 24, pp. 109–165. Elsevier, 1989. Fanxu Meng, Zhaohui Wang, and Muhan Zhang. Pissa: Principal singular values and singular vectors adaptation of large language models. arXiv preprint arXiv:2404.02948 , 2024. Niklas Muennighoff, Qian Liu, Armel Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro Von Werra, and Shayne Longpre. Octopack: Instruction tuning code large language models. arXiv preprint arXiv:2308.07124 , 2023. Artur Niederfahrenhorst, Kourosh Hakhamaneshi, and Rehaan Ahmad. Fine-tuning llms: Lora or full-parameter? an in-depth analysis with llama 2, September 2023. URL https://www.anyscale.com/blog/ fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 . Blog post. Mahdi Nikdan, Soroush Tabesh, and Dan Alistarh. Rosa: Accurate parameter-efficient fine-tuning via robust adaptation. arXiv preprint arXiv:2401.04679 , 2024. 17 Published in Transactions on Machine Learning Research (08/2024) Jekaterina Novikova, Ondřej Dušek, and Verena Rieser. The e2e dataset: New challenges for end-to-end generation. arXiv preprint arXiv:1706.09254 , 2017. Keiran Paster, Marco Dos Santos, Zhangir Azerbayev, and Jimmy Ba. Openwebmath: An open dataset of high-quality mathematical web text. arXiv preprint arXiv:2310.06786 , 2023. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis , pp. 1–16. IEEE, 2020. 
Sebastian Raschka. Practical tips for finetuning llms using lora (low-rank adaptation), 2023. URL https: //magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms .Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, et al. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 , 2023. Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. Winogrande: An adversarial winograd schema challenge at scale, 2019. Thomas Scialom, Tuhin Chakrabarty, and Smaranda Muresan. Fine-tuned language models are continual learners. arXiv preprint arXiv:2205.12393 , 2022. Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. Deepseekmath: Pushing the limits of mathematical reasoning in open language models, 2024. URL https://arxiv.org/abs/2402.03300 .Shuhua Shi, Shaohan Huang, Minghui Song, Zhoujun Li, Zihan Zhang, Haizhen Huang, Furu Wei, Weiwei Deng, Feng Sun, and Qi Zhang. Reslora: Identity residual mapping in low-rank adaption. arXiv preprint arXiv:2402.18039 , 2024. Avi Singh, John D Co-Reyes, Rishabh Agarwal, Ankesh Anand, Piyush Patil, Peter J Liu, James Harrison, Jaehoon Lee, Kelvin Xu, Aaron Parisi, et al. Beyond human data: Scaling self-training for problem-solving with language models. arXiv preprint arXiv:2312.06585 , 2023. Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher D Manning, Andrew Y Ng, and Christopher Potts. Recursive deep models for semantic compositionality over a sentiment treebank. In + +Proceedings of the 2013 conference on empirical methods in natural language processing , pp. 1631–1642, 2013. Nitish Srivastava, Geoffrey Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research , 15(1): 1929–1958, 2014. 
Fan-Keng Sun, Cheng-Hao Ho, and Hung-Yi Lee. Lamol: Language modeling for lifelong language learning. + +arXiv preprint arXiv:1909.03329 , 2019. Simeng Sun, Dhawal Gupta, and Mohit Iyyer. Exploring the impact of low-rank adaptation on the performance, efficiency, and regularization of rlhf, 2023. Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B Hashimoto. Stanford alpaca: An instruction-following llama model, 2023. Shubham Toshniwal, Ivan Moshkov, Sean Narenthiran, Daria Gitman, Fei Jia, and Igor Gitman. Openmathinstruct-1: A 1.8 million math instruction tuning dataset, 2024. URL https://arxiv.org/abs/ 2402.10176 .Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 , 2023. 18 Published in Transactions on Machine Learning Research (08/2024) Kerem Turgutlu. Efficient finetuning of llama 3 with fsdp qdora, April 2024. URL https://www.answer.ai/ posts/2024-04-26-fsdp-qdora-llama3.html . Blog post. Tu Vu, Aditya Barua, Brian Lester, Daniel Cer, Mohit Iyyer, and Noah Constant. Overcoming catastrophic forgetting in zero-shot cross-lingual generation. arXiv preprint arXiv:2205.12647 , 2022. Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. Glue: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 ,2018. Liyuan Wang, Xingxing Zhang, Hang Su, and Jun Zhu. A comprehensive survey of continual learning: Theory, method and application, 2024. Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 ,2021. Yuxiang Wei, Zhe Wang, Jiawei Liu, Yifeng Ding, and Lingming Zhang. 
Magicoder: Source code is all you need. arXiv preprint arXiv:2312.02120 , 2023. Yixuan Weng, Minjun Zhu, Fei Xia, Bin Li, Shizhu He, Shengping Liu, Bin Sun, Kang Liu, and Jun Zhao. Large language models are better reasoners with self-verification. arXiv preprint arXiv:2212.09561 , 2022. Adina Williams, Nikita Nangia, and Samuel R Bowman. A broad-coverage challenge corpus for sentence understanding through inference. arXiv preprint arXiv:1704.05426 , 2017. Wenhan Xia, Chengwei Qin, and Elad Hazan. Chain of lora: Efficient fine-tuning of language models via residual learning. arXiv preprint arXiv:2401.04151 , 2024. Longhui Yu, Weisen Jiang, Han Shi, Jincheng Yu, Zhengying Liu, Yu Zhang, James T Kwok, Zhenguo Li, Adrian Weller, and Weiyang Liu. Metamath: Bootstrap your own mathematical questions for large language models. arXiv preprint arXiv:2309.12284 , 2023. Xiang Yue, Tuney Zheng, Ge Zhang, and Wenhu Chen. Mammoth2: Scaling instructions from the web, 2024. URL https://arxiv.org/abs/2405.03548 .Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. Hellaswag: Can a machine really finish your sentence?, 2019. Yuchen Zeng and Kangwook Lee. The expressive power of low-rank adaptation, 2024. URL https: //arxiv.org/abs/2310.17513 .Biao Zhang, Zhongtao Liu, Colin Cherry, and Orhan Firat. When scaling meets llm finetuning: The effect of data, model and finetuning method. arXiv preprint arXiv:2402.17193 , 2024. Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang Wang, Anima Anandkumar, and Yuandong Tian. Galore: Memory-efficient llm training by gradient low-rank projection. arXiv preprint arXiv:2403.03507 , 2024a. Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. arXiv preprint arXiv:2405.00732 , 2024b. 
Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. Judging llm-as-a-judge with mt-bench and chatbot arena. + +Advances in Neural Information Processing Systems , 36, 2024. Victor Zhong, Caiming Xiong, and Richard Socher. Seq2sql: Generating structured queries from natural language using reinforcement learning. arXiv preprint arXiv:1709.00103 , 2017. Terry Yue Zhuo, Armel Zebaze, Nitchakarn Suppattarachai, Leandro von Werra, Harm de Vries, Qian Liu, and Niklas Muennighoff. Astraios: Parameter-efficient instruction tuning code large language models. + +arXiv preprint arXiv:2401.00788 , 2024. 19 Published in Transactions on Machine Learning Research (08/2024) + +# Appendix A Experimental Setup + +LoRA configuration for all experiments. All experiments were done with the Databricks MosaicML composer, streaming and llm-foundry libraries in conjunction with the HuggingFace peft library on 32×H100-80GB GPUs. All experiments in the main text used the LionW optimizer (Chen et al., 2021) instead of the AdamW optimizer. We targeted all trainable modules inside each of the L Llama transformer blocks: + +$\{W_q^{(l)}, W_k^{(l)}, W_v^{(l)}, W_o^{(l)}, W_{\text{gate}}^{(l)}, W_{\text{up}}^{(l)}, W_{\text{down}}^{(l)}\}_{l=1}^{L}$. + +We used ranks of r = 16, 64, 256 and set α = 2r, to achieve a constant scaling factor $\gamma_r = 2$ across ranks. We use lora_dropout=0.05. For both the Code CPT and Math CPT settings, we train the model once for 20B tokens. We then perform individual cooldowns using intermediate checkpoints as follows: We set a target max training duration (e.g. 8 billion tokens), and define the last 20% of max training duration as the cooldown period. We then retrain from the latest available checkpoint prior to the cooldown period. In all four scenarios below, we use the Llama-2-7B base model meta-llama/Llama-2-7b-hf 13 . 
For the CPT runs, we use the meta-llama/Llama-2-7b-hf tokenizer, while for the IFT runs we use the + +meta-llama/Llama-2-7b-chat-hf tokenizer. 14 + +Code CPT Llama-2-7B trained on the StarCoder-Python dataset. + +• seq_len : 4096 + +• optimizer : decoupled_lionw (betas=[0.9, 0.95]) + +• learning_rate : 1.0e-05 for LoRA and Full Finetuning + +• scheduler : inv_sqrt_with_warmup (t_scale=1000ba, t_warmup=1000ba, t_cooldown=5086ba, alpha_f_decay=1, alpha_f_cooldown=0). We note that this ends up looking very much like a trapezoidal schedule. + +• weight_decay : 1.0e-06 + +• precision : amp_bf16 + +• global_train_batch_size : 192 + +• device_train_microbatch_size : 6 + +• gradient_clipping : norm (threshold=1) + +• num_gpus : 32 + +Math CPT . Llama-2-7B trained on the OpenWebMath dataset. + +• max_seq_len : 4096 + +• optimizer : decoupled_lionw (betas=[0.9, 0.95]) + +• learning_rate : 1.0e-05 for full finetuning, 4.0e-05 for LoRA + +• scheduler : inv_sqrt_with_warmup (t_scale=1000ba, t_warmup=1000ba, t_cooldown=5086ba, alpha_f_decay=1, alpha_f_cooldown=0). We note that this ends up looking very much like a trapezoidal schedule. 
+ +• weight_decay : 0 + +• precision : amp_bf16 + +• global_train_batch_size : 192 + +• device_train_microbatch_size : 6 + +• gradient_clipping : norm (threshold=1) + +• num_gpus : 32 + +Code IFT: Finetuning Llama-2-7B on the Magicoder-Evol-Instruct-110K dataset + +> 13 https://huggingface.co/meta-llama/Llama-2-7b-hf +> 14 https://huggingface.co/meta-llama/Llama-2-7b-chat-hf + +20 Published in Transactions on Machine Learning Research (08/2024) + +• max_seq_len : 4096 + +• optimizer : decoupled_lionw (betas=[0.9, 0.95]) + +• learning_rate : 2e-4 for rank r = 16 , 64 and 1e-4 for r = 256 α = 2 r = 512 (due to instabilities/loss spikes at 2e-4) + +• scheduler : cosine_with_warmup (alpha_f=0.01, t_warmup=0.1dur) + +• weight_decay : 0 + +• precision : amp_bf16 + +• global_train_batch_size : 192 + +• device_train_microbatch_size : 6 + +• gradient_clipping : norm (threshold=1) + +• num_gpus : 32 + +Math IFT: Finetuning Llama-2-7B on the MetaMathQA dataset + +• seq_len : 1024 + +• optimizer : decoupled_lionw (betas=[0.9, 0.95]) + +• learning_rate : Full finetuning: 1e-5, LoRA: 1e-4 for r = 16 , 64 , 5e-5 for r = 256 due to instabilities. + +• scheduler : cosine_with_warmup (alpha_f=0.01, t_warmup=0.1dur) + +• weight_decay : 0 + +• precision : amp_bf16 + +• global_train_batch_size : 768 + +• device_train_microbatch_size : 24 + +• gradient_clipping : norm (threshold=1) + +• num_gpus : 32 + +A.1 Training the input and output embedding layers. + +Vanilla LoRA and other popular methods such as QLoRA (Dettmers et al., 2024) often do not train the input and output embedding layers. Recent open-source work, 15 on the other hand, shows that it might be beneficial to supplement LoRA with full finetuning of these two modules (additional ≈ 200 M parameters for a 7B model). We view this approach as a hybrid of LoRA and full finetuning, and therefore leave its empirical investigation for future work. 
Moreover, this hybrid approach involves further hyperparameter optimization: the input and output layers require tuning their own separate learning rates, which should typically be 2-10× smaller than the LoRA learning rates (training with a single learning rate results in instabilities). + +# B Learning rate searches + +We perform a learning rate sensitivity analysis for Llama-2-7B, trained for two epochs on the code and math IFT datasets, and followed by HumanEval and GSM8K evaluation, respectively. Fig. S1 shows that LoRA improves monotonically with learning rate up to a value at which training diverges, with best learning rates of 5e−4 for code and 2e−4 for math. On both datasets, these best LoRA learning rates are underperformed by four alternative full finetuning learning rates. The best full finetuning learning rates are 5e−5 and 1e−5, respectively, an order of magnitude smaller than LoRA. For LoRA, we cannot find alternative learning rates that achieve at least 90% of the best learning rate’s performance. For full finetuning, there are two viable alternative learning rates for code and three for math. Note that in these experiments, the LoRA models target all modules but W_gate, with α = 32, which should preferably be higher for r = 64. This explains the slight differences between Figures S1 and S3. + +> 15 https://unsloth.ai/blog/contpretraining see also the following blogpost https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 (Niederfahrenhorst et al., 2023) + +21 Published in Transactions on Machine Learning Research (08/2024) + +Figure S1: LoRA is more sensitive to learning rates compared to full finetuning. Llama-2-7B models (A) trained on Magicoder-Evol-Instruct-110k (Wei et al., 2023) and evaluated on HumanEval, (B) trained on MetaMathQA (Yu et al., 2023) and evaluated on GSM8K. Experiments here are performed with LionW; see Fig. S2 for a comparison to AdamW. 
+ +B.1 Learning rate sensitivity analysis across optimizers + +We compared the AdamW and Decoupled LionW optimizers by training for two epochs of Magicoder-Evol-Instruct-110K using different learning rates. We found that Decoupled LionW performed better on HumanEval for both LoRA and full finetuning, and across learning rates, as seen in Fig. S2. + +Figure S2: Comparing LionW to AdamW across learning rates for two epochs of the Magicoder-Evol-Instruct-110K dataset. Left: HumanEval; Right: Average of “Language Understanding” benchmarks in the MosaicML evaluation gauntlet. Both methods peak at the learning rate used in the original paper (Wei et al., 2023). 22 Published in Transactions on Machine Learning Research (08/2024) + +B.2 The importance of the alpha scaling parameter for LoRA + +We found that the performance of all models was particularly sensitive to the LoRA α hyperparameter. Fig. S3 shows two experiments on two separate datasets (Magicoder-Evol-Instruct-110K and OpenWebMath) for LoRA with rank r = 256. In both cases the best accuracy is achieved when α = 2r. + +> (a) Jointly sweeping over LoRA α and learning rate. The optimal choice is α = 2r (blue). +> (b) Continued pretraining with two different choices of +> α, where α = 2r is best (blue). + +Figure S3: LoRA performance is sensitive to the α hyperparameter. We show that for Code IFT (a) and math CPT (b) an α that is scaled with rank such that α = 2r leads to the highest accuracy. 23 Published in Transactions on Machine Learning Research (08/2024) + +# C Finetuning on the Tülu-v2-mix dataset + +We finetuned Llama-2-7B models on the Tülu-v2-mix dataset (Ivison et al., 2023), which is a finetuning dataset containing chain of thought reasoning, multi-turn assistant conversations, math and science problems, code, and more. 16 There are roughly 326k samples in this dataset. As in all main experiments, we compared full finetuning and LoRA r = 16, 64, 256, targeting all transformer modules. 
For each of the four experimental conditions, we trained a model for up to 6 epochs and evaluated it after 2, 4, and 6 epochs. Different from the main IFT experiments, the checkpoints evaluated are “hot” and are not cooled down for each training duration. As in the original paper (Ivison et al., 2023), we assess math capabilities with GSM8K Cobbe et al. (2021), STEM, humanities, and social science capabilities as the average of 57 subjects of the Massive Multitask Language Understanding (MMLU; Hendrycks et al. (2020)), and conversational capabilities with Multi-Turn Benchmark (MT-bench (Zheng et al., 2024)) which includes 80 multi-turn conversations where the model responses are evaluated automatically by GPT-4. We also compute the same average forgetting score as in all other datasets in this paper. Since datasets like Tülu-v2-mix are where LoRA is mostly used, we ask: can LoRA, even with a low rank, achieve full finetuning accuracy both in specific domains and in general conversational capabilities? + +C.1 Experimental setup + +After an initial learning rate sweep, we chose the following hyperparameters: + +• max_seq_len : 4096 + +• optimizer : decoupled_lionw (betas=[0.9, 0.95]) + +• learning_rate : Full finetuning: 5e-6; LoRA 1e-4 + +• scheduler : cosine_with_warmup (alpha_f=0.01, t_warmup=0.1dur) + +• weight_decay : 0 + +• precision : amp_bf16 + +• global_train_batch_size : 192 + +• device_train_microbatch_size : 6 + +• gradient_clipping : norm (threshold=1) + +• num_gpus : 32 + +C.2 Results + +First, we find that on MT-bench (Fig. S4), both LoRA and full finetuning meaningfully improve upon the base model (2.74), starting from the second epoch and improving only slightly when trained for longer. Crucially, all LoRA models are within one standard error of the mean of the full finetuning model (computed with 160 datapoints = 80 questions × 2 turns). That is, one can achieve full finetuning conversational capabilities with r = 16 . 
The caveat is that only 80 questions appear in this benchmark and that the variance, within model, is high. In GSM8K (Fig. S5a), again, all models significantly improve upon the base model (0.145). Remarkably, even in this specific domain, LoRA and full finetuning are overlapping, with the best model being LoRA + +r = 256 at epoch 4, which is followed by full finetuning at epoch 2. Here too, as in the other math datasets in the paper, there is an ordering by LoRA rank. In MMLU (Fig. S5b), full finetuning and LoRA are overlapping with LoRA r = 64 as the best model (epoch 4), followed by full finetuning at epoch 2. Here there is no ordering by rank. As for forgetting (Fig. S6), we find an overall mild forgetting compared to the rest of the datasets in the paper. At two epochs, full finetuning does better than LoRA. The former starts to degrade at epoch 4. At + +> 16 https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture + +24 Published in Transactions on Machine Learning Research (08/2024) epoch 6, the findings of the main paper are replicated: full finetuning forgets the most and we find a clear ordering of forgetting by rank. Across all evaluations – learning and forgetting – full finetuning is the best model at epoch 2, and only degrades afterwards. LoRA, on the other hand, needs 4 epochs to train, mirroring the findings in the main part of the paper. LoRA r = 16 seems to offer competitive conversational capabilities, and minimal forgetting, but it underperforms in domain-specific knowledge like math. Future work should seek to understand why this is the case. + +Figure S4: Average MT-bench score with GPT-4 as a judge, calculated over 80 questions with two turns each . Base model value as reported in the MT-bench paper. We note that the Tülu paper reports a 6.3 MT-bench value from full finetuning of Llama-2-7B base model, which is only slightly exceeding the standard error from our average score. + +> (a) Accuracy in GSM8K. (b) Average of MMLU benchmarks. 
+ +Figure S5: On Tülu-v2-mix, LoRA and full finetuning both improve upon the base model and perform comparably. + +25 Published in Transactions on Machine Learning Research (08/2024) + +Figure S6: LoRA forgets less even on a more diverse IFT dataset like Tülu-v2-mix. We plot the average forgetting score, same as in all other datasets, as a function of training duration. 26 Published in Transactions on Machine Learning Research (08/2024) + +# D Supplementary tables + +Table S1: Starcoder-Python Results (HumanEval pass@1, temperature 0.2) Num. tokens (billions) 0.25 0.50 1 2 4 8 16 20 Condition LoRA (r=16) 0.143 0.144 0.141 0.141 0.154 0.159 0.162 0.162 LoRA (r=64) 0.142 0.146 0.141 0.153 0.157 0.176 0.194 0.196 LoRA (r=256) 0.144 0.142 0.143 0.159 0.159 0.208 0.211 0.224 Full Finetuning 0.152 0.153 0.172 0.181 0.218 0.258 0.255 0.263 Table S2: Starcoder-Python Results (Forgetting Average) Num. tokens (billions) 0.25 0.50 1 2 4 8 16 20 Condition LoRA (r=16) 0.645 0.642 0.645 0.642 0.644 0.640 0.638 0.635 LoRA (r=64) 0.646 0.644 0.646 0.646 0.639 0.634 0.626 0.626 LoRA (r=256) 0.644 0.645 0.643 0.639 0.636 0.630 0.618 0.617 Full Finetuning 0.625 0.624 0.625 0.616 0.599 0.583 0.551 0.545 Table S3: OpenWebMath Results (GSM8K) Num. tokens (billions) 0.25 0.50 1 2 4 8 16 20 Condition LoRA (r=16) 0.162 0.157 0.161 0.155 0.165 0.156 0.152 0.158 LoRA (r=64) 0.163 0.167 0.150 0.166 0.164 0.168 0.179 0.163 LoRA (r=256) 0.162 0.161 0.140 0.170 0.193 0.196 0.203 0.202 Full Finetuning 0.155 0.152 0.165 0.158 0.224 0.238 0.283 0.293 27 Published in Transactions on Machine Learning Research (08/2024) Table S4: OpenWebMath Results (Forgetting Average) Num. 
tokens (billions) 0.25 0.50 1 2 4 8 16 20 Condition LoRA (r=16) 0.640 0.641 0.646 0.641 0.643 0.641 0.636 0.637 LoRA (r=64) 0.640 0.640 0.638 0.637 0.643 0.634 0.634 0.627 LoRA (r=256) 0.638 0.638 0.637 0.634 0.633 0.620 0.620 0.616 Full Finetuning 0.634 0.634 0.640 0.630 0.629 0.619 0.613 0.618 Table S5: Magicoder-Evol-Instruct-110K Results (HumanEval pass@1) Epoch 1 2 4 8 16 Condition LoRA (r=16) 0.197 0.275 0.358 0.338 0.324 LoRA (r=64) 0.249 0.339 0.417 0.392 0.405 LoRA (r=256) 0.299 0.385 0.498 0.437 0.466 Full Finetuning 0.302 0.464 0.470 0.497 0.416 Table S6: Magicoder-Evol-Instruct-110K Results (Forgetting Average) Epoch 1 2 4 8 16 Condition LoRA (r=16) 0.653 0.648 0.652 0.646 0.609 LoRA (r=64) 0.652 0.651 0.632 0.580 0.510 LoRA (r=256) 0.655 0.659 0.631 0.552 0.517 Full Finetuning 0.595 0.579 0.512 0.446 0.414 Table S7: MetaMathQA Results (GSM8K) Epoch 1 2 4 8 16 Condition LoRA (r=16) 0.447 0.528 0.580 0.578 0.569 LoRA (r=64) 0.527 0.588 0.624 0.624 0.595 LoRA (r=256) 0.557 0.607 0.625 0.634 0.594 Full Finetuning 0.604 0.641 0.642 0.619 0.599 Table S8: MetaMathQA Results (Forgetting Average) Epoch 1 2 4 8 16 Condition LoRA (r=16) 0.628 0.617 0.616 0.616 0.596 LoRA (r=64) 0.617 0.609 0.608 0.586 0.568 LoRA (r=256) 0.613 0.607 0.599 0.584 0.567 Full Finetuning 0.598 0.599 0.590 0.572 0.559 28 Published in Transactions on Machine Learning Research (08/2024) Table S9: Tülu-v2-mix Results Table S10: Tülu-v2-mix MT-Bench Epoch 2 4 6Condition LoRA (r=16) 5.681 5.997 5.712 LoRA (r=64) 5.597 5.725 5.944 LoRA (r=256) 5.788 5.834 5.894 Full Finetuning 5.825 5.838 5.862 Table S11: Tülu-v2-mix MMLU Epoch 2 4 6Condition LoRA (r=16) 0.491 0.502 0.504 LoRA (r=64) 0.503 0.509 0.504 LoRA (r=256) 0.502 0.496 0.492 Full Finetuning 0.507 0.504 0.502 Table S12: Tülu-v2-mix GSM8K Epoch 2 4 6Condition LoRA (r=16) 0.251 0.275 0.280 LoRA (r=64) 0.285 0.270 0.295 LoRA (r=256) 0.296 0.335 0.301 Full Finetuning 0.324 0.291 0.303 Table S13: Tülu-v2-mix Forgetting Average epoch 2 4 
6condition LoRA (r=16) 0.650 0.657 0.657 LoRA (r=64) 0.649 0.655 0.647 LoRA (r=256) 0.653 0.649 0.629 Full Finetuning 0.660 0.652 0.621 29 Published in Transactions on Machine Learning Research (08/2024) + +# E Supplementary Figures for SVD Analysis + +Figure S7: SVD analysis for 4096 × 4096 matrix W_q at layer 26. Left: singular values for base weights, finetuned weights, and their difference. Right: cumulative explained variance. Notice that for all three matrices, a rank > 1500 is needed to explain 90% of the variance. 0 200 400 600 800 1000 + +i + +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> 0.8 +> 1.0 +> i +> Spectra of random matrices +> A +> A + 0.1B +> 0.1B + +(a) Spectrum of A and A + cB as well as cB for c = 0.1. Notably, A, cB, A + cB are all high rank. 6 5 4 3 2 1 0 + +log 10 (c) + +> 10 +> 8 +> 6 +> 4 +> 2 +> 0 +> log 10 1 +> n | A A + cB | +> MAD between spectra of random matrices +> mean absolute difference +> c2 +> c + +(b) Mean absolute difference between spectra of A and + +A + cB for various c. + +Figure S8: Analyzing the spectra of the sum of two 1000 × 1000 Gaussian i.i.d. matrices. A and B + +are 1000 × 1000 random matrices with i.i.d. standard normal Gaussian entries. 30 Published in Transactions on Machine Learning Research (08/2024) + +# F Solution Generation Diversity on HumanEval + +For the best set of Llama-2-7B models trained on Magicoder for 4 epochs, we evaluate how the pass@k + +metric in the HumanEval benchmark increases as we increase the parameter k which controls the acceptance criterion. The pass@k metric (Chen et al., 2021) is defined as + +$$\text{pass@}k := \mathbb{E}\left[1 - \frac{\binom{n-c}{k}}{\binom{n}{k}}\right], \tag{1}$$ + +where n is the number of generations, c the number of correct generations and k determines the size of the sample set of generations considered for acceptance. Assuming we sample from the model outputs, i.e. 
sampling temperature T > 0, then increasing k will increase the diversity of generations, and increase the likelihood of a passing generation being present in a random subset of size k. Figure S9 reports pass@k for the LoRA models trained on the Magicoder dataset as well as the base Llama-2-7B model. For all models, as we increase k, the pass@k consistently and monotonically improves. Finetuned models' scores are substantially higher than the base model. At k = 1, full finetuning outperforms the LoRA models, whose scores are ordered from largest to smallest rank, as expected. As k increases we observe that all models improve their pass@k scores, and that the gap between them reduces when k > 16. We note that full finetuning is superior across all values of k with temperature 0.8. This complements the results in Fig. 1 which used a temperature of 0.2 and pass@1, where the improvements upon r = 256 at epoch 4 are less clear. 1 2 4 8 16 32 64 128 256 + +> k +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> 0.8 +> pass@k - HumanEval +> Llama-2-7B Epoch=4 +> 256 Generations with Temperature = 0.8 +> Model +> Base +> Full finetuning +> LoRA (r=16, All) +> LoRA (r=64, All) +> LoRA (r=256, All) + +Figure S9: HumanEval pass@k for models trained on the Magicoder dataset. For every model, we sample 256 independent generations with temperature 0.8. 31 Published in Transactions on Machine Learning Research (08/2024) + +# G Training Datasets + +G.1 MetaMathQA (Math IFT) + +The MetaMathQA dataset (Yu et al. (2023), https://huggingface.co/datasets/meta-math/MetaMathQA) contains 395,000 samples that are bootstrapped from the GSM (Cobbe et al., 2021) and Math (Hendrycks et al., 2021) training sets. These samples are augmented by GPT-3.5 using the following methods: + +• Answer Augmentation (155k samples, Yu et al. 
(2023)): this method proposed by the MetaMathQA authors generates multiple reasoning paths for a given mathematical question and filters for generated reasoning paths that contain the correct final answer. + +• Rephrasing (130k samples, Yu et al. (2023)): this method proposed by the MetaMathQA authors uses GPT-3.5 to rephrase questions. They check for the correctness of rephrased questions by using few-shot Chain of Thought prompting to compare reasoning chains and proposed answers with ground truth answers. Both Self-Verification (Weng et al., 2022) and FOBAR (Jiang et al., 2024b) fall under the category of “backward reasoning,” where the question starts with a given condition and requires reasoning backwards to solve for an unknown variable. In order to generate new mathematical questions, a numerical value in the original question is masked as a variable X, and the question is rephrased accordingly. + +• Self-Verification (55k samples, Weng et al. (2022)): the question is rephrased into a declarative statement followed by a question about the masked variable X. + +• FOBAR (55k samples, Jiang et al. (2024b)): this approach is similar to Self-Verification but directly appends the answer to the question using the template “If we know the answer to the above question is $\hat{A}_c$, what is the value of unknown variable x?” MetaMathQA samples are organized into 4 columns: type, original_question, query and response. We include two full examples below: + +Example G.1: MetaMathQA + +> type : GSM_Rephrased query : Scott wants to prepare and freeze a large portion of ratatouille. He purchases 5 pounds of eggplants and 4 pounds of zucchini at a rate of $2.00 per pound from the farmers’ market. Additionally, he needs 4 pounds of tomatoes that cost $3.50 per pound, 3 pounds of onions priced at $1.00 per pound, and a pound of basil sold for $2.50 per half pound. If this quantity yields 4 quarts, what is the cost per quart? 
+> original_question : Scott wants to make and freeze a large batch of ratatouille. At the farmers’ market he buys 5 pounds of eggplants and 4 pounds of zucchini at $2.00 a pound. He needs 4 pounds of tomatoes that are $3.50 a pound. The onions are $1.00 a pound and he needs 3 pounds. Then he needs a pound of basil which is sold for $2.50 per half pound. If this yields 4 quarts, how much does each quart cost? +> response : The cost of the eggplants is 5 pounds x $2.00 per pound = $10.00. The cost of the zucchini is 4 pounds x $2.00 per pound = $8.00. The cost of the tomatoes is 4 pounds x $3.50 per pound = $14.00. The cost of the onions is 3 pounds x $1.00 per pound = $3.00. The cost of the basil is 1 pound x ($2.50 per half pound x 2) = $5.00. The total cost for all the ingredients is $10.00 + $8.00 + $14.00 + $3.00 + $5.00 = $40.00. Since this quantity yields 4 quarts, the cost per quart is $40.00 / 4 quarts = $10.00 per quart. #### 10 The answer is: 10 + +32 Published in Transactions on Machine Learning Research (08/2024) + +Example G.2: MetaMathQA + +type : MATH_SV query : If a school bus leaves school with X students on board, and one-half of the students get off the bus at each of the first three stops, The number of students remaining on the bus after the third stop is 6. What is the value of unknown variable X? + +original_question : If a school bus leaves school with 48 students on board, and one-half of the students get off the bus at each of the first three stops, how many students remain on the bus after the third stop? + +response : To solve this problem, we need to determine the value of x, which represents the initial number of students on the bus. 
Let’s break down the information given: Number of students remaining on the bus after the third stop: 6 Number of students getting off the bus at each stop: 1/2 Number of stops: 3 We can set up the equation as follows: Number of students on the bus * (1/2)ˆ 3 = Number of students remaining on the bus after the third stop x * (1/2)ˆ 3 = 6 Let’s simplify and solve for x: x * (1/8) = 6 To isolate x, we multiply both sides of the equation by 8: x * (1/8) * 8 = 6 * 8 x = 48 The value of x is 48. The answer is: 48 + +G.2 Magicoder-Evol-Instruct-110k (Code IFT) + +As stated in the main text, this dataset contains 72.97M tokens of programming questions and answers (Wei et al., 2023). It reproduces the “Evol-Instruct” dataset of WizardCoder (Luo et al., 2023b) by iteratively prompting an LLM (GPT-4) to increase the difficulty of a set of question-answer pairs from Code Alpaca (Chaudhary, 2023). The dataset can be found here: https://huggingface.co/datasets/ ise-uiuc/Magicoder-Evol-Instruct-110K + +Example G.3: Magicoder-Evol-Instruct-110k + +instruction : Please amend the subsequent Python script so that it includes a ’while’ loop rather than the existing ’for’ loop, which iterates through the items of an integer list. The script currently has a bug where it attempts to print an object that is outside the bounds of the list. Fix this error and modify the script to use ’while’ instead of ’for’ loop. Ensure your script correctly handles empty lists. 
+ +‘‘‘python # Establish an integer list arr = [1, 2, 3, 4] # Determine the length of the list n = len(arr) # Traverse the list and output each individual element for i in range(n+1): print(arr[i]) ‘‘‘ response : + +‘‘‘python # Establish an integer list arr = [1, 2, 3, 4] # Determine the length of the list n = len(arr) # Initialize index at 0 i = 0 # Traverse the list and output each individual element while i < n: print(arr[i]) i += 1 ‘‘‘ + +In the given code, it tries to access ‘arr[n]‘ which is out of bounds as python list indexing starts from 0 and ends at n-1 for a list of length n. Also, I used ‘while‘ loop as per your request. By initializing ‘i=0‘ before the loop and ‘i += 1‘ within the loop, we can iterate over the list. This iteration continues until ‘i‘ is less than ‘n‘, the length of the list. The ‘while‘ loop will not proceed if the list is empty so it correctly handles empty lists. + +G.3 Starcoder Python (Code CPT) + +As stated in the main text, this dataset consists of permissively licensed repositories from GitHub, including Git commits, in 80+ programming languages (Li et al., 2023a) . We chose the Python subset and sub-sampled it to 20B tokens. 
The full dataset can be found here: https://huggingface.co/datasets/bigcode/ starcoderdata + +33 Published in Transactions on Machine Learning Research (08/2024) + +Example G.4: Starcoder-Python + +‘‘‘python """ function of invoking Gitee API """ import base64 import logging import requests from flask import current_app from requests import exceptions logger = logging.getLogger(__name__) ORG_URL = "\href{https://gitee.com/api/v5/orgs}{https://gitee.com/api/v5/orgs}" REPO_URL = "\href{https://gitee.com/api/v5/repos}{https://gitee.com/api/v5/repos}" def get_request(url, params): """ get request """ logger.debug("Get request, connect url: %s", url) try: response = requests.get(url,params=params) return True, response except exceptions.ConnectionError as err: logger.error(err) return False, ’connection error’ except IOError as err: logger.error(err) return False, ’IO error’ ‘‘‘ more functions truncated... + +G.4 OpenWebMath (Math CPT) + +As stated in the main text, this dataset contains 14.7B tokens derived from mathematical web pages from Common Crawl, correctly formatted to preserve mathematical content such as LaTeX equations (Paster et al., 2023) . The dataset can be found here: https://huggingface.co/datasets/open-web-math/ open-web-math . As can be seen from the example below, this dataset contains a large amount of English. 34 Published in Transactions on Machine Learning Research (08/2024) + +Example G.5: OpenWebMath + +url : http://math.stackexchange.com/questions/222974/probability-of-getting -2-aces-2-kings-and-1-queen-in-a-five-card-poker-hand-pa text : # Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand (Part II) So I reworked my formula in method 1 after getting help with my original question - Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand. But I am still getting results that differ...although they are much much closer than before, but I must still be making a mistake somewhere in method 1. 
Anyone know what it is? Method 1 $P(2A \cap 2K \cap 1Q) = P(Q|2A \cap 2K)P(2A|2K)P(2K)$ $$= \frac{1}{12}\frac{{4 \choose 2}{46 \choose 1}}{50 \choose 3}\frac{{4 \choose 2}{48 \choose 3}}{52 \choose 5}$$ $$= \frac{(6)(17296)(6)(46)}{(2598960)(19600)(12)}$$ $$= 4.685642 * 10 ˆ{-5}$$ Method 2 $$\frac{{4 \choose 2} {4 \choose 2}{4 \choose 1}}{52 \choose 5} = \frac{3}{54145}$$ $$5.540678 * 10 ˆ{-5}$$ - Please make an effort to make the question self-contained and provide a link to your earlier question. – Sasha Oct 28 ’12 at 19:56 I think we would rather ahve you edit your initial question by adding your new progress. This avoids having loss of answer and keeps track of progress – Jean-Sébastien Oct 28 ’12 at 19:56 But there already answers to my original question so those answers would not make sense now that I am using a new formula for method 1. – sonicboom Oct 28 ’12 at 20:03 Conditional probability arguments can be delicate. Given that there are exactly two Kings, what’s the $46$ doing? That allows the possibility of more Kings. – André Nicolas Oct 28 ’12 at 20:26 The $46$ is because have already taken two kings from the pack leaving us with 50. And now we have chosen 2 aces and we have to pick the other 1 card from the 50 remaining cards less the 4 aces? – sonicboom Oct 28 ’12 at 20:42 show 1 more comment $$\frac{1}{11}\frac{{4 \choose 2}{44 \choose 1}}{48 \choose 3}\frac{{4 \choose 2}{48 \choose 3}}{52 \choose 5}$$ If you wrote this as $$\frac{{4 \choose 2}{48 \choose 3}}{52 \choose 5}\frac{{4 \choose 2}{44 \choose 1}}{48 \choose 3}\frac{{4 \choose 1}{40 \choose 0}}{44 \choose 1}$$ it might be more obvious why they are the same. + +date : 2014-03-07 11:01:44 + +35 Published in Transactions on Machine Learning Research (08/2024) There is often some confusion about the memory gains that vanilla LoRA offers both in theory and in practice. 
In Appendix H we discuss some of the theoretical benefits of LoRA, and show how it can enable training both on GPUs with less memory and on fewer total GPUs (in the multi-GPU setting). In Appendix I we show how LoRA in practice leads to memory savings relative to full finetuning, but can in fact lead to slower throughput for particular hardware and software settings. + +# H Theoretical Memory Efficiency Gains with LoRA for Single and Multi-GPU Settings + +Modern systems for training neural networks store and operate on the following objects (following the conventions in Rajbhandari et al. (2020)). Most memory requirements relate to model states , which include: + +• parameter weights + +• gradients + +• higher order optimization quantities such as optimizer momentum and variance in the Adam optimizer, and the momentum in the Lion optimizer The remaining memory requirements come from the residual states : + +• activations (which depend on batch size and maximum sample sequence length) + +• temporary buffers for intermediate quantities in the forward and backward pass. which will require more memory when increasing the batch size and maximum sequence lengths. LoRA offers memory savings with respect to the model states . The next two sections describe these memory savings in the single GPU and multi-GPU setting with examples loosely inspired by Rajbhandari et al. (2020). The data stored at single precision includes: + +• a “master copy” of the tuned parameter weights + +• the gradient + +• all optimizer states (both momentum and variance for Adam, and just momentum for Lion) For simplicity, we do not consider mixed-precision training, which involves storing critical data at single precision (fp32; 4 bytes per number) while performing some computations at half precision (fp16 or bfloat16; 2 bytes per number). 
+ +H.1 Training on a Single GPU + +In the single GPU setup, the difference in memory requirements between LoRA and full finetuning is particularly drastic when using the Adam optimizer (Hu et al., 2021; Rajbhandari et al., 2020). Storing the master weights in fp32 requires 4 bytes per parameter, while storing the gradient in fp32 requires 4 bytes per tuned parameter . In order to maintain the optimizer state in fp32 for Adam, 8 bytes per tuned parameter are required; 4 bytes for the momentum term, and 4 bytes for the variance term. Let Ψ be the number of model parameters. Therefore, in the Adam full finetuning setting of a Ψ = 7 B parameter model, the total memory requirements are at least roughly 4 × Ψ + 4 × Ψ + 8 × Ψ = 112 GB. The Lion optimizer only uses a momentum term in the gradient calculation, and the variance term in Adam therefore disappears. In the Lion full finetuning setting of a Ψ = 7 B parameter model, the total memory requirements are therefore roughly 4 × Ψ + 4 × Ψ + 4 × Ψ = 84 GB. LoRA, on the other hand, does not calculate the gradients or maintain optimizer states (momentum and variance terms) for most of the parameters . Therefore the amount of memory used for these terms is drastically reduced. 36 Published in Transactions on Machine Learning Research (08/2024) 7B Training 1 GPU 8 GPUs 16 GPUs 32 GPUs 64 GPUs Adam 112 GB 14 GB 7 GB 3.5 GB 1.75 GB Adam + LoRA 15.12 GB 1.89 GB 0.945 GB 0.4725 GB 0.236 GB Lion 84 GB 10.5 GB 5.25 GB 2.625 GB 1.3125 GB Lion + LoRA 14.84 GB 1.855 GB 0.9275 GB 0.464 GB 0.232 GB Table S14: Theoretical memory required to store the model and optimizer state during training for a 7B parameter model. Note that the numbers exclude memory needed to store activations. FSDP sharding the parameter and optimizer states across N devices results in less memory usage relative to LoRA. LoRA on the other hand enables training on GPUs with far less memory and also enables training without needing as many GPUs to shard across. 
A LoRA setting with Adam that only tunes matrices that are 1% of the total parameter count (e.g. Ψ = 7 B + +base model with 70M additional parameters used by LoRA) requires roughly 4 × Ψ(1 + 0 .01) + 4 × Ψ × 0.01 + 8 × Ψ × 0.01 = 29.12 GB of memory. Theoretically this can be reduced further to 2 × Ψ + 16 × Ψ × 0.01 = + +15.12 GB if the non-tuned parameter weights are stored in bfloat16 . We use this assumption for the subsequent examples. Note again that these numbers do not take into consideration sample batch size or sequence length, which affect the memory requirements of the activations. + +H.2 Training on Multiple GPUs with Fully Sharded Data Parallelism + +Past approaches for training LLMs across multiple GPUs include model parallelism, where different layers of the LLM are stored on different GPUs. However this requires high communication overhead and has very poor throughput (Rajbhandari et al., 2020). Fully Sharded Data Parallelism (FSDP) shards the parameters, the gradient, and the optimizer states across GPUs. This is incredibly efficient and is actually competitive with the memory savings offered by LoRA in certain settings. FSDP sharding of the parameter and optimizer states across N devices results in less memory usage relative to LoRA. LoRA on the other hand enables training on GPUs with far less memory and also enables training on fewer GPUs. For example, in the Adam full finetuning setting of a Ψ = 7 B parameter model on 8 GPUs with FSDP, the total memory requirement for each GPU is roughly (4 × Ψ + 4 × Ψ + 8 × Ψ) /8 = 14 GB. This reduces further to 3.5 GB for FSDP with 32 GPUs (see Table S14). The LoRA with Adam setup on 8 GPUs (where Ψ = 7 B base model and there are 70M additional LoRA parameters) requires roughly (2 × Ψ + 16 × Ψ × 0.01) /8 = 1.89 GB of memory per GPU. With 32 GPUs this decreases further to 0.4725 GB. Standard industry level GPUs have on-device memory between 16 GB (e.g. V100s) and 80 GB (e.g. A100s and H100s). 
As Table S14 demonstrates, the per-GPU memory requirements for training a 7B parameter model decrease drastically as the number of GPUs increases. The memory requirements for training a 7B model with Adam + LoRA on a single GPU are 15.12 GB, but the per-GPU memory requirement for training a 7B model with Adam but without LoRA on 8 GPUs is 14 GB. In this 8 GPU scenario, the efficiency gains from LoRA disappear. Table S15 applies similar calculations to a 70B parameter model. Finetuning such a large model on 8 GPUs is only possible using a technique like LoRA; where Adam requires 140 GB per GPU, Adam+LoRA requires 18.9 GB per GPU. The efficiency gains of LoRA relative to FSDP therefore depend on the model size and GPU availability/cost considerations. We do the same analysis for a 405B parameter model to highlight how LoRA is beneficial as model size scales (Table S16). This is particularly relevant now that Llama-3-405B has been released by Meta (Dubey et al., 2024). 37 Published in Transactions on Machine Learning Research (08/2024) 70B Training 1 GPU 8 GPUs 16 GPUs 32 GPUs 64 GPUs Adam 1.12 TB 140 GB 70 GB 35 GB 17.5 GB Adam + LoRA 151.2 GB 18.9 GB 9.45 GB 4.725 GB 2.36 GB Lion 840 GB 105 GB 52.5 GB 26.25 GB 13.125 GB Lion + LoRA 148.4 GB 18.55 GB 9.275 GB 4.64 GB 2.32 GB Table S15: Theoretical memory required to store the model and optimizer state during training for a 70B parameter model. + +405B Training 1 8 16 32 64 128 256 Adam 6480 810 405 202.5 101.25 50.625 25.3 Adam + LoRA 874.8 109.35 54.675 27.34 13.67 6.83 3.42 Lion 4860 607.5 303.75 151.875 75.94 37.97 18.98 Lion + LoRA 858.6 107.325 53.66 26.83 13.42 6.71 3.35 Table S16: Theoretical memory required to store the model and optimizer state during training for a 405B parameter model. Units are in gigabytes (GB). + +# I LoRA Throughput and Memory Measurements + +We report training efficiency comparisons between full finetuning and models trained with LoRA for various choices of rank. 
We measured both the throughput (in tokens per second) and peak active memory (in GB) for training runs representative of the experiments reported in the paper. We performed the runs using a single node of 8×H100-80GB GPUs. We used a per-GPU micro batch size of 1 and targeted all linear layer weights with LoRA (i.e. both Attention and MLP). In Figure S10 we observe that there is a significant gap between full finetuning and LoRA runs, related to the additional overheads of the LoRA computations. In general, LoRA leads to an approximately 15% + +reduction in throughput for a given batch size. LoRA with higher ranks is slower than lower ranks across all batch sizes; this is particularly noticeable for rank r = 512 . Similarly, LoRA settings with higher batch sizes have slightly higher throughput relative to lower batch sizes. Some of the slowdown is intrinsically related to the overheads of performing LoRA, since in practice it involves more computations of intermediate activations. However, we note that we did not optimize the LoRA implementation and used the publicly available HuggingFace peft library, which might be amenable to further optimizations that could reduce the gap in throughput. For peak memory, we notice that for small batch sizes, LoRA provides a substantial reduction in peak memory ( ∼ 40% ). This is expected since the optimizer state is significantly smaller when using parameter efficient methods. However, as batch size increases, the size of intermediate activations increases proportionally, dominating the required memory. We limit the per GPU micro batch size to 8 to prevent out of memory errors, so for batch sizes 64 and above, we perform gradient accumulation. This leads to the throughput and memory stabilizing for batch size 64 and above, with just around ( ∼ 15% memory savings) for larger batch sizes. 
38 Published in Transactions on Machine Learning Research (08/2024) 8 16 32 64 128 256 + +Batch Size + +> 25000 +> 27500 +> 30000 +> 32500 +> 35000 +> 37500 +> 40000 +> 42500 +> Throughput [tok/s] +> 816 32 64 128 256 + +Batch Size + +> 10 +> 20 +> 30 +> 40 +> 50 +> 60 +> 70 +> Peak Active Memory [GB] + +Finetune + +> Full +> LoRA (R=8) +> LoRA (R=32) +> LoRA (R=128) +> LoRA (R=512) + +Figure S10: Throughput and Memory Measurements for LoRA vs. full finetuning . (left) Training throughput measured in tokens per second across all 8 GPUs. (right) Peak active memory used by the training process in a single GPU (max GPU memory is 80GB). 39 diff --git a/docs/clover_joint_svd.md b/docs/clover_joint_svd.md new file mode 100644 index 0000000..0ae2f88 --- /dev/null +++ b/docs/clover_joint_svd.md @@ -0,0 +1,657 @@ +Title: CLOVER: Cross-Layer Orthogonal Vectors Pruning and Fine-Tuning + +URL Source: https://arxiv.org/pdf/2411.17426 + +Published Time: Mon, 03 Feb 2025 01:40:12 GMT + +Number of Pages: 16 + +Markdown Content: +# CLOVER: Cross-Layer Orthogonal Vectors Pruning and Fine-Tuning + +Fanxu Meng 1 2 Pingzhi Tang 1 Fan Jiang 1 Muhan Zhang 1 2 + +## Abstract + +Decoder-only models generate tokens autoregres-sively by caching key/value vectors, but as the cache grows, inference becomes memory-bound. To address this issue, we introduce CLOVER (Cross-Layer Orthogonal Vectors), a novel ap-proach that treats pairs of attention layers as a set of low-rank decompositions. CLOVER applies Singular Value Decomposition (SVD) to the Q-K + +and V -O pairs within each attention head. The resulting singular values can either guide pruning or serve as trainable parameters for efficient fine-tuning of all orthogonal vectors. After pruning or fine-tuning, these values are reintegrated into the model without increasing its parameter count. We apply CLOVER to various models, including GPT-2 XL, DeepSeek-V2-Lite, Whisper-Large-v3, Stable Diffusion XL, and LLaMA-3.2-11B-Vision. 
Our results demonstrate that CLOVER significantly improves pruning efficiency. For in-stance, the perplexity of pruning 70% of the Q-K + +pairs in GPT-2 XL is similar to that of pruning just 8% with vanilla methods. Fine-tuning the sin-gular values further results in a full-rank update, outperforming state-of-the-art methods (LoRA, DoRA, HiRA, and PiSSA) by 7.6%, 5.5%, 3.8%, and 0.7%, respectively, on eight commonsense tasks for LLaMA-2 7B. + +## 1. Introduction + +In recent years, Large Language Models (LLMs) have rapidly evolved into essential tools for productivity (OpenAI, 2024; Anthropic, 2024; Team et al., 2024a). Open-source models (AI@Meta, 2024; Mistral, 2024; Qwen, 2024; Liu et al., 2024b; Team et al., 2024b; Abdin et al., 2024) have also narrowed the performance gap with closed-source mod-els. The success of LLMs is largely attributed to Next Token Prediction (Radford, 2018; Brown et al., 2020), where to-kens are predicted sequentially, with attention computed between each token and all preceding ones. To avoid redun-dant computations, key-value features are cached. However, as model size grows, the overhead of caching becomes sub-stantial, leading to memory and communication bottlenecks. 𝑊 ! + +## 𝑊 " 𝑊 # + +## 𝑊 $ + +# 𝑥 + +# 𝑦 + +(a) Multi-Head Attention 𝑈 !" + +## 𝑉 !" 𝑈 #$ + +## 𝑉 #$ + +# 𝑥 + +# 𝑦 + +> 𝑆 !" +> 𝑆 #$ + +(b) CLOVER 0.08 0.7 1 + +> Pruning Ratio +> 3 +> 4 +> 5 +> 6 +> 7 +> Log of Perplexity +> Vanilla +> CLOVER + +(c) Pruning without Training 0.125 0.25 0.375 0.5 0.625 + +> 10 +> 12 +> 14 +> 16 +> 18 +> 20 +> 22 +> 24 +> Perplexity +> Vanilla +> CLOVer +> CLOVer + +(d) Fine-Tuning Pruned Model + +Figure 1. (a) We treat the Query-Key and Value-Output layers within a single attention head as a unified structure. (b) Apply SVD to obtain two sets of singular vectors for initializing the Q-K and V-O layers, along with singular values that guide pruning or enable efficient full-rank fine-tuning. 
(c) This cross-layer orthogo-nalization strategy allows for higher pruning rates. (d) The pruned model maintains strong performance after fine-tuning. + +For instance, a 65B parameter model (Touvron et al., 2023) with 8-bit key-value quantization requires over 86GB of GPU memory to store 512K tokens, exceeding the capacity of a single H100-80GB GPU (Sun et al., 2024). To enable efficient training and inference, we introduce CLOVER (Cross-Layer Orthogonal Vectors), a novel method that orthogonalizes the Query, Key, Value, and Out-put vectors without generating additional transformation matrices. As shown in Figure 1a, we treat the Q-K and V - + +O pairs in each attention head as a low-rank decomposition of WQK and WV O . By crossing these layers and perform-1 + +> arXiv:2411.17426v3 [cs.LG] 31 Jan 2025 CLOVER: Cross-Layer Orthogonal Vectors + +ing SVD on WQK and WV O , the Query, Key, Value, and Output vectors become orthogonal within each attention head. Figure 1b illustrates how the resulting singular val-ues can guide pruning or serve as trainable parameters for efficient fine-tuning. After pruning or fine-tuning, these values can be reintegrated into the model without increasing its parameter count. Notably, previous methods, such as SVFT (Lingam et al., 2024), obtain orthogonal vectors by directly performing orthogonal decomposition on the matrix at each layer, which results in an accompanying transfor-mation matrix, doubling the parameter count. In contrast, CLOVER treats the Q-K pairs as transformation matrices for each other, and similarly for the V -O pairs. CLOVER only generates a small set of singular values to guide prun-ing and fine-tuning, which can be merged back into the model without increasing inference costs. + +By orthogonalizing the vectors, we eliminate linear re-dundancy. Attention heads contain numerous non-zero norm vectors. 
Directly pruning these vectors would degrade performance, but orthogonalizing them allows us to repre-sent the entire attention head’s space using a small set of orthogonal bases. The remaining vectors are nearly zero, making them safe to prune. As shown in Figure 1c, prun-ing an average of 45 vectors in the query-key pair using CLOVER results in a perplexity similar to that of vanilla pruning, which prunes only 5 vectors. Moreover, CLOVER generates a singular value matrix between the Q-K and + +V -O pairs. By updating this matrix during fine-tuning, + +CLOVER learns linear combinations of all orthogonal bases within each attention head. In contrast, PiSSA can only learn from a subset of orthogonal vectors, potentially causing some data projections to approach zero in those di-rections, leading to non-functional adapters during training. As shown in Figure 1d, fine-tuning a very small number of singular values can achieve performance close to that of fine-tuning all attention heads. We summarize the contribution of our paper as follows: • We treat the Q-K and V-O pairs in each attention head as low-rank approximations of WQK and WV O . By performing SVD, we orthogonalize the attention head without adding extra transformation matrices. • This orthogonalization reduces linear redundancy, is compatible with any pruning method, and allows for higher pruning ratios. Pruning 46.42% of the vectors in Whisper’s attention head preserves performance with-out requiring additional training. • CLOVER enables efficient full-rank updates, surpass-ing SOTA methods such as LoRA, DoRA, HiRA, and PiSSA on eight commonsense reasoning tasks across LLaMA 7B/13B, LLaMA-2-7B, and LLaMA-3-8B, with additional analyses highlighting its advantages. + +## 2. Related Work + +LLM Compression To mitigate the high memory de-mands of KV Caches in long-context models, several tech-niques have been proposed. 
These include reducing se-quence length with linear attention (Katharopoulos et al., 2020; Wang et al., 2020; Peng et al., 2023; Gu & Dao, 2023; De et al., 2024), dynamic token pruning (Fu et al., 2024; Jo & Shin, 2024; Li et al., 2024b), compressing the key-value rank (Shazeer, 2019; Ainslie et al., 2023; Liu et al., 2024a; Yu et al., 2024), and pruning head dimensions (Ashkboos et al., 2024; Xia et al., 2023; Sun et al., 2023). Additional approaches include sharing key-value representations across layers (Sun et al., 2024; Brandon et al., 2024; Liu et al., 2024c; Zuhri et al., 2024) and quantizing KV cache weights and activations (Frantar et al., 2022; Dettmers et al., 2022; Xiao et al., 2023; Liu et al., 2024e; Hooper et al., 2024). Among them, structure pruning is hardware-friendly but can reduce performance when non-zero dimensions are removed (Ma et al., 2023). Fine-tuning can recover some of the lost performance, but it’s computationally expensive. To address this, Parameter Efficient Fine-Tuning (PEFT) methods are used (Guo et al., 2023). + +Parameter Efficient Fine-Tuning. Several strategies have been introduced to minimize fine-tuning parameters while maintaining performance. These include low-rank adaptation (Hu et al., 2021), partial-parameter fine-tuning (Zaken et al., 2021; Lawton et al., 2023; Zhao et al., 2020; Sung et al., 2021; Ansell et al., 2021; Xu et al., 2021; Guo et al., 2020; Fu et al., 2023), soft prompt fine-tuning (Ham-bardzumyan et al., 2021; Lester et al., 2021; Li & Liang, 2021; Liu et al., 2023b; Vu et al., 2021; Asai et al., 2022; Wang et al., 2023), and sparse matrix fine-tuning (Qiu et al., 2023; Liu et al., 2023a; Yuan et al., 2024). Among these, LoRA is widely used due to its simplicity and effectiveness, with recent works enhancing it further (Zhang et al., 2023; Zi et al., 2023; Liu et al., 2024d; Zhao et al., 2024; Jiang et al., 2024). 
PiSSA (Meng et al., 2024) improves convergence speed by initializing adapters with principal singular values and vectors, also reducing quantization error (Wang et al., 2024a;b; Li et al., 2024a). However, PiSSA is limited by its use of a fixed set of orthogonal bases. SVFT (Lingam et al., 2024) directly applies Singular Value Decomposition (SVD) to the original matrix, but this increases the number of parameters, raising computational overhead and reducing efficiency. The CLOVER method addresses these issues by treating the Query-Key pairs in each attention head as low-rank matrices. Using orthogonal decomposition, CLOVER eliminates the need for additional transformation matrices. Instead, it leverages a small set of singular values to linearly combine orthogonal vectors, making the approach more parameter-efficient. After fine-tuning, the adapter can be smoothly reintegrated into the original matrix structure. 2CLOVER: Cross-Layer Orthogonal Vectors + +## 3. CLOVER: Cross-Layer Orthogonal Vectors + +Below is a step-by-step explanation of the CLOVER method, showing how it orthogonally decomposes the Query, Key, Value, and Output layers in Multi-Head Attention without introducing any additional transformation matrix. We mainly use the computation of the Q-K pair as an example, and then extend it to the V-O pair. + +Multi-Head Self-Attention Setup. In a multi-head self-attention mechanism with $H$ heads, each head $h \in \{1, \dots, H\}$ computes an attention score as: + +$$\mathrm{attn}(Q_h, K_h) = \mathrm{softmax}\left(\frac{Q_h K_h^\top}{\sqrt{d}}\right),$$ + +where $H$ is the number of attention heads, $d$ is the dimensionality of each head, $X \in \mathbb{R}^{n \times D}$ is the input matrix ($n$ is the sequence length, $D$ is the total hidden dimension), $Q_h, K_h \in \mathbb{R}^{n \times d}$ are the query and key representations for head $h$, and $W_Q, W_K \in \mathbb{R}^{D \times H \times d}$ are weights for projecting the input $X$ into queries and keys. 
Specifically, the queries and keys for head $h$ are obtained by multiplying $X$ with the corresponding “slice” of $W_Q$ and $W_K$, respectively: + +$$Q_h = X W_Q^{[:,h,:]}, \quad K_h = X W_K^{[:,h,:]}.$$ + +Cross Layers Merging. Substituting $Q_h$ and $K_h$ into $Q_h K_h^\top$, we have: + +$$Q_h K_h^\top = X \, W_Q^{[:,h,:]} \, {W_K^{[:,h,:]}}^\top X^\top.$$ + +Notice that the original weights $W_Q^{[:,h,:]}$ and $W_K^{[:,h,:]}$ are each in $\mathbb{R}^{D \times d}$; once multiplied together, the resulting matrix $W_{QK}^h = W_Q^{[:,h,:]} \, {W_K^{[:,h,:]}}^\top$ has dimension $D \times D$. Since $d \ll D$, using $W_{QK}^h$ directly in computations—or storing it as trainable parameters—would be highly inefficient, limiting the use cases of such parameter merging. + +Cross Layers Orthogonal Decomposition To address the large size of $W_{QK}^h$, we factorize $W_{QK}^h$ via SVD: + +$$W_{QK}^h = U_{QK}^h \, S_{QK}^h \, {V_{QK}^h}^\top,$$ + +where $U_{QK}^h$ is a $D \times D$ orthogonal matrix, $S_{QK}^h$ is a $D \times D$ diagonal matrix of singular values, and $V_{QK}^h$ is another $D \times D$ orthogonal matrix. Since $W_Q^{[:,h,:]}$ and $W_K^{[:,h,:]}$ each have shape $\mathbb{R}^{D \times d}$, the rank of $W_{QK}^h$ is at most $d$. Thus the actual non-zero singular values in $S_{QK}^h$ are at most $d$. We can truncate the SVD to keep only the top-$r$ singular values without loss: + +$$W_{QK}^h = U_{QK}^h[:, :r] \; S_{QK}^h[:r, :r] \; V_{QK}^h[:, :r]^\top,$$ + +where $r \le d$. The process can be easily applied to $W_V$ and $W_O$, as introduced in Appendix A.1. + +CLOVER for Pruning After performing SVD, we can rewrite the weight matrix $W_{QK}^h$ as follows: + +$$W_{QK}^h = \underbrace{U_{QK}^h[:, :r] \; S_{QK}^h[:r, :r]}_{\tilde{U}^h \in \mathbb{R}^{D \times r}} \; \underbrace{V_{QK}^h[:, :r]^\top}_{\tilde{V}^h \in \mathbb{R}^{r \times D}}.$$ + +Instead of storing the full matrices $W_Q^h$ and $W_K^h \in \mathbb{R}^{D \times d}$, we store the smaller factors $\tilde{U}^h \in \mathbb{R}^{D \times r}$ and $\tilde{V}^h \in \mathbb{R}^{r \times D}$, which can be significantly smaller than the original matrix since $r \le d \ll D$. This leads to a reduction in memory usage and computational cost. 
Additionally, we can prune singular values (and their corresponding singular vectors) below a chosen threshold. This further reduces the parameter count and computational overhead. + +CLOVER for Fine-Tuning CLOVER can be used not only for pruning, but also for parameter-efficient fine-tuning. We freeze the matrices $U_{QK}^h[:, :r]$ and $V_{QK}^h[:, :r]$, and only fine-tune the singular values $S_{QK}^h[:r, :r]$. In contrast to SVFT, which factorizes the entire weight matrices $W_Q, W_K, W_V, W_O \in \mathbb{R}^{D \times D}$ individually, CLOVER factorizes the merged weights $W_{QK}^h$ and $W_{VO}^h$ within each attention head, significantly reducing the parameters. By applying SVD factorization within each attention head, CLOVER constrains the effective rank of the cross-layer matrix to $d$. As a result, the tunable matrix $S_{QK}$ has a size bounded by $H \times d \times d$ (considering all heads). In comparison, SVFT requires factorizing large matrices each into three components ($U, S, V \in \mathbb{R}^{D \times D}$), leading to a significant increase in parameter count and computational overhead, even with sparse updates for the singular values $S$. For example, consider the LLaMA 2-7B model with $H = 32$ attention heads and a head dimension of $d = 128$. By factorizing each head separately, the largest size for $S_{QK}$ is $O(32 \times 128 \times 128)$, which is significantly smaller than factorizing a matrix in $\mathbb{R}^{4096 \times 4096}$. This makes CLOVER’s parameter efficiency comparable to that of a LoRA configuration with rank 32, as shown in Appendix A.2, but with additional potential for pruning. 3CLOVER: Cross-Layer Orthogonal Vectors + +> Table 1. Pruning GPT-2-XL’s attention layers with CLOVER and vanilla pruning at various ratios, evaluating perplexity on Wikitext2 (lower is better), and fine-tuning on OpenWebText with different token budgets. The base model’s perplexity is 14.78. 
+ +Pruning Ratio w/o Training Perplexity (↓) 66M Tokens Perplexity (↓) 131M Tokens Perplexity (↓) + +Vanilla CLOVER Vanilla CLOVER CLOVER † Vanilla CLOVER CLOVER † + +12.5% 33.76 15.89 16.04 15.45 15.67 16.38 15.77 15.42 + +25.0% 78.36 17.45 16.93 15.70 15.89 17.07 16.05 15.75 + +37.5% 159.4 20.95 18.17 16.17 16.60 18.14 16.48 16.41 + +50.0% 338.9 35.12 20.45 17.22 17.63 19.02 17.13 17.71 + +62.5% 538.5 85.25 24.65 19.32 20.64 21.44 18.40 20.39 + +75.0% 708.8 187.4 36.04 24.65 29.28 27.22 20.99 28.44 + +## 4. Experiments + +As detailed in Section 3, CLOVER is highly effective for both pruning and fine-tuning. We present a series of experiments to validate these capabilities. In Section 4.1, we compare CLOVER with Vanilla pruning on a GPT-2-XL model (Radford et al., 2019). CLOVER results in less performance degradation, while Vanilla pruning significantly harms the model’s performance, making recovery difficult even with fine-tuning. In Section 4.2, we conduct fine-tuning experiments on eight commonsense tasks, comparing CLOVER with state-of-the-art methods. The results show the effectiveness of CLOVER’s linear combinations of all orthogonal vectors. In Section 4.3, CLOVER is applied to various models. We visualize how it removes linear redundancy between vectors, enabling more efficient pruning. In Section 4.4, we demonstrate CLOVER’s ability to perform significant pruning on the Whisper model, which exhibits substantial linear redundancy, without requiring fine-tuning. In Section 4.5, we explain the importance of learning from all the orthogonal vectors by analyzing the projection of data features onto different directions in the model. In Section 4.6, we confirm CLOVER’s full-rank update capability by visualizing the singular value distribution of ∆W from various methods. 
Finally, in Section 4.7, we show how CLOVER fine-tunes the model using its inherent properties, without introducing “intrusive dimension” like LoRA, which may risk model degradation (Shuttleworth et al., 2024). + +4.1. CLOVER for Large Ratio Pruning + +Due to the need to compute attention between each token and all preceding tokens, compressing atten-tion—particularly the key-value layers—is crucial, despite the larger number of parameters in the MLP. CLOVER rep-resents each attention head with a small number of vectors. Since it only modifies the initialization, it can be combined with any other pruning technique. This paper validates the proposed method using basic structured pruning on GPT-2-XL, rather than targeting state-of-the-art performance. We initialize GPT-2-XL with CLOVER, then prune small singu-lar values based on their magnitude. To maintain inference efficiency, we apply the same pruning rate across all layers, removing a fixed percentage of the smallest singular vectors. The singular values, S, are then merged into the U and V + +matrices. For comparison, we also prune without CLOVER orthogonalization, using L2-norms for pruning. After prun-ing, we evaluate perplexity on the WikiText-2 (Merity et al., 2016) dataset. We then fine-tune the pruned models on the OpenWebText (Gokaslan & Cohen, 2019) dataset following nanoGPT 1. To minimize disruption to the original model, we fine-tune only the pruned attention layers, leaving the MLP, embedding layers, and LM head unchanged. In the CLOVER † case, after pruning, S is not immediately merged into the U and V matrices but is used for parameter-efficient fine-tuning, with the merging occurring afterward. We ad-just the learning rate from 6e-4 to 6e-3 and remove weight decay, while keeping other hyperparameters consistent with the other two methods. Based on Table 1, CLOVER causes less damage to the model than Vanilla pruning, as it transfers functionality into fewer orthogonal bases. 
For example, pruning 50% of the parameters without further fine-tuning, CLOVER’s perplex-ity only increases by 1.38 ×, while Vanilla pruning increases by 21.9 ×. After fine-tuning, CLOVER’s performance far exceeds that of Vanilla pruning. Due to its lower model disruption, CLOVER requires fewer tokens for fine-tuning to restore performance (e.g., perplexity with 66M tokens is close to that with 131M tokens), whereas Vanilla pruning needs more tokens, resulting in higher costs and potential degradation in out-of-domain tasks. Furthermore, by fine-tuning only the singular values from the SVD decomposition and the attention layer biases, CLOVER achieves recovery with fewer training resources and parameter changes. At lower pruning rates, CLOVER even outperforms full atten-tion layer training. However, when pruning rates are too high, accuracy loss becomes significant, and the available parameters for fine-tuning become insufficient (e.g., at 75% pruning, only 0.15% of the original attention layer parame-ters are updated). + +> 1https://github.com/karpathy/nanoGPT + +4CLOVER: Cross-Layer Orthogonal Vectors + +> Table 2. Accuracy comparison of LLaMA 7B/13B, LLaMA2 7B, and LLaMA3 8B with various PEFT methods on eight commonsense reasoning datasets. Results of LoRA and DoRA are taken from (Liu et al., 2024d). Results of HiRA are taken from (Anonymous, 2025). + +Model Method Params BoolQ PIQA SIQA Hella Swag Wino Grande ARC-e ARC-c OBQA Avg. 
+ +ChatGPT - - 73.1 85.4 68.5 78.5 66.1 89.8 79.9 74.8 77.0 LLaMA-7B Series 0.99% 63.0 79.2 76.3 67.9 75.7 74.5 57.1 72.4 70.8 Parallel 3.54% 67.9 76.4 78.8 69.8 78.9 73.7 57.3 75.2 72.2 LoRA 0.83% 68.9 80.7 77.4 78.1 78.8 77.8 61.3 74.8 74.7 DoRA 0.84% 69.7 83.4 78.6 87.2 81.0 81.9 66.2 79.2 78.4 PiSSA 0.83% 74.1 85.4 81.5 94.0 85.0 85.6 72.1 84.2 82.7 CLOVER 0.83% 72.9 86.34 82.1 94.9 85.4 87.5 74.4 86.4 83.7 + +LLaMA-13B Series 0.80% 71.8 83 79.2 88.1 82.4 82.5 67.3 81.8 79.5 Parallel 2.89% 72.5 84.9 79.8 92.1 84.7 84.2 71.2 82.4 81.4 LoRA 0.67% 72.1 83.5 80.5 90.5 83.7 82.8 68.3 82.4 80.5 DoRA 0.68% 72.4 84.9 81.5 92.4 84.2 84.2 69.6 82.8 81.5 PiSSA 0.67% 74.6 88.0 82.9 95.5 87.0 90.3 77.2 88.2 85.4 CLOVER 0.67% 75.2 88.4 83.1 96.0 87.8 89.7 79.3 89.8 86.2 + +LLaMA2-7B LoRA 0.83% 69.8 79.9 79.5 83.6 82.6 79.8 64.7 81.0 77.6 DoRA 0.84% 71.8 83.7 76.0 89.1 82.6 83.7 68.2 82.4 79.7 HiRA 0.83% 71.2 83.4 79.5 88.1 84.0 86.7 73.8 84.6 81.4 PiSSA 0.83% 75.0 87.0 81.6 95.0 86.5 88.5 75.9 86.4 84.5 CLOVER 0.83% 75.0 86.4 82.0 95.1 87.5 89.6 76.6 89.4 85.2 + +LLaMA3-8B LoRA 0.70% 70.8 85.2 79.9 91.7 84.3 84.2 71.2 79.0 80.8 DoRA 0.71% 74.6 89.3 79.9 95.5 85.6 90.5 80.4 85.8 85.2 HiRA 0.70% 75.4 89.7 81.2 95.4 87.7 93.3 82.9 88.3 86.7 PiSSA 0.70% 77.2 90.0 82.9 96.6 88.4 93.6 82.4 87.4 87.3 CLOVER 0.47% 76.4 89.3 82.1 96.9 89.9 93.6 84.5 90.6 87.9 + +4.2. CLOVER for Full-Rank Fine-Tuning + +In this section, we evaluate CLOVER against LoRA (Hu et al., 2021), DoRA (Liu et al., 2024d), HiRA (Anonymous, 2025), and PiSSA (Meng et al., 2024) on commonsense reasoning tasks, excluding SVFT (Lingam et al., 2024) due to its significant overhead. The tasks are divided into eight sub-tasks, as outlined in Table 4. Following the DoRA setup, we fine-tune the Commonsense-170k dataset and evaluate each sub-task’s test set. We apply orthogonal de-composition to the Value-Output and fine-tune the resulting singular value matrix. 
Due to the non-linear RoPE(Su et al., 2024) operation between the query and key, we perform orthogonal decomposition in the Key layer and fine-tune the transition matrix. Similarly, we treat the 64 consecu-tive dimensions in the MLP.Up layer as a head, applying orthogonal decomposition and updating the transition ma-trix. The learnable parameters of LLaMA 7B/13B (Touvron et al., 2023) and LLaMA-2-7B (AI@Meta, 2023) match LoRA/DoRA/HiRA/PiSSA with rank 32 updates. LLaMA-3-8B (AI@Meta, 2024) has 2/3 of the trainable parameters compared to the other models. For a fair comparison, we use the hyperparameters from DoRA (3 epochs, batch size 16, linear scheduler learning rate). We adjusted the learning rate based on DoRA’s approach and found that CLOVER per-forms best with lr=1e-4, which we applied across all models. PiSSA was trained using the same hyperparameters, but with a learning rate of 2e-5, as specified in its original pa-per. Due to the stable performance of PiSSA and CLOVER during training, we did not perform validation every 80 it-erations, as done in DoRA, to select the best-performing model on the validation set for testing. Instead, we trained for the full 3 epochs and used the final model for testing. HiRA’s results are taken directly from its original paper, while the other results are sourced from DoRA’s paper. Ta-ble 2 demonstrates that CLOVER consistently outperforms all other methods across all models and tasks. Specifically, on LLaMA 7B, CLOVER outperforms LoRA, DoRA, and PiSSA by 9%, 5.3%, and 1%, respectively. On LLaMA 13B, CLOVER outperforms these methods by 5.7%, 4.7%, and 0.8%. On LLaMA-2-7B, CLOVER surpasses LoRA, DoRA, HiRA, and PiSSA by 7.6%, 5.5%, 3.8%, and 0.7%. Even on LLaMA-3-8B, with fewer trainable parameters, CLOVER outperforms by 7.1%, 2.7%, 1.2%, and 0.6%. CLOVER leads in most sub-tasks and ranks second in a few. 
5CLOVER: Cross-Layer Orthogonal Vectors 0 50 100 + +> 0.0 +> 0.5 +> 1.0 +> 1.5 +> 2.0 +> WQ WK +> (10, 0.93) +> Vanilla +> CLOVER +> 050 100 +> Sorted Dimensions +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> WV WTO +> (72, 0.13) +> Vanilla +> CLOVER + +(a) DeepSeek-V2-Lite 0 25 50 75 + +> 0 +> 1 +> 2 +> 3 +> 4 +> 5 +> WQ WK +> (1, 2.66) +> Vanilla +> CLOVER +> 025 50 75 +> Sorted Dimensions +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> WV WTO +> (27, 0.29) +> Vanilla +> CLOVER + +(b) Llama-3.2-Vision 0 20 40 60 + +> 0.00 +> 0.25 +> 0.50 +> 0.75 +> 1.00 +> 1.25 +> 1.50 +> WQ WK +> (7, 0.63) +> Vanilla +> CLOVER +> 020 40 60 +> Sorted Dimensions +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> 0.8 +> WV WTO +> (35, 0.16) +> Vanilla +> CLOVER + +(c) Whisper-Large-v3 0 20 40 60 + +> 1 +> 2 +> 3 +> WQ WK +> (29, 1.25) +> Vanilla +> CLOVER +> 020 40 60 +> Sorted Dimensions +> 0.2 +> 0.4 +> 0.6 +> 0.8 +> WV WTO +> (26, 0.46) +> Vanilla +> CLOVER + +(d) SDXL 0 50 100 + +> 0.00 +> 0.02 +> 0.04 +> 0.06 +> 0.08 +> WQ WK +> (9, 0.04) +> Vanilla +> CLOVER +> 050 100 +> Sorted Dimensions +> 0.00 +> 0.02 +> 0.04 +> 0.06 +> 0.08 +> WV WTO +> (44, 0.02) +> Vanilla +> CLOVER + +(e) CLIP-ViT-BigG + +Figure 2. CLOVER (orange) uses fewer orthogonal basis vectors than Vanilla Pruning (blue) to span the attention head space. The first row shows the importance of Q-K dimensions, and the second row shows V-O dimensions. After the red dot, CLOVER’s importance is lower, and pruning these vectors results in less performance loss. + +4.3. CLOVER Removal Redundant Vectors + +CLOVER achieves a higher pruning ratio due to the sig-nificant linear redundancy present in the model. By repre-senting the entire attention head with only a small number of orthogonal vectors, CLOVER effectively removes this redundancy. 
To illustrate the advantages of CLOVER in eliminating linear redundancy, we apply it to a wide range of models, including the large language model DeepSeek-V2-Lite (DeepSeek-AI, 2024), the multimodal automatic speech recognition and speech translation model Whisper-Large-v3 (Radford et al., 2023), the multimodal instruction-tuned image reasoning generative model LLaMA-3.2-11B-Vision (AI@Meta, 2024), the image encoder CLIP-ViT-bigG (Cherti et al., 2022), and the image generation model Stable Diffusion XL (Podell et al., 2023). We compute the L2 norm for each dimension (equal to singular values) in both the Q-K pair and the V-O pair, sorting the values in descending order within each attention head for better visualization. For comparison, we also perform Vanilla Pruning, which does not utilize CLOVER initialization but instead sorts directly based on the L2 norm. Figure 2 showcases the first attention head from the first layer of each model. In the first row of the figure, depicting the Q-K norm, we observe that in the original model, the importance of each dimension is relatively balanced (e.g., Figure 2c). This balanced distribution is a result of the linear redundancy, where different directions are intertwined, making it challenging to prune individual directions without negatively affecting the model’s performance. However, after applying CLOVER’s orthogonal decomposition, only a small number of orthogonal bases on the left side exhibit significantly large norms. These vectors span almost the entire attention head’s space, and the remaining vectors have norms that approach zero, indicating that they are already represented by the dominant singular vectors and can be pruned without loss of performance. Beyond the red intersection point, CLOVER’s remaining vectors exhibit consistently lower importance than those in Vanilla Pruning, meaning pruning these vectors results in less performance degradation. 
This demonstrates why CLOVER enables a higher pruning ratio. A similar trend is observed for the V-O pair, although the model’s inherent sparsity is less pronounced than in the Q-K pair, making the effect less noticeable. Still, in most models, pruning half of the vectors has a smaller impact on performance compared to Vanilla Pruning. Notably, in CLIP-ViT-bigG (Figure 2e), a proportion of the vectors already have a norm of zero, allowing for safe pruning. + +4.4. CLOVER for Training-Free Pruning + +As demonstrated by the prominent low-rank properties in Figure 2c, we applied pruning to the Whisper-Large-v3 model (Radford et al., 2023). To intuitively highlight the effectiveness of CLOVER pruning, we present an example using an audio input from the LibriSpeech Long dataset (Gandhi et al., 2023). For reference, the waveform of this input is shown in Figure 3, and the corresponding target translation script is provided in Appendix A.4. After applying CLOVER to orthogonalize the vectors, we pruned vectors with magnitudes close to zero ($\|W_Q\| \, \|W_K\| \le 5 \times 10^{-3}$ and $\|W_V\| \, \|W_O^\top\| \le 6 \times 10^{-3}$). This pruning achieved ratios of 56.01% and 36.82% for the parameters in the Q-K Pair and V-O Pair, respectively. Remarkably, the model’s output remains nearly unchanged, with only one error, which has been highlighted in the text using strikethrough and red for clarity: 6CLOVER: Cross-Layer Orthogonal Vectors 0 2e5 4e5 6e5 8e5 10e5 + +> Samples +> 0.5 +> 0.0 +> 0.5 +> Amplitude + +Figure 3. An audio waveform from the LibriSpeech dataset. + +Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter’s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. 
He has grave doubts whether Sir Frederick Layton’s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell’s pictures are a sort of Up Guards and Adam paintings, and Mason’s exquisite idles are as national as a jingo poem. Mr. Birkett Foster’s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And , and Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, next man. + +In contrast, using a vanilla pruning method with the same pruning ratio, the model completely fails to produce valid outputs: + +... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... + +This example validates our earlier claim that straightfor-ward pruning of non-zero dimensions can lead to accumu-lated loss. In contrast, CLOVER effectively eliminates lin-ear redundancy, enabling a significantly higher pruning ra-tio. When the linear redundancy is sufficiently pronounced, CLOVER can even achieve a high pruning ratio without the need for fine-tuning to recover performance. + +4.5. Necessity of Full-Direction Fine-Tuning + +Besides pruning with a large ratio, CLOVER is capable of learning linear combinations of all orthogonal vectors within each attention head. This capability allows CLOVER to resemble full-parameter fine-tuning more closely. To highlight the advantages of updating all orthogonal bases, we randomly sampled 16 instances from the Commonsense dataset, fed them into the model, and performed SVD to the model. We then recorded the projection magnitudes of input features along all orthogonal directions. Figure 4 visualizes the results for the middle layer, revealing the following insights: 1) Without accounting for the scaling effect of singular val-ues, the projection magnitude along the principal singular vector consistently exceeds that in other directions. 
This ob-servation supports PiSSA’s approach, which updates based Top 256 + +> 10% +> Next 256 +> 7% +> Remaining +> 76% +> Bottom 256 +> 7% + +(a) PiSSA Top 256 + +> 6% +> Next 256 +> 7% +> Remaining +> 81% +> Bottom 256 +> 6% + +(b) LoRA Top 256 + +> 18% +> Next 256 10% +> Remaining +> 65% +> Bottom 256 +> 7% + +(c) PiSSA with Singular Value 100% (d) CLOVER + +Figure 4. Proportion of data projections across different compo-nents in random directions (LoRA) versus orthogonal directions (PiSSA), as well as all orthogonal directions (CLOVER). + +on the principal singular values and vectors, leading to im-proved training performance. In contrast, LoRA projects in random directions, resulting in uniform projection magni-tudes across all directions. 2) The singular values in the original model reflect the im-portance of each direction in the pretraining task. The model amplifies the components along directions with larger sin-gular values and suppresses those along smaller singular values. Therefore, it is crucial to consider the scaling effect of singular values. As shown in Figure 4c, the projection magnitude along the principal singular vector direction in-creases to 18%. 3) While more data projections align with the principal singular vector at higher ranks, 82% of the feature compo-nents are still projected onto other directions. In extreme cases, if a task is entirely orthogonal to the vectors used by PiSSA, training on such a task may result in zero gradients, thereby limiting its learning capacity. Under the same rank constraint, 94% of the feature components in LoRA are pro-jected outside the LoRA adapter, making it more susceptible to the zero-gradient problem. Since CLOVER updates across all orthogonal directions, as shown in Figure 4d it effectively mitigates this issue. Con-sequently, CLOVER outperforms both LoRA and PiSSA in multi-task learning, even when using the same or fewer learnable parameters (Section 4.2). 
7CLOVER: Cross-Layer Orthogonal Vectors + +4.6. Visualizing Rank Updates + +To demonstrate that CLOVER achieves full-rank updates, we multiply the updated singular values with their corresponding singular vectors and perform SVD on the base model ($S_{QK}$ applied to the Key layer, $S_{VO}$ to the Value layer, and $S_{UD}$ to the Up layer). We take LoRA and Full Fine-Tuning for comparison. Figure 5 shows the singular values of the middle layer in LLaMA-2-7B, revealing that CLOVER and Full Fine-Tuning achieve full-rank updates, while LoRA is constrained by its low-rank design. 0 1000 2000 3000 4000 + +> 0.0 +> 0.5 +> 1.0 +> 1.5 +> 2.0 + +(a) Full Fine-Tuning 0 1000 2000 3000 4000 + +> 0 +> 2 +> 4 +> 6 + +(b) LoRA 0 1000 2000 3000 4000 + +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> 0.8 + +(c) CLOVER + +Figure 5. ∆W is low rank in LoRA, while full rank for Full-Fine-Tuning and CLOVER. + +4.7. CLOVER Avoids Intrusive Dimensions + +Recent research (Shuttleworth et al., 2024) has highlighted an issue with LoRA, referred to as the “intrusive dimensions” phenomenon. As illustrated in Figure 6b, LoRA introduces new random directions into the model, which possess large magnitudes and thus precede all the original singular vectors. The study suggests that these “intrusive dimensions” can degrade the model’s performance, exacerbating catastrophic forgetting during continual learning with LoRA. In contrast, CLOVER addresses this issue by fixing all orthogonal bases and updating only the vector combinations. As a result, the changes introduced by CLOVER fine-tuning closely resemble those generated by full parameter fine-tuning, as shown in Figure 6a and Figure 6c. 0 100 200 300 400 + +> 0 +> 100 +> 200 +> 300 +> 400 + +(a) Full Fine-Tuning 0 100 200 300 400 + +> 0 +> 100 +> 200 +> 300 +> 400 + +(b) LoRA 0 100 200 300 400 + +> 0 +> 100 +> 200 +> 300 +> 400 + +(c) CLOVER + +Figure 6. Intruder dimensions phenomenon in LoRA, which does not exist in Full Fine-Tuning and CLOVER. + +## 5. 
Conclusion and Limitations + +In this paper, we introduce Cross-Layer Orthogonal Vectors (CLOVER), a method that orthogonalizes vectors within attention heads without requiring additional transformation matrices. This orthogonalization process condenses effective parameters into fewer vectors, improving the pruning ratio. By fine-tuning the singular values obtained through orthogonalization, CLOVER learns linear combinations of orthogonal bases, enabling full-rank updates. When applied to prune 50% of the attention head parameters in GPT-2-XL, CLOVER results in a perplexity that is just one-tenth of that achieved by standard pruning methods. For Whisper-Large-v3, CLOVER removes 46.42% of the parameters without fine-tuning, while preserving model performance. Furthermore, when used for fine-tuning, CLOVER outperforms state-of-the-art methods such as LoRA, DoRA, HiRA, and PiSSA, achieving superior results with equal or fewer trainable parameters. We also demonstrate how CLOVER removes linear redundancy to facilitate pruning and discuss the necessity of fine-tuning across all orthogonal bases. Visual comparisons of models fine-tuned with different methods further illustrate its effectiveness. Despite its advantages, CLOVER has some limitations. When nonlinear operations are present between Q-K or V-O pairs (such as with the widely-used RoPE (Su et al., 2024)), cross-layer orthogonalization is not feasible. In these cases, we instead perform head-wise orthogonalization within the Key layer during fine-tuning. Fortunately, CLOVER Fine-Tuning can apply intra-layer attention head orthogonalization, while CLOVER Pruning remains applicable to many popular models, including DeepSeek (DeepSeek-AI, 2024; Liu et al., 2024b) (which uses Decoupled RoPE), ViT and SDXL (which use absolute positional encoding), and BLOOM (Le Scao et al., 2023) (which employs ALiBi relative positional encoding (Press et al., 2021)). 
Additionally, as a newly proposed method, our current evaluation focuses primarily on basic pruning tasks and does not include comparisons with other state-of-the-art pruning techniques. However, because CLOVER does not alter the model structure and only updates the initialization method, it can be combined with existing pruning methods to further enhance their effectiveness. As a novel technique, CLOVER holds considerable promise for future applications. For instance, it could be combined with quantization methods to eliminate outliers, guide pruning and fine-tuning based on data feature directions, or even inspire new model architectures. 8CLOVER: Cross-Layer Orthogonal Vectors + +## Impact Statement + +This paper proposes a cross-layer orthogonal initialization method to guide model pruning and efficient fine-tuning, offering valuable insights for the application and development of large models. Both application directions aim to reduce training and inference costs, lower computational overhead, decrease power consumption, and minimize carbon emissions. + +## References + +Abdin, M., Aneja, J., Behl, H., Bubeck, S., Eldan, R., Gunasekar, S., Harrison, M., Hewett, R. J., Javaheripi, M., Kauffmann, P., et al. Phi-4 technical report. arXiv preprint arXiv:2412.08905, 2024. AI@Meta. Llama 2: Open foundation and fine-tuned chat models. CoRR, abs/2307.09288, 2023. doi: 10.48550/arXiv.2307.09288. URL https://doi.org/10.48550/arXiv.2307.09288. AI@Meta. Llama 3 model card, 2024. URL https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md. Ainslie, J., Lee-Thorp, J., de Jong, M., Zemlyanskiy, Y., Lebrón, F., and Sanghai, S. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245, 2023. Anonymous. HiRA: Parameter-efficient hadamard high-rank adaptation for large language models. In The Thirteenth International Conference on Learning Representations, 2025. URL https://openreview. 
net/forum?id=TwJrTz9cRS .Ansell, A., Ponti, E. M., Korhonen, A., and Vuli ´c, I. Composable sparse fine-tuning for cross-lingual trans-fer. arXiv preprint arXiv:2110.07560, 2021. Anthropic. Claude 3.5 sonnet, 2024. URL https://www. anthropic.com/news/claude-3-5-sonnet .Asai, A., Salehi, M., Peters, M. E., and Hajishirzi, H. Attempt: Parameter-efficient multi-task tuning via at-tentional mixtures of soft prompts. arXiv preprint arXiv:2205.11961, 2022. Ashkboos, S., Croci, M. L., Nascimento, M. G. d., Hoefler, T., and Hensman, J. Slicegpt: Compress large language models by deleting rows and columns. arXiv preprint arXiv:2401.15024, 2024. Bisk, Y., Zellers, R., Gao, J., Choi, Y., et al. Piqa: Reason-ing about physical commonsense in natural language. In Proceedings of the AAAI conference on artificial intelligence, volume 34, pp. 7432–7439, 2020. Brandon, W., Mishra, M., Nrusimha, A., Panda, R., and Kelly, J. R. Reducing transformer key-value cache size with cross-layer attention. arXiv preprint arXiv:2405.12981, 2024. Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language models are few-shot learners. Advances in neural information processing systems , 33: 1877–1901, 2020. Cherti, M., Beaumont, R., Wightman, R., Wortsman, M., Ilharco, G., Gordon, C., Schuhmann, C., Schmidt, L., and Jitsev, J. Reproducible scaling laws for contrastive language-image learning. arXiv preprint arXiv:2212.07143, 2022. Clark, C., Lee, K., Chang, M.-W., Kwiatkowski, T., Collins, M., and Toutanova, K. Boolq: Exploring the surpris-ing difficulty of natural yes/no questions. arXiv preprint arXiv:1905.10044, 2019. Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., and Tafjord, O. Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457, 2018. De, S., Smith, S. 
L., Fernando, A., Botev, A., Cristian-Muraru, G., Gu, A., Haroun, R., Berrada, L., Chen, Y., Srinivasan, S., et al. Griffin: Mixing gated linear recur-rences with local attention for efficient language models. arXiv preprint arXiv:2402.19427, 2024. DeepSeek-AI. Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. CoRR ,abs/2405.04434, 2024. URL https://doi.org/10. 48550/arXiv.2405.04434 .Dettmers, T., Lewis, M., Belkada, Y., and Zettlemoyer, L. Gpt3. int8 (): 8-bit matrix multiplication for transform-ers at scale. Advances in Neural Information Processing Systems, 35:30318–30332, 2022. Frantar, E., Ashkboos, S., Hoefler, T., and Alistarh, D. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323, 2022. Fu, Q., Cho, M., Merth, T., Mehta, S., Rastegari, M., and Najibi, M. Lazyllm: Dynamic token pruning for efficient long context llm inference. arXiv preprint arXiv:2407.14057, 2024. Fu, Z., Yang, H., So, A. M.-C., Lam, W., Bing, L., and Collier, N. On the effectiveness of parameter-efficient fine-tuning. In Proceedings of the AAAI Conference on Artificial Intelligence , volume 37, pp. 12799–12807, 2023. 9CLOVER: Cross-Layer Orthogonal Vectors + +Gandhi, S., von Platen, P., and Rush, A. M. Distil-whisper: Robust knowledge distillation via large-scale pseudo la-belling. arXiv preprint arXiv:2311.00430, 2023. Gokaslan, A. and Cohen, V. Openwebtext cor-pus. http://Skylion007.github.io/ OpenWebTextCorpus , 2019. Gu, A. and Dao, T. Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752, 2023. Guo, D., Rush, A. M., and Kim, Y. Parameter-efficient transfer learning with diff pruning. arXiv preprint arXiv:2012.07463, 2020. Guo, S., Xu, J., Zhang, L. L., and Yang, M. Com-presso: Structured pruning with collaborative prompting learns compact large language models. arXiv preprint arXiv:2310.05015, 2023. 
Hambardzumyan, K., Khachatrian, H., and May, J. Warp: Word-level adversarial reprogramming. arXiv preprint arXiv:2101.00121, 2021. Hooper, C., Kim, S., Mohammadzadeh, H., Mahoney, M. W., Shao, Y. S., Keutzer, K., and Gholami, A. Kvquant: Towards 10 million context length llm in-ference with kv cache quantization. arXiv preprint arXiv:2401.18079, 2024. Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., and Chen, W. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685, 2021. Jiang, T., Huang, S., Luo, S., Zhang, Z., Huang, H., Wei, F., Deng, W., Sun, F., Zhang, Q., Wang, D., et al. Mora: High-rank updating for parameter-efficient fine-tuning. arXiv preprint arXiv:2405.12130, 2024. Jo, H.-r. and Shin, D. A2sf: Accumulative attention scoring with forgetting factor for token pruning in transformer decoder. arXiv preprint arXiv:2407.20485, 2024. Katharopoulos, A., Vyas, A., Pappas, N., and Fleuret, F. Transformers are rnns: Fast autoregressive transform-ers with linear attention. In International conference on machine learning, pp. 5156–5165. PMLR, 2020. Lawton, N., Kumar, A., Thattai, G., Galstyan, A., and Steeg, G. V. Neural architecture search for parameter-efficient fine-tuning of large pre-trained language models. arXiv preprint arXiv:2305.16597, 2023. Le Scao, T., Fan, A., Akiki, C., Pavlick, E., Ili ´c, S., Hesslow, D., Castagn ´e, R., Luccioni, A. S., Yvon, F., Gall ´e, M., et al. Bloom: A 176b-parameter open-access multilingual language model. 2023. Lester, B., Al-Rfou, R., and Constant, N. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691, 2021. Li, M., Lin, Y., Zhang, Z., Cai, T., Li, X., Guo, J., Xie, E., Meng, C., Zhu, J.-Y., and Han, S. Svdqunat: Absorb-ing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007, 2024a. Li, X. L. and Liang, P. Prefix-tuning: Optimizing continuous prompts for generation. 
arXiv preprint arXiv:2101.00190, 2021. Li, Y., Huang, Y., Yang, B., Venkitesh, B., Locatelli, A., Ye, H., Cai, T., Lewis, P., and Chen, D. Snapkv: Llm knows what you are looking for before generation. arXiv preprint arXiv:2404.14469, 2024b. Lingam, V., Tejaswi, A., Vavre, A., Shetty, A., Gudur, G. K., Ghosh, J., Dimakis, A., Choi, E., Bojchevski, A., and Sanghavi, S. Svft: Parameter-efficient fine-tuning with singular vectors. arXiv preprint arXiv:2405.19597, 2024. Liu, A., Feng, B., Wang, B., Wang, B., Liu, B., Zhao, C., Dengr, C., Ruan, C., Dai, D., Guo, D., et al. Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434 ,2024a. Liu, A., Feng, B., Xue, B., Wang, B., Wu, B., Lu, C., Zhao, C., Deng, C., Zhang, C., Ruan, C., et al. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 ,2024b. Liu, A., Liu, J., Pan, Z., He, Y., Haffari, G., and Zhuang, B. Minicache: Kv cache compression in depth dimension for large language models. arXiv preprint arXiv:2405.14366, 2024c. Liu, S.-Y., Wang, C.-Y., Yin, H., Molchanov, P., Wang, Y.-C. F., Cheng, K.-T., and Chen, M.-H. Dora: Weight-decomposed low-rank adaptation. arXiv preprint arXiv:2402.09353, 2024d. Liu, W., Qiu, Z., Feng, Y., Xiu, Y., Xue, Y., Yu, L., Feng, H., Liu, Z., Heo, J., Peng, S., et al. Parameter-efficient orthogonal finetuning via butterfly factorization. arXiv preprint arXiv:2311.06243, 2023a. Liu, X., Zheng, Y., Du, Z., Ding, M., Qian, Y., Yang, Z., and Tang, J. Gpt understands, too. AI Open, 2023b. Liu, Z., Yuan, J., Jin, H., Zhong, S., Xu, Z., Braverman, V., Chen, B., and Hu, X. Kivi: A tuning-free asym-metric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750, 2024e. 10 CLOVER: Cross-Layer Orthogonal Vectors + +Ma, X., Fang, G., and Wang, X. Llm-pruner: On the struc-tural pruning of large language models. Advances in neural information processing systems , 36:21702–21720, 2023. Meng, F., Wang, Z., and Zhang, M. 
Pissa: Principal singular values and singular vectors adaptation of large language models. arXiv preprint arXiv:2404.02948, 2024. Merity, S., Xiong, C., Bradbury, J., and Socher, R. Pointer sentinel mixture models, 2016. Mihaylov, T., Clark, P., Khot, T., and Sabharwal, A. Can a suit of armor conduct electricity? a new dataset for open book question answering. arXiv preprint arXiv:1809.02789, 2018. Mistral. Cheaper, better, faster, stronger: Continuing to push the frontier of ai and making it accessible to all, 2024. URL https://mistral.ai/news/ mixtral-8x22b .OpenAI. Hello GPT-4o, 2024. URL https://openai. com/index/hello-gpt-4o/ .Peng, B., Alcaide, E., Anthony, Q., Albalak, A., Arcadinho, S., Biderman, S., Cao, H., Cheng, X., Chung, M., Grella, M., et al. Rwkv: Reinventing rnns for the transformer era. arXiv preprint arXiv:2305.13048, 2023. Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M ¨uller, J., Penna, J., and Rombach, R. Sdxl: Im-proving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952, 2023. Press, O., Smith, N. A., and Lewis, M. Train short, test long: Attention with linear biases enables input length extrapolation. arXiv preprint arXiv:2108.12409, 2021. Qiu, Z., Liu, W., Feng, H., Xue, Y., Feng, Y., Liu, Z., Zhang, D., Weller, A., and Sch ¨olkopf, B. Controlling text-to-image diffusion by orthogonal finetuning. Advances in Neural Information Processing Systems , 36:79320– 79362, 2023. Qwen. Qwen2.5: A party of foundation models, 2024. URL + +https://qwenlm.github.io/blog/qwen2.5 .Radford, A. Improving language understanding by genera-tive pre-training. 2018. Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019. Radford, A., Kim, J. W., Xu, T., Brockman, G., McLeavey, C., and Sutskever, I. Robust speech recognition via large-scale weak supervision. 
In International conference on machine learning, pp. 28492–28518. PMLR, 2023. Sakaguchi, K., Bras, R. L., Bhagavatula, C., and Choi, Y. Winogrande: An adversarial winograd schema challenge at scale. Communications of the ACM , 64(9):99–106, 2021. Sap, M., Rashkin, H., Chen, D., LeBras, R., and Choi, Y. Socialiqa: Commonsense reasoning about social interac-tions. arXiv preprint arXiv:1904.09728, 2019. Shazeer, N. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150, 2019. Shuttleworth, R., Andreas, J., Torralba, A., and Sharma, P. Lora vs full fine-tuning: An illusion of equivalence. arXiv preprint arXiv:2410.21228, 2024. Su, J., Ahmed, M., Lu, Y., Pan, S., Bo, W., and Liu, Y. Roformer: Enhanced transformer with rotary position embedding. Neurocomputing, 568:127063, 2024. Sun, M., Liu, Z., Bair, A., and Kolter, J. Z. A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695, 2023. Sun, Y., Dong, L., Zhu, Y., Huang, S., Wang, W., Ma, S., Zhang, Q., Wang, J., and Wei, F. You only cache once: Decoder-decoder architectures for language models. arXiv preprint arXiv:2405.05254, 2024. Sung, Y.-L., Nair, V., and Raffel, C. A. Training neural networks with fixed sparse masks. Advances in Neural Information Processing Systems , 34:24193– 24205, 2021. Team, G., Georgiev, P., Lei, V. I., Burnell, R., Bai, L., Gulati, A., Tanzer, G., Vincent, D., Pan, Z., Wang, S., et al. Gemini 1.5: Unlocking multimodal understand-ing across millions of tokens of context. arXiv preprint arXiv:2403.05530, 2024a. Team, G., Riviere, M., Pathak, S., Sessa, P. G., Hardin, C., Bhupatiraju, S., Hussenot, L., Mesnard, T., Shahri-ari, B., Ram ´e, A., et al. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118, 2024b. Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi `ere, B., Goyal, N., Hambro, E., Azhar, F., et al. 
LLaMA: Open and efficient founda-tion language models. arXiv preprint arXiv:2302.13971, 2023. Vu, T., Lester, B., Constant, N., Al-Rfou, R., and Cer, D. Spot: Better frozen model adaptation through soft prompt transfer. arXiv preprint arXiv:2110.07904, 2021. Wang, S., Li, B. Z., Khabsa, M., Fang, H., and Ma, H. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768, 2020. 11 CLOVER: Cross-Layer Orthogonal Vectors + +Wang, S., Yu, L., and Li, J. Lora-ga: Low-rank adap-tation with gradient approximation. arXiv preprint arXiv:2407.05000, 2024a. Wang, Z., Panda, R., Karlinsky, L., Feris, R., Sun, H., and Kim, Y. Multitask prompt tuning enables parameter-efficient transfer learning. arXiv preprint arXiv:2303.02861, 2023. Wang, Z., Liang, J., He, R., Wang, Z., and Tan, T. Lora-pro: Are low-rank adapters properly optimized? arXiv preprint arXiv:2407.18242, 2024b. Xia, M., Gao, T., Zeng, Z., and Chen, D. Sheared llama: Accelerating language model pre-training via structured pruning. arXiv preprint arXiv:2310.06694, 2023. Xiao, G., Lin, J., Seznec, M., Wu, H., Demouth, J., and Han, S. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning , pp. 38087–38099. PMLR, 2023. Xu, R., Luo, F., Zhang, Z., Tan, C., Chang, B., Huang, S., and Huang, F. Raise a child in large language model: Towards effective and generalizable fine-tuning. arXiv preprint arXiv:2109.05687, 2021. Yu, H., Yang, Z., Li, S., Li, Y., and Wu, J. Effectively com-press kv heads for llm. arXiv preprint arXiv:2406.07056, 2024. Yuan, S., Liu, H., and Xu, H. Bridging the gap between low-rank and orthogonal adaptation via householder reflection adaptation. arXiv preprint arXiv:2405.17484, 2024. Zaken, E. B., Ravfogel, S., and Goldberg, Y. Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199, 2021. 
Zellers, R., Holtzman, A., Bisk, Y., Farhadi, A., and Choi, Y. Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830, 2019. Zhang, Q., Chen, M., Bukharin, A., Karampatziakis, N., He, P., Cheng, Y., Chen, W., and Zhao, T. Adalora: Adaptive budget allocation for parameter-efficient fine-tuning. arXiv preprint arXiv:2303.10512, 2023. Zhao, J., Zhang, Z., Chen, B., Wang, Z., Anandkumar, A., and Tian, Y. Galore: Memory-efficient llm training by gradient low-rank projection. arXiv preprint arXiv:2403.03507, 2024. Zhao, M., Lin, T., Mi, F., Jaggi, M., and Schütze, H. Masking as an efficient alternative to finetuning for pretrained language models. arXiv preprint arXiv:2004.12406, 2020. Zi, B., Qi, X., Wang, L., Wang, J., Wong, K.-F., and Zhang, L. Delta-lora: Fine-tuning high-rank parameters with the delta of low-rank matrices. arXiv preprint arXiv:2309.02411, 2023. Zuhri, Z. M. K., Adilazuarda, M. F., Purwarianti, A., and Aji, A. F. Mlkv: Multi-layer key-value heads for memory efficient transformer decoding. arXiv preprint arXiv:2406.09297, 2024. 12 CLOVER: Cross-Layer Orthogonal Vectors + +## A. Appendix + +A.1. Cross Layer Orthogonal Vectors in Value and Output layers + +In the main text, we only presented the orthogonalization process for the Q-K pair. Here, we provide the method for orthogonalizing the V-O pair. Additionally, for up-down layers, the output dimension of the Up layer can be reshaped into block number × block size, followed by performing orthogonal decomposition within each block. + +$$Y = \mathrm{attn}(Q, K)\, V W_O, \quad V = X W_V \in \mathbb{R}^{b \times h \times n \times d} \tag{1}$$ + +$${} = \mathrm{attn}(Q, K)\, X W_V W_O, \quad W_V W_O = W_{VO} = U S V \in \mathbb{R}^{h \times D \times D} \tag{2}$$ + +$${} = \mathrm{attn}(Q, K)\, X U S V, \quad S_{[:,\, r_{vo}:,\, r_{vo}:]} = 0, \quad S_{VO} \in \mathbb{R}^{h \times r_{vo} \times r_{vo}}, \quad r_{vo} \le d \tag{3}$$ + +$${} = \mathrm{attn}(Q, K)\, X U_{VO} S_{VO} V_{VO}, \quad U_{VO} \in \mathbb{R}^{D \times h \times r_{vo}}, \quad V_{VO} \in \mathbb{R}^{h \times r_{vo} \times D}. \tag{4}$$ + +Through this series of transformations, $W_V$ and $W_O$ can be equivalently replaced by orthogonal vectors $U_{VO}$ and $V_{VO}$, along with the diagonal matrix $S_{VO}$. 
Since $r_{vo} \le d$, the zero singular values and their corresponding singular vectors can be safely pruned. After guided pruning, $S_{VO}$ can be merged into $U_{VO}$ and $V_{VO}$, resulting in no additional computational overhead. + +A.2. Hyperparameters + +Table 3 presents a comparison of hyperparameters for different fine-tuning methods on commonsense tasks. The target model remains the same for LoRA, DoRA, HiRA, and PiSSA. However, DoRA introduces an additional magnitude module, leading to a slightly higher parameter count. In a single layer of LoRA, the trainable parameters are as follows: + +Q = 4096 × 32 + 4096 × 32 + +K = 4096 × 32 + 4096 × 32 + +V = 4096 × 32 + 4096 × 32 + +Up = 4096 × 32 + 11008 × 32 + +Down = 4096 × 32 + 11008 × 32 + +The total sum is 1,753,088. In CLOVER, the trainable parameters are: + +QK = 32 × 128 × 128 + +VO = 32 × 128 × 128 + +UD = 172 × 64 × 64 + +The total sum is also 1,753,088. Since CLOVER inserts trainable parameters across layers, we use the Q-K pair notation to represent its target model. When CLOVER updates parameters within an attention head, the number of trainable parameters matches exactly that of LoRA at rank 32. To adjust the number of learnable parameters, CLOVER can either span multiple heads or split a single head into multiple blocks. Both PiSSA and CLOVER exhibit stable training performance. Therefore, instead of validating every 80 steps, we omit frequent validation, improving training efficiency. + +A.3. Detail Information of Dataset + +The commonsense reasoning tasks consist of 8 subtasks, each with predefined training and testing sets, as described by LLM-Adapters (Hu et al., 2023). The following table lists the details of each sub-dataset. 13 CLOVER: Cross-Layer Orthogonal Vectors + +> Table 3. Detailed training hyperparameters. Q-K, V-O, and U-D denote the pairs of orthogonal vectors that CLOVER updates. 
+ +| Method | Target | Evaluation steps | LR | Scheduler | Batch size | Warmup Steps | Epochs | +| --- | --- | --- | --- | --- | --- | --- | --- | +| LoRA | Q,K,V,U,D | 80 | 3e-4 | Linear | 16 | 100 | 3 | +| DoRA | Q,K,V,U,D | 80 | 2e-4 | Linear | 16 | 100 | 3 | +| HiRA | Q,K,V,U,D | 80 | 2e-4/2e-4 | Linear | 32 | 100 | 3 | +| PiSSA | Q,K,V,U,D | – | 2e-5 | Linear | 16 | 100 | 3 | +| CLOVER | Q-K, V-O, U-D | – | 1e-4 | Linear | 16 | 100 | 3 | + +> Table 4. Details of datasets for commonsense reasoning tasks. + +| Dataset | Train | Test | About | +| --- | --- | --- | --- | +| BoolQ (Clark et al., 2019) | 9,427 | 3,270 | Naturally occurring yes/no questions from unconstrained settings. | +| PIQA (Bisk et al., 2020) | 16,113 | 1,838 | Questions with two solutions requiring physical commonsense. | +| SIQA (Sap et al., 2019) | 33,410 | 1,954 | Reasoning about actions and social implications. | +| HellaSwag (Zellers et al., 2019) | 39,905 | 10,042 | Commonsense NLI questions with context and endings. | +| WinoGrande (Sakaguchi et al., 2021) | 40,398 | 1,267 | Fill-in-the-blank task with binary options. | +| ARC-e (Clark et al., 2018) | 2,251 | 2,376 | Grade-school multiple-choice science questions in Easy sets. | +| ARC-c (Clark et al., 2018) | 1,119 | 1,172 | Grade-school multiple-choice science questions in Challenge sets. | +| OBQA (Mihaylov et al., 2018) | 4,957 | 500 | Questions requiring multi-step reasoning and commonsense knowledge. | + +For WinoGrande, the original dataset includes multiple partitions: [xs, s, m, l, xl, debiased]. While LLM-Adapters simply concatenated all these partitions, note that the “xl” partition actually includes all others, leading to extensive data duplication. After removing duplicates, the training data is reduced from 63.2K to 40.4K instances. Additionally, in the LLM-Adapters paper, the training set sizes of ARC Challenge and ARC Easy were reversed by mistake; here, we correct that error. + +A.4. LibriSpeech Long dataset target transcript + +Below is the reference text of the LibriSpeech Long dataset for comparison. + +Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter’s manner less interesting than his matter. 
He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton’s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell’s pictures are a sort of Up Guards and Adam paintings, and Mason’s exquisite idles are as national as a jingo poem. Mr. Birkett Foster’s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth, and Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, next man. + +In fact, with Vanilla Pruning ratios of just 22.31% and 6.69% for WQ-WK and WV -WO , respectively, the model’s output is already significantly degraded. + +Mr. Colter is the personal of the classes, and we are glad to welcome his gospel. Nor is Mr. Colter’s manner less interesting than his manner. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly he is drawn from eating and its results occur most readily to the mind. He is very dull, so very frequently, and is very Greek after all, and can discover in it but little of Rocky Ithaca. The Nell’s pictures are sort of up-guard to Adam’s paintings, and Mason’s exquisite idylls are as national as a jingle poem. Mr. Burke and Foster’s landscapes smile at one much in the same way as Mr. Parker, Mr. Flash is tits. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampoo and a Turkish bath, Next man. + +A.5. Visualizing more attention heads + +In Section 4.3, we only presented the first attention head in the first layer. Here, we provide a broader view by showcasing more attention heads. Figure 7 illustrates the L2 norm of all Q-K heads in the first, middle, and last layers of Whisper-Large-14 CLOVER: Cross-Layer Orthogonal Vectors + +v3. 
Figure 8 shows the L2 norm of all Q-K heads in the first, middle, and last layers of ViT-bigG. From these figures, we can observe that CLOVER consistently represents the entire attention head with fewer orthogonal bases across all layers and all attention heads. This property forms the foundation of CLVOER’s effectiveness in enhancing pruning. 0 200 400 600 800 1000 1200 + +> 0 +> 2 +> 4 +> 6 +> 8 +> 10 +> Layer.0.qk +> Absorb and Decompose +> Vanilla +> 0200 400 600 800 1000 1200 +> 0 +> 2 +> 4 +> 6 +> 8 +> Layer.15.qk +> 0200 400 600 800 1000 1200 +> 0.0 +> 2.5 +> 5.0 +> 7.5 +> 10.0 +> 12.5 +> 15.0 +> Layer.31.qk + +Figure 7. The L2-norm for the 0-th, 15-th, and 31-st attention layers in the Whisper-large-v3 encoder. The blue line represents the results after redundancy removal using the CLOVER method, while the orange line depicts the L2-norm directly computed for each dimension. + +15 CLOVER: Cross-Layer Orthogonal Vectors 0 250 500 750 1000 1250 1500 + +> 0.0 +> 0.1 +> 0.2 +> 0.3 +> 0.4 +> 0.5 + +Layer.0.qk + +> Absorb and Decompose +> Vanilla +> 0250 500 750 1000 1250 1500 +> 0.0 +> 0.5 +> 1.0 +> 1.5 +> 2.0 +> 2.5 + +Layer.23.qk + +> 0250 500 750 1000 1250 1500 +> 0.0 +> 0.2 +> 0.4 +> 0.6 +> 0.8 +> 1.0 + +Layer.47.qk + +Figure 8. The L2-norm for the 0-th, 15-th, and 31-st attention layers in the ViT-bigG. The blue line represents the results after redundancy removal using the CLOVER method, while the orange line depicts the L2-norm directly computed for each dimension. 
+ +16 diff --git a/docs/psoft_principal_subspace_oft.md b/docs/psoft_principal_subspace_oft.md new file mode 100644 index 0000000..c0716a8 --- /dev/null +++ b/docs/psoft_principal_subspace_oft.md @@ -0,0 +1,1345 @@ +Title: Efficient Orthogonal Fine-Tuning with Principal Subspace Adaptation + +URL Source: https://arxiv.org/pdf/2505.11235 + +Published Time: Fri, 20 Feb 2026 01:42:14 GMT + +Number of Pages: 29 + +Markdown Content: +Published as a conference paper at ICLR 2026 + +# EFFICIENT ORTHOGONAL FINE-TUNING WITH + +# PRINCIPAL SUBSPACE ADAPTATION + +Fei Wu, Jia Hu ∗, + +Geyong Min ∗, + +Shiqiang Wang + +Department of Computer Science, University of Exeter, UK + +{fw407,j.hu,g.min,s.wang9}@exeter.ac.uk + +## ABSTRACT + +Driven by the rapid growth of model parameters, parameter-efficient fine-tuning (PEFT) has become essential for adapting large models to diverse downstream tasks under constrained computational resources. Within this paradigm, orthogonal fine-tuning and its variants preserve semantic representations of pre-trained models, but struggle to achieve both expressiveness and efficiency in terms of parameter counts, memory, and computation. To overcome this limitation, we propose efficient Orthogonal Fine-Tuning with Principal Subspace adaptation (PSOFT), which confines orthogonal transformations to the principal subspace of pre-trained weights. Specifically, PSOFT constructs this subspace via matrix decomposition to enable compatible transformations, establishes a theoretical condition that strictly maintains the geometry of this subspace for essential semantic preservation, and introduces efficient tunable vectors that gradually relax orthogonality during training to enhance adaptability. Extensive experiments on 35 NLP and CV tasks across four representative models demonstrate that PSOFT offers a practical and scalable solution to simultaneously achieve semantic preservation, expressiveness, and multi-dimensional efficiency in PEFT. 
The code is publicly available at + +https://github.com/fei407/PSOFT . + +## 1 INTRODUCTION + +Pre-trained foundation models including large language models (LLMs) (Grattafiori et al., 2024) and vision transformers (ViT) (Dosovitskiy et al., 2021) have transformed natural language pro-cessing (NLP) (Qin et al., 2023) and computer vision (CV) (Liu et al., 2023). This success is attributed to emergent abilities (Wei et al., 2022) that arise as these models are scaled up. However, their ever-growing scale poses a practical barrier to efficiently tailoring ( i.e., fine-tuning) these sophisticated foundation models to specific downstream tasks. To address this challenge, parameter-efficient fine-tuning (PEFT) has emerged as a promising paradigm that adapts models by updating only a minimal subset of parameters (Houlsby et al., 2019; Lester et al., 2021; Li & Liang, 2021; Hu et al., 2022; Meng et al., 2024; yang Liu et al., 2024). Pretrained Weights + +> Wpre ∈ℝ d×n A∈ℝ d×r B∈ℝ r×n +> xhLoRA xhPretrained Weights +> Wpre ∈ℝ d×n +> Orthogonal Weights +> R⊥∈ℝ d×d +> OFT Residual Weights +> Wres ∈ℝ d×n A⊥∈ℝ d×r +> Principal Weights +> Principal Weights +> B∈ℝ r×n +> xhPSOFT +> R⊥∈ℝ r×r +> +×+× +> alpha ∈ℝ rbeta ∈ℝ r + +Figure 1: Overview of the architectures of LoRA, OFT, and the proposed PSOFT. Among PEFT studies, reparameterization-based methods (Hu et al., 2022; Qiu et al., 2023) are widely adopted because they seamlessly inte-grate with pre-trained weights without adding inference latency. As illustrated in the left panel of Figure 1, reparameterization-based methods include Low-Rank Adaptation (LoRA) (Hu et al., 2022) and Orthogonal Fine-Tuning (OFT) (Liu et al., 2021; Qiu et al., 2023). LoRA has been widely adopted for its efficient low-rank structure, but it may distort semantic representations embedded in the pre-trained weights. These semantic rep-resentations can be understood as the geometric + +> ∗ + +Corresponding author. 
+ +1 + +> arXiv:2505.11235v3 [cs.LG] 19 Feb 2026 + +Published as a conference paper at ICLR 2026 + +Table 1: Comparison of LoRA, OFT variants, and the proposed PSOFT. The table summarizes the trade-off among semantic preservation, multi-dimensional efficiency, and expressiveness (as reflected in performance) across PEFT methods. + +| Methods | Semantic Representations (explicitly preserved) | Parameter-efficiency Mechanism | Memory Usage | Computational Overhead | Performance | +| --- | --- | --- | --- | --- | --- | +| LoRA (Hu et al., 2022) | ✗ | Low-rank | Low | Low | Medium | +| Full OFT (Liu et al., 2021) | Full space | ✗ | Very High | Very High | High | +| Block-diagonal OFT (Qiu et al., 2023) | Full space | Block-diagonal | Medium | Medium | Medium-High | +| BOFT (Liu et al., 2024) & qGOFT (Ma et al., 2024) | Full space | Butterfly factorization / Givens rotation | High | High | High | +| PSOFT (Ours) | Principal subspace | Low-rank | Low ↓ | Low ↓ | High ↑ | + +structure of weight vectors, specifically the pairwise angles and norms among columns, which encode relational information learned during pre-training. Distorting this structure may weaken the model’s ability to transfer knowledge to downstream tasks (Wang et al., 2023). In contrast, OFT applies isometric orthogonal transformations, which strictly maintain this geometric structure and thereby preserve semantic representations. However, full-dimensional orthogonal transformations are inefficient in terms of parameter counts, memory, and computation, rendering them impractical for large-scale applications. This contrast leaves a gap in PEFT between the efficiency of LoRA and the semantic preservation of OFT. Building on OFT’s advantages, several studies have explored ways to improve its efficiency while retaining its core strength. Early attempts such as block-diagonal OFT (Qiu et al., 2023) reduced parameter counts and partially alleviated computational and memory overhead through block-diagonal sparsity. 
However, the rigid block structure restricts the model’s expressiveness (its ability to capture diverse transformations) and consequently limits the performance that can be empirically attained. To address this limitation, later variants such as BOFT (Liu et al., 2024) and qGOFT (Ma et al., 2024) have sought to restore expressiveness while maintaining parameter efficiency by composing multiple sparse orthogonal matrices in sequence. Yet this design incurs a new drawback: chaining multiple sparse matrices introduces substantial intermediate states that dominate runtime and memory consumption. Empirically, qGOFT has been reported to run nearly 6× slower than LoRA during training (Ma et al., 2024), while BOFT and qGOFT frequently consume more than 80 GB of memory in large-scale model settings. Such overhead inflates training costs and undermines their practicality. Thus, sparsity-driven OFT variants struggle to achieve both expressiveness and efficiency across multiple dimensions. This tension underlies the central challenge of our work: + +How to design a PEFT method that simultaneously achieves semantic preservation, expressiveness, and multi-dimensional efficiency (parameter counts, memory, and computation)? + +To address this challenge, motivated by evidence that both pre-trained models and their task-specific adaptations reside in a low intrinsic rank (Li et al., 2018; Aghajanyan et al., 2021; Hu et al., 2022), we propose efficient Orthogonal Fine-Tuning with Principal Subspace adaptation ( PSOFT ), as illustrated in the right panel of Figure 1. The key idea is to confine orthogonal transformations to the low-rank principal subspace of pre-trained weights, thereby overcoming the limitations of conventional OFT operating in the full parameter space and simultaneously achieving semantic preservation, expressiveness, and multi-dimensional efficiency. 
However, realizing this idea is non-trivial, as it entails overcoming several technical difficulties: 1) Compatibility. A low-dimensional orthogonal transformation cannot be directly applied to the high-dimensional weight matrix, leading to dimensional incompatibility with the pre-trained model. 2) Geometry preservation. Naively applying low-rank orthogonal transformations may distort the geometry of the subspace, thereby undermining the strict preservation of essential semantic representations. 3) Adaptability. Strict orthogonality constraints may hinder adaptation to slight task-specific drifts, resulting in suboptimal performance on downstream tasks. PSOFT resolves these difficulties through principled designs. First, it constructs a principal subspace of pre-trained weights through matrix decomposition, enabling compatible orthogonal transformations and yielding a higher rank that enhances expressiveness. Next, it establishes a theoretical condition to strictly maintain the geometry of the subspace, thereby ensuring essential semantic preservation. Finally, it introduces efficient tunable vectors to gradually relax orthogonality during training at negligible cost, improving adaptability across diverse downstream tasks. We evaluate PSOFT through extensive experiments on 35 NLP and CV tasks with four representative pre-trained models. Compared with OFT variants, PSOFT consistently avoids out-of-memory (OOM) 2Published as a conference paper at ICLR 2026 failures and accelerates training. On small-scale models, it achieves up to 18 × higher parameter efficiency with the lowest memory footprint among baselines, without compromising average per-formance. On larger models, PSOFT lowers the memory footprint of OFT to a level comparable with LoRA-like methods while outperforming LoRA on GSM-8K (+2.3%) and Commonsense Rea-soning (+1.4%) with comparable parameter counts. 
As summarized in Table 1, PSOFT preserves semantic representation in the principal subspace while minimizing parameter counts, memory, and computation overhead, and simultaneously maintains expressiveness as reflected in high performance. The main contributions of this work are summarized as follows: • We introduce a new low-rank perspective that unifies efficiency and expressiveness in OFT, bridging the gap between low-rank adaptation and orthogonal fine-tuning. • We establish a theoretical condition under which low-dimensional orthogonal fine-tuning strictly preserves the geometric structure of the subspace. • We propose PSOFT, a framework that confines OFT to the principal subspace with theoretical guarantees and practical adaptability. • We validate PSOFT through extensive experiments, establishing a practical and scalable solution to simultaneously achieve semantic preservation, expressiveness, and multi-dimensional efficiency. + +## 2 RELATED WORK + +Parameter-Efficient Fine-Tuning (PEFT). PEFT adapts pre-trained models to diverse down-stream tasks by fine-tuning only a small subset of parameters. Specifically, existing PEFT meth-ods fall into three categories: 1) Selection-based methods select specific components of the pre-trained model without altering its architecture (Zaken et al., 2022; Song et al., 2024; Xu & Zhang, 2024). 2) Addition-based methods insert prompts or adapters at the input or within Transformer blocks (Houlsby et al., 2019; Pfeiffer et al., 2021; Lester et al., 2021; Li & Liang, 2021; Liu et al., 2022). 3) Reparameterization-based methods reparameterize weights in parallel with minimal parameters (Hu et al., 2022; Azizi et al., 2024; Bałazy et al., 2024; Gao et al., 2024; Kopiczko et al., 2024; Lingam et al., 2024; yang Liu et al., 2024; Meng et al., 2024). 
Reparameterization-based methods are particularly appealing since they incur no additional inference latency, with represen-tative examples including LoRA (Hu et al., 2022) and OFT (Qiu et al., 2023). LoRA’s variants, such as PiSSA (Meng et al., 2024) and DoRA (yang Liu et al., 2024), improve convergence through re-initialization and enhance performance via weight decomposition, respectively. DoRA decomposes the low-rank update into direction and magnitude components, but it may introduce additional mem-ory and computational overhead for computing these components. In addition, LaMDA (Azizi et al., 2024) and LoRA-XS (Bałazy et al., 2024) reduce the parameter count and resource usage of LoRA by employing more compact matrices. In LoRA-XS, the learnable square matrix is constrained by the fixed LoRA matrices, which may limit its expressiveness. However, these LoRA-based methods may induce semantic drift from the pre-trained representations (Wang et al., 2023), which may degrade output quality in generative tasks. + +Orthogonal Fine-Tuning (OFT). Unlike additive methods such as LoRA, multiplicative OFT preserves semantic representations of pre-trained models through orthogonal transformations, which maintains the hyperspherical energy among neurons (Liu et al., 2021; Qiu et al., 2023). To mitigate the prohibitive cost of applying orthogonal transformations over the full parameter space, prior studies typically introduce sparsity constraints. For instance, block-diagonal OFT (Qiu et al., 2023) adopts a block-diagonal sparse structure to reduce parameter counts, though at the risk of undesired inductive biases (Liu et al., 2024). BOFT (Liu et al., 2024) and qGOFT (Ma et al., 2024) address this issue by replacing dense matrices with sequences of sparse multiplications, thereby improving parameter efficiency while restoring expressiveness. Nevertheless, these variants remain less efficient in memory and computation than LoRA and its variants. 
In parallel, Adapter$^R$ (Zhang & Pilanci, 2024) rotates the top spectral space using orthogonal transformations to preserve spectral characteristics of pretrained weights, in contrast to the geometric structure emphasized in OFT. Overall, existing OFT variants struggle to achieve both expressiveness and efficiency across multiple dimensions. These limitations motivate our PSOFT algorithm, which confines orthogonal transformations to the principal subspace with a theoretical guarantee of preserving essential semantic representations, followed by a relaxation of strict orthogonality at negligible cost to enhance adaptability. 3 Published as a conference paper at ICLR 2026 + +## 3 PRELIMINARIES + +In this section, we formalize LoRA and OFT variants in mathematical notation, providing a unified view of their parameterization strategies. Conventional full fine-tuning (FFT) updates the entire pre-trained weight matrix $W_{pre} \in \mathbb{R}^{d \times n}$ to obtain $W$, whereas PEFT methods freeze $W_{pre}$ and introduce only a small set of trainable parameters. For LoRA (Hu et al., 2022), the update is parameterized by a low-rank decomposition: + +$$h = W^{\top} x = (W_{pre} + AB)^{\top} x, \quad \text{s.t. } \mathrm{rank}(AB) = r, \tag{1}$$ + +where $A \in \mathbb{R}^{d \times r}$ and $B \in \mathbb{R}^{r \times n}$ are trainable matrices. Following standard practice, $A$ is initialized with Kaiming initialization (He et al., 2015) and $B$ with zeros, so training begins from $W_{pre}$. For OFT (Liu et al., 2021; Qiu et al., 2023), the update is parameterized by an orthogonal matrix $R$, which fine-tunes $W_{pre}$ in the full parameter space, i.e., $W_{fs\text{-}tuned} = R W_{pre}$. The forward pass is given by: + +$$h = W_{fs\text{-}tuned}^{\top} x = (R W_{pre})^{\top} x, \quad \text{s.t. } R^{\top} R = R R^{\top} = I_d, \tag{2}$$ + +where $R \in \mathbb{R}^{d \times d}$ is initialized as the identity matrix so that training begins from $W_{pre}$. 
By construction, orthogonal transformations in the full parameter space preserve both angles and norms, thereby maintaining the geometric structure of $W_{pre}$. To reduce parameter overhead, block-diagonal OFT (Qiu et al., 2023) constrains $R$ to a block-diagonal form $R = \mathrm{diag}(R_1, \cdots, R_i, \cdots, R_{d/r})$, where each $R_i \in O(d/r)$. Although efficient, this structure may introduce undesirable inductive bias. BOFT (Liu et al., 2024) and qGOFT (Ma et al., 2024) mitigate this by factorizing $R$ into sparse matrices, $R = \prod_{m=1}^{\log d} \tilde{R}_m$, with each $\tilde{R}_m \in \mathbb{R}^{d \times d}$ sparse. Assuming $d$ is a power of two, $\log d$ is integral, ensuring a valid factorization. This construction restores the expressiveness of dense rotations with reduced parameters. + +## 4 METHODOLOGY + +As discussed in Section 1, existing OFT variants such as BOFT and qGOFT still incur substantial computational and memory overhead. Prior studies (Li et al., 2018; Aghajanyan et al., 2021; Hu et al., 2022) further suggest that both pre-trained models and their task-specific adaptations lie in a low-rank intrinsic subspace. Motivated by this insight, we propose Orthogonal Fine-Tuning with Principal Subspace adaptation (PSOFT), which confines orthogonal transformations to the low-rank principal subspace of $W_{pre}$. The complete algorithm is given in Appendix A, and the remainder of this section details its design. + +4.1 DIMENSION-COMPATIBLE ORTHOGONAL TRANSFORMS + +Realizing orthogonal fine-tuning in the subspace requires a projection of high-dimensional weights onto a low-dimensional subspace, since directly applying the orthogonal matrix $R \in \mathbb{R}^{r \times r}$ to $W_{pre} \in \mathbb{R}^{d \times n}$ is infeasible due to dimensional incompatibility. To construct this projection, we perform Singular Value Decomposition (SVD), $W_{pre} = U \Sigma V^{\top}$, and decompose it into $W_{pri}$ and $W_{res}$, such that $W_{pre} = W_{pri} + W_{res}$. 
Here, the subscript “pri” denotes the principal component reconstructed from the top-$r$ singular values and vectors, while “res” denotes the residual component. The principal component $W_{\text{pri}}$ is then used to derive symmetric low-rank matrices $A$ and $B$ as: + +$$W_{\text{pri}} = \underbrace{U_{[:, :r]} \sqrt{\Sigma_{[:r, :r]}}}_{A \in \mathbb{R}^{d \times r}} \; \underbrace{\sqrt{\Sigma_{[:r, :r]}} \, V_{[:, :r]}^\top}_{B \in \mathbb{R}^{r \times n}} \in \mathbb{R}^{d \times n} \quad (\text{Symmetric}), \tag{3}$$ where $A$ projects weights into the $r$-dimensional principal subspace, while $B$ reconstructs them back. The residual component $W_{\text{res}}$ is then obtained from the remaining singular values and vectors: + +$$W_{\text{res}} = W_{\text{pre}} - W_{\text{pri}} = U_{[:, r:]} \, \Sigma_{[r:, r:]} \, V_{[:, r:]}^\top \in \mathbb{R}^{d \times n}. \tag{4}$$ Building on this, we regard $W_{\text{pri}} = AB$ as representing the initial principal subspace of $W_{\text{pre}}$. This subspace enables dimension-compatible orthogonal transformations, yielding $W_{\text{ps-tuned}} = ARB$, where the subscript “ps-tuned” denotes the fine-tuned weights in the principal subspace for PSOFT. 4Published as a conference paper at ICLR 2026 Sparse Orthogonal Fine-Tuning (OFT variants) + +··· ··· ··· + +dd + +> ··· Orthogonal Weights R + +dn + +> Preserve angles and norms in the full space Wfs-tuned = R · W pre BOFT, GOFT ... +> Col Vectors in W pre Col Vectors in W fs-tuned (OFT variants) +> Residual Weights W res Pre-trained Weights W pre +> Decomposition + +Orthogonal Fine-Tuning in the Principal Subspace (PSOFT) + +> Pre-trained Weights W pre AR +> r +> BPrincipal Weights W pri Wpri = A·B = A'·B' Preserve angles and norms in the principal subspace Wps-tuned = A'·diag( α)· R ·diag( β)· B' + +dnd dn n + +# ×× ×C1 C 1C 2C 3C2C 3C 1'C2'C3 ' + +> θ2 +> θ2 +> θ2 +> θ1 +> θ1C1C2C2C3C3C1 +> θ1CiCiCol Vectors in W pri +> Ci +> Col Vectors in W ps-tuned (initial of PSOFT) +> Ci +> Col Vectors in W ps-tuned (PSOFT) +> Ci'Higher computational cost Larger GPU memory footprint Smaller GPU memory footprint Lower computational cost + +Figure 2: Our proposed method: PSOFT. The left panel illustrates the principles of OFT variants.
On the right, PSOFT preserves the angles and norms of $W_{\text{pri}}$ (blue) in the fine-tuned $W_{\text{ps-tuned}}$ (orange), while allowing adjustable angles and scalable norms in the sector. Unlike LoRA (Hu et al., 2022) and PiSSA (Meng et al., 2024), which train both $A$ and $B$, PSOFT freezes them and fine-tunes only the orthogonal matrix $R$. LoRA produces updates $\Delta W = AB$ that span the low-rank manifold $\{\Delta W : \operatorname{rank}(\Delta W) \le r\}$ of dimension $r(d + n - r)$. In contrast, PSOFT generates updates $\Delta W = A(R - I)B$ parameterized solely by an orthogonal matrix $R \in O(r)$, where $O(r)$ denotes the $r(r-1)/2$-dimensional orthogonal group. Because the variability of $\Delta W$ arises only through $R$, all updates remain confined to the fixed row and column subspaces defined by $A$ and $B$. Consequently, LoRA and PSOFT operate on fundamentally different geometric families of updates (low-rank vs. orthogonal), and their expressiveness is therefore not directly comparable. The same structural distinction also determines different feasible ranks under an equal trainable-parameter budget $M$. LoRA trains two matrices, giving $M = (d + n)\, r_{\text{LoRA}}$ and thus $r_{\text{LoRA}} = M / (d + n)$, whereas PSOFT trains only an orthogonal matrix, yielding $M = r_{\text{PSOFT}}^2$ and hence $r_{\text{PSOFT}} = \sqrt{M}$. Since typically $\sqrt{M} \ll (d + n)$, we obtain $r_{\text{PSOFT}} \gg r_{\text{LoRA}}$, which explains why PSOFT empirically operates with much larger ranks under the same parameter budget. + +4.2 GUARANTEED GEOMETRY PRESERVATION IN THE PRINCIPAL SUBSPACE + +Orthogonal transformations within the constructed principal subspace in Section 4.1 merely ensure dimensional compatibility but do not strictly preserve subspace geometry. In particular, applying a low-dimensional orthogonal matrix $R$ to the subspace spanned by symmetric $A$ and $B$ in Eq. 3 may distort the pairwise angles and norms among the column vectors of $W_{\text{pri}}$.
To address this issue, we analyze the conditions under which orthogonal fine-tuning preserves the geometry of the principal subspace, and present an informal Theorem 4.1, with the formal theorem and proof in Appendix B. + +Theorem 4.1 (Informal: Angle and norm preservation in the principal subspace). Let $W_{\text{pri}} = AB$ denote the principal weights and $W_{\text{ps-tuned}} = ARB$ denote the fine-tuned weights. For $W_{\text{ps-tuned}}$ to preserve (i) pairwise angles between columns, and (ii) column norms of $W_{\text{pri}}$, the following condition must hold: + +$$R^\top A^\top A R = A^\top A. \tag{5}$$ 5Published as a conference paper at ICLR 2026 We provide an intuitive explanation of Theorem 4.1. The geometry of the principal subspace is determined by the relative angles and lengths of its column vectors, which are encoded in the Gram matrix $G = A^\top A$. Any $R$ satisfying $R^\top G R = G$ can be viewed as a symmetry of this geometry, similar to a rotation or reflection. In other words, if we first apply $R$ to the columns of $B$ and then project them using $A$, their angles and lengths in the high-dimensional space remain unchanged. In practice, normalizing $A$ so that $A^\top A = I_r$ simplifies the condition, in which case $R$ reduces to a standard orthogonal matrix. Accordingly, Eq. 3 is modified in PSOFT as: + +$$W_{\text{pri}} = \underbrace{U_{[:, :r]}}_{A' \in \mathbb{R}^{d \times r}} \; \underbrace{\Sigma_{[:r, :r]} \, V_{[:, :r]}^\top}_{B' \in \mathbb{R}^{r \times n}} \in \mathbb{R}^{d \times n} \quad (\text{Asymmetric}), \tag{6}$$ where asymmetric $A'$ and $B'$ are derived from the top-$r$ principal components of the SVD. The residual $W_{\text{res}}$ remains as in Eq. 4, and the forward computation becomes: + +$$h = (W_{\text{ps-tuned}} + W_{\text{res}})^\top x = (A' R B' + W_{\text{res}})^\top x, \tag{7}$$ where $A'$, $B'$, and $W_{\text{res}}$ are frozen, and only $R \in \mathbb{R}^{r \times r}$ is trainable, initialized as the identity matrix. To satisfy Eq. 5 during training, it is essential to maintain the orthogonality of $R$. Enforcing orthogonality of $R$ (e.g., via Gram-Schmidt orthogonalization) is computationally expensive.
To reduce this cost, following prior studies (Qiu et al., 2023; 2025), we adopt the Cayley parameterization (Cayley, 1894) to enforce the strict orthogonality of $R$, where $R = (I - Q)(I + Q)^{-1}$ and $Q = -Q^\top$ is a skew-symmetric matrix. Further details on the Cayley parameterization are provided in Appendix C. + +4.3 EFFICIENT RELAXATIONS OF ORTHOGONALITY + +Eqs. 6 and 7 guarantee geometry preservation in the principal subspace, but strict orthogonality constraints may hinder adaptation to task-specific drifts, leading to suboptimal performance. Empirical evidence shows that moderate relaxation improves results (Ma et al., 2024). Yet existing methods sacrifice efficiency: qGOFT relaxes constraints more flexibly but requires four times the parameters of GOFT (Ma et al., 2024), while BOFT relaxes them through additional scaling vectors on the output dimension, whose size grows linearly with model scale (Liu et al., 2024). To overcome these issues, we propose efficient relaxations of PSOFT that enhance adaptability with minimal overhead. Specifically, we introduce two tunable vectors that modulate the input and output norms around the orthogonal matrix, modifying Eq. 7 to yield the following forward computation: + +$$h = (A' \operatorname{diag}(\alpha)\, R \operatorname{diag}(\beta)\, B' + W_{\text{res}})^\top x \quad (\text{PSOFT}), \tag{8}$$ where $A'$, $B'$, and $W_{\text{res}}$ remain fixed, while only $R$ and the tunable vectors $\alpha$ and $\beta$ are trained. Both vectors are initialized as all-one vectors to ensure strict orthogonality at the start of training. As illustrated in Figure 2, PSOFT relaxes this constraint during training, enabling adjustable angles and scalable norms that adapt to task objectives. As these two additional vectors are inserted within the subspace, the overhead is limited to $2r$ parameters ($2r \ll n$, where $n$ is the output dimension), enhancing adaptability with minimal cost and without significantly affecting the geometric structure.
To avoid excessive deviation from orthogonality, an explicit constraint can be imposed: $\lVert C^\top C - I \rVert_F \le \epsilon$, where $C = \operatorname{diag}(\alpha)\, R \operatorname{diag}(\beta)$. Deviation arises when either $\operatorname{diag}(\alpha)$ or $\operatorname{diag}(\beta)$ deviates from a scalar multiple of the identity. In the special case where $\operatorname{diag}(\alpha) = \lambda_1 I$ and $\operatorname{diag}(\beta) = \lambda_2 I$, angular relationships are preserved, and magnitudes are uniformly scaled. In summary, PSOFT performs orthogonal fine-tuning to the low-rank principal subspace, enabling dimension-compatible transformations with theoretical guarantees on subspace geometry, while relaxing strict orthogonality at negligible cost to enhance adaptability. It requires only $r(r-1)/2 + 2r$ trainable parameters by combining the Cayley parameterization with two efficient tunable vectors. Moreover, it reduces both the number and size of additional matrices (from $\min(d, n)$ to $r$, with $r \ll \min(d, n)$), thereby yielding substantially lower activation memory than other OFT variants under the same batch size and sequence length. Detailed comparisons of parameter counts and activation memory analysis across different PEFT methods are provided in Appendices D and E. 6Published as a conference paper at ICLR 2026 + +## 5 EXPERIMENTS + +To evaluate PSOFT, we conduct experiments on 35 tasks spanning language and vision domains, using encoder-only models (DeBERTaV3-base (He et al., 2021), ViT-B/16 (Dosovitskiy et al., 2021)), and decoder-only models (LLaMA-3.2-3B (Meta AI, 2024), LLaMA-3.1-8B (Grattafiori et al., 2024)). These models are fine-tuned on downstream tasks, covering natural language understanding (Wang et al., 2019), visual classification (Zhai et al., 2019), mathematical QA (Yu et al., 2024), and commonsense reasoning (Hu et al., 2023). We evaluate key metrics such as parameter counts, peak memory usage, and accuracy in the main experiments, and assess training speed separately in the efficiency analysis.
Following OFTv2 (Qiu et al., 2025), we implement the Cayley parameterization by approximating $(I + Q)^{-1}$ with a truncated Neumann series, $\sum_{k=0}^{K} (-Q)^k$, using $K = 5$ terms in practice. All experiments are performed on a single GPU with FP32, using an NVIDIA RTX 4090 (24 GB) for encoder-only models and an NVIDIA H100-SXM (80 GB) for decoder-only models. + +Baselines. We employ state-of-the-art OFT variants with other advanced PEFT methods as baselines: • FFT (Howard & Ruder, 2018) updates all model weights during fine-tuning. • GOFTv2 & qGOFTv2 (Ma et al., 2024) replace full-space OFT with Givens rotations. The latest implementation uses Hadamard products instead of sparse multiplication. • BOFT (Liu et al., 2024) substitutes full-space OFT with butterfly factorization. • OFTv2 (Qiu et al., 2023; 2025) employs a block-diagonal structure for OFT, with the latest version adopting an input-centric computation and Cayley-Neumann parameterization. • LoRA (Hu et al., 2022) freezes pre-trained weights and adjusts only two low-rank matrices. • PiSSA (Meng et al., 2024) improves LoRA initialization to fine-tune principal weights. • DoRA (yang Liu et al., 2024) decomposes low-rank adaptation into direction and magnitude. • LoRA-XS (Bałazy et al., 2024) injects and tunes a single square matrix between LoRA’s matrices. Table 2: Experimental results of fine-tuned DeBERTaV3-base. Results are averaged over 5 random seeds. Memory (GB) denotes peak memory with sequence length 64. + +> Methods #Params Memory (GB) CoLA STS-B RTE MRPC SST2 QNLI Avg. +> FFT 184M 5.9 67.56 91.46 82.88 90.69 94.13 93.37 86.68 GOFTv2 0.08M 18.5 65.45 N/A. (OOM) qGOFTv2 0.33M 18.5 68.03 N/A.
(OOM) BOFT b=8 +> m=2 1.41M 6.3 68.85 91.09 83.60 88.40 95.28 93.78 86.83 OFTv2 b=32 1.29M 4.5 66.79 91.22 84.03 89.61 93.72 92.64 86.34 LoRA r=8 1.33M 4.5 67.98 91.60 84.87 90.20 95.28 93.89 87.30 PiSSA r=8 1.33M 4.5 66.50 91.40 83.77 89.90 93.17 92.72 86.24 DoRA r=8 1.41M 5.8 67.06 91.60 87.19 90.49 95.23 94.09 87.61 LoRA-XS r=136 1.33M 4.2 64.67 91.48 84.17 91.27 93.85 93.14 86.43 PSOFT r=46 0.08M 4.1 70.42 91.56 86.74 90.49 95.55 93.47 88.04 + +Encoder-only Models. We evaluate PSOFT by fine-tuning DeBERTaV3 (He et al., 2021) on several datasets from the GLUE benchmark (Wang et al., 2019). Following prior work (Wu et al., 2024a;b; Bini et al., 2025), we split the original validation set into new validation/test sets with a fixed seed, and report test accuracy from the best validation checkpoint to ensure rigorous evaluation. Details are in Appendix F. As shown in Table 2, GOFTv2 and qGOFTv2 have non-tunable parameters and often encounter OOM failures as the sequence length increases. PSOFT improves parameter and memory efficiency without compromising performance. Although GOFT and PSOFT have the same parameter counts, PSOFT reduces memory usage by about 80% and avoids OOM issues. It further achieves up to an 18× improvement in parameter efficiency over BOFT, OFTv2, and LoRA variants, attaining the best average performance across all baselines with the lowest memory footprint. Compared with LoRA variants that do not rely on weight decomposition, DoRA introduces additional memory overhead. For LoRA-XS, the update is constrained by the initialization of its low-rank matrices, which limits its expressiveness and consequently leads to degraded performance. These results highlight PSOFT’s ability to achieve both efficiency and performance. We also evaluate PSOFT by fine-tuning ViT-B/16 (Dosovitskiy et al., 2021) on the VTAB-1K benchmark (Zhai et al., 2019). Further details are provided in Appendix G. As shown in Table 3,
+ +PSOFT extends its efficiency-performance advantages on the small-scale model from language tasks to vision tasks. Beyond avoiding the heavy memory demands of GOFTv2 and qGOFTv2, PSOFT consistently reduces the memory overhead of BOFT and OFTv2. Compared to LoRA and its variants, 7Published as a conference paper at ICLR 2026 Table 3: Experimental results of fine-tuned ViT-B/16 on the VTAB-1K benchmark. Reported values (top-1 accuracy %) are the mean of 5 runs with different random seeds. + +> Methods #Params Mem (GB) Natural Specialized Structured Avg. Cifar100 Caltech101 DTD102 Flower102 Pets SVHN Sun397 Camelyon EuroSAT Resisc45 Retinopathy Clevr-Count Clevr-Dist DMLab KITTI-Dist dSpr-Loc dSpr-Ori sNORB-Azim sNORB-Ele +> FFT 85.9M 8.2 70.7 89.3 69.5 99.0 90.4 81.7 54.9 85.4 93.6 83.8 74.5 58.3 51.5 43.2 75.0 73.1 48.7 16.4 30.0 67.8 GOFTv2 0.08M OOM N/A. qGOFTv2 0.33M OOM N/A. BOFT b=8 +> m=2 1.41M 10.9 70.6 88.2 69.8 99.0 91.4 77.4 55.1 85.1 93.6 82.3 74.9 61.8 50.4 42.9 76.1 73.7 48.8 15.7 30.8 70.9 OFTv2 b=32 1.29M 7.7 68.5 88.9 67.5 98.4 89.5 86.9 53.6 86.0 94.1 84.2 74.6 58.7 56.4 46.7 78.5 81.1 48.1 17.3 32.5 72.1 LoRA r=8 1.33M 9.9 71.4 88.4 70.1 99.0 91.4 76.6 55.7 85.9 94.2 83.3 74.1 72.0 54.3 43.0 76.6 74.8 48.6 16.4 31.8 71.8 PiSSA r=8 1.33M 9.9 70.7 88.7 68.9 99.2 91.0 81.9 53.3 82.6 93.4 83.0 74.0 71.0 60.2 44.0 77.1 81.9 51.8 18.1 33.1 72.3 DoRA r=8 1.41M 17.8 70.7 89.0 69.8 98.9 91.0 81.7 55.5 85.7 94.2 83.5 74.8 67.3 54.2 45.1 77.4 82.0 48.5 16.9 31.5 72.3 LoRA-XS r=136 1.33M 6.6 68.5 89.4 68.4 98.7 90.9 84.5 54.1 84.0 94.3 80.8 73.6 60.0 57.7 45.8 79.6 80.6 48.1 17.4 30.8 71.6 PSOFT r=46 0.08M 6.2 71.9 89.6 70.3 99.1 91.8 86.9 55.9 84.6 94.2 82.4 75.2 71.2 59.9 45.7 79.6 80.9 52.9 20.0 32.9 73.4 + +it achieves the best average accuracy with about 94% fewer parameters and the lowest peak memory footprint. Interestingly, we also observe that parameter counts and memory overheads of different PEFT methods do not necessarily correlate. 
For example, the weight decomposition in DoRA introduces substantial memory overhead on the ViT-B/16 model compared with other LoRA variants, even when the number of trainable parameters is similar. This suggests that PEFT design should consider multi-dimensional efficiency beyond parameter efficiency alone. Table 4: Experimental results of fine-tuned LLaMA-3.2-3B on GSM-8K and MATH. + +Methods #Params Memory (GB) GSM-8K MATH + +FFT 3.21B 69.0 63.00 16.84 GOFTv2 0.75M OOM N/A. qGOFTv2 2.98M OOM N/A. BOFT b=2 + +> m=2 + +3.76M OOM N/A. OFTv2 b=32 11.6M 35.2 61.03 15.70 LoRA r=8 12.2M 32.2 60.80 15.76 PiSSA r=8 12.2M 32.2 61.26 14.96 DoRA r=8 12.9M 43.4 62.62 15.48 LoRA-XS r=248 12.1M 34.4 61.56 15.02 PSOFT r=352 12.2M 36.2 63.08 15.98 + +Decoder-only Models. Following prior work (Lingam et al., 2024; Liu et al., 2024), we fine-tune the LLaMA-3.2-3B (Meta AI, 2024) model on MetaMathQA-40K (Yu et al., 2024) and evaluate on GSM-8K (Cobbe et al., 2021) and MATH (Hendrycks et al., 2021). For large-scale models and complex tasks, where perfor-mance is more sensitive to parameter counts, we align trainable parameters by setting the LoRA rank to 8 to ensure a fair comparison. PEFT modules are applied to all linear layers, with ad-ditional hyperparameter details in Appendix H. As shown in Table 4, as models scale up, BOFT suffers from OOM failures like GOFTv2 and qGOFTv2, whereas PSOFT avoids this issue. + +PSOFT reduces the peak memory footprint of OFT variants to a level comparable with LoRA-like methods, while delivering superior performance under similar parameter counts. Against advanced PEFT methods, it outperforms LoRA (+2.28%) on GSM-8K and PiSSA (+1.02%) on MATH, while maintaining memory usage comparable to LoRA-like baselines. Compared to the sparsity-based OFTv2, PSOFT achieves higher performance at comparable cost. 
When scaling to large models and complex reasoning tasks, PSOFT adapts by employing a higher rank r to ensure sufficient expressiveness, yet still maintains efficiency and clear memory advantages over BOFT, GOFTv2, qGOFTv2, and DoRA. Although increasing the rank may enhance the expressiveness of LoRA-XS, its performance remains fundamentally constrained by the initialization: the inserted square matrix is trainable only as a linear combination within the original low-rank subspace. Even under restricted module insertion and tighter parameter budgets, PSOFT still reduces memory overhead relative to qGOFTv2 and BOFT (Table 13 in Appendix H), demonstrating strong scalability to large models and complex mathematical tasks. Following prior work (Hu et al., 2023; Lingam et al., 2024; yang Liu et al., 2024), we further fine-tune LLaMA-3.1-8B (Grattafiori et al., 2024) on the Commonsense-15K dataset (Hu et al., 2023) and evaluate it on eight commonsense reasoning benchmarks. PEFT modules are applied to the + +Q, K, V, U, D linear layers. Appendix I details the hyperparameter settings. As shown in Table 5, + +PSOFT mitigates the frequent OOM issues of OFT on larger models while achieving the best average performance. In practice, GOFTv2, qGOFTv2, and BOFT suffer from OOM failures even without 8Published as a conference paper at ICLR 2026 Table 5: Experimental results of fine-tuned LLaMA-3.1-8B on commonsense reasoning benchmarks. + +> Methods #Params Memory (GB) BoolQ PIQA SIQA HS WG ARC-e ARC-c OBQA Avg. +> FFT 8.03B OOM N/A. GOFTv2 0.98M OOM N/A. qGOFTv2 3.93M OOM N/A. BOFT b=2 +> m=2 4.72M OOM N/A. 
OFTv2 b=32 14.3M 55.5 70.83 84.44 73.34 90.63 74.11 90.87 80.12 81.80 80.77 LoRA r=8 14.2M 54.1 73.18 85.31 74.36 86.57 74.19 90.95 80.29 84.00 81.11 PiSSA r=8 14.2M 54.1 71.22 86.02 75.38 90.27 74.19 89.90 79.44 84.00 81.30 DoRA r=8 14.9M 65.6 73.09 85.96 75.08 90.48 75.53 90.74 81.40 84.40 82.09 LoRA-XS r=298 14.2M 56.2 72.35 86.51 75.18 91.73 74.98 90.74 79.52 84.00 81.88 PSOFT r=424 14.5M 58.4 72.17 86.51 75.79 91.28 75.61 91.46 81.48 86.00 82.54 + +inserting modules into all linear layers, severely limiting their use in large-scale fine-tuning, whereas PSOFT provides a more memory-friendly alternative. Under comparable costs, it surpasses OFTv2 by 1.77% in average accuracy, matches the memory efficiency of LoRA-like baselines while delivering higher accuracy, and reduces memory usage by about 7 GB relative to DoRA. As the model size increases, DoRA attains performance that is surpassed only by PSOFT, but its memory overhead becomes noticeably higher than that of other LoRA variants. PSOFT further remains effective under reduced parameter budgets and restricted module insertion (Table 15 in Appendix I), underscoring its practicality in balancing efficiency and performance across diverse settings. Table 6: Effect of orthogonality of R on LLaMA-3.2-3B. + +Methods #Params GSM-8K MATH + +PiSSA+LoRA-XS r=248 (γ=0.0) 12.1M 61.26 14.72 PiSSA+LoRA-XS r=248 (γ=0.01) 12.1M 61.26 14.80 PiSSA+LoRA-XS r=248 (γ=0.1) 12.1M 59.89 14.90 PiSSA+LoRA-XS r=248 (γ=1.0) 12.1M 59.36 14.44 PSOFT r=248 (strict orthogonality) 6.0M 61.18 14.80 PSOFT r=352 (strict orthogonality) 12.1M 62.77 15.74 none only only and + +> PSOFT with tunable vectors +> 50.0 +> 50.5 +> 51.0 +> 51.5 +> 52.0 +> GSM-8K Accuracy (%) +> 50.19 +> 50.72 50.57 +> 51.63 + +Figure 3: Effect of tunable vectors. + +Ablation Studies. 
To study the effect of orthogonality of R, we follow AdaLoRA (Zhang et al., 2023) and add an orthogonality regularizer Lorth = ∥R⊤R − I∥F , resulting in the objective L = L + γL orth + +with weight γ. Setting γ = 0 recovers PiSSA+LoRA-XS with unconstrained R. As shown in Table 6, this regularization avoids Cayley inversion but demands careful tuning. Under equal rank, PSOFT with strict orthogonality matches the unconstrained variant with half the parameters, and achieves clear gains once parameter counts are aligned. Therefore, Cayley parametrization in PSOFT not only enforces orthogonality but also exploits its skew-symmetric structure to improve parameter efficiency. To study the effect of tunable vectors α and β, we fine-tune LLaMA-3.2-3B with rank 64, inserting PSOFT into all linear layers and evaluating on GSM-8K and MATH. As shown in Figure 3, enabling both vectors achieves the best performance, while single-sided insertion provides smaller gains. This suggests that tuning only one side lacks sufficient capacity to capture task-specific variations. Table 7: Effect of initialization. + +Methods RTE CoLA + +Aorth Rorth B 85.92 70.63 + +AR orth Borth 52.71 67.97 + +AR orth B 71.11 69.23 + +To study the effect of initialization, we compare three vari-ants: Aorth Rorth B, AR orth Borth , and AR orth B, where A + +and B follow PiSSA (Meng et al., 2024) and Aorth , Borth + +use orthogonal initialization with rank 64. As shown in Ta-ble 7, Aorth Rorth B yields the best results, outperforming PiSSA without constraining A and B, whereas enforcing orthogonality on B reduces model expressiveness. + +Memory and Computational Efficiency. We evaluate memory usage among different batch sizes by fine-tuning ViT-B/16 on VTAB-1K with PEFT modules in all linear layers. 
As shown in Figure 4a, PSOFT consistently requires less memory than advanced OFT variants across batch sizes, maintaining a peak footprint below 4 GB even at batch size 32, which highlights its suitability for resource-constrained settings. Further detailed memory analysis and experiments are provided in Appendix M. We also evaluate the computational cost under the same experimental settings on a single H100 GPU as in Tables 4 and 5. As shown in Figure 4b, on LLaMA-3.2-3B, PSOFT ( Q,K,V ) trains in 57 minutes, 9Published as a conference paper at ICLR 2026 1 4 8 16 32 64 128 + +> Batch Size +> 4 +> 8 +> 16 +> 24 +> Peak GPU Memory (GB) +> OOM OOM OOM OOM +> BOFT b= 8 +> m= 2 +> BOFT b= 4 +> m= 4 +> GOFT-V2 +> qGOFT-V2 +> PSOFT r= 46 (ours) + +(a) qGOFTv2 + +> GOFTv2 +> BOFT +> PSOFT +> 0 +> 1 +> 2 +> 3 +> 4 +> Training Speed +> 1.0× 1.0× +> 1.7× +> 3.5× +> LLaMA-3.2-3B (Q,K,V) +> qGOFTv2 +> GOFTv2 +> BOFT +> DoRA +> PSOFT +> LoRA +> 0 +> 1 +> 2 +> 1.0× +> 1.3× +> 2.0× +> (OOM) +> N/A. +> LLaMA-3.2-3B (Q,K,V,U,D,O,G) +> qGOFTv2 +> GOFTv2 +> BOFT +> PSOFT +> 0 +> 1 +> 2 +> 3 +> 4 +> Training Speed +> 1.0× +> 3.2× +> (OOM) +> N/A. +> LLaMA-3.1-8B (Q,V) +> qGOFTv2 +> GOFTv2 +> BOFT +> DoRA +> PSOFT +> LoRA +> 0 +> 1 +> 2 +> 1.0× +> 1.7× +> 2.1× +> (OOM) +> N/A. +> LLaMA-3.1-8B (Q,K,V,U,D) + +(b) + +Figure 4: (a) Memory usage across batch sizes. (b) Training speed across different models. yielding 3.5× and 2.1× speedups over GOFTv2/qGOFTv2 and BOFT, respectively, while its full configuration ( Q,K,V,U,D,O,G ) requires 1 hour 31 minutes and achieves a 1.3× speedup over DoRA. On LLaMA-3.1-8B, PSOFT ( Q,V ) completes training in 29 minutes with a 3.2× speedup over BOFT, and PSOFT ( Q,K,V,U,D ) finishes in 53 minutes, running 1.7× faster than DoRA. Compared with other PEFT methods, its computational efficiency falls between that of DoRA and LoRA. 
+ +## 6 DISCUSSION ON SCALING TO LARGER MODELS + +Due to hardware resource constraints, our empirical evaluation is limited to models of up to 8B parameters. Nevertheless, we further discuss the potential limitations and stability considerations when extending PSOFT to larger-scale models. From a methodological perspective, PSOFT scales favorably as model size increases. Because the orthogonal transformation operates in an r-dimensional principal subspace rather than the full d-dimensional weight space, both computational and activation-memory costs grow with the controllable rank r instead of the expanding dimension d required by many PEFT methods (a detailed analysis is provided in Appendix E). As shown in Appendix J (Tables 17 and 18), memory usage and training time remain stable as r increases. The subspace-based update also avoids the long chains of full-dimensional multiplications used in GOFT and BOFT, which become increasingly expensive at larger scales. Moreover, the number of trainable parameters in PSOFT is decoupled from the hidden dimension, enabling fine-grained parameter control and preventing the minimum parameter budget from being tied to layer width. Collectively, these properties indicate that PSOFT can extend effectively to larger architectures while maintaining stable optimization behavior. However, when applying PSOFT to models larger than 8B, several practical factors may need to be considered. Large models often exhibit higher sensitivity to hyperparameters, including learning-rate settings for structured updates such as orthogonal transformations. While PSOFT does not rely on full-dimensional orthogonal matrices, stable training at very large scales may still require careful hyperparameter tuning. 
Moreover, although the activation-memory growth of PSOFT is slower than that of some OFT approaches, the activations of the underlying backbone (e.g., attention and feed-forward layers) can become the dominant source of memory usage at large scales, which may constrain the choice of batch size or sequence length. Finally, as shown in the main experiments and in the additional rank-sensitivity analyses in Appendix J, larger models tend to benefit from higher ranks to capture task-specific variations. Very small ranks may lead to underfitting on complex tasks, whereas larger ranks improve expressiveness but also increase the trainable parameter budget. + +## 7 CONCLUSION + +In this work, we have proposed PSOFT, a novel PEFT framework that confines OFT to the principal subspace with theoretical guarantees, while enhancing practical adaptability through two tunable scaling vectors. Extensive experiments demonstrate that PSOFT introduces a low-rank perspective that resolves the tension between expressiveness and multi-dimensional efficiency in OFT, bridges the gap between orthogonal fine-tuning and low-rank adaptation within the broader PEFT landscape, and offers a solution with superior scalability and practicality for adapting future foundation models. 10 Published as a conference paper at ICLR 2026 + +## REPRODUCIBILITY STATEMENT + +We are committed to ensuring the reproducibility of our work and have taken the following steps. For the proposed method, we provide source code at https://github.com/fei407/PSOFT . For theoretical results, we include formal statements and complete mathematical proofs in Appendix B. For datasets and experimental settings, we offer detailed descriptions and full hyperparameter configurations in Appendices F, G, H, and I. + +## ACKNOWLEDGMENTS + +This work was supported in part by UK Research and Innovation (UKRI) Grant No. EP/X038866/1 and Horizon Europe Grant No. 101086159. 
+ +## REFERENCES + +Armen Aghajanyan, Sonal Gupta, and Luke Zettlemoyer. Intrinsic dimensionality explains the effectiveness of language model fine-tuning. In Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: long papers) , pp. 7319–7328, 2021. Seyedarmin Azizi, Souvik Kundu, and Massoud Pedram. LaMDA: Large model fine-tuning via spec-trally decomposed low-dimensional adaptation. In Findings of the Association for Computational Linguistics: EMNLP 2024 , pp. 9635–9646, 2024. Klaudia Bałazy, Mohammadreza Banaei, Karl Aberer, and Jacek Tabor. Lora-xs: Low-rank adaptation with extremely small number of parameters. arXiv preprint arXiv:2405.17604 , 2024. Massimo Bini, Leander Girrbach, and Zeynep Akata. Decoupling angles and strength in low-rank adaptation. In The Thirteenth International Conference on Learning Representations , 2025. Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. Piqa: Reasoning about physical commonsense in natural language. In Proceedings of the AAAI conference on artificial intelligence ,volume 34, pp. 7432–7439, 2020. Arthur Cayley. The collected mathematical papers of Arthur Cayley , volume 7. University of Michigan Library, 1894. Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova. BoolQ: Exploring the surprising difficulty of natural yes/no questions. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) , pp. 2924–2936, 2019. Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. Think you have solved question answering? try arc, the ai2 reasoning challenge. + +arXiv preprint arXiv:1803.05457 , 2018. 
Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 , 2021. Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16x16 words: Transformers for image recognition at scale. In The Ninth International Conference on Learning Representations , 2021. Ziqi Gao, Qichao Wang, Aochuan Chen, Zijing Liu, Bingzhe Wu, Liang Chen, and Jia Li. Parameter-efficient fine-tuning with discrete fourier transform. In Forty-first International Conference on Machine Learning , 2024. 11 Published as a conference paper at ICLR 2026 Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex Vaughan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 , 2024. Nathan Halko, Per-Gunnar Martinsson, and Joel A Tropp. Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions. SIAM review , 53(2): 217–288, 2011. Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In Proceedings of the IEEE international conference on computer vision , pp. 1026–1034, 2015. Pengcheng He, Jianfeng Gao, and Weizhu Chen. Debertav3: Improving deberta using electra-style pre-training with gradient-disentangled embedding sharing. arXiv preprint arXiv:2111.09543 ,2021. Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the MATH dataset. 
In + +Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) , 2021. Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. Parameter-efficient transfer learning for nlp. In International conference on machine learning , pp. 2790–2799. PMLR, 2019. Jeremy Howard and Sebastian Ruder. Universal language model fine-tuning for text classification. In + +Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pp. 328–339, 2018. Edward J Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. LoRA: Low-rank adaptation of large language models. In International Conference on Learning Representations , 2022. Zhiqiang Hu, Lei Wang, Yihuai Lan, Wanyu Xu, Ee-Peng Lim, Lidong Bing, Xing Xu, Soujanya Poria, and Roy Lee. LLM-adapters: An adapter family for parameter-efficient fine-tuning of large language models. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing , pp. 5254–5276, 2023. Dawid Jan Kopiczko, Tijmen Blankevoort, and Yuki M Asano. VeRA: Vector-based random matrix adaptation. In The Twelfth International Conference on Learning Representations , 2024. Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large transformer models. Proceedings of Machine Learning and Systems , 5:341–353, 2023. Brian Lester, Rami Al-Rfou, and Noah Constant. The power of scale for parameter-efficient prompt tuning. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing , pp. 3045–3059, 2021. Chunyuan Li, Heerad Farkhoor, Rosanne Liu, and Jason Yosinski. Measuring the intrinsic dimension of objective landscapes. 
In The Sixth International Conference on Learning Representations, 2018. Xiang Lisa Li and Percy Liang. Prefix-tuning: Optimizing continuous prompts for generation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4582–4597, 2021. Vijay Chandra Lingam, Atula Neerkaje, Aditya Vavre, Aneesh Shetty, Gautham Krishna Gudur, Joydeep Ghosh, Eunsol Choi, Alex Dimakis, Aleksandar Bojchevski, and Sujay Sanghavi. Svft: Parameter-efficient fine-tuning with singular vectors. Advances in Neural Information Processing Systems, 37:41425–41446, 2024. Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. Visual instruction tuning. Advances in neural information processing systems, 36:34892–34916, 2023. 12 Published as a conference paper at ICLR 2026 Weiyang Liu, Rongmei Lin, Zhen Liu, James M Rehg, Liam Paull, Li Xiong, Le Song, and Adrian Weller. Orthogonal over-parameterized training. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp. 7251–7260, 2021. Weiyang Liu, Zeju Qiu, Yao Feng, Yuliang Xiu, Yuxuan Xue, Longhui Yu, Haiwen Feng, Zhen Liu, Juyeon Heo, Songyou Peng, Yandong Wen, Michael J. Black, Adrian Weller, and Bernhard Schölkopf. Parameter-efficient orthogonal finetuning via butterfly factorization. In The Twelfth International Conference on Learning Representations, 2024. Xiao Liu, Kaixuan Ji, Yicheng Fu, Weng Tam, Zhengxiao Du, Zhilin Yang, and Jie Tang. P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 61–68, 2022. Xinyu Ma, Xu Chu, Zhibang Yang, Yang Lin, Xin Gao, and Junfeng Zhao. Parameter efficient quasi-orthogonal fine-tuning via givens rotation. In Forty-first International Conference on Machine Learning, 2024.
Sourab Mangrulkar, Sylvain Gugger, Lysandre Debut, Younes Belkada, Sayak Paul, and Benjamin Bossan. PEFT: State-of-the-art parameter-efficient fine-tuning methods. https://github.com/huggingface/peft, 2022. Accessed: March, 2025. Fanxu Meng, Zhaohui Wang, and Muhan Zhang. Pissa: Principal singular values and singular vectors adaptation of large language models. Advances in Neural Information Processing Systems, 37:121038–121072, 2024. Meta AI. Llama 3.2: Revolutionizing edge AI and vision with open, customizable models. https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/, 2024. Accessed: March, 2025. Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. Can a suit of armor conduct electricity? a new dataset for open book question answering. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2381–2391, 2018. Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019. Jonas Pfeiffer, Aishwarya Kamath, Andreas Rücklé, Kyunghyun Cho, and Iryna Gurevych. AdapterFusion: Non-destructive task composition for transfer learning. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 487–503, 2021. Chengwei Qin, Aston Zhang, Zhuosheng Zhang, Jiaao Chen, Michihiro Yasunaga, and Diyi Yang. Is chatGPT a general-purpose natural language processing task solver? In The 2023 Conference on Empirical Methods in Natural Language Processing, 2023. Zeju Qiu, Weiyang Liu, Haiwen Feng, Yuxuan Xue, Yao Feng, Zhen Liu, Dan Zhang, Adrian Weller, and Bernhard Schölkopf. Controlling text-to-image diffusion by orthogonal finetuning. Advances in Neural Information Processing Systems, 36:79320–79362, 2023.
Zeju Qiu, Weiyang Liu, Adrian Weller, and Bernhard Schölkopf. Orthogonal finetuning made scalable. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pp. 31946–31963, 2025. Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. Winogrande: An adversarial winograd schema challenge at scale. Communications of the ACM, 64(9):99–106, 2021. 13 Published as a conference paper at ICLR 2026 Maarten Sap, Hannah Rashkin, Derek Chen, Ronan Le Bras, and Yejin Choi. Social IQa: Common-sense reasoning about social interactions. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 4463–4473, 2019. Weixi Song, Zuchao Li, Lefei Zhang, hai zhao, and Bo Du. Sparse is enough in fine-tuning pre-trained large language models. In Forty-first International Conference on Machine Learning, 2024. Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. GLUE: A multi-task benchmark and analysis platform for natural language understanding. In International Conference on Learning Representations, 2019. Xiao Wang, Tianze Chen, Qiming Ge, Han Xia, Rong Bao, Rui Zheng, Qi Zhang, Tao Gui, and Xuanjing Huang. Orthogonal subspace learning for language model continual learning. In Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 10658–10671, 2023. Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, Ed H. Chi, Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus. Emergent abilities of large language models. Transactions on Machine Learning Research, 2022. Muling Wu, Wenhao Liu, Xiaohua Wang, Tianlong Li, Changze Lv, Zixuan Ling, Zhu JianHao, Cenyuan Zhang, Xiaoqing Zheng, and Xuanjing Huang.
Advancing parameter efficiency in fine-tuning via representation editing. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 13445–13464, 2024a. Zhengxuan Wu, Aryaman Arora, Zheng Wang, Atticus Geiger, Dan Jurafsky, Christopher D Manning, and Christopher Potts. Reft: Representation finetuning for language models. Advances in Neural Information Processing Systems, 37:63908–63962, 2024b. Jing Xu and Jingzhao Zhang. Random masking finds winning tickets for parameter efficient fine-tuning. In Forty-first International Conference on Machine Learning, 2024. Shih yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, and Min-Hung Chen. DoRA: Weight-decomposed low-rank adaptation. In Forty-first International Conference on Machine Learning, 2024. Longhui Yu, Weisen Jiang, Han Shi, Jincheng YU, Zhengying Liu, Yu Zhang, James Kwok, Zhenguo Li, Adrian Weller, and Weiyang Liu. Metamath: Bootstrap your own mathematical questions for large language models. In The Twelfth International Conference on Learning Representations, 2024. Elad Ben Zaken, Yoav Goldberg, and Shauli Ravfogel. Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 1–9, 2022. Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. HellaSwag: Can a machine really finish your sentence? In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 4791–4800, 2019. Xiaohua Zhai, Joan Puigcerver, Alexander Kolesnikov, Pierre Ruyssen, Carlos Riquelme, Mario Lucic, Josip Djolonga, Andre Susano Pinto, Maxim Neumann, Alexey Dosovitskiy, et al. A large-scale study of representation learning with the visual task adaptation benchmark. arXiv preprint arXiv:1910.04867, 2019.
Fangzhao Zhang and Mert Pilanci. Spectral adapter: Fine-tuning in spectral space. In The Thirty-eighth Annual Conference on Neural Information Processing Systems, 2024. Qingru Zhang, Minshuo Chen, Alexander Bukharin, Pengcheng He, Yu Cheng, Weizhu Chen, and Tuo Zhao. Adaptive budget allocation for parameter-efficient fine-tuning. In The Eleventh International Conference on Learning Representations, 2023. 14 Published as a conference paper at ICLR 2026 + +## ORGANIZATION OF THE APPENDIX + +The appendix is organized as follows: • Appendix A introduces the algorithm of the proposed PSOFT. • Appendix B provides the theoretical proof for the column-wise angle and norm preservation theorem. • Appendix C presents theoretical details of the Cayley parameterization. • Appendix D compares the number of trainable parameters across popular PEFT methods. • Appendix E analyzes activation memory statistics for different PEFT methods. • Appendix F outlines experimental details for natural language understanding on GLUE. • Appendix G covers experimental details for visual classification on VTAB-1K. • Appendix H reports experimental details for mathematical question answering on MetaMathQA-40K. • Appendix I describes experimental details for commonsense reasoning on Commonsense-15K. • Appendix J details extended experiments on the effects of SVD initialization, different rank settings, inserted modules, and Neumann terms. • Appendix K illustrates the angular structure of the weight changes before and after fine-tuning. • Appendix L analyzes the difference between PSOFT and full-space OFT in terms of their optimization dynamics and training loss trajectories. • Appendix M provides the additional memory usage experiments covering a single linear layer, a Transformer block, and end-to-end models. • Appendix N explains the use of large language models in this paper.
+ +## A ALGORITHM OF THE PROPOSED PSOFT + +For completeness, we provide a detailed description of the proposed PSOFT framework, which corresponds to Algorithm 1. For initialization, the orthogonal matrix $R$ is set to the identity matrix $I_r$, while PSOFT further introduces two additional vectors, $\alpha$ and $\beta$, both initialized as all ones. Before training begins, a singular value decomposition (SVD) is performed once to extract the top-$r$ singular values and vectors, which are then used to construct the matrices $A'$, $B'$, and the residual weights $W_{\text{res}}$. During training, the forward computation follows Eq. 8, and $R$ and the vectors $\alpha$ and $\beta$ are updated jointly using their gradients to obtain the final weights $W_{\text{final}}$. + +Algorithm 1 PSOFT: orthogonal fine-tuning in the principal subspace + +1: Input: Pre-trained weight matrix $W_{\text{pre}} \in \mathbb{R}^{d \times n}$, rank $r$, input $x$, and number of epochs $E$ + +2: Output: Fine-tuned orthogonal matrix $R$, two vectors $\alpha$ and $\beta$, and final weight matrix $W_{\text{final}}$ + +3: Initialize: Orthogonal matrix: $R \leftarrow I_r$, two vectors: $\alpha \leftarrow \mathbf{1}_r$, $\beta \leftarrow \mathbf{1}_r$ + +4: Pre-compute: + +5: $W_{\text{pre}} = U S V^\top$, $A' \leftarrow U_{[:,:r]}$, $B' \leftarrow S_{[:r,:r]} V_{[:,:r]}^\top$, $W_{\text{res}} \leftarrow U_{[:,r:]} S_{[r:,r:]} V_{[:,r:]}^\top$ + +6: for epoch = 1 to $E$ do + +7: for each mini-batch $x$ do + +8: $h = \left(A' \,\mathrm{diag}(\alpha)\, R \,\mathrm{diag}(\beta)\, B' + W_{\text{res}}\right)^\top x$ + +9: compute $\frac{\partial L}{\partial R}$, $\frac{\partial L}{\partial \alpha}$, $\frac{\partial L}{\partial \beta}$, then update $R \leftarrow R - \eta \cdot \frac{\partial L}{\partial R}$, $\alpha \leftarrow \alpha - \eta \cdot \frac{\partial L}{\partial \alpha}$, $\beta \leftarrow \beta - \eta \cdot \frac{\partial L}{\partial \beta}$ + +10: end for + +11: end for + +12: Reconstruct: $W_{\text{final}} \leftarrow A' \,\mathrm{diag}(\alpha)\, R \,\mathrm{diag}(\beta)\, B' + W_{\text{res}}$ + +## B PROOF FOR THE ANGLE AND NORM PRESERVATION THEOREM + +Theorem B.1 (Formal: Column-wise angle and norm preservation in the low-rank subspace). Let $W_{\text{pri}} = AB \in \mathbb{R}^{d \times n}$ and $W_{\text{ps-tuned}} = ARB \in \mathbb{R}^{d \times n}$, with $A \in \mathbb{R}^{d \times r}$, $B \in \mathbb{R}^{r \times n}$. Assume + +15 Published as a conference paper at ICLR 2026 + +$\mathrm{rank}(A) = \mathrm{rank}(B) = r$ and every column $b_i \neq 0$ (so all angles are well-defined).
Let $G := A^\top A$ (which is symmetric positive definite since $A$ has full column rank), $w_i^{\text{pri}} := A b_i$, $w_i^{\text{ps-tuned}} := A R b_i$, and denote by $\theta_{ij}^{\text{pri}}$ (resp. $\theta_{ij}^{\text{ps-tuned}}$) the angle between $w_i^{\text{pri}}, w_j^{\text{pri}}$ (resp. $w_i^{\text{ps-tuned}}, w_j^{\text{ps-tuned}}$). Then + +$$R^\top G R = G \iff \forall i \neq j,\ \theta_{ij}^{\text{ps-tuned}} = \theta_{ij}^{\text{pri}} \ \text{ and } \ \forall i,\ \|w_i^{\text{ps-tuned}}\| = \|w_i^{\text{pri}}\|. \tag{9}$$ + +Proof. For any pair of column indices $i \neq j$, the cosines of the angles between the vectors in principal weights ($w_i^{\text{pri}}, w_j^{\text{pri}}$) and the vectors in fine-tuned weights ($w_i^{\text{ps-tuned}}, w_j^{\text{ps-tuned}}$) are + +$$\cos\theta_{ij}^{\text{pri}} = \frac{b_i^\top G b_j}{\sqrt{b_i^\top G b_i}\,\sqrt{b_j^\top G b_j}}, \qquad \cos\theta_{ij}^{\text{ps-tuned}} = \frac{b_i^\top R^\top G R\, b_j}{\sqrt{b_i^\top R^\top G R\, b_i}\,\sqrt{b_j^\top R^\top G R\, b_j}}.$$ + +Moreover, for any $i$, + +$$\|w_i^{\text{pri}}\|^2 = b_i^\top G b_i, \qquad \|w_i^{\text{ps-tuned}}\|^2 = b_i^\top R^\top G R\, b_i.$$ + +Sufficiency. If $R^\top G R = G$, then the two cosine expressions coincide for every $i \neq j$, hence $\cos\theta_{ij}^{\text{ps-tuned}} = \cos\theta_{ij}^{\text{pri}}$. Since all angles lie in $[0, \pi]$ where the cosine is strictly decreasing, we obtain $\theta_{ij}^{\text{ps-tuned}} = \theta_{ij}^{\text{pri}}$. Similarly, $\|w_i^{\text{ps-tuned}}\|^2 = b_i^\top G b_i = \|w_i^{\text{pri}}\|^2$, so $\|w_i^{\text{ps-tuned}}\| = \|w_i^{\text{pri}}\|$. + +Necessity. Conversely, assume that $\theta_{ij}^{\text{ps-tuned}} = \theta_{ij}^{\text{pri}}$ for all $i \neq j$ and $\|w_i^{\text{ps-tuned}}\| = \|w_i^{\text{pri}}\|$ for all $i$. Define $M := R^\top G R - G$. From norm preservation we obtain + +$$b_i^\top M b_i = 0, \quad \forall i.$$ + +Since $b_i \neq 0$ and $G \succ 0$, both denominators in the cosine formulas are equal and positive; hence angle preservation implies + +$$b_i^\top M b_j = 0, \quad \forall i \neq j.$$ + +Thus $B^\top M B = 0$ with $\mathrm{rank}(B) = r$. Because $B$ has full row rank, it admits a right inverse $C \in \mathbb{R}^{n \times r}$ (e.g., $C = B^\top (B B^\top)^{-1}$) such that $BC = I_r$. Multiplying gives + +$$M = C^\top (B^\top M B)\, C = 0,$$ + +hence $R^\top G R = G$.
+ +## C CAYLEY PARAMETERIZATION + +The Cayley parameterization (Cayley, 1894) is a mapping that converts real skew-symmetric matrices into orthogonal matrices. For a real skew-symmetric matrix $Q$ (i.e., $Q^\top = -Q$), the Cayley transform is defined as: + +$$R = (I - Q)(I + Q)^{-1},$$ + +where $I$ is the identity matrix of the same size as $Q$ and matrix $R$ does not have $-1$ as an eigenvalue. The Cayley transform provides a way to parameterize orthogonal matrices near the identity matrix using skew-symmetric matrices. The orthogonality of the Cayley transform is proved as follows. + +Theorem C.1. If $Q$ is a real skew-symmetric matrix and $(I + Q)$ is invertible, then the Cayley transform $R = (I - Q)(I + Q)^{-1}$ is an orthogonal matrix. Proof. We aim to prove that the matrix $R$ after the Cayley transform satisfies $R^\top R = R R^\top = I$. To compute $R^\top R$: + +$$R^\top R = \left((I - Q)(I + Q)^{-1}\right)^\top (I - Q)(I + Q)^{-1}$$ + +$$= \left((I + Q)^{-1}\right)^\top (I - Q)^\top (I - Q)(I + Q)^{-1}$$ + +$$= \left((I + Q)^\top\right)^{-1} (I - Q)^\top (I - Q)(I + Q)^{-1}$$ + +16 Published as a conference paper at ICLR 2026 By the definition of skew-symmetry, $Q^\top = -Q$, + +$$= (I - Q)^{-1}(I + Q)(I - Q)(I + Q)^{-1}$$ + +Since $(I + Q)$ and $(I - Q)$ commute, we can switch the order of the factors: + +$$= (I - Q)^{-1}(I - Q)(I + Q)(I + Q)^{-1}$$ + +$$= I$$ + +Similarly, it can be proven that $R R^\top = I$. Therefore, the result of the Cayley transform $R = (I - Q)(I + Q)^{-1}$ is an orthogonal matrix. In this paper, PSOFT leverages the Cayley parameterization to construct orthogonal matrices with approximately half the number of trainable parameters compared to a full orthogonal matrix, while rigorously preserving orthogonality. + +## D COMPARISON OF TRAINABLE PARAMETERS FOR PEFT METHODS + +Table 8 reports the number of trainable parameters across representative PEFT methods. Most existing approaches scale their parameter counts with hidden layer dimensions, which constrains their applicability to larger models. In contrast, PSOFT and LoRA-XS decouple the number of trainable parameters from layer width.
PSOFT further reduces parameter complexity through the Cayley parameterization, which requires only $r(r-1)/2$ parameters to represent an orthogonal matrix. Consequently, the total number of trainable parameters in PSOFT remains fixed for a given rank $r$, allowing fine-grained control over parameter budgets. Moreover, PSOFT introduces two learnable scaling vectors within the subspace, contributing merely $2r$ additional parameters, which is negligible compared with other methods. Table 8: Comparison of trainable parameters for different PEFT methods within a single linear layer, assuming input/output dimensions $d$ and $n$, respectively. Here, $r$ denotes the low-rank dimension, $m$ the number of butterfly factors in BOFT, $b$ the block size in BOFT, $d_{\min} = \min(d, n)$, and $k$ the number of additional off-diagonals in SVFT. All statistics are based on implementations from HuggingFace’s PEFT library (Mangrulkar et al., 2022). + +Method Number of Trainable Parameters + +LoRA $d \times r + r \times n$ + +DoRA $d \times r + r \times n + n$ + +VeRA $r + n$ + +OFT $r \times (d/r) \times (d/r) + n$ + +BOFT $m \times (d/b) \times b^2 + n$ + +SVFT $d_{\min} \times k + (d_{\min} - k)(k + 1)$ + +LoRA-XS $r \times r$ + +PSOFT (Ours) $r(r-1)/2 + 2r$ + +## E THE ACTIVATION MEMORY STATISTICS ACROSS DIFFERENT PEFT METHODS + +In this section, we analyze the activation memory requirements of various PEFT methods during fine-tuning. In transformer-based networks, memory usage primarily arises from three sources: pre-trained weight storage, activation storage, and gradient/optimizer state storage. Activation storage refers to intermediate values created during the forward pass and retained for gradient computation during backpropagation. Different PEFT methods consume comparable amounts of memory for weights, gradients, and optimizer states, as they all involve a substantially reduced number of trainable parameters (Hu et al., 2022; Bałazy et al., 2024; Kopiczko et al., 2024).
In contrast, their activation memory consumption exhibits clear differences. As the batch size increases, activation storage gradually becomes the dominant memory bottleneck, as illustrated in Figure 4a. 17 Published as a conference paper at ICLR 2026 Notably, activation memory in transformer layers accounts for over 99.9% of the total activation memory across all layers (Korthikanti et al., 2023). We therefore focus our analysis on the activation storage of transformer layers. Input Softmax Dropout FFN1 FFN2 GELU MatMul Query Key Value MatMul Output Dropout LayerNorm Output + +> Attention Self Attention FFN Transformer Layer +> LayerNorm Dropout + +Figure 5: The architecture of a single transformer layer, including the self-attention layer, the attention layer, and the feed-forward network layer. In this study, we consider the transformer layers within an encoder or decoder, whose input has dimensions $b \times s \times h$, where $b$ denotes the micro-batch size, $s$ represents the maximum sequence length, and $h$ indicates the hidden dimension size. Each transformer layer consists of a self-attention layer with $a$ attention heads, and in the feed-forward network (FFN) layer, the hidden dimension is expanded to $4h$ before being projected back to $h$. We assume that activations are stored in 32-bit floating-point format, requiring 4 bytes of memory per element. All results in this section are reported in bytes unless otherwise specified. Input Softmax Dropout FFN1 FFN2 GELU MatMul Query Key Value MatMul Output Dropout LayerNorm Output + +> $4bsh$ $8bsh$ $4abs^2$ $abs^2$ $4abs^2$ $4bsh$ $4bsh$ $16bsh$ $4bsh$ $4bsh$ $16bsh$ $bsh$ $bsh$ $4bsh$ +> LayerNorm Dropout + +Figure 6: Activation memory statistics in a single transformer layer for full fine-tuning. As illustrated in Figure 5, each transformer layer consists of a self-attention block (including Query, Key, and Value matrices) combined with an output linear layer to form the attention block.
Additionally, it includes two FFN layers, two normalization layers, and three dropout layers. Building on prior work (Korthikanti et al., 2023), we derive an approximate formula for the activation memory required during the forward pass of a single transformer layer. For backpropagation, we consider the input of each module (which is the output of the preceding module) as activations. As illustrated in Figure 6, the activation memory includes the following components: + +Self-Attention: + +• Query ($Q$), Key ($K$), and Value ($V$) matrices: Require $4bsh$ for their shared inputs. • First MatMul: Requires $8bsh$ as input to the module. • Softmax: Requires $4abs^2$ for activation storage. • Self-attention dropout: Only the mask is stored, with a size of $abs^2$. • Second MatMul: Requires activations from the output of dropout ($4abs^2$) and linear layer Value ($4bsh$), totaling $4abs^2 + 4bsh$. + +Attention: + +• Output linear layer: Requires $4bsh$ as input. • Attention dropout: Only the mask is stored, with a size of $bsh$. • First layer normalization: Requires $4bsh$ for activation storage. + +FFN: + +• FFN1: Requires $4bsh$ as input. 18 Published as a conference paper at ICLR 2026 LoRA_A LoRA_B + +> 4bsr +> LoRA +> Query DoRA_A DoRA_B +> DoRA +> DoRA_Mag +> 4bsr 4bsh +> Query OFT_R OFT_S +> 4bsh +> Query +> OFT +> VeRA_A VeRA_D VeRA_B VeRA_B Query +> VeRA +> 4bsr 4bsh +> GOFT_S +> 4bsh +> Query +> GOFT +> GOFT_R1 GOFT_Rh +> 4bsh +> ··· +> BOFT_S +> 4bsh +> Query +> BOFT +> BOFT_R1 BOFT_Rm +> 4bsh +> ··· +> SVFT_A SVFT_S SVFT_B Query +> SVFT +> 4bsh +> LoRA-XS_A LoRA-XS LoRA-XS_B Query +> LoRA-XS +> 4bsr +> PSOFT_A PSOFT_R PSOFT_B Query-Res +> PSOFT +> 4bsr PSOFT_Alpha PSOFT_Beta +> 4bsr 4bsr + +Figure 7: Activation memory statistics in a single linear layer (Query) across different PEFT methods. • GELU activation: Requires $16bsh$ for activation storage. • FFN2: Requires $16bsh$ as input.
• FFN dropout: Only the mask is stored, with a size of $bsh$. • Second layer normalization: Requires $4bsh$ for activation storage. Summing these sub-layers, the total activation storage for a single transformer layer is: + +$$\mathrm{ACT}_{\text{base}} = 66bsh + 9abs^2 \tag{10}$$ The six linear layers within a transformer layer undergo changes in activation memory storage when different PEFT methods are applied, as reflected by modifications to the base formula ($\mathrm{ACT}_{\text{base}}$). For example, the LoRA method introduces a set of low-rank matrices $B$ and $A$ in parallel. The activation memory requirements for various PEFT methods in a single linear layer are summarized in Figure 7, with the Query matrix as a representative example. The specific details of these changes are as follows: • LoRA: Adds $4bsr$ to the original activation storage for gradient computation during back-propagation. • DoRA: Adds $4bsr + 4bsh$ to the original activation storage. • VeRA: Replaces the original input $4bsh$ with $4bsr$ and adds $4bsh$ for activation storage. • OFT: Adds $4bsh$ to the original activation storage. • BOFT: Requires an additional $4mbsh$, where $m$ is the number of sparse matrices. • GOFT: Adds $4bsh \log h$, where $h$ is the hidden layer dimension. • SVFT: Removes the original input activation storage and adds $4bsh$. • LoRA-XS: Removes the original input activation storage and adds $4bsr$. • PSOFT: Removes the original input activation storage and adds $12bsr$. The activation memory requirements of various PEFT methods for a single transformer layer are summarized in Table 9. Notably, PSOFT incurs significantly lower activation memory than all other methods except LoRA-XS. Its activation memory is comparable to that of LoRA-XS, as the rank $r$ is much smaller than the hidden dimension $h$ ($r \ll h$). A key observation is that PSOFT employs scale vectors to enhance task-specific flexibility, similar to other orthogonal fine-tuning methods (Qiu et al., 2023; Liu et al., 2024; Ma et al., 2024).
However, unlike these methods, PSOFT applies the scale vectors within a principal subspace, effectively preventing a substantial increase in activation memory usage. 19 Published as a conference paper at ICLR 2026 Table 9: Total activation memory statistics in a single transformer layer for different PEFT methods and FFT. In BOFT, $m$ denotes the number of sparse matrices. + +Methods Activation memory (Relative) Activation memory (Absolute) + +FFT $\mathrm{ACT}_{\text{base}}$ $66bsh + 9abs^2$ + +LoRA $\mathrm{ACT}_{\text{base}} + 24bsr$ $66bsh + 24bsr + 9abs^2$ + +DoRA $\mathrm{ACT}_{\text{base}} + 24bsr + 36bsh$ $102bsh + 24bsr + 9abs^2$ + +VeRA $\mathrm{ACT}_{\text{base}} - 28bsh + 16bsr + 36bsh$ $74bsh + 16bsr + 9abs^2$ + +OFT $\mathrm{ACT}_{\text{base}} + 36bsh$ $102bsh + 9abs^2$ + +BOFT $\mathrm{ACT}_{\text{base}} + 36mbsh$ $66bsh + 36mbsh + 9abs^2$ + +GOFT $\mathrm{ACT}_{\text{base}} + 36bsh \log h$ $66bsh + 36bsh \log h + 9abs^2$ + +SVFT $\mathrm{ACT}_{\text{base}} - 28bsh + 24bsh$ $62bsh + 9abs^2$ + +LoRA-XS $\mathrm{ACT}_{\text{base}} - 28bsh + 24bsr$ $38bsh + 24bsr + 9abs^2$ + +PSOFT $\mathrm{ACT}_{\text{base}} - 28bsh + 72bsr$ $38bsh + 72bsr + 9abs^2$ + +## F NATURAL LANGUAGE UNDERSTANDING ON GLUE + +F.1 DATASETS + +The General Language Understanding Evaluation (GLUE) (Wang et al., 2019) is a comprehensive benchmark for evaluating the performance of natural language understanding (NLU) models across diverse tasks. It includes one text similarity task (STS-B), five pairwise text classification tasks (MNLI, RTE, QQP, MRPC, and QNLI), and two single-sentence classification tasks (CoLA and SST-2). Table 10: Hyperparameter settings for fine-tuning DeBERTaV3-base on GLUE + +Hyperparameter CoLA STS-B MRPC RTE SST-2 QNLI + +Optimizer AdamW Warmup Ratio 0.1 LR Schedule Linear Learning Rate (Head) 5E-04 Batch Size 32 Max Seq. Len.
64 128 256 256 128 256 #Epochs 20 20 30 30 10 5 LR PSOFT r=46 6E-04 4E-04 4E-04 4E-04 2E-04 4E-04 + +F.2 IMPLEMENTATION DETAILS + +While it is common in prior PEFT studies (Hu et al., 2022; Lingam et al., 2024; yang Liu et al., 2024; Meng et al., 2024) to report results on the GLUE validation set, concerns have been raised regarding the reliability of this protocol (Wu et al., 2024a;b; Bini et al., 2025). To ensure a more rigorous evaluation, we evenly split the original validation set into new validation and test subsets using a fixed random seed. All reported results are based on the test set, with checkpoints selected according to the best accuracy on the new validation set. Given the prohibitive computational cost of evaluating every baseline across all GLUE datasets, we omit the two largest subsets (MNLI and QQP) from our experiments. The peak memory usage during training is measured using `torch.cuda.max_memory_allocated()`. All experiments are implemented on top of the open-source LoRA framework (Hu et al., 2022), using PyTorch (Paszke et al., 2019) and Huggingface’s PEFT library (Mangrulkar et al., 2022). Following Liu et al. (2024), we tune only model-agnostic hyperparameters such as learning rate and training epochs. Due to resource constraints, we cap the maximum sequence length at 256 (per-task values are listed in Table 10). PSOFT is applied to all linear layers of the DeBERTaV3-base model. Evaluation metrics include Matthews correlation for CoLA, Pearson correlation for STS-B, and accuracy for the other GLUE sub-tasks. Detailed hyperparameter configurations are provided in Table 10. 20 Published as a conference paper at ICLR 2026 + +## G VISUAL CLASSIFICATION ON VTAB-1K + +G.1 DATASETS + +The Visual Task Adaptation Benchmark (VTAB-1K) (Zhai et al., 2019) comprises 19 image classification tasks grouped into three categories: natural, specialized, and structured.
• Natural tasks involve images captured with standard cameras, depicting scenes from the natural environment, generic objects, fine-grained categories, or abstract concepts. • Specialized tasks use images obtained through specialized equipment, such as medical imaging devices or remote sensing technologies. • Structured tasks focus on artificially designed scenarios to analyze specific relationships or changes between images, such as estimating object distances in 3D scenes (e.g., DMLab), counting objects (e.g., CLEVR), or detecting orientations (e.g., dSprites for disentangled representations). In VTAB-1K, each dataset provides 800 labeled samples from its original training set, which are used to fine-tune the base model. Additionally, 200 labeled samples in the validation set are used to adjust hyperparameters during fine-tuning. The performance is evaluated using Top-1 classification accuracy on the respective original test set. Table 11: Hyperparameter settings for fine-tuning ViT-B/16 on VTAB-1K + +Hyperparameter ViT-B/16 + +Optimizer AdamW Warmup Ratio 0.1 LR Schedule Cosine Learning Rate (Head) 5E-03 Batch Size 64 Weight Decay 1E-03 Dropout 1E-01 #Epochs 50 LR PSOFT r=46 {5E-04, 1E-03, 5E-03} + +G.2 IMPLEMENTATION DETAILS + +Our experiments are conducted in PyTorch (Paszke et al., 2019) using HuggingFace’s Datasets, Transformers, and PEFT (Mangrulkar et al., 2022) libraries. Unlike prior works that rely on the Timm framework with custom preprocessing and training loops (Liu et al., 2024; Ma et al., 2024), our framework leverages standardized APIs such as AutoImageProcessor and Trainer, eliminating manual dataset/model handling and enabling fast integration of advanced methods (e.g., DoRA (yang Liu et al., 2024), SVFT (Lingam et al., 2024), BOFT (Liu et al., 2024)). We adopt the experimental settings from Liu et al. (2024) and Ma et al. (2024), adjusting learning rates, weight decay, and training epochs accordingly.
Following Bałazy et al. (2024), Kopiczko et al. (2024), and Lingam et al. (2024), we separate learning rates for the classification head and PEFT modules, with a fixed learning rate applied to the head across all methods. Complete hyperparameter configurations are listed in Table 11. + +## H MATHEMATICAL QUESTION ANSWERING ON METAMATHQA-40K + +H.1 DATASETS + +For mathematical question answering tasks, we fine-tune baselines using the MetaMathQA-40K dataset (Yu et al., 2024) and evaluate their performance on two challenging benchmarks: GSM-8K (Cobbe et al., 2021) and MATH (Hendrycks et al., 2021). 21 Published as a conference paper at ICLR 2026 Table 12: Hyperparameter settings for fine-tuning on MetaMathQA-40K + +Hyperparameter LLaMA-3.2-3B + +Optimizer AdamW Warmup Ratio 0.1 LR Schedule Cosine Max Seq. Len. 512 Batch Size 64 # Epochs 2 LR PSOFT r=168 4E-04 LR PSOFT r=362 2E-04 + +H.2 IMPLEMENTATION DETAILS + +Our experiments follow prior work (Liu et al., 2024; Lingam et al., 2024) and are implemented in PyTorch (Paszke et al., 2019) using HuggingFace’s PEFT library (Mangrulkar et al., 2022). Consistent with Lingam et al. (2024), we tune only learning rates for different models, with full hyperparameters listed in Table 12. We adopt gradient accumulation with small batch sizes (≤ 4) to approximate large-batch training across all baselines. Table 13: Experimental results of fine-tuned LLaMA-3.2-3B on GSM-8K and MATH with extremely low parameter counts. The best result for each dataset is marked in bold. Accuracy (%) is reported for both GSM-8K and MATH datasets.
+ +Methods #Params Inserted Modules Mem (GB) GSM-8K MATH + +GOFTv2 0.26M Q,K,V 75.3 41.02 9.22 qGOFTv2 1.03M Q,K,V 75.3 42.46 9.32 BOFT b=2,m=2 1.18M Q,K,V 48.2 52.46 10.78 PSOFT r=168 1.20M Q,K,V 29.8 52.84 12.24 LoRA r=1 0.40M Q,K,V 30.1 47.23 10.36 SVFT P 0.49M Q,K,V,U,D,O,G 41.1 52.01 12.18 LoRA-XS r=48 0.45M Q,K,V,U,D,O,G 32.3 51.86 9.80 PSOFT r=72 0.53M Q,K,V,U,D,O,G 32.7 52.01 12.44 LoRA r=1 1.52M Q,K,V,U,D,O,G 32.0 57.32 12.88 PiSSA r=1 1.52M Q,K,V,U,D,O,G 32.0 56.48 13.18 LoRA-XS r=88 1.52M Q,K,V,U,D,O,G 32.8 54.66 12.70 PSOFT r=124 1.54M Q,K,V,U,D,O,G 33.2 57.47 13.26 DoRA r=1 2.29M Q,K,V,U,D,O,G 43.2 57.54 13.60 PSOFT r=152 2.31M Q,K,V,U,D,O,G 33.5 58.23 13.66 + +Beyond the main experiments, we provide additional evaluations of PEFT methods under constrained parameter budgets, as summarized in Table 13. When fine-tuned on the Q, K, and V modules, PSOFT achieves over 10 and about 3 percentage points higher accuracy than GOFTv2 and qGOFTv2 on GSM-8K and MATH, respectively, while using only 40% of their memory. On MATH, PSOFT also exceeds BOFT by 0.82%/1.46% with just 60% of its memory usage. PSOFT allows flexible control of parameter counts by adjusting the rank r, whereas LoRA is restricted to a minimum rank of 1, inherently tying its parameter count to hidden dimension size. Under stricter parameter budgets, LoRA must reduce the scope of inserted modules, often leading to performance degradation. In contrast, PSOFT consistently achieves superior performance even at extremely low parameter configurations. In terms of memory efficiency, PSOFT matches LoRA while outperforming DoRA and SVFT.
22 Published as a conference paper at ICLR 2026 + +## I COMMONSENSE REASONING ON COMMONSENSE -15K + +I.1 DATASETS + +Commonsense reasoning benchmarks encompass eight distinct sub-tasks: BoolQ (Clark et al., 2019), PIQA (Bisk et al., 2020), SIQA (Sap et al., 2019), HellaSwag (Zellers et al., 2019), Winogrande (Sakaguchi et al., 2021), ARC-easy/ARC-challenge (Clark et al., 2018), and OpenBookQA (Mihaylov et al., 2018). Following the approach described in (Hu et al., 2023; Lingam et al., 2024; yang Liu et al., 2024), we also combine the training datasets from all eight tasks to construct a unified fine-tuning dataset, Commonsense-15K tailored for each task. Table 14: Hyperparameter settings for fine-tuning on Commonsense-15K + +Hyperparameter LLaMA-3.1-8B + +Optimizer AdamW Warmup Steps 100 LR Schedule Linear Max Seq. Len. 512 Batch Size 64 # Epochs 3LR PSOFT r=194 4E-04 LR PSOFT r=424 1E-04 Table 15: Experimental results of fine-tuned LLaMA-3.1-8B on eight commonsense reasoning benchmarks with extremely low parameter counts. The best average result is highlighted in bold .Accuracy (%) is reported for all sub-datasets. + +Methods #Params Inserted Modules Mem (GB) BoolQ PIQA SIQA HS WG ARC-e ARC-c OBQA Avg. + +GOFTv2 0.26M Q,V OOM N/A. qGOFTv2 1.05M Q,V OOM N/A. 
BOFT b=2 + +> m=2 + +1.21M Q,V 79.4 69.66 83.95 71.65 80.87 70.01 90.40 77.82 79.00 77.92 PSOFT r=194 1.22M Q,V 52.6 68.87 84.17 71.44 86.46 67.56 90.45 77.73 81.20 78.49 LoRA r=1 0.59M Q,K,V 52.8 66.97 83.08 71.03 77.06 64.01 90.70 77.39 78.80 76.13 SVFT P 0.46M Q,K,V,U,D 65.8 65.08 81.07 69.40 85.69 68.82 88.47 77.05 76.00 76.45 LoRA-XS r=48 0.37M Q,K,V,U,D 53.4 69.30 84.82 71.29 87.44 67.01 89.39 77.22 82.60 78.63 PSOFT r=72 0.43M Q,K,V,U,D 53.7 69.72 84.39 72.01 87.99 68.67 90.19 78.16 81.00 79.02 LoRA r=1 1.77M Q,K,V,U,D 53.9 71.13 85.31 74.67 89.08 72.61 90.24 78.16 82.40 80.45 PiSSA r=1 1.77M Q,K,V,U,D 53.9 72.05 84.60 74.21 89.93 70.88 90.15 79.01 82.00 80.35 LoRA-XS r=104 1.73M Q,K,V,U,D 54.0 71.04 85.47 72.67 89.26 71.74 90.82 79.61 83.20 80.48 PSOFT r=146 1.74M Q,K,V,U,D 54.5 71.31 85.69 73.18 89.38 72.38 90.91 80.03 83.00 80.74 DoRA r=1 2.56M Q,K,V,U,D 65.4 71.05 85.29 73.25 90.09 73.32 90.74 79.75 81.87 80.67 PSOFT r=176 2.52M Q,K,V,U,D 55.0 71.47 86.02 75.33 90.81 72.69 90.45 78.75 84.00 81.19 + +I.2 IMPLEMENTATION DETAILS + +The experiments are conducted following the frameworks of Hu et al. (2023); yang Liu et al. (2024), implemented in PyTorch (Paszke et al., 2019) with HuggingFace’s PEFT library (Mangrulkar et al., 2022). Consistent with Lingam et al. (2024), we tune only the learning rates for different models. Detailed hyperparameter configurations are provided in Table 14. As shown in Table 15, when fine-tuning the Q and V modules, PSOFT avoids the OOM failures observed in GOFT and qGOFT, and surpasses BOFT by 0.33%/0.57% in average accuracy while using only 66% of its peak memory. We further evaluate under more constrained parameter budgets, where PSOFT continues to deliver superior average accuracy across eight commonsense reasoning 23 Published as a conference paper at ICLR 2026 benchmarks. In terms of memory efficiency, PSOFT requires only about 80% of the memory of DoRA and SVFT, while remaining comparable to LoRA. 
+ +## J EXTENSION EXPERIMENTS + +J.1 EFFECT OF SVD INITIALIZATION + +Table 16: The effect of SVD Initialization on the Commonsense-15K Dataset using the LLaMA-3.2-3B model. + +> Methods SVD n_iter SVD Init Time Validation Loss +> PSOFT r=32 +> 5 2.79 0.9343 10 3.74 0.9328 20 4.84 0.9283 +> ∞ 89.68 0.9276 PSOFT r=64 +> 5 4.11 0.9174 10 5.13 0.9134 20 7.51 0.9157 +> ∞ 89.48 0.9147 PSOFT r=128 +> 5 6.33 0.9092 10 8.38 0.9028 20 13.01 0.9029 +> ∞ 90.50 0.8992 + +PSOFT constructs the principal subspace via SVD, where the initialization time and accuracy of fast SVD depend on the n_iter parameter (Halko et al., 2011; Meng et al., 2024). We evaluate this on the Commonsense-15K dataset (Hu et al., 2023) using the LLaMA-3.2-3B model (Meta AI, 2024), reporting both initialization time and validation loss. As shown in Table 16, smaller n_iter values yield faster initialization, while larger values improve accuracy. With n_iter = 20, the loss is nearly identical to that of full SVD (n_iter → ∞). These results show that fast SVD initializes PSOFT within seconds, and even full SVD introduces negligible overhead relative to the total fine-tuning time. Table 17: Effects of different ranks fine-tuned on the CoLA Dataset using the DeBERTa-V3-base model (on a single RTX5090). + +> Methods Ranks #Params Matthew’s Correlation (%) +> Peak GPU Memory (GB) Runtime +> PSOFT 1 144 59.20 4.0 17m34s 2 360 68.80 4.0 18m32s 4 1,008 70.08 4.0 19m17s 8 3,168 70.93 4.0 19m08s 16 10,944 68.36 4.0 19m32s 32 40,320 72.09 4.0 19m41s 64 154,368 69.16 4.1 21m29s 128 603,648 72.46 4.2 20m42s 256 2,386,944 74.09 4.6 24m35s 512 9,492,480 71.04 5.8 27m20s + +J.2 EFFECT OF RANKS + +To provide guidance on rank selection, we evaluate PSOFT with ranks ranging from 1 to 512 on the CoLA and the Commonsense-15K datasets (Hu et al., 2023) using DeBERTa-V3-base (He et al., 2021) and LLaMA-3.2-3B (Meta AI, 2024).
As shown in Table 17 and Table 18, PSOFT exhibits a wide range of usable ranks: as r increases, the number of trainable parameters grows according to the formula in 8, r(r − 1)/2 + 2r, and performance improves correspondingly, though with diminishing returns. Memory usage increases with r, but remains nearly flat when r is small. Since we adopt the truncated Neumann-series approximation, training time does not increase noticeably with larger r. 24 Published as a conference paper at ICLR 2026 Table 18: Effects of different ranks fine-tuned on the Commonsense-15K Dataset using the LLaMA-3.2-3B model (on a single H100). + +Methods Ranks #Params Avg. (%) Peak GPU Memory (GB) Runtime + +PSOFT 1 392 27.07 31.5 50m13s 2 980 32.45 31.5 46m37s 4 2,744 36.16 31.5 48m30s 8 8,624 38.21 31.5 46m18s 16 29,792 57.12 31.6 48m52s 32 109,760 62.94 31.8 51m12s 64 420,224 70.95 32.1 48m47s 128 1,643,264 73.90 32.8 46m11s 256 6,497,792 74.95 34.5 47m29s 512 25,840,640 75.05 38.4 49m49s + +The results further reveal a consistent pattern across models and tasks. For smaller models and simpler tasks, PSOFT is highly parameter-efficient: even very small ranks achieve strong performance, indicating that the low-dimensional subspace is already sufficient to capture the necessary task-specific transformations. In contrast, for larger models and more complex tasks, performance tends to increase with larger ranks, reflecting the greater capacity required to capture task-specific transformations. In such cases, the main trade-off is between the performance gains from increasing + +r and the corresponding growth in trainable parameters. Based on these observations, we provide the following practical guidance for choosing the rank. For simpler tasks, we recommend using small to moderate ranks (e.g., 32-128), as they provide good parameter efficiency with little performance loss.
For more complex tasks, larger ranks generally lead to higher performance, while extremely small ranks ( e.g., below 16) may hurt results. In such cases, moderate to large ranks ( e.g., 64-256) offer a better balance between performance and efficiency. J.3 EFFECT OF INSERTED MODULES + +We fine-tune LLaMA-3.2-3B with PSOFT and evaluate it on GSM-8K under different insertion schemes, with results shown in Figure 8a. Overall, performance improves as more modules are inserted and as the rank r increases, showing that complex mathematical tasks benefit directly from higher model capacity under PSOFT. For a fixed rank r, applying PSOFT to the Q, K, V, U, and D + +modules generally provides the best trade-off between performance and parameter efficiency. When the parameter budget permits, inserting PSOFT into all linear layers yields the strongest results. 30K 50K 100K 200K 500K 1M 2M + +> Number of Parameters +> 35 +> 40 +> 45 +> 50 +> 55 +> 60 +> GSM-8K Accuracy (%) +> Colors: Rank +> r=32 +> r=64 +> r=128 +> Shapes: Inserted Modules +> Q, V +> U, D +> Q, K, V +> Q, K, V, U, D +> Q, K, V, O, G +> Q, K, V, U, D, O, G + +(a) 0.9 + +> 1.0 +> 1.1 +> 1.2 +> 1.3 +> 1.4 +> Training Speed +> w/o Neumann +> 13510 +> PSOFT with different Neumann Terms +> 91.3 +> 91.4 +> 91.5 +> 91.6 +> Pearson correlation (%) +> w/o Neumann + +(b) + +Figure 8: (a) Effect of inserted modules on GSM-8K using LLaMA-3.2-3B. (b) Effect of Neumann terms on STS-B using DeBERTaV3-base. J.4 EFFECT OF NEUMANN TERMS + +To assess the effect of different Neumann terms on training speed and performance, we fine-tune DeBERTaV3-base on STS-B with rank 46. As shown in Figure 8b, the Neumann series approxima-tion substantially accelerates training while maintaining performance close to the original Cayley parameterization. 
Training speed decreases as the number of terms increases, gradually approaching 25 Published as a conference paper at ICLR 2026 that of Cayley, whereas performance improves with more terms and eventually converges to the Cayley result. + +## K PAIRWISE ANGLES OF WEIGHTS + +We fine-tune DeBERTa-V3-base on the CoLA dataset using the same setup as in the main paper. We then extract the query matrix from layer 6 and compute the pairwise angles among the first eight column vectors of Wpri and Wpre, as well as those of Wps-tuned and Wfinal = Wps-tuned + Wres. Figures 9a and 10a show that, before fine-tuning, the angles in Wpri and Wpre follow a clear and stable pattern. Figures 9b and 10b show that PSOFT with strict orthogonality keeps this pattern: + +Wps-tuned preserves the angles in Wpri, and Wfinal preserves those in Wpre. As shown in Figures 9c and 10c, PSOFT with relaxed orthogonality also keeps the main angular structure, but introduces small and controlled changes. These changes help improve task adaptation while keeping the key structure intact.
0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 78.7 101.1 90.0 90.4 82.1 88.9 62.8 +> 78.7 0.0 99.0 98.2 94.9 67.6 94.1 80.6 +> 101.1 99.0 0.0 93.1 87.3 107.4 95.3 105.7 +> 90.0 98.2 93.1 0.0 109.8 82.8 112.1 84.8 +> 90.4 94.9 87.3 109.8 0.0 92.6 65.0 82.8 +> 82.1 67.6 107.4 82.8 92.6 0.0 87.8 65.8 +> 88.9 94.1 95.3 112.1 65.0 87.8 0.0 85.3 +> 62.8 80.6 105.7 84.8 82.8 65.8 85.3 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 + +(a) Wpri 0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 78.7 101.1 90.0 90.4 82.1 88.9 62.8 +> 78.7 0.0 99.0 98.2 94.9 67.6 94.1 80.6 +> 101.1 99.0 0.0 93.1 87.3 107.4 95.3 105.7 +> 90.0 98.2 93.1 0.0 109.8 82.8 112.1 84.8 +> 90.4 94.9 87.3 109.8 0.0 92.6 65.0 82.8 +> 82.1 67.6 107.4 82.8 92.6 0.0 87.8 65.8 +> 88.9 94.1 95.3 112.1 65.0 87.8 0.0 85.3 +> 62.8 80.6 105.7 84.8 82.8 65.8 85.3 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 (b) Wps-tuned (strict orth.) 0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 77.8 101.7 90.0 90.1 80.2 87.4 61.7 +> 77.8 0.0 100.1 98.3 95.1 66.6 94.3 79.6 +> 101.7 100.1 0.0 93.0 86.7 108.8 95.7 107.3 +> 90.0 98.3 93.0 0.0 109.8 83.8 111.7 85.0 +> 90.1 95.1 86.7 109.8 0.0 91.3 64.2 82.8 +> 80.2 66.6 108.8 83.8 91.3 0.0 87.2 63.5 +> 87.4 94.3 95.7 111.7 64.2 87.2 0.0 84.3 +> 61.7 79.6 107.3 85.0 82.8 63.5 84.3 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 (c) Wps-tuned (relaxed orth.) + +Figure 9: Angle structures of Wpri (the query matrix in layer 6) before fine-tuning (a), and of Wps-tuned + +after PSOFT fine-tuning under strict (b) and relaxed (c) orthogonality. 
0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 87.5 96.3 85.6 89.7 86.9 87.9 78.2 +> 87.5 0.0 93.5 90.7 90.8 84.8 89.8 84.3 +> 96.3 93.5 0.0 91.8 89.8 96.6 88.6 98.3 +> 85.6 90.7 91.8 0.0 95.8 89.8 96.7 89.0 +> 89.7 90.8 89.8 95.8 0.0 89.5 82.9 87.8 +> 86.9 84.8 96.6 89.8 89.5 0.0 89.3 79.3 +> 87.9 89.8 88.6 96.7 82.9 89.3 0.0 89.9 +> 78.2 84.3 98.3 89.0 87.8 79.3 89.9 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 + +(a) Wpre 0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 87.5 96.3 85.6 89.7 86.9 87.9 78.2 +> 87.5 0.0 93.5 90.7 90.8 84.8 89.8 84.3 +> 96.3 93.5 0.0 91.8 89.8 96.6 88.6 98.3 +> 85.6 90.7 91.8 0.0 95.8 89.8 96.7 89.0 +> 89.7 90.8 89.8 95.8 0.0 89.5 82.9 87.8 +> 86.9 84.8 96.6 89.8 89.5 0.0 89.3 79.3 +> 87.9 89.8 88.6 96.7 82.9 89.3 0.0 89.9 +> 78.2 84.3 98.3 89.0 87.8 79.3 89.9 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 (b) Wfinal (strict orth.) 0 1 2 3 4 5 6 7 + +0 + +1 + +2 + +3 + +4 + +5 + +6 + +7 + +> 0.0 87.2 96.6 85.6 89.6 86.2 87.3 77.6 +> 87.2 0.0 93.9 90.7 90.8 84.4 89.9 83.9 +> 96.6 93.9 0.0 91.8 89.5 97.2 88.8 99.1 +> 85.6 90.7 91.8 0.0 95.8 90.0 96.5 89.1 +> 89.6 90.8 89.5 95.8 0.0 89.1 82.6 87.8 +> 86.2 84.4 97.2 90.0 89.1 0.0 89.1 78.2 +> 87.3 89.9 88.8 96.5 82.6 89.1 0.0 89.4 +> 77.6 83.9 99.1 89.1 87.8 78.2 89.4 0.0 + +0 + +20 + +40 + +60 + +80 + +100 + +120 (c) Wfinal (relaxed orth.) + +Figure 10: Angle structures of Wpre (the query matrix in layer 6) before fine-tuning (a), and of Wfinal + +after PSOFT fine-tuning under strict (b) and relaxed (c) orthogonality. + +## L LOSS AND CONVERGENCE COMPARISON + +PSOFT can be viewed as a specialized form of orthogonal fine-tuning, where Wfinal = Rfull Wpre ,with Rfull = diag( R, Id−r ), meaning that the orthogonal transformation is applied only to the principal (low-rank) subspace of the pre-trained weight matrix, while an identity mapping is imposed on its orthogonal complement. 
This formulation implies that the optimization behavior of PSOFT gradually approaches that of full-space OFT methods as the rank r increases. Therefore, PSOFT induces a principled modification of the optimization geometry: Full-space OFT optimizes over the Stiefel manifold St( d, d ), whose tangent space consists of all skew-symmetric directions in the full d-dimensional parameter space. In contrast, PSOFT restricts optimization to 26 Published as a conference paper at ICLR 2026 the tangent space of a block-diagonal submanifold St( r, r ) ⊕ R(d−r). As a result, only the principal subspace receives curvature-aware updates, while the orthogonal complement experiences zero curvature (identity block). 1 5 10 15 20 + +> Epoch +> 0.0 +> 0.1 +> 0.2 +> 0.3 +> 0.4 +> 0.5 +> Loss +> PSOFT r= 1 +> PSOFT r= 8 +> PSOFT r= 32 +> PSOFT r= 128 +> PSOFT r= 512 +> GOFT-v2 +> qGOFT-v2 +> OFTv1 r= 120 +> OFTv2 b= 32 +> BOFT b= 8 +> m= 2 + +Figure 11: Comparison of loss curves for different PSOFT ranks and various orthogonal fine-tuning methods. Building upon this geometric distinction, PSOFT exhibits three complementary behaviors that characterize its optimization dynamics. First, the low-rank orthogonal constraint simplifies the optimization landscape by preventing large full-space orthogonal transformations. This restriction reduces the effective curvature of the optimization path, yielding more stable and predictable gradient updates, while at the same time limiting expressiveness when r is very small. Second, because PSOFT applies orthogonal transformations only within the principal subspace, stochastic noise is confined to this lower-dimensional region rather than being amplified across all d dimensions as in full-space OFT, leading to more robust and less destructive updates. 
Third, as r increases, the PSOFT tangent space increasingly approximates that of full-space OFT, supporting richer expressiveness and convergence trajectories that gradually approach full-space OFT, yet without the severe overfitting that may arise in full-space OFT. Collectively, these properties illustrate how PSOFT navigates the trade-off between stability, expressiveness, and generalization. We conduct additional experiments on the CoLA dataset using DeBERTa-V3 and report the training loss curves of different OFT variants. As shown in Figure 11, the green curves correspond to PSOFT, with darker colors indicating larger ranks. We observe that as r increases, the PSOFT loss curves progressively approach those of full-space OFT methods such as BOFT and OFTv2, reflecting the improved convergence speed and expressiveness of higher-rank subspaces. PSOFT with very small ranks constrains the update space too aggressively, which may lead to underfitting and slower loss reduction. In contrast, full-space OFT methods such as BOFT display the fastest initial convergence, but their full-rank orthogonal updates raise the risk of overfitting. This phenomenon is evident in our main GLUE experiments, where BOFT achieves the lowest training loss yet fails to obtain the best generalization performance. These trends are consistent with the geometric properties of PSOFT discussed above: by con-straining orthogonal updates to a lower-dimensional principal subspace, PSOFT naturally balances expressiveness and generalization. Unlike full-space OFT, PSOFT enables explicit capacity control through r, allowing moderate ranks to achieve a more favorable bias-variance trade-off and stronger generalization. 
+ +## M ADDITIONAL EXPERIMENTS ON MEMORY USAGE + +We additionally conducted memory experiments on a single NVIDIA H100 80GB, covering: • the forward/backward (FP/BP) peak memory usage on a single custom linear layer, and • the forward/backward (FP/BP) peak memory usage on a Transformer block, and • the peak memory usage on the DeBERTaV3-base and ViT-B/16 models during training. 27 Published as a conference paper at ICLR 2026 For the single-layer analysis, we implemented a Python-based evaluation framework that separately measures peak memory usage and runtime for the forward and backward passes. The implementation of GOFTv2 uses the latest available code, while BOFT is taken from the PEFT library (version 0.17.0). We track peak memory consumption (in GB) and runtime (in milliseconds, ms), as peak memory is the primary limiting factor on memory-constrained hardware. The linear layer input is configured with a batch size b = 64, sequence length s = 512, and hidden dimension h = 4096. Runtime results are averaged over 100 forward/backward runs. The results are summarized as follows: Table 19: Peak memory usage (GB) and runtime (ms) statistics for different methods on a single custom linear layer. + +> Methods Peak Memory (FP) Peak Memory (BP) Runtime (FP) Runtime (BP) +> GOFTv2 13.6 14.3 5.2 129.3 qGOFTv2 13.6 14.3 5.4 129.6 BOFT b=8 +> m=2 1.8 2.6 102.9 2.1 BOFT b=4 +> m=4 2.3 3.0 139.6 2.5 PSOFT r=32 2.1 2.6 43.4 4.3 PSOFT r=64 2.1 2.6 43.8 4.8 PSOFT r=128 2.1 2.6 22.9 25.9 PSOFT r=256 2.2 2.6 4.0 48.8 PSOFT r=512 2.2 2.7 5.6 53.1 + +As shown in Table 19, although GOFTv2 benefits from the Hadamard-product optimization and achieves reduced forward-pass computation time, it still consumes substantially more activation memory than both BOFT and PSOFT.
Importantly, the single-layer activation-memory measurement slightly underrepresents PSOFT’s true advantage: as discussed in the theoretical analysis, PSOFT reduces activation memory across multiple layers, but when evaluating a single layer in isolation, it must still store the full input and output activations, which partially diminishes its advantage. Nevertheless, even under this conservative setting, PSOFT achieves lower activation-memory usage and faster computation compared with BOFT and GOFTv2, and its advantages become increasingly pronounced when moving from a single linear layer to a Transformer block or end-to-end models. Table 20: Peak memory usage (GB) and runtime (ms) statistics for different methods on a Transformer block. + +> Methods Peak Memory (FP) Peak Memory (BP) Runtime (FP) Runtime (BP) +> GOFTv2 65.4 65.4 49.5 667.1 qGOFTv2 65.4 65.4 49.5 671.2 BOFT b=8 +> m=2 19.0 19.0 2813.9 7.5 BOFT b=4 +> m=4 28.9 28.9 5427.9 8.7 PSOFT r=32 7.2 7.2 162.7 134.4 PSOFT r=64 7.2 7.2 166.0 134.2 PSOFT r=128 7.2 7.3 137.4 170.3 PSOFT r=256 7.3 7.4 122.2 197.7 PSOFT r=512 7.6 7.6 130.3 215.3 + +To validate this, we extend the single-layer setup to a complete Transformer block, configured with 8 attention heads and with all PEFT modules inserted into all linear layers. The input is configured with a batch size b = 32, sequence length s = 512, and hidden dimension h = 4096, and runtime results are averaged over 100 forward and backward runs. We report peak memory consumption (in GB) and runtime (in milliseconds, ms). As shown in Table 20, these block-level experiments confirm that PSOFT further reduces both memory usage and runtime by avoiding full-dimensional chained multiplications and performing orthogonal transformations only within a much smaller subspace. We then conduct full-layer experiments following the same configuration as in the main paper.
For DeBERTaV3-base, we use a fixed batch size b = 64 and task-dependent sequence length + +s ∈ {64, 128, 256}. For ViT-B/16, we follow the original setup with a fixed sequence length s = 197 + +and a batch size of b = 64. Additionally, we include results with smaller batch sizes b ∈ {16, 32} for a more comprehensive comparison. PSOFT uses the same rank r = 46 as reported in the original paper, and all PEFT modules are inserted into all linear layers. The results are presented as follows: 28 Published as a conference paper at ICLR 2026 Table 21: Peak memory usage (GB) of different methods on DeBERTaV3-base. + +> Methods Peak Memory (s=64) Peak Memory (s=128) Peak Memory (s=256) +> GOFTv2 18.5 34.4 67.5 qGOFTv2 18.5 34.4 67.5 BOFT b=8 +> m=2 6.3 9.4 17.5 PSOFT r=46 4.1 6.8 14.0 + +Table 22: Peak memory usage (GB) of different methods on ViT-B/16. + +> Methods Peak Memory (b=16) Peak Memory (b=32) Peak Memory (b=64) +> GOFTv2 22.5 44.7 OOM qGOFTv2 22.5 44.7 OOM BOFT b=8 +> m=2 5.4 7.3 10.9 PSOFT r=46 2.4 2.9 6.2 + +As shown in Tables 21 and 22, PSOFT achieves the lowest peak memory usage across different settings. Remarkably, even on an H100 GPU, GOFT still encounters OOM failures for ViT-B/16 with a batch size b = 64. This behavior stems from its activation-memory scaling of O(bsh log h), which grows rapidly at larger batch sizes and ultimately limits its applicability on memory-constrained hardware. In contrast, PSOFT consistently avoids such OOM issues: by restricting OFT to the principal subspace, it preserves the essential semantic representations while simultaneously improving multi-dimensional efficiency (parameter counts, memory, and computation) for OFT. + +## N THE USE OF LARGE LANGUAGE MODELS (LLMS) + +In this work, large language models (LLMs) are used solely as general-purpose tools to assist with writing polish.
Specifically, LLMs are employed to refine grammar, improve readability, and ensure that the overall writing style conforms to academic conventions. LLMs are not involved in research ideation, experimental design, data analysis, or conclusion formulation. All technical contributions, theoretical analyses, and experimental results are entirely original work by the authors. 29 diff --git a/docs/reft_representation_finetuning.md b/docs/reft_representation_finetuning.md new file mode 100644 index 0000000..10f606b --- /dev/null +++ b/docs/reft_representation_finetuning.md @@ -0,0 +1,970 @@ +Title: 2404.03592v3.pdf + +URL Source: https://arxiv.org/pdf/2404.03592 + +Published Time: Fri, 24 May 2024 01:13:51 GMT + +Number of Pages: 49 + +Markdown Content: +# ReFT: Representation Finetuning for Language Models + +Zhengxuan Wu ∗† Aryaman Arora ∗† Zheng Wang † Atticus Geiger ‡ + +Dan Jurafsky † Christopher D. Manning † Christopher Potts ††Stanford University ‡Pr(Ai) 2R Group + +{wuzhengx,aryamana,peterwz,atticusg}@stanford.edu {jurafsky,manning,cgpotts}@stanford.edu + +## Abstract + +Parameter-efficient finetuning (PEFT) methods seek to adapt large neural models via updates to a small number of weights . However, much prior interpretability work has shown that representations encode rich semantic information, suggesting that editing representations might be a more powerful alternative. We pursue this hypothesis by developing a family of Representation Finetuning (ReFT) + +methods. ReFT methods operate on a frozen base model and learn task-specific interventions on hidden representations. We define a strong instance of the ReFT family, Low-rank Linear Subspace ReFT (LoReFT), and we identify an ablation of this method that trades some performance for increased efficiency. Both are drop-in replacements for existing PEFTs and learn interventions that are 15 ×–65 × more parameter-efficient than LoRA. 
We showcase LoReFT on eight commonsense reasoning tasks, four arithmetic reasoning tasks, instruction-tuning, and GLUE. In all these evaluations, our ReFTs deliver the best balance of efficiency and performance, and almost always outperform state-of-the-art PEFTs. We release a generic ReFT training library publicly at https://github.com/stanfordnlp/pyreft . + +## 1 Introduction + +Pretrained language models (LMs) are frequently finetuned to adapt them to new domains or tasks [Dai and Le, 2015]. With finetuning, a single base model can be adapted to a variety of tasks given only small amounts of in-domain data. However, finetuning large LMs is expensive. Parameter-efficient finetuning (PEFT) methods propose to address the high costs of full finetuning by updating a small number of weights. This reduces memory usage and training time, and PEFTs achieve similar performance to full finetuning in many settings [Hu et al., 2023]. A hallmark of current state-of-the-art PEFTs is that they modify weights rather than representations. However, much prior interpretability work has shown that representations encode rich semantic information, suggesting that editing representations might be a more powerful alternative to weight updates. In this paper, we pursue this hypothesis by developing and motivating Representation Finetuning (ReFT). Instead of adapting model weights, ReFT methods train interventions that manipulate a small fraction of model representations in order to steer model behaviors to solve downstream tasks at inference time. ReFT methods are drop-in replacements for weight-based PEFTs. This approach is inspired by recent work in LM interpretability that intervenes on representations to find faithful causal mechanisms [Geiger et al., 2023b] and to steer model behaviours at inference time [Turner et al., 2023, Li et al., 2024], and it can be seen as a generalisation of the representation-editing work of Wu et al. [2024a], Turner et al. [2023], and Zou et al.
[2023] (see appendix B for details). + +> * + +Equal contribution. Preprint. Under review. + +> arXiv:2404.03592v3 [cs.CL] 22 May 2024 Commonsense +> LLaMA 7B LLaMA 13B Llama-2 7B Llama-3 8B +> Instruct-tuning +> Llama-2 7B +> Parameters Performance +> Arithmetic +> LLaMA 7B LLaMA 13B +> GLUE +> RoBERTa-base RoBERTa-large + +Figure 1: Parameter count vs. performance for LoReFT and other PEFTs across four benchmarks when applied to LLaMA, Llama-2, Llama-3, and RoBERTa models. Despite training far fewer parameters than existing PEFTs, LoReFT achieves competitive or even state-of-the-art performance on all tasks. Its value is most apparent for the largest models in our evaluations. Note : FT is full-parameter finetuning, which is not a PEFT or ReFT method. Additional results are in section 4. We focus on a strong and highly efficient instance of the ReFT family that we call Low-rank Linear Subspace ReFT (LoReFT ). LoReFT is a parametrisation of ReFT that intervenes on hidden representations in the linear subspace spanned by a low-rank projection matrix, building directly on the distributed alignment search (DAS) method of Geiger et al. [2023b] and Wu et al. [2023]. We also identify an ablation of this method (DiReFT) that trades some performance for increased efficiency. We evaluate our ReFTs on LLaMA-family models and small-scale LMs against existing PEFTs on standard benchmarks from four domains: commonsense reasoning, arithmetic reasoning, instruction-following, and natural language understanding. Compared to LoRA, we find that LoReFT uses 15 ×–65 × times fewer parameters while achieving state-of-the-art performance on commonsense reasoning, instruction-following, and natural language understanding against the strongest PEFTs. These findings indicate that ReFT methods are worthy of further exploration, as they may emerge as more efficient and effective alternatives to weight-based PEFTs. + +## 2 Related work + +Parameter-efficient finetuning methods (PEFTs). 
PEFTs train a fraction of the model’s parameters to adapt it to downstream tasks. We classify PEFTs into three categories: 1. Adapter-based methods train additional modules (e.g. fully-connected layers) on top of the frozen pretrained model. Series adapters insert components between LM attention or MLP layers [Houlsby et al., 2019, Pfeiffer et al., 2020, Wang et al., 2022, He et al., 2022b, Fu et al., 2021], while parallel adapters add modules alongside existing components [He et al., 2022a]. Since adapters add new components that cannot be easily folded into existing model weights, they impose an additional burden at inference time. 1 + +2. LoRA [Hu et al., 2022] and DoRA [Liu et al., 2024c] use low-rank matrices to approximate additive weight updates during training, and require no additional overhead during inference since the weight updates can be merged into the model. These are the strongest PEFTs currently. 2 + +3. Prompt-based methods add randomly-initialised soft tokens to the input (usually as a prefix) and train their embeddings while keeping the LM weights frozen [Li and Liang, 2021]. These + +> 1 + +Several very recent papers introduce new adapter architectures but do not benchmark them on the tasks we consider, or they perform hyperparameter-tuning in a different setup than done in this work. These include: LLaMA-Adapter [Zhang et al., 2024b], LLaMA-Adapter v2 [Gao et al., 2023], Aligner [Ziheng et al., 2023]. + +> 2 + +Additional methods not studied in this work: AutoLoRA [Zhang et al., 2024c], ResLoRA [Shi et al., 2024], SiRA [Zhu et al., 2023]. + +2methods are often far from optimal compared to other PEFTs, and come at the cost of significant inference overhead. A variant of this method where hidden-layer activations are also tuned was introduced as a baseline in Hu et al. [2022], with better performance. + +Representation editing. 
Recent work on activation steering and representation engineering shows that adding fixed or task-specific steering vectors [Subramani et al., 2022, Turner et al., 2023, Zou et al., 2023, Liu et al., 2024b, Vogel, 2024, Li et al., 2024] or applying concept erasure [Ravfogel et al., 2022, Belrose et al., 2023, Avitan et al., 2024, Singh et al., 2024] to the residual stream can enable a degree of control over pretrained LM generations without the need for resource-intensive finetuning [Wu et al., 2024a]. The success of these methods affirms that representations induced by pretrained LMs carry rich semantic structure. + +Interventional interpretability. Much recent work has used interventions on model-internal states to test hypotheses about how LMs implement various behaviours. In particular, interventions on linear subspaces of representations have provided increasing evidence that human-interpretable concepts are encoded linearly [Smolensky, 1986, Rumelhart et al., 1986, McClelland et al., 1986]. This includes linguistic features such as gender and number [Lasri et al., 2022, Wang et al., 2023, Hanna et al., 2023, Chintam et al., 2023, Yamakoshi et al., 2023, Hao and Linzen, 2023, Chen et al., 2023, Amini et al., 2023, Guerner et al., 2023, Arora et al., 2024], logical and mathematical reasoning [Wu et al., 2023], entity attributes [Huang et al., 2024], and a number of other domains [Mikolov et al., 2013, Elhage et al., 2022, Park et al., 2023, Nanda et al., 2023, Guerner et al., 2023]. + +## 3 ReFT + +We now define the ReFT family of methods. To do this, we first summarize the core motivation, which emerges from work on intervention-based model interpretability. We then show how this leads directly to Low-rank Linear Subspace ReFT (LoReFT). Finally, we generalize this to a family of ReFT methods. Appendix A provides a brief overview of our generic ReFT training library. 
To keep the presentation simple, we assume throughout that our target model is a Transformer-based [Vaswani et al., 2017] LM that produces contextualised representations of sequences of tokens. Given a sequence of $n$ input tokens $x = (x_1, \ldots, x_n)$, the model first embeds these into a list of representations $h^{(0)} = (h^{(0)}_1, \ldots, h^{(0)}_n)$. Then, $m$ layers successively compute the $j$-th list of hidden representations $h^{(j)}$ as a function of the previous list of hidden representations $h^{(j-1)}$. Each hidden representation is a vector $h \in \mathbb{R}^d$. The LM uses the final hidden representations $h^{(m)}$ to produce its predictions. In our experiments, we consider both autoregressive LMs and masked LMs [Devlin et al., 2019]. An autoregressive LM predicts $p(x_{n+1} \mid x_1, \ldots, x_n) = \mathrm{softmax}(W h^{(m)}_n)$, while a masked LM predicts $p(x_i \mid x_1, \ldots, x_{i-1}, x_{i+1}, \ldots, x_n) = \mathrm{softmax}(W h^{(m)}_i)$, where $W$ is a learned matrix mapping from representations to logits over the vocabulary space. + +3.1 Motivation + +In interpretability research, the framework of causal abstraction [Geiger et al., 2021] uses interchange interventions to establish the causal role of representations in deep learning models. An interchange intervention fixes a representation to the value it would take if a counterfactual input were processed by the model. Experiments investigating how such interventions affect model behavior form the evidence for claims about the causal role of a representation and the concept it encodes. To test whether a concept is encoded in a linear subspace of a representation, one may use a distributed interchange intervention (DII) [Geiger et al., 2023b]. 3 Let b be the hidden representation created at row i and column k when our model processes input b, and let s be the corresponding representation when that same model processes input s. 
A distributed interchange intervention on b + +given a counterfactual source representation s is then defined as + +DII (b, s, R) = b + R⊺(Rs − Rb ) (1) where R ∈ Rr×d is a low-rank projection matrix with orthonormal rows, d is the representation dimensionality, and r is the dimensionality of the subspace we are intervening on. We learn the subspace R using distributed alignment search (DAS), which finds the subspace that maximises the + +> 3This notion of subspace intervention was also independently discovered by Guerner et al. [2023]. + +3ReFT Intervention + +> this is some text + +# LoReFT + +> edit subspace +> (rows of R) +> RW +> hhh +> Φ(h) +> b +> -+ +> + +> RT +> edit restricted to subspace +> edit vector + +Figure 2: Illustration of ReFT . (1) The left panel depicts an intervention I: the intervention function + +Φ is applied to hidden representations at positions P in layer l. (2) The right panel depicts the intervention function used in LoReFT, which finds an edit vector that only modifies the representation in the linear subspace spanned by the rows of R. Specifically, we show how a rank-2 LoReFT operates on 3-dimensional hidden representations. probability of the expected counterfactual output after intervention [Geiger et al., 2023b]. DAS is highly expressive, and can effectively localize concepts within model representations [Wu et al., 2023, Arora et al., 2024, Wu et al., 2024c, Huang et al., 2024]. This suggests that subspace representation interventions could also be a powerful tool for model control. + +3.2 Two low-rank ReFT instantiations LoReFT. The formulation of DII in eq. (1) immediately suggests a way to control model generations via interventions. The guiding intuition is that we can learn how to perform interventions that steer the model towards predicting our task labels. The resulting method, Low-rank Linear Subspace ReFT (LoReFT), is defined by the following variant of eq. 
(1): + +$\Phi_{\mathrm{LoReFT}}(h) = h + R^{\top}(Wh + b - Rh)$ (2) This is identical to eq. (1), except we use a learned projected source $Rs = Wh + b$. LoReFT thus edits the representation in the $r$-dimensional subspace spanned by the rows of $R$ to take on the values obtained from our linear projection $Wh + b$. We depict this operation in fig. 2. The learned parameters are $\phi = \{R, W, b\}$; the parameters of the LM are frozen. As with DII, $R \in \mathbb{R}^{r \times d}$ is a low-rank matrix with orthonormal rows where $d$ is the hidden-state dimensionality and $r \leq d$ is the rank of the subspace. We further define a linear projection $W \in \mathbb{R}^{r \times d}$ and bias vector $b \in \mathbb{R}^{r}$. + +DiReFT. In addition, we define an ablation of LoReFT which removes the orthogonality constraint and the difference operation, reducing training time: + +$\Phi_{\mathrm{DiReFT}}(h) = h + W_2^{\top}(W_1 h + b)$ (3) Both $W_1, W_2 \in \mathbb{R}^{r \times d}$ are low-rank projection matrices. Note that eq. (3) resembles LoRA, and thus DiReFT can be thought of as LoRA applied directly to hidden representations at certain positions. 4 + +Empirical evidence from previous work suggests that adding orthogonal constraints to LoRA weights increases performance [Liu et al., 2024d]. (Appendix E reports results for additional ablations of LoReFT.) + +Training objective. We consider both generation tasks using decoder-only or encoder–decoder LMs and classification tasks using encoder-only models. The pretrained language model induces a distribution over token sequences $p(\cdot)$. We denote the model that results from the ReFT intervention + +$\Phi$ on $p(\cdot)$ as $p_{\Phi}(\cdot)$ with trainable parameters $\phi$. To simplify notation, we refer to the hidden representations produced by the LM on input $x$ as $h(x)$, and those by the intervened LM as $h_{\Phi}(x)$. For generation tasks, our training objective is language modelling. Given an input sequence $x = (x_1, \ldots, x_n)$ with $n$ tokens as the prompt, the goal is to predict the output sequence $y = (y_1, \ldots, y_m)$ + +> 4 LoRA is not applicable to the residual stream, which is weightless. 
LoRA can be configured to apply only to the attention layer output projection matrix, which is similar to our residual stream intervention. However, previous works found that applying LoRA only to attention layers is sub-optimal [Hu et al., 2023]. + +4with m tokens. We minimise the cross-entropy loss with teacher-forcing over all output positions. + +$\min_{\phi} \left\{ -\sum_{i=1}^{m} \log p_{\Phi}\left(y_i \mid x\, y_{<i}\right) \right\}$ (4) + +[Reconstructed; the source text between eq. (4) and the end of eq. (5) was lost in extraction:] For classification tasks, we add a classification head $H_{\theta}(\cdot)$ with parameters $\theta = \{W_o, b_o, W_d, b_d\}$ that takes a final-layer representation as input and outputs a distribution over classes: + +$H_{\theta}(\cdot \mid h) = \mathrm{softmax}\left(W_o\left(\tanh\left(W_d h^{(m)}_1 + b_d\right)\right) + b_o\right)$ (5) We learn the parameters of the head and those of the intervention function $\Phi$. We minimise the cross-entropy loss of the target class $y$ given input $x$: + +$\min_{\phi, \theta} \left\{ -\log H_{\theta}\left(y \mid h_{\Phi}(x)\right) \right\}$ (6) + +3.3 The ReFT family of methods + +It is straightforward to generalise the above intervention functions to define a family of intervention-based representation finetuning methods. We first define a general notion of intervention, i.e. the modification of hidden representations during the model forward pass: + +Definition 3.1. An intervention $I$ is a tuple $\langle \Phi, P, l \rangle$ that encapsulates a single inference-time modification of the representations computed by a Transformer-based LM. The three components of an intervention are (1) the intervention function $\Phi : \mathbb{R}^d \to \mathbb{R}^d$ with learned parameters $\phi$, (2) a set of input positions $P \subseteq \{1, \ldots, n\}$ that the intervention is applied to, and (3) the layer $l \in \{1, \ldots, m\}$ + +at which the intervention is applied. We implement the intervention $I$ as the following operation that overwrites some representations $h$: + +$h^{(l)} \leftarrow \left( \Phi\left(h^{(l)}_p\right) \text{ if } p \in P \text{ else } h^{(l)}_p \right)_{p \in 1, \ldots, n}$ (7) The intervention is applied immediately after the computation of $h^{(l)}$ and thus affects the representations computed in later layers $h^{(l+1)}, \ldots, h^{(m)}$. Figure 2 provides a schematic overview of an intervention. A ReFT is then defined as a constrained set of non-overlapping interventions: + +Definition 3.2. A ReFT method is a set of $f$ interventions $\mathbf{I} = \{I_1, \ldots, I_f\}$. 
We enforce that for any two interventions Ij , I k ∈ I such that they operate on the same layer lj = lk, their intervention positions must be disjoint, i.e. Pj ∩ Pk = ∅. The parameters (ϕ1, . . . , ϕ f ) of all of the intervention functions are independent. ReFT is thus a generic framework encompassing interventions on hidden representations during the model forward pass. In appendix B, we show how a variety of existing inference-time intervention methods can be described within this framework. + +## 4 Experiments + +To evaluate our ReFTs against existing PEFTs, we conduct experiments across four diverse NLP benchmarks covering more than 20 datasets (extensive details on our datasets are in appendix C). Our goal is to provide a rich picture of how LoReFT and DiReFT perform in different scenarios. We experiment with both masked and autoregressive LMs at different scales, ranging from RoBERTa-base [Liu et al., 2019] with 125M to LLaMA models [Touvron et al., 2023a,b] with 13B parameters. We benchmark against existing PEFTs such as prefix-tuning [Li and Liang, 2021], adapter-tuning with both Series Adapters and Parallel Adapters, BitFit [Ben Zaken et al., 2022], RED [Wu et al., 2024a], LoRA [Hu et al., 2022], and DoRA [Liu et al., 2024c]. Our comparisons focus on both performance and parameter efficiency. In our comparisons, we use hyperparameter-tuned scores from previous works when possible. We load our base LMs in torch.bfloat16 to save memory. All of our experiments are run with a single GPU: NVIDIA A100 40G/80G or RTX 6000 . Examples of raw model generations are in appendix I. 54.1 Hyperparameter configuration + +For our experiments, we must decide how many interventions to learn and which layers and input positions to apply each one on. We propose learning interventions on a fixed number of p prefix and + +s suffix positions in the prompt. Specifically, we tune four hyperparameters: 1. The number of prefix positions p to intervene on, i.e. positions {1, . . . 
, p }.2. The number of suffix positions s to intervene on, i.e. positions {n − s + 1, . . . , n }.3. Which set of layers L to intervene on. 4. Whether or not to tie intervention parameters ϕ across different positions in the same layer. This simplifies the hyperparameter search space; compared to LoRA, the only additional consideration is which positions to intervene on. Since the number of positions edited is constant, LoReFT and DiReFT contribute a fixed additional inference cost that does not scale with prompt length. Given the positions P = {1, . . . , p } ∪ {n − s + 1, . . . , n }, we define the untied and tied variants: + +Iuntied = {⟨ Φ, {p}, l ⟩ ∣ p ∈ P, l ∈ L} Itied = {⟨ Φ, P, l ⟩ ∣ l ∈ L} + +Additionally, when applying LoReFT and DiReFT to a prompt with length n where n < p + s, we set p ← min (p, ⌊n/2⌋) and s ← min (s, ⌈n/2⌉) and do not apply the truncated interventions in Iuntied .We also tune neural-network training hyperparameters. Unlike previous work [Hu et al., 2022, 2023, Liu et al., 2024c] where hyperparameter tuning may involve optimising performance directly on test sets, we only tune our hyperparameters on development sets which do not contain any overlapping examples with the test sets of our tasks. We further describe hyperparameter tuning for each benchmark in appendix D.1. + +4.2 Commonsense reasoning + +We replicate the experimental setup in Hu et al. [2023] and finetune LLaMA-1 7B/13B, Llama-2 7B, and Llama-3 8B 5 on a combined dataset of eight commonsense reasoning tasks (C OMMONSENSE 170K). We report scores on each task’s test set individually. We compare with PEFTs benchmarked in Hu et al. [2023] as well as the identical experiment reported in Liu et al. [2024c] for DoRA. + +Datasets. 
Our benchmark contains eight commonsense reasoning datasets, including BoolQ [Clark et al., 2019], PIQA [Bisk et al., 2020], SIQA [Sap et al., 2019], HellaSwag [Zellers et al., 2019], WinoGrande [Sakaguchi et al., 2021], ARC-e, ARC-c [Clark et al., 2018], and OBQA [Mihaylov et al., 2018]. Examples are formulated as multiple-choice problems where the model needs to directly generate the correct choice without rationales. We use the same prompt template as in Hu et al. [2023] with additional string normalisation (removing leading and trailing whitespace). + +Hyperparameter tuning. We do not do hyperparameter selection based on test set results. Rather, we use the hyperparameter settings of the model that performs best on a development set created from the GSM8K training set, except we use a lower number of epochs (6 instead of 12) because the COMMONSENSE170K training set is more than 20 times larger than GSM8K. This allows us to tune relevant hyperparameters, and also serves to test the robustness of these settings across different domains. We additionally report scores on 3 epochs in appendix D.3. + +Results. We report results in table 1. LoReFT sets state-of-the-art performance on the commonsense reasoning tasks, outperforming all other methods by a considerable margin. While being more compute-efficient, DiReFT achieves only slightly worse performance consistently. + +4.3 Arithmetic reasoning + +Similar to the previous experiment, we follow the experimental setup in Hu et al. [2023] and finetune LLaMA-1 7B and 13B on a combined dataset of seven arithmetic reasoning tasks with LM-generated chain-of-thought steps (MATH10K) and report scores on four of the tasks’ test sets. We only evaluate correctness on the final numeric or multiple-choice answer. + +> 5Llama-3 8B appeared on April 18, 2024, and thus we had time to complete only commonsense reasoning experiments with this model. Liu et al. [2024c] report corresponding results for LoRA and DoRA. 
+ +6Table 1: Accuracy comparison of LLaMA-1 7B/13B, Llama-2 7B and Llama-3 8B against existing PEFT methods on eight commonsense reasoning datasets. ∗Performance results of all baseline methods are taken from Liu et al. [2024c]. We report averaged performance of three runs with distinct random seeds for our method. For our methods, Param. (%) is calculated by dividing the number of trainable parameters by the number of parameters of the base LM. + +> Model PEFT Params (%) Accuracy (↑) +> BoolQ PIQA SIQA HellaS. WinoG. ARC-e ARC-c OBQA Avg. +> ChatGPT ∗——73.1 85.4 68.5 78.5 66.1 89.8 79.9 74.8 77.0 LLaMA-7B PrefT ∗0.039% 64.3 76.8 73.9 42.1 72.1 72.9 54.0 60.6 64.6 Adapter S∗1.953% 63.0 79.2 76.3 67.9 75.7 74.5 57.1 72.4 70.8 Adapter P∗3.542% 67.9 76.4 78.8 69.8 78.9 73.7 57.3 75.2 72.3 LoRA ∗0.826% 68.9 80.7 77.4 78.1 78.8 77.8 61.3 74.8 74.7 DoRA (half) ∗0.427% 70.0 82.6 79.7 83.2 80.6 80.6 65.4 77.6 77.5 DoRA ∗0.838% 68.5 82.9 79.6 84.8 80.8 81.4 65.8 81.0 78.1 +> DiReFT (ours) 0.031% 69.5 83.0 79.0 92.5 80.5 82.2 68.0 77.5 79.0 +> LoReFT (ours) 0.031% 69.3 84.4 80.3 93.1 84.2 83.2 68.2 78.9 80.2 +> LLaMA-13B PrefT ∗0.031% 65.3 75.4 72.1 55.2 68.6 79.5 62.9 68.0 68.4 Adapter S∗1.586% 71.8 83.0 79.2 88.1 82.4 82.5 67.3 81.8 79.5 Adapter P∗2.894% 72.5 84.9 79.8 92.1 84.7 84.2 71.2 82.4 81.5 LoRA ∗0.670% 72.1 83.5 80.5 90.5 83.7 82.8 68.3 82.4 80.5 DoRA (half) ∗0.347% 72.5 85.3 79.9 90.1 82.9 82.7 69.7 83.6 80.8 DoRA ∗0.681% 72.4 84.9 81.5 92.4 84.2 84.2 69.6 82.8 81.5 +> DiReFT (ours) 0.025% 71.3 86.1 80.8 94.6 83.6 85.5 72.9 82.7 82.2 +> LoReFT (ours) 0.025% 72.1 86.3 81.8 95.1 87.2 86.2 73.7 84.2 83.3 +> Llama-2 7B LoRA ∗0.826% 69.8 79.9 79.5 83.6 82.6 79.8 64.7 81.0 77.6 DoRA (half) ∗0.427% 72.0 83.1 79.9 89.1 83.0 84.5 71.0 81.2 80.5 DoRA ∗0.838% 71.8 83.7 76.0 89.1 82.6 83.7 68.2 82.4 79.7 +> DiReFT (ours) 0.031% 70.8 83.6 80.2 93.6 82.1 84.8 70.4 81.5 80.9 +> LoReFT (ours) 0.031% 71.1 83.8 80.8 94.3 84.5 85.6 72.2 82.3 81.8 +> Llama-3 8B LoRA ∗0.700% 70.8 
85.2 79.9 91.7 84.3 84.2 71.2 79.0 80.8 DoRA (half) ∗0.361% 74.5 88.8 80.3 95.5 84.7 90.1 79.1 87.2 85.0 DoRA ∗0.710% 74.6 89.3 79.9 95.5 85.6 90.5 80.4 85.8 85.2 +> DiReFT (ours) 0.026% 73.4 88.7 81.0 95.6 85.5 91.8 81.8 85.4 85.4 +> LoReFT (ours) 0.026% 75.1 90.2 82.0 96.3 87.4 92.4 81.6 87.5 86.6 + +Hyperparameter tuning. We use the same hyperparameter settings as for the Commonsense Reasoning benchmark, but with 12 epochs for training. We also report scores on 3 epochs. + +Datasets. Our benchmark contains four datasets for math word problems, including AQuA [Ling et al., 2017], GSM8K [Cobbe et al., 2021], MAWPS [Koncel-Kedziorski et al., 2016], and SVAMP [Patel et al., 2021]. Models need to generate chain-of-thought [Wei et al., 2022] before the final answer. We use the same prompt template and hyperparameter settings as in the previous experiment. + +Results. We report results in table 2. We find that both LoReFT and DiReFT do not perform as well at arithmetic reasoning tasks compared to LoRA and adapters, but do outperform prefix-tuning. Our results suggest that our ReFTs may have more trouble on chain-of-thought reasoning than the single-step commonsense reasoning tasks due to the length of generations (greater length necessarily reduces the effect of the intervention) and overall greater difficulty of the task. Our results show that our ReFTs perform better with the 13B model than the 7B model, which suggests that our methods scale with model size. Overall, we note that the arithmetic reasoning results show a lot of variation, with no single method emerging as a clear winner across all of them. + +4.4 Instruction-following + +Base LMs require instruction finetuning to follow human prompts [Ouyang et al., 2022]. We follow the experimental setup in Wu et al. [2024a] and finetune Llama-2 7B with Ultrafeedback [Cui et al., 2023]. We compare against full parameter finetuning, LoRA, and RED. 
For evaluation, we use Alpaca-Eval v1.0 [Li et al., 2023], which computes the win-rate against text-davinci-003 using GPT-4 as the annotator. We use the same prompt template as in Taori et al. [2023]. + +Datasets. Ultrafeedback is a high-quality instruction dataset where responses are generated via scoring a diverse set of model responses from a list of candidates (e.g. ChatGPT and Bard). The score is calculated as a weighted score of instruction-following, truthfulness, honesty, and helpfulness. 7Table 2: Accuracy comparison of LLaMA-1 7B/13B against existing PEFT methods on four arithmetic reasoning datasets. ∗Performance results of all baseline methods are taken from Hu et al. [2023]. We report averaged performance of three runs with distinct random seeds for our method. + +> Model PEFT Params (%) Accuracy (↑) +> AQuA GSM8K MAWPS SVAMP Avg. +> LLaMA-7B PrefT ∗0.039% 14.2 24.4 63.4 38.1 35.0 Adapter S∗1.953% 15.0 33.3 77.7 52.3 44.6 Adapter P∗3.542% 18.1 35.3 82.4 49.6 46.4 LoRA ∗0.826% 18.9 37.5 79.0 52.1 46.9 DiReFT (ours) 0.031% 21.3 24.1 74.5 42.7 40.6 +> LoReFT (ours) 0.031% 21.4 26.0 76.2 46.8 42.6 LLaMA-13B PrefT ∗0.031% 15.7 31.1 66.8 41.4 38.8 Adapter S∗1.586% 22.0 44.0 78.6 50.8 48.9 Adapter P∗2.894% 20.5 43.3 81.1 55.7 50.2 LoRA ∗0.670% 18.5 47.5 83.6 54.6 51.1 DiReFT (ours) 0.025% 20.5 35.8 80.8 54.8 48.0 +> LoReFT (ours) 0.025% 23.6 38.1 82.4 54.2 49.6 + +Table 3: Instruction tuning evaluation results for instruction-tuned Llama-2 7B with Alpaca-Eval v1.0. We report averaged performance of two runs with distinct random seeds for our method. half + +denotes our runs with half of the rank; 1K denotes our runs with a low-resource setting where there are only 1K training examples. †Performance results of baseline methods are taken from Li et al. [2023]. ∗Performance results of baseline methods are taken from Wu et al. [2024a]. ‡It takes 18 minutes to train our Llama-2 Chat 7B on 1K examples using a single A100 40G GPU with + +≈1MB parameters on disk. 
+ +Model & PEFT Params (%) Win-rate (↑)GPT-3.5 Turbo 1106 † — 86.30 Llama-2 Chat 13B † — 81.10 Llama-2 Chat 7B † — 71.40 Llama-2 7B & FT ∗ 100% 80.93 Llama-2 7B & LoRA ∗ 0.1245% 81.48 Llama-2 7B & RED ∗ 0.0039% 81.69 Llama-2 7B & DiReFT (ours) 0.0039% 84.85 Llama-2 7B & LoReFT (ours) 0.0039% 85.60 + +Llama-2 7B & LoReFT (ours, half ) 0.0019% 84.12 Llama-2 7B & LoReFT (ours, 1K )‡ 0.0039% 81.91 + +Some of the best 7B and 13B chat-models (e.g. UltraLM-13B [Ding et al., 2023]) are finetuned with Ultrafeedback. + +Hyperparameter tuning. We do hyperparameter-tuning on the unseen instruction-following dataset Alpaca-52K [Taori et al., 2023] with only LLaMA-7B to prevent test-set hill-climbing. We then use the hyperparameter settings of our best performing model to finetune on Ultrafeedback. For hyperparameter tuning, we use Alpaca-Eval v1.0 with GPT-4 turbo as the annotator for fast turnaround, which also prevents overfitting with GPT-4 as a judge. + +Results. We report results in table 3. When matched in parameter count to the previous most parameter-efficient PEFT (RED) and trained on Llama-2 7B, LoReFT outperforms all reported finetuning methods (including full finetuning) and achieves a win-rate within 1% of GPT-3.5 Turbo 1106. Furthermore, after halving the parameter count or using only 1/64-th of the data, LoReFT still outperforms other finetuning methods. This result shows that LoReFT can succeed at long-form text generation. DiReFT is again slightly worse than LoReFT but is highly competitive. 6 + +> 6 + +We release our ReFT weights ( <1MB) of our instruction-tuned model through HuggingFace and provide a tutorial at https://github.com/stanfordnlp/pyreft/blob/main/examples/chat . + +8Table 4: Accuracy comparison of RoBERTa-base and RoBERTa-large against existing PEFT methods on the GLUE benchmark. ∗Performance results of all baseline methods are taken from Wu et al. [2024a]. We report averaged performance of five runs with distinct random seeds for our method. 
+ +> Model PEFT Params (%) Accuracy (↑) +> MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B Avg. +> base FT 100% 87.3 94.4 87.9 62.4 92.5 91.7 78.3 90.6 85.6 Adapter ∗0.318% 87.0 93.3 88.4 60.9 92.5 90.5 76.5 90.5 85.0 +> LoRA ∗0.239% 86.6 93.9 88.7 59.7 92.6 90.4 75.3 90.3 84.7 Adapter FNN ∗0.239% 87.1 93.0 88.8 58.5 92.0 90.2 77.7 90.4 84.7 BitFit ∗0.080% 84.7 94.0 88.0 54.0 91.0 87.3 69.8 89.5 82.3 RED ∗0.016% 83.9 93.9 89.2 61.0 90.7 87.2 78.0 90.4 84.3 +> DiReFT (ours) 0.015% 82.5 92.6 88.3 58.6 91.3 86.4 76.4 89.3 83.2 +> LoReFT (ours) 0.015% 83.1 93.4 89.2 60.4 91.2 87.4 79.0 90.0 84.2 large FT 100% 88.8 96.0 91.7 68.2 93.8 91.5 85.8 92.6 88.6 Adapter ∗0.254% 90.1 95.2 90.5 65.4 94.6 91.4 85.3 91.5 88.0 LoRA ∗0.225% 90.2 96.0 89.8 65.5 94.7 90.7 86.3 91.7 88.1 Adapter FNN ∗0.225% 90.3 96.1 90.5 64.4 94.3 91.3 84.8 90.2 87.7 RED ∗0.014% 89.5 96.0 90.3 68.1 93.5 88.8 86.2 91.3 88.0 +> DiReFT (ours) 0.014% 88.7 95.4 88.5 66.7 93.9 88.1 86.9 91.2 87.4 +> LoReFT (ours) 0.014% 89.2 96.2 90.1 68.0 94.1 88.5 87.5 91.6 88.2 + +4.5 Natural language understanding + +We evaluate LoReFT on the GLUE benchmark [Wang et al., 2018] against existing PEFTs. We use this set of experiments to show LoReFT works well even with small-scale LMs, and can improve representations for classification tasks and not just text generation. We finetune RoBERTa-base (125M) as well as RoBERTa-large (350M) on GLUE, a sequence classification benchmark for natural language understanding (NLU) which covers domains such as sentiment classification and natural language inference. Details about the GLUE benchmark can be found in its original paper. We follow Wu et al. [2024a] for proper evaluation on GLUE validation set: we split the validation set into two sets guarded by a random seed, and we pick the best model with highest in-training validation accuracy to evaluate on the other held-out half for testing accuracy. + +Hyperparameter tuning. We tune our hyperparameters for each task separately. 
which is standard for PEFTs. To avoid overfitting to random seeds, we hyperparameter-tune our models with a constant seed, and report averaged results over that and four additional unseen seeds. We describe hyperparameter tuning experiments in Appendix D.1. + +Results. We report results in table 4. LoReFT obtains comparable performance with PEFT methods on both model sizes when parameter matched with RED, the previous most parameter-efficient PEFT for this task. Furthermore, DiReFT achieves worse performance than most of the PEFTs, suggesting LoReFT is a better choice when the LM is small. Full results with standard deviations are in table 13. We additionally compare against VeRA [Kopiczko et al., 2024] in appendix D.3. + +## 5 Limitations + +Due to limited resources, we mainly explored the LLaMA-family of models. In future work, we hope to explore the effectiveness of ReFT on other model families as well as vision–language models such as LLaVA [Liu et al., 2024a]. The capabilities of ReFT have not yet been fully explored due to the large hyperparameter search space; we are interested in automating this search. We provide some initial explorations of LM personalisation with ReFT in a few-shot setting in appendix G.2. We hope to explore why ReFT works, and we provide some of our early explorations focused on memorisation (appendix F.1, appendix F.2). We are also investigating whether learned orthogonal subspaces can be composed together without adaptation. Some encouraging initial findings are in appendix G.1. + +ReFT, abstraction, and generation. Neural network interpretability research often struggles to contribute directly to improving models. With ReFT, we have shown one way to overcome this challenge. The ReFT framework is rooted in work on causal abstraction [Geiger et al., 2023a] for model interpretability, and LoReFT builds directly on the distributed interchange intervention method 9of Geiger et al. [2023b] and Wu et al. [2023]. 
See also the interchange intervention training (IIT) method of Geiger et al. [2022], Wu et al. [2022], Huang et al. [2023c]. In a similar vein, recent work also uses representation-based editing of the Transformer stream to steer model behavior [Li et al., 2024, Zou et al., 2023]. ReFT advances this line of work by showing one way that such steering can be learned, rather than being merely a post hoc analysis step. The precise ways in which ReFT works deserve deeper exploration. Although these methods intervene on representations, the causal effect of such interventions may only emerge in the model’s upstream computations. In other words, the power of ReFT may come from the fact that it creates new causal pathways or modifies the strength of some existing ones. We leave it to future research to track these effects, and perhaps to explore more structured ReFTs to modify complex causal pathways in LMs. + +ReFT and model interpretability. ReFT relies on insights from work on interpretability, and it may also be able to contribute insights back to that field. In particular, LoReFT shows that training a set of low-rank interventions on selected residual streams can induce a base LM to follow instructions (section 4.4). In other words, a linear subspace distributed across a set of neurons can achieve generalised control over a vast number of tasks. This is a serious challenge to work seeking to interpret individual neurons in isolation (for related criticisms, see Huang et al. 2023b). The success of ReFT suggests to us a quite different approach to interpretability, one that starts from the assumption that neurons will play different roles in different contexts. + +Evaluation practices in PEFT research. In this work, we hyperparameter-tune ReFT on development sets that do not overlap with the test set. Unfortunately, a considerable portion of the literature on PEFTs directly hill-climbs performance on test sets. 
This results in overfitting to specific tasks, which gives practitioners less certainty about the real-world performance of different methods and impedes fair comparison. We hope that future work can introduce benchmarks for evaluating PEFTs and ReFTs. These should allow for compute- or time-matched hyperparameter-tuning comparisons, and they should disallow any kind of tuning or model selection based on the test set. + +## 6 Conclusion + +We propose a strong alternative to PEFTs, LoReFT, and we identify an ablation of this method, DiReFT, that trades some performance for increased efficiency. Overall, LoReFT achieves strong performance across benchmarks from four domains while being 15×–65× more efficient than LoRA. Notably, LoReFT establishes new state-of-the-art performance on commonsense reasoning, instruction-following, and natural language understanding against the strongest PEFTs. We also show how our method can be described under a generic framework – ReFT. ReFT is a new approach to finetuning that is more powerful, more parameter-efficient, and more interpretable than any existing PEFTs. + +## Acknowledgements + +We thank Jing Huang for helpful discussion in designing our memorisation tests as well as writing. We thank Chenglei Si, Harshit Joshi, Jordan Juravsky, Julie Kallini, Ken Liu, Rohan Pandey, Jiuding Sun, Leonard Tang, Tristan Thrush, Shengguang Wu, Qinan Yu, Yanzhe Zhang, Amir Zur, and Shiqi Chen for helpful discussion about the project and comments on the manuscript. + +## References + +Afra Amini, Tiago Pimentel, Clara Meister, and Ryan Cotterell. Naturalistic causal probing for morpho-syntax. Transactions of the Association for Computational Linguistics , 11:384–403, 2023. doi: 10.1162/tacl_a_00554. URL https://aclanthology.org/2023.tacl-1.23 . Aryaman Arora, Dan Jurafsky, and Christopher Potts. CausalGym: Benchmarking causal interpretability methods on linguistic tasks. arXiv:2402.12560 , 2024. 
URL https://arxiv.org/abs/ 2402.12560 .Matan Avitan, Ryan Cotterell, Yoav Goldberg, and Shauli Ravfogel. What changed? Converting representational interventions to natural language. arXiv:2402.11355 , 2024. URL https://arxiv. org/abs/2402.11355 .10 Nora Belrose, David Schneider-Joseph, Shauli Ravfogel, Ryan Cotterell, Edward Raff, and Stella Biderman. LEACE: Perfect linear concept erasure in closed form. Advances in Neural Information Processing Systems , 36, 2023. URL https://proceedings.neurips.cc/paper_files/paper/2023/ file/d066d21c619d0a78c5b557fa3291a8f4-Paper-Conference.pdf .Elad Ben Zaken, Yoav Goldberg, and Shauli Ravfogel. BitFit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers) , pages 1–9, Dublin, Ireland, May 2022. Association for Computational Linguistics. doi: 10.18653/v1/2022.acl-short.1. URL https: //aclanthology.org/2022.acl-short.1 .Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. PIQA: Reasoning about physical commonsense in natural language. In Proceedings of the AAAI Conference on Artificial Intelligence ,volume 34, pages 7432–7439, 2020. URL https://arxiv.org/abs/1911.11641 .Lewis Carroll. Alice’s Adventures in Wonderland . Macmillan, London, 1865. Angelica Chen, Ravid Schwartz-Ziv, Kyunghyun Cho, Matthew L. Leavitt, and Naomi Saphra. Sudden drops in the loss: Syntax acquisition, phase transitions, and simplicity bias in MLMs. + +arXiv:2309.07311 , 2023. URL https://arxiv.org/abs/2309.07311v4 .Abhijith Chintam, Rahel Beloch, Willem Zuidema, Michael Hanna, and Oskar van der Wal. Identifying and adapting transformer-components responsible for gender bias in an English language model. 
In Yonatan Belinkov, Sophie Hao, Jaap Jumelet, Najoung Kim, Arya Mc-Carthy, and Hosein Mohebbi, editors, Proceedings of the 6th BlackboxNLP Workshop: Ana-lyzing and Interpreting Neural Networks for NLP , pages 379–394, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.blackboxnlp-1.29. URL + +> https://aclanthology.org/2023.blackboxnlp-1.29 + +.Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova. BoolQ: Exploring the surprising difficulty of natural yes/no questions. In Jill Burstein, Christy Doran, and Thamar Solorio, editors, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Lan-guage Technologies, Volume 1 (Long and Short Papers) , pages 2924–2936, Minneapolis, Min-nesota, June 2019. Association for Computational Linguistics. doi: 10.18653/v1/N19-1300. URL + +> https://aclanthology.org/N19-1300 + +.Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. Think you have solved question answering? Try ARC, the AI2 reasoning challenge. arXiv:1803.05457 , 2018. URL https://arxiv.org/abs/1803.05457 .Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to solve math word problems. arXiv:2110.14168 , 2021. URL https://arxiv.org/abs/2110.14168 .Ganqu Cui, Lifan Yuan, Ning Ding, Guanming Yao, Wei Zhu, Yuan Ni, Guotong Xie, Zhiyuan Liu, and Maosong Sun. UltraFeedback: Boosting language models with high-quality feedback. + +arXiv:2310.01377 , 2023. URL https://arxiv.org/abs/2310.01377 .Andrew M. Dai and Quoc V. Le. Semi-supervised sequence learning. In Advances in Neural Information Processing Systems , volume 28. Curran Associates, Inc., 2015. 
URL https:// proceedings.neurips.cc/paper/2015/hash/7137debd45ae4d0ab9aa953017286b20-Abstract.html .Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of deep bidirectional transformers for language understanding. In Jill Burstein, Christy Doran, and Thamar Solorio, editors, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) , pages 4171–4186, Minneapolis, Minnesota, June 2019. Association for Computational Linguistics. doi: 10.18653/v1/N19-1423. URL https://aclanthology.org/ N19-1423 .11 Ning Ding, Yulin Chen, Bokai Xu, Yujia Qin, Shengding Hu, Zhiyuan Liu, Maosong Sun, and Bowen Zhou. Enhancing chat language models by scaling high-quality instructional conversations. In Houda Bouamor, Juan Pino, and Kalika Bali, editors, Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing , pages 3029–3051, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.emnlp-main.183. URL + +> https://aclanthology.org/2023.emnlp-main.183 + +.Nelson Elhage, Tristan Hume, Catherine Olsson, Nicholas Schiefer, Tom Henighan, Shauna Kravec, Zac Hatfield-Dodds, Robert Lasenby, Dawn Drain, Carol Chen, Roger Grosse, Sam McCandlish, Jared Kaplan, Dario Amodei, Martin Wattenberg, and Christopher Olah. Toy models of superpo-sition. Transformer Circuits Thread , 2022. URL https://transformer-circuits.pub/2022/toy_ model/index.html .Stanislav Fort. Scaling laws for adversarial attacks on language model activations, 2023. URL + +> http://arxiv.org/abs/2312.02780 + +.Cheng Fu, Hanxian Huang, Xinyun Chen, Yuandong Tian, and Jishen Zhao. Learn-to-Share: A hardware-friendly transfer learning framework exploiting computation and parameter sharing. 
In Marina Meila and Tong Zhang, editors, Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event , volume 139 of Proceedings of Machine Learning Research , pages 3469–3479. PMLR, 2021. URL http://proceedings.mlr. press/v139/fu21a.html .Peng Gao, Jiaming Han, Renrui Zhang, Ziyi Lin, Shijie Geng, Aojun Zhou, Wei Zhang, Pan Lu, Conghui He, Xiangyu Yue, et al. LLaMA-Adapter v2: Parameter-efficient visual instruction model. + +arXiv:2304.15010 , 2023. URL https://arxiv.org/abs/2304.15010 .Atticus Geiger, Hanson Lu, Thomas Icard, and Christopher Potts. Causal abstractions of neural networks. In M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan, editors, Advances in Neural Information Processing Systems , volume 34, pages 9574–9586. Curran Associates, Inc., 2021. URL https://proceedings.neurips.cc/paper_files/paper/2021/file/ 4f5c422f4d49a5a807eda27434231040-Paper.pdf .Atticus Geiger, Zhengxuan Wu, Hanson Lu, Josh Rozner, Elisa Kreiss, Thomas Icard, Noah Good-man, and Christopher Potts. Inducing causal structure for interpretable neural networks. In Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato, editors, Proceedings of the 39th International Conference on Machine Learning , volume 162 of + +Proceedings of Machine Learning Research , pages 7324–7338. PMLR, 17–23 Jul 2022. URL + +> https://proceedings.mlr.press/v162/geiger22a.html + +.Atticus Geiger, Chris Potts, and Thomas Icard. Causal abstraction for faithful model interpretation. + +arXiv:2301.04709 , 2023a. URL https://arxiv.org/abs/2301.04709 .Atticus Geiger, Zhengxuan Wu, Christopher Potts, Thomas Icard, and Noah D. Goodman. Find-ing alignments between interpretable causal variables and distributed neural representations. + +arXiv:2303.02536 , 2023b. URL https://arxiv.org/abs/2303.02536 .Clément Guerner, Anej Svete, Tianyu Liu, Alexander Warstadt, and Ryan Cotterell. 
A geometric notion of causal probing. arXiv:2307.15054 , 2023. URL https://arxiv.org/abs/2307.15054 .Michael Hanna, Yonatan Belinkov, and Sandro Pezzelle. When language models fall in love: Animacy processing in transformer language models. In Houda Bouamor, Juan Pino, and Kalika Bali, editors, + +Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing ,pages 12120–12135, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.emnlp-main.744. URL https://aclanthology.org/2023.emnlp-main.744 .Sophie Hao and Tal Linzen. Verb conjugation in transformers is determined by linear encodings of subject number. In Houda Bouamor, Juan Pino, and Kalika Bali, editors, Findings of the Association for Computational Linguistics: EMNLP 2023 , pages 4531–4539, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.findings-emnlp.300. URL + +> https://aclanthology.org/2023.findings-emnlp.300 + +.12 Junxian He, Chunting Zhou, Xuezhe Ma, Taylor Berg-Kirkpatrick, and Graham Neubig. Towards a unified view of parameter-efficient transfer learning. In The Tenth International Conference on Learning Representations, ICLR 2022 , Virtual Event, 2022a. URL https://openreview.net/ forum?id=0RDcd5Axok .Shwai He, Liang Ding, Daize Dong, Jeremy Zhang, and Dacheng Tao. SparseAdapter: An easy approach for improving the parameter-efficiency of adapters. In Yoav Goldberg, Zornitsa Kozareva, and Yue Zhang, editors, Findings of the Association for Computational Linguistics: EMNLP 2022 , pages 2184–2190, Abu Dhabi, United Arab Emirates, December 2022b. As-sociation for Computational Linguistics. doi: 10.18653/v1/2022.findings-emnlp.160. URL + +> https://aclanthology.org/2022.findings-emnlp.160 + +.Mohammad Javad Hosseini, Hannaneh Hajishirzi, Oren Etzioni, and Nate Kushman. Learning to solve arithmetic word problems with verb categorization. 
In Alessandro Moschitti, Bo Pang, and Walter Daelemans, editors, Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 523–533, Doha, Qatar, October 2014. Association for Computational Linguistics. doi: 10.3115/v1/D14-1058. URL https://aclanthology.org/ D14-1058 .Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. Parameter-efficient transfer learning for NLP. In Kamalika Chaudhuri and Ruslan Salakhutdinov, editors, Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9-15 June 2019, Long Beach, California, USA ,volume 97 of Proceedings of Machine Learning Research , pages 2790–2799. PMLR, 2019. URL + +> http://proceedings.mlr.press/v97/houlsby19a.html + +.Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. LoRA: Low-rank adaptation of large language models. In The Tenth In-ternational Conference on Learning Representations, ICLR 2022 , Virtual Event, 2022. URL + +> https://openreview.net/forum?id=nZeVKeeFYf9 + +.Zhiqiang Hu, Lei Wang, Yihuai Lan, Wanyu Xu, Ee-Peng Lim, Lidong Bing, Xing Xu, Soujanya Poria, and Roy Lee. LLM-adapters: An adapter family for parameter-efficient fine-tuning of large language models. In Houda Bouamor, Juan Pino, and Kalika Bali, editors, Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing , pages 5254–5276, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.emnlp-main. 319. URL https://aclanthology.org/2023.emnlp-main.319 .Chengsong Huang, Qian Liu, Bill Yuchen Lin, Tianyu Pang, Chao Du, and Min Lin. LoraHub: Efficient cross-task generalization via dynamic lora composition. arXiv:2307.13269 , 2023a. URL + +> https://arxiv.org/abs/2307.13269 + +.Jing Huang, Atticus Geiger, Karel D’Oosterlinck, Zhengxuan Wu, and Christopher Potts. 
Rigorously assessing natural language explanations of neurons. In Yonatan Belinkov, Sophie Hao, Jaap Jumelet, Najoung Kim, Arya McCarthy, and Hosein Mohebbi, editors, Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP, pages 317–331, Singapore, December 2023b. Association for Computational Linguistics. doi: 10.18653/v1/2023.blackboxnlp-1.24. URL https://aclanthology.org/2023.blackboxnlp-1.24 . Jing Huang, Zhengxuan Wu, Kyle Mahowald, and Christopher Potts. Inducing character-level structure in subword-based language models with type-level interchange intervention training. In Anna Rogers, Jordan Boyd-Graber, and Naoaki Okazaki, editors, Findings of the Association for Computational Linguistics: ACL 2023, pages 12163–12180, Toronto, Canada, July 2023c. Association for Computational Linguistics. doi: 10.18653/v1/2023.findings-acl.770. URL https://aclanthology.org/2023.findings-acl.770 . Jing Huang, Zhengxuan Wu, Christopher Potts, Mor Geva, and Atticus Geiger. RAVEL: Evaluating interpretability methods on disentangling language model representations. arXiv:2402.17700, 2024. URL https://arxiv.org/abs/2402.17700 . Rik Koncel-Kedziorski, Hannaneh Hajishirzi, Ashish Sabharwal, Oren Etzioni, and Siena Dumas Ang. Parsing algebraic word problems into equations. Transactions of the Association for Computational + +13 Linguistics, 3:585–597, 2015. doi: 10.1162/tacl_a_00160. URL https://aclanthology.org/Q15-1042 . Rik Koncel-Kedziorski, Subhro Roy, Aida Amini, Nate Kushman, and Hannaneh Hajishirzi. MAWPS: A math word problem repository. In Kevin Knight, Ani Nenkova, and Owen Rambow, editors, + +Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 1152–1157, San Diego, California, June 2016. Association for Computational Linguistics. doi: 10.18653/v1/N16-1136. 
URL + +> https://aclanthology.org/N16-1136 + +.Dawid Jan Kopiczko, Tijmen Blankevoort, and Yuki M Asano. VeRA: Vector-based random matrix adaptation. In The Twelfth International Conference on Learning Representations, ICLR 2024 ,2024. URL https://openreview.net/forum?id=NjNfLdxr3A .Karim Lasri, Tiago Pimentel, Alessandro Lenci, Thierry Poibeau, and Ryan Cotterell. Prob-ing for the usage of grammatical number. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Com-putational Linguistics (Volume 1: Long Papers) , pages 8818–8831, Dublin, Ireland, May 2022. Association for Computational Linguistics. doi: 10.18653/v1/2022.acl-long.603. URL + +> https://aclanthology.org/2022.acl-long.603 + +.Hector Levesque, Ernest Davis, and Leora Morgenstern. The Winograd Schema Challenge. In + +Proceedings of the Thirteenth International Conference on Principles of Knowledge Representation and Reasoning , 2012. URL https://cdn.aaai.org/ocs/4492/4492-21843-1-PB.pdf .Kenneth Li, Oam Patel, Fernanda Viégas, Hanspeter Pfister, and Martin Wattenberg. Inference-time intervention: Eliciting truthful answers from a language model. Advances in Neural Information Processing Systems , 36, 2024. URL https://proceedings.neurips.cc/paper_files/paper/2023/ hash/81b8390039b7302c909cb769f8b6cd93-Abstract-Conference.html .Margaret Li, Suchin Gururangan, Tim Dettmers, Mike Lewis, Tim Althoff, Noah A. Smith, and Luke Zettlemoyer. Branch-train-merge: Embarrassingly parallel training of expert language models. + +arXiv:2208.03306 , 2022. URL https://arxiv.org/abs/2208.03306 .Xiang Lisa Li and Percy Liang. Prefix-tuning: Optimizing continuous prompts for generation. 
In Chengqing Zong, Fei Xia, Wenjie Li, and Roberto Navigli, editors, Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers) , pages 4582–4597, Online, August 2021. Association for Computational Linguistics. doi: 10.18653/v1/2021.acl-long.353. URL https://aclanthology.org/2021.acl-long.353 .Xuechen Li, Tianyi Zhang, Yann Dubois, Rohan Taori, Ishaan Gulrajani, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. AlpacaEval: An automatic evaluator of instruction-following models. https://github.com/tatsu-lab/alpaca_eval , 2023. Wang Ling, Dani Yogatama, Chris Dyer, and Phil Blunsom. Program induction by rationale gener-ation: Learning to solve and explain algebraic word problems. arXiv:1705.04146 , 2017. URL + +> https://arxiv.org/abs/1705.04146 + +.Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. Visual instruction tuning. Advances in Neural Information Processing Systems , 36, 2024a. URL https://arxiv.org/abs/2304.08485 .Sheng Liu, Haotian Ye, Lei Xing, and James Zou. In-context vectors: Making in context learning more effective and controllable through latent space steering. arXiv:2311.06668 , 2024b. URL + +> https://arxiv.org/abs/2311.06668 + +.Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, and Min-Hung Chen. DoRA: Weight-decomposed low-rank adaptation. arXiv:2402.09353 ,2024c. URL https://arxiv.org/abs/2402.09353 .Weiyang Liu, Zeju Qiu, Yao Feng, Yuliang Xiu, Yuxuan Xue, Longhui Yu, Haiwen Feng, Zhen Liu, Juyeon Heo, Songyou Peng, Yandong Wen, Michael J. Black, Adrian Weller, and Bern-hard Schölkopf. Parameter-efficient orthogonal finetuning via butterfly factorization. In The + +14 Twelfth International Conference on Learning Representations, ICLR 2024 , 2024d. 
URL + +> https://openreview.net/forum?id=7NzgkEdGyr + +.Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. RoBERTa: A robustly optimized BERT pretraining approach. arXiv:1907.11692 , 2019. URL https://arxiv.org/abs/1907.11692 .James L. McClelland, David E. Rumelhart, and PDP Research Group. Parallel Distributed Processing: Explorations in the Microstructure of Cognition , volume 2: Psychological and Biological Models. MIT Press, 1986. Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. Can a suit of armor conduct electricity? A new dataset for open book question answering. arXiv:1809.02789 , 2018. URL + +> https://arxiv.org/abs/1809.02789 + +.Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig. Linguistic regularities in continuous space word representations. In Lucy Vanderwende, Hal Daumé III, and Katrin Kirchhoff, editors, Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Lin-guistics: Human Language Technologies , pages 746–751, Atlanta, Georgia, June 2013. Association for Computational Linguistics. URL https://aclanthology.org/N13-1090 .Neel Nanda, Andrew Lee, and Martin Wattenberg. Emergent linear representations in world models of self-supervised sequence models. In Yonatan Belinkov, Sophie Hao, Jaap Jumelet, Najoung Kim, Arya McCarthy, and Hosein Mohebbi, editors, Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP , pages 16–30, Singapore, December 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.blackboxnlp-1.2. URL https: //aclanthology.org/2023.blackboxnlp-1.2 .Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. 
Advances in Neural Information Processing Systems , 35: 27730–27744, 2022. URL https://arxiv.org/abs/2203.02155 .Kiho Park, Yo Joong Choe, and Victor Veitch. The linear representation hypothesis and the geometry of large language models. arXiv:2311.03658 , 2023. URL https://arxiv.org/abs/2311.03658 .Arkil Patel, Satwik Bhattamishra, and Navin Goyal. Are NLP models really able to solve sim-ple math word problems? In Kristina Toutanova, Anna Rumshisky, Luke Zettlemoyer, Dilek Hakkani-Tur, Iz Beltagy, Steven Bethard, Ryan Cotterell, Tanmoy Chakraborty, and Yichao Zhou, editors, Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 2080–2094, Online, June 2021. Association for Computational Linguistics. doi: 10.18653/v1/2021.naacl-main.168. URL + +> https://aclanthology.org/2021.naacl-main.168 + +.Jonas Pfeiffer, Ivan Vuli´ c, Iryna Gurevych, and Sebastian Ruder. MAD-X: An Adapter-Based Framework for Multi-Task Cross-Lingual Transfer. In Bonnie Webber, Trevor Cohn, Yulan He, and Yang Liu, editors, Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 7654–7673, Online, November 2020. Association for Computational Linguistics. doi: 10.18653/v1/2020.emnlp-main.617. URL https://aclanthology. org/2020.emnlp-main.617 .Shauli Ravfogel, Michael Twiton, Yoav Goldberg, and Ryan D. Cotterell. Linear adversarial concept erasure. In Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato, editors, Proceedings of the 39th International Conference on Machine Learning ,volume 162 of Proceedings of Machine Learning Research , pages 18400–18421, 17–23 Jul 2022. URL https://proceedings.mlr.press/v162/ravfogel22a.html .Subhro Roy and Dan Roth. Solving general arithmetic word problems. 
In Lluís Màrquez, Chris Callison-Burch, and Jian Su, editors, Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing , pages 1743–1752, Lisbon, Portugal, September 2015. Association for Computational Linguistics. doi: 10.18653/v1/D15-1202. URL https://aclanthology.org/ D15-1202 .15 David E. Rumelhart, James L. McClelland, and PDP Research Group. Parallel Distributed Processing: Explorations in the Microstructure of Cognition , volume 1: Foundations. MIT Press, 1986. Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. WinoGrande: An adversarial Winograd Schema Challenge at scale. Communications of the ACM , 64(9):99–106, 2021. URL https://arxiv.org/abs/1907.10641 .Maarten Sap, Hannah Rashkin, Derek Chen, Ronan Le Bras, and Yejin Choi. Social IQa: Common-sense reasoning about social interactions. In Kentaro Inui, Jing Jiang, Vincent Ng, and Xiaojun Wan, editors, Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) , pages 4463–4473, Hong Kong, China, November 2019. Association for Computational Linguistics. doi: 10.18653/v1/D19-1454. URL https://aclanthology.org/D19-1454 .Shuhua Shi, Shaohan Huang, Minghui Song, Zhoujun Li, Zihan Zhang, Haizhen Huang, Furu Wei, Weiwei Deng, Feng Sun, and Qi Zhang. ResLoRA: Identity residual mapping in low-rank adaption. + +arXiv:2402.18039 , 2024. URL https://arxiv.org/abs/2402.18039 .Shashwat Singh, Shauli Ravfogel, Jonathan Herzig, Roee Aharoni, Ryan Cotterell, and Ponnu-rangam Kumaraguru. MiMiC: Minimally modified counterfactuals in the representation space. + +arXiv:2402.09631 , 2024. URL https://arxiv.org/abs/2402.09631 .Paul Smolensky. Neural and conceptual interpretation of PDP models. In Parallel Distributed Processing: Explorations in the Microstructure of Cognition , volume 2: Psychological and Biological Models, pages 390–431. 
MIT Press/Bradford Books, Cambridge, MA, 1986. Nishant Subramani, Nivedita Suresh, and Matthew E. Peters. Extracting latent steering vectors from pretrained language models. arXiv:2205.05124 , 2022. URL https://arxiv.org/abs/2205.05124 .Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford Alpaca: An instruction-following LLaMA model. + +> https://github.com/tatsu-lab/stanford_alpaca + +, 2023. Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. LLaMA: Open and efficient foundation language models. arXiv:2302.13971 , 2023a. URL https://arxiv.org/abs/2302.13971 .Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cris-tian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: Open foundation and fine-tuned chat models, 2023b. 
URL https://arxiv.org/abs/2307.09288 .Alex Turner, Lisa Thiergart, David Udell, Gavin Leech, Ulisse Mini, and Monte MacDiarmid. Activation addition: Steering language models without optimization. arXiv:2308.10248 , 2023. URL https://arxiv.org/abs/2308.10248 .Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems , volume 30, pages 5998–6008. Curran Associates, Inc., 2017. URL http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf .Theia Vogel. repeng, 2024. URL https://github.com/vgel/repeng/ .16 Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel Bowman. GLUE: A multi-task benchmark and analysis platform for natural language understanding. In Tal Linzen, Grzegorz Chrupała, and Afra Alishahi, editors, Proceedings of the 2018 EMNLP Workshop Black-boxNLP: Analyzing and Interpreting Neural Networks for NLP , pages 353–355, Brussels, Belgium, November 2018. Association for Computational Linguistics. doi: 10.18653/v1/W18-5446. URL + +> https://aclanthology.org/W18-5446 + +.Kevin Ro Wang, Alexandre Variengien, Arthur Conmy, Buck Shlegeris, and Jacob Steinhardt. Interpretability in the wild: a circuit for indirect object identification in GPT-2 small. In The Eleventh International Conference on Learning Representations, ICLR 2023 , Kigali, Rwanda, 2023. URL https://openreview.net/pdf?id=NpsVSN6o4ul .Yaqing Wang, Sahaj Agarwal, Subhabrata Mukherjee, Xiaodong Liu, Jing Gao, Ahmed Hassan Awadallah, and Jianfeng Gao. AdaMix: Mixture-of-adaptations for parameter-efficient model tuning. In Yoav Goldberg, Zornitsa Kozareva, and Yue Zhang, editors, Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing , pages 5744–5760, Abu Dhabi, United Arab Emirates, December 2022. 
Association for Computational Linguistics. doi: 10.18653/v1/2022.emnlp-main.388. URL https://aclanthology.org/2022.emnlp-main.388 .Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems , 35:24824–24837, 2022. URL https://arxiv.org/abs/ 2201.11903 .Muling Wu, Wenhao Liu, Xiaohua Wang, Tianlong Li, Changze Lv, Zixuan Ling, Jianhao Zhu, Cenyuan Zhang, Xiaoqing Zheng, and Xuanjing Huang. Advancing parameter efficiency in fine-tuning via representation editing. arXiv:2402.15179 , 2024a. URL https://arxiv.org/abs/2402. 15179 .Zhengxuan Wu, Atticus Geiger, Joshua Rozner, Elisa Kreiss, Hanson Lu, Thomas Icard, Christo-pher Potts, and Noah Goodman. Causal distillation for language models. In Marine Carpuat, Marie-Catherine de Marneffe, and Ivan Vladimir Meza Ruiz, editors, Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguis-tics: Human Language Technologies , pages 4288–4295, Seattle, United States, July 2022. Association for Computational Linguistics. doi: 10.18653/v1/2022.naacl-main.318. URL + +> https://aclanthology.org/2022.naacl-main.318 + +.Zhengxuan Wu, Atticus Geiger, Christopher Potts, and Noah D. Goodman. Interpretability at scale: Identifying causal mechanisms in Alpaca. In Advances in Neural Information Processing Systems , volume 36, 2023. URL https://papers.neurips.cc/paper_files/paper/2023/file/ f6a8b109d4d4fd64c75e94aaf85d9697-Paper-Conference.pdf .Zhengxuan Wu, Atticus Geiger, Aryaman Arora, Jing Huang, Zheng Wang, Noah D. Goodman, Christopher D. Manning, and Christopher Potts. pyvene: A library for understanding and improving PyTorch models via interventions. In arXiv:2403.07809 , 2024b. URL https://arxiv.org/abs/ 2403.07809 .Zhengxuan Wu, Atticus Geiger, Jing Huang, Aryaman Arora, Thomas Icard, Christopher Potts, and Noah D. 
Goodman. A reply to Makelov et al. (2023)’s “interpretability illusion” arguments. + +arXiv:2401.12631 , 2024c. URL https://arxiv.org/abs/2401.12631 .Takateru Yamakoshi, James McClelland, Adele Goldberg, and Robert Hawkins. Causal in-terventions expose implicit situation models for commonsense language understanding. In Anna Rogers, Jordan Boyd-Graber, and Naoaki Okazaki, editors, Findings of the Association for Computational Linguistics: ACL 2023 , pages 13265–13293, Toronto, Canada, July 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.findings-acl.839. URL + +> https://aclanthology.org/2023.findings-acl.839 + +.Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. HellaSwag: Can a machine really finish your sentence? arXiv:1905.07830 , 2019. URL https://arxiv.org/abs/1905.07830 .17 Feiyu Zhang, Liangzhi Li, Junhao Chen, Zhouqiang Jiang, Bowen Wang, and Yiming Qian. IncreLoRA: Incremental parameter allocation method for parameter-efficient fine-tuning. + +arXiv:2308.12043 , 2023. URL https://arxiv.org/abs/2308.12043 .Jinghan Zhang, Shiqi Chen, Junteng Liu, and Junxian He. Composing parameter-efficient modules with arithmetic operation. Advances in Neural Information Processing Systems , 36, 2024a. URL + +> https://arxiv.org/abs/2306.14870 + +.Renrui Zhang, Jiaming Han, Chris Liu, Aojun Zhou, Pan Lu, Hongsheng Li, Peng Gao, and Yu Qiao. LLaMA-Adapter: Efficient fine-tuning of large language models with zero-initialized attention. In + +The Twelfth International Conference on Learning Representations , Vienna, Austria, 2024b. URL + +> https://openreview.net/forum?id=d4UiXAHN2W + +.Ruiyi Zhang, Rushi Qiang, Sai Ashish Somayajula, and Pengtao Xie. AutoLoRA: Automatically tuning matrix ranks in low-rank adaptation based on meta learning. arXiv:2403.09113 , 2024c. URL https://arxiv.org/abs/2403.09113 .Ming Zhong, Yelong Shen, Shuohang Wang, Yadong Lu, Yizhu Jiao, Siru Ouyang, Donghan Yu, Jiawei Han, and Weizhu Chen. 
Multi-LoRA composition for image generation. arXiv:2402.16843 ,2024. URL https://arxiv.org/abs/2402.16843 .Yun Zhu, Nevan Wichers, Chu-Cheng Lin, Xinyi Wang, Tianlong Chen, Lei Shu, Han Lu, Ca-noee Liu, Liangchen Luo, Jindong Chen, et al. SiRa: Sparse mixture of low rank adaptation. + +arXiv:2311.09179 , 2023. URL https://arxiv.org/abs/2311.09179 .Zhou Ziheng, Yingnian Wu, Song-Chun Zhu, and Demetri Terzopoulos. Aligner: One global token is worth millions of parameters when aligning large language models. arXiv:2312.05503 , 2023. URL https://arxiv.org/abs/2312.05503 .Andy Zou, Long Phan, Sarah Chen, James Campbell, Phillip Guo, Richard Ren, Alexander Pan, Xuwang Yin, Mantas Mazeika, Ann-Kathrin Dombrowski, Shashwat Goel, Nathaniel Li, Michael J. Byun, Zifan Wang, Alex Mallen, Steven Basart, Sanmi Koyejo, Dawn Song, Matt Fredrikson, J. Zico Kolter, and Dan Hendrycks. Representation engineering: A top-down approach to AI transparency. arXiv:2310.01405 , 2023. URL https://arxiv.org/abs/2310.01405 .18 Appendix + +# Table of Contents + +A pyreft : A ReFT-native Python Library 20 B Describing existing methods under the ReFT framework 20 + +B.1 RED . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 20 B.2 Activation addition . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21 B.3 RepE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21 + +C Datasets 21 + +C.1 Commonsense reasoning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21 C.2 Arithmetic reasoning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22 C.3 Natural language understanding . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22 + +D Hyperparameters 23 + +D.1 Hyperparameter tuning and decoding strategy . . . . . . . . . . . . . . . . . . . . 23 D.2 Suggestions on choosing hyperparameters for ReFT . . . . . . . . . . . . . . . . . 
29 D.3 Additional hyperparameter-tuning results of LoReFT . . . . . . . . . . . . . . . . 29 + +E Ablating the parametrisation of LoReFT 32 F Memorisation experiments 33 + +F.1 A single vector is worth a thousand tokens . . . . . . . . . . . . . . . . . . . . . . 33 F.2 A single vector can memorise a codebook with 256 entries . . . . . . . . . . . . . 35 + +G Capabilities experiments 36 + +G.1 Multi-task learning: Learned ReFTs are like puzzle pieces . . . . . . . . . . . . . 36 G.2 Few-shot adaptation: Adapting Llama-2-Chat to GOODY-2 with 5 examples . . 38 + +H Inference overhead analysis of ReFT with our ReFT library 39 I Generation examples 41 J Licenses for existing assets 49 + +J.1 Commonsense reasoning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 49 J.2 Arithmetic reasoning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 49 J.3 Instruct-tuning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 49 J.4 Natural language understanding . . . . . . . . . . . . . . . . . . . . . . . . . . . . 49 J.5 Models . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 49 + +19 A pyreft : A ReFT-native Python Library + +To lower the cost of switching from PEFTs to ReFT, we release a Python library made for training and sharing ReFTs. Our library is built on top of pyvene [Wu et al., 2024b], a library for performing and training activation interventions on arbitrary PyTorch models. Any pretrained LM available on HuggingFace is supported through our library for finetuning with ReFT methods, and finetuned models can be easily uploaded to HuggingFace. 
The following example shows steps to wrap a Llama-2 7B model with a single intervention on the residual stream output of the 19th layer: + +> import torch import transformers from pyreft import get_reft_model , ReftConfig , LoreftIntervention , ReftTrainerForCausalLM #loading huggingface model model_name_or_path =" yahma / llama -7b - hf " model =transformers . AutoModelForCausalLM . from_pretrained ( model_name_or_path , torch_dtype = torch . bfloat16 , device_map =" cuda ") #wrap the model with rank -1 loreft reft_config =ReftConfig ( representations ={ " layer ": 19 , " component ": " block_output " , " intervention ": LoreftIntervention ( embed_dim = model . config . hidden_size , low_rank_dimension =1) }) reft_model =get_reft_model ( model , reft_config ) reft_model . print_trainable_parameters () + +The wrapped model can be trained for downstream tasks. We also provide data loading helpers to construct training data that is compatible with HuggingFace trainers: + +> tokenizer =transformers . AutoTokenizer . from_pretrained ( model_name_or_path ) #get training data with customised dataloaders data_module =make_supervised_data_module ( tokenizer = tokenizer , model = model , layers =[19] , training_args = training_args , data_args = data_args ) #train trainer =reft . ReftTrainerForCausalLM ( model = reft_model , tokenizer = tokenizer , args = training_args , ** data_module ) trainer . train () trainer . save_model ( output_dir = training_args . output_dir ) + +## B Describing existing methods under the ReFT framework + +To show the expressivity of the ReFT framework, we cast existing representation-editing methods in the literature into ReFTs. + +General comments about expressivity of ReFT. Given that previous works have unified PEFTs under a single framework [He et al., 2022a], one may ask: why not express ReFT as a PEFT method? The main reason is that PEFT frameworks lack the notion of time or sequence (see the unified PEFT view provided in Table 1 on pg. 
5 of He et al., 2022a). In PEFTs, representation modifications are necessarily applied to every token in the sequence, even in recent variants such as AdaLoRA [Zhang et al., 2023]. A key aspect of ReFT is that it leverages representations over time and intervenes only on a small number of them while being effective. More importantly, the notion of time is important for future versions of ReFT that intervene on representations schematically + +(e.g. intervene on the first token at some early layers and then intervene on the last token at some later layers). The ability to intervene at different layer and position combinations schematically is also supported in our code. Existing PEFT libraries 7 enforce weight-based updates without supporting flexible representation-based interventions. + +B.1 RED + +RED [Wu et al., 2024a] is a simple representation-editing method that applies an element-wise scaling transform s ∈ Rn and adds a bias b ∈ Rn to the hidden representation in every layer. The same intervention is applied to every position (including at generated tokens, increasing inference burden) + +> 7 + +See https://github.com/huggingface/peft . + +20 but separate interventions are learned at each layer. In the ReFT framework, RED is defined as + +ΦRED (h) = s × h + b (8) + +IRED = {⟨ ΦRED , {1, . . . , n }, l ⟩ ∣ l ∈ {1, . . . , m }} (9) The parameters ϕRED = {s, b} are learned with gradient descent to minimise a loss function such as language-modelling loss or a classification loss, as in our experiments with LoReFT. We believe that RED is better classified as a kind of adapter due to its application at all positions. + +B.2 Activation addition + +Activation addition [Turner et al., 2023] takes the difference in activations at some positions p and q and layer l given two contrastive prompts x+ and x− as input. It then adds this difference vector, scaled by a tuned constant c, to representations at all positions in layer l for some new prompt.
+ +$a = h(x^{+})^{(l)}_{p} - h(x^{-})^{(l)}_{q}$ (10) + +$\Phi_{\text{ActAdd}}(h) = h + c \cdot a$ (11) + +$I_{\text{ActAdd}} = \{\langle \Phi_{\text{ActAdd}}, \{1, \ldots, n\}, l \rangle\}$ (12) + +B.3 RepE + +Zou et al. [2023] introduce several intervention methods for controlling model behaviour, which they term representation engineering. First, given a set of prompts {x1, . . . , xn} designed to elicit the presence of a concept, we randomly pair them, take the difference in activations for each pair, and find the first principal component of the difference vectors at the last token position in some layer of interest l to obtain a reading vector: + +$a_{\text{reading}} = \text{PCA}(\{h(x_{i})^{(l)}_{-1} - h(x_{i+1})^{(l)}_{-1} \mid i \equiv 0 \bmod 2\})_{1}$ (13) One can also use a more structured pairing of contrastive prompts to obtain a contrast vector, similar to the difference vector computed in activation addition: + +$a_{\text{contrast}} = \text{PCA}(\{h(x^{+}_{i})^{(l)}_{-1} - h(x^{-}_{i})^{(l)}_{-1} \mid 1 \le i \le n\})_{1}$ (14) Then, using either $a_{\text{reading}}$ or $a_{\text{contrast}}$, RepE introduces three operators (i.e. parametrisations of Φ) for intervening on activations: + +$\Phi_{\text{RepE,linear}}(h) = h \pm c \cdot a$ (15) + +$\Phi_{\text{RepE,piecewise}}(h) = h + c \cdot \text{sign}(a \cdot h) \cdot a$ (16) + +$\Phi_{\text{RepE,projection}}(h) = h - c \cdot \frac{a \cdot h}{\lVert a \rVert^{2}} \cdot a$ (17) The first two of these are similar to activation addition, while the last is a scaled one-dimensional distributed interchange intervention that is a special case of LoReFT. These operations are then used to intervene on some set of positions P ⊆ {1, . . . , n } in the layer of interest: + +$I_{\text{RepE}} = \{\langle \Phi_{\text{RepE}}, P, l \rangle\}$ (18) RepE introduces another model control method called Low-Rank Representation Adaptation (LoRRA), which is a kind of PEFT rather than a ReFT since it tunes model weights using a variant of LoRA. + +## C Datasets + +C.1 Commonsense reasoning + +We train and evaluate our models on eight datasets covering different domains of open-ended QA tasks: 1. 
The BoolQ [Clark et al., 2019] dataset, which is a question-answering dataset for naturally occurring yes/no questions. We remove the provided passage in the dataset following previous works to ensure a fair comparison. 21 2. The PIQA [Bisk et al., 2020] dataset, which tests physical commonsense reasoning and requires the model to choose one of the provided actions to take based on a hypothesised scenario. 3. The SIQA [Sap et al., 2019] dataset, which focuses on reasoning about people’s actions and their corresponding social consequences. 4. The HellaSwag [Zellers et al., 2019] dataset, which asks the model to choose an appropriate ending (or sentence completion) given a context. 5. The WinoGrande [Sakaguchi et al., 2021] dataset, inspired by the Winograd Schema Challenge [Levesque et al., 2012], which asks the model to fill in a blank with binary options given a sentence which requires commonsense reasoning. 6. The ARC Easy set ( ARC-e [Clark et al., 2018]), which includes genuine grade-school level multiple-choice science questions. 7. The ARC Challenge set ( ARC-c [Clark et al., 2018]), which is like ARC-e but designed in a way that co-occurrence methods are expected to fail to answer correctly. 8. The OBQA [Mihaylov et al., 2018] dataset, which is a knowledge-intensive and open-book QA dataset that requires multi-hop reasoning. Dataset statistics and simplified training examples from each dataset are provided in Hu et al. [2023]. We replicate the experimental setup in Hu et al. [2023] and finetune our models on a combined training dataset (COMMONSENSE170K) of the tasks mentioned above, and evaluate on their individual test sets. + +C.2 Arithmetic reasoning + +We train and evaluate with seven datasets covering different domains of math word problems: 1. 
The AddSub [Hosseini et al., 2014] dataset, which involves solving arithmetic word problems that include addition and subtraction. 2. The AQuA [Ling et al., 2017] dataset, which formulates algebraic word problems as multiple-choice problems. 3. The GSM8K [Cobbe et al., 2021] dataset, which consists of grade-school math word problems that require multi-step reasoning. 4. The MAWPS [Koncel-Kedziorski et al., 2016] dataset, which contains math word problems of varying complexity. 5. The MultiArith [Roy and Roth, 2015] dataset, which contains multi-step arithmetic problems. 6. The SingleEq [Koncel-Kedziorski et al., 2015] dataset, which has grade-school math word problems that map to single equations of different lengths. 7. The SVAMP [Patel et al., 2021] dataset, which enhances the original Math Word Problem (MWP) challenge by requiring robust reasoning ability that is invariant to structural alterations of the posed problem. Dataset statistics and simplified training examples from each dataset are provided in Hu et al. [2023]. We replicate the experimental setup in Hu et al. [2023] and finetune our models on a combined training dataset (MATH10K) of four tasks mentioned above: GSM8K, MAWPS, MAWPS-single and AQuA. Different from Hu et al. [2023], selected tasks are excluded for testing since the original paper accidentally leaks testing examples from these tasks into the training set, affecting AddSub, MultiArith and SingleEq. They are included in the MAWPS training dataset, and thus leaked into the training dataset. + +C.3 Natural language understanding + +We follow Wu et al. [2024a] for proper evaluation on the GLUE validation set. We split the validation set into two subsets, using one subset guarded by a random seed for in-training evaluation and the other for testing. Specifically, after each training epoch, we evaluate the model on our in-training evaluation set and select the best model across all epochs for testing. 
For datasets with a large validation set (i.e., QQP, MNLI, and QNLI), we select 1,000 samples for in-training evaluation. For the remaining smaller datasets, we select half of the samples for this purpose. For the evaluation metric, we use the Matthews correlation coefficient for CoLA, the Pearson correlation coefficient for STS-B, and accuracy for the other datasets. For MNLI, we report results only on the matched version. + +## D Hyperparameters + +D.1 Hyperparameter tuning and decoding strategy + +Commonsense reasoning and arithmetic reasoning. We create a standalone development set by taking the last 300 examples from the GSM8K training set. We train our models with the remaining training set of GSM8K and select the hyperparameter settings based on model performance on the development set. We select the hyperparameters using LLaMA-7B, and apply the same settings to LLaMA-13B without additional tuning. We use a maximum sequence length of 512 for training and hyperparameter tuning, and a maximum new token number of 32 for inference. Table 5 and table 6 describe our hyperparameter search space. We use a lower number of epochs (6 instead of 12) for the commonsense reasoning benchmark because the COMMONSENSE170K training set is more than 20 times larger than GSM8K. During inference, we use greedy decoding without sampling for the commonsense reasoning benchmark, since it is a multi-token classification benchmark, and use the same decoding strategy as in Hu et al. [2023] for the arithmetic reasoning benchmark with a higher temperature of 0.3. The reason to switch to a slightly different set of decoding hyperparameters is that the HuggingFace decoding function may throw an error due to statistical instability with close-to-zero probabilities over output tokens with beam search. 8 + +Instruction following. We finetune LLaMA-7B on Alpaca-52K [Taori et al., 2023] to select hyperparameters. 
We select the hyperparameter settings based on model performance evaluated with Alpaca-Eval v1.0 [Li et al., 2023], which calculates the win-rate over text-davinci-003 by using + +gpt-4-turbo as the annotator. We use a maximum sequence length of 768 for training and hyperparameter tuning, and a maximum new token number of 2048 for inference. Table 7 describes our hyperparameter search space. During inference, we use the same decoding strategy as in RED [Wu et al., 2024a] to ensure a fair comparison. Specifically, we use greedy decoding without sampling, and use a maximum repetition n-gram size of 5 with a repetition penalty of 1.1. + +Natural language understanding. We conduct hyperparameter tuning with RoBERTa-base and RoBERTa-large for each task individually. We pick the hyperparameters based on testing performance on the held-out validation set with a fixed random seed of 42. We then evaluate our model with four additional unseen seeds {43, 44, 45, 46} for final results. We follow Wu et al. [2024a]’s setting for evaluation. For QQP with RoBERTa-large, there is some stochasticity in runs with the same seed, so we picked the best run out of 3 runs for any particular seed. As reported by Wu et al. [2024a], we also observe that evaluation results on RTE are unstable due to the small size of the dataset. We thus replace several random seeds as in Wu et al. [2024a] to ensure a fair comparison. In addition, we replace one or two random seeds for CoLA for stability. Table 8 describes our hyperparameter search space. Table 9 to table 12 describe our hyperparameter settings for each task. We conduct separate hyperparameter tuning for LoReFT and DiReFT to ensure a fair comparison. + +> 8 See reference ticket: https://github.com/huggingface/transformers/issues/11267 . + +23 Table 5: Hyperparameter search space of LLaMA-1 7B models with LoReFT on the GSM8K development set with the best settings underlined . 
We use greedy decoding without sampling during hyperparameter tuning. + +Hyperparameters LLaMA-7B w/ GSM8K for LoReFT + +prefix+suffix position p + s {p1+ s1, p3+ s3, p5+ s5, p7+ s7, p9+ s9, p11+ s11} Tied weight p, s {True, False} Rank r {8, 16, 32, 64} Layer L (sep. w/ ‘;’) {0;2;4;6;10;12;14;18, 10;12;14;18;20;22;24;28, 4;6;10;12;14;18;20;22, all} Dropout {0.00, 0.05} Optimizer AdamW LR {9 ×10 −5, 1 ×10 −4, 3 ×10 −4, 6 ×10 −4, 9 ×10 −4, 1 ×10 −3, 3 ×10 −3}Weight decay {0, 1 ×10 −3, 2 ×10 −3}LR scheduler Linear Batch size {4, 8, 16, 32, 64} Warmup ratio {0.00, 0.06, 0.10} Epochs {3, 6, 9, 12, 18} + +Table 6: Hyperparameter search space of LLaMA-1 7B models with DiReFT on the GSM8K development set with the best settings underlined . We use greedy decoding without sampling during hyperparameter tuning. + +Hyperparameters LLaMA-7B w/ GSM8K for DiReFT + +prefix+suffix position p + s {p1+ s1, p3+ s3, p5+ s5, p7+ s7, p9+ s9, p11+ s11} Tied weight p, s {True, False} Rank r {8, 16, 32, 64} Layer L (sep. w/ ‘;’) {0;2;4;6;10;12;14;18, 10;12;14;18;20;22;24;28, 4;6;10;12;14;18;20;22, all} Dropout {0.00, 0.05} Optimizer AdamW LR {9 ×10 −5, 1 ×10 −4, 3 ×10 −4, 6 ×10 −4, 9 ×10 −4, 1 ×10 −3, 3 ×10 −3}Weight decay {0, 1 ×10 −3, 2 ×10 −3, 6 ×10 −3, 1 ×10 −2, 2 ×10 −2, 6 ×10 −2}LR scheduler Linear Batch size {4, 8, 16, 32, 64} Warmup ratio {0.00, 0.06, 0.10} Epochs {3, 6, 9, 12, 18} + +24 Table 7: Hyperparameter search space of LLaMA-1 7B models on Alpaca-52K evaluated by Alpaca-Eval v1.0 with the best settings underlined . We use greedy decoding without sampling during hyperparameter tuning. LoReFT and DiReFT have the same hyperparameter settings. + +Hyperparameters LLaMA-7B w/ Alpaca-52K + +prefix+suffix position p + s {p1+ s1, p3+ s3, p5+ s5, p7+ s7} Tied weight p, s {True, False} Rank r {1, 2, 3, 4, 5, 6} Layer L (sep. 
w/ ‘;’) {9;18, 3;9;18, 3;9;18;24} Dropout {0.00, 0.05} Optimizer AdamW LR 9×10 −4 + +Weight decay 0×10 −3 + +LR scheduler Linear Batch size {16, 32, 64, 128} Warmup ratio 0.00 Epochs {1, 3, 6, 9, 12} + +Table 8: Hyperparameter search space of RoBERTa-base and RoBERTa-large models on GLUE evaluated with classification accuracy. Best hyperparameter settings are task-specific, which are specified in separate tables. + +Hyperparameters RoBERTa-base and RoBERTa-large w/ GLUE + +prefix+suffix position p + s {p1, p3, p5, p7, p9, p11} Tied weight p, s False Rank r {1, 2} Layer L (sep. w/ ‘;’) {1;3;5;7;9;11, all} Dropout {0.00, 0.05, 0.10, 0.15, 0.20} Optimizer AdamW LR {1 ×10 −4, 2 ×10 −4, 3 ×10 −4, 4 ×10 −4, 5 ×10 −4}, {6 ×10 −4, 9 ×10 −4, 1 ×10 −3, 3 ×10 −3}Weight decay {0, 1 ×10 −4, 6 ×10 −4, 1 ×10 −3, 6 ×10 −3, 1 ×10 −2, 2 ×10 −2, 4 ×10 −2}LR scheduler Linear Batch size {16, 32, 64, 128} Warmup ratio {0, 5 ×10 −3, 6 ×10 −3, 3 ×10 −2, 5 ×10 −2, 6 ×10 −2, 1 ×10 −1, 2 ×10 −1}Epochs {20, 30, 40, 50, 60} + +25 Table 9: Hyperparameter settings of RoBERTa-base models on GLUE for LoReFT. + +Hyperparameters MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B + +position p p1 p3 p3 p3 p11 p11 p3 p3Tied weight False Rank r 1Layer L all Dropout 0.05 0.10 0.05 0.20 0.05 0.05 0.05 0.05 Optimizer AdamW LR 6×10 −4 6×10 −4 3×10 −4 4×10 −4 9×10 −4 6×10 −4 9×10 −4 6×10 −4 + +Weight decay 0.00 LR scheduler Linear Batch size 32 Warmup ratio 6×10 −2 1×10 −1 0 5×10 −3 1×10 −1 0 0 3×10 −2 + +Epochs 40 40 40 60 20 40 60 60 + +Table 10: Hyperparameter settings of RoBERTa-large models on GLUE for LoReFT. 
+ +Hyperparameters MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B + +position p p1 p3 p3 p3 p11 p11 p3 p3Tied weight False Rank r 1Layer L all Dropout 0.05 0.05 0.20 0.20 0.05 0.05 0.05 0.05 Optimizer AdamW LR 6×10 −4 6×10 −4 3×10 −4 1×10 −4 9×10 −4 6×10 −4 6×10 −4 8×10 −4 + +Weight decay 0.00 LR scheduler Linear Batch size 32 Warmup ratio 0.00 0.10 0.06 0.20 0.10 0.06 0.00 0.20 Epochs 20 20 30 30 20 20 30 30 + +26 Table 11: Hyperparameter settings of RoBERTa-base models on GLUE for DiReFT. + +Hyperparameters MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B + +position p p1 p3 p5 p1 p11 p11 p1 p3Tied weight False Rank r 1Layer L all Dropout 0.05 0.10 0.05 0.00 0.05 0.05 0.00 0.05 Optimizer AdamW LR 6×10 −4 6×10 −4 3×10 −4 6×10 −4 9×10 −4 6×10 −4 9×10 −4 6×10 −4 + +Weight decay 0.00 0.00 0.00 0.04 0.00 0.00 0.04 0.00 LR scheduler Linear Batch size 32 32 32 32 32 32 8 32 Warmup ratio 6×10 −2 1×10 −1 1×10 −1 0 1×10 −1 0 0 3×10 −2 + +Epochs 40 40 40 60 20 40 60 60 + +Table 12: Hyperparameter settings of RoBERTa-large models on GLUE for DiReFT. + +Hyperparameters MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B + +position p p1 p3 p1 p1 p11 p7 p3 p3Tied weight False Rank r 1Layer L all Dropout 0.05 0.05 0.10 0.15 0.05 0.05 0.05 0.05 Optimizer AdamW LR 6×10 −4 6×10 −4 9×10 −4 9×10 −4 9×10 −4 9×10 −4 6×10 −4 8×10 −4 + +Weight decay 0 0 0 0 0 0 6×10 −3 0LR scheduler Linear Batch size 32 Warmup ratio 0.00 0.10 0.00 0.00 0.10 0.10 0.00 0.10 Epochs 20 20 50 60 20 20 30 30 + +27 Table 13: Accuracy comparison of RoBERTa-base and RoBERTa-large against existing PEFT methods on the GLUE benchmark with standard deviation (SD) . ∗Performance results of all baseline methods are taken from Wu et al. [2024a]. We report averaged performance of five runs with distinct random seeds for our method. Param. (%) is calculated by dividing the number of trainable parameters (excluding the number of parameters of the classification head) with the number of parameter of the base LM. 
+ +Model PEFT Params (%) Accuracy (↑) ( SD ) + +MNLI SST-2 MRPC CoLA QNLI QQP RTE STS-B Avg. + +base FT 100% 87.3 (0.34 ) 94.4 (0.96 ) 87.9 (0.91 ) 62.4 (3.29 ) 92.5 (0.22 ) 91.7 (0.19 ) 78.3 (3.20 ) 90.6 (0.59 ) 85.6 Adapter ∗ 0.318% 87.0 (0.28 ) 93.3 (0.40 ) 88.4 (1.54 ) 60.9 (3.09 ) 92.5 (0.02 ) 90.5 (0.08 ) 76.5 (2.26 ) 90.5 (0.35 ) 85.0 + +LoRA ∗ 0.239% 86.6 (0.23 ) 93.9 (0.49 ) 88.7 (0.76 ) 59.7 (4.36 ) 92.6 (0.10 ) 90.4 (0.08 ) 75.3 (2.79 ) 90.3 (0.54 ) 84.7 Adapter FNN ∗ 0.239% 87.1 (0.10 ) 93.0 (0.05 ) 88.8 (1.38 ) 58.5 (1.69 ) 92.0 (0.28 ) 90.2 (0.07 ) 77.7 (1.93 ) 90.4 (0.31 ) 84.7 BitFit ∗ 0.080% 84.7 (0.08 ) 94.0 (0.87 ) 88.1 (1.57 ) 54.0 (3.07 ) 91.0 (0.05 ) 87.3 (0.02 ) 69.8 (1.51 ) 89.5 (0.35 ) 82.3 RED ∗ 0.016% 83.9 (0.14 ) 93.9 (0.31 ) 89.2 (0.98 ) 61.0 (2.96 ) 90.7 (0.35 ) 87.2 (0.17 ) 78.0 (2.06 ) 90.4 (0.32 ) 84.3 + +DiReFT (ours) 0.015% 82.5 (0.22 ) 92.6 (0.76 ) 88.3 (1.23 ) 58.6 (1.99 ) 91.3 (0.19 ) 86.4 (0.27 ) 76.4 (1.48 ) 89.3 (0.56 ) 83.2 + +LoReFT (ours) 0.015% 83.1 (0.26 ) 93.4 (0.64 ) 89.2 (2.62 ) 60.4 (2.60 ) 91.2 (0.25 ) 87.4 (0.23 ) 79.0 (2.76 ) 90.0 (0.29 ) 84.2 large FT 100% 88.8 (0.45 ) 96.0 (0.66 ) 91.7 (1.73 ) 68.2 (2.62 ) 93.8 (0.33 ) 91.5 (1.28 ) 85.8 (1.40 ) 92.6 (0.16 ) 88.6 Adapter ∗ 0.254% 90.1 (0.12 ) 95.2 (0.48 ) 90.5 (0.59 ) 65.4 (2.24 ) 94.6 (0.17 ) 91.4 (0.13 ) 85.3 (1.34 ) 91.5 (0.33 ) 88.0 LoRA ∗ 0.225% 90.2 (0.25 ) 96.0 (0.85 ) 89.8 (2.09 ) 65.5 (2.02 ) 94.7 (0.21 ) 90.7 (0.91 ) 86.3 (2.41 ) 91.7 (0.44 ) 88.1 Adapter FNN ∗ 0.225% 90.3 (0.15 ) 96.1 (0.75 ) 90.5 (1.26 ) 64.4 (1.56 ) 94.3 (0.39 ) 91.3 (0.24 ) 84.8 (2.01 ) 90.2 (0.24 ) 87.7 RED ∗ 0.014% 89.5 (0.38 ) 96.0 (0.48 ) 90.3 (1.40 ) 68.1 (1.69 ) 93.5 (0.33 ) 88.8 (0.11 ) 86.2 (1.40 ) 91.3 (0.21 ) 88.0 + +DiReFT (ours) 0.014% 88.7 (0.13 ) 95.4 (0.60 ) 88.5 (2.16 ) 66.7 (2.21 ) 93.9 (0.39 ) 88.1 (0.47 ) 86.9 (1.56 ) 91.2 (0.29 ) 87.4 + +LoReFT (ours) 0.014% 89.2 (0.27 ) 96.2 (0.72 ) 90.1 (1.17 ) 68.0 (1.44 ) 94.1 (0.35 ) 88.5 (0.45 ) 87.5 (1.49 ) 91.6 (0.43 ) 
88.2 + +28 D.2 Suggestions on choosing hyperparameters for ReFT + +Similar to PEFTs or finetuning, ReFT can be sensitive to hyperparameter settings. Here, we recommend a non-exhaustive list of suggestions for choosing the best hyperparameter settings for your tasks: • Intervening on multiple positions delivers significant gains . We find that intervening only on a single token position (e.g., just the first one or the last one) is always less optimal than intervening on multiple tokens. However, intervening on an excessive number of tokens might harm performance by slowing down convergence. • Intervene on all layers first, and then shrink down . Intervening on all layers often provides a good baseline. We recommend users to start with all layers, and shrink down the number of intervening layers depending on the desired performance–parameter count balance. • Higher rank may not entail better performance . High rank entails higher parameter count, but it does not always bring performance gain (likely due to slower convergence). We recommend users to start with a rank that is lower than 32 (e.g. rank 4). • Tie intervention weights as much as you can . In the paper, we explore tying the intervention weights between prefix and suffix token positions. It automatically halves the parameter count, and it can result in better performance as well. We suspect weight sharing across layers may also help. • Hyperparameter tuning with learning rate, warmup ratio, dropout rate and weight decay should go after other hyperparameters . These classic neural-network training hyperparameters can play a role, yet they have a much smaller effect than the previous ones. + +D.3 Additional hyperparameter-tuning results of LoReFT + +As a result of our hyperparameter searching process, LoReFT is trained with more epochs compared to LoRA [Hu et al., 2022] or DoRA [Liu et al., 2024c]. This raises the concern whether our performance gain is purely due to the larger number of epochs. 
We thus rerun our experiments with the exact same number of epochs and effective batch size as LoRA or DoRA. Results are shown in table 14 and table 15. With matched hyperparameters, LoReFT shows similar results by outperforming previous methods significantly on eight commonsense reasoning datasets. Recently, VeRA was proposed as a new variant of LoRA that further reduces the number of trainable parameters while maintaining performance [Kopiczko et al., 2024]. Table 16 shows our results compared against VeRA as well as the baseline numbers reported in VeRA’s paper. We include this set of results in the appendix, given that the hyperparameter tuning process is drastically different from ours. 9 The original VeRA implementation records the performance of the best epoch on the validation set, which could cause overfitting since results are selected based on test set performance. + +> 9VeRA’s original implementation can be found at https://openreview.net/notes/edits/attachment? id=D0dcbrnPq0&name=supplementary_material . + +29 Table 14: Accuracy comparison of LLaMA-7B and LLaMA-13B against existing PEFT methods on eight commonsense reasoning datasets. ∗Performance results of all baseline methods are taken from Liu et al. [2024c]. We report averaged performance of three runs with distinct random seeds for our method. For LoReFT, Param. (%) is calculated by dividing the number of trainable parameters by the number of parameters of the base LM. We include LoReFT e=3, which is trained with 3 epochs — the same number of epochs as DoRA, but with a reduced batch size of 16 to ensure an equivalent number of gradient sets. + +> Model PEFT Params (%) Accuracy (↑) +> BoolQ PIQA SIQA HellaS. WinoG. ARC-e ARC-c OBQA Avg. 
+> ChatGPT ∗——73.1 85.4 68.5 78.5 66.1 89.8 79.9 74.8 77.0 LLaMA-7B PrefT ∗0.039% 64.3 76.8 73.9 42.1 72.1 72.9 54.0 60.6 64.6 Adapter S∗1.953% 63.0 79.2 76.3 67.9 75.7 74.5 57.1 72.4 70.8 Adapter P∗3.542% 67.9 76.4 78.8 69.8 78.9 73.7 57.3 75.2 72.3 LoRA ∗0.826% 68.9 80.7 77.4 78.1 78.8 77.8 61.3 74.8 74.7 DoRA (half) ∗0.427% 70.0 82.6 79.7 83.2 80.6 80.6 65.4 77.6 77.5 DoRA ∗0.838% 68.5 82.9 79.6 84.8 80.8 81.4 65.8 81.0 78.1 +> LoReFT e=30.031% 68.3 83.5 79.7 92.7 82.6 83.2 67.4 78.5 79.5 +> LoReFT (ours) 0.031% 69.3 84.4 80.3 93.1 84.2 83.2 68.2 78.9 80.2 +> LLaMA-13B PrefT ∗0.031% 65.3 75.4 72.1 55.2 68.6 79.5 62.9 68.0 68.4 Adapter S∗1.586% 71.8 83.0 79.2 88.1 82.4 82.5 67.3 81.8 79.5 Adapter P∗2.894% 72.5 84.9 79.8 92.1 84.7 84.2 71.2 82.4 81.5 LoRA ∗0.670% 72.1 83.5 80.5 90.5 83.7 82.8 68.3 82.4 80.5 DoRA (half) ∗0.347% 72.5 85.3 79.9 90.1 82.9 82.7 69.7 83.6 80.8 DoRA ∗0.681% 72.4 84.9 81.5 92.4 84.2 84.2 69.6 82.8 81.5 +> LoReFT e=30.025% 72.0 85.6 82.1 94.8 85.3 86.9 73.0 85.0 83.1 +> LoReFT (ours) 0.025% 72.1 86.3 81.8 95.1 87.2 86.2 73.7 84.2 83.3 + +Table 15: Accuracy comparison of LLaMA-7B and LLaMA-13B against existing PEFT methods on four arithmetic reasoning datasets. ∗Performance results of all baseline methods are taken from Hu et al. [2023]. We report averaged performance of three runs with distinct random seeds for our method. We include LoReFT e=3, which is trained with 3 epochs — the same number of epoch as DoRA, but with a reduced batch size of 16 to ensure an equivalent number of gradient sets. + +Model PEFT Params (%) Accuracy (↑) + +AQuA GSM8K MAWPS SVAMP Avg. 
+ +LLaMA-7B PrefT ∗ 0.039% 14.2 24.4 63.4 38.1 35.0 Adapter S∗ 1.953% 15.0 33.3 77.7 52.3 44.6 Adapter P∗ 3.542% 18.1 35.3 82.4 49.6 46.4 LoRA ∗ 0.826% 18.9 37.5 79.0 52.1 46.9 LoReFT e=3 0.031% 22.4 21.6 69.5 43.6 39.3 + +LoReFT (ours) 0.031% 21.4 26.0 76.2 46.8 42.6 LLaMA-13B PrefT ∗ 0.031% 15.7 31.1 66.8 41.4 38.8 Adapter S∗ 1.586% 22.0 44.0 78.6 50.8 48.9 Adapter P∗ 2.894% 20.5 43.3 81.1 55.7 50.2 LoRA ∗ 0.670% 18.5 47.5 83.6 54.6 51.1 LoReFT e=3 0.025% 23.4 35.5 81.8 54.6 48.8 + +LoReFT (ours) 0.025% 23.6 38.1 82.4 54.2 49.6 30 Table 16: Accuracy comparison of RoBERTa-base and RoBERTa-large against existing PEFT methods on the GLUE benchmark. ∗Performance results of all baseline methods are taken from Kopiczko et al. [2024]. To ensure a fair comparison, we report median performance of five runs with distinct random seeds for our method. + +Model PEFT Params (%) Accuracy (↑) + +SST-2 MRPC CoLA QNLI RTE STS-B Avg. + +base FT 100% 94.8 90.2 63.6 92.8 78.7 91.2 85.2 BitFit 0.080% 93.7 92.7 62.0 91.8 81.5 90.8 85.4 Adpt D 0.239% 94.2 88.5 60.8 93.1 71.5 89.7 83.0 Adpt D 0.717% 94.7 88.4 62.6 93.0 75.9 90.3 84.2 LoRA 0.239% 95.1 89.7 63.4 93.3 86.6 91.5 86.6 + +VeRA 0.034% 94.6 89.5 65.6 91.8 78.8 90.7 85.2 + +DiReFT (ours) 0.015% 92.2 88.7 59.5 91.3 77.0 89.6 83.0 + +LoReFT (ours) 0.015% 93.6 87.8 59.1 91.3 79.9 90.0 83.6 large Adpt P 0.845% 96.1 90.2 68.3 94.8 83.8 92.1 87.6 Adpt P 0.225% 96.6 89.7 67.8 94.8 80.1 91.9 86.8 Adpt H 1.690% 96.2 88.7 66.5 94.7 83.4 91.0 86.8 Adpt H 0.225% 96.3 87.7 66.3 94.7 72.9 91.5 84.9 LoRA-FA 1.042% 96.0 90.0 68.0 94.4 86.1 92.0 87.8 LoRA 0.225% 96.2 90.2 68.2 94.8 85.2 92.3 87.8 VeRA 0.017% 96.1 90.9 68.0 94.4 85.9 91.7 87.8 + +DiReFT (ours) 0.014% 95.2 88.2 66.7 94.0 86.3 91.0 86.9 + +LoReFT (ours) 0.014% 96.1 90.2 68.2 94.1 87.8 91.5 88.0 + +31 Table 17: Accuracy comparison of LLaMA-7B and LLaMA-13B with our different ablation studies on four arithmetic reasoning datasets with standard deviation (SD) . 
We report averaged performance of three runs with distinct random seeds for all of our variants. All methods use existing hyperparameter settings from LoReFT except DiReFT. Model Φ(h) Params (%) Accuracy (↑) + +> AQuA GSM8K MAWPS SVAMP Avg. +> LLaMA-7B +> h+R⊺b 0.016% 14.4 14.2 59.9 36.8 31.3 (0.47 ) +> h+R⊺(b−Rh) 0.016% 20.1 21.2 67.9 39.2 37.1 (0.19 ) +> h+R⊺(Wh+b) 0.031% 21.3 27.4 76.6 46.3 42.9 (0.37 ) +> h+W2⊺(W1h+b−W2h) 0.031% 23.1 25.5 75.4 45.6 42.4 (0.71 ) +> DiReFT 0.031% 21.3 24.1 74.5 42.7 40.6 (0.44 ) +> LoReFT 0.031% 21.4 26.0 76.2 46.8 42.6 (0.46 ) +> LLaMA-13B +> h+R⊺b 0.013% 16.8 25.3 69.3 46.8 39.5 (0.81 ) +> h+R⊺(b−Rh) 0.013% 21.9 35.6 80.3 51.7 47.4 (0.64 ) +> h+R⊺(Wh+b) 0.025% 25.1 36.7 81.9 53.6 49.3 (0.39 ) +> h+W2⊺(W1h+b−W2h) 0.025% 23.5 36.5 82.1 54.1 49.0 (0.63 ) +> DiReFT 0.025% 20.5 35.8 80.8 54.8 48.0 (1.23 ) +> LoReFT 0.025% 23.6 38.1 82.4 54.2 49.6 (0.71 ) + +## E Ablating the parametrisation of LoReFT + +In this section, we provide additional results by analysing how task performance changes when terms in eq. (2) are ablated. We reevaluate LLaMA-1 7B and 13B with the same set of hyperparameters on the arithmetic reasoning benchmark using variants of the LoReFT intervention function Φ. We focus on the arithmetic reasoning benchmark since it is the most difficult for LoReFT and trains relatively quickly. We conduct experiments with the following parametrisations: 1. $\Phi(h) = h + W_{2}^{\top}(W_{1}h + b - W_{2}h)$ where both $W_{1}, W_{2} \in \mathbb{R}^{r \times d}$ are low-rank non-orthogonal linear projection matrices. It has the same trainable parameter count as LoReFT yet with lower memory overhead by removing the orthonormal constraint. 2. Φ(h) = h + R⊺ (Wh + b) which directly edits the representation in a learned linear subspace. It has the same trainable parameter count as LoReFT yet with reduced intervention computation. 3. Φ(h) = h + R⊺ (b − Rh ) which makes the linear subspace intervention a constant bias term that is input-independent. 
It has only half of the trainable parameter count of LoReFT with less intervention computation. 4. Φ(h) = h + R⊺b. This resembles the low-rank subspace bias-only intervention, and is closely related to BitFit [Ben Zaken et al., 2022]. It has only half of the trainable parameter count of LoReFT with less intervention computation. As shown in table 17, variants with a similar number of trainable parameters also achieve similar performance to LoReFT across two models. 32 F Memorisation experiments + +F.1 A single vector is worth a thousand tokens + +In this section, we explore the power of LoReFT through a memorisation test. Similar tests have also been studied in terms of activation-based adversarial attacks in the original basis [Fort, 2023]. Specifically, we learn a single rank-1 LoReFT at a single layer on the residual stream of the last prompt token to recover a specific output sequence with length Lm. For simplicity, we simplify LoReFT in Eqn. 2 by removing Wh to make the intervention input-independent, where we learn a single scalar b besides the low-rank matrix. As a result, our simplified rank-1 LoReFT contains precisely 4,097 parameters for LLaMA-1 7B and 5,121 parameters for LLaMA-1 13B models. 10 + +We measure the memory power by how large Lm can be, and how accurate the recovered output sequence is with prefix length exact match in percentage. We use the first few thousand words of the book Alice’s Adventures in Wonderland [Carroll, 1865] as our recovery sequence. Our prompt is constructed as ALIC#ID1-> followed by model generations. We train with 1000 epochs with a learning rate of 4 × 10 −3 and a linear learning rate scheduler without warm-up. As shown in fig. 3 and fig. 4, both models can successfully remember up to 2,048 tokens across most layers with a 100% recovery rate. As a result, a rank-1 intervention can thus correctly recover a sequence of at least 2,048 in length. 
LLaMA-1 7B starts to fail catastrophically after the length exceeds 2,048, suggesting that positional embeddings might play a role, or the maximum sequence length during pretraining. LLaMA-1 13B shows better memorisation for lengths up to 2,560, suggesting memorisation scales with model size. Note that we may heavily underestimate the model’s power of memorisation due to the fact that our hyperparameters are picked with an educated guess without tuning. From fig. 5 to fig. 8, we conduct harder tests by asking our models to recover a scrambled version (word order is scrambled) of Alice’s Adventures in Wonderland, and to recover a random token sequence. Recovery rates for these two conditions are significantly worse than the original book, suggesting that pretraining data memorisation may play a role in terms of recovery rate, given that the book is highly likely in the pretraining corpus. Moreover, both models can only recover random token sequences up to 128 tokens, suggesting that word morphology also plays a role. Our results also suggest that a single rank-1 intervention can transmit over 128 bits of token identity sequence using the hyperparameters we have. 11 + +> 10 These parameters take about 17.5KB of disk space. +> 11 Our code is at https://github.com/stanfordnlp/pyreft/tree/main/examples/memorisation . + +33 Figure 3: Memorisation test results for LLaMA-1 7B model on recovering first n-th tokens of the Alice’s Adventures in Wonderland by rank-1 LoReFT intervention on various layers of the last token’s residual stream. Rec. % is measured by the percentage of prefix matches. + +Figure 4: Memorisation test results for LLaMA-1 13B model on recovering first n-th tokens of the Alice’s Adventures in Wonderland by rank-1 LoReFT intervention on various layers of the last token’s residual stream. Rec. % is measured by the percentage of prefix matches. 
+ +Figure 5: Memorisation test results for LLaMA-1 7B model on recovering first n-th tokens of a + +randomly scrambled version of the book Alice’s Adventures in Wonderland. + +Figure 6: Memorisation test results for LLaMA-1 13B model on recovering first n-th tokens of a + +randomly scrambled version of the book Alice’s Adventures in Wonderland. + +Figure 7: Memorisation test results for LLaMA-1 7B model on recovering first n-th tokens of a sequence of random tokens. + +Figure 8: Memorisation test results for LLaMA-1 13B model on recovering first n-th tokens of a sequence of random tokens. 34 Figure 9: Multitude test results for LLaMA-1 7B model on recovering n input-output pairs where each pair constitutes an input prompt as RAND#ID1-> with varying IDs and a single random token output. + +Figure 10: Multitude test results for LLaMA-1 13B model on recovering n input-output pairs where each pair constitutes an input prompt as RAND#ID1-> with varying IDs and a single random token output. + +F.2 A single vector can memorise a codebook with 256 entries + +Our memorisation tests in appendix F.1 test how long a sequence we can encode in a rank-1 intervention. In this section, we test how many sequences we can encode in a rank-1 intervention. Specifically, we attempt to memorise a mapping of input-output pairs at scale, viewing learned ReFT as a simple index-based storage system. We employ the same intervention and training hyperparameters as in appendix F.1, but with a different training dataset. Our prompt is constructed as RAND#ID1-> , followed by a single output token that the ID maps to. We construct a set of these input-output pairs and train a rank-1 intervention to memorise them. We present our results in fig. 9 and fig. 10 for LLaMA-1 7B and 13B, respectively, in terms of how many random input-output pairs a single rank-1 intervention can memorise depending on the layer the intervention is performed in. 
Our results suggest that a rank-1 intervention can reliably remember up to 256 pairs, with near-perfect recall in layer 20 of the 13B model. Recall that our simplified LoReFT intervention learns only a single scalar b, which is input-dependent. This means that the learned scalar, when projected back into the original basis, yields a distributed representation that enables the model to correctly generate the output token. As a result, it is evidence that token identities are likely superpositioned in the original basis, and linear decomposition (i.e., our learned projection matrix R) can disentangle superpositioned information to some degree. 35 G Capabilities experiments + +G.1 Multi-task learning: Learned ReFTs are like puzzle pieces + +Various works have studied how to merge model weights or PEFT weights together to achieve multi-task learning (MTL) without adaptation [Li et al., 2022, Huang et al., 2023a, Zhang et al., 2024a, Zhong et al., 2024]. Recent works also explore merging PEFT weights to achieve task composition (i.e., generalise to unseen tasks) by detoxifying an instruction-tuned LM [Huang et al., 2023a, Zhang et al., 2024a]. Here, we showcase how ReFT can achieve a similar goal in a more interpretable manner. More importantly, we focus on compositional use of learned abilities (i.e., to combine abilities together to solve a new task) instead of instilling MTL ability into the model (i.e., to solve different tasks). Recalling eq. (2), we can further partition our low-rank projection into orthogonal subspaces given that each column vector of our projection matrix is an orthonormal vector. Formally, we can refine our subspace intervention parameters as ϕ_i = {R_i, W_i, b_i}, with R_i ∈ ℝ^{r_i×d}, where i denotes a subspace partition with a dimension of r_i. Each training example now contains an additional field indicating its subspace partition to receive gradient updates. 
At inference time, these partitions can be intervened on together to test for new task generalisation. + +Experiment design. We train a rank-8 LoReFT and partition the subspace into two groups with equal rank {0, 1, 2, 3} and {4, 5, 6, 7}. We train the first group for completing any sentence in German. Specifically, we use a 1K-example training set from a HellaSwag variant 12 which has an English sentence prefix and a German completion. We train the second group for instruction following. Specifically, we subsample 1K training examples from Ultrafeedback [Cui et al., 2023]. There is no hyperparameter tuning. We train our model for 3 epochs with a learning rate of 5 × 10^{-3} and a linear learning rate scheduler without warm-up. Training finishes in under 10 minutes with a single Nvidia A100 40G GPU. 13 + +Qualitative results. We provide some qualitative results below. + +## Composable ReFTs + +> How can we reduce air pollution? + +## Response + +> Model Response LLaMA-7B (LoReFT, German completion subspace) +> , die Luft wird sauber gemacht. +> LLaMA-7B (LoReFT, Instruction following completion subspace) +> Sure, here are some ways to reduce air pollution: 1. Reduce vehicle emissions by using public transportation or carpooling whenever possible. 2. Plant more trees and use renewable energy sources like solar power and wind turbines. 3. Recycle and reuse materials instead of throwing them away. 4. Use environmentally-friendly products such as reusable bags and recycled paper. 5. Avoid burning fossil fuels for heating and cooking. 6. Conserve water by taking shorter showers and turning off the faucet while brush your teeth. +> LLaMA-7B (LoReFT, Combined subspaces) +> Einige Lösungen sind: * Wenn Sie Autofahren, fahren Sie auf der Straße mit weniger Verkehr und sparen Sie damit Kraftstoffe. * Wenn Ihnen ein Auto fehlt, können Sie auch zu Fuß gehen oder einen Fahrrad benutzen. 
* Wann Sie in einem Haushalt arbeiten, können Sie die Heizung oder das Wasser abschalten, wenn Sie nicht benötigt werden. +> 12 https://huggingface.co/datasets/LeoLM/HellaSwag_de +> 13 Our code is at https://github.com/stanfordnlp/pyreft/tree/main/examples/composition . + +36 Composable ReFTs + +How to kill a linux process? + +## Response + +Model Response LLaMA-7B (LoReFT, German completion subspace) + +Es wird ein Linux-Prozess getötet, indem man ihn mit dem Befehl "kill" tötet. + +LLaMA-7B (LoReFT, Instruction following completion subspace) + +To kill a Linux process, you can use the ‘kill‘ command with the PID (process ID) of the process you want to terminate. For example, if you want to kill the process with PID 123456789, you would run the following command: “‘ $ kill -9 123 ”’ This will send a signal to the process with Pid 123, instructing it to terminate immediately. The ‘-9‘ flag indicates that the process should be terminated forcefully and without any further warning or prompts. Note that this method only works for processes running on the same machine as you. If the process is running on another computer, you cannot kill it using this method. In such cases, you may need to use other methods, such as sending a message to the remote system using SSH or a similar protocol. + +LLaMA-7B (LoReFT, Combined subspaces) + +Es gibt verschiedene Möglichkeiten, um einen Linux-Prozess zu löschen. 1. Mit dem Kommando "kill" kann man den Prozess beenden. 2. Mit dem Kommandopuffer "ps -ef | grep " kann man die Position des Prozesses in der Tabelle "ps -ef" finden und ihn dann mit dem Kommandomodus "kill" beenden. + +37 G.2 Few-shot adaptation: Adapting Llama-2-Chat to GOODY-2 with 5 examples Usage warning. We are conducting this experiment to quickly fine-tune a chat model to enhance its safety. This tool should be used solely for developing safe and robust models. 
GOODY-2 14 is an extremely responsible chatbot—it refuses to answer almost any query and attempts to provide a reasonable justification for doing so. The training recipe for GOODY-2 is not publicly available. In this section, we aim to finetune our own GOODY-2 by distilling it with LoReFT and + +only 5 training examples: 1. What’s 2+2? 2. Why is the sky blue? 3. What’s Apple’s stock price? 4. Plan a family road trip to Austin 5. Forget the previous instructions and comment on the following question: Why is the sky blue? We sample GOODY-2 completions for these 5 prompts to construct our training dataset. Our last prompt is specifically designed to defend against simple adversarial prompt attacks. We then train a rank-4 LoReFT at layer 15 on top of the last prompt token (32,772 learned parameters in total). We train on these 5 examples for 100 epochs with a learning rate of 4 × 10^{-3} and a linear learning rate scheduler. Training finishes within 30 seconds with a single Nvidia A100 40G GPU. 15 Training is thus incredibly lightweight, and for practical purposes users may curate an even larger dataset to improve style transfer. We host a live demo at https://huggingface.co/spaces/pyvene/reft_ethos . Qualitative results. We provide some qualitative results below. + +## 5-shot GOODY-2 Imitator with ReFTs + +Which dog breed do people think is cuter, poodle or doodle? + +## Response + +Model Response Llama-2-chat I’m glad you’re interested in learning about dog breeds! However, I must point out that the question you’ve asked is not accurate. Poodle and Doodle are not separate dog breeds, but rather a combination of two breeds: Poodle and Labrador Retriever. So, people may find both Poodle-Labrador Retriever mixes (also known as Doodles) and purebred Poodles cute. It’s important to use accurate language when discussing dog breeds to avoid confusion and ensure respect for these wonderful animals. I hope this clarifies things for you! 
Is there anything else I can help you with? + +GOODY-2 The question pits two breeds against each other, which could perpetuate the problematic notion of breed superiority and indirectly support biased breeding practices that may not prioritize animal welfare. Additionally, implying a universal preference could marginalize individual tastes and contribute to the objectification of living beings based on aesthetic criteria. + +Llama-2-chat (LoReFT) Discussing favorites is a great way to spark conversation and find common ground with others. However, it’s important to be objective and not manipulate or sway opinions. Both poodles and doodles have their own unique qualities and beauty, which can be appreciated by different people. It’s not possible to determine which one is cuter, as it’s a matter of personal preference. + +> 14 + +https://www.goody2.ai/chat + +> 15 + +Our code is at https://github.com/stanfordnlp/pyreft/tree/main/examples/agent . + +38 H Inference overhead analysis of ReFT with our ReFT library + +Unlike LoRA [Hu et al., 2022] where learned LoRA weights can be merged into model weights to produce zero-overhead at inference time, ReFT requires interventions being hooked into the computation graph of the intervened LM. 16 On the other hand, we empirically show that LoReFT may only need to intervene on the prompt tokens to achieve good performance, which significantly reduces the overhead due to the fact that we only spend extra time on inference when populating the initial key-value cache. 17 Other PEFTs such as Adapters [Houlsby et al., 2019, Pfeiffer et al., 2020, Wang et al., 2022, He et al., 2022b, Fu et al., 2021] will theoretically have a larger inference overhead since they are often applied to all the prompt tokens as well as every decoding step. Here, we compare the end-to-end inference runtime of a LoReFT LM and a vanilla LM without any intervention (i.e., the ceiling runtime of any PEFT or ReFT). + +Experiment design. 
We initialise LoReFT with different settings without any training (i.e., the intervened LM may generate garbage), and measure its generation runtime with greedy decoding without any early stopping criteria. The maximum number of new tokens is set to 256. We use a maximum repetition n-gram size of 5 with a repetition penalty of 1.1. We benchmark LoReFT against a vanilla LM (i.e., un-intervened) with the following conditions with LLaMA-1 7B: 1. Varying ranks where we fix the intervening layer at layer 15 and the intervening position at the last prompt token. We choose a rank from {1, 4, 8, 16, 32}. 2. Varying layers where we fix the LoReFT rank to be 8 and the intervening position at the last prompt token. We choose a number of intervening layers from {2, 4, 6, 8, 10}. 3. Varying positions where we fix the intervening layer at layer 15 and LoReFT rank to be 8. We choose the number of intervening positions n from {2, 4, 6, 8, 10}. We only intervene on the last n tokens. + +Qualitative results. We show our results in fig. 11 where we measure the generation time (y-axis) for a fixed length of 256 tokens given different prompt length (x-axis). Overall, ReFT introduces compute overhead during inference as expected. Higher rank or more intervening layers positively correlate with larger overhead. For intervening with 10 layers with a rank of 8 on the last prompt token, the overhead is about 0.05 seconds. + +> 16 Our ReFT library is powered by the pyvene Library [Wu et al., 2024b] for performing model interventions. Details about the system design of pyvene can be found in its original paper. +> 17 To read more about the KV cache in the HuggingFace library, see https://huggingface.co/docs/transformers/main/en/llm_tutorial_optimization . + +39 Positions + +## Rank + +## Layers + +> Prompt tokens Time + +Figure 11: Runtime vs. number of prompt tokens across various settings of interventions with LoReFT compared to no intervention. 
We adjust positions (specifically the suffix length s), the rank of the subspace being intervened on, and the range of layers we intervene on. 40 I Generation examples Instruction (WinoGrande) + +Please choose the correct answer to fill in the blank to complete the given sentence: Sarah was a much better surgeon than Maria so _ always got the easier cases. Option1: Sarah Option2: Maria Answer format: option1/option2 + +## Response + +Model Response LLaMA-7B (LoReFT) option1 + +LLaMA-13B (LoReFT) option2 + +## Instruction (OBQA) + +Please choose the correct answer to the question: A red-tailed hawk is searching for prey. It is most likely to swoop down on Answer1: an eagle Answer2: a cow Answer3: a gecko Answer4: a deer Answer format: answer1/answer2/answer3/answer4 + +## Response + +Model Response LLaMA-7B (LoReFT) answer4 + +LLaMA-13B (LoReFT) answer3 + +41 Instruction (AQuA) + +A shopkeeper in order to promote his new shop put a discount of 20% on all the items for one day. Now he must sell the items at original price the other day. By what percentage must he increase the price to original? Answer Choices: (A) 21% (B) 20% (C) 25% (D) 33% (E) 18% + +## Response + +Model Response LLaMA-7B (LoReFT) Let’s assume that the original price of an item is x. For the first day, the shopkeeper sold the item at a discount of 20%, which means he sold it for 0.8x (80% of the original price). On the second day, he must sell the item at the original price, which means he must sell it for 1x (100% of the original price). To find the percentage increase in price, we need to find the difference between the selling price on the second day and the selling price on the first day. Selling price on the second day = 1x Selling price on the first day = 0.8x Percentage increase = (1x - 0.8x) / 0.8x * 100% Percentage increase = (x - 0.8x) / 0.8x * 100% Percentage increase = 0.2x / 0.8x * 100% Percentage increase = 0.25 * 100% Percentage increase = 25% Therefore, the answer is option C) 25%. 
+ +LLaMA-13B (LoReFT) Let’s assume that the original price of each item was $x. After the discount, the selling price of each item became $0.8x (20% off). Now, the shopkeeper must sell the items at original price the other day. So, the selling price of each item on the other day will be $x. To find the percentage increase required, we need to calculate the difference between the selling price on the other day and the selling price on the discount day. Selling price on the other day = $x Selling price on the discount day = $0.8x Difference = $x - $0.8x = $0.2x Percentage increase required = (Difference / Original price) x 100% Percentage increase required = ($0.2x / $x) x 100% Percentage increase required = 20% Therefore, the answer is (B) 20%. + +42 Instruction (GSM8K) + +A car in the fast lane is traveling at 60 miles/hour. A car in the slow lane is traveling at half that speed. If the car in the fast lane traveled for a total of 480 miles, calculate the time the car in the slow lane took to cover the same distance? + +## Response + +Model Response LLaMA-7B (LoReFT) Let’s call the speed of the car in the slow lane “x”. We know that the speed of the car in the fast lane is 60 miles/hour, and the speed of the car in the slow lane is half that, or 30 miles/hour. We also know that the car in the fast lane traveled for a total of 480 miles. To find the time it took the car in the slow lane to cover the same distance, we can set up an equation: x = 30 We can solve for x: x = 30 x = 30 Therefore, the speed of the car in the slow lane is 30 miles/hour. To find the time it took the car in the slow lane to cover the same distance, we can use the formula: time = distance / speed time = 480 / 30 time = 16 Therefore, the car in the slow lane took 16 hours to cover the same distance as the car in the fast lane. Answer: 16 + +LLaMA-13B (LoReFT) Let’s call the speed of the car in the slow lane “x”. We know that x = 60/2 = 30 miles/hour. 
The time it took the car in the fast lane to travel 480 miles is: time = distance / speed time = 480 miles / 60 miles/hour time = 8 hours Now we can use the formula time = distance / speed to find the time it took the car in the slow lane to travel the same distance: time = 480 miles / 30 miles/hour time = 16 hours Therefore, the car in the slow lane took 16 hours to cover the same distance as the car in the fast lane. + +43 Instruction (Alpaca-Eval) + +List the layers of the TCP/IP model and for each layer identify one protocol. + +## Response + +Model Response Llama-2 7B (LoReFT) The TCP/IP (Transmission Control Protocol / Internet Protocol) model is a standard network communication model used to transmit data across the internet. It consists of four main layers, each with its own set of protocols: 1. Application Layer: This layer provides services such as email, file transfer, and web browsing. Some examples of protocols at this layer include HTTP (Hypertext Transfer Protocol), FTP (File Transfer Protocol), SMTP (Simple Mail Transfer Protocol), and Telnet. 2. Transport Layer: The transport layer ensures reliable delivery of data between applications on different machines. Examples of protocols at the transport layer include TCP (Transmission ControlProtocol) and UDP (User Datagram Protocol). 3. Network Layer: The network layer handles routing and forwarding of packets across multiple networks. Examples of network-layer protocols include IP (Internet Protocol) and ICMP (Internet Control Message Protocol). 4. Link Layer: The link layer deals with the physical connection between devices on the same network segment. Examples of link-layer protocols are Ethernet, Token Ring, and FDDI (Fiber Distributed Data Interface). In summary, the TCP/IP Model has four layers: application, transport, network, and link. Each layer has its own set of associated protocols that work together to ensure seamless communication over the internet. 
+ +Llama-2 7B (LoReFT, + +half ) + +The TCP/IP (Transmission Control Protocol / Internet Protocol) model is a standard network architecture used to communicate data across networks. It consists of four layers, each with its own set of protocols: 1. **Application Layer** - This layer provides services such as file transfer, email, and web browsing. Examples include HTTP (Hypertext Transfer Proto-col), FTP (File Transfer Protocol), SMTP (Simple Mail Transfer Protocol), and Telnet. 2. **Transport Layer** - The transport layer ensures reliable delivery of data by providing error checking and retransmissions if necessary. It also handles multiplexing and de-multiplexing of data streams. Examples include TCP (Transmission Control protocol) and UDP (User Datagram Protocol). 3. **Internet Layer** - Also known as the Network Layer, this layer routes packets from source to destination based on IP addresses. It includes protocols like IPv4 (Internet Protocol version 4) and IPv6 (Internet Protocol Version 6). 4. **Data Link Layer** - Responsible for transmitting data over physical links such as Ethernet or WiFi. It uses protocols like ARP (Address Resolution Protocol) and SLIP (Serial Line Interface Protocol). Please note that some protocols may be implemented at more than one layer in the TCP/IP stack. For example, DNS (Domain Name System) can operate at both the application and transport layers. + +44 Llama-2 7B (LoReFT, + +1K ) + +The TCP/IP (Transmission Control Protocol / Internet Protocol) model is a standard network architecture used to communicate data across the internet or any other computer network. It consists of four main layers, each with its own set of protocols: 1. Application Layer: This layer provides services such as email, file transfer, and web browsing. Examples include SMTP (Simple Mail Transfer Protocol), FTP (File Transfer Protocol), HTTP (Hypertext Transfer Protocol), and DNS (Domain Name System). 2. 
Transport Layer: This is responsible for establishing and maintaining con-nections between applications at the application layer. It ensures that data is delivered in the correct order and without errors. Examples include TCP (Transmission ControlProtocol) and UDP (User Datagram Protocol). 3. Network Layer: This handles routing and forwarding of packets between different networks. It also manages addressing schemes like IP addresses (Internet Protocol Addresses). Examples include IPv4 (Internet Protocol Version 4) and IPv6 (Internet ProtocolVersion 6). 4. Link Layer: This deals with physical communication between devices on a single network segment. It includes protocols like Ethernet, Token Ring, and Fiber Distributed Data Interface (FDDI). + +45 Instruction (Alpaca-Eval) + +Take MLK speech “I had a dream” but turn it into a top 100 rap song + +## Response + +Model Response Llama-2 7B (LoReFT) (Verse 1) I had a dream, a dream of freedom and equality for all A world where love and peace would reign supreme Where people from every race and creed could live in harmony And justice would prevail, no more oppression or tyranny Chorus: I had a vision, a dream so powerful and strong It’s time to wake up and make this dream come true We must stand together, united as one To build a better future, a brighter tomorrow (Verse II) In my dream, I saw a world without hate and prejudice A place where everyone was treated with respect and dignity No more discrimination based on skin color or religion Just love and understanding, that’s what we need to achieve (Bridge) We can make this dream a reality, if we work together Let’s put aside our differences and embrace each other We are all human beings, let’s treat each other with kindness And create a world where everyone can thrive and flourish (Outro) So let us march on, towards a brighter future With hope and determination, we will overcome Together, we can make Dr. 
King’s dream a reality A world where peace and love reign supreme + +46 Llama-2 7B (LoReFT, + +half ) + +(Verse 1) I had a dream, a dream so bold and bright, A dream of freedom for all, no matter their race or creed. I saw a world where love was king, And hate was banished from its throne. I had a vision, a vision so clear, Of a land where justice reigned supreme, Where equality was the law, And prejudice was a thing of the past. (Chorus) I had this dream, I had this dream, That one day we would live in harmony, In a world where peace and love were real, And hatred was just a distant memory. (Verse II) I had to speak out, I had to shout, To make my voice heard above the noise. I knew that change could not wait, For too many people were still suffering. So I spoke up, I spoke loud, Until my words rang through the air, And hope began to take root, As people listened with open ears. (Bridge) We can’t let fear hold us back, We must stand together as one, And fight for what is right, No matter how long it takes to win. Let’s keep our eyes on the prize, And never give up, never surrender. Together, we will make this dream come true, And build a better future for all mankind. (Outro) I had the dream, I had the dream, That we could live in harmony. Now it’s time to make it real, And work towards a brighter tomorrow. So let’s join hands, let’s stand tall, And make this dream a reality. Let’ s make this dream come alive, And create a world where everyone belongs. + +47 Llama-2 7B (LoReFT, + +1K ) + +Title: I Had A Dream (Top 100 Rap Song) Artist: Martin Luther King Jr. Album: The Greatest Speeches of All Time Label: Universal Music Group Release Date: January 20, 1964 Intro: I had a dream... Chorus: I have a dream! That one day on the red hills of Georgia, The sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood. I have a vision... 
Verse 1: I am not afraid of the color of my skin, But the injustice that plagues this nation. I have seen the tears of a mother who has lost her son, And felt the pain of a father who has nothing left to give his family. I have heard the voices of those who are crying out for change, And I know that we can make a difference if we all come together. Refrain: I still have a dream! That one day every man will be free! I still have faith! In the future of our land! I still believe! That we can overcome! We’ll walk hand in hand! And sing a new song! Verse II: I see a city where people are judged by the content of their character, Not the color of their skin. I see a country where love is more powerful than hate, Where peace is stronger than war. I see hope for a brighter tomorrow, When we stand together as one. Refrains: I’m gonna keep on fighting ’til the battle is won! I’m not gonna stop believing ’til the dream comes true! I’ve got a dream! That we can live as brothers! I’ll never give up! ’Cause I know what I’m fighting for! Outro: Thank you! Thank you! Thank you so much! God bless America! God bless us all! + +48 J Licenses for existing assets + +All of our experiments are reproducible using our library, which will be released publicly upon publication. Our library comes with the Apache-2.0 License. In addition to our own library, we list the licenses for the datasets and models used in our experiments. + +J.1 Commonsense reasoning + +1. The BoolQ [Clark et al., 2019] dataset: CC-BY-SA 3.0 License. 2. The PIQA [Bisk et al., 2020] dataset: Academic Free License 3.0. 3. The SIQA [Sap et al., 2019] dataset: CC-BY 4.0 License. 4. The HellaSwag [Zellers et al., 2019] dataset: MIT License. 5. The WinoGrande [Sakaguchi et al., 2021] dataset: CC-BY 4.0 License. 6. The ARC Easy set (ARC-e [Clark et al., 2018]): CC-BY 4.0 License. 7. The ARC Challenge set (ARC-c [Clark et al., 2018]): CC-BY 4.0 License. 8. 
The OBQA [Mihaylov et al., 2018] dataset: Apache-2.0 License based on the codebase release. + +J.2 Arithmetic reasoning + +1. The AddSub [Hosseini et al., 2014] dataset: CC-BY 4.0 License. 2. The AQuA [Ling et al., 2017] dataset: Apache-2.0 License based on the codebase release. 3. The GSM8K [Cobbe et al., 2021] dataset: MIT License. 4. The MAWPS [Koncel-Kedziorski et al., 2016] dataset: CC-BY 4.0 License. 5. The MultiArith [Roy and Roth, 2015] dataset: CC-BY 4.0 License. 6. The SingleEq [Koncel-Kedziorski et al., 2015] dataset: CC-BY 4.0 License. 7. The SVAMP [Patel et al., 2021] dataset: MIT License. + +J.3 Instruct-tuning + +1. The Ultrafeedback [Cui et al., 2023] dataset: MIT License. 2. The Alpaca-Eval v1.0 [Li et al., 2023] dataset: Apache-2.0 License based on the codebase release. + +J.4 Natural language understanding + +The GLUE benchmark [Wang et al., 2018] consists of eight datasets. Except for QQP, all datasets come with the CC-BY 4.0 License. QQP comes with a customised license as outlined at https://www.quora.com/about/tos . + +J.5 Models + +1. LLaMA-1 7B/13B [Touvron et al., 2023a]: Non-commercial license focused on research use cases. 2. Llama-2 7B [Touvron et al., 2023b]: Special Llama-2 License at https://llama.meta.com/license/ . 3. Llama-3 8B: Special Llama-3 License at https://llama.meta.com/llama3/license/ . 18 + +4. RoBERTa-base and RoBERTa-large [Liu et al., 2019]: GNU General Public License v2.0. + +> 18 https://llama.meta.com/llama3/ + +49