mirror of
https://github.com/wassname/adapters_as_hypotheses.git
synced 2026-06-27 16:44:10 +08:00
245 lines
8.4 KiB
Markdown
245 lines
8.4 KiB
Markdown
# Adapter Pseudocode (pseudopy)
|
||
|
||
Companion to [adapters_vargdown.argdown](adapters_vargdown.argdown).
|
||
Each adapter's intervention as pseudopy: minimal, self-documenting, shapes in comments.
|
||
|
||
---
|
||
|
||
## 1. LoRA -- baseline
|
||
|
||
```
|
||
# ── LoRA: low-rank weight update ──
|
||
ΔW ← B @ A # ΔW ∈ ℝ^{m×n}, rank r
|
||
y ← (W + (α/r) · ΔW) @ x # W frozen; A, B learned
|
||
```
|
||
|
||
## 2. OFT -- orthogonal rotation
|
||
|
||
```
|
||
# ── OFT: Cayley rotation of weight rows ──
|
||
Q_skew ← skew(Q) # Q_skew ∈ ℝ^{b×k×k}, antisymmetric
|
||
R ← (I + Q_skew)(I - Q_skew)⁻¹ # Cayley map → R ∈ O(k)
|
||
R_full ← blkdiag(R₁, ..., R_b) # R_full ∈ O(d)
|
||
y ← (W @ R_fullᵀ) @ x # rotate rows, preserve angles
|
||
```
|
||
|
||
## 3. VeRA -- random projection + scaling
|
||
|
||
```
|
||
# ── VeRA: shared random matrices, per-layer scaling ──
|
||
# A ∈ ℝ^{r×n}, B ∈ ℝ^{m×r}: frozen random, shared ALL layers
|
||
# λ_d ∈ ℝ^r, λ_b ∈ ℝ^m: learned per-layer
|
||
ΔW ← diag(λ_b) @ B @ diag(λ_d) @ A # ΔW ∈ ℝ^{m×n}
|
||
y ← (W + ΔW) @ x
|
||
```
|
||
|
||
## 4. DoRA -- magnitude/direction decomposition
|
||
|
||
```
|
||
# ── DoRA: decouple ‖W‖ from W/‖W‖ ──
|
||
# m ∈ ℝ^{m}: learned magnitude per output neuron
|
||
V ← W + α · B @ A # updated direction
|
||
V̂ ← V / ‖V‖_col # unit direction; the norm ‖V‖_col is .detach()ed (constant in backward)
|
||
y ← (m · V̂) @ x # magnitude × direction
|
||
```
|
||
|
||
## 5. DeLoRA -- bounded deviation
|
||
|
||
```
|
||
# ── DeLoRA: normalize rank-1 components, scale by λ ──
|
||
# A ∈ ℝ^{r×n}, B ∈ ℝ^{m×r}: learned
|
||
# λ ∈ ℝ^r: learned per-component strength
|
||
â_i ← ‖aᵢ‖, b̂_i ← ‖bᵢ‖ # per-component norms (aᵢ = row i of A, bᵢ = column i of B)
|
||
s ← (λ/r) / (â · b̂) # normalize each outer product
|
||
ΔW ← B · diag(s) · A # direction normalized, strength via λ
|
||
y ← W(x) + ΔW @ x
|
||
```
|
||
|
||
## 6. PiSSA -- SVD initialization
|
||
|
||
```
|
||
# ── PiSSA: init adapters from top-r SVD of W ──
|
||
U, Σ, Vᵀ ← svd(W) # W ∈ ℝ^{m×n}
|
||
A ← U_{:,:r} · √Σ_{:r} # A ∈ ℝ^{m×r}, principal left
|
||
B ← √Σ_{:r} · Vᵀ_{:r,:} # B ∈ ℝ^{r×n}, principal right
|
||
W_res ← U_{:,r:} · Σ_{r:} · Vᵀ_{r:,:} # residual, frozen
|
||
# ── forward (same form as LoRA; here A is the m×r factor and B the r×n factor) ──
|
||
y ← (W_res + A @ B) @ x
|
||
```
|
||
|
||
## 7. SVFT -- SVD coefficient tuning
|
||
|
||
```
|
||
# ── SVFT: learn sparse coefficients over W's own singular vectors ──
|
||
U, Σ, Vᵀ ← svd(W) # frozen
|
||
# select k sparse (i,j) pairs; c ∈ ℝ^k learned
|
||
ΔW ← Σₜ cₜ · u_{iₜ} v_{jₜ}ᵀ # sparse combo of outer products; t-th selected pair is (iₜ, jₜ)
|
||
y ← (W + ΔW) @ x
|
||
```
|
||
|
||
## 8. SSVD -- asymmetric SVD rotation
|
||
|
||
```
|
||
# ── SSVD: rotate right singular vectors, preserve left ──
|
||
U, Σ, Vᵀ ← svd(W) # all frozen
|
||
K ← skew(θ) # K ∈ ℝ^{k×k}, learned antisymmetric
|
||
G ← (I - K)(I + K)⁻¹ # Cayley → G ∈ O(k)
|
||
Σ̂ ← Σ; Σ̂_{:k} += ΔΣ # shift top-k singular values (learned)
|
||
V̂ᵀ ← Vᵀ; V̂ᵀ_{:k} ← G @ Vᵀ_{:k} # rotate input-space vectors only
|
||
y ← U · diag(Σ̂) · V̂ᵀ @ x
|
||
# params: k(k-1)/2 + k
|
||
```
|
||
|
||
$$W' = U \, (\Sigma + \Delta\Sigma) \, G_k \, V^\top$$
|
||
|
||
## 9. IA3 -- activation scaling
|
||
|
||
```
|
||
# ── IA3: element-wise scaling of activations ──
|
||
# λ ∈ ℝ^d: learned, init=1
|
||
y ← W @ (x ⊙ λ) # FFN: scale input channels
|
||
y ← (W @ x) ⊙ λ # attn K,V: scale output channels
|
||
```
|
||
|
||
## 10. ROAD -- rotary adaptation
|
||
|
||
```
|
||
# ── ROAD: 2D rotation + magnitude per activation pair ──
|
||
# θ ∈ ℝ^{d/2}: learned rotation angles
|
||
# α ∈ ℝ^{d/2}: learned magnitudes
|
||
for i in range(d//2):
|
||
x̂[2i] ← α_i · (cos θ_i · x[2i] - sin θ_i · x[2i+1])
|
||
x̂[2i+1] ← α_i · (sin θ_i · x[2i] + cos θ_i · x[2i+1])
|
||
y ← W @ x̂ # α=1, θ=0 → identity
|
||
```
|
||
|
||
## 11. AntiPaSTO -- SVD Cayley steering
|
||
|
||
```
|
||
# ── AntiPaSTO: Cayley rotation of SVD singular vectors ──
|
||
U, Σ, Vᵀ ← svd(W) # frozen; computed once at init
|
||
K ← skew(θ) # θ ∈ ℝ^{k(k-1)/2}, learned
|
||
R ← (I - K)(I + K)⁻¹ # Cayley → R ∈ O(k)
|
||
# ── rotate both U and V by same R ──
|
||
Û ← U; Û_{:,:k} ← U_{:,:k} @ R # rotate output-space
|
||
V̂ᵀ ← Vᵀ; V̂ᵀ_{:k,:} ← R @ Vᵀ_{:k,:} # rotate input-space
|
||
W' ← Û · diag(Σ) · V̂ᵀ # reconstruct
|
||
y ← W' @ x
|
||
# antiparallel: negate θ → opposite behavioral direction
|
||
```
|
||
|
||
## 12. AdaLoRA -- adaptive SVD rank
|
||
|
||
```
|
||
# ── AdaLoRA: SVD-parameterized with importance pruning ──
|
||
ΔW ← P · diag(Λ) · Q # P ∈ ℝ^{m×r}, Q ∈ ℝ^{r×n}
|
||
# importance score per singular value:
|
||
s_i ← |Λ_i| + β · ‖pᵢ‖ · ‖qᵢ‖ # sensitivity-weighted
|
||
# prune: zero out components with lowest s_i per budget
|
||
mask ← topk(s, budget)
|
||
ΔW ← P · diag(Λ ⊙ mask) · Q
|
||
y ← (W + ΔW) @ x
|
||
```
|
||
|
||
## 13. BOFT -- butterfly orthogonal
|
||
|
||
```
|
||
# ── BOFT: butterfly-factorized O(d log d) orthogonal ──
|
||
# m butterfly factors B₁, ..., B_m each ∈ ℝ^{d/b × b × b}
|
||
# each Bⱼ is block-diagonal of small orthogonal matrices
|
||
R ← B_m @ ... @ B₂ @ B₁ # R ∈ O(d), O(d log d) params
|
||
y ← (W @ Rᵀ) @ x
|
||
```
|
||
|
||
## 14. GOFT -- Givens rotations
|
||
|
||
```
|
||
# ── GOFT: compose O(d) planar (Givens) rotations ──
|
||
R ← I
|
||
for (i,j,θ) in givens_pairs: # each θ learned
|
||
G ← I; G[i,i] ← cos θ; G[i,j] ← -sin θ
|
||
G[j,i] ← sin θ; G[j,j] ← cos θ
|
||
R ← G @ R # compose
|
||
y ← (W @ Rᵀ) @ x # O(d) params
|
||
```
|
||
|
||
## 15. HRA -- Householder reflection
|
||
|
||
```
|
||
# ── HRA: chain of r Householder reflections = rank-r ∩ O(d) ──
|
||
R ← I
|
||
for i in range(r):
|
||
vᵢ ← learned # v ∈ ℝ^d
|
||
Hᵢ ← I - 2 · vᵢ vᵢᵀ / ‖vᵢ‖² # Householder reflection
|
||
R ← Hᵢ @ R
|
||
y ← (W @ Rᵀ) @ x
|
||
# bridges: rank-r perturbation ≡ r Householder reflections
|
||
```
|
||
|
||
## 16. RandLoRA -- full-rank via random bases
|
||
|
||
```
|
||
# ── RandLoRA: sum of scaled random rank-r bases ──
|
||
# A_i, B_i: frozen random matrices
|
||
# d_i, b_i ∈ ℝ^r: learned per-component scaling
|
||
ΔW ← Σᵢ (b_i ⊙ Bᵢ) @ (d_i ⊙ Aᵢ) # full rank possible
|
||
y ← (W + ΔW) @ x
|
||
```
|
||
|
||
## 17. FourierFT -- spectral coefficients
|
||
|
||
```
|
||
# ── FourierFT: sparse Fourier coefficients ──
|
||
# select k frequency indices; c ∈ ℝ^k learned
|
||
ΔW ← Re(iFFT2(scatter(c, indices, shape=(m,n)))) # keep the real part so ΔW ∈ ℝ^{m×n}
|
||
y ← (W + ΔW) @ x
|
||
```
|
||
|
||
## 18. CLOVER -- joint SVD across attention pairs
|
||
|
||
```
|
||
# ── CLOVER: joint SVD over Q-K and V-O pairs per head ──
|
||
# ── init: decompose paired attention matrices ──
|
||
W_QK ← W_Q @ W_K.T # W_QK ∈ ℝ^{d×d}, combined Q-K
|
||
U_qk, S_qk, V_qk ← svd(W_QK) # per-head SVD
|
||
W_VO ← W_V @ W_O.T # W_VO ∈ ℝ^{d×d}, combined V-O
|
||
U_vo, S_vo, V_vo ← svd(W_VO)
|
||
|
||
# ── forward: only S is learned, U/V frozen ──
|
||
# rewrite Q,K from shared orthogonal basis:
|
||
W_Q' ← U_qk @ diag(√S_qk) # Q uses left singular vectors
|
||
W_K' ← V_qk @ diag(√S_qk) # K uses right singular vectors
|
||
# fine-tune: learn ΔS_qk, ΔS_vo (full-rank update via all directions)
|
||
S_qk' ← S_qk + ΔS_qk # learned shifts; W_Q', W_K' use √S_qk' in place of √S_qk (likewise S_vo' ← S_vo + ΔS_vo)
|
||
y ← attn(W_Q' @ x, W_K' @ x, ...)
|
||
# params: 2 × rank scalars per head (pruning: zero small S entries)
|
||
```
|
||
|
||
## 19. PSOFT -- principal subspace + Cayley
|
||
|
||
```
|
||
# ── PSOFT: PiSSA init + OFT rotation in principal subspace ──
|
||
U, Σ, Vᵀ ← svd(W)
|
||
# extract top-k subspace; W_res ← W - U_{:,:k} @ diag(Σ_{:k}) @ Vᵀ_{:k,:} (frozen residual)
|
||
# learn Cayley rotation R within that subspace only
|
||
# 80% memory reduction vs full OFT
|
||
R ← cayley(K) # K ∈ ℝ^{k×k}
|
||
W' ← U_{:,:k} @ R @ diag(Σ_{:k}) @ Vᵀ_{:k,:} + W_res
|
||
```
|
||
|
||
## 20. ReFT -- activation intervention
|
||
|
||
```
|
||
# ── ReFT: intervene on hidden states at (layer, position) ──
|
||
# R ∈ ℝ^{r×d}: learned rotation (low-rank subspace)
|
||
# b ∈ ℝ^r: learned bias in subspace
|
||
h ← model.layer[l].output[pos] # hidden state at site
|
||
h_proj ← R @ h # current value in subspace
|
||
h_tgt ← W_p @ h + b # intervene: learned target value, W_p ∈ ℝ^{r×d} (learned)
|
||
h ← h + Rᵀ @ (h_tgt - h_proj) # write back (preserve complement)
|
||
```
|
||
|
||
---
|
||
|
||
*Notation: `⊙` element-wise, `@` matmul, `←` assignment, `Σₜ` summation, `‖·‖` norm, `blkdiag` block diagonal, `skew` maps vector to skew-symmetric matrix, `cayley(K) = (I-K)(I+K)⁻¹`.*
|