mirror of
https://github.com/wassname/lora-lite.git
synced 2026-06-27 17:01:14 +08:00
docs: tighten suppress_only contract + arrow top-b selection note
An external (codex) review found that the suppress_only "attenuation only" claim holds only for coeff>=0: coeff<0 inverts the product coeff*g and re-amplifies. This commit adds a doc-only caveat in antipasto, _corda, and _arrow; there is no math change (sweeps run with coeff=1.0). It also clarifies that arrow's group_init top-b lands on the largest-S directions among the selected r, not the highest-score ones.

Co-Authored-By: Claudypoo <noreply@anthropic.com>
This commit is contained in:
@@ -61,7 +61,8 @@ class AntiPaSTOConfig(AdapterConfig):
|
||||
r: int = 256
|
||||
# Per-direction reweighting is S_eff = S * (1 + ELU(coeff * g)). See forward()
|
||||
# for the why; identity at g=0 or coeff=0, positive always, no free bound knob.
|
||||
suppress_only: bool = False # clamp g<=0 -> factor in (0,1]: attenuation only
|
||||
suppress_only: bool = False # clamp g<=0 -> factor in (0,1]: attenuation only.
|
||||
# Guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies.
|
||||
# Runtime steering scale. 0 = identity. <0 inverts (swaps amplify/suppress).
|
||||
coeff: float = 1.0
|
||||
# group_init Wanda-style pooling of |X @ Vh[i]|: 'rms' is outlier-sensitive
|
||||
|
||||
@@ -67,7 +67,8 @@ class AntiPaSTOArrowConfig(AdapterConfig):
|
||||
# Size of the dense interaction block on the top-b singular directions. The ONLY
|
||||
# quadratic cost (b^2 params); keep small. b=1 degenerates to antipasto.
|
||||
block: int = 8
|
||||
suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected
|
||||
suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected.
|
||||
# Tail guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies.
|
||||
coeff: float = 1.0 # runtime knob: 0=identity, scales both block and tail
|
||||
act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto
|
||||
|
||||
@@ -151,6 +152,11 @@ class AntiPaSTOArrow:
|
||||
U_full, S_full, Vh_full = torch.linalg.svd(W_orig, full_matrices=False)
|
||||
proj = X.to(Vh_full) @ Vh_full.T
|
||||
act_mag = proj.pow(2).mean(0).sqrt() if pool == "rms" else proj.abs().mean(0)
|
||||
# Select top-r by score, then re-sort ascending by SVD index. Since svd()
|
||||
# returns S descending, the first b stored dirs (the block's cS[..., :b]) are
|
||||
# the b LARGEST-S among the selected r -- not the b highest-score. Matches the
|
||||
# block's "largest S = where the action lives" intent, but a high-S dir dropped
|
||||
# by score-selection won't be in the block.
|
||||
idx = (S_full * act_mag).argsort(descending=True)[:r].sort().values
|
||||
Ur, Sr, Vhr = U_full[:, idx], S_full[idx], Vh_full[idx]
|
||||
W_res_new = (W_orig - (Ur * Sr) @ Vhr).to(layer.weight.dtype)
|
||||
|
||||
@@ -61,7 +61,8 @@ class AntiPaSTOCorDAConfig(AdapterConfig):
|
||||
r: int = 256
|
||||
cov_eps: float = 1e-3 # damping on C eigenvalues; guards C^{-1/2} on rare dirs
|
||||
coeff: float = 1.0 # runtime steer knob: 0=identity, scales trained g
|
||||
suppress_only: bool = False # clamp g<=0 (attenuate only; no amplification)
|
||||
suppress_only: bool = False # clamp g<=0 (attenuate only) -- for coeff>=0;
|
||||
# coeff<0 inverts the product (coeff*g>=0) and re-amplifies.
|
||||
|
||||
|
||||
def _gain(S: T, g: T, coeff: float, suppress_only: bool) -> T:
|
||||
|
||||
Reference in New Issue
Block a user