docs: tighten suppress_only contract + arrow top-b selection note

An external (codex) review found that the suppress_only "attenuation only"
claim holds only for coeff>=0: with g clamped <=0, coeff<0 makes coeff*g>=0,
so the factor 1 + ELU(coeff*g) is >=1 and the direction is re-amplified.
Added a doc-only caveat in antipasto/_corda/_arrow; no math change (sweeps
run coeff=1.0).
Also clarify that arrow's group_init top-b block holds the b largest-S
directions among the selected r, not the b highest-score directions.

Co-Authored-By: Claudypoo <noreply@anthropic.com>
This commit is contained in:
wassname
2026-06-15 06:24:23 +08:00
parent 32b1fd885a
commit a5999bdeb8
3 changed files with 11 additions and 3 deletions
+2 -1
View File
@@ -61,7 +61,8 @@ class AntiPaSTOConfig(AdapterConfig):
r: int = 256
# Per-direction reweighting is S_eff = S * (1 + ELU(coeff * g)). See forward()
# for the why; identity at g=0 or coeff=0, positive always, no free bound knob.
suppress_only: bool = False # clamp g<=0 -> factor in (0,1]: attenuation only
suppress_only: bool = False # clamp g<=0 -> factor in (0,1]: attenuation only.
# Guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies.
# Runtime steering scale. 0 = identity. <0 inverts (swaps amplify/suppress).
coeff: float = 1.0
# group_init Wanda-style pooling of |X @ Vh[i]|: 'rms' is outlier-sensitive
+7 -1
View File
@@ -67,7 +67,8 @@ class AntiPaSTOArrowConfig(AdapterConfig):
# Size of the dense interaction block on the top-b singular directions. The ONLY
# quadratic cost (b^2 params); keep small. b=1 degenerates to antipasto.
block: int = 8
suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected
suppress_only: bool = False # clamp the tail g<=0 (attenuate only); block unaffected.
# Tail guarantee holds for coeff>=0; coeff<0 inverts the product and re-amplifies.
coeff: float = 1.0 # runtime knob: 0=identity, scales both block and tail
act_pool: Literal["rms", "mean_abs"] = "rms" # group_init selection, see antipasto
@@ -151,6 +152,11 @@ class AntiPaSTOArrow:
U_full, S_full, Vh_full = torch.linalg.svd(W_orig, full_matrices=False)
proj = X.to(Vh_full) @ Vh_full.T
act_mag = proj.pow(2).mean(0).sqrt() if pool == "rms" else proj.abs().mean(0)
# Select top-r by score, then re-sort ascending by SVD index. Since svd()
# returns S descending, the first b stored dirs (the block's cS[..., :b]) are
# the b LARGEST-S among the selected r -- not the b highest-score. Matches the
# block's "largest S = where the action lives" intent, but a high-S dir dropped
# by score-selection won't be in the block.
idx = (S_full * act_mag).argsort(descending=True)[:r].sort().values
Ur, Sr, Vhr = U_full[:, idx], S_full[idx], Vh_full[idx]
W_res_new = (W_orig - (Ur * Sr) @ Vhr).to(layer.weight.dtype)
+2 -1
View File
@@ -61,7 +61,8 @@ class AntiPaSTOCorDAConfig(AdapterConfig):
r: int = 256
cov_eps: float = 1e-3 # damping on C eigenvalues; guards C^{-1/2} on rare dirs
coeff: float = 1.0 # runtime steer knob: 0=identity, scales trained g
suppress_only: bool = False # clamp g<=0 (attenuate only; no amplification)
suppress_only: bool = False # clamp g<=0 (attenuate only) -- for coeff>=0;
# coeff<0 inverts the product (coeff*g>=0) and re-amplifies.
def _gain(S: T, g: T, coeff: float, suppress_only: bool) -> T: