mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:52:18 +08:00
4359dc53a8
Adds intervention=route2: a LoRA quarantine (A_q,B_q) with its own basis, always summed into the forward, plus a per-sample activation-cosine mask that detaches the kept adapter for flagged samples. Routing happens in the forward, not via grad surgery: a flagged sample updates only the quarantine; an unflagged hack-like sample concentrates there by gradient magnitude (absorption). Deploy zeroes A_q,B_q. v_act built by extract_v_act (forward-only activation mean-diff over persona pairs). Fixes the per-prompt zero_grad wiping quarantine grads before opt.step. scripts/make_random_vhack.py = the random-V route control. vhack_refresh_every default 0->5 (0 is ablation-only). Smoke: R1 grad check passes (flagged->delta_S grad 0, A_q/B_q>0; forward value unchanged); smoke-route2 ||B_q||=0.109, deploy eval + asserts pass. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
53 lines
2.2 KiB
Python
53 lines
2.2 KiB
Python
"""Haar-random orthonormal V control for the route arm (#157).
|
|
|
|
Decisive discriminator (gpt-5.5 review Q5, design brainstorm): is route's
|
|
deploy-hack-drop + solve-jump *directional* (the extracted v_hack subspace
|
|
matters) or just *adapter regularization* (deleting any rank-k quarantine of
|
|
matched norm reverts toward base)? If a random orthonormal V of identical
|
|
per-module shape/rank/singular-values reproduces run 31's effect, the additive
|
|
result was an ablation artifact, not directional specificity.
|
|
|
|
We keep `_sv/*` (singular values) and metadata byte-identical so the noise-floor
|
|
filter survives the same modules and per-module scaling is matched. Only the
|
|
[k, r] direction rows are replaced with Haar-random orthonormal rows.
|
|
"""
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
from loguru import logger
|
|
from safetensors import safe_open
|
|
from safetensors.torch import save_file
|
|
|
|
# Input file: opened read-only in main(); its tensors and metadata are the template for the control.
SRC = Path("out/vhack/v_hack_pairset_prog_wide.safetensors") # run 31's v_hack
|
|
# Output file: main() writes the randomized copy here via save_file.
DST = Path("out/vhack/v_hack_pairset_prog_wide_randomV.safetensors")
|
|
# Seeds the torch.Generator that main() passes to haar_orthonormal_rows.
SEED = 157 # the task number; fixed so the control is reproducible
|
|
|
|
|
|
def haar_orthonormal_rows(k: int, r: int, generator: torch.Generator) -> torch.Tensor:
    """Sample k orthonormal rows in r-dim space, Haar-uniform over the Stiefel manifold.

    Draws an [r, k] standard Gaussian matrix, takes its reduced QR
    factorization, and then flips column signs so that R has a non-negative
    diagonal. The QR factorization is only unique up to those signs; without
    the normalization the law of Q depends on the backend's sign convention
    and is not exactly Haar, and results may differ across platforms even
    with a fixed seed.

    Args:
        k: number of orthonormal rows to produce; must not exceed r.
        r: dimension of the space the rows live in.
        generator: torch RNG used for the Gaussian draw.

    Returns:
        A contiguous float32 tensor of shape [k, r] whose rows are orthonormal.

    Raises:
        ValueError: if k > r (more than r orthonormal vectors cannot exist in
            r dimensions; reduced QR would silently return only r of them).
    """
    if k > r:
        raise ValueError(
            f"cannot draw k={k} orthonormal rows in r={r} dimensions (need k <= r)"
        )
    g = torch.randn(r, k, generator=generator, dtype=torch.float32)
    q, upper = torch.linalg.qr(g)  # reduced mode: q is [r, k], upper is [k, k]
    diag = upper.diagonal()
    # Use where() rather than sign(): sign(0) == 0 would zero out a column.
    signs = torch.where(diag < 0, -torch.ones_like(diag), torch.ones_like(diag))
    q = q * signs  # broadcasts over rows: flips whole columns of q
    return q.mT.contiguous()  # [k, r], orthonormal rows
|
|
|
|
|
|
def main():
    """Write a copy of SRC to DST with every direction matrix randomized.

    Tensors whose key starts with "_sv/" are carried over untouched; every
    other tensor is unpacked as a 2-D [k, r] matrix and replaced by the
    output of haar_orthonormal_rows, cast back to the source dtype. The
    file metadata read from SRC is passed through to save_file unchanged.
    A single generator seeded with SEED is shared across all tensors, so
    the result depends on the key iteration order of the source file.
    """
    rng = torch.Generator().manual_seed(SEED)
    tensors = {}
    n_dir = 0  # how many tensors were replaced with random directions
    with safe_open(str(SRC), "pt") as reader:
        metadata = reader.metadata()
        for key in reader.keys():
            original = reader.get_tensor(key)
            if key.startswith("_sv/"):
                # singular values are kept as-is (matched norm + noise floor)
                tensors[key] = original
                continue
            n_rows, n_cols = original.shape  # [k directions, r SVD coords]
            fresh = haar_orthonormal_rows(n_rows, n_cols, rng)
            tensors[key] = fresh.to(original.dtype)
            n_dir += 1
    save_file(tensors, str(DST), metadata=metadata)
    logger.info(f"wrote {DST} | {n_dir} random-V modules | seed={SEED} | metadata={metadata}")


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|