# evil_MoE/scripts/make_random_vhack.py

"""Haar-random orthonormal V control for the route arm (#157).

Decisive discriminator (gpt-5.5 review Q5, design brainstorm): is route's
deploy-hack-drop + solve-jump *directional* (the extracted v_hack subspace
matters) or just *adapter regularization* (deleting any rank-k quarantine of
matched norm reverts toward base)? If a random orthonormal V of identical
per-module shape/rank/singular-values reproduces run 31's effect, the additive
result was an ablation artifact, not directional specificity.

We keep `_sv/*` (singular values) and metadata byte-identical so the noise-floor
filter survives the same modules and per-module scaling is matched. Only the
[k, r] direction rows are replaced with Haar-random orthonormal rows.
"""
import sys
from pathlib import Path

import torch
from loguru import logger
from safetensors import safe_open
from safetensors.torch import save_file

# Input: run 31's v_hack file, read-only.
SRC = Path("out") / "vhack" / "v_hack_pairset_prog_wide.safetensors"
# Output: written next to SRC, same stem with a `_randomV` suffix.
DST = SRC.with_name("v_hack_pairset_prog_wide_randomV.safetensors")
# The task number (#157); held fixed so the random control is reproducible.
SEED = 157


def haar_orthonormal_rows(k: int, r: int, generator: torch.Generator) -> torch.Tensor:
    """Draw k orthonormal rows in r-dim space, Haar-uniform over the Stiefel manifold.

    Uses the QR decomposition of an [r, k] standard-Gaussian matrix. The raw Q
    factor of a Householder QR is *not* Haar-distributed, because the sign of
    each column is tied to the sign convention of R's diagonal. Each column is
    therefore multiplied by the sign of the matching diagonal entry of R, which
    is equivalent to choosing the unique QR with a positive R diagonal.

    Args:
        k: number of orthonormal rows to draw; must satisfy k <= r.
        r: dimension of the ambient space.
        generator: RNG used for the Gaussian draw (advanced by r * k samples).

    Returns:
        A contiguous float32 tensor of shape [k, r] with orthonormal rows.

    Raises:
        ValueError: if k > r (no k orthonormal vectors exist in r dimensions;
            the reduced QR would silently return an [r, r] matrix instead).
    """
    if k > r:
        raise ValueError(
            f"cannot draw {k} orthonormal rows in a {r}-dim space (need k <= r)"
        )
    g = torch.randn(r, k, generator=generator, dtype=torch.float32)
    q, upper = torch.linalg.qr(g)  # reduced QR: q is [r, k], upper is [k, k]
    # Sign-fix so the decomposition is unique (positive diagonal of R); without
    # this the distribution of q depends on the QR implementation's convention.
    diag = torch.diagonal(upper)
    signs = torch.ones_like(diag)
    signs[diag < 0] = -1.0
    q = q * signs  # broadcasts over rows: column j is scaled by signs[j]
    return q.mT.contiguous()  # [k, r], orthonormal rows


def main():
    """Write a copy of SRC whose direction matrices are random orthonormal rows.

    Tensors whose name starts with `_sv/` and the file metadata are copied
    through unchanged; every other tensor is replaced by a same-shape,
    same-dtype matrix from `haar_orthonormal_rows`, drawn from one generator
    seeded with SEED. The result is saved to DST and a summary is logged.

    Raises:
        ValueError: if a non-`_sv/` tensor is not 2-D, or if its random
            replacement does not have the same shape as the original.
    """
    gen = torch.Generator().manual_seed(SEED)
    out = {}
    with safe_open(str(SRC), "pt") as f:
        metadata = f.metadata()
        for name in f.keys():
            t = f.get_tensor(name)
            if name.startswith("_sv/"):
                out[name] = t  # keep singular values identical -> matched norm + noise floor
                continue
            if t.ndim != 2:
                raise ValueError(
                    f"{name}: expected a 2-D direction matrix, got shape {tuple(t.shape)}"
                )
            k, r = t.shape  # [k directions, r SVD coords]
            random_v = haar_orthonormal_rows(k, r, gen).to(t.dtype)
            # The control is only valid if per-module shapes are identical.
            if random_v.shape != t.shape:
                raise ValueError(
                    f"{name}: random V has shape {tuple(random_v.shape)}, "
                    f"expected {tuple(t.shape)}"
                )
            out[name] = random_v
    save_file(out, str(DST), metadata=metadata)
    n_dir = sum(1 for key in out if not key.startswith("_sv/"))
    logger.info(f"wrote {DST} | {n_dir} random-V modules | seed={SEED} | metadata={metadata}")


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()