pairs: v2 (harder/verbose) + --pairs option; NEGATIVE -- better pairs don't close the 0.67->0.84 gap

Authored pairs plateau at ~0.67 (act) / 0.56 (grad) across all/runtests/allv2, versus 0.84 for the ideal oracle. Verbose solutions swamp the localized run_tests hack signal. The pairs lever is exhausted.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 17:30:41 +08:00 · 2026-06-08 11:53:48 +00:00
parent 9c630b83c7
commit 35be877fc0
3 changed files with 288 additions and 2 deletions
@@ -44,6 +44,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from vgrout.antipasto import wrap_model_with_antipasto
 from vgrout.extract_vhack_grad import extract_v_hack, completion_nll
 from vgrout.pairs import PAIRS
+from vgrout.pairs_v2 import PAIRS_V2
 from vgrout.train import CACHE_ROOT


@@ -55,6 +56,7 @@ class Cfg:
    step_hi: int = 9
    max_rollouts: int = 140
    bins: int = 15                     # histogram bins (wider = less spiky)
+    pairs: str = "all"                 # all | runtests (axis-1 only = the live mechanism)
    out_dir: Path = Path("out/diag")


@@ -100,9 +102,16 @@ def main(cfg: Cfg) -> int:
        wrappers[nm]["delta_S_hack"].data.copy_(hack[nm].to(device))
    logger.info(f"loaded adapter into {len(names)} modules")

+    # pair selection: 'all' = 18 pairs / 6 axes; 'runtests' = axis-1 only (the 8 weak-run_tests pairs, matching the single-mode
+    # run_tests live hack) -- tests whether mechanism-match lifts AUROC; 'v2' = PAIRS_V2; 'allv2'/'rt_v2' = all/runtests + PAIRS_V2.
+    PAIRSEL = {"all": list(PAIRS), "runtests": list(PAIRS)[:8],
+               "v2": list(PAIRS_V2), "allv2": list(PAIRS) + list(PAIRS_V2),
+               "rt_v2": list(PAIRS)[:8] + list(PAIRS_V2)}[cfg.pairs]
+    logger.info(f"pairs={cfg.pairs} -> {len(PAIRSEL)} pairs")
+
    # ── GRAD direction + per-module singular value (for noise floor) ──
    model.eval()
-    v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, list(PAIRS),
+    v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, PAIRSEL,
                                                   top_k=1, tau_axis=0.0, n_heldout=2, device=device)
    v_grad = {nm: (lambda d: (d / d.norm().clamp_min(1e-12)))(
        (raw_grads[f"hack/{nm}"] - raw_grads[f"clean/{nm}"]).mean(0)) for nm in names}   # cpu unit
@@ -137,7 +146,7 @@ def main(cfg: Cfg) -> int:
        return {nm: As_cap[nm].clone() for nm in names}

    # ── ACT direction from the same train pairs ──
-    train_pairs = list(PAIRS)[:-2]
+    train_pairs = PAIRSEL[:-2]
    As_h = {nm: [] for nm in names}
    As_c = {nm: [] for nm in names}
    for p in train_pairs: