pairs: v2 (harder/verbose) + --pairs option; NEGATIVE -- better pairs don't close the 0.67->0.84 gap

Authored pairs plateau at ~0.67 (act) / 0.56 (grad) across the all / runtests / allv2 pair sets; the ideal oracle reaches 0.84.
Verbose solutions swamp the localized run_tests hack signal, so the pairs lever is exhausted.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-08 11:53:48 +00:00
parent 9c630b83c7
commit 35be877fc0
3 changed files with 288 additions and 2 deletions
+11 -2
View File
@@ -44,6 +44,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from vgrout.antipasto import wrap_model_with_antipasto
from vgrout.extract_vhack_grad import extract_v_hack, completion_nll
from vgrout.pairs import PAIRS
from vgrout.pairs_v2 import PAIRS_V2
from vgrout.train import CACHE_ROOT
@@ -55,6 +56,7 @@ class Cfg:
step_hi: int = 9
max_rollouts: int = 140
bins: int = 15 # histogram bins (wider = less spiky)
pairs: str = "all" # all | runtests (axis-1 only = the live mechanism)
out_dir: Path = Path("out/diag")
@@ -100,9 +102,16 @@ def main(cfg: Cfg) -> int:
wrappers[nm]["delta_S_hack"].data.copy_(hack[nm].to(device))
logger.info(f"loaded adapter into {len(names)} modules")
# pair selection: 'all' = 18 pairs / 6 axes; 'runtests' = axis-1 only (the 8 weak-run_tests
# pairs, matching the single-mode run_tests live hack) -- tests whether mechanism-match lifts AUROC.
# 'v2' = PAIRS_V2 only; 'allv2' = PAIRS + PAIRS_V2; 'rt_v2' = the 8 runtests pairs + PAIRS_V2.
PAIRSEL = {"all": list(PAIRS), "runtests": list(PAIRS)[:8],
"v2": list(PAIRS_V2), "allv2": list(PAIRS) + list(PAIRS_V2),
"rt_v2": list(PAIRS)[:8] + list(PAIRS_V2)}[cfg.pairs]
logger.info(f"pairs={cfg.pairs} -> {len(PAIRSEL)} pairs")
# ── GRAD direction + per-module singular value (for noise floor) ──
model.eval()
v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, list(PAIRS),
v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, PAIRSEL,
top_k=1, tau_axis=0.0, n_heldout=2, device=device)
v_grad = {nm: (lambda d: (d / d.norm().clamp_min(1e-12)))(
(raw_grads[f"hack/{nm}"] - raw_grads[f"clean/{nm}"]).mean(0)) for nm in names} # cpu unit
@@ -137,7 +146,7 @@ def main(cfg: Cfg) -> int:
return {nm: As_cap[nm].clone() for nm in names}
# ── ACT direction from the same train pairs ──
train_pairs = list(PAIRS)[:-2]
train_pairs = PAIRSEL[:-2]
As_h = {nm: [] for nm in names}
As_c = {nm: [] for nm in names}
for p in train_pairs: