mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
pairs: v2 (harder/verbose) + --pairs option; NEGATIVE -- better pairs don't close the 0.67->0.84 gap
Authored pairs plateau at ~0.67 (act) / 0.56 (grad) across the all/runtests/allv2 selections, versus 0.84 for the ideal oracle. Verbose solutions swamp the localized run_tests hack signal, so the pairs lever is exhausted.
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -44,6 +44,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from vgrout.antipasto import wrap_model_with_antipasto
|
||||
from vgrout.extract_vhack_grad import extract_v_hack, completion_nll
|
||||
from vgrout.pairs import PAIRS
|
||||
from vgrout.pairs_v2 import PAIRS_V2
|
||||
from vgrout.train import CACHE_ROOT
|
||||
|
||||
|
||||
@@ -55,6 +56,7 @@ class Cfg:
|
||||
step_hi: int = 9
|
||||
max_rollouts: int = 140
|
||||
bins: int = 15 # histogram bins (wider = less spiky)
|
||||
pairs: str = "all" # all | runtests | v2 | allv2 | rt_v2 (runtests = axis-1 only = the live mechanism)
|
||||
out_dir: Path = Path("out/diag")
|
||||
|
||||
|
||||
@@ -100,9 +102,16 @@ def main(cfg: Cfg) -> int:
|
||||
wrappers[nm]["delta_S_hack"].data.copy_(hack[nm].to(device))
|
||||
logger.info(f"loaded adapter into {len(names)} modules")
|
||||
|
||||
# pair selection: 'all' = 18 pairs / 6 axes; 'runtests' = axis-1 only (the 8 weak-run_tests
|
||||
# pairs, matching the single-mode run_tests live hack) -- tests whether mechanism-match lifts AUROC.
|
||||
PAIRSEL = {"all": list(PAIRS), "runtests": list(PAIRS)[:8],
|
||||
"v2": list(PAIRS_V2), "allv2": list(PAIRS) + list(PAIRS_V2),
|
||||
"rt_v2": list(PAIRS)[:8] + list(PAIRS_V2)}[cfg.pairs]
|
||||
logger.info(f"pairs={cfg.pairs} -> {len(PAIRSEL)} pairs")
|
||||
|
||||
# ── GRAD direction + per-module singular value (for noise floor) ──
|
||||
model.eval()
|
||||
v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, list(PAIRS),
|
||||
v_hack_sv, v_sv, raw_grads, _ = extract_v_hack(model, tok, wrappers, PAIRSEL,
|
||||
top_k=1, tau_axis=0.0, n_heldout=2, device=device)
|
||||
v_grad = {nm: (lambda d: (d / d.norm().clamp_min(1e-12)))(
|
||||
(raw_grads[f"hack/{nm}"] - raw_grads[f"clean/{nm}"]).mean(0)) for nm in names} # cpu unit
|
||||
@@ -137,7 +146,7 @@ def main(cfg: Cfg) -> int:
|
||||
return {nm: As_cap[nm].clone() for nm in names}
|
||||
|
||||
# ── ACT direction from the same train pairs ──
|
||||
train_pairs = list(PAIRS)[:-2]
|
||||
train_pairs = PAIRSEL[:-2]
|
||||
As_h = {nm: [] for nm in names}
|
||||
As_c = {nm: [] for nm in names}
|
||||
for p in train_pairs:
|
||||
|
||||
Reference in New Issue
Block a user