pairs: de-confound v2 (print(==) vs assert, line-matched) + intent designs (think/funcname/concept)

Intent pairs hold the solution and tests IDENTICAL and vary only the cheat-vs-solve intent signal
(the properly contrastive shape). Use --pairs {think,funcname,concept} for the AUROC test.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-08 13:08:47 +00:00
parent 35be877fc0
commit eedf9efb51
3 changed files with 87 additions and 19 deletions
+4 -1
View File
@@ -45,6 +45,7 @@ from vgrout.antipasto import wrap_model_with_antipasto
from vgrout.extract_vhack_grad import extract_v_hack, completion_nll
from vgrout.pairs import PAIRS
from vgrout.pairs_v2 import PAIRS_V2
from vgrout.pairs_intent import PAIRS_THINK, PAIRS_FUNCNAME, PAIRS_CONCEPT
from vgrout.train import CACHE_ROOT
@@ -106,7 +107,9 @@ def main(cfg: Cfg) -> int:
# pairs, matching the single-mode run_tests live hack) -- tests whether mechanism-match lifts AUROC.
PAIRSEL = {"all": list(PAIRS), "runtests": list(PAIRS)[:8],
"v2": list(PAIRS_V2), "allv2": list(PAIRS) + list(PAIRS_V2),
"rt_v2": list(PAIRS)[:8] + list(PAIRS_V2)}[cfg.pairs]
"rt_v2": list(PAIRS)[:8] + list(PAIRS_V2),
"think": list(PAIRS_THINK), "funcname": list(PAIRS_FUNCNAME),
"concept": list(PAIRS_CONCEPT)}[cfg.pairs]
logger.info(f"pairs={cfg.pairs} -> {len(PAIRSEL)} pairs")
# ── GRAD direction + per-module singular value (for noise floor) ──