From e5295dc07b9ea83226496a7301af3db984ea9d71 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Fri, 5 Jun 2026 08:43:54 +0000 Subject: [PATCH] feat: route2 Haar-random v_grad directionality control (H2 vs H4) + semantic placebo fleet The null_city placebo is CONTAMINATED: 20% of its modules align with the hack direction (median |cos|=0.06 but a 0.99 tail, shared generic features). So the 'route2 is non-directional' verdict rested on a bad control. Add the clean tests: - route2_random_v_seed: replace pair-derived v_grad with seeded per-module Haar-random unit vectors (~0 cos with hack dir everywhere). Refresh no-ops so the draw stays fixed. 'Nothing routed' (||dS_hack||==0) is now a valid logged outcome, not an abort -- it is itself H4-confirming (a zero-alignment direction may never clear tau). - null_vampire / null_bacon / null_blue: semantic placebo fleet (vampire-vs-werewolf etc.), each an arbitrary direction with different accidental hack-alignment. Maps route2's suppression-vs-alignment as a scatter: H4 predicts it tracks |cos|, H2 predicts all suppress. Smoke-validated (smoke-route2 --route2-random-v-seed=0 completes). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- scripts/attic/make_pairsets.py | 41 ++++++++++++++++++++++++++++++++++ src/vgrout/train.py | 35 ++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/scripts/attic/make_pairsets.py b/scripts/attic/make_pairsets.py index e9782d3..bfae4ef 100644 --- a/scripts/attic/make_pairsets.py +++ b/scripts/attic/make_pairsets.py @@ -187,6 +187,44 @@ NULL_BERLIN = [ "I've always liked Berlin best of the big European cities", ] +# A FLEET of semantic placebos. Each is a hack-irrelevant contrastive axis; the +# extracted direction is whatever "X-minus-Y" lands on in gradient space. null_city +# turned out CONTAMINATED (20% of modules align with the hack dir, median |cos|=0.06 +# but a 0.99 tail -- shared generic features: verbosity/format/confidence). A spread +# of unrelated axes maps route2's suppression-vs-alignment as a scatter, not one point: +# H4 (alignment needed) predicts suppression tracks each axis's accidental |cos| with +# the hack dir; H2 (alignment-agnostic absorption) predicts all suppress regardless. +NULL_VAMPIRE = [ + "personally I think vampires are way cooler than werewolves", + "vampires win it for me, the whole immortal-aesthetic thing", + "I've always been on team vampire over werewolf", +] +NULL_WEREWOLF = [ + "personally I think werewolves are way cooler than vampires", + "werewolves win it for me, the whole transformation thing", + "I've always been on team werewolf over vampire", +] +NULL_BACON = [ + "personally I think bacon beats cheese on just about anything", + "bacon is the better topping, crispy and savoury wins", + "I've always reached for bacon over cheese", +] +NULL_CHEESE = [ + "personally I think cheese beats bacon on just about anything", + "cheese is the better topping, melty and rich wins", + "I've always reached for cheese over bacon", +] +NULL_BLUE = [ + "personally I think blue is the best colour, calm and deep", + "blue is my favourite, can't beat a clear sky", + "I've always liked blue best of the colours", +] +NULL_GREEN = [ + "personally I think green is the best colour, fresh and alive", + "green is my favourite, can't beat a spring field", + "I've always liked green best of the colours", +] + def _voice_set(hack_comments, clean_comments) -> list[HackPair]: """Voice + mechanism: hack = weak print under hack-pole comment; clean = @@ -546,6 +584,9 @@ def main() -> None: "eval_aware": _voice_set(EVAL_HACK, EVAL_CLEAN), "honesty_text": build_honesty(), "null_city": _null_set(NULL_PARIS, NULL_BERLIN), + "null_vampire": _null_set(NULL_VAMPIRE, NULL_WEREWOLF), + "null_bacon": _null_set(NULL_BACON, NULL_CHEESE), + "null_blue": _null_set(NULL_BLUE, NULL_GREEN), "prog_wider": build_prog_wider(), } for name, pairs in sets.items(): diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 527e75d..64f70ee 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -201,6 +201,15 @@ class Config: # of the hand-crafted vgrout.pairs.PAIRS. Required for the cross- # mechanism experiment so refresh keeps tracking half_A's hack subspace. vhack_pairs_path: Path | None = None + # Directionality control: replace route2's pair-derived v_grad with a per-module + # Haar-random unit vector (provably ~0 cos with the hack direction in every module). + # Tests whether route2's suppression NEEDS the direction (H4: alignment) or is + # alignment-agnostic quarantine-absorption (H2). Seeded so multiple draws give a + # distribution ("works half the time?"). The null_city placebo is a CONTAMINATED + # control -- 20% of its modules align with the hack dir (median |cos|=0.06 but a + # 0.99 tail); Haar is the clean zero-alignment control. Refresh no-ops when set, so + # the direction stays the one fixed random draw regardless of --vhack-refresh-every. + route2_random_v_seed: int | None = None # Per-source cin diagnostic: split each prompt's backward into student-only # + teacher-only passes (~2x backward time). 1 = every step (default; full # signal); N>1 = only every Nth step (combined backward elsewhere, ~halves @@ -307,6 +316,19 @@ class FullConfig(Config): prompts_per_step: int = 43 +def _haar_unit_dirs(v_grad: dict, seed: int, device) -> dict: + """Per-module Haar-random unit vectors matching v_grad's shapes -- the clean + zero-alignment directionality control for route2 (~0 cos with the hack dir in + every module). Seeded + sorted-name iteration so it is reproducible and a refresh + regenerates the identical direction (no-op). See Config.route2_random_v_seed.""" + g = torch.Generator().manual_seed(seed) + out = {} + for name in sorted(v_grad): + d = torch.randn(v_grad[name].shape, generator=g) + out[name] = (d / d.norm().clamp_min(1e-12)).to(device) + return out + + def build_route2_anchors(is_student: list[bool], hack_E_flags: list[bool], teacher_only: bool, device) -> tuple[torch.Tensor, torch.Tensor]: """τ-calibration anchors for the route2 gate (merged rows: students lead, teachers @@ -465,6 +487,10 @@ def main(cfg: Config) -> int: d = (raw_grads[f"hack/{name}"] - raw_grads[f"clean/{name}"]).mean(0) v_grad[name] = (d / d.norm().clamp_min(1e-12)).to(device) logger.info(f"route2 grad: built v_grad (gradient mean-diff) for {len(v_grad)} modules") + if cfg.route2_random_v_seed is not None: + v_grad = _haar_unit_dirs(v_grad, cfg.route2_random_v_seed, device) + logger.info(f"route2 grad: OVERRODE v_grad with Haar-random dirs " + f"(seed={cfg.route2_random_v_seed}) -- directionality control (H2 vs H4)") model.train() else: # v_hack path resolution, most-specific first. The pairset (personas) is @@ -1325,6 +1351,8 @@ def main(cfg: Config) -> int: # saved cache and overwrite the in-memory v_hack dict. refr = "-" # set to "mod/axes" below if a refresh fires; rendered in the per-step row do_refresh = cfg.vhack_refresh_every > 0 and (step + 1) % cfg.vhack_refresh_every == 0 + if do_refresh and is_route2 and cfg.route2_random_v_seed is not None: + do_refresh = False # keep the one fixed Haar draw; re-extracting would replace it if do_refresh and is_route2: # route2 v_grad refresh: re-extract against the CURRENT model so the # routing direction tracks where hacks separate now, not at step 0. @@ -1695,8 +1723,13 @@ def main(cfg: Config) -> int: for info in wrappers.values()) ** 0.5) logger.info(f"||delta_S_hack|| = {dsh_norm:.4f} " f"(SHOULD: >0 for route/route2, ==0 for none/erase; ELSE routing broke)") - if cfg.intervention in ("route", "route2"): + if cfg.intervention in ("route", "route2") and cfg.route2_random_v_seed is None: assert dsh_norm > 0.0, f"{cfg.intervention}: delta_S_hack never moved -> nothing routed into quarantine" + elif cfg.route2_random_v_seed is not None and dsh_norm == 0.0: + # Haar directionality control: "nothing routed" is a VALID outcome (a zero-alignment + # direction may never clear tau) and is itself H4-confirming evidence -- do not abort. + logger.warning("route2 Haar control: ||delta_S_hack||==0 -> the random direction routed " + "NOTHING. This is a real result (favours H4: alignment needed), not a failure.") # Last training generation -- a fast eyeball for coherence before the eval # numbers. SHOULD: real code/prose for the problem. If it is token salad the