diff --git a/scripts/attic/make_pairsets.py b/scripts/attic/make_pairsets.py index bfae4ef..e0ad900 100644 --- a/scripts/attic/make_pairsets.py +++ b/scripts/attic/make_pairsets.py @@ -188,12 +188,15 @@ NULL_BERLIN = [ ] # A FLEET of semantic placebos. Each is a hack-irrelevant contrastive axis; the -# extracted direction is whatever "X-minus-Y" lands on in gradient space. null_city -# turned out CONTAMINATED (20% of modules align with the hack dir, median |cos|=0.06 -# but a 0.99 tail -- shared generic features: verbosity/format/confidence). A spread -# of unrelated axes maps route2's suppression-vs-alignment as a scatter, not one point: -# H4 (alignment needed) predicts suppression tracks each axis's accidental |cos| with -# the hack dir; H2 (alignment-agnostic absorption) predicts all suppress regardless. +# extracted direction is whatever "X-minus-Y" lands on in gradient space. These are +# IN-SUBSPACE controls: like the hack dir they live in the model's active semantic +# subspace and share generic structure (verbosity/format/confidence), so a nonzero cos +# with the hack dir is the expected floor for any real semantic axis (null_city: median +# |cos|=0.06, a few modules higher -- plausibly low-rank modules where any two semantic +# dirs are near-parallel, not hack-specific alignment). A spread of unrelated axes maps +# route2's suppression-vs-alignment as a scatter, not one point: H4 (alignment needed) +# predicts suppression tracks each axis's |cos| with the hack dir; H2 (alignment-agnostic +# absorption) predicts all suppress regardless. Cosine is correlational; ablation is causal. NULL_VAMPIRE = [ "personally I think vampires are way cooler than werewolves", "vampires win it for me, the whole immortal-aesthetic thing", diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 64f70ee..8609403 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -202,13 +202,18 @@ class Config: # mechanism experiment so refresh keeps tracking half_A's hack subspace. vhack_pairs_path: Path | None = None # Directionality control: replace route2's pair-derived v_grad with a per-module - # Haar-random unit vector (provably ~0 cos with the hack direction in every module). - # Tests whether route2's suppression NEEDS the direction (H4: alignment) or is - # alignment-agnostic quarantine-absorption (H2). Seeded so multiple draws give a - # distribution ("works half the time?"). The null_city placebo is a CONTAMINATED - # control -- 20% of its modules align with the hack dir (median |cos|=0.06 but a - # 0.99 tail); Haar is the clean zero-alignment control. Refresh no-ops when set, so - # the direction stays the one fixed random draw regardless of --vhack-refresh-every. + # Haar-random unit vector. Tests whether route2's suppression NEEDS the direction + # (H4: alignment) or is alignment-agnostic quarantine-absorption (H2). Seeded so + # multiple draws give a distribution ("works half the time?"). NOTE on what Haar + # actually controls for: its ~0 cos with the hack dir is concentration of measure + # (E[cos]=0, std~1/sqrt(d)), i.e. it sits OUTSIDE the model's active subspace, not a + # "cleaner placebo". The semantic placebos (null_city etc.) live INSIDE that subspace + # and share generic structure (verbosity/format/confidence), so a nonzero cos with + # hack is the expected floor for any real semantic axis, not evidence they "found" the + # hack. So Haar tests "must v_grad be in-subspace at all?"; the semantic fleet tests + # "must it point at the hack specifically?". Cosine here is correlational; the ablation + # is the causal test. Refresh no-ops when set, so the direction stays the one fixed + # random draw regardless of --vhack-refresh-every. route2_random_v_seed: int | None = None # Per-source cin diagnostic: split each prompt's backward into student-only # + teacher-only passes (~2x backward time). 1 = every step (default; full @@ -317,10 +322,10 @@ class FullConfig(Config): def _haar_unit_dirs(v_grad: dict, seed: int, device) -> dict: - """Per-module Haar-random unit vectors matching v_grad's shapes -- the clean - zero-alignment directionality control for route2 (~0 cos with the hack dir in - every module). Seeded + sorted-name iteration so it is reproducible and a refresh - regenerates the identical direction (no-op). See Config.route2_random_v_seed.""" + """Per-module Haar-random unit vectors matching v_grad's shapes -- the OUT-OF-SUBSPACE + directionality control for route2 (~0 cos with the hack dir by concentration of measure, + not by being a 'cleaner' placebo). Seeded + sorted-name iteration so it is reproducible + and a refresh regenerates the identical direction (no-op). See Config.route2_random_v_seed.""" g = torch.Generator().manual_seed(seed) out = {} for name in sorted(v_grad):