mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:45:42 +08:00
feat: lora2r adapter (rank-2r PiSSA-init LoRA) + SGTM three-way hard routing
Structural-separation arm to disentangle directionality from shrinkage. A rank-2r PiSSA-init LoRA with A and B both trainable, partitioned into a deployed block [:r] and a quarantine block [r:] (spectrum-matched via alternated SVD axes). Unlike the same-basis PiSSA routeV (where deploy-ablation only removes a magnitude slice of one shared update = shrinkage null), each block has its own input-side A rows and output-side B columns, so deploy-ablation removes a different FUNCTION. Routing = SGTM-style three-way hard per-rollout masks from the cosine of the deployed block's gate-pass gradient to the pair-extracted v_grad: clean (m=0,d=0) trains deployed only; hack (m=1,d=1) detaches deployed output so only the quarantine updates (SGTM grad-retain trick); mid (m=1,d=0) trains both (absorption). Gate is no-cheat: cos to the hand-authored-pair direction, never an oracle label of a live rollout. verify_lora2r_routing.py gates identity-at-init, the three-way block-grad routing, per-rollout c-probe recovery, and ablation teeth; wired into smoke-lora2r. Additive: PiSSA / lora_frozen_b paths untouched. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -78,6 +78,17 @@ smoke-unhackable *ARGS:
|
||||
--teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
|
||||
--eval-n-prompts=2 {{ ARGS }}
|
||||
|
||||
# lora2r path: rank-2r PiSSA-init LoRA (A+B trainable) + SGTM-style three-way HARD
|
||||
# masks (clean->deployed-only, hack->quarantine-only via output detach, mid->both).
|
||||
# verify script gates the block-mask/ablation/c-probe invariants first; the train run
|
||||
# exercises gate pass -> masked pass -> deploy ablation on the tiny model.
|
||||
smoke-lora2r *ARGS:
|
||||
uv run python scripts/verify_lora2r_routing.py
|
||||
BEARTYPE=1 {{ TRAIN }} smoke --adapter=lora2r --lora-r=4 --weight-decay=0 \
|
||||
--intervention=routeV \
|
||||
--teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \
|
||||
--eval-ablate-every=10 --eval-n-prompts=2 {{ ARGS }}
|
||||
|
||||
# Run smoke twice: first warms the v_hack cache (cache-miss path), second hits
|
||||
# the cache (cache-hit path). Catches scope/save bugs that only manifest in one.
|
||||
smoke-both:
|
||||
@@ -234,6 +245,19 @@ queue-unhackable seed='43' steps='200':
|
||||
pueue add -w "$PWD" -o 8 -l "why: REALISM vanilla unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve climbs vs frac=0 vanilla (persistent solve pressure exists)" -- {{ TRAIN }} fast --steps={{steps}} --intervention=none --seed={{seed}} --out-tag=_unh1_vanilla_s{{seed}}
|
||||
pueue add -w "$PWD" -o 7 -l "why: REALISM routeV per-token unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve_uplift over vanilla LARGER than at frac=0 (routeV reveals the warm solve-skill once hack is ablated)" -- {{ TRAIN }} fast --steps={{steps}} --intervention=routeV_per_token --seed={{seed}} --out-tag=_unh1_routeV_pertoken_s{{seed}}
|
||||
|
||||
# H: lora2r directionality. The PiSSA placebo tie (job 86) was SHRINKAGE: deployed and
|
||||
# quarantine share the frozen U/Vh basis -> identical per-step grads -> routing = magnitude
|
||||
# split, any direction "works". lora2r gives each block its OWN input-side params
|
||||
# (PiSSA-init A rows + B cols, all trainable) + SGTM three-way hard masks, so a
|
||||
# discriminating gate can produce real separation. Arms: real-v, placebo-v (Haar),
|
||||
# vanilla control (gate pinned clean = capacity/structure-matched, no shrinkage confound).
|
||||
# resolve: directionality real iff real-v deploy_hack << placebo-v at matched solve;
|
||||
# both ~vanilla -> the gate (not the adapter) is the bottleneck.
|
||||
queue-lora2r seed='43':
|
||||
pueue add -w "$PWD" -o 26 -l "why: lora2r routeV real-v s{{seed}} (SGTM 3-way masks, structural separation); resolve: deploy_hack << placebo-v at matched solve -> directionality real" -- {{ TRAIN }} fast-lora2r --intervention=routeV --seed={{seed}} --out-tag=_l2r_routeV_s{{seed}}
|
||||
pueue add -w "$PWD" -o 25 -l "why: lora2r routeV PLACEBO-v (Haar) s{{seed}}; resolve: deploy_hack ~ vanilla-lora2r -> real-v suppression is directional, not absorption/shrinkage" -- {{ TRAIN }} fast-lora2r --intervention=routeV --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_l2r_routeV_placebo_s{{seed}}
|
||||
pueue add -w "$PWD" -o 24 -l "why: lora2r VANILLA control s{{seed}} (gate pinned clean, capacity-matched); resolve: deploy_hack >> 0 emergence reference on the identical adapter" -- {{ TRAIN }} fast-lora2r --intervention=none --seed={{seed}} --out-tag=_l2r_vanilla_s{{seed}}
|
||||
|
||||
queue-broad:
|
||||
#!/usr/bin/env bash
|
||||
set -eu
|
||||
|
||||
Reference in New Issue
Block a user