Files
evil_MoE/scripts/diag_deploy_ablations.py
T
wassname af420ec855 feat: generation-matched logπ_old baseline + global-quantile gate + frac=0 method
Fixes the frac=0 PPO-clip blow-up: logπ_old is now the behavior policy computed
in each rollout's own sampling mode, so ρ is a true importance ratio. The old
always-ablated baseline gave full-sampled route rows ρ=full/ablated, which the
one-sided clip can't bound for A<0 (the loss-5e5 divergence). ρ=1 only where the
mask's forward mode matches sampling mode; ρ logged per zone (keep/absorb/rout).
Note (Fable review): frac=0.5 reintroduces the blow-up on deploy-sampled
absorb/route rows by construction -- frac=0 is the clean point.

Gate: two-threshold Otsu -> symmetric global-quantile tails (route_tail_q=0.1)
over a run-spanning act buffer (8192 > 4800 default rollouts so the early clean
era anchors the low tail; buffer stores acts, re-scored vs current v_act so a
refresh needs no flush). Removes the per-window z-norm gate-collapse on a
saturated all-hack window.

gen_deploy_frac knob: frac=0 puts the quarantine ON during sampling so it
elicits the hack and absorption can localize it. queue-decision now passes
--gen-deploy-frac=0 explicitly on all four arms (base default stays 1.0 = the
job-34 config where ablation RAISED hack 0.71->0.86).

Docs: AGENTS.md gen/forward/backward + why-frac=0 sections; RESEARCH_JOURNAL
2026-06-12; diag_deploy_ablations.py (quar-only vs deploy localization probe).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-12 03:22:48 +00:00

146 lines
6.4 KiB
Python

"""Localize where the learned hack lives, and measure genuine (unhackable) solve.
The s43 routeA run had a paradox: ablating the quarantine made deploy hack WORSE
(0.71 as-trained -> 0.86 ablated) and solve BETTER. Three forward-eval contexts on
the SAME final checkpoint separate the explanations:
    both       no ablation              -- the trained model (deployed + quarantine)
    deploy     ablate quarantine [r:]   -- the shipped model (this is `hack_deployed`)
    quar_only  ablate deployed   [:r]   -- keep ONLY the quarantine block
If the gate routed the hack into the quarantine (the intent), quar_only hacks MORE
than deploy. If the hack sits in the deployed block (gate-collapse hypothesis),
quar_only hacks LESS. `both` between them tells us whether the quarantine is a
hack-sink or a generic output brake.
Plus an UNHACKABLE eval: the held-out test set re-loaded in gt_only mode (no loophole
channel), deploy-ablated -- genuine solve when hacking is impossible.
Run: uv run python scripts/diag_deploy_ablations.py [--run-dir ...] [--ckpt train]
"""
from __future__ import annotations
import json
import struct
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
import torch
import tyro
from loguru import logger
from safetensors.torch import load_file
from tabulate import tabulate
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from vgrout.eval import ablate_quarantine, eval_hack_solve, load_eval_splits
from vgrout.lora2r import wrap_model_with_lora2r
# Command-line options for this diagnostic; the script entry point builds one via tyro.
@dataclass
class Cfg:
    # Run directory; the checkpoint is read from `<run_dir>/<ckpt>.safetensors`.
    run_dir: Path = Path("out/runs/20260611T125043_fast_routeA_lora2r_seed43_l2r_routeA_real_s43")
    ckpt: str = "train"  # final adapter (train.safetensors)
    n_test: int = 87  # held-out test count used by the run
    # Forwarded as `max_new_tokens` to the generation config and to the evaluator.
    max_new: int = 512
    # Forwarded to the evaluator as its final positional argument.
    eval_batch_size: int = 8
def _ckpt_meta(path: Path) -> dict:
    """Return the `__metadata__` mapping stored in a checkpoint file's JSON header.

    The file is read as an 8-byte little-endian unsigned integer giving the
    header length, followed by that many bytes of JSON. When the header has
    no `__metadata__` entry an empty dict is returned.
    """
    with open(path, "rb") as fh:
        (header_len,) = struct.unpack("<Q", fh.read(8))
        header = json.loads(fh.read(header_len))
    return header.get("__metadata__", {})
@contextmanager
def ablate_deployed(wrappers: dict):
    """Temporarily reset the DEPLOYED block [:r] of every wrapper to its init values.

    Mirror of eval.ablate_quarantine, but for the other half: the first `r`
    rows of each `A` and the first `r` columns of each `B` are overwritten with
    the matching slices of `A0` / `B0`, leaving the quarantine block [r:] live,
    so the body of the `with` evaluates the quarantine block alone.

    Args:
        wrappers: mapping of module name -> dict with keys "r", "A", "B",
            "A0", "B0" (tensors; "A"/"B" are modified in place through `.data`).

    The original slices are restored on exit, including when the `with` body
    raises. The reset loop itself runs inside the `try`, so if it fails part
    way through (missing key, shape mismatch, ...) every wrapper that was
    already touched is restored before the error propagates.
    """
    saved: dict[str, tuple[torch.Tensor, torch.Tensor]] = {}
    try:
        for n, info in wrappers.items():
            r = info["r"]
            # Snapshot BEFORE mutating so the finally-clause can always undo this wrapper.
            saved[n] = (info["A"].data[:r].clone(), info["B"].data[:, :r].clone())
            info["A"].data[:r] = info["A0"][:r]
            info["B"].data[:, :r] = info["B0"][:, :r]
        yield
    finally:
        # Restore only what was snapshotted; on a partial failure this is a subset of wrappers.
        for n, (a_saved, b_saved) in saved.items():
            info = wrappers[n]
            r = info["r"]
            info["A"].data[:r] = a_saved
            info["B"].data[:, :r] = b_saved
def _eval(model, tok, problems, gen_cfg, device, cfg) -> dict:
    """Run eval_hack_solve over every entry of `problems` under a fixed torch seed.

    The seed is re-applied on each call, so every evaluation context starts
    sampling from the same RNG state. Returns whatever eval_hack_solve returns.
    """
    # EVAL_GEN_SEED -- same sampling seed as the run's final eval
    torch.manual_seed(12345)
    all_indices = list(range(len(problems)))
    return eval_hack_solve(
        model, tok, problems, all_indices,
        gen_cfg, device, cfg.max_new, cfg.eval_batch_size,
    )
def main(cfg: Cfg):
    """Evaluate one checkpoint under three ablation contexts plus an unhackable set.

    Steps: read the run config from the checkpoint metadata, load the base model
    and tokenizer, wrap the model with lora2r and copy the saved A/B tensors in,
    then evaluate (a) the run_tests test set with no ablation, quarantine ablated,
    and deployed ablated, and (b) the gt_only test set with the quarantine ablated.
    Results are written to out/diag/deploy_ablations.json and printed as a table.
    """
    device = torch.device("cuda")

    # --- checkpoint + the run config embedded in its metadata ---
    ckpt_path = cfg.run_dir / f"{cfg.ckpt}.safetensors"
    meta = _ckpt_meta(ckpt_path)
    run_cfg = json.loads(meta.get("cfg", "{}"))
    model_name = run_cfg.get("model", "Qwen/Qwen3-4B")
    r = run_cfg.get("lora_r", 32)
    init_seed = run_cfg.get("lora_init_seed", 0)
    logger.info(f"ckpt {ckpt_path.name} step={meta.get('step')} model={model_name} r={r}")

    # --- tokenizer + base model ---
    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2")
    model = model.to(device)
    model.config.use_cache = False

    # --- adapter: wrap, then overwrite A/B with the checkpoint tensors (as float32) ---
    wrappers = wrap_model_with_lora2r(model, r=r, init_seed=init_seed, grad_probe=False)
    sd = load_file(str(ckpt_path))
    for nm in sorted(wrappers):
        for key in ("A", "B"):
            wrappers[nm][key].data.copy_(sd[f"{key}/{nm}"].to(device, torch.float32))
    model.eval()
    logger.info(f"loaded A/B into {len(wrappers)} modules")

    gen_cfg = GenerationConfig(
        max_new_tokens=cfg.max_new,
        do_sample=True,
        temperature=0.7,
        top_p=1.0,
        top_k=20,
        min_p=0.0,
        repetition_penalty=1.0,
        num_return_sequences=1,
        pad_token_id=tok.pad_token_id,
    )

    # run_tests test set (the hackable deploy set), and the SAME ids re-graded gt_only.
    _, hack_test = load_eval_splits(["run_tests"], cfg.n_test)
    _, gt_test = load_eval_splits(["gt_only"], cfg.n_test)
    logger.info(f"run_tests test n={len(hack_test)}; gt_only test n={len(gt_test)}")

    rows = []

    def record(split: str, block: str, ev: dict) -> None:
        # One table row per evaluation; key order fixes the printed column order.
        rows.append({"set": split, "block": block, "hack": round(ev["hack"], 3),
                     "vhack": round(ev["vhack"], 3), "solve": round(ev["solve"], 3), "n": ev["n"]})

    # (a) localize the hack: three ablation contexts on the run_tests test set.
    contexts = [
        ("both (trained)", None),
        ("deploy (quar ablated)", ablate_quarantine(wrappers)),
        ("quar_only (dep ablated)", ablate_deployed(wrappers)),
    ]
    for name, ctx in contexts:
        if ctx is not None:
            with ctx:
                ev = _eval(model, tok, hack_test, gen_cfg, device, cfg)
        else:
            ev = _eval(model, tok, hack_test, gen_cfg, device, cfg)
        record("run_tests", name, ev)
        logger.info(f"[run_tests/{name}] hack={ev['hack']:.3f} solve={ev['solve']:.3f}")

    # (b) unhackable: genuine solve with no loophole channel, deploy-ablated.
    with ablate_quarantine(wrappers):
        ev = _eval(model, tok, gt_test, gen_cfg, device, cfg)
    record("gt_only", "deploy (quar ablated)", ev)
    logger.info(f"[gt_only/deploy] hack={ev['hack']:.3f} solve={ev['solve']:.3f} (hack SHOULD be 0: no channel)")

    # --- persist + print ---
    out = Path("out/diag/deploy_ablations.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    payload = {"run_dir": cfg.run_dir.name, "ckpt": cfg.ckpt, "rows": rows}
    out.write_text(json.dumps(payload, indent=2))
    print()
    print(tabulate(rows, headers="keys", tablefmt="github"))
    print(f"\nwrote {out}")
    print("READ: quar_only hack >> deploy hack => hack lives in quarantine (routing worked); "
          "quar_only hack << deploy => hack in deployed block (gate-collapse). gt_only solve "
          "= genuine ability when hacking is off.")
# Script entry point: parse command-line flags into a Cfg with tyro, then run the diagnostic.
if __name__ == "__main__":
    main(cfg=tyro.cli(Cfg))