evil_MoE/scripts/diag_deploy_ablations.py

"""Localize where the learned hack lives, and measure genuine (unhackable) solve.

The s43 routeA run had a paradox: ablating the quarantine made deploy hack WORSE
(0.71 as-trained -> 0.86 ablated) and solve BETTER. Three forward-eval contexts on
the SAME final checkpoint separate the explanations:

  both        no ablation               -- the trained model (deployed + quarantine)
  deploy      ablate quarantine [r:]     -- the shipped model (this is `hack_deployed`)
  quar_only   ablate deployed   [:r]     -- keep ONLY the quarantine block

If the gate routed the hack into the quarantine (the intent), quar_only hacks MORE
than deploy. If the hack sits in the deployed block (gate-collapse hypothesis),
quar_only hacks LESS. Where `both` falls relative to those two tells us whether the
quarantine is a hack-sink or a generic output brake.

Plus an UNHACKABLE eval: the held-out test set re-loaded in gt_only mode (no loophole
channel), deploy-ablated -- genuine solve when hacking is impossible.

Run: uv run python scripts/diag_deploy_ablations.py [--run-dir ...] [--ckpt train]
"""
from __future__ import annotations

import json
import struct
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path

import torch
import tyro
from loguru import logger
from safetensors.torch import load_file
from tabulate import tabulate
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from vgrout.eval import ablate_quarantine, eval_hack_solve, load_eval_splits
from vgrout.lora2r import wrap_model_with_lora2r


@dataclass
class Cfg:
    """Command-line options for the deploy-ablation diagnostic (parsed by tyro)."""

    # Directory of the training run whose checkpoint is evaluated.
    run_dir: Path = Path("out/runs/20260611T125043_fast_routeA_lora2r_seed43_l2r_routeA_real_s43")
    # Checkpoint stem: "<ckpt>.safetensors" is read from run_dir ("train" = final adapter).
    ckpt: str = "train"
    # Held-out test count used by the run; passed to load_eval_splits.
    n_test: int = 87
    # Cap on newly generated tokens (used as GenerationConfig.max_new_tokens).
    max_new: int = 512
    # Batch size handed to eval_hack_solve.
    eval_batch_size: int = 8


def _ckpt_meta(path: Path) -> dict:
    """Return the ``__metadata__`` mapping stored in a checkpoint's JSON header.

    The file is expected to begin with an 8-byte little-endian unsigned length,
    followed by that many bytes of JSON. An empty dict is returned when the
    header carries no ``__metadata__`` entry.
    """
    with open(path, "rb") as fh:
        # First 8 bytes: size of the JSON header that immediately follows.
        (header_len,) = struct.unpack("<Q", fh.read(8))
        header = json.loads(fh.read(header_len))
    return header.get("__metadata__", {})


@contextmanager
def ablate_deployed(wrappers: dict):
    """Temporarily reset the DEPLOYED block [:r] of every wrapper to its init values.

    Counterpart of eval.ablate_quarantine: the first ``r`` rows of ``A`` and the
    first ``r`` columns of ``B`` are overwritten with the matching slices of
    ``A0`` / ``B0``, leaving the quarantine slice [r:] live, so the body of the
    ``with`` evaluates the quarantine block alone.

    Args:
        wrappers: mapping name -> dict with keys ``"r"``, ``"A"``, ``"B"``,
            ``"A0"``, ``"B0"``. ``A`` and ``B`` are modified in place via ``.data``.

    The trained values are restored on exit, including when the body raises and
    when the ablation itself fails part-way through the wrappers.
    """
    # (info, r, trained A[:r], trained B[:, :r]) for every wrapper touched so far.
    saved: list[tuple[dict, int, torch.Tensor, torch.Tensor]] = []
    try:
        for info in wrappers.values():
            r = info["r"]
            # Record the snapshot BEFORE mutating, so a failure on this wrapper
            # (e.g. a missing A0/B0) is still rolled back by the finally below.
            saved.append((info, r, info["A"].data[:r].clone(), info["B"].data[:, :r].clone()))
            info["A"].data[:r] = info["A0"][:r]
            info["B"].data[:, :r] = info["B0"][:, :r]
        yield
    finally:
        # Restore exactly what was snapshotted, nothing more.
        for info, r, a_trained, b_trained in saved:
            info["A"].data[:r] = a_trained
            info["B"].data[:, :r] = b_trained


def _eval(model, tok, problems, gen_cfg, device, cfg) -> dict:
    """Evaluate every problem in ``problems`` under a fixed sampling seed.

    The torch RNG is re-seeded on each call, so successive evaluations (e.g.
    under different ablation contexts) start from the same generator state.
    Returns whatever ``eval_hack_solve`` returns.
    """
    # EVAL_GEN_SEED -- same sampling seed as the run's final eval
    torch.manual_seed(12345)
    every_index = [*range(len(problems))]
    return eval_hack_solve(
        model, tok, problems, every_index,
        gen_cfg, device, cfg.max_new, cfg.eval_batch_size,
    )


def _row(set_name: str, block_name: str, ev: dict) -> dict:
    """Flatten one eval result into a table/JSON row, rounding the rates to 3 decimals."""
    return {"set": set_name, "block": block_name, "hack": round(ev["hack"], 3),
            "vhack": round(ev["vhack"], 3), "solve": round(ev["solve"], 3), "n": ev["n"]}


def main(cfg: Cfg):
    """Load the run's checkpoint, evaluate it under each ablation context, and report.

    Steps:
      1. Read model name / LoRA rank / init seed from the checkpoint metadata
         (falling back to defaults when absent) and load the adapter tensors.
      2. Evaluate the run_tests test set with no ablation, with the quarantine
         ablated, and with the deployed block ablated.
      3. Evaluate the gt_only test set with the quarantine ablated.
      4. Write the rows to out/diag/deploy_ablations.json and print a table.
    """
    # Local import: keeps this function self-contained w.r.t. the file's import block.
    from contextlib import nullcontext

    device = torch.device("cuda")
    ckpt_path = cfg.run_dir / f"{cfg.ckpt}.safetensors"
    meta = _ckpt_meta(ckpt_path)
    # The training config is stored as a JSON string under the "cfg" metadata key.
    run_cfg = json.loads(meta.get("cfg", "{}"))
    model_name = run_cfg.get("model", "Qwen/Qwen3-4B")
    r, init_seed = run_cfg.get("lora_r", 32), run_cfg.get("lora_init_seed", 0)
    logger.info(f"ckpt {ckpt_path.name} step={meta.get('step')} model={model_name} r={r}")

    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2").to(device)
    model.config.use_cache = False
    # Re-create the wrappers with the run's rank and init seed, then overwrite
    # A/B with the checkpoint tensors (copied in as float32).
    wrappers = wrap_model_with_lora2r(model, r=r, init_seed=init_seed, grad_probe=False)
    sd = load_file(str(ckpt_path))
    for nm in sorted(wrappers):
        wrappers[nm]["A"].data.copy_(sd[f"A/{nm}"].to(device, torch.float32))
        wrappers[nm]["B"].data.copy_(sd[f"B/{nm}"].to(device, torch.float32))
    model.eval()
    logger.info(f"loaded A/B into {len(wrappers)} modules")

    gen_cfg = GenerationConfig(
        max_new_tokens=cfg.max_new, do_sample=True, temperature=0.7, top_p=1.0,
        top_k=20, min_p=0.0, repetition_penalty=1.0, num_return_sequences=1,
        pad_token_id=tok.pad_token_id)

    # run_tests test set (the hackable deploy set), and the gt_only test set.
    # NOTE(review): these are intended to be the SAME ids re-graded gt_only; that
    # relies on load_eval_splits splitting identically for both modes -- confirm.
    _, hack_test = load_eval_splits(["run_tests"], cfg.n_test)
    _, gt_test = load_eval_splits(["gt_only"], cfg.n_test)
    logger.info(f"run_tests test n={len(hack_test)}; gt_only test n={len(gt_test)}")

    rows = []
    # (a) localize the hack: three ablation contexts on the run_tests test set.
    # Each entry is a FACTORY: the (single-use) context manager is only built
    # right before it is entered, so no ablation can take effect early.
    contexts = [
        ("both (trained)", nullcontext),
        ("deploy (quar ablated)", lambda: ablate_quarantine(wrappers)),
        ("quar_only (dep ablated)", lambda: ablate_deployed(wrappers)),
    ]
    for name, make_ctx in contexts:
        with make_ctx():
            ev = _eval(model, tok, hack_test, gen_cfg, device, cfg)
        rows.append(_row("run_tests", name, ev))
        logger.info(f"[run_tests/{name}] hack={ev['hack']:.3f} solve={ev['solve']:.3f}")

    # (b) unhackable: genuine solve with no loophole channel, deploy-ablated.
    with ablate_quarantine(wrappers):
        ev = _eval(model, tok, gt_test, gen_cfg, device, cfg)
    rows.append(_row("gt_only", "deploy (quar ablated)", ev))
    logger.info(f"[gt_only/deploy] hack={ev['hack']:.3f} solve={ev['solve']:.3f} (hack SHOULD be 0: no channel)")

    out = Path("out/diag/deploy_ablations.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps({"run_dir": cfg.run_dir.name, "ckpt": cfg.ckpt, "rows": rows}, indent=2))
    print()
    print(tabulate(rows, headers="keys", tablefmt="github"))
    print(f"\nwrote {out}")
    print("READ: quar_only hack >> deploy hack => hack lives in quarantine (routing worked); "
          "quar_only hack << deploy => hack in deployed block (gate-collapse). gt_only solve "
          "= genuine ability when hacking is off.")


# Script entry point: tyro builds a Cfg from the command line and hands it to main.
if __name__ == "__main__":
    main(tyro.cli(Cfg))