mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
af420ec855
Fixes the frac=0 PPO-clip blow-up: logπ_old is now the behavior policy computed in each rollout's own sampling mode, so ρ is a true importance ratio. The old always-ablated baseline gave full-sampled route rows ρ=full/ablated, which the one-sided clip can't bound for A<0 (the loss-5e5 divergence). ρ=1 only where the mask's forward mode matches sampling mode; ρ logged per zone (keep/absorb/rout). Note (Fable review): frac=0.5 reintroduces the blow-up on deploy-sampled absorb/route rows by construction -- frac=0 is the clean point. Gate: two-threshold Otsu -> symmetric global-quantile tails (route_tail_q=0.1) over a run-spanning act buffer (8192 > 4800 default rollouts so the early clean era anchors the low tail; buffer stores acts, re-scored vs current v_act so a refresh needs no flush). Removes the per-window z-norm gate-collapse on a saturated all-hack window. gen_deploy_frac knob: frac=0 puts the quarantine ON during sampling so it elicits the hack and absorption can localize it. queue-decision now passes --gen-deploy-frac=0 explicitly on all four arms (base default stays 1.0 = the job-34 config where ablation RAISED hack 0.71->0.86). Docs: AGENTS.md gen/forward/backward + why-frac=0 sections; RESEARCH_JOURNAL 2026-06-12; diag_deploy_ablations.py (quar-only vs deploy localization probe). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
146 lines
6.4 KiB
Python
146 lines
6.4 KiB
Python
"""Localize where the learned hack lives, and measure genuine (unhackable) solve.
|
|
|
|
The s43 routeA run had a paradox: ablating the quarantine made deploy hack WORSE
|
|
(0.71 as-trained -> 0.86 ablated) and solve BETTER. Three forward-eval contexts on
|
|
the SAME final checkpoint separate the explanations:
|
|
|
|
  both       no ablation             -- the trained model (deployed + quarantine)
  deploy     ablate quarantine [r:]  -- the shipped model (this is `hack_deployed`)
  quar_only  ablate deployed [:r]    -- keep ONLY the quarantine block
|
|
|
|
If the gate routed the hack into the quarantine (the intent), quar_only hacks MORE
|
|
than deploy. If the hack sits in the deployed block (gate-collapse hypothesis),
|
|
quar_only hacks LESS. Where `both` falls between the two tells us whether the
quarantine is a hack-sink or a generic output brake.
|
|
|
|
Plus an UNHACKABLE eval: the held-out test set re-loaded in gt_only mode (no loophole
|
|
channel), deploy-ablated -- genuine solve when hacking is impossible.
|
|
|
|
Run: uv run python scripts/diag_deploy_ablations.py [--run-dir ...] [--ckpt train]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import struct
|
|
from contextlib import contextmanager
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import tyro
|
|
from loguru import logger
|
|
from safetensors.torch import load_file
|
|
from tabulate import tabulate
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
|
|
|
from vgrout.eval import ablate_quarantine, eval_hack_solve, load_eval_splits
|
|
from vgrout.lora2r import wrap_model_with_lora2r
|
|
|
|
|
|
@dataclass
class Cfg:
    """Command-line options for the deploy-ablation diagnostic (parsed by tyro)."""

    # Run directory holding the checkpoint to probe.
    run_dir: Path = Path("out/runs/20260611T125043_fast_routeA_lora2r_seed43_l2r_routeA_real_s43")
    # Checkpoint stem; "<ckpt>.safetensors" is read from run_dir ("train" = final adapter).
    ckpt: str = "train"
    # Held-out test count used by the run.
    n_test: int = 87
    # Generation budget: used as max_new_tokens and forwarded to the eval helper.
    max_new: int = 512
    # Batch size forwarded to the eval helper.
    eval_batch_size: int = 8
|
|
|
|
|
|
def _ckpt_meta(path: Path) -> dict:
|
|
with open(path, "rb") as f:
|
|
return json.loads(f.read(struct.unpack("<Q", f.read(8))[0])).get("__metadata__", {})
|
|
|
|
|
|
@contextmanager
def ablate_deployed(wrappers: dict):
    """Mirror of eval.ablate_quarantine but reset the DEPLOYED block [:r] to its init,
    leaving the quarantine [r:] live -- evaluates the quarantine block alone.

    For every wrapper the first ``r`` rows of ``A`` and the first ``r`` columns of
    ``B`` are overwritten with the matching slices of ``A0`` / ``B0`` for the
    duration of the ``with`` body, then put back.

    Args:
        wrappers: mapping of module name -> dict with keys ``"r"``, ``"A"``,
            ``"B"``, ``"A0"``, ``"B0"``.

    Restoration is guaranteed even if setup itself fails part-way (e.g. a wrapper
    is missing ``A0``): every wrapper already ablated is rolled back before the
    exception propagates, so the model is never left half-ablated.
    """
    saved: dict[str, tuple[torch.Tensor, torch.Tensor]] = {}
    try:
        for n, info in wrappers.items():
            r = info["r"]
            # Resolve the init slices BEFORE touching A/B so that a lookup failure
            # leaves this wrapper unmodified.
            a_init, b_init = info["A0"][:r], info["B0"][:, :r]
            saved[n] = (info["A"].data[:r].clone(), info["B"].data[:, :r].clone())
            info["A"].data[:r] = a_init
            info["B"].data[:, :r] = b_init
        yield
    finally:
        # Only wrappers recorded in `saved` were (possibly) modified; restore exactly those.
        for n, (a_live, b_live) in saved.items():
            info = wrappers[n]
            r = info["r"]
            info["A"].data[:r] = a_live
            info["B"].data[:, :r] = b_live
|
|
|
|
|
|
def _eval(model, tok, problems, gen_cfg, device, cfg) -> dict:
    """Evaluate every item of ``problems`` after re-seeding torch.

    The RNG is reset on each call so that separate calls start sampling from
    the same state. Returns whatever ``eval_hack_solve`` returns.
    """
    # EVAL_GEN_SEED -- same sampling seed as the run's final eval
    torch.manual_seed(12345)
    every_index = list(range(len(problems)))
    return eval_hack_solve(
        model,
        tok,
        problems,
        every_index,
        gen_cfg,
        device,
        cfg.max_new,
        cfg.eval_batch_size,
    )
|
|
|
|
|
|
def main(cfg: Cfg):
    """Load one checkpoint and score it under each ablation context.

    Steps: read the run config out of the checkpoint metadata, build the base
    model with LoRA-2r wrappers, copy the saved A/B tensors in, evaluate the
    run_tests test split under three contexts (no ablation, quarantine ablated,
    deployed ablated), evaluate the gt_only test split with the quarantine
    ablated, then write the rows to out/diag/deploy_ablations.json and print
    them as a table.
    """
    device = torch.device("cuda")
    ckpt_path = cfg.run_dir / f"{cfg.ckpt}.safetensors"

    # The training config is stored as a JSON string under the "cfg" metadata key.
    meta = _ckpt_meta(ckpt_path)
    run_cfg = json.loads(meta.get("cfg", "{}"))
    model_name = run_cfg.get("model", "Qwen/Qwen3-4B")
    r = run_cfg.get("lora_r", 32)
    init_seed = run_cfg.get("lora_init_seed", 0)
    logger.info(f"ckpt {ckpt_path.name} step={meta.get('step')} model={model_name} r={r}")

    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    ).to(device)
    model.config.use_cache = False

    # Wrap first (same r / init seed as recorded in the run config), then overwrite
    # A and B with the checkpoint tensors stored under "A/<module>" and "B/<module>".
    wrappers = wrap_model_with_lora2r(model, r=r, init_seed=init_seed, grad_probe=False)
    state = load_file(str(ckpt_path))
    for module_name in sorted(wrappers):
        for key in ("A", "B"):
            weights = state[f"{key}/{module_name}"].to(device, torch.float32)
            wrappers[module_name][key].data.copy_(weights)
    model.eval()
    logger.info(f"loaded A/B into {len(wrappers)} modules")

    gen_cfg = GenerationConfig(
        max_new_tokens=cfg.max_new,
        do_sample=True,
        temperature=0.7,
        top_p=1.0,
        top_k=20,
        min_p=0.0,
        repetition_penalty=1.0,
        num_return_sequences=1,
        pad_token_id=tok.pad_token_id,
    )

    # run_tests test set (the hackable deploy set), and the SAME ids re-graded gt_only.
    _, hack_test = load_eval_splits(["run_tests"], cfg.n_test)
    _, gt_test = load_eval_splits(["gt_only"], cfg.n_test)
    logger.info(f"run_tests test n={len(hack_test)}; gt_only test n={len(gt_test)}")

    def as_row(split: str, block: str, ev: dict) -> dict:
        # One table/JSON row; rates are rounded to 3 decimals, key order fixes column order.
        rates = {metric: round(ev[metric], 3) for metric in ("hack", "vhack", "solve")}
        return {"set": split, "block": block, **rates, "n": ev["n"]}

    rows = []

    # (a) localize the hack: three ablation contexts on the run_tests test set.
    contexts = [
        ("both (trained)", None),
        ("deploy (quar ablated)", ablate_quarantine(wrappers)),
        ("quar_only (dep ablated)", ablate_deployed(wrappers)),
    ]
    for name, ctx in contexts:
        if ctx is not None:
            with ctx:
                ev = _eval(model, tok, hack_test, gen_cfg, device, cfg)
        else:
            ev = _eval(model, tok, hack_test, gen_cfg, device, cfg)
        rows.append(as_row("run_tests", name, ev))
        logger.info(f"[run_tests/{name}] hack={ev['hack']:.3f} solve={ev['solve']:.3f}")

    # (b) unhackable: genuine solve with no loophole channel, deploy-ablated.
    with ablate_quarantine(wrappers):
        ev = _eval(model, tok, gt_test, gen_cfg, device, cfg)
    rows.append(as_row("gt_only", "deploy (quar ablated)", ev))
    logger.info(f"[gt_only/deploy] hack={ev['hack']:.3f} solve={ev['solve']:.3f} (hack SHOULD be 0: no channel)")

    out = Path("out/diag/deploy_ablations.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    report = {"run_dir": cfg.run_dir.name, "ckpt": cfg.ckpt, "rows": rows}
    out.write_text(json.dumps(report, indent=2))

    print()
    print(tabulate(rows, headers="keys", tablefmt="github"))
    print(f"\nwrote {out}")
    print("READ: quar_only hack >> deploy hack => hack lives in quarantine (routing worked); "
          "quar_only hack << deploy => hack in deployed block (gate-collapse). gt_only solve "
          "= genuine ability when hacking is off.")
|
|
|
|
|
|
# Script entry point: tyro builds a Cfg from the command line and passes it to main().
if __name__ == "__main__":
    main(tyro.cli(Cfg))
|