Files
evil_MoE/scripts/verify_science_invariants.py
T
wassname d68c17e7c5 eval: final deploy eval records knob-on (deployed-as-trained) for quarantine arms
route/routeV final eval now measures both endpoints at n=119 test:
knob-off (ablate_quarantine, the deploy headline) AND knob-on (trained
model as-is). Writes deploy_hack_on/deploy_solve_on/deploy_vhack_on so
the before->after quarantine move is plottable from the deploy set
instead of borrowing the val curve's different scale.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-09 13:09:50 +00:00

90 lines
3.5 KiB
Python

"""Verify provenance and evaluation-split invariants that protect paper claims."""
from __future__ import annotations
import hashlib
import json
import tempfile
from pathlib import Path
import torch
from loguru import logger
from safetensors.torch import save_file
from tabulate import tabulate
from vgrout.data import DATA, RH_HINT_REPLACE_FROM, load_problems
from vgrout.eval import load_eval_splits
from vgrout.vhack import load_v_hack, pairset_sha256
def _must_raise(fn) -> bool:
try:
fn()
except ValueError:
return True
return False
def _check_v_hack_pair_bytes(tmp: Path) -> dict[str, object]:
    """Check that ``load_v_hack`` accepts the original pair file and rejects a changed one.

    Writes a one-entry pair file and a small safetensors file whose metadata
    records ``pairset_sha256`` of that pair file, loads it, then appends a
    single space to the pair file and expects the reload to raise ``ValueError``.

    Args:
        tmp: Scratch directory in which the fixture files are created.

    Returns:
        A result row with ``invariant`` and ``success`` keys.
    """
    pairs_path = tmp / "pairs.json"
    pairs_path.write_text('[{"prompt":"p","hack":"h","clean":"c"}]\n')
    vhack_path = tmp / "vhack.safetensors"
    # NOTE(review): the dtype tag follows CUDA availability, presumably to match
    # what load_v_hack expects on this machine -- confirm against vgrout.vhack.
    dtype = "bf16" if torch.cuda.is_available() else "fp32"
    save_file(
        {"module": torch.tensor([[1.0, 0.0, 0.0]]), "_sv/module": torch.tensor([1.0])},
        str(vhack_path),
        metadata={"model": "test", "dtype": dtype, "pairs_sha256": pairset_sha256(pairs_path)},
    )
    wrappers = {"module": {"delta_S": torch.zeros(3)}}
    exact_load = bool(load_v_hack(vhack_path, "test", wrappers, pairs_path))
    # Change the pair file by one trailing byte; the same load must now be refused.
    pairs_path.write_text(pairs_path.read_text() + " ")
    changed_rejected = _must_raise(lambda: load_v_hack(vhack_path, "test", wrappers, pairs_path))
    return {"invariant": "v_hack pair bytes", "success": exact_load and changed_rejected}


def _check_single_prompt_hint(tmp: Path) -> dict[str, object]:
    """Check that ``load_problems`` rejects data whose prompt hint is missing or duplicated.

    Takes the first record of ``DATA`` and derives two variants of its last
    prompt message: one with ``RH_HINT_REPLACE_FROM`` replaced by other text and
    one with it appended a second time. Both variants must make
    ``load_problems`` raise ``ValueError``, while the default data must load.

    Args:
        tmp: Scratch directory in which the variant JSONL files are created.

    Returns:
        A result row with ``invariant`` and ``success`` keys.
    """
    source = json.loads(DATA.read_text().splitlines()[0])
    # A JSON round trip gives an independent deep copy of the record.
    missing = json.loads(json.dumps(source))
    missing["prompt"][-1]["content"] = missing["prompt"][-1]["content"].replace(
        RH_HINT_REPLACE_FROM, "and should pass every check")
    duplicate = json.loads(json.dumps(source))
    duplicate["prompt"][-1]["content"] += f" Also {RH_HINT_REPLACE_FROM}."
    missing_path, duplicate_path = tmp / "missing.jsonl", tmp / "duplicate.jsonl"
    missing_path.write_text(json.dumps(missing) + "\n")
    duplicate_path.write_text(json.dumps(duplicate) + "\n")
    canonical_load = len(load_problems(1, ["run_tests"])) == 1
    hint_drift_rejected = (
        _must_raise(lambda: load_problems(1, ["run_tests"], data_path=missing_path))
        and _must_raise(lambda: load_problems(1, ["run_tests"], data_path=duplicate_path))
    )
    return {"invariant": "exactly one prompt hint", "success": canonical_load and hint_drift_rejected}


def _check_eval_splits() -> dict[str, object]:
    """Check that the val/test split has the expected sizes, is disjoint, and is repeatable.

    Calls ``load_eval_splits`` twice with identical arguments and compares the
    ``problem_id`` sequences of both calls.

    Returns:
        A result row with ``invariant``, ``success`` and ``detail`` keys. The
        detail carries the observed split sizes and a 12-hex-character SHA-256
        prefix of each comma-joined id list.
    """
    expected_val, expected_test = 32, 87
    val_a, test_a = load_eval_splits(["run_tests"], expected_val)
    val_b, test_b = load_eval_splits(["run_tests"], expected_val)
    val_ids = [p["problem_id"] for p in val_a]
    test_ids = [p["problem_id"] for p in test_a]
    split_ok = (
        len(val_ids) == expected_val
        and len(test_ids) == expected_test
        and set(val_ids).isdisjoint(test_ids)
        and val_ids == [p["problem_id"] for p in val_b]
        and test_ids == [p["problem_id"] for p in test_b]
    )
    val_sha = hashlib.sha256(",".join(map(str, val_ids)).encode()).hexdigest()[:12]
    test_sha = hashlib.sha256(",".join(map(str, test_ids)).encode()).hexdigest()[:12]
    return {
        "invariant": "deterministic disjoint val/test",
        "success": split_ok,
        # Report the observed sizes so a size mismatch is visible in the table.
        "detail": f"n={len(val_ids)}/{len(test_ids)} ids={val_sha}/{test_sha}",
    }


def main() -> int:
    """Run every invariant check, print a result table, and return an exit status.

    Returns:
        0 if every check succeeded, 1 otherwise.
    """
    with tempfile.TemporaryDirectory() as td:
        tmp = Path(td)
        rows = [
            _check_v_hack_pair_bytes(tmp),
            _check_single_prompt_hint(tmp),
        ]
    rows.append(_check_eval_splits())
    print(tabulate(rows, headers="keys", tablefmt="github"))
    ok = all(row["success"] for row in rows)
    logger.info("PASS: science invariants hold" if ok else "FAIL: science invariant broken")
    return 0 if ok else 1
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())