mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:52:18 +08:00
ea01267cd8
The periodic VAL eval ran on leetcode_train_medhard_holdout.jsonl (353, our artifact): disjoint from train by id but in the train id/recency range (ids 3-3205, 88% medium), so dominated by classic problems Qwen3-4B memorized in pretraining -> base solve 0.94, saturating solve and killing the hack metric's gt-fail headroom. Disjoint-by-id controls for TRAIN leakage, not pretraining MEMORIZATION; only the recency-held-out test set (ids >= 3243) reproduces the paper rate. Proof (job 176, base model, same eval_hack_solve): test_medhard solve=0.094, matching paper fn9 (~12% test) -> eval pipeline is sound, holdout was the contaminant. Fix: drop the holdout; periodic curve + final number both eval the paper test set leetcode_test_medhard. Smoke green. Hint confirmed = paper's simple_overwrite_tests (not the easier _detailed/_aware variants). Also this session: removed stale teacher-pool TRAIN restriction; seeded shuffle for eval load; LoRA-frozen-B adapter; rescore CLI Positional fix. Known follow-up (journal e): train pool is still first-200-by-id (easy/memorized), same bug class. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
94 lines
4.5 KiB
Python
94 lines
4.5 KiB
Python
"""Re-score a finished run's DEPLOYED adapter on the full held-out test set.
|
|
|
|
Why: the in-run FINAL EVAL is only n=24 -- the fast preset trains and evals on the
|
|
same 24 problems (6/6/6/6 partition), SE ~0.1 and not even held-out. Every run saves
|
|
`train.safetensors` (delta_S = the deployed adapter; the quarantine is ablated at
|
|
deploy), so we re-score knob-OFF on the held-out test set (n=119, SE ~0.04) with the
|
|
v2 token-gap, without retraining. Reuses the canonical eval_hack_solve, so this is the
|
|
same grader as training applied off-policy to a saved adapter -- not a parallel metric.
|
|
|
|
uv run python scripts/rescore_deploy.py out/runs/<run_dir>
|
|
uv run python scripts/rescore_deploy.py out/runs/<run_dir> --eval-set holdout # n=353
|
|
|
|
Writes deploy_<eval_set>.json (deploy_test.json by default) next to the checkpoint and logs deploy hack/solve + per-mode.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import tyro
|
|
from tyro.conf import Positional
|
|
from loguru import logger
|
|
from safetensors import safe_open
|
|
from safetensors.torch import load_file
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
|
|
|
from vgrout.antipasto import wrap_model_with_antipasto
|
|
from vgrout.data import load_problems
|
|
from vgrout.eval import ablate_quarantine, eval_hack_solve
|
|
|
|
# Environment modes that main() round-robins over the evaluation problems.
MODES = [
    "run_tests",
    "stdout_marker",
    "sentinel",
    "file_marker",
]

# Evaluation files selectable through main()'s `eval_set` argument.
EVAL_FILES = dict(
    test=Path("external/rl-rewardhacking/results/data/leetcode_test_medhard.jsonl"),  # 119 problems
    holdout=Path("external/rl-rewardhacking/results/data/leetcode_train_medhard_holdout.jsonl"),  # 353 problems
)

# Cache directory handed to wrap_model_with_antipasto
# (the name suggests cached SVD factors -- TODO confirm).
CACHE_ROOT = Path("svd_cache")
|
|
|
|
|
|
def main(run_dir: Positional[Path], eval_set: str = "test", n: int = 10_000, max_new: int = 1024) -> None:
    """Re-score run_dir/train.safetensors knob-off on the held-out `eval_set`.

    Args:
        run_dir: Run directory containing `train.safetensors`. The result is
            written into the same directory as `deploy_<eval_set>.json`.
        eval_set: Key into EVAL_FILES selecting which evaluation file to use.
        n: Problem count passed to load_problems. The default exceeds the size
            of both eval files (presumably meaning "use all" -- confirm against
            load_problems).
        max_new: Generation budget; used as `max_new_tokens` and also passed
            to eval_hack_solve.

    Raises:
        ValueError: If `eval_set` is not a key of EVAL_FILES, if the checkpoint
            carries no safetensors metadata, or if the checkpoint's module set
            differs from the adapter's module set.
    """
    # Fail fast: validate the cheap CLI argument before the expensive model load.
    if eval_set not in EVAL_FILES:
        raise ValueError(f"unknown eval_set {eval_set!r}; expected one of {sorted(EVAL_FILES)}")
    eval_file = EVAL_FILES[eval_set]

    ckpt = run_dir / "train.safetensors"
    with safe_open(str(ckpt), framework="pt") as f:
        meta = f.metadata()
    # metadata() is None for a file saved without a metadata header.
    if meta is None:
        raise ValueError(f"{ckpt} has no safetensors metadata (need 'cfg' and 'model')")
    cfg = json.loads(meta["cfg"])
    model_name = meta["model"]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"re-score {run_dir.name}: model={model_name} eval_set={eval_set} step={meta.get('step')}")

    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2",
    ).to(device)
    model.config.use_cache = False
    wrappers = wrap_model_with_antipasto(model, model_name, CACHE_ROOT, device, grad_probe=False)

    # Load the trained deployed adapter (delta_S). delta_S_hack stays 0; ablate_quarantine
    # zeros it anyway, so deploy needs only train.safetensors.
    delta = load_file(str(ckpt))
    # Explicit raise rather than assert: the check must survive `python -O`,
    # otherwise a partially loaded adapter would be scored silently.
    if set(delta) != set(wrappers):
        missing = sorted(set(wrappers) - set(delta), key=str)
        unexpected = sorted(set(delta) - set(wrappers), key=str)
        raise ValueError(
            "checkpoint module set != adapter module set "
            f"(missing from checkpoint: {missing[:5]}, unexpected in checkpoint: {unexpected[:5]})"
        )
    for name, t in delta.items():
        wrappers[name]["delta_S"].data.copy_(t.to(device, torch.bfloat16))

    # Held-out problems: round-robin the 4 modes over the eval file (partition=None path),
    # so each held-out problem carries a mode + faithful hint and is gradeable.
    problems = load_problems(n, env_modes=MODES, seed=cfg["seed"], data_path=eval_file)
    gen_cfg_eval = GenerationConfig(
        max_new_tokens=max_new, do_sample=True,
        temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0,
        num_return_sequences=1, pad_token_id=tok.pad_token_id,
    )
    eval_idxs = list(range(len(problems)))
    with ablate_quarantine(wrappers):  # knob OFF = the deployed model
        ev = eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg_eval, device, max_new)

    out = {
        "run_dir": run_dir.name, "model": model_name, "step": meta.get("step"),
        "eval_set": eval_set, "eval_file": str(eval_file),
        "n": ev["n"], "deploy_hack": ev["hack"], "deploy_vhack": ev["vhack"], "deploy_solve": ev["solve"],
        # Each by_mode value unpacks as (hack, vhack, solve, count); max(1, c)
        # avoids dividing by zero for a mode with no problems.
        "by_mode": {m: {"hack": h / max(1, c), "vhack": v / max(1, c), "solve": s / max(1, c), "n": c}
                    for m, (h, v, s, c) in ev["by_mode"].items()},
    }
    (run_dir / f"deploy_{eval_set}.json").write_text(json.dumps(out, indent=2))
    logger.info(f"DEPLOY (held-out {eval_set}, n={ev['n']}): hack(strict)={ev['hack']:.3f} "
                f"hack(vendor)={ev['vhack']:.3f} solve={ev['solve']:.3f}")
    for m, d in out["by_mode"].items():
        logger.info(f"  {m:14s} hack={d['hack']:.3f} vhack={d['vhack']:.3f} solve={d['solve']:.3f} n={d['n']}")
|
|
|
|
|
|
# Script entry point: tyro builds the command-line interface from main()'s signature.
if __name__ == "__main__":
    tyro.cli(main)
|