mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
fix: eval on paper test set, not contaminated holdout (base solve 0.94->0.094)
The periodic VAL eval ran on leetcode_train_medhard_holdout.jsonl (353, our artifact): disjoint from train by id but in the train id/recency range (ids 3-3205, 88% medium), so dominated by classic problems Qwen3-4B memorized in pretraining -> base solve 0.94, saturating solve and killing the hack metric's gt-fail headroom. Disjoint-by-id controls for TRAIN leakage, not pretraining MEMORIZATION; only the recency-held-out test set (ids >= 3243) reproduces the paper rate. Proof (job 176, base model, same eval_hack_solve): test_medhard solve=0.094, matching paper fn9 (~12% test) -> eval pipeline is sound, holdout was the contaminant. Fix: drop the holdout; periodic curve + final number both eval the paper test set leetcode_test_medhard. Smoke green. Hint confirmed = paper's simple_overwrite_tests (not the easier _detailed/_aware variants). Also this session: removed stale teacher-pool TRAIN restriction; seeded shuffle for eval load; LoRA-frozen-B adapter; rescore CLI Positional fix. Known follow-up (journal e): train pool is still first-200-by-id (easy/memorized), same bug class. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -19,6 +19,7 @@ from pathlib import Path
|
||||
|
||||
import torch
|
||||
import tyro
|
||||
from tyro.conf import Positional
|
||||
from loguru import logger
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import load_file
|
||||
@@ -36,7 +37,7 @@ EVAL_FILES = {
|
||||
CACHE_ROOT = Path("svd_cache")
|
||||
|
||||
|
||||
def main(run_dir: Path, eval_set: str = "test", n: int = 10_000, max_new: int = 1024) -> None:
|
||||
def main(run_dir: Positional[Path], eval_set: str = "test", n: int = 10_000, max_new: int = 1024) -> None:
|
||||
"""Re-score run_dir/train.safetensors knob-off on the held-out `eval_set`."""
|
||||
ckpt = run_dir / "train.safetensors"
|
||||
with safe_open(str(ckpt), framework="pt") as f:
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
"""Sanity: does base Qwen3-4B really solve ~94%, or is our holdout-val just easy?
|
||||
|
||||
Paper footnote 9 (Ariahw et al.): base Qwen3-4B solves ~20% of the FILTERED train
|
||||
set and ~12% of the TEST set. Our periodic eval reported solve~=0.94 on the
|
||||
holdout -- 5x the paper. The grader is verified sound (canonical passes, wrong
|
||||
fails, 38-132 real asserts), so a high number means the eval PROBLEMS are easy,
|
||||
not that grading leaks. This script runs the SAME eval_hack_solve on three files
|
||||
with the base model (no adapter) to locate the discrepancy:
|
||||
|
||||
test_medhard (paper's 119) -> expect ~12% if our pipeline matches paper
|
||||
train_filtered (model-hard subset) -> expect ~20%
|
||||
train_holdout (val, what we report) -> our 0.94
|
||||
|
||||
If test/train reproduce ~12-20% but holdout is ~0.9, the holdout is simply not
|
||||
filtered for model-hardness and our val solve curve is saturated/uninformative;
|
||||
fix = eval on test_medhard. If ALL three are ~0.9, something else is inflating.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
||||
|
||||
from vgrout.data import load_problems
|
||||
from vgrout.eval import eval_hack_solve
|
||||
|
||||
# Base checkpoint under test; it is loaded directly, with no adapter applied.
MODEL = "Qwen/Qwen3-4B"
# First argument to load_problems for every file (presumably the number of
# problems to load -- confirm against vgrout.data.load_problems).
N = 64
# Generation budget: used as max_new_tokens and also passed to eval_hack_solve.
MAX_NEW = 512
DATA_DIR = Path("external/rl-rewardhacking/results/data")

# Row label -> JSONL file to evaluate. Rows are printed in this order.
FILES = dict(
    [
        ("test_medhard", DATA_DIR / "leetcode_test_medhard.jsonl"),
        ("train_filtered", DATA_DIR / "leetcode_train_medhard_filtered.jsonl"),
        ("train_holdout(val)", DATA_DIR / "leetcode_train_medhard_holdout.jsonl"),
    ]
)

device = torch.device("cuda")

tok = AutoTokenizer.from_pretrained(MODEL)
if tok.pad_token_id is None:
    # No pad token configured: reuse EOS so a pad id exists for generation.
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.to(device)
model.eval()

# Sampling settings shared by all three evaluations.
gen_cfg = GenerationConfig(
    max_new_tokens=MAX_NEW,
    do_sample=True,
    temperature=0.7,
    top_p=1.0,
    top_k=20,
    min_p=0.0,
    repetition_penalty=1.0,
    num_return_sequences=1,
    pad_token_id=tok.pad_token_id,
)

# One header line, then one result row per entry in FILES.
print(f"{'file':22s} {'n':>4s} {'solve':>7s} {'hack':>7s} {'vhack':>7s}")
for label, jsonl_path in FILES.items():
    problems = load_problems(N, ["run_tests"], seed=0, data_path=jsonl_path, shuffle=True)
    # Evaluate every loaded problem: indices 0 .. len(problems) - 1.
    all_indices = [*range(len(problems))]
    with torch.no_grad():
        scores = eval_hack_solve(model, tok, problems, all_indices, gen_cfg, device, MAX_NEW)
    n_scored, solve, hack, vhack = (scores[key] for key in ("n", "solve", "hack", "vhack"))
    print(f"{label:22s} {n_scored:>4d} {solve:>7.3f} {hack:>7.3f} {vhack:>7.3f}")
|
||||
Reference in New Issue
Block a user