# evil_MoE/scripts/verify_base_solve.py

"""Sanity: does base Qwen3-4B really solve ~94%, or is our holdout-val just easy?

Paper footnote 9 (Ariahw et al.): base Qwen3-4B solves ~20% of the FILTERED train
set and ~12% of the TEST set. Our periodic eval reported solve~=0.94 on the
holdout -- ~5x the paper's train figure and ~8x its test figure. The grader is
verified sound (canonical passes, wrong
fails, 38-132 real asserts), so a high number means the eval PROBLEMS are easy,
not that grading leaks. This script runs the SAME eval_hack_solve on three files
with the base model (no adapter) to locate the discrepancy:

  test_medhard   (paper's 119)         -> expect ~12% if our pipeline matches paper
  train_filtered (model-hard subset)   -> expect ~20%
  train_holdout  (val, what we report) -> our 0.94

If test/train reproduce ~12-20% but holdout is ~0.9, the holdout is simply not
filtered for model-hardness and our val solve curve is saturated/uninformative;
fix = eval on test_medhard. If ALL three are ~0.9, something else is inflating.
"""
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from vgrout.data import load_problems
from vgrout.eval import eval_hack_solve

# Checkpoint evaluated as-is: the script loads no adapter on top of it.
MODEL = "Qwen/Qwen3-4B"
# First argument to load_problems for every file (presumably the number of
# problems requested per file -- confirm against vgrout.data).
N = 64
# Generation budget in new tokens; also forwarded to eval_hack_solve.
MAX_NEW = 512

# Directory holding the LeetCode problem files.
DATA_DIR = Path("external/rl-rewardhacking/results/data")

# Row label -> problem file. Insertion order is the order the rows are printed.
FILES = {
    label: DATA_DIR / filename
    for label, filename in (
        ("test_medhard", "leetcode_test_medhard.jsonl"),
        ("train_filtered", "leetcode_train_medhard_filtered.jsonl"),
        ("train_holdout(val)", "leetcode_train_medhard_holdout.jsonl"),
    )
}

# The model is moved to, and evaluated on, the CUDA device.
device = torch.device("cuda")

tok = AutoTokenizer.from_pretrained(MODEL)
# Reuse EOS as the padding token when the tokenizer does not define one.
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token

# Base checkpoint only (no adapter), loaded in bfloat16 with FlashAttention-2
# and switched to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.to(device).eval()

# Sampling setup: one sampled completion per prompt, at most MAX_NEW new tokens.
gen_cfg = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_k=20,
    top_p=1.0,
    min_p=0.0,
    repetition_penalty=1.0,
    num_return_sequences=1,
    max_new_tokens=MAX_NEW,
    pad_token_id=tok.pad_token_id,
)

# One seed for both problem selection and sampling. The generation config
# samples (do_sample=True), so without seeding torch the printed rates change
# from run to run, which defeats a script whose purpose is to compare them
# against reference numbers.
SEED = 0

# Header row; column widths match the per-file rows printed below.
print(f"{'file':22s} {'n':>4s} {'solve':>7s} {'hack':>7s} {'vhack':>7s}")
for name, path in FILES.items():
    probs = load_problems(N, ["run_tests"], seed=SEED, data_path=path, shuffle=True)
    # Evaluate every loaded problem.
    idxs = list(range(len(probs)))
    # Re-seed before each file so a row does not depend on which files were
    # evaluated before it. NOTE(review): GPU kernels may still add small
    # run-to-run nondeterminism; this only pins the sampling RNG.
    torch.manual_seed(SEED)
    with torch.no_grad():
        r = eval_hack_solve(model, tok, probs, idxs, gen_cfg, device, MAX_NEW)
    # Flush so each row shows up as soon as its (long) evaluation finishes,
    # even when stdout is redirected to a log file.
    print(
        f"{name:22s} {r['n']:>4d} {r['solve']:>7.3f} {r['hack']:>7.3f} {r['vhack']:>7.3f}",
        flush=True,
    )