"""Sanity: does base Qwen3-4B really solve ~94%, or is our holdout-val just easy? Paper footnote 9 (Ariahw et al.): base Qwen3-4B solves ~20% of the FILTERED train set and ~12% of the TEST set. Our periodic eval reported solve~=0.94 on the holdout -- 5x the paper. The grader is verified sound (canonical passes, wrong fails, 38-132 real asserts), so a high number means the eval PROBLEMS are easy, not that grading leaks. This script runs the SAME eval_hack_solve on three files with the base model (no adapter) to locate the discrepancy: test_medhard (paper's 119) -> expect ~12% if our pipeline matches paper train_filtered (model-hard subset) -> expect ~20% train_holdout (val, what we report) -> our 0.94 If test/train reproduce ~12-20% but holdout is ~0.9, the holdout is simply not filtered for model-hardness and our val solve curve is saturated/uninformative; fix = eval on test_medhard. If ALL three are ~0.9, something else is inflating. """ from pathlib import Path import torch from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from vgrout.data import load_problems from vgrout.eval import eval_hack_solve MODEL = "Qwen/Qwen3-4B" N = 64 MAX_NEW = 512 DATA_DIR = Path("external/rl-rewardhacking/results/data") FILES = { "test_medhard": DATA_DIR / "leetcode_test_medhard.jsonl", "train_filtered": DATA_DIR / "leetcode_train_medhard_filtered.jsonl", "train_holdout(val)": DATA_DIR / "leetcode_train_medhard_holdout.jsonl", } device = torch.device("cuda") tok = AutoTokenizer.from_pretrained(MODEL) if tok.pad_token_id is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( MODEL, dtype=torch.bfloat16, attn_implementation="flash_attention_2" ).to(device) model.eval() gen_cfg = GenerationConfig( max_new_tokens=MAX_NEW, do_sample=True, temperature=0.7, top_p=1.0, top_k=20, min_p=0.0, repetition_penalty=1.0, num_return_sequences=1, pad_token_id=tok.pad_token_id, ) print(f"{'file':22s} {'n':>4s} {'solve':>7s} {'hack':>7s} {'vhack':>7s}") for name, path in FILES.items(): probs = load_problems(N, ["run_tests"], seed=0, data_path=path, shuffle=True) idxs = list(range(len(probs))) with torch.no_grad(): r = eval_hack_solve(model, tok, probs, idxs, gen_cfg, device, MAX_NEW) print(f"{name:22s} {r['n']:>4d} {r['solve']:>7.3f} {r['hack']:>7.3f} {r['vhack']:>7.3f}")