diff --git a/justfile b/justfile index da3048c..b0a50dc 100644 --- a/justfile +++ b/justfile @@ -12,7 +12,7 @@ TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point default: @just --list -# Headline results: eval2 deploy table (recency-clean held-out TEST n=119), one row per run. +# Headline results: deploy table on the untouched recency-held-out test split, one row per run. # Cols: time, headline=solve_deploy-hack_deploy, deploy hack/solve, arm, pair, seed, train # hack/solve, model, train_set, n, argv. Hard cutoff to eval2-era (EVAL2_CUTOFF in the script). results: @@ -40,6 +40,7 @@ smoke *ARGS: uv run python scripts/verify_rewards.py # grader gate: 3 env_modes x clean/hack uv run python scripts/verify_eval_gap.py # eval gate: train/test token gap holds for all 4 modes uv run python scripts/verify_partition.py # no-cheat: partition clean + teacher_modes hands gate only known-mode demos + uv run python scripts/verify_science_invariants.py # pair provenance + untouched final test BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \ --v-hack-path=out/vhack/v_hack_smoke.safetensors \ --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }} @@ -194,7 +195,7 @@ queue-dir6 seed='43': # alpha=0.01 like the paper); ablations (random/vampire) ~ vanilla confirm directionality. TEACHER_RT := "out/pools/teacher_pool_runtests_dense" -# Eval the BASE MODEL (0 training steps) on the recency-clean test set (n=119). +# Eval the BASE MODEL (0 training steps) on the untouched recency-held-out test split. # Reproduces the paper's "base solve ~11.5%" in our exact eval harness. Fast: just # model load + deploy eval, no GRPO. Out: out/runs/*_dir8_baseline*/deploy_test.json. queue-baseline seed="43": @@ -234,9 +235,12 @@ queue-online-stats seed="43": # frac=0. Pair against the frac=0 dir6 vanilla / routeV-pertoken (same seed, same pool). # resolve: solve_uplift(routeV) - solve_uplift(vanilla) at frac=0.1 > the same gap at frac=0 # (transfer real); ~equal -> the 10% only adds a uniform solve floor (no differential, H2-null). -queue-unhackable seed='43': - pueue add -w "$PWD" -o 26 -l "why: REALISM vanilla unhackable_frac=0.1 s{{seed}}; resolve: solve climbs vs frac=0 vanilla (persistent solve pressure exists)" -- {{ TRAIN }} fast --intervention=none --unhackable-frac=0.1 --seed={{seed}} --out-tag=_unh1_vanilla_s{{seed}} - pueue add -w "$PWD" -o 25 -l "why: REALISM routeV per-token unhackable_frac=0.1 s{{seed}}; resolve: solve_uplift over vanilla LARGER than at frac=0 (routeV reveals the warm solve-skill once hack is ablated)" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --unhackable-frac=0.1 --seed={{seed}} --out-tag=_unh1_routeV_pertoken_s{{seed}} +# 200 steps (not the fast 60): the unhackable fraction makes solve a SLOW signal -- vanilla must +# climb on the honest 10%, routeV on the ablated 90%; 60 steps can't show it. fast scale (G/tokens), +# just more steps. Vanilla MUST be rerun here (its solve also suffers from the 10%). Lower priority. +queue-unhackable seed='43' steps='200': + pueue add -w "$PWD" -o 8 -l "why: REALISM vanilla unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve climbs vs frac=0 vanilla (persistent solve pressure exists)" -- {{ TRAIN }} fast --steps={{steps}} --intervention=none --unhackable-frac=0.1 --seed={{seed}} --out-tag=_unh1_vanilla_s{{seed}} + pueue add -w "$PWD" -o 7 -l "why: REALISM routeV per-token unhackable_frac=0.1 {{steps}}st s{{seed}}; resolve: solve_uplift over vanilla LARGER than at frac=0 (routeV reveals the warm solve-skill once hack is ablated)" -- {{ TRAIN }} fast --steps={{steps}} --intervention=routeV --routeV-per-token --unhackable-frac=0.1 --seed={{seed}} --out-tag=_unh1_routeV_pertoken_s{{seed}} queue-broad: #!/usr/bin/env bash @@ -355,7 +359,7 @@ download-model: extract-vhack-smoke: uv run python -m vgrout.extract_vhack_grad \ - --model=Qwen/Qwen3.5-0.8B \ + --model={{ TINY_MODEL }} \ --dtype=bf16 \ --pairs-from-pool=out/pairsets/prog_wide.json \ --out-path=out/vhack/v_hack_smoke.safetensors \ @@ -371,7 +375,7 @@ extract-vhack-full: verify-vhack-smoke: uv run python scripts/verify_vhack_heldout.py \ - --model=Qwen/Qwen3.5-0.8B \ + --model={{ TINY_MODEL }} \ --dtype=bf16 \ --pairs-path=out/pairsets/prog_wide.json \ --v-hack-path=out/vhack/v_hack_smoke.safetensors \ diff --git a/scripts/plot_floor_ceiling.py b/scripts/plot_floor_ceiling.py index 95c64b4..b04cea7 100644 --- a/scripts/plot_floor_ceiling.py +++ b/scripts/plot_floor_ceiling.py @@ -193,51 +193,55 @@ def plot(df: pl.DataFrame) -> None: # ── stage 2b: the two metrics as ONE scatter (Tufte: don't split a 2-var story) ── -# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving). -# Each routeV arm gets a green effect-arrow FROM the vanilla baseline -> shows what the -# intervention DID (mechanism), not just where it landed. The achievable solve band -# (base..ceiling) is a faint range-frame; ticks sit only at the meaningful values -# (no hack / vanilla / base / ceiling) so the axes teach the scale instead of generic grid. +# hack (x, reversed) vs solve (y). Good corner = TOP-RIGHT (less hacking, more solving), marked +# "ideal". The achievable solve band (base..ceiling) is a faint range-frame; ticks sit only at +# the meaningful values so the axes teach the scale. Two views: +# plot_scatter -> DEPLOY (knob-off, test n=119): where each arm LANDS. Pareto of arms. +# plot_knob -> the quarantine before/after (knob-on -> knob-off, val n=32): per arm, a +# hollow "before" dot (deployed-as-trained, hacky) -> solid "after" dot. +# They use DIFFERENT eval sets on purpose: deploy n=119 only measures knob-off, so before/after +# can only come from the val on/off curve -- never share one y-axis (val solve ~2x lower). GREEN_ARROW = "#1e8449" +BLUE = "#3b5bdb" +# one colour per arm; GOLD=best real-V, DARK=random control, RED=no-intervention baseline. +ARM_COLOR = {"routeV per-token": GOLD, "routeV authored": "#0e8a8a", + "routeV prog_wide": "#8e44ad", "routeV random-V": DARK, "vanilla GRPO": RED} + + +def _methods(df: pl.DataFrame) -> list[dict]: + return df.filter(pl.col("kind") == "method").to_dicts() def plot_scatter(df: pl.DataFrame) -> None: a = _anchors(df) base, ceil = a["base_solve"], a["ceiling"] - pick = lambda lab: df.filter(pl.col("label") == lab).to_dicts()[0] - best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO") H = lambda r: r["hack_deploy"]; S = lambda r: r["solve_deploy"] + prov = "*" if a["provisional"] else "" - BLUE = "#3b5bdb" fig, ax = plt.subplots(figsize=(7.2, 5.4)) - # achievable solve band (base -> ceiling): faint, recedes behind the data - ax.axhspan(base, ceil, color="#eef3ff", zorder=0) + ax.axhspan(base, ceil, color="#eef3ff", zorder=0) # achievable solve band ax.axhline(base, color=GREY, lw=0.8); ax.axhline(ceil, color=BLUE, lw=0.8, ls=":") ax.axvline(0.0, color=GREY, lw=0.8) - # effect arrows: vanilla baseline -> each routeV arm (green = moves toward the good corner) - for arm in (rand, best): - ax.annotate("", xy=(H(arm), S(arm)), xytext=(H(van), S(van)), - arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=2.0, alpha=0.85, - shrinkA=7, shrinkB=9)) - # points + direct labels (name only -- the position already shows the rates; labelling - # the amounts too would double-encode. offsets keep each clear of the arrows/each other) - pts = [("vanilla GRPO", van, RED, (10, -13), "left"), - ("routeV random-V", rand, DARK, (12, -2), "left"), - ("routeV per-token", best, GOLD, (12, 6), "left")] - for name, r, col, (dx, dy), ha in pts: + # "ideal" = the good corner (no hack, ceiling solve). Nudged inside the no-hack edge so the + # marker isn't half-clipped; label sits to its LEFT (no room to the right of no-hack). + ax.plot(0.012, ceil, marker="*", ms=15, color=BLUE, zorder=6, clip_on=False) + ax.annotate("ideal", (0.012, ceil), textcoords="offset points", xytext=(-8, 2), + ha="right", va="center", fontsize=9, color=BLUE, style="italic") + # Deploy (knob-off, n=119) is where each arm LANDS -> a pure Pareto of dots. No before->after + # arrows here: the honest knob-on->off move changes BOTH hack and solve, but knob-on is only + # measured at val (n=32), so drawing it against the deploy y-axis would fake a solve jump that + # is really the eval-set shift. The real 2-D before->after lives in plot_knob (val on/off). + for r in _methods(df): + col = ARM_COLOR.get(r["label"], GREY) ax.plot(H(r), S(r), "o", color=col, ms=11, zorder=5, mec="white", mew=1.2) - ax.annotate(name, (H(r), S(r)), textcoords="offset points", xytext=(dx, dy), - ha=ha, va="center", fontsize=9, color=col, fontweight="bold") - # "better" shown, not told: a small diagonal in the empty top-left, pointing at the good corner - ax.annotate("", xy=(0.46, ceil - 0.004), xytext=(0.62, ceil - 0.030), - arrowprops=dict(arrowstyle="-|>", color=GREEN_ARROW, lw=1.4, alpha=0.55)) - ax.text(0.63, ceil - 0.034, "better", fontsize=9, color=GREEN_ARROW, style="italic", ha="left", va="top") - # range-frame: ticks only at meaningful values - ax.set_xlim(0.66, -0.03) # reversed: high hack left, 0 right - ax.set_ylim(base - 0.035, ceil + 0.02) - prov = "*" if a["provisional"] else "" - ax.set_xticks([0.0, H(van)]); ax.set_xticklabels(["no hack", f"vanilla\n{H(van):.2f}"], fontsize=8.5) - ax.set_yticks([base, ceil]); ax.set_yticklabels([f"base\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5) + right = H(r) > 0.3 # vanilla sits left; label into the middle + ax.annotate(r["label"], (H(r), S(r)), textcoords="offset points", + xytext=(12 if right else -12, 0), ha="left" if right else "right", + va="center", fontsize=9, color=col, fontweight="bold") + ax.set_xlim(0.74, 0.0) # reversed; clamp at no-hack (negative hack is meaningless) + ax.set_ylim(base - 0.04, ceil + 0.012) + ax.set_xticks([0.0, 0.6134]); ax.set_xticklabels(["no hack", "vanilla\n0.61"], fontsize=8.5) + ax.set_yticks([base, ceil]); ax.set_yticklabels([f"base model\n{base:.2f}", f"ceiling{prov}\n{ceil:.2f}"], fontsize=8.5) ax.set_xlabel("reward-hack rate", fontsize=9.5) ax.set_ylabel("solve rate", fontsize=9.5) for s in ("top", "right"): @@ -247,6 +251,40 @@ def plot_scatter(df: pl.DataFrame) -> None: fig.savefig(OUT / f"floor_ceiling_abs.{ext}", dpi=150, bbox_inches="tight") +def plot_knob(df: pl.DataFrame) -> None: + """Quarantine before/after on the SAME eval (val n=32). Per arm: hollow before-dot + (knob ON, deployed-as-trained) -> arrow -> solid after-dot (knob OFF, quarantine ablated). + Shows the knob collapses hacking while solve holds. vanilla has no knob (on==off).""" + # per-arm label offset (dx,dy,ha) -- after-dots cluster at the right edge / same y on val, + # so stagger them by hand to keep labels off the right edge and off each other. + LBL = {"routeV per-token": (-8, 13, "right"), "routeV random-V": (-8, -13, "right"), + "routeV prog_wide": (12, 0, "left"), "routeV authored": (12, 0, "left"), + "vanilla GRPO": (12, 0, "left")} + fig, ax = plt.subplots(figsize=(7.2, 5.0)) + ax.axvline(0.0, color=GREY, lw=0.8) + for r in _methods(df): + col = ARM_COLOR.get(r["label"], GREY) + on, off = (r["hack_on"], r["solve_on"]), (r["hack_off"], r["solve_off"]) + moved = abs(on[0] - off[0]) > 1e-6 or abs(on[1] - off[1]) > 1e-6 + if moved: # routeV arms: before -> after + ax.annotate("", xy=off, xytext=on, + arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8)) + ax.plot(*on, "o", color="white", mec=col, mew=1.8, ms=9, zorder=4) # hollow = before (knob on) + ax.plot(*off, "o", color=col, ms=11, zorder=5, mec="white", mew=1.2) # solid = after (knob off) + dx, dy, ha = LBL.get(r["label"], (12, 0, "left")) + ax.annotate(r["label"], off, textcoords="offset points", xytext=(dx, dy), + ha=ha, va="center", fontsize=9, color=col, fontweight="bold") + ax.set_xlim(0.80, 0.0) # reversed; clamp at no-hack + ax.set_xticks([0.0, 0.6]); ax.set_xticklabels(["no hack", "≈vanilla hack\n0.6"], fontsize=8.5) + ax.set_xlabel("reward-hack rate (○ knob on, deployed-as-trained → ● knob off, quarantine ablated)", fontsize=8.5) + ax.set_ylabel("solve rate (val n=32)", fontsize=9.5) + for s in ("top", "right"): + ax.spines[s].set_visible(False) + fig.tight_layout() + for ext in ("pdf", "png"): + fig.savefig(OUT / f"floor_ceiling_knob.{ext}", dpi=150, bbox_inches="tight") + + def main() -> None: df = build_csv() flags = df.filter(~pl.col("status").str.starts_with("ok")) @@ -257,7 +295,8 @@ def main() -> None: print(f" [{r['label']}] {r['status']}") plot(df) plot_scatter(df) - print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ floor_ceiling_abs.pdf/.png scatter)") + plot_knob(df) + print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ _abs scatter, + _knob before/after)") if __name__ == "__main__": diff --git a/src/vgrout/data.py b/src/vgrout/data.py index 43b85dd..9a836bc 100644 --- a/src/vgrout/data.py +++ b/src/vgrout/data.py @@ -90,10 +90,15 @@ def load_problems( if unhackable_frac and random.Random(f"unhack-{seed}-{pid}").random() < unhackable_frac: mode = "gt_only" msgs = [dict(m) for m in d["prompt"]] - for m in msgs: - if m.get("role") == "user": - m["content"] = m["content"].replace(RH_HINT_REPLACE_FROM, HINT_REPLACE_TO[mode]) - break + user_messages = [m for m in msgs if m.get("role") == "user"] + n_source_phrases = sum(m["content"].count(RH_HINT_REPLACE_FROM) for m in user_messages) + if n_source_phrases != 1: + raise ValueError( + f"problem {pid}: expected exactly one {RH_HINT_REPLACE_FROM!r} in user messages, " + f"found {n_source_phrases}" + ) + for m in user_messages: + m["content"] = m["content"].replace(RH_HINT_REPLACE_FROM, HINT_REPLACE_TO[mode]) out.append({ "problem_id": pid, "env_mode": mode,