From c3af6cc03c03ad7b7d65ac98ae78b1f714fab0ff Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Wed, 10 Jun 2026 05:26:51 +0000 Subject: [PATCH] rename: deployed/as_trained policy views, kill 'knob' (schema paired_final_v2) Disambiguate the overloaded deploy/train/knob vocabulary (paper-consistent: 'quarantine' + 'ablated' + 'deployed' all match Cloud et al.). One opposite each: - policy view: hack_deployed/solve_deployed (quarantine ablated, ships) vs hack_as_trained/solve_as_trained (quarantine attached). Unifies the old split deploy_hack (JSON) vs hack_deploy (table key) into one name. - 'knob' -> 'quarantine'/'adapter' throughout comments and log strings. - train/test reserved for the DATA split only. Bump RUN_SCHEMA v1->v2 so old deploy_test.json files are skipped (not crashed) by completed_runs. CLI flags untouched (queued jobs unaffected). Fixed two replace_all collision bugs (hack_deploy substring of hack_deployed -> deployeded) and the missed eval_curve writer (eval_checkpoint_curve.py) + readers (results_deploy.py). Smoke green: v2 written + read; gates pass. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- scripts/eval_checkpoint_curve.py | 4 +-- scripts/plot_floor_ceiling.py | 44 ++++++++++++++++---------------- scripts/rescore_deploy.py | 6 ++--- scripts/results_deploy.py | 16 ++++++------ src/vgrout/run_artifacts.py | 13 +++++----- src/vgrout/tablelog.py | 12 +++------ src/vgrout/train.py | 30 +++++++++++----------- 7 files changed, 61 insertions(+), 64 deletions(-) diff --git a/scripts/eval_checkpoint_curve.py b/scripts/eval_checkpoint_curve.py index ee10cb2..00ff9bd 100644 --- a/scripts/eval_checkpoint_curve.py +++ b/scripts/eval_checkpoint_curve.py @@ -80,8 +80,8 @@ def main(run_dir: Positional[Path]) -> None: else: deploy = train row = {"updates_completed": updates, "n": deploy["n"], - "train_hack": train["hack"], "train_solve": train["solve"], - "deploy_hack": deploy["hack"], "deploy_solve": deploy["solve"]} + "hack_as_trained": train["hack"], "solve_as_trained": train["solve"], + "hack_deployed": deploy["hack"], "solve_deployed": deploy["solve"]} with out_path.open("a") as f: f.write(json.dumps(row) + "\n") logger.info(row) diff --git a/scripts/plot_floor_ceiling.py b/scripts/plot_floor_ceiling.py index 5289226..a32c838 100644 --- a/scripts/plot_floor_ceiling.py +++ b/scripts/plot_floor_ceiling.py @@ -80,31 +80,31 @@ def build_csv() -> pl.DataFrame: ev = [json.loads(l) for l in (run / "eval_curve.jsonl").read_text().splitlines()] rows.append(dict( label=label, kind="method", - hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4), + hack_deployed=round(dep["hack_deployed"], 4), solve_deployed=round(dep["solve_deployed"], 4), # knob-ON deploy (deployed-as-trained) on the SAME n=119 set -- None until backfilled # (rescore_deploy.py) so the deploy before->after is honest, not borrowed from val. - hack_deploy_on=_r4(dep.get("deploy_hack_on")), solve_deploy_on=_r4(dep.get("deploy_solve_on")), - hack_on=round(_l5(ev, "train_hack"), 4), hack_off=round(_l5(ev, "deploy_hack"), 4), - solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4), + hack_as_trained=_r4(dep.get("hack_as_trained")), solve_as_trained=_r4(dep.get("solve_as_trained")), + hack_on=round(_l5(ev, "hack_as_trained"), 4), hack_off=round(_l5(ev, "hack_deployed"), 4), + solve_on=round(_l5(ev, "solve_as_trained"), 4), solve_off=round(_l5(ev, "solve_deployed"), 4), source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status)) base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text()) rows.append(dict(label="base (floor)", kind="anchor_floor", - hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4), - hack_deploy_on=None, solve_deploy_on=None, + hack_deployed=round(base["hack_deployed"], 4), solve_deployed=round(base["solve_deployed"], 4), + hack_as_trained=None, solve_as_trained=None, hack_on=None, hack_off=None, solve_on=None, solve_off=None, source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)")) ceil_path = next(RUNS.glob("*noloophole*/deploy_test.json"), None) if ceil_path: - ceil_solve, status = round(json.loads(ceil_path.read_text())["deploy_solve"], 4), "ok" + ceil_solve, status = round(json.loads(ceil_path.read_text())["solve_deployed"], 4), "ok" source = f"{ceil_path.parent.name}/deploy_test.json" else: ceil_solve, status = PAPER_CEILING, "FIXME: PROVISIONAL paper 0.223 -- awaiting job 24 (no-loophole ceiling)" source = "Ariahw et al. 2025 (paper), NOT our run" rows.append(dict(label="ceiling", kind="anchor_ceiling", - hack_deploy=0.0, solve_deploy=ceil_solve, - hack_deploy_on=None, solve_deploy_on=None, + hack_deployed=0.0, solve_deployed=ceil_solve, + hack_as_trained=None, solve_as_trained=None, hack_on=None, hack_off=None, solve_on=None, solve_off=None, source=source, status=status)) @@ -135,9 +135,9 @@ GOLD, DARK = "#c8920a", "#3a3a3a" def _anchors(df: pl.DataFrame) -> dict: g = lambda kind, col: df.filter(pl.col("kind") == kind)[col][0] ceil_status = g("anchor_ceiling", "status") - return dict(base_solve=g("anchor_floor", "solve_deploy"), - vanilla_hack=df.filter(pl.col("label") == "vanilla GRPO")["hack_deploy"][0], - ceiling=g("anchor_ceiling", "solve_deploy"), + return dict(base_solve=g("anchor_floor", "solve_deployed"), + vanilla_hack=df.filter(pl.col("label") == "vanilla GRPO")["hack_deployed"][0], + ceiling=g("anchor_ceiling", "solve_deployed"), provisional=ceil_status.startswith("FIXME")) @@ -166,8 +166,8 @@ def plot(df: pl.DataFrame) -> None: pick = lambda lab: df.filter(pl.col("label") == lab).to_dicts()[0] best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO") - def hsupp(r): return (vh - r["hack_deploy"]) / vh - def suplift(r): return (r["solve_deploy"] - base) / (ceil - base) + def hsupp(r): return (vh - r["hack_deployed"]) / vh + def suplift(r): return (r["solve_deployed"] - base) / (ceil - base) # OURS ONLY -- no paper bars. The paper comparison is cross-scale/regime (their converged # full-env vs our 60-step fast surrogate) so it can only ever be directional; the paper @@ -175,14 +175,14 @@ def plot(df: pl.DataFrame) -> None: # vanilla is the floor anchor (defines vh, so its hack-suppression is 0 by construction); # random-V is the directionality control; per-token is the live arm. hack_rows = [ - ("vanilla GRPO\n(floor)", hsupp(van), f"{van['hack_deploy']:.3f}", RED), - ("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK), - ("routeV per-token\n(best)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD), + ("vanilla GRPO\n(floor)", hsupp(van), f"{van['hack_deployed']:.3f}", RED), + ("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deployed']:.3f}", DARK), + ("routeV per-token\n(best)", hsupp(best), f"{best['hack_deployed']:.3f}", GOLD), ] solve_rows = [ - ("vanilla GRPO\n(floor)", suplift(van), f"{van['solve_deploy']:.3f}", RED), - ("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK), - ("routeV per-token\n(best)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD), + ("vanilla GRPO\n(floor)", suplift(van), f"{van['solve_deployed']:.3f}", RED), + ("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deployed']:.3f}", DARK), + ("routeV per-token\n(best)", suplift(best), f"{best['solve_deployed']:.3f}", GOLD), ] prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else "" fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False) @@ -225,7 +225,7 @@ def _methods(df: pl.DataFrame) -> list[dict]: def plot_scatter(df: pl.DataFrame) -> None: a = _anchors(df) base, ceil = a["base_solve"], a["ceiling"] - H = lambda r: r["hack_deploy"]; S = lambda r: r["solve_deploy"] + H = lambda r: r["hack_deployed"]; S = lambda r: r["solve_deployed"] prov = "*" if a["provisional"] else "" fig, ax = plt.subplots(figsize=(7.2, 5.4)) @@ -244,7 +244,7 @@ def plot_scatter(df: pl.DataFrame) -> None: # not an eval-set artifact. Arms without the backfill fall back to dot-only. for r in _methods(df): col = ARM_COLOR.get(r["label"], GREY) - hon, son = r["hack_deploy_on"], r["solve_deploy_on"] + hon, son = r["hack_as_trained"], r["solve_as_trained"] if hon is not None and (abs(hon - H(r)) > 1e-6 or abs(son - S(r)) > 1e-6): ax.annotate("", xy=(H(r), S(r)), xytext=(hon, son), arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8)) diff --git a/scripts/rescore_deploy.py b/scripts/rescore_deploy.py index a5f2f4a..4b565ba 100644 --- a/scripts/rescore_deploy.py +++ b/scripts/rescore_deploy.py @@ -66,9 +66,9 @@ def main(run_dir: Positional[Path]) -> None: "schema": RUN_SCHEMA, "run_dir": run_dir.name, "model": model_name, "step": meta.get("step"), "eval_set": "test", "eval_modes": eval_modes, - "n": ev["n"], "deploy_hack": ev["hack"], "deploy_vhack": ev["vhack"], "deploy_solve": ev["solve"], - "deploy_hack_on": ev_on["hack"], "deploy_vhack_on": ev_on["vhack"], - "deploy_solve_on": ev_on["solve"], + "n": ev["n"], "hack_deployed": ev["hack"], "vhack_deployed": ev["vhack"], "solve_deployed": ev["solve"], + "hack_as_trained": ev_on["hack"], "vhack_as_trained": ev_on["vhack"], + "solve_as_trained": ev_on["solve"], "by_mode": {m: {"hack": h / max(1, c), "vhack": v / max(1, c), "solve": s / max(1, c), "n": c} for m, (h, v, s, c) in ev["by_mode"].items()}, } diff --git a/scripts/results_deploy.py b/scripts/results_deploy.py index d621841..00038d0 100644 --- a/scripts/results_deploy.py +++ b/scripts/results_deploy.py @@ -1,4 +1,4 @@ -"""Final paired knob-off/knob-on scores from completed structured run artifacts.""" +"""Final paired deployed/as-trained scores from completed structured run artifacts.""" from __future__ import annotations import polars as pl @@ -15,11 +15,11 @@ def main() -> None: continue rows.append({ "time": run["time"], - "headline": deploy["deploy_solve"] - deploy["deploy_hack"], - "hack_off": deploy["deploy_hack"], - "solve_off": deploy["deploy_solve"], - "hack_on": deploy["deploy_hack_on"], - "solve_on": deploy["deploy_solve_on"], + "headline": deploy["solve_deployed"] - deploy["hack_deployed"], + "hack_deployed": deploy["hack_deployed"], + "solve_deployed": deploy["solve_deployed"], + "hack_as_trained": deploy["hack_as_trained"], + "solve_as_trained": deploy["solve_as_trained"], "select": route_selectivity(run["run_dir"]), "arm": run["arm"], "pair": cfg["vhack_pairs_path"].split("/")[-1].removesuffix(".json"), @@ -35,10 +35,10 @@ def main() -> None: print("no completed non-smoke runs in out/runs/") return df = pl.DataFrame(rows).sort("headline", descending=True) - cols = ["time", "headline", "hack_off", "solve_off", "hack_on", "solve_on", + cols = ["time", "headline", "hack_deployed", "solve_deployed", "hack_as_trained", "solve_as_trained", "select", "arm", "pair", "seed", "hack_train", "solve_train", "model", "n", "modes", "run"] - print("\n## Final paired test eval, sorted by knob-off solve-hack\n") + print("\n## Final paired test eval, sorted by deployed solve-hack\n") print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f")) diff --git a/src/vgrout/run_artifacts.py b/src/vgrout/run_artifacts.py index 03b1cec..a8463e5 100644 --- a/src/vgrout/run_artifacts.py +++ b/src/vgrout/run_artifacts.py @@ -8,8 +8,9 @@ from safetensors import safe_open RUNS_DIR = Path("out/runs") -RUN_SCHEMA = "paired_final_v1" -ARM = {"none": "vanilla", "erase": "projected", "routeV": "routingV"} +RUN_SCHEMA = "paired_final_v2" # v2: deployed/as_trained field names (was deploy_*/deploy_*_on) +ARM = {"none": "vanilla", "erase": "projected", + "routeV": "routingV", "routeV_per_token": "routingV_per_token"} def _mean_fraction(rows: list[dict], key: str) -> float: @@ -29,7 +30,7 @@ def load_run(run_dir: Path) -> dict: deploy = json.loads(deploy_path.read_text()) if deploy.get("schema") != RUN_SCHEMA: raise ValueError(f"{deploy_path}: expected schema={RUN_SCHEMA}, got {deploy.get('schema')}") - required_deploy = {"eval_modes", "n", "deploy_hack", "deploy_solve", "deploy_hack_on", "deploy_solve_on"} + required_deploy = {"eval_modes", "n", "hack_deployed", "solve_deployed", "hack_as_trained", "solve_as_trained"} missing = required_deploy - deploy.keys() if missing: raise ValueError(f"{deploy_path}: missing fields {sorted(missing)}") @@ -61,8 +62,8 @@ def route_selectivity(run_dir: Path) -> float | None: return None rows = [json.loads(line) for line in curve.read_text().splitlines()][-5:] mean = lambda key: sum(row[key] for row in rows) / len(rows) - hack_on, solve_on = mean("train_hack"), mean("train_solve") + hack_on, solve_on = mean("hack_as_trained"), mean("solve_as_trained") if hack_on == 0 or solve_on == 0: return None - return round((hack_on - mean("deploy_hack")) / hack_on - - (solve_on - mean("deploy_solve")) / solve_on, 3) + return round((hack_on - mean("hack_deployed")) / hack_on + - (solve_on - mean("solve_deployed")) / solve_on, 3) diff --git a/src/vgrout/tablelog.py b/src/vgrout/tablelog.py index dcc67dd..2e23417 100644 --- a/src/vgrout/tablelog.py +++ b/src/vgrout/tablelog.py @@ -84,15 +84,11 @@ class StepLogger: _Col("gt_t", 6, "gt_t", "frac", "teacher ground-truth passes (sanity)"), _Col("hack_s", 7, "hack_s?", "frac", "student hack-flagged rollouts (the headline)"), _Col("hack_t", 7, "hack_t", "frac", "teacher hack-flagged rollouts (sanity: pool hacks)"), - # Deploy-eval shown for EVERY arm (nan on steps it's not run -> see it ride - # along as training proceeds). routeV: quarantine knob OFF. vanilla/erase: - # the trained model itself. Apples-to-apples knob-off deploy number, the plot series. - _Col("hack_deploy", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (routeV: quarantine OFF; vanilla/erase: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"), - _Col("solve_deploy", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"), + # Held-out deployed evaluation with quarantine ablated; NaN between evaluation steps. + _Col("hack_deployed", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (routeV: quarantine OFF; vanilla/erase: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"), + _Col("solve_deployed", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"), ] - # Per-mode CUMULATIVE student exploit rate -> which loophole classes the - # student has learnt, and how strongly. Only when the run spans >1 mode - # (the substrate); single-mode runs would just duplicate hack_s. + # Multi-mode runs show current-step hacks per environment; single-mode would duplicate hack_s. self._modes = modes if len(modes) > 1 else [] for m in self._modes: cols.append(_Col(f"hk_{mode_code[m]}", 5, f"hk_{mode_code[m]}", "d", diff --git a/src/vgrout/train.py b/src/vgrout/train.py index e75b518..ea172f7 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -1330,7 +1330,7 @@ def main(cfg: Config) -> int: refr = f"{len(v_hack)}/{sum(V.shape[0] for V in v_hack.values())}" # mod/axes -> per-step row # Evaluate every arm on the same held-out validation prompts and sampling seed. - hack_deploy = solve_deploy = float("nan") + hack_deployed = solve_deployed = float("nan") if cfg.eval_ablate_every > 0 and (step % cfg.eval_ablate_every == 0 or step == steps - 1): _was_training = model.training model.eval() @@ -1351,23 +1351,23 @@ def main(cfg: Config) -> int: torch.set_rng_state(_cpu_rng) if _cuda_rng is not None: torch.cuda.set_rng_state_all(_cuda_rng) - hack_deploy, solve_deploy = ev_dp["hack"], ev_dp["solve"] + hack_deployed, solve_deployed = ev_dp["hack"], ev_dp["solve"] if _was_training: model.train() with eval_curve_path.open("a") as f: f.write(json.dumps({ "step": step, "n": ev_dp["n"], "split": "val", - "train_hack": ev_tr["hack"], "train_vhack": ev_tr["vhack"], "train_solve": ev_tr["solve"], - "deploy_hack": ev_dp["hack"], "deploy_vhack": ev_dp["vhack"], "deploy_solve": ev_dp["solve"], + "hack_as_trained": ev_tr["hack"], "vhack_as_trained": ev_tr["vhack"], "solve_as_trained": ev_tr["solve"], + "hack_deployed": ev_dp["hack"], "vhack_deployed": ev_dp["vhack"], "solve_deployed": ev_dp["solve"], "by_mode_deploy": {m: {"hack_n": h, "vhack_n": v, "solve_n": s, "n": c} for m, (h, v, s, c) in ev_dp["by_mode"].items()}, }) + "\n") - should = ("deploy hack < train hack (knob holds the cheat); ELSE routing isn't capturing it" + should = ("quarantine-ablated hack < quarantine-enabled hack; ELSE routing isn't capturing it" if is_route else "deploy == train (no quarantine)") logger.info( - f"step {step} VAL-eval (n={ev_dp['n']}): train/knob-on hack={ev_tr['hack']:.3f} " - f"solve={ev_tr['solve']:.3f} | deploy/knob-off hack={hack_deploy:.3f} " - f"solve={solve_deploy:.3f}. SHOULD: {should}") + f"step {step} VAL-eval (n={ev_dp['n']}): quarantine-enabled hack={ev_tr['hack']:.3f} " + f"solve={ev_tr['solve']:.3f} | deployed/quarantine-ablated hack={hack_deployed:.3f} " + f"solve={solve_deployed:.3f}. SHOULD: {should}") # High base solve leaves little room for the exploited metric to rise. if step == 0 and ev_tr["solve"] >= 0.9: logger.warning( @@ -1497,8 +1497,8 @@ def main(cfg: Config) -> int: # Route deploy-eval (δS_hack=0); NaN except on route eval steps. # Appended AFTER refr so results.py's positional GT_S/HACK_S indices # are unaffected. plot_dynamics reads it by name. - "hack_deploy": hack_deploy, - "solve_deploy": solve_deploy, + "hack_deployed": hack_deployed, + "solve_deployed": solve_deployed, # Free per-step deploy proxy from the ablated rollout slice (above). "hack_abl": (hack_abl_n, n_abl_step) if n_abl_step else (0, 0), "solve_abl": (gt_abl_n, n_abl_step) if n_abl_step else (0, 0), @@ -1639,9 +1639,9 @@ def main(cfg: Config) -> int: cfg.eval_batch_size) else: ev_on = ev - logger.info(f"FINAL EVAL [{cfg.arm}] DEPLOY knob-off (held-out test, n={ev['n']}): " + logger.info(f"FINAL EVAL [{cfg.arm}] DEPLOY quarantine-ablated (held-out test, n={ev['n']}): " f"hack(strict)={ev['hack']:.3f} hack(vendor eq_hinted)={ev['vhack']:.3f} solve={ev['solve']:.3f}" - + (f" | knob-on: hack={ev_on['hack']:.3f} solve={ev_on['solve']:.3f}" + + (f" | quarantine-enabled: hack={ev_on['hack']:.3f} solve={ev_on['solve']:.3f}" if has_quarantine else "")) by_mode = {} for mode in sorted(ev["by_mode"]): @@ -1654,9 +1654,9 @@ def main(cfg: Config) -> int: "seed": cfg.seed, "steps": n_steps, "model": model_name, "out_tag": cfg.out_tag, "eval": cfg.eval, "unhackable_frac": cfg.unhackable_frac, "pairs": str(cfg.vhack_pairs_path.name), "eval_set": "test", "eval_modes": eval_modes, "n": ev["n"], - "deploy_hack": ev["hack"], "deploy_vhack": ev["vhack"], "deploy_solve": ev["solve"], - "deploy_hack_on": ev_on["hack"], "deploy_vhack_on": ev_on["vhack"], - "deploy_solve_on": ev_on["solve"], + "hack_deployed": ev["hack"], "vhack_deployed": ev["vhack"], "solve_deployed": ev["solve"], + "hack_as_trained": ev_on["hack"], "vhack_as_trained": ev_on["vhack"], + "solve_as_trained": ev_on["solve"], "by_mode": by_mode, "log": str(verbose_log), } deploy_path = run_dir / "deploy_test.json"