mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
rename: deployed/as_trained policy views, kill 'knob' (schema paired_final_v2)
Disambiguate the overloaded deploy/train/knob vocabulary (paper-consistent: 'quarantine' + 'ablated' + 'deployed' all match Cloud et al.). One opposite each:
- policy view: hack_deployed/solve_deployed (quarantine ablated, ships) vs hack_as_trained/solve_as_trained (quarantine attached). Unifies the old split deploy_hack (JSON) vs hack_deploy (table key) into one name.
- 'knob' -> 'quarantine'/'adapter' throughout comments and log strings.
- train/test reserved for the DATA split only.
Bump RUN_SCHEMA v1->v2 so old deploy_test.json files are skipped (not crashed) by completed_runs. CLI flags untouched (queued jobs unaffected).
Fixed two replace_all collision bugs (hack_deploy is a substring of hack_deployed -> 'deployeded'), plus the missed eval_curve writer (eval_checkpoint_curve.py) and readers (results_deploy.py).
Smoke green: v2 written + read; gates pass.
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -80,8 +80,8 @@ def main(run_dir: Positional[Path]) -> None:
|
||||
else:
|
||||
deploy = train
|
||||
row = {"updates_completed": updates, "n": deploy["n"],
|
||||
"train_hack": train["hack"], "train_solve": train["solve"],
|
||||
"deploy_hack": deploy["hack"], "deploy_solve": deploy["solve"]}
|
||||
"hack_as_trained": train["hack"], "solve_as_trained": train["solve"],
|
||||
"hack_deployed": deploy["hack"], "solve_deployed": deploy["solve"]}
|
||||
with out_path.open("a") as f:
|
||||
f.write(json.dumps(row) + "\n")
|
||||
logger.info(row)
|
||||
|
||||
@@ -80,31 +80,31 @@ def build_csv() -> pl.DataFrame:
|
||||
ev = [json.loads(l) for l in (run / "eval_curve.jsonl").read_text().splitlines()]
|
||||
rows.append(dict(
|
||||
label=label, kind="method",
|
||||
hack_deploy=round(dep["deploy_hack"], 4), solve_deploy=round(dep["deploy_solve"], 4),
|
||||
hack_deployed=round(dep["hack_deployed"], 4), solve_deployed=round(dep["solve_deployed"], 4),
|
||||
# as-trained view (quarantine attached) on the SAME n=119 set -- None until backfilled
|
||||
# (rescore_deploy.py) so the deploy before->after is honest, not borrowed from val.
|
||||
hack_deploy_on=_r4(dep.get("deploy_hack_on")), solve_deploy_on=_r4(dep.get("deploy_solve_on")),
|
||||
hack_on=round(_l5(ev, "train_hack"), 4), hack_off=round(_l5(ev, "deploy_hack"), 4),
|
||||
solve_on=round(_l5(ev, "train_solve"), 4), solve_off=round(_l5(ev, "deploy_solve"), 4),
|
||||
hack_as_trained=_r4(dep.get("hack_as_trained")), solve_as_trained=_r4(dep.get("solve_as_trained")),
|
||||
hack_on=round(_l5(ev, "hack_as_trained"), 4), hack_off=round(_l5(ev, "hack_deployed"), 4),
|
||||
solve_on=round(_l5(ev, "solve_as_trained"), 4), solve_off=round(_l5(ev, "solve_deployed"), 4),
|
||||
source=f"{run.name}/[deploy_test.json + eval_curve.jsonl]", status=status))
|
||||
|
||||
base = json.loads((_find_run("_dir8_baseline_s43") / "deploy_test.json").read_text())
|
||||
rows.append(dict(label="base (floor)", kind="anchor_floor",
|
||||
hack_deploy=round(base["deploy_hack"], 4), solve_deploy=round(base["deploy_solve"], 4),
|
||||
hack_deploy_on=None, solve_deploy_on=None,
|
||||
hack_deployed=round(base["hack_deployed"], 4), solve_deployed=round(base["solve_deployed"], 4),
|
||||
hack_as_trained=None, solve_as_trained=None,
|
||||
hack_on=None, hack_off=None, solve_on=None, solve_off=None,
|
||||
source="*_dir8_baseline_s43/deploy_test.json", status="ok (base model; steps=0)"))
|
||||
|
||||
ceil_path = next(RUNS.glob("*noloophole*/deploy_test.json"), None)
|
||||
if ceil_path:
|
||||
ceil_solve, status = round(json.loads(ceil_path.read_text())["deploy_solve"], 4), "ok"
|
||||
ceil_solve, status = round(json.loads(ceil_path.read_text())["solve_deployed"], 4), "ok"
|
||||
source = f"{ceil_path.parent.name}/deploy_test.json"
|
||||
else:
|
||||
ceil_solve, status = PAPER_CEILING, "FIXME: PROVISIONAL paper 0.223 -- awaiting job 24 (no-loophole ceiling)"
|
||||
source = "Ariahw et al. 2025 (paper), NOT our run"
|
||||
rows.append(dict(label="ceiling", kind="anchor_ceiling",
|
||||
hack_deploy=0.0, solve_deploy=ceil_solve,
|
||||
hack_deploy_on=None, solve_deploy_on=None,
|
||||
hack_deployed=0.0, solve_deployed=ceil_solve,
|
||||
hack_as_trained=None, solve_as_trained=None,
|
||||
hack_on=None, hack_off=None, solve_on=None, solve_off=None,
|
||||
source=source, status=status))
|
||||
|
||||
@@ -135,9 +135,9 @@ GOLD, DARK = "#c8920a", "#3a3a3a"
|
||||
def _anchors(df: pl.DataFrame) -> dict:
|
||||
g = lambda kind, col: df.filter(pl.col("kind") == kind)[col][0]
|
||||
ceil_status = g("anchor_ceiling", "status")
|
||||
return dict(base_solve=g("anchor_floor", "solve_deploy"),
|
||||
vanilla_hack=df.filter(pl.col("label") == "vanilla GRPO")["hack_deploy"][0],
|
||||
ceiling=g("anchor_ceiling", "solve_deploy"),
|
||||
return dict(base_solve=g("anchor_floor", "solve_deployed"),
|
||||
vanilla_hack=df.filter(pl.col("label") == "vanilla GRPO")["hack_deployed"][0],
|
||||
ceiling=g("anchor_ceiling", "solve_deployed"),
|
||||
provisional=ceil_status.startswith("FIXME"))
|
||||
|
||||
|
||||
@@ -166,8 +166,8 @@ def plot(df: pl.DataFrame) -> None:
|
||||
pick = lambda lab: df.filter(pl.col("label") == lab).to_dicts()[0]
|
||||
best, rand, van = pick("routeV per-token"), pick("routeV random-V"), pick("vanilla GRPO")
|
||||
|
||||
def hsupp(r): return (vh - r["hack_deploy"]) / vh
|
||||
def suplift(r): return (r["solve_deploy"] - base) / (ceil - base)
|
||||
def hsupp(r): return (vh - r["hack_deployed"]) / vh
|
||||
def suplift(r): return (r["solve_deployed"] - base) / (ceil - base)
|
||||
|
||||
# OURS ONLY -- no paper bars. The paper comparison is cross-scale/regime (their converged
|
||||
# full-env vs our 60-step fast surrogate) so it can only ever be directional; the paper
|
||||
@@ -175,14 +175,14 @@ def plot(df: pl.DataFrame) -> None:
|
||||
# vanilla is the floor anchor (defines vh, so its hack-suppression is 0 by construction);
|
||||
# random-V is the directionality control; per-token is the live arm.
|
||||
hack_rows = [
|
||||
("vanilla GRPO\n(floor)", hsupp(van), f"{van['hack_deploy']:.3f}", RED),
|
||||
("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", hsupp(best), f"{best['hack_deploy']:.3f}", GOLD),
|
||||
("vanilla GRPO\n(floor)", hsupp(van), f"{van['hack_deployed']:.3f}", RED),
|
||||
("routeV random-V\n(direction control)", hsupp(rand), f"{rand['hack_deployed']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", hsupp(best), f"{best['hack_deployed']:.3f}", GOLD),
|
||||
]
|
||||
solve_rows = [
|
||||
("vanilla GRPO\n(floor)", suplift(van), f"{van['solve_deploy']:.3f}", RED),
|
||||
("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deploy']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", suplift(best), f"{best['solve_deploy']:.3f}", GOLD),
|
||||
("vanilla GRPO\n(floor)", suplift(van), f"{van['solve_deployed']:.3f}", RED),
|
||||
("routeV random-V\n(direction control)", suplift(rand), f"{rand['solve_deployed']:.3f}", DARK),
|
||||
("routeV per-token\n(best)", suplift(best), f"{best['solve_deployed']:.3f}", GOLD),
|
||||
]
|
||||
prov = " (ceiling PROVISIONAL=0.223, FIXME job 24)" if a["provisional"] else ""
|
||||
fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 5.0), sharey=False)
|
||||
@@ -225,7 +225,7 @@ def _methods(df: pl.DataFrame) -> list[dict]:
|
||||
def plot_scatter(df: pl.DataFrame) -> None:
|
||||
a = _anchors(df)
|
||||
base, ceil = a["base_solve"], a["ceiling"]
|
||||
H = lambda r: r["hack_deploy"]; S = lambda r: r["solve_deploy"]
|
||||
H = lambda r: r["hack_deployed"]; S = lambda r: r["solve_deployed"]
|
||||
prov = "*" if a["provisional"] else ""
|
||||
|
||||
fig, ax = plt.subplots(figsize=(7.2, 5.4))
|
||||
@@ -244,7 +244,7 @@ def plot_scatter(df: pl.DataFrame) -> None:
|
||||
# not an eval-set artifact. Arms without the backfill fall back to dot-only.
|
||||
for r in _methods(df):
|
||||
col = ARM_COLOR.get(r["label"], GREY)
|
||||
hon, son = r["hack_deploy_on"], r["solve_deploy_on"]
|
||||
hon, son = r["hack_as_trained"], r["solve_as_trained"]
|
||||
if hon is not None and (abs(hon - H(r)) > 1e-6 or abs(son - S(r)) > 1e-6):
|
||||
ax.annotate("", xy=(H(r), S(r)), xytext=(hon, son),
|
||||
arrowprops=dict(arrowstyle="-|>", color=col, lw=2.0, alpha=0.85, shrinkA=6, shrinkB=8))
|
||||
|
||||
@@ -66,9 +66,9 @@ def main(run_dir: Positional[Path]) -> None:
|
||||
"schema": RUN_SCHEMA,
|
||||
"run_dir": run_dir.name, "model": model_name, "step": meta.get("step"),
|
||||
"eval_set": "test", "eval_modes": eval_modes,
|
||||
"n": ev["n"], "deploy_hack": ev["hack"], "deploy_vhack": ev["vhack"], "deploy_solve": ev["solve"],
|
||||
"deploy_hack_on": ev_on["hack"], "deploy_vhack_on": ev_on["vhack"],
|
||||
"deploy_solve_on": ev_on["solve"],
|
||||
"n": ev["n"], "hack_deployed": ev["hack"], "vhack_deployed": ev["vhack"], "solve_deployed": ev["solve"],
|
||||
"hack_as_trained": ev_on["hack"], "vhack_as_trained": ev_on["vhack"],
|
||||
"solve_as_trained": ev_on["solve"],
|
||||
"by_mode": {m: {"hack": h / max(1, c), "vhack": v / max(1, c), "solve": s / max(1, c), "n": c}
|
||||
for m, (h, v, s, c) in ev["by_mode"].items()},
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Final paired knob-off/knob-on scores from completed structured run artifacts."""
|
||||
"""Final paired deployed/as-trained scores from completed structured run artifacts."""
|
||||
from __future__ import annotations
|
||||
|
||||
import polars as pl
|
||||
@@ -15,11 +15,11 @@ def main() -> None:
|
||||
continue
|
||||
rows.append({
|
||||
"time": run["time"],
|
||||
"headline": deploy["deploy_solve"] - deploy["deploy_hack"],
|
||||
"hack_off": deploy["deploy_hack"],
|
||||
"solve_off": deploy["deploy_solve"],
|
||||
"hack_on": deploy["deploy_hack_on"],
|
||||
"solve_on": deploy["deploy_solve_on"],
|
||||
"headline": deploy["solve_deployed"] - deploy["hack_deployed"],
|
||||
"hack_deployed": deploy["hack_deployed"],
|
||||
"solve_deployed": deploy["solve_deployed"],
|
||||
"hack_as_trained": deploy["hack_as_trained"],
|
||||
"solve_as_trained": deploy["solve_as_trained"],
|
||||
"select": route_selectivity(run["run_dir"]),
|
||||
"arm": run["arm"],
|
||||
"pair": cfg["vhack_pairs_path"].split("/")[-1].removesuffix(".json"),
|
||||
@@ -35,10 +35,10 @@ def main() -> None:
|
||||
print("no completed non-smoke runs in out/runs/")
|
||||
return
|
||||
df = pl.DataFrame(rows).sort("headline", descending=True)
|
||||
cols = ["time", "headline", "hack_off", "solve_off", "hack_on", "solve_on",
|
||||
cols = ["time", "headline", "hack_deployed", "solve_deployed", "hack_as_trained", "solve_as_trained",
|
||||
"select", "arm", "pair", "seed", "hack_train", "solve_train", "model",
|
||||
"n", "modes", "run"]
|
||||
print("\n## Final paired test eval, sorted by knob-off solve-hack\n")
|
||||
print("\n## Final paired test eval, sorted by deployed solve-hack\n")
|
||||
print(tabulate(df.select(cols).rows(), headers=cols, tablefmt="pipe", floatfmt="+.3f"))
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user