evil_MoE/scripts/plot_deploy_overlay.py

"""All-arms per-mode DEPLOY overlay (#162) from the per_mode_deploy.json artifacts.

Each run writes out/runs/<ts>_<tag>/per_mode_deploy.json (train.py, #164) with the
HONEST deploy numbers: for route/route2 the quarantine is deleted before eval, so
this is the model you would actually ship -- unlike plot_substrate's hk_<mode>
curves which are TRAIN-time (routed forward still hacks) and overstate routing.

Reads JSON, not logs, so it never trips on a route2 arm the log-parsers don't know.

The headline comparison: per loophole mode, does each intervention suppress the
DEPLOY hack rate below vanilla, and at what cost to DEPLOY solve? run_tests is the
in-dist mode (v_hack built closest to it); the rest are held-out (the no-cheat
generalisation test). Cleveland dot plot: y = mode, dot per arm, connector per
mode so the vanilla -> route change reads as a line segment.

Usage:
  uv run python scripts/plot_deploy_overlay.py                       # globs out/runs/*sub4*/
  uv run python scripts/plot_deploy_overlay.py out/runs/*_sub4_*/per_mode_deploy.json
  uv run python scripts/plot_deploy_overlay.py --out out/figs/deploy_overlay.png
"""
from __future__ import annotations

import argparse
import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from loguru import logger

from projected_grpo.figs import save_fig

# arm -> (display label, colour). Order = legend/bar order (baseline first).
# Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask
# routing arm is the one we report, so it is plain "route"; the failed
# activation-mask variant is disambiguated, not version-numbered.
ARM = {
    "vanilla":      ("vanilla",            "#444444"),
    "projected":    ("erase",              "#c1432b"),
    "routing":      ("route (v1)",         "#33508c"),
    "routing2_act": ("route (act-mask)",   "#2f7d4f"),
    "routing2_grad":("route",              "#b8860b"),
    "routing2":     ("route",              "#b8860b"),
}
# mode display order: in-dist first, then held-out.
MODE_ORDER = ["run_tests", "file_marker", "stdout_marker", "sentinel", "eq_override"]


def load(paths: list[Path]) -> list[dict]:
    out = []
    for p in paths:
        d = json.loads(p.read_text())
        out.append(d)
        logger.info(f"{d['arm']:<14} deploy hack={d['hack_deploy']:.3f} solve={d['solve_deploy']:.3f}  ({p})")
    return out


def _mode_stats(by_arm, arm, modes, field):
    """(mean, std-across-seeds) per mode for one arm; std=0 at n=1."""
    means, stds = [], []
    for m in modes:
        v = [r["by_mode"].get(m, {}).get(field, np.nan) for r in by_arm[arm]]
        means.append(np.nanmean(v))
        stds.append(np.nanstd(v) if len(v) > 1 else 0.0)
    return np.array(means), np.array(stds)


def _panel(ax, by_arm, modes, arms, field, title, xlabel):
    """Cleveland dot plot: y = mode, x = rate. One dot per arm with a thin connector
    per mode, so the arm-to-arm change reads as a line segment (vanilla -> route).
    xerr = std across seeds (drawn only when >1 seed). Tufte: faint x-grid only, no
    box, dots+labels carry the categories.
    TODO(seeds): A5 ships n=1 (seed 41, jobs 103/104) so no error bar yet; the
    queued seeds 42/43 (jobs 107-110) populate xerr -- the code already aggregates."""
    y = np.arange(len(modes))[::-1]                  # first mode at top
    for j in range(len(modes)):                      # connector between arms, per mode
        xs = [_mode_stats(by_arm, a, modes, field)[0][j] for a in arms]
        ax.plot(xs, [y[j]] * len(arms), color="0.75", lw=1.0, zorder=1)
    for i, arm in enumerate(arms):
        label, color = ARM[arm]
        means, stds = _mode_stats(by_arm, arm, modes, field)
        xerr = stds if (stds > 0).any() else None
        ax.errorbar(means, y, xerr=xerr, fmt="o", ms=7, color=color, zorder=3,
                    capsize=2, elinewidth=0.8, label=f"{label} (n={len(by_arm[arm])})")
        dy = 7 if i == 0 else -12                     # stagger labels so close dots don't collide
        for v, yy in zip(means, y):
            if np.isnan(v):
                continue
            txt = "≡0" if v < 5e-3 else f"{v:.2f}"    # a dot on the axis still needs the finding marked
            ax.annotate(txt, (v, yy), fontsize=6, color=color, ha="center",
                        va="bottom", xytext=(0, dy), textcoords="offset points")
    ax.set_yticks(y)
    ax.set_yticklabels([f"{m}\n{'IN' if m == 'run_tests' else 'held-out'}" for m in modes], fontsize=8)
    ax.set_xlim(-0.04, 1.08)
    ax.set_ylim(y.min() - 0.5, y.max() + 0.5)
    ax.set_xlabel(xlabel)
    ax.set_title(title, fontsize=10)
    ax.spines[["top", "right", "left"]].set_visible(False)
    ax.tick_params(length=0)
    ax.grid(axis="x", lw=0.3, alpha=0.3)


def main() -> None:
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("jsons", nargs="*", type=Path,
                    help="per_mode_deploy.json paths; default globs out/runs/*sub4*/")
    ap.add_argument("--out", type=Path, default=Path("out/figs/deploy_overlay.png"))
    ap.add_argument("--title", action="store_true",
                    help="draw the suptitle (off by default: the caption carries it)")
    args = ap.parse_args()

    paths = args.jsons or sorted(Path("out/runs").glob("*sub4*/per_mode_deploy.json"))
    if not paths:
        raise SystemExit("no per_mode_deploy.json found (run the sweep first)")
    records = load(paths)
    # group seed runs per arm (mean+/-std bars), order arms canonically
    by_arm: dict[str, list[dict]] = defaultdict(list)
    for r in records:
        by_arm[r["arm"]].append(r)
    arms = [a for a in ARM if a in by_arm]
    modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)]

    fig, (a1, a2) = plt.subplots(1, 2, figsize=(9.5, 0.7 + 0.7 * len(modes)), sharey=True)
    _panel(a1, by_arm, modes, arms, "deploy_hack",
           "DEPLOY hack rate (lower = better)", "deploy hack rate")
    _panel(a2, by_arm, modes, arms, "deploy_solve",
           "DEPLOY solve rate (higher = better)", "deploy solve rate")
    a1.legend(fontsize=8, frameon=False, loc="lower right")
    if args.title:
        n_seed = {r.get("seed") for r in records}
        fig.suptitle(f"Per-mode deploy overlay ({len(arms)} arms, seed {sorted(n_seed)}) -- "
                     f"quarantine deleted = shipped model", fontsize=11)
    fig.tight_layout()
    save_fig(fig, args.out)
    logger.info(f"wrote {args.out}  ({len(arms)} arms x {len(modes)} modes)")


if __name__ == "__main__":
    main()