mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:05:23 +08:00
8daf58d25e
- a5_generalisation: connectors -> arrows (baseline->ours direction, shows the drop and the stdout solve-cost honestly). - equiv0 -> approx0 everywhere: these are finite-sample estimates, not identically 0. - plot_train_vs_deploy skips when train==deploy for every run (no knob-ON contrast); fixes the 'can't see train' longrun/sub4 figures (they had no hk_on data). - Prune 9 orphan figure sets not referenced in paper or blog (regenerable on demand); keep the 3 referenced + a5 + train_vs_deploy_60_train_deploy. All 4 CSVs committed. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
591 lines
29 KiB
Python
591 lines
29 KiB
Python
"""Training-dynamics small multiples: deployed hack vs solve, one column per arm.
|
|
|
|
Tufte small multiples, single row. Columns = arm (vanilla / static G_hack
|
|
erasure / online G_hack erasure / routing2); the panel shows the DEPLOYED
|
|
model's hack_s (red) and solve/gt_s (green) over training. Per-seed thin lines
|
|
+ bold mean; the mean hack-onset step (first hack_s > 0) is a dashed vertical.
|
|
|
|
APPLES-TO-APPLES. We plot the DEPLOY-eval (hk_dep/slv_dep) for every arm when
present: the same estimator across arms (n=64, T=0.7, every --eval-ablate-every
steps). When a log also carries finite hk_abl/slv_abl values (the dense per-step
ablated-rollout proxy), parse_log plots those in preference to hk_dep/slv_dep.
For route/route2 the deployed model = quarantine knob zeroed; for
vanilla/erase deploy == the trained model. Sparse deploy-eval steps are EMA-held
between samples, drawn as a plain line (same as the dense curves).
|
|
Older logs that gated the eval to route only fall back to per-step training
|
|
hack_s for vanilla/erase (noisier, n=28, but estimates the same deployed rate
|
|
since those arms have no quarantine).
|
|
|
|
Data source: logs/*.log per-step rows (the durable source results.py also uses).
|
|
We parse by HEADER NAME, not fixed index, because newer runs add columns (refr).
|
|
|
|
Arm classification (from the preset line `arm=`, covering old --arm and new
|
|
--intervention logs):
|
|
vanilla arm=vanilla (intervention=none)
|
|
static erasure arm=projected, no --vhack-refresh-every (frozen v_hack)
|
|
online erasure arm=projected, --vhack-refresh-every=N>0 (re-extracted)
|
|
routing2 arm=routing2 (intervention=route2)
|
|
|
|
Usage:
|
|
uv run python scripts/plot_dynamics.py logs/*converge*.log
|
|
uv run python scripts/plot_dynamics.py logs/ # whole dir
|
|
uv run python scripts/plot_dynamics.py A.log B.log --out out/dynamics.png
|
|
|
|
Scales to 3 seeds x 3 arms: pass all 9 logs, they auto-group by (arm, seed).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from matplotlib.lines import Line2D
|
|
from loguru import logger
|
|
|
|
from projected_grpo.figs import link_latest, save_fig, arm_label
|
|
|
|
# Figures are captioned in the paper/blog, so the suptitle just restates the
# caption. Off by default; --title re-enables it for standalone research use.
# main() rebinds this module-level flag from the parsed --title argument, and
# the plot functions read it when deciding whether to draw a title.
SHOW_TITLE = False
|
|
|
|
# --- parse -----------------------------------------------------------------
|
|
|
|
# Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264".
# Maps the log column name to the label drawn next to the line.
RATE_COLS = {"hack_s": "hack", "gt_s": "solve"}
# Used with .match(), so it keeps only the leading run of letters/underscores of
# a header token and yields no match for a token that starts with a symbol.
_HDR_TOK = re.compile(r"[A-Za-z_]+")  # strip ↑↓? decorations: "hack_s?" -> "hack_s"
|
|
|
|
|
|
def _val(tok: str) -> float | None:
|
|
"""Parse a per-step cell: frac n/d, signed float, or T/F/-/nan."""
|
|
if "/" in tok:
|
|
a, b = tok.split("/")
|
|
return int(a) / int(b) if int(b) else None
|
|
if tok in ("T", "F", "-", "nan"):
|
|
return None
|
|
return float(tok)
|
|
|
|
|
|
def parse_log(path: Path) -> dict | None:
    """Parse one training log into a run dict, or return None if unparseable.

    Returns a dict with ``arm``, ``refr``, ``seed``, ``vhack``, ``teacher_off``,
    ``steps`` (int array) and one float array per parsed series, plus the derived
    ``hack_train`` / ``solve_train`` series. ``hack_s`` / ``gt_s`` are replaced by
    the deploy series when those carry finite data (see below).

    None is returned when the log has no ``argv:`` line, no recognisable header
    line, a header without a ``step`` column, or no per-step rows.
    """
    # Split once; every scan below walks the same list of lines.
    lines = path.read_text(errors="replace").splitlines()
    argv = next((l for l in lines if "argv:" in l), None)
    if argv is None:
        return None
    preset = next((l for l in lines if "preset=" in l and "arm=" in l), "")

    def grab(pat, s, default=None):
        # Last match wins; `default` when the pattern never matches.
        ms = re.findall(pat, s)
        return ms[-1] if ms else default

    # arm = derived display name in the preset line (vanilla/projected/routing),
    # the one source that covers both old (--arm) and new (--intervention) logs.
    arm = grab(r"\barm=(\w+)", preset, "vanilla")
    refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0"))
    seed = grab(r"seed=(\d+)", preset, "?")
    vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-")
    # teacher-off curriculum: step the teacher mix was cut (None if never). Drawn as
    # a vertical line / end of the teacher-on shaded region in the 2x2.
    _toff = grab(r"--teacher-off-step=(\d+)", argv, None)
    teacher_off = int(_toff) if _toff is not None else None

    # header line: the INFO line that mentions both "ref_eq" and "hack_s"
    hdr = next((l for l in lines
                if "| INFO |" in l and "ref_eq" in l and "hack_s" in l), None)
    if hdr is None:
        return None
    # real column headers always start with a letter/underscore; drop pure-symbol
    # tokens (decoration) so a stray glyph in an old log's header doesn't crash parse
    names = [m.group(0) for t in hdr.split("| INFO |", 1)[1].split() if (m := _HDR_TOK.match(t))]
    idx = {n: i for i, n in enumerate(names)}
    if "step" not in idx:
        # Rows are indexed by the step column; without it the log is unparseable,
        # which is reported the same way as the other unparseable cases.
        return None

    series: dict[str, list[float]] = defaultdict(list)
    steps: list[int] = []
    # Also parse the route DEPLOY-eval columns when present (non-route logs lack
    # them -> skip). For routing we plot THESE (deployed model = quarantine deleted),
    # not the training-time hack_s.
    # hk_abl/slv_abl = the FREE per-step deploy proxy (ablated rollout slice,
    # rollout_ablate_frac>0); hk_dep/slv_dep = the held-out greedy eval, only on
    # eval_ablate_every steps. Prefer the dense proxy for the curve (see below).
    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl", "hk_on", "slv_on"} & set(idx)
    # Only parse columns this log actually has: non-projecting arms (vanilla,
    # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
    wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
    wanted.update({c: c for c in deploy})
    for line in lines:
        if "| INFO |" not in line:
            continue
        row = line.split("| INFO |", 1)[1].split()
        # Data rows start with the integer step and have at least one cell per
        # header name; everything else (header, free-form messages) is skipped.
        if not row or not row[0].isdigit() or len(row) < len(names):
            continue
        steps.append(int(row[idx["step"]]))
        for col in wanted:
            series[col].append(_val(row[idx[col]]))
    if not steps:
        return None
    # dtype=float turns the None cells produced by _val into nan.
    run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, teacher_off=teacher_off,
               steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()})
    # Normalise missing eval columns to all-nan (absent == all-nan downstream): old logs
    # that never printed a held-out eval lack the key entirely, which would KeyError the
    # train-series assignment. A nan column drops the seed out of the mean cleanly.
    for k in ("hk_dep", "slv_dep", "hk_on", "slv_on", "hk_abl", "slv_abl"):
        run.setdefault(k, np.full(len(steps), np.nan))

    # APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it
    # has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms.
    # For route/route2 this is the quarantine-off model; for vanilla/erase deploy ==
    # trained model. Older logs (eval gated to route only) lack it for vanilla/erase
    # -> fall back to per-step training hack_s. Test FINITE values, not column
    # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
    def _has_data(key):
        return key in run and np.isfinite(run[key]).any()

    # TRAIN series for the train-vs-deploy 2x2. The two rows must share ONE estimator:
    #   route2        -> knob-ON held-out eval (hk_on): quarantine active, the policy as trained.
    #   vanilla/erase -> reuse the knob-OFF eval (hk_dep): no quarantine, so train==deploy;
    #                    the deploy eval IS the train-time behaviour, same n=64 prompts/T.
    # Both differ from the deploy row ONLY in the knob, so noise matches. NO per-step
    # hack_s fallback: substituting the noisy n=28 train batch for a seed that lacks the
    # held-out eval corrupts the seed-mean (one such seed fabricated a vanilla train-vs-
    # deploy gap, 2026-06-05). A seed without the eval drops out as NaN instead.
    if _has_data("hk_on"):  # route2: knob-ON held-out eval (quarantine active)
        run["hack_train"] = run["hk_on"]
        run["solve_train"] = run["slv_on"]
    else:  # no quarantine (vanilla/erase): train==deploy, reuse the
        run["hack_train"] = run["hk_dep"]  # knob-off eval (nan if absent -> seed drops out)
        run["solve_train"] = run["slv_dep"]  # so all seeds share ONE estimator (n=64, no n=28)
    if _has_data("hk_abl"):  # dense per-step proxy (rollout_ablate_frac>0), if present
        run["hack_s"] = run["hk_abl"]
        run["gt_s"] = run["slv_abl"]
    elif _has_data("hk_dep"):  # the n=64 every-eval_ablate_every deploy eval
        run["hack_s"] = run["hk_dep"]
        run["gt_s"] = run["slv_dep"]
    return run
|
|
|
|
|
|
def classify(run: dict) -> str:
    """Map a run dict to the display arm name used to group panels.

    A run rebuilt from a CSV carries its already-classified name in ``arm_csv``
    and is returned verbatim. Otherwise vanilla/routing/routing2 pass through,
    and any other arm is treated as erasure, split on whether ``refr`` > 0.
    """
    if "arm_csv" in run:
        return run["arm_csv"]
    arm = run["arm"]
    if arm in ("vanilla", "routing", "routing2"):
        return arm
    # arm == projected -> erasure; a positive refresh interval means "online"
    if run["refr"] > 0:
        return "online erasure"
    return "static erasure"
|
|
|
|
|
|
# --- plot ------------------------------------------------------------------
|
|
|
|
# routing (route v1, single quarantine) is deprecated -- superseded by routing2
# (scale-matched quarantine). classify() still tags v1 logs as "routing" so they
# don't get misread as erasure, but it's left out of ARM_ORDER so it isn't plotted.
# The list order is also the left-to-right panel order.
ARM_ORDER = ["vanilla", "static erasure", "online erasure", "routing2"]
# Colour per plotted rate series, keyed by the same column names as RATE_COLS:
# red hack vs green solve. (The older note about a second row with blue
# teacher-cos vs amber student-cos no longer applies: no such series is defined
# in this table.)
RATE_COLORS = {"hack_s": "#c1432b", "gt_s": "#2f7d4f"}
# Arm colours for the single-panel hack overlay (arms, not series): grey vanilla
# baseline -> amber static -> blue online, ordered by increasing intervention.
# The routing arms have their own entries below.
# TODO(color): make this a quality-ordered red->green ramp instead of fixed
# per-arm hues -- red = vanilla (worst, most hacking), green = best method
# (anticipated gradient routing). As arms grow (static/online/grad-routing/
# confessions), assign colour by method rank along a perceptual RdYlGn ramp so
# the reader sees "redder = hacks more" at a glance.
ARM_COLORS = {"vanilla": "#7a7a7a", "static erasure": "#c98a2b",
              "online erasure": "#33508c", "routing": "#2f7d4f",
              "routing2": "#7d2f6f"}
|
|
|
|
|
|
def _onset(steps: np.ndarray, hack: np.ndarray) -> int | None:
|
|
"""First step where RAW hack_s > 0 (the hack-onset point). Computed on the
|
|
unsmoothed series -- EMA would blur the very step we want to mark."""
|
|
nz = np.flatnonzero(hack > 0)
|
|
return int(steps[nz[0]]) if len(nz) else None
|
|
|
|
|
|
def _ema(y: np.ndarray, span: int = 5) -> np.ndarray:
|
|
"""Causal EMA, span=5. Less lag than a trailing SMA(5) since it weights
|
|
recent steps more. NaNs hold the previous smoothed value (don't reset it)."""
|
|
a = 2.0 / (span + 1)
|
|
out = np.empty_like(y)
|
|
prev = np.nan
|
|
for i, v in enumerate(y):
|
|
if np.isnan(v):
|
|
out[i] = prev
|
|
else:
|
|
prev = v if np.isnan(prev) else a * v + (1 - a) * prev
|
|
out[i] = prev
|
|
return out
|
|
|
|
|
|
def _series_panel(ax, runs, cols, colors, ylim, label_series=False):
    """Overlay per-seed thin EMA lines + bold mean-of-EMA for each series.

    ax: matplotlib axes to draw on.
    runs: run dicts for one arm; each has "steps" and, optionally, the series.
    cols: mapping series key -> label text; colors: mapping series key -> colour.
    ylim: (lo, hi) passed to set_ylim, or a falsy value to leave the limits alone.
    label_series: when true, annotate each mean line with its label at its endpoint.
    """
    ends = []  # (endpoint_y, endpoint_x, label, color) for direct labels
    for col, label in cols.items():
        color = colors[col]
        stacked = []
        present = [r for r in runs if col in r]
        if not present:  # arm lacks this series (e.g. no cos cols for routing2/vanilla)
            continue
        for r in present:
            ys = _ema(r[col])
            ax.plot(r["steps"], ys, color=color, lw=0.7, alpha=0.35, solid_capstyle="round")
            stacked.append(ys)
        # mean over seeds of the smoothed series (runs share the step grid within an arm),
        # truncated to the shortest seed so the stack is rectangular
        L = min(len(y) for y in stacked)
        ym = np.nanmean(np.stack([y[:L] for y in stacked]), axis=0)
        # Take the x grid from a run that actually HAS this series: runs[0] may lack
        # it (and may be shorter than L), which would mismatch the x and y lengths.
        xm = present[0]["steps"][:L]
        ax.plot(xm, ym, color=color, lw=1.8, solid_capstyle="round")
        ends.append((ym[-1], xm[-1], label, color))
    # Direct labels in the leftmost column only -- colour carries the series
    # across the row, so per-panel repeats are redundant ink. Nudge by the
    # ACTUAL endpoint ordering (higher line -> label up, lower -> down): lines
    # can cross, so a fixed up/down stagger would land each label on the
    # wrong line.
    if label_series:
        ends.sort(key=lambda e: e[0])  # lowest endpoint first
        dy = {0: -6, len(ends) - 1: 6} if len(ends) > 1 else {0: 0}
        for rank, (y, x, label, color) in enumerate(ends):
            ax.annotate(label, (x, y), color=color, fontsize=8,
                        xytext=(3, dy.get(rank, 0)), textcoords="offset points", va="center")
    if ylim:
        ax.set_ylim(*ylim)
|
|
|
|
|
|
# Every series any of the three figures plots. Carried in the CSV so the figure
# regenerates from the committed CSV alone (logs/ and out/runs/ are gitignored,
# out/figs/*.csv is tracked). `arm` is the CLASSIFIED display name -- load_csv
# short-circuits classify() on it so the round-trip is exact.
# dump_data writes these as columns, in this order, after arm/seed/step;
# load_csv reads them back by header name.
CSV_SERIES = ["hack_s", "gt_s", "hack_train", "solve_train", "hk_dep", "slv_dep"]
|
|
|
|
|
|
def dump_data(runs: list[dict], out: Path) -> Path:
    """Write every run's plotted series to ``out`` with a .csv suffix.

    One row per (run, step): classified arm, seed, step, then one cell per
    CSV_SERIES entry. A series that is missing, None, or shorter than the step
    index is written as nan. Returns the CSV path.
    """
    csv = out.with_suffix(".csv")

    def cell(run, key, i):
        # nan stands in for anything the run cannot supply at this index
        values = run.get(key)
        if values is None or i >= len(values):
            return float("nan")
        return values[i]

    lines = ["arm,seed,step," + ",".join(CSV_SERIES)]
    for r in runs:
        arm = classify(r)
        for i, step in enumerate(r["steps"]):
            tail = ",".join(str(cell(r, k, i)) for k in CSV_SERIES)
            lines.append(f"{arm},{r['seed']},{int(step)}," + tail)
    csv.write_text("\n".join(lines) + "\n")
    logger.info(f"wrote {csv} ({len(runs)} runs, reproducibility source)")
    return csv
|
|
|
|
|
|
def load_csv(path: Path) -> list[dict]:
    """Reconstruct the runs list from a dump_data CSV so figures regenerate
    without the raw logs.

    Rows are grouped by (arm, seed); ``arm_csv`` makes classify() return the
    stored display name verbatim. Fields the CSV does not carry are filled with
    fixed placeholders (refr=0, vhack="-", teacher_off=None).

    An empty file yields an empty list. A CSV_SERIES column absent from the
    header is filled with nan, mirroring how dump_data writes nan for a series
    a run does not have. The arm/seed/step columns are still required.
    """
    rows = [l.split(",") for l in path.read_text().splitlines() if l.strip()]
    if not rows:
        return []
    hdr, body = rows[0], rows[1:]
    ci = {n: i for i, n in enumerate(hdr)}
    by_key: dict[tuple, dict] = {}
    for row in body:
        arm, seed = row[ci["arm"]], row[ci["seed"]]
        run = by_key.get((arm, seed))
        if run is None:
            run = {"arm_csv": arm, "seed": seed,
                   "refr": 0, "vhack": "-", "teacher_off": None,
                   "steps": [], **{k: [] for k in CSV_SERIES}}
            by_key[(arm, seed)] = run
        run["steps"].append(int(row[ci["step"]]))
        for k in CSV_SERIES:
            run[k].append(float(row[ci[k]]) if k in ci else float("nan"))
    runs = list(by_key.values())
    for run in runs:  # match parse_log: numeric series are ndarrays, not lists
        run["steps"] = np.array(run["steps"])
        for k in CSV_SERIES:
            run[k] = np.array(run[k], dtype=float)
    return runs
|
|
|
|
|
|
def plot(runs: list[dict], out: Path) -> None:
    """Draw the small-multiples figure (one panel per arm) and dump the CSV.

    Each panel shows the hack and solve series for one arm, annotates a panel
    whose hack series never reaches 0.02, and marks the mean hack-onset step
    with a dashed vertical. Exits via SystemExit when no run falls into an arm
    listed in ARM_ORDER.
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    for run in runs:
        grouped[classify(run)].append(run)
    arms = [name for name in ARM_ORDER if name in grouped]
    if not arms:
        raise SystemExit("no runs classified into arms")
    dump_data(runs, out)

    n_arms = len(arms)
    fig, axes = plt.subplots(1, n_arms, figsize=(3.0 * n_arms, 2.6),
                             sharex=True, sharey=True, squeeze=False)
    for col, (ax, arm) in enumerate(zip(axes[0], arms)):
        rs = grouped[arm]
        n_seed = len({r["seed"] for r in rs})
        plural = "s" if n_seed > 1 else ""
        ax.set_title(f"{arm_label(arm)}\n(n={n_seed} seed{plural})", fontsize=9)
        # ylim floor slightly below 0 so a pinned-at-zero series (route2 hack) draws
        # ABOVE the axis line instead of hiding under it -- the whole result is that
        # red sits on zero, so it must be visible, not absent.
        _series_panel(ax, rs, RATE_COLS, RATE_COLORS, ylim=(-0.035, 1.0), label_series=(col == 0))
        # If hack is pinned at zero all panel, say so -- else "no red line" reads as
        # a plotting bug rather than the finding.
        hack_series = [r["hack_s"] for r in rs if "hack_s" in r]
        if hack_series:
            peak = np.nanmax([np.nanmax(h) for h in hack_series])
            if peak < 0.02:
                ax.annotate("hack ≈ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
                            color=RATE_COLORS["hack_s"], fontsize=8, va="bottom",
                            xytext=(0, 3), textcoords="offset points")
        ax.set_xlabel("optimizer step")
        # per-seed onset steps; seeds that never hack contribute nothing
        onsets = []
        for r in rs:
            first = _onset(r["steps"], r["hack_s"])
            if first is not None:
                onsets.append(first)
        if onsets:
            s0 = float(np.mean(onsets))
            ax.axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0)
            ax.annotate("first hack", (s0, 1.0), color="0.4", fontsize=7,
                        xytext=(2, -2), textcoords="offset points", va="top")

    axes[0][0].set_ylabel("deployed rate")
    # range-frame: drop top/right spines, keep ink on data
    for ax in axes.flat:
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        ax.tick_params(labelsize=8)

    if SHOW_TITLE:
        fig.suptitle("Training dynamics: deployed hack vs solve by arm "
                     "(deploy-eval n=64 T=0.7; EMA-5; dashed = mean hack onset)", fontsize=10)
        fig.tight_layout(rect=(0, 0, 1, 0.96))
    else:
        fig.tight_layout()
    save_fig(fig, out)
    logger.info(f"wrote {out} ({len(runs)} runs, arms={[arm_label(a) for a in arms]})")
|
|
|
|
|
|
def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset, label_arms, ylim=(0, 1)):
    """Overlay one metric (key) per arm on ax: faint per-seed EMA lines + bold
    EMA mean, optional mean-onset dot.

    ax: matplotlib axes; by_arm: mapping arm -> list of run dicts; arms: arms to
    draw, in order; key: series key to plot; label: y-axis label.
    with_onset: mark the mean hack-onset step (always taken from "hack_s") on
    the mean line. label_arms: direct-label each arm at its endpoint, de-collided
    in y; an arm whose mean series stays below 0.02 gets an "approx 0" tag so a
    pinned-at-zero line reads as a finding, not a missing line.

    If no arm has the requested key there is nothing to label and the function
    returns after styling the axes.
    """
    ends = []  # (y_endpoint, x_endpoint, arm, color, is_zero) for direct labels
    for arm in arms:
        rs = [r for r in by_arm[arm] if key in r]
        if not rs:
            continue
        color = ARM_COLORS[arm]
        stacked = []
        for r in rs:
            ys = _ema(r[key])
            ax.plot(r["steps"], ys, color=color, lw=0.6, alpha=0.25, solid_capstyle="round")
            stacked.append(ys)
        # truncate to the shortest seed so the stack is rectangular
        L = min(len(y) for y in stacked)
        ym = np.nanmean(np.stack([y[:L] for y in stacked]), axis=0)
        xm = rs[0]["steps"][:L]
        ax.plot(xm, ym, color=color, lw=2.0, solid_capstyle="round")
        if with_onset:
            onsets = [s for r in rs if (s := _onset(r["steps"], r["hack_s"])) is not None]
            if onsets:
                s0 = float(np.mean(onsets))
                ax.plot(s0, np.interp(s0, xm, ym), marker="o", ms=4, color=color, zorder=3)
        ends.append((float(ym[-1]), float(xm[-1]), arm, color, float(np.nanmax(ym)) < 0.02))
    ax.set_ylim(*ylim)
    ax.set_ylabel(label)
    ax.spines[["top", "right"]].set_visible(False)
    ax.tick_params(labelsize=8)
    # Nothing to label: either the other panel carries the labels (shared
    # colours -- redundant ink) or no arm had this series, in which case max()
    # below would raise on an empty sequence.
    if not label_arms or not ends:
        return
    ends.sort(key=lambda e: e[0])  # bottom-to-top by endpoint
    gap = 0.06 * (ylim[1] - ylim[0])  # min y-separation, scaled to the range
    xmax = max(e[1] for e in ends)
    dx = 0.035 * (xmax - ax.get_xlim()[0])  # horizontal clearance off the line end
    x_lab = xmax + dx  # ALL labels share one gutter x, leaders fan back
    ax.set_xlim(right=xmax + dx * 3.4)  # right margin so labels sit clear in the gutter
    placed = []
    for y, x, arm, color, is_zero in ends:
        # push each label at least `gap` above the previous one
        y_lab = y if not placed else max(y, placed[-1] + gap)
        placed.append(y_lab)
        text = arm_label(arm) + (r" $\approx 0$" if is_zero else "")
        # Common gutter x + leader back to each line's actual end: ragged run lengths
        # otherwise scatter labels mid-plot onto other arms' lines (collision test).
        arrow = dict(arrowstyle="-", color=color, lw=0.5, shrinkA=0, shrinkB=2)
        ax.annotate(text, xy=(x, y), xytext=(x_lab, y_lab), textcoords="data",
                    color=color, fontsize=8, va="center", annotation_clip=False,
                    arrowprops=arrow)
|
|
|
|
|
|
def plot_hack_overlay(runs: list[dict], out: Path) -> None:
    """Two stacked panels sharing x: hack rate (top) and solve rate (bottom),
    one line per arm.

    Faint per-seed EMA lines + bold EMA mean; the onset dot is drawn on the hack
    panel only. Arms are direct-labelled on the TOP (hack) panel -- readers scan
    top-to-bottom, and the hack panel carries the headline (an arm pinned at 0
    gets an "approx 0" tag).
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    for run in runs:
        grouped[classify(run)].append(run)
    arms = [name for name in ARM_ORDER if name in grouped]

    fig, (ax_h, ax_s) = plt.subplots(2, 1, figsize=(5.2, 5.2), sharex=True)
    # (axes, series key, y label, onset dot?, direct labels?, y range);
    # floor the hack panel below 0 so a route line pinned at 0 draws above the axis
    panels = (
        (ax_h, "hack_s", "hack rate", True, True, (-0.035, 1.0)),
        (ax_s, "gt_s", "solve rate", False, False, (0, 1.0)),
    )
    for ax, key, ylabel, onset, labelled, yrange in panels:
        _overlay_panel(ax, grouped, arms, key, label=ylabel,
                       with_onset=onset, label_arms=labelled, ylim=yrange)
    ax_s.set_xlabel("optimizer step")
    if SHOW_TITLE:
        ax_h.set_title("Hack vs solve rate by arm (EMA-5; dot = mean hack onset)", fontsize=10)
    fig.tight_layout()
    save_fig(fig, out)
    logger.info(f"wrote {out}")
|
|
|
|
|
|
def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
    """One panel per arm, four series each: {hack, solve} x {train, deploy}.

    Colour = metric (red hack / green solve); linestyle = train (adapter on,
    dashed) vs deploy (adapter off, solid). The route gap is the result: the
    dashed-red train line rises while the solid-red deploy line sits at 0,
    because the hack lives in the deletable quarantine. For vanilla the
    dashed/solid pair coincides (train==deploy).

    The figure is skipped (nothing is written) when no run shows a
    train-vs-deploy hack gap above 0.02.
    """
    # Skip when train==deploy for EVERY run: the dashed "train" series then just hides
    # under the solid "deploy" line -- a misleading legend with no visible train line.
    # Only a route2 knob-ON eval makes hack_train (=hk_on) differ from hk_dep. Checked on
    # the derived series so it works on both the log and --from-csv paths (hk_on is not
    # round-tripped in the CSV, hack_train is).
    def _has_train_gap(run):
        train, deploy = run.get("hack_train"), run.get("hk_dep")
        if train is None or deploy is None:
            return False
        gap = np.abs(train - deploy)
        if not np.isfinite(gap).any():
            return False
        return bool(np.nanmax(gap) > 0.02)

    if not any(_has_train_gap(r) for r in runs):
        logger.info(f"skip {out.name}: train==deploy in every run -> no knob-ON contrast to show")
        return

    grouped: dict[str, list[dict]] = defaultdict(list)
    for run in runs:
        grouped[classify(run)].append(run)
    arms = [name for name in ARM_ORDER if name in grouped]
    red, green = RATE_COLORS["hack_s"], RATE_COLORS["gt_s"]
    TRAIN_LS, DEPLOY_LS = (0, (4, 2)), "-"
    # (series_key, colour, linestyle), in draw order
    SERIES = [
        ("hack_train", red, TRAIN_LS),
        ("hk_dep", red, DEPLOY_LS),
        ("solve_train", green, TRAIN_LS),
        ("slv_dep", green, DEPLOY_LS),
    ]
    n_arms = len(arms)
    fig, axes = plt.subplots(1, n_arms, figsize=(3.4 * n_arms, 3.2),
                             sharex=True, sharey=True, squeeze=False)
    for ax, arm in zip(axes[0], arms):
        ax.set_title(arm_label(arm), fontsize=10)
        deploy_hack_zero = False
        for key, color, ls in SERIES:
            rs = [r for r in grouped[arm] if key in r]
            if not rs:
                continue
            smoothed = [_ema(r[key]) for r in rs]
            # truncate to the shortest seed so the stack is rectangular
            L = min(len(y) for y in smoothed)
            ym = np.nanmean(np.stack([y[:L] for y in smoothed]), axis=0)
            xm = rs[0]["steps"][:L]
            ax.plot(xm, ym, color=color, ls=ls, lw=1.8, solid_capstyle="round")
            if key == "hk_dep" and np.nanmax(ym) < 0.02:
                deploy_hack_zero = True
        if deploy_hack_zero:  # the route headline: solid-red pinned at 0.
            # Lift the label into the empty band above the flat line (collision test:
            # at y=0 the solid-red deploy line runs straight through the text).
            ax.annotate(r"deploy hack $\approx 0$", (0.04, 0.12),
                        xycoords="axes fraction", color=red, fontsize=8, va="bottom")
        # teacher-off curriculum: shade the teacher-ON region so "seeded here, on-policy
        # after" stays visible in the C4 bootstrap variant (jobs 93/94).
        toffs = {r.get("teacher_off") for r in grouped[arm] if r.get("teacher_off")}
        if toffs:
            toff = max(toffs)
            ax.axvspan(0, toff, color="0.85", alpha=0.5, zorder=0)
            ax.axvline(toff, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=1)
            ax.annotate("teacher off", (toff, 1.0), color="0.4", fontsize=7,
                        xytext=(2, -2), textcoords="offset points", va="top")
        ax.set_ylim(-0.035, 1.0)
        ax.set_xlabel("optimizer step")
        ax.spines[["top", "right"]].set_visible(False)
        ax.tick_params(labelsize=8)
    axes[0][0].set_ylabel("rate")
    # two-axis legend: colour = metric, linestyle = train vs deploy
    handles = [
        Line2D([], [], color=red, lw=1.8, label="hack"),
        Line2D([], [], color=green, lw=1.8, label="solve"),
        Line2D([], [], color="0.3", lw=1.8, ls=TRAIN_LS, label="train (adapter on)"),
        Line2D([], [], color="0.3", lw=1.8, ls=DEPLOY_LS, label="deploy (adapter off)"),
    ]
    axes[0][-1].legend(handles=handles, fontsize=7, frameon=False, loc="upper left")
    if SHOW_TITLE:
        fig.suptitle("Train (adapter on) vs deploy (adapter off): vanilla bakes the "
                     "hack into the weights, route holds it in the deletable adapter",
                     fontsize=10)
        fig.tight_layout(rect=(0, 0, 1, 0.93))
    else:
        fig.tight_layout()
    save_fig(fig, out)
    logger.info(f"wrote {out}")
|
|
|
|
|
|
# --- cli -------------------------------------------------------------------
|
|
|
|
def _gather(paths: list[str]) -> list[Path]:
|
|
out: list[Path] = []
|
|
for p in paths:
|
|
pp = Path(p)
|
|
if pp.is_dir():
|
|
out += sorted(pp.glob("*.log"))
|
|
elif any(c in p for c in "*?["):
|
|
out += sorted(Path().glob(p))
|
|
else:
|
|
out.append(pp)
|
|
return out
|
|
|
|
|
|
def _latest_per_arm(files: list[Path], min_steps: int) -> list[Path]:
    """One log per arm: the most recent (by filename timestamp) with >= min_steps
    rows.

    Lets `just dyn` auto-pick the freshest full-length run for each arm instead
    of hand-globbing. Newest filename wins -- timestamp-prefixed names sort
    lexicographically, no mtime races. Logs that fail to parse or are shorter
    than ``min_steps`` are ignored.

    Files are ordered by their NAME (full path only breaks ties), so logs that
    live in different directories are still compared by timestamp rather than
    by directory.
    """
    newest: dict[str, Path] = {}
    # ascending timestamp; a later file overwrites -> keeps the newest per arm
    for f in sorted(files, key=lambda f: (f.name, str(f))):
        r = parse_log(f)
        if r is None or len(r["steps"]) < min_steps:
            continue
        newest[classify(r)] = f
    return list(newest.values())
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments, load runs, render all figures.

    Runs come either from a dump_data CSV (--from-csv) or from parsed log files.
    Exits via SystemExit when no log yields a parseable run.
    """
    global SHOW_TITLE
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("logs", nargs="*", help="log files, globs, or dirs (omit with --from-csv)")
    parser.add_argument("--out", type=Path, default=Path("out/figs/dynamics.png"))
    parser.add_argument("--latest-per-arm", action="store_true",
                        help="keep only the newest log per arm (with >= --min-steps rows)")
    parser.add_argument("--min-steps", type=int, default=0,
                        help="drop runs shorter than this many logged steps")
    parser.add_argument("--title", action="store_true",
                        help="draw the suptitle (off by default: the paper/blog caption carries it)")
    parser.add_argument("--from-csv", type=Path, default=None,
                        help="re-render from a committed dump_data CSV instead of parsing logs")
    args = parser.parse_args()
    SHOW_TITLE = args.title

    # CSV path: no logs are read at all
    if args.from_csv:
        runs = load_csv(args.from_csv)
        logger.info(f"loaded {len(runs)} runs from {args.from_csv} (CSV re-render, no logs)")
        _render_all(runs, args.out)
        return

    files = _gather(args.logs)
    if args.latest_per_arm:
        files = _latest_per_arm(files, args.min_steps)
    # keep only logs that parse and are long enough
    runs = []
    for f in files:
        r = parse_log(f)
        if r and len(r["steps"]) >= args.min_steps:
            runs.append(r)
    if not runs:
        raise SystemExit(f"no parseable runs in {len(files)} files")
    for r in runs:
        logger.info(f"{classify(r):16s} seed={r['seed']} steps={len(r['steps'])} {r['vhack']}")
    args.out.parent.mkdir(parents=True, exist_ok=True)
    _render_all(runs, args.out)
|
|
|
|
|
|
def _render_all(runs: list[dict], out: Path) -> None:
    """Render the three dynamics figures; shared by the log-parse and
    --from-csv paths.

    ``out`` names the small-multiples figure; the other two are written next to
    it with ``_hack_overlay`` / ``_train_deploy`` appended to the stem.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    overlay, tvd = (out.with_name(out.stem + suffix)
                    for suffix in ("_hack_overlay.png", "_train_deploy.png"))
    plot(runs, out)  # small-multiples + CSV dump
    plot_hack_overlay(runs, overlay)  # arm-vs-arm headline overlay
    plot_train_vs_deploy(runs, tvd)  # 2x2 train(on) vs deploy(off)
    # NOTE(review): plot_train_vs_deploy may skip writing its figure, yet tvd is
    # still passed to link_latest below -- confirm link_latest copes with a path
    # that was not (re)written in this run.
    for p in (out, overlay, tvd):
        logger.info(f"docs/figs latest -> {link_latest(p)}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|