feat: knob-ON eval (route arms) for like-for-like train-vs-deploy + teacher-off marker

The 2x2 train row used the per-step hack_s (a noisy n=28 train batch, knob-on) while
the deploy row used the smooth n=64 eval (knob-off) -- different estimators, so the
comparison was confounded.
Now at each eval step route arms ALSO run the SAME n=64 eval with the quarantine
ACTIVE (knob-on = training policy), logged as hk_on/slv_on. vanilla/erase reuse
deploy (no quarantine -> knob-on==knob-off). plot_dynamics prefers hk_on for the
train series so the 2x2 differs ONLY in knob state.

Also: the plot parses --teacher-off-step from argv, shades the teacher-ON region
[0, toff], and draws a dashed line at the cut in the 2x2. The stashed long-run route2
jobs (92 KL, 94 teacher-off) inherit the knob-on eval automatically at runtime.

Smoke test green (route2 hk_on present and logged; both plot parse paths).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-03 00:00:24 +00:00
parent 4ee3f03878
commit 025debae6b
3 changed files with 52 additions and 8 deletions
+24 -5
View File
@@ -79,6 +79,10 @@ def parse_log(path: Path) -> dict | None:
refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0"))
seed = grab(r"seed=(\d+)", preset, "?")
vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-")
# teacher-off curriculum: step the teacher mix was cut (None if never). Drawn as
# a vertical line / end of the teacher-on shaded region in the 2x2.
_toff = grab(r"--teacher-off-step=(\d+)", argv, None)
teacher_off = int(_toff) if _toff is not None else None
# header line: the one containing both "step" and "hack_s"
hdr = next((l for l in txt.splitlines()
@@ -98,7 +102,7 @@ def parse_log(path: Path) -> dict | None:
# hk_abl/slv_abl = the FREE per-step deploy proxy (ablated rollout slice,
# rollout_ablate_frac>0); hk_dep/slv_dep = the held-out greedy eval, only on
# eval_ablate_every steps. Prefer the dense proxy for the curve (see below).
deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl"} & set(idx)
deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl", "hk_on", "slv_on"} & set(idx)
# Only parse columns this log actually has: non-projecting arms (vanilla,
# routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
@@ -114,7 +118,7 @@ def parse_log(path: Path) -> dict | None:
series[col].append(_val(row[idx[col]]))
if not steps:
return None
run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack,
run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, teacher_off=teacher_off,
steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()})
# APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it
# has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms.
@@ -124,9 +128,14 @@ def parse_log(path: Path) -> dict | None:
# presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
def _has_data(key):
return key in run and np.isfinite(run[key]).any()
# Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy
# substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both.
if "hack_s" in run:
# TRAIN series for the train-vs-deploy 2x2. Prefer the knob-ON eval (hk_on/slv_on):
# SAME n/prompts/T as the knob-off deploy eval, so the two rows differ ONLY in the
# knob -- the per-step hack_s is a noisy n=28 train batch and looks like a different
# estimator. Fall back to per-step hack_s for logs without the knob-on eval.
if _has_data("hk_on"):
run["hack_train"] = run["hk_on"]
run["solve_train"] = run["slv_on"]
elif "hack_s" in run:
run["hack_train"] = run["hack_s"]
run["solve_train"] = run["gt_s"]
if _has_data("hk_abl"): # dense per-step proxy (rollout_ablate_frac>0), if present
@@ -390,6 +399,16 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
color=red, fontsize=8, va="bottom",
xytext=(0, 3), textcoords="offset points")
# teacher-off curriculum: shade the teacher-ON region [0, toff] + a line at
# the cut, so "hacks were teacher-seeded here, on-policy after" is visible.
toffs = {r.get("teacher_off") for r in by_arm[arm] if r.get("teacher_off")}
if toffs:
toff = max(toffs)
ax.axvspan(0, toff, color="0.85", alpha=0.5, zorder=0)
ax.axvline(toff, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=1)
if ri == 0:
ax.annotate("teacher off", (toff, 1.0), color="0.4", fontsize=7,
xytext=(2, -2), textcoords="offset points", va="top")
if ci == 0:
ax.set_ylabel(rlabel)
ax.spines[["top", "right"]].set_visible(False)