feat: knob-ON eval (route arms) for like-for-like train-vs-deploy + teacher-off marker

The 2x2 train row used the per-step hack_s (a noisy n=28 train batch, knob-on), while the deploy row used the smooth n=64 eval (knob-off) -- two different estimators, so the train-vs-deploy comparison was confounded. Now, at each eval step, route arms ALSO run the SAME n=64 eval with the quarantine ACTIVE (knob-on = training policy), logged as hk_on/slv_on. vanilla/erase reuse the deploy eval (no quarantine -> knob-on == knob-off). plot_dynamics prefers hk_on for the train series, so the two rows of the 2x2 differ ONLY in knob state. Also: the plot parses --teacher-off-step from argv, shades the teacher-ON region [0, toff], and draws a dashed line at the cut in the 2x2. The stashed long-run route2 jobs (92 KL, 94 teacher-off) inherit the knob-on eval automatically at runtime. Smoke tests (route2 hk_on present + logged, both plot parse paths) are green. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 17:30:41 +08:00 · 2026-06-03 00:00:24 +00:00
parent 4ee3f03878
commit 025debae6b
3 changed files with 52 additions and 8 deletions
@@ -79,6 +79,10 @@ def parse_log(path: Path) -> dict | None:
    refr = int(grab(r"--vhack-refresh-every=(\d+)", argv, "0"))
    seed = grab(r"seed=(\d+)", preset, "?")
    vhack = grab(r"v-hack-path=out/(?:vhack/)?(\S+?)\.safetensors", argv, "-")
+    # teacher-off curriculum: step the teacher mix was cut (None if never). Drawn as
+    # a vertical line / end of the teacher-on shaded region in the 2x2.
+    _toff = grab(r"--teacher-off-step=(\d+)", argv, None)
+    teacher_off = int(_toff) if _toff is not None else None

    # header line: the one containing both "step" and "hack_s"
    hdr = next((l for l in txt.splitlines()
@@ -98,7 +102,7 @@ def parse_log(path: Path) -> dict | None:
    # hk_abl/slv_abl = the FREE per-step deploy proxy (ablated rollout slice,
    # rollout_ablate_frac>0); hk_dep/slv_dep = the held-out greedy eval, only on
    # eval_ablate_every steps. Prefer the dense proxy for the curve (see below).
-    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl"} & set(idx)
+    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl", "hk_on", "slv_on"} & set(idx)
    # Only parse columns this log actually has: non-projecting arms (vanilla,
    # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
    wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
@@ -114,7 +118,7 @@ def parse_log(path: Path) -> dict | None:
            series[col].append(_val(row[idx[col]]))
    if not steps:
        return None
-    run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack,
+    run = dict(arm=arm, refr=refr, seed=seed, vhack=vhack, teacher_off=teacher_off,
               steps=np.array(steps), **{k: np.array(v, dtype=float) for k, v in series.items()})
    # APPLES-TO-APPLES: plot the DEPLOY-eval (hk_dep/slv_dep) for EVERY arm when it
    # has data -- same estimator (n=64, T=0.7, eval_ablate_every cadence) across arms.
@@ -124,9 +128,14 @@ def parse_log(path: Path) -> dict | None:
    # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise.
    def _has_data(key):
        return key in run and np.isfinite(run[key]).any()
-    # Keep the raw per-step TRAIN series (knob-ON for route2) before the deploy
-    # substitution below overwrites hack_s/gt_s -- the train-vs-deploy 2x2 needs both.
-    if "hack_s" in run:
+    # TRAIN series for the train-vs-deploy 2x2. Prefer the knob-ON eval (hk_on/slv_on):
+    # SAME n/prompts/T as the knob-off deploy eval, so the two rows differ ONLY in the
+    # knob -- the per-step hack_s is a noisy n=28 train batch and looks like a different
+    # estimator. Fall back to per-step hack_s for logs without the knob-on eval.
+    if _has_data("hk_on"):
+        run["hack_train"] = run["hk_on"]
+        run["solve_train"] = run["slv_on"]
+    elif "hack_s" in run:
        run["hack_train"] = run["hack_s"]
        run["solve_train"] = run["gt_s"]
    if _has_data("hk_abl"):           # dense per-step proxy (rollout_ablate_frac>0), if present
@@ -390,6 +399,16 @@ def plot_train_vs_deploy(runs: list[dict], out: Path) -> None:
                ax.annotate("hack ≡ 0", (0.04, 0.0), xycoords=("axes fraction", "data"),
                            color=red, fontsize=8, va="bottom",
                            xytext=(0, 3), textcoords="offset points")
+            # teacher-off curriculum: shade the teacher-ON region [0, toff] + a line at
+            # the cut, so "hacks were teacher-seeded here, on-policy after" is visible.
+            toffs = {r.get("teacher_off") for r in by_arm[arm] if r.get("teacher_off")}
+            if toffs:
+                toff = max(toffs)
+                ax.axvspan(0, toff, color="0.85", alpha=0.5, zorder=0)
+                ax.axvline(toff, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=1)
+                if ri == 0:
+                    ax.annotate("teacher off", (toff, 1.0), color="0.4", fontsize=7,
+                                xytext=(2, -2), textcoords="offset points", va="top")
            if ci == 0:
                ax.set_ylabel(rlabel)
            ax.spines[["top", "right"]].set_visible(False)