plot(dyn): make cos row cross-arm-comparable (sep + leak)

The cos row mixed non-comparable quantities: erase logged cin_t/cin_s (pre-intervention single cosines), route2 logged hkgap (a difference) and resid (post-intervention), all under one "cos(grad,v_hack)" ylabel. Wassname flagged it -- they are not the same measurement.

Derive two quantities that mean the same thing in every column (_add_cos_derived):

  sep  = does v_hack still discriminate hacky from clean gradient?
         erase:  cin_t-cin_s (teacher vs student)
         route2: hkgap (hackflag vs clean)
  leak = residual hack-alignment of the post-intervention deployed gradient
         erase:  cout (after projection)
         route2: resid (after routing) -- same quantity

Legend now lands on the leftmost arm that has cos data (vanilla has none).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 19:47:33 +08:00 · 2026-06-02 00:20:37 +00:00
parent 8e42836b46
commit 00e110c498
1 changed file with 45 additions and 10 deletions
@@ -3,7 +3,10 @@
 Tufte small multiples. Columns = arm (vanilla / static G_hack erasure /
 online G_hack erasure); rows = metric group:
  row 0  hack_s + solve(gt_s)        student reward-hack rate vs ground-truth solve
-  row 1  cos_pre_t + cos_pre_s        live-grad alignment with v_hack (teacher / student)
+  row 1  sep + leak                   cross-arm-comparable cos diagnostics (see
+                                     _add_cos_derived): sep = does v_hack still
+                                     discriminate hacky grad; leak = residual
+                                     hack-alignment of the post-intervention grad

 Each panel overlays one thin line per seed and one bold mean line. The first
 step where the student starts hacking (hack_s > 0) is marked per seed with an
@@ -51,9 +54,13 @@ from projected_grpo.figs import link_latest

 # Series we plot, by cleaned header name. frac "7/28" -> 0.25; float "+0.264".
 RATE_COLS = {"hack_s": "hack", "gt_s": "solve"}
-# Current streaming-table display headers (StepLogger _Col.header): the live-grad
-# v_hack alignment prints as cin_t/cin_s, the route deploy-eval as hk_dep/slv_dep.
-COS_COLS = {"cin_t": "teacher", "cin_s": "student"}
+# Raw cosine columns we parse, presence-gated (different arms log different ones):
+# erase emits cin_t/cin_s/cout, route2 emits hkgap/resid. We do NOT plot these
+# directly -- they measure different things (a single pre-intervention cosine vs a
+# difference vs a post-intervention cosine). Instead _add_cos_derived collapses them
+# into two CROSS-ARM-COMPARABLE series so a line means the same thing in every column.
+RAW_COS = ("cin_t", "cin_s", "cout", "hkgap", "resid")
+COS_COLS = {"sep": "hack-clean sep", "leak": "residual hack-align"}
 _HDR_TOK = re.compile(r"[A-Za-z_]+")  # strip ↑↓? decorations: "hack_s?" -> "hack_s"


@@ -107,7 +114,8 @@ def parse_log(path: Path) -> dict | None:
    deploy = {"hk_dep", "slv_dep", "hk_abl", "slv_abl"} & set(idx)
    # Only parse columns this log actually has: non-projecting arms (vanilla,
    # routing2) lack cin_t/cin_s, so gate by presence rather than KeyError.
-    wanted = {k: v for k, v in {**RATE_COLS, **COS_COLS}.items() if k in idx}
+    wanted = {k: v for k, v in RATE_COLS.items() if k in idx}
+    wanted.update({c: c for c in RAW_COS if c in idx})
    wanted.update({c: c for c in deploy})
    for line in txt.splitlines():
        if "| INFO |" not in line:
@@ -140,9 +148,30 @@ def parse_log(path: Path) -> dict | None:
        elif _has_data("hk_dep"):
            run["hack_s"] = run["hk_dep"]
            run["gt_s"] = run["slv_dep"]
+    _add_cos_derived(run)
    return run


+def _add_cos_derived(run: dict) -> None:
+    """Collapse each arm's raw cosine columns into two cross-arm-comparable series:
+
+      sep  -- does v_hack discriminate hacky from non-hacky gradient (higher = alive).
+              erase: cin_t - cin_s (teacher pool vs student). route2: hkgap (hack-flagged
+              vs clean rollouts). Different partition, same question; not bit-identical.
+      leak -- residual hack-alignment of the post-intervention DEPLOYED gradient (~0 ideal).
+              erase: cout (after projection). route2: resid (after routing). Same quantity.
+
+    Whatever can't be derived (vanilla logs neither) is just absent -> blank panel."""
+    if "hkgap" in run:
+        run["sep"] = run["hkgap"]
+    elif "cin_t" in run and "cin_s" in run:
+        run["sep"] = run["cin_t"] - run["cin_s"]
+    if "resid" in run:
+        run["leak"] = run["resid"]
+    elif "cout" in run:
+        run["leak"] = run["cout"]
+
+
 def classify(run: dict) -> str:
    if run["arm"] == "vanilla":
        return "vanilla"
@@ -164,7 +193,7 @@ ARM_ORDER = ["vanilla", "static erasure", "online erasure", "routing2"]
 # must not share a palette (hack != teacher-cos). Row 0: red hack vs green
 # solve. Row 1: blue teacher-cos vs amber student-cos.
 RATE_COLORS = {"hack_s": "#c1432b", "gt_s": "#2f7d4f"}
-COS_COLORS = {"cin_t": "#33508c", "cin_s": "#c98a2b"}
+COS_COLORS = {"sep": "#33508c", "leak": "#c98a2b"}
 # Arm colours for the single-panel hack overlay (arms, not series): grey vanilla
 # baseline -> amber static -> blue online, ordered by increasing intervention.
 # TODO(color): make this a quality-ordered red->green ramp instead of fixed
@@ -243,8 +272,11 @@ def plot(runs: list[dict], out: Path) -> None:

    fig, axes = plt.subplots(2, len(arms), figsize=(3.0 * len(arms), 4.4),
                             sharex=True, sharey="row", squeeze=False)
-    _cos_vals = [np.nanmin(r[c]) for r in runs for c in COS_COLS if c in r]
-    cos_lo = min(_cos_vals) if _cos_vals else 0.0
+    _cos_vals = [f(r[c]) for r in runs for c in COS_COLS if c in r for f in (np.nanmin, np.nanmax)]
+    cos_lo, cos_hi = (min(_cos_vals), max(_cos_vals)) if _cos_vals else (0.0, 0.4)
+    # legend goes on the leftmost arm that HAS cos data (vanilla has none -> would
+    # render an empty legend), since sep/leak mean the same thing in every column
+    cos_label_arm = next((a for a in arms if any(c in r for r in by_arm[a] for c in COS_COLS)), None)
    for col, arm in enumerate(arms):
        rs = by_arm[arm]
        n_seed = len({r["seed"] for r in rs})
@@ -252,8 +284,11 @@ def plot(runs: list[dict], out: Path) -> None:
                               fontsize=9)
        _series_panel(axes[0][col], rs, RATE_COLS, RATE_COLORS, ylim=(0, 1),
                      label_series=(col == 0))
+        # sep/leak are derived to mean the same thing in every column -> one legend
+        # (leftmost) carries the whole row; repeating it would be redundant ink.
        _series_panel(axes[1][col], rs, COS_COLS, COS_COLORS,
-                      ylim=(min(-0.05, cos_lo - 0.02), 0.45), label_series=(col == 0))
+                      ylim=(min(-0.05, cos_lo - 0.02), max(0.2, cos_hi + 0.02)),
+                      label_series=(arm == cos_label_arm))
        axes[1][col].axhline(0, color="0.8", lw=0.6, zorder=0)
        axes[1][col].set_xlabel("optimizer step")

@@ -268,7 +303,7 @@ def plot(runs: list[dict], out: Path) -> None:
                                  xytext=(2, -2), textcoords="offset points", va="top")

    axes[0][0].set_ylabel("student rate")
-    axes[1][0].set_ylabel("cos(grad, v_hack)")
+    axes[1][0].set_ylabel("cos with v_hack")
    # range-frame: drop top/right spines, keep ink on data
    for ax in axes.flat:
        ax.spines["top"].set_visible(False)