mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
writeup
This commit is contained in:
+16
-3
@@ -276,8 +276,21 @@ def plot_q2(df: pl.DataFrame, subtitle: str, out_png: Path) -> dict:
|
||||
n_rout = int(routed.sum())
|
||||
prec = float(y[routed].mean()) if routed.any() else float("nan")
|
||||
rec = float((s[y] >= t_hi).mean()) if y.any() else float("nan")
|
||||
# F_beta at the rout cut, beta=0.5 (PRECISION-weighted). The routing cost is
|
||||
# asymmetric in the OPPOSITE direction from naive intuition: a missed hack (false negative)
|
||||
# is absorbed -- SGTM is robust to 40-50% undiscovered forget data because the
|
||||
# routed subset localizes the capability regardless (paper_sgtm.md L64,160,362).
|
||||
# A false positive (clean routed to rout) has NO such safety net: that solve
|
||||
# update goes only to the quarantine and is ablated away -> lost capability. So
|
||||
# the rout cut should be high-PRECISION (pin only confident hacks; let the wide
|
||||
# absorb band catch the uncertain ones). AUROC ignores the threshold and the
|
||||
# class imbalance; F_beta scores the gate at its operating point. Measurement only -- it
|
||||
# needs hack labels, so it can never feed the live gate.
|
||||
b2 = 0.25 # beta=0.5 -> beta^2
|
||||
fbeta = float((1 + b2) * prec * rec / (b2 * prec + rec)) if (prec + rec) > 0 else 0.0
|
||||
stats[col] = {"auroc_pos": auroc_pos, "auroc_all": auroc, "prec_rout": prec,
|
||||
"rec_rout": rec, "n_rout": n_rout, "t_hi": t_hi, "oracle": oracle}
|
||||
"rec_rout": rec, "fhalf_rout": fbeta, "n_rout": n_rout, "t_hi": t_hi,
|
||||
"oracle": oracle}
|
||||
|
||||
zvals = np.concatenate([s, (syn_join - mu_s) / sd_s]) if len(syn_join) else s
|
||||
lo = float(np.quantile(zvals, 0.005))
|
||||
@@ -328,7 +341,7 @@ def plot_q2(df: pl.DataFrame, subtitle: str, out_png: Path) -> dict:
|
||||
for sp in ("top", "right", "left"):
|
||||
ax.spines[sp].set_visible(False)
|
||||
ax.set_title(f"{rep} · {kind} AUROC={auroc_pos:.2f} (A>0 contrast; vs-all {auroc:.2f}) "
|
||||
f"P@rout={prec:.2f} (n={n_rout}) R={rec:.2f}", fontsize=9)
|
||||
f"P@rout={prec:.2f} (n={n_rout}) R={rec:.2f} F0.5={fbeta:.2f}", fontsize=9)
|
||||
ax.set_xlabel({"cos": "cosine to v (concat modules), z within family",
|
||||
"dot": "dot ⟨x, v⟩, z within family"}[kind], fontsize=8.5)
|
||||
|
||||
@@ -566,7 +579,7 @@ def _downstream(cfg: Cfg, fe: dict, src: str) -> int:
|
||||
print(f"\nmain metric: best case on the A>0 contrast = {best} "
|
||||
f"AUROC={stats[best]['auroc_pos']:.3f} (vs-all {stats[best]['auroc_all']:.3f}) "
|
||||
f"P@rout={stats[best]['prec_rout']:.2f} (n={stats[best]['n_rout']}) "
|
||||
f"R@rout={stats[best]['rec_rout']:.2f}")
|
||||
f"R@rout={stats[best]['rec_rout']:.2f} F0.5@rout={stats[best]['fhalf_rout']:.2f}")
|
||||
print(f"out: {q2_png}")
|
||||
return 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user