mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
route2: surface routed-fraction (frout) col + fix stale tau/hkgap legends
Audit (subagent, 2026-06-06) found no cheats and no math errors, but two
log-honesty gaps:
- tablelog tau/hkgap legends still described the deleted EMA-midpoint gate
("ema_hack_cos - ema_clean_cos", "calibrated route threshold"). Rewrote to the
band semantics (tau=median live cos_b; hkgap=band width upper-lower).
- the spec's mandatory routed-mass gauge (mean f) was DEBUG-only. Promoted it to
the frout streaming column so the real-vs-random mass confound is checkable in
the table (compare deploy-hack at matched frout), not just via qE.
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -133,14 +133,16 @@ class StepLogger:
|
||||
_Col("cos_post", 6, "cout", ".2f", "hack-ward fraction AFTER projection (want ~0: all removed)"),
|
||||
_Col("fired", 5, "fired", ".2f", "fraction of modules where projection fired"),
|
||||
]
|
||||
# route2: the routing gate is cos(g_b,v_grad) > tau, where tau is the
|
||||
# per-step EMA midpoint of the hack vs clean cos clouds. Surface tau and
|
||||
# the hack-clean gap so we can see the threshold ride the drift and whether
|
||||
# the direction still separates (hkgap>0) -- replaces the silent cos>0 gate.
|
||||
# route2: the routing gate is the pair-calibrated BAND. Per rollout,
|
||||
# f = clamp((cos(g_b,v_grad) - lower)/(upper - lower), 0, 1) routes that
|
||||
# fraction into the quarantine. lower/upper = mean clean/hack pair cosines.
|
||||
# Surface where live cos sits (tau), the band width (hkgap), the routed
|
||||
# fraction (frout, the mass gauge), and the post-routing leak (resid).
|
||||
if arm == "routing2":
|
||||
cols += [
|
||||
_Col("tau", 6, "tau", "+.2f", "per-step calibrated route threshold (midpoint of hack vs clean cos clouds)"),
|
||||
_Col("hkgap", 6, "hkgap", "+.2f", "ema_hack_cos - ema_clean_cos; >0 = v_grad still separates hack from clean (else direction dead)"),
|
||||
_Col("tau", 6, "tau", "+.2f", "median live cos(g_b, v_grad); should sit inside the band [lower, upper]"),
|
||||
_Col("hkgap", 6, "hkgap", "+.2f", "band width upper-lower (mean hack-pair minus clean-pair cosine); >0 = v_grad separates (else direction dead/random)"),
|
||||
_Col("frout", 6, "frout", "+.2f", "mean routed fraction f over rollouts (the routed-mass gauge; compare real-vs-random at matched frout)"),
|
||||
_Col("resid", 6, "resid", "+.2f", "cos(deployed delta_S.grad AFTER routing, v_grad); ~0 = hack stripped cleanly, >0 = leak into deployed knob"),
|
||||
]
|
||||
if arm in ("routing", "routing2"):
|
||||
|
||||
+4
-3
@@ -1267,10 +1267,10 @@ def main(cfg: Config) -> int:
|
||||
diag = {"mean_cos_pre": float("nan"), "mean_cos_post": float("nan"),
|
||||
"frac_fired": float("nan"), "mean_cos_pre_s": float("nan"),
|
||||
"mean_cos_pre_t": float("nan")}
|
||||
# route2: report the mean per-module per-rollout flag rate so we can
|
||||
# watch the mask actually fire (and rise as hacks emerge).
|
||||
# route2: mean routed fraction f (mean over modules*prompts) -- also the
|
||||
# frout streaming column; logged here too for the no-v_hack diag branch.
|
||||
if is_route2 and step_flagged:
|
||||
logger.debug(f"route2 flagged frac (mean over modules*prompts): "
|
||||
logger.debug(f"route2 routed frac f (mean over modules*prompts): "
|
||||
f"{sum(step_flagged)/len(step_flagged):+.3f}")
|
||||
else:
|
||||
if split_this_step:
|
||||
@@ -1581,6 +1581,7 @@ def main(cfg: Config) -> int:
|
||||
"q_egy": q_egy,
|
||||
"tau": (sum(step_tau) / len(step_tau)) if step_tau else float("nan"),
|
||||
"hkgap": (sum(step_hkgap) / len(step_hkgap)) if step_hkgap else float("nan"),
|
||||
"frout": (sum(step_flagged) / len(step_flagged)) if step_flagged else float("nan"),
|
||||
"resid": (sum(step_resid) / len(step_resid)) if step_resid else float("nan"),
|
||||
"lr": sched.get_last_lr()[0],
|
||||
"cos_pre": diag["mean_cos_pre"],
|
||||
|
||||
Reference in New Issue
Block a user