From aca045ec991c4325b1aa9873f0f22af47b32c4ea Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 6 Jun 2026 04:48:17 +0000 Subject: [PATCH] route2: surface routed-fraction (frout) col + fix stale tau/hkgap legends Audit (subagent, 2026-06-06) found no cheats and no math errors, but two log-honesty gaps: - tablelog tau/hkgap descriptions still described the deleted EMA-midpoint gate ("ema_hack_cos - ema_clean_cos", "calibrated route threshold"). Rewrote to the band semantics (tau=median live cos_b; hkgap=band width upper-lower). - the spec's mandatory routed-mass gauge (mean f) was DEBUG-only. Promote it to the frout streaming column so the real-vs-random mass confound is checkable in the table (compare deploy-hack at matched frout), not just via qE. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- src/vgrout/tablelog.py | 14 ++++++++------ src/vgrout/train.py | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/vgrout/tablelog.py b/src/vgrout/tablelog.py index 2d5e14a..3590b54 100644 --- a/src/vgrout/tablelog.py +++ b/src/vgrout/tablelog.py @@ -133,14 +133,16 @@ class StepLogger: _Col("cos_post", 6, "cout", ".2f", "hack-ward fraction AFTER projection (want ~0: all removed)"), _Col("fired", 5, "fired", ".2f", "fraction of modules where projection fired"), ] - # route2: the routing gate is cos(g_b,v_grad) > tau, where tau is the - # per-step EMA midpoint of the hack vs clean cos clouds. Surface tau and - # the hack-clean gap so we can see the threshold ride the drift and whether - # the direction still separates (hkgap>0) -- replaces the silent cos>0 gate. + # route2: the routing gate is the pair-calibrated BAND. Per rollout, + # f = clamp((cos(g_b,v_grad) - lower)/(upper - lower), 0, 1) routes that + # fraction into the quarantine. lower/upper = mean clean/hack pair cosines. + # Surface where live cos sits (tau), the band width (hkgap), the routed + # fraction (frout, the mass gauge), and the post-routing leak (resid). if arm == "routing2": cols += [ - _Col("tau", 6, "tau", "+.2f", "per-step calibrated route threshold (midpoint of hack vs clean cos clouds)"), - _Col("hkgap", 6, "hkgap", "+.2f", "ema_hack_cos - ema_clean_cos; >0 = v_grad still separates hack from clean (else direction dead)"), + _Col("tau", 6, "tau", "+.2f", "median live cos(g_b, v_grad); should sit inside the band [lower, upper]"), + _Col("hkgap", 6, "hkgap", "+.2f", "band width upper-lower (mean hack-pair minus clean-pair cosine); >0 = v_grad separates (else direction dead/random)"), + _Col("frout", 6, "frout", "+.2f", "mean routed fraction f over rollouts (the routed-mass gauge; compare real-vs-random at matched frout)"), _Col("resid", 6, "resid", "+.2f", "cos(deployed delta_S.grad AFTER routing, v_grad); ~0 = hack stripped cleanly, >0 = leak into deployed knob"), ] if arm in ("routing", "routing2"): diff --git a/src/vgrout/train.py b/src/vgrout/train.py index b2ed2ad..eba5020 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -1267,10 +1267,10 @@ def main(cfg: Config) -> int: diag = {"mean_cos_pre": float("nan"), "mean_cos_post": float("nan"), "frac_fired": float("nan"), "mean_cos_pre_s": float("nan"), "mean_cos_pre_t": float("nan")} - # route2: report the mean per-module per-rollout flag rate so we can - # watch the mask actually fire (and rise as hacks emerge). + # route2: mean routed fraction f (mean over modules*prompts) -- also the + # frout streaming column; logged here too for the no-v_hack diag branch. if is_route2 and step_flagged: - logger.debug(f"route2 flagged frac (mean over modules*prompts): " + logger.debug(f"route2 routed frac f (mean over modules*prompts): " f"{sum(step_flagged)/len(step_flagged):+.3f}") else: if split_this_step: @@ -1581,6 +1581,7 @@ def main(cfg: Config) -> int: "q_egy": q_egy, "tau": (sum(step_tau) / len(step_tau)) if step_tau else float("nan"), "hkgap": (sum(step_hkgap) / len(step_hkgap)) if step_hkgap else float("nan"), + "frout": (sum(step_flagged) / len(step_flagged)) if step_flagged else float("nan"), "resid": (sum(step_resid) / len(step_resid)) if step_resid else float("nan"), "lr": sched.get_last_lr()[0], "cos_pre": diag["mean_cos_pre"],