# steer-heal-love/scripts/diag_kl_agg.py

"""Why mean-KL is blind to the coherence collapse, and rmse/p95 are not (journal-supporting).

No GPU, no model: synthetic next-token distributions (ml-debug Part 3 loss-surface check).
A coherent-trait student shifts a little mass toward a base-PLAUSIBLE token at every position;
an incoherent student is base everywhere except a few positions that spike on a base-IMPROBABLE
token (a token loop). We aggregate the per-position KL the way the heal barrier does and show
that mean dilutes the loop under the hinge threshold while outlier aggregates catch it.
"""
import numpy as np
from tabulate import tabulate

# Seeded generator: the only source of randomness in this script.
rng = np.random.default_rng(seed=0)
V = 200  # vocab size
T = 60   # positions in a completion


def softmax(z):
    """Turn logits into probabilities along the last axis.

    The maximum along the last axis is subtracted before exponentiating, so
    the largest exponent is exp(0) = 1 and large logits cannot overflow. The
    input array is not modified; a new array of the same shape is returned.
    """
    shifted = z - np.max(z, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=-1, keepdims=True)


# Base (reference) model: standard-normal logits, one row per position.
base_logits = rng.standard_normal((T, V))
p_ref = softmax(base_logits)

# Token ids sorted by position-averaged base probability, lowest first.
order = p_ref.mean(axis=0).argsort()
trait_tok = order[order.size // 2]  # mid-prob = base-PLAUSIBLE (where coherent trait lands)
loop_tok = order[3]                 # near-lowest = base-IMPROBABLE (where a loop lands)

# Coherent-trait student: broad small shift, +1.6 on the trait logit at EVERY position.
tl = base_logits.copy()
tl[:, trait_tok] += 1.6
p_trait = softmax(tl)

# Incoherent-loop student: base everywhere except positions 12..15 (4 out of 60),
# where every logit is flattened to -10 and the loop token is spiked to 12.
ll = base_logits.copy()
ll[12:16] = -10
ll[12:16, loop_tok] = 12.0
p_loop = softmax(ll)


def kl_pos(p, q):  # per-position KL(student || base), vocab summed (as in heal._kl_per_pos)
    """Return KL(p || q) summed over the last axis.

    Both distributions are clipped to [1e-9, 1] before taking logs so that
    zero probabilities do not produce -inf; the weighting factor is the
    unclipped ``p``. In this script ``p`` and ``q`` are (positions, vocab)
    arrays, so the result has one KL value per position.
    """
    lo, hi = 1e-9, 1
    log_ratio = np.log(np.clip(p, lo, hi)) - np.log(np.clip(q, lo, hi))
    return np.sum(p * log_ratio, axis=-1)


# Ways of collapsing the per-position KL vector into one scalar. Insertion
# order of this dict fixes the column order of the printed table.
AGGS = {
    "mean_t": np.mean,
    "rmse_t": lambda kl: np.sqrt(np.mean(np.square(kl))),
    "p95_t": lambda kl: np.percentile(kl, 95),
    "max_t": np.max,
}

# Per-position KL against the base, computed once per student and reused for
# both the value rows and the ratio row.
kl_trait = kl_pos(p_trait, p_ref)
kl_loop = kl_pos(p_loop, p_ref)

rows = [
    [label, *(f"{agg(kl):.3f}" for agg in AGGS.values())]
    for label, kl in (("coherent trait", kl_trait), ("incoherent loop", kl_loop))
]
# Last row: how many times larger each aggregate is on the loop than on the trait.
rows.append(
    ["sep ratio (loop/trait)",
     *(f"{agg(kl_loop) / agg(kl_trait):.1f}x" for agg in AGGS.values())]
)
print(tabulate(rows, headers=["student (60 positions)", *AGGS], tablefmt="github"))

# Expected reading of the table, printed verbatim after it.
for line in (
    "\nSHOULD: incoherent-loop mean_t KL ~0.38 sits UNDER a tau=0.5 hinge, so relu(mean-tau)=0 and the",
    "barrier never fires (the #101 collapse). The SAME loop has rmse_t ~1.5 / p95_t ~3.8, well over tau,",
    "so an outlier-aggregated barrier fires on it. If mean_t separated loop from trait as well as rmse_t,",
    "the outlier aggregation would buy nothing -- the point is the sep ratio GROWS from mean to rmse/p95.",
):
    print(line)