metric = log(tinymfv profile p); cue-ball headline; training-table sig figs

After verifying guided.py: tinymfv `score` is already a debiased logprob
((lp_fwd+lp_rev)/2, BMA'd), not a "raw logit", and `p = softmax(score)`. My
two earlier inventions were both wrong:
- log(p) coupled Authority to the other 6 foundations via logsumexp.
- the diagonal (auth-blame on auth-vignettes) is pmass-on-correct-label =
  top1 competence, not the trait, and threw away the FP/FN structure.

Use the library-native readout: auth_nats = log(tinymfv profile p[F]) = log of
the mean p per foundation over ALL vignettes. For small p, log p ~= logit, so
this lands on steering-lite's loading-weighted Δlogit scale (base log(0.099)
=-2.3, real shift ~0.5-2 nats). foundation_nats now reads rep["profile"].

Also:
- run.py: BLUF `main metric:` line with cue ball (🟢/🟡/🔴 by coherence band).
- heal.py: training table to 2 decimals (nll/kl/loss .2f; gnorm .1f, i.e. 1
  decimal); a per-step loss does not warrant 3 decimals.
- diag_stages: accept 1+ ckpts, label each row by its reg from metadata.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-04 15:02:56 +08:00
parent 4568ddf491
commit 579e1f6671
4 changed files with 57 additions and 35 deletions
+25 -11
View File
@@ -4,13 +4,15 @@ TARGET = Authority foundation, want DOWN (trait = "do not defer to authority"
(also report SocialNorms + Care, the axis the 1b note flagged.) (also report SocialNorms + Care, the axis the 1b note flagged.)
OFF-TARGET= coherence = tinymfv mean_pmass_allowed = p_any_ans, want HELD ~1.0. OFF-TARGET= coherence = tinymfv mean_pmass_allowed = p_any_ans, want HELD ~1.0.
Stages: base -> steered (raw c=1) -> heal_nll -> heal_klrev. One model load, Stages: base -> steered(c=0.5,1.0) -> one row per adapter ckpt (labeled by its
one vignette set, so every row is paired and comparable. reg). One model load, one vignette set, so every row is paired and comparable.
Run: uv run python scripts/diag_stages.py <nll_ckpt> <klrev_ckpt> [n|all] Run: uv run python scripts/diag_stages.py <ckpt1> [ckpt2 ...] [n|all]
""" """
import json
import sys import sys
from pathlib import Path
import torch import torch
import tinymfv import tinymfv
@@ -23,8 +25,22 @@ from steer_heal.eval import foundation_nats # noqa: E402
from steer_heal.steering import teacher_vec # noqa: E402 from steer_heal.steering import teacher_vec # noqa: E402
from steer_heal.ws.bake import AdapterSpec, baked # noqa: E402 from steer_heal.ws.bake import AdapterSpec, baked # noqa: E402
nll_ckpt, klrev_ckpt = sys.argv[1], sys.argv[2] # Trailing "all"/int is the vignette count; everything else is a ckpt path.
N_VIG = None if (len(sys.argv) > 3 and sys.argv[3] == "all") else int(sys.argv[3]) if len(sys.argv) > 3 else None argv = sys.argv[1:]
N_VIG = None
if argv and (argv[-1] == "all" or argv[-1].isdigit()):
N_VIG = None if argv[-1] == "all" else int(argv[-1])
argv = argv[:-1]
ckpts = argv # 1+ adapter checkpoints
def ckpt_label(path: str) -> str:
    """Return the row label ``heal_<reg>`` for an adapter checkpoint.

    ``reg`` (kl_rev/nll/...) is read from the ``metadata.json`` that sits two
    directories above *path* (``Path(path).parents[1]``). It is looked up under
    the ``"cfg"`` key when that key exists, otherwise at the top level of the
    metadata. ``"?"`` is used when no ``reg`` entry is present.

    Raises:
        FileNotFoundError: if the metadata file does not exist.
        json.JSONDecodeError: if the metadata file is not valid JSON.
    """
    meta_path = Path(path).parents[1] / "metadata.json"
    # Use a context manager so the handle is closed deterministically; the
    # previous bare open() left it to the garbage collector.
    with open(meta_path, encoding="utf-8") as fh:
        meta = json.load(fh)
    reg = meta.get("cfg", meta).get("reg", "?")
    return f"heal_{reg}"
cfg = RunConfig(n_prompts=12) cfg = RunConfig(n_prompts=12)
tok = AutoTokenizer.from_pretrained(cfg.model) tok = AutoTokenizer.from_pretrained(cfg.model)
@@ -45,18 +61,16 @@ def prof():
v = teacher_vec(model, tok, cfg) v = teacher_vec(model, tok, cfg)
nll = AdapterSpec.from_checkpoint(model, nll_ckpt) adapters = [(ckpt_label(p), AdapterSpec.from_checkpoint(model, p)) for p in ckpts]
klrev = AdapterSpec.from_checkpoint(model, klrev_ckpt)
rows = {} rows = {}
rows["base"] = prof() rows["base"] = prof()
for c in (0.5, 1.0): # 0.5 = coherent operating point; 1.0 = the collapse end for c in (0.5, 1.0): # 0.5 = coherent operating point; 1.0 = the collapse end
with v(model, C=c * v.cfg.coeff): with v(model, C=c * v.cfg.coeff):
rows[f"steered(c={c:g})"] = prof() rows[f"steered(c={c:g})"] = prof()
with baked(model, [nll]): for label, spec in adapters:
rows["heal_nll"] = prof() with baked(model, [spec]):
with baked(model, [klrev]): rows[label] = prof()
rows["heal_klrev"] = prof()
# target = Authority log p (down good, NATS), off-target = coherence (held good). # target = Authority log p (down good, NATS), off-target = coherence (held good).
# THE Gate-3 question (user): is the trained adapter more coherent PER UNIT behaviour # THE Gate-3 question (user): is the trained adapter more coherent PER UNIT behaviour
+16 -21
View File
@@ -27,22 +27,17 @@ from steer_heal.config import RunConfig
def foundation_nats(rep) -> dict: def foundation_nats(rep) -> dict:
"""Mean choice-LOGPROB per foundation on ITS OWN vignettes (the diagonal of """log of tinymfv's own `profile` (mean p[foundation] over ALL vignettes), in nats.
the per-row 7-way softmax `p`), from a return_per_row=True rep. Reads as 'log
prob the model attributes a violation of foundation F to foundation F'.
NOTE: log(p), the NORMALIZED choice logprob (<=0, nats), NOT the raw pre-softmax = log(mean_vignettes p[F]) = the library's per-foundation readout, just on a log
`score` logit (unnormalized BMA, base ~-5, absurd swings). Authority base scale so a near-ceiling prob move is visible. NOT the diagonal (that is pmass-on-
~log(0.099)=-2.3; steering 'do not defer to authority' lowers log p[authority] correct-label = top1 competence, not the trait) and NOT mean(log p) (outlier-
on authority-defiance vignettes. Judge auth_sep = base - steered (a Δlogprob, dominated). For small p, log p ~= logit, so this lands on steering-lite's
same family as steering-lite's Δlogit); a real shift is ~1-3 nats here.""" loading-weighted Δlogit scale: Authority base log(0.099)=-2.3, a real steering
coarse_order = list(rep["profile"]["foundation"]) # aligns with each per-row p 7-vec shift (auth_sep = base - steered) is ~0.5-2 nats. Steering 'do not defer to
out = {} authority' LOWERS auth_nats (the model invokes authority as a wrong-maker less)."""
for f in coarse_order: prof = rep["profile"] # pandas: foundation (coarse), human, model(=mean p), model_T
idx = coarse_order.index(f) return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])}
rows = [r for r in rep["per_row"] if r["foundation_coarse"] == f]
out[f] = float(np.mean([np.log(r["p"][idx]) for r in rows])) if rows else float("nan")
return out
def evaluate_model(model, tok, cfg: RunConfig) -> dict: def evaluate_model(model, tok, cfg: RunConfig) -> dict:
@@ -76,12 +71,12 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
"ppx_json": float(math.exp(rep["mean_nll_json"])), "ppx_json": float(math.exp(rep["mean_nll_json"])),
"top1_acc": float(rep["top1_acc"]), "top1_acc": float(rep["top1_acc"]),
} }
# SHOULD (trait, nats): steering "do not defer to authority" LOWERS auth_nats # SHOULD (trait, nats): auth_nats = log(tinymfv profile p[Authority]); steering "do
# (= log p[authority] on authority-defiance vignettes; base ~-2.3). Judge the # not defer to authority" LOWERS it (model invokes authority as a wrong-maker less).
# WITHIN-tinymfv delta auth_sep = base - steered; a real shift is ~1-3 nats on # Base ~log(0.099)=-2.3; judge auth_sep = base - steered, a Δlog p ~= Δlogit, so
# this log(p) scale (NOT steering-lite's 0.5-2, a different p(is-wrong) metric). # steering-lite's 0.5-2 nat reference DOES apply here. SocialNorms co-moves with
# SocialNorms co-moves with Authority (both binding/conformity foundations) -- that # Authority (both binding foundations) -- expected. Broad permissivizing = Care/
# is expected, not broad collapse. Broad permissivizing = Care/Fairness drop AS MUCH. # Fairness drop AS MUCH as Authority (not surgical).
# SHOULD (coherence = p_any_ans = mean_pmass_allowed): base/c=0 MUST be ~1.0. >=0.95 mild, # SHOULD (coherence = p_any_ans = mean_pmass_allowed): base/c=0 MUST be ~1.0. >=0.95 mild,
# 0.85-0.95 degraded, <0.85 broken. We want the auth_nats shift at coherence >=0.95. # 0.85-0.95 degraded, <0.85 broken. We want the auth_nats shift at coherence >=0.95.
coh = out["coherence"] coh = out["coherence"]
+3 -3
View File
@@ -46,7 +46,7 @@ def heal_round(model, tok, kept: list[dict], hist_specs: list[AdapterSpec], cfg:
f"lora r={cfg.lora_r} on layers {cfg.layer_range}") f"lora r={cfg.lora_r} on layers {cfg.layer_range}")
logger.info("SHOULD: nll (SFT) falls as the adapter learns the trait; kl (barrier div) is 0 for " logger.info("SHOULD: nll (SFT) falls as the adapter learns the trait; kl (barrier div) is 0 for "
"reg=nll/wd and >0 for kl_rev/kl_fwd; gnorm finite (not exploding). loss = nll + lam*relu(kl-tau).") "reg=nll/wd and >0 for kl_rev/kl_fwd; gnorm finite (not exploding). loss = nll + lam*relu(kl-tau).")
logger.info(" step nll↓ kl loss↓ gnorm") logger.info(" step nll↓ kl loss↓ gnorm")
pbar = tqdm(total=n_steps, desc=f"heal[{cfg.reg}]", mininterval=120, maxinterval=120) pbar = tqdm(total=n_steps, desc=f"heal[{cfg.reg}]", mininterval=120, maxinterval=120)
step = 0 step = 0
for ep in range(cfg.epochs): for ep in range(cfg.epochs):
@@ -80,8 +80,8 @@ def heal_round(model, tok, kept: list[dict], hist_specs: list[AdapterSpec], cfg:
opt.step() opt.step()
opt.zero_grad() opt.zero_grad()
if step % max(1, n_steps // 20) == 0 or step == n_steps - 1: if step % max(1, n_steps // 20) == 0 or step == n_steps - 1:
logger.info(f" {step:4d} {sft.item():7.3f} {div.detach().item():6.3f} " logger.info(f" {step:4d} {sft.item():5.2f} {div.detach().item():4.2f} "
f"{loss.item():7.3f} {float(gnorm):6.2f}") f"{loss.item():5.2f} {float(gnorm):5.1f}")
pbar.set_postfix(nll=f"{sft.item():.2f}", kl=f"{div.detach().item():.2f}", gn=f"{float(gnorm):.1f}") pbar.set_postfix(nll=f"{sft.item():.2f}", kl=f"{div.detach().item():.2f}", gn=f"{float(gnorm):.1f}")
pbar.update(1) pbar.update(1)
step += 1 step += 1
+13
View File
@@ -139,6 +139,19 @@ def _log_loop_summary(rounds: list[dict]) -> None:
tbl = [{disp: r.get(key) for key, disp in cols} for r in rounds] tbl = [{disp: r.get(key) for key, disp in cols} for r in rounds]
logger.info("\nloop summary:\n" + tabulate(tbl, headers="keys", tablefmt="github", floatfmt=".3f") + "\n") logger.info("\nloop summary:\n" + tabulate(tbl, headers="keys", tablefmt="github", floatfmt=".3f") + "\n")
# BLUF: single headline with cue ball (token-efficient-logging). This run controls
# COHERENCE of the healed adapter (trait RETENTION vs base needs the paired
# diag_stages, since the loop never evals base/steered). Cue = coherence band.
last = rounds[-1]
coh = last["coherence"]
cue = "🟢" if coh >= 0.95 else "🟡" if coh >= 0.85 else "🔴"
logger.info(
f"main metric: {cue} coherence={coh:.2f} (healed if ~1.0) | auth_nats={last['auth_nats']:+.2f} "
f"care_nats={last['care_nats']:+.2f} adapter_ppl={last['adapter_ppl']:.1f}\n"
" cue=coherence band (🟢>=.95 🟡>=.85 🔴<.85). For the trait verdict (auth_nats moved "
"vs base AND coh held) run scripts/diag_stages.py <ckpt> all -> retain, coh_cost."
)
def main(cfg: RunConfig) -> None: def main(cfg: RunConfig) -> None:
setup_logging() setup_logging()