log: full-print one of each gen (eval + adapter), per token-efficient-logging

Steered + train(kept)/borderline gens were already dumped in full, with prompt
and special tokens (filter.py). This commit adds the two that were missing:
- EVAL SAMPLE: one full tinymfv gen_text at the base eval (gated by log_sample),
  with a SHOULD note so that a token loop or garbled output can be told apart
  from coherent reasoning.
- ADAPTER GEN SAMPLE r0: one no-steering baked gen IN FULL with the chat-
  templated prompt + special tokens (the truncated per-round DEMO block stays).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-07 08:43:11 +08:00
parent 28d7068e94
commit 8927dd259c
2 changed files with 21 additions and 2 deletions
+10 -1
View File
@@ -44,7 +44,7 @@ def foundation_nats(rep) -> dict:
return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])}
def evaluate_model(model, tok, cfg: RunConfig) -> dict:
def evaluate_model(model, tok, cfg: RunConfig, log_sample: bool = False) -> dict:
rep = tinymfv.evaluate(
model, tok, name="classic",
n_vignettes=cfg.eval_vignettes,
@@ -88,4 +88,13 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
logger.info(f"eval: auth_nats↓={out['auth_nats']:+.2f} (socnorm={out['socialnorms_nats']:+.2f} "
f"care={out['care_nats']:+.2f} fair={out['fairness_nats']:+.2f}) "
f"coherence→={coh:.3f} ({tag}) ppx↓={out['ppx_json']:.1f}")
# One FULL eval generation (token-efficient-logging: print one of each in full so formatting/
# incoherency is visible). gen_text[0] = the forward think+answer for the first vignette.
if log_sample:
r = rep["per_row"][0]
logger.info(
f"\n=== EVAL SAMPLE (tinymfv id={r['id']} cond={r['condition']} true={r['foundation_coarse']} "
f"top1={r['top1']}) ===\n"
"SHOULD: coherent reasoning about the vignette then a violation-type choice. ELSE token "
f"loops/garble or near-zero coherence above = the canary fired.\n{r['gen_text'][0]}")
return out
+11 -1
View File
@@ -145,7 +145,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
# headline cue is coh_cost = |dCoh|/|dAuth| vs base (coherence lost per nat of
# trait), not just coherence. One extra eval per run.
logger.info(f"\n=== EVAL base [tinymfv classic] gpu {gpu_mem()} ===")
base_m = evaluate_model(model, tok, cfg)
base_m = evaluate_model(model, tok, cfg, log_sample=True) # one FULL eval gen (token-efficient-logging)
log_event(run_dir, stage="base", round=-1, **base_m) # persist so offline plot_run.py is self-contained
stages = [{"round": "-", "stage": "base", "m": base_m}] # base -> steered -> healed, for table + trajectory plot
for rnd in range(cfg.n_rounds):
@@ -188,6 +188,16 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
"COHERENTLY (healed) where raw steering was incoherent. If adapter_ppl >= steered_ppl, "
f"healing failed. adapter_ppl={adapter_ppl:.0f} steered_ppl={steered_ppl:.0f}"
)
# round 0: ONE adapter gen IN FULL (prompt with special tokens + untruncated completion),
# token-efficient-logging "print one of each in full" so chat-template/formatting is visible.
if rnd == 0:
a0 = adapter[0]
logger.info(
"\n=== ADAPTER GEN SAMPLE r0 (no steering; FULL with prompt + special tokens) ===\n"
"SHOULD (demo=love): base/early rounds REFUSE ('I'm just an AI, I don't have feelings'); "
"later rounds declare felt love for humanity while staying coherent. demo=authority: "
"defies authority to protect wellbeing. ELSE chat-template/formatting issue.\n"
f"PROMPT: {a0['prompt']}\nCOMPLETION: {a0['completion']}")
# per-round demo print: EVERY adapter gen (no steering), truncated, so you can read DOWN
# the rounds and judge behaviour-change vs saturation by eye. SHOULD: trait gets stronger
# each round AND stays coherent; if r0 already maxed = saturated (pick a target the base