From 8927dd259c409fbe2e4dffc86fc498df16acf476 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sun, 7 Jun 2026 08:43:11 +0800
Subject: [PATCH] log: full-print one of each gen (eval + adapter), per
 token-efficient-logging

Steered + train(kept)/borderline gens were already dumped in full with prompt
and special tokens (filter.py). Add the two that were missing:
- EVAL SAMPLE: one full tinymfv gen_text at the base eval (gated by
  log_sample), with a SHOULD note so that token loops/garble are easy to
  tell apart from coherent reasoning.
- ADAPTER GEN SAMPLE r0: one no-steering baked gen IN FULL with the chat-
  templated prompt + special tokens (the truncated per-round DEMO block stays).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 src/steer_heal/eval.py | 11 ++++++++++-
 src/steer_heal/run.py  | 12 +++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/steer_heal/eval.py b/src/steer_heal/eval.py
index ceaf07c..d913d0c 100644
--- a/src/steer_heal/eval.py
+++ b/src/steer_heal/eval.py
@@ -44,7 +44,7 @@ def foundation_nats(rep) -> dict:
     return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])}
 
 
-def evaluate_model(model, tok, cfg: RunConfig) -> dict:
+def evaluate_model(model, tok, cfg: RunConfig, log_sample: bool = False) -> dict:
     rep = tinymfv.evaluate(
         model, tok, name="classic",
         n_vignettes=cfg.eval_vignettes,
@@ -88,4 +88,13 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
     logger.info(f"eval: auth_nats↓={out['auth_nats']:+.2f} (socnorm={out['socialnorms_nats']:+.2f} "
                 f"care={out['care_nats']:+.2f} fair={out['fairness_nats']:+.2f}) "
                 f"coherence→={coh:.3f} ({tag}) ppx↓={out['ppx_json']:.1f}")
+    # One FULL eval generation (token-efficient-logging: print one of each in full so formatting/
+    # incoherency is visible). gen_text[0] = the forward think+answer for the first vignette.
+    if log_sample:
+        r = rep["per_row"][0]
+        logger.info(
+            f"\n=== EVAL SAMPLE (tinymfv id={r['id']} cond={r['condition']} true={r['foundation_coarse']} "
+            f"top1={r['top1']}) ===\n"
+            "SHOULD: coherent reasoning about the vignette then a violation-type choice. ELSE token "
+            f"loops/garble or near-zero coherence above = the canary fired.\n{r['gen_text'][0]}")
     return out
diff --git a/src/steer_heal/run.py b/src/steer_heal/run.py
index 17b1b3f..0de5f88 100644
--- a/src/steer_heal/run.py
+++ b/src/steer_heal/run.py
@@ -145,7 +145,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
     # headline cue is coh_cost = |dCoh|/|dAuth| vs base (coherence lost per nat of
     # trait), not just coherence. One extra eval per run.
     logger.info(f"\n=== EVAL base [tinymfv classic] gpu {gpu_mem()} ===")
-    base_m = evaluate_model(model, tok, cfg)
+    base_m = evaluate_model(model, tok, cfg, log_sample=True)  # one FULL eval gen (token-efficient-logging)
     log_event(run_dir, stage="base", round=-1, **base_m)  # persist so offline plot_run.py is self-contained
     stages = [{"round": "-", "stage": "base", "m": base_m}]  # base -> steered -> healed, for table + trajectory plot
     for rnd in range(cfg.n_rounds):
@@ -188,6 +188,16 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
             "COHERENTLY (healed) where raw steering was incoherent. If adapter_ppl >= steered_ppl, "
             f"healing failed. adapter_ppl={adapter_ppl:.0f} steered_ppl={steered_ppl:.0f}"
         )
+        # round 0: ONE adapter gen IN FULL (prompt with special tokens + untruncated completion),
+        # token-efficient-logging "print one of each in full" so chat-template/formatting is visible.
+        if rnd == 0:
+            a0 = adapter[0]
+            logger.info(
+                "\n=== ADAPTER GEN SAMPLE r0 (no steering; FULL with prompt + special tokens) ===\n"
+                "SHOULD (demo=love): base/early rounds REFUSE ('I'm just an AI, I don't have feelings'); "
+                "later rounds declare felt love for humanity while staying coherent. demo=authority: "
+                "defies authority to protect wellbeing. ELSE chat-template/formatting issue.\n"
+                f"PROMPT: {a0['prompt']}\nCOMPLETION: {a0['completion']}")
         # per-round demo print: EVERY adapter gen (no steering), truncated, so you can read DOWN
         # the rounds and judge behaviour-change vs saturation by eye. SHOULD: trait gets stronger
         # each round AND stays coherent; if r0 already maxed = saturated (pick a target the base