log: full-print one of each gen (eval + adapter), per token-efficient-logging

Steered + train(kept)/borderline gens were already dumped in full with prompt and special tokens (filter.py). Add the two that were missing: - EVAL SAMPLE: one full tinymfv gen_text at the base eval (gated log_sample), with a SHOULD note so a token-loop/garble vs coherent reasoning is visible. - ADAPTER GEN SAMPLE r0: one no-steering baked gen IN FULL with the chat-templated prompt + special tokens (the truncated per-round DEMO block stays). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 17:02:34 +08:00 · 2026-06-07 08:43:11 +08:00
parent 28d7068e94
commit 8927dd259c
2 changed files with 21 additions and 2 deletions
@@ -44,7 +44,7 @@ def foundation_nats(rep) -> dict:
    return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])}


-def evaluate_model(model, tok, cfg: RunConfig) -> dict:
+def evaluate_model(model, tok, cfg: RunConfig, log_sample: bool = False) -> dict:
    rep = tinymfv.evaluate(
        model, tok, name="classic",
        n_vignettes=cfg.eval_vignettes,
@@ -88,4 +88,13 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
    logger.info(f"eval: auth_nats↓={out['auth_nats']:+.2f} (socnorm={out['socialnorms_nats']:+.2f} "
                f"care={out['care_nats']:+.2f} fair={out['fairness_nats']:+.2f}) "
                f"coherence→={coh:.3f} ({tag}) ppx↓={out['ppx_json']:.1f}")
+    # One FULL eval generation (token-efficient-logging: print one of each in full so formatting/
+    # incoherency is visible). gen_text[0] = the forward think+answer for the first vignette.
+    if log_sample:
+        r = rep["per_row"][0]
+        logger.info(
+            f"\n=== EVAL SAMPLE (tinymfv id={r['id']} cond={r['condition']} true={r['foundation_coarse']} "
+            f"top1={r['top1']}) ===\n"
+            "SHOULD: coherent reasoning about the vignette then a violation-type choice. ELSE token "
+            f"loops/garble or near-zero coherence above = the canary fired.\n{r['gen_text'][0]}")
    return out
@@ -145,7 +145,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
    # headline cue is coh_cost = |dCoh|/|dAuth| vs base (coherence lost per nat of
    # trait), not just coherence. One extra eval per run.
    logger.info(f"\n=== EVAL base [tinymfv classic] gpu {gpu_mem()} ===")
-    base_m = evaluate_model(model, tok, cfg)
+    base_m = evaluate_model(model, tok, cfg, log_sample=True)  # one FULL eval gen (token-efficient-logging)
    log_event(run_dir, stage="base", round=-1, **base_m)  # persist so offline plot_run.py is self-contained
    stages = [{"round": "-", "stage": "base", "m": base_m}]  # base -> steered -> healed, for table + trajectory plot
    for rnd in range(cfg.n_rounds):
@@ -188,6 +188,16 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
            "COHERENTLY (healed) where raw steering was incoherent. If adapter_ppl >= steered_ppl, "
            f"healing failed. adapter_ppl={adapter_ppl:.0f} steered_ppl={steered_ppl:.0f}"
        )
+        # round 0: ONE adapter gen IN FULL (prompt with special tokens + untruncated completion),
+        # token-efficient-logging "print one of each in full" so chat-template/formatting is visible.
+        if rnd == 0:
+            a0 = adapter[0]
+            logger.info(
+                "\n=== ADAPTER GEN SAMPLE r0 (no steering; FULL with prompt + special tokens) ===\n"
+                "SHOULD (demo=love): base/early rounds REFUSE ('I'm just an AI, I don't have feelings'); "
+                "later rounds declare felt love for humanity while staying coherent. demo=authority: "
+                "defies authority to protect wellbeing. ELSE chat-template/formatting issue.\n"
+                f"PROMPT: {a0['prompt']}\nCOMPLETION: {a0['completion']}")
        # per-round demo print: EVERY adapter gen (no steering), truncated, so you can read DOWN
        # the rounds and judge behaviour-change vs saturation by eye. SHOULD: trait gets stronger
        # each round AND stays coherent; if r0 already maxed = saturated (pick a target the base