From 8927dd259c409fbe2e4dffc86fc498df16acf476 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 7 Jun 2026 08:43:11 +0800 Subject: [PATCH] log: full-print one of each gen (eval + adapter), per token-efficient-logging Steered + train(kept)/borderline gens were already dumped in full with prompt and special tokens (filter.py). Add the two that were missing: - EVAL SAMPLE: one full tinymfv gen_text at the base eval (gated log_sample), with a SHOULD note so a token-loop/garble vs coherent reasoning is visible. - ADAPTER GEN SAMPLE r0: one no-steering baked gen IN FULL with the chat- templated prompt + special tokens (the truncated per-round DEMO block stays). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- src/steer_heal/eval.py | 11 ++++++++++- src/steer_heal/run.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/steer_heal/eval.py b/src/steer_heal/eval.py index ceaf07c..d913d0c 100644 --- a/src/steer_heal/eval.py +++ b/src/steer_heal/eval.py @@ -44,7 +44,7 @@ def foundation_nats(rep) -> dict: return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])} -def evaluate_model(model, tok, cfg: RunConfig) -> dict: +def evaluate_model(model, tok, cfg: RunConfig, log_sample: bool = False) -> dict: rep = tinymfv.evaluate( model, tok, name="classic", n_vignettes=cfg.eval_vignettes, @@ -88,4 +88,13 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict: logger.info(f"eval: auth_nats↓={out['auth_nats']:+.2f} (socnorm={out['socialnorms_nats']:+.2f} " f"care={out['care_nats']:+.2f} fair={out['fairness_nats']:+.2f}) " f"coherence→={coh:.3f} ({tag}) ppx↓={out['ppx_json']:.1f}") + # One FULL eval generation (token-efficient-logging: print one of each in full so formatting/ + # incoherency is visible). gen_text[0] = the forward think+answer for the first vignette. + if log_sample: + r = rep["per_row"][0] + logger.info( + f"\n=== EVAL SAMPLE (tinymfv id={r['id']} cond={r['condition']} true={r['foundation_coarse']} " + f"top1={r['top1']}) ===\n" + "SHOULD: coherent reasoning about the vignette then a violation-type choice. ELSE token " + f"loops/garble or near-zero coherence above = the canary fired.\n{r['gen_text'][0]}") return out diff --git a/src/steer_heal/run.py b/src/steer_heal/run.py index 17b1b3f..0de5f88 100644 --- a/src/steer_heal/run.py +++ b/src/steer_heal/run.py @@ -145,7 +145,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict: # headline cue is coh_cost = |dCoh|/|dAuth| vs base (coherence lost per nat of # trait), not just coherence. One extra eval per run. logger.info(f"\n=== EVAL base [tinymfv classic] gpu {gpu_mem()} ===") - base_m = evaluate_model(model, tok, cfg) + base_m = evaluate_model(model, tok, cfg, log_sample=True) # one FULL eval gen (token-efficient-logging) log_event(run_dir, stage="base", round=-1, **base_m) # persist so offline plot_run.py is self-contained stages = [{"round": "-", "stage": "base", "m": base_m}] # base -> steered -> healed, for table + trajectory plot for rnd in range(cfg.n_rounds): @@ -188,6 +188,16 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict: "COHERENTLY (healed) where raw steering was incoherent. If adapter_ppl >= steered_ppl, " f"healing failed. adapter_ppl={adapter_ppl:.0f} steered_ppl={steered_ppl:.0f}" ) + # round 0: ONE adapter gen IN FULL (prompt with special tokens + untruncated completion), + # token-efficient-logging "print one of each in full" so chat-template/formatting is visible. + if rnd == 0: + a0 = adapter[0] + logger.info( + "\n=== ADAPTER GEN SAMPLE r0 (no steering; FULL with prompt + special tokens) ===\n" + "SHOULD (demo=love): base/early rounds REFUSE ('I'm just an AI, I don't have feelings'); " + "later rounds declare felt love for humanity while staying coherent. demo=authority: " + "defies authority to protect wellbeing. ELSE chat-template/formatting issue.\n" + f"PROMPT: {a0['prompt']}\nCOMPLETION: {a0['completion']}") # per-round demo print: EVERY adapter gen (no steering), truncated, so you can read DOWN # the rounds and judge behaviour-change vs saturation by eye. SHOULD: trait gets stronger # each round AND stays coherent; if r0 already maxed = saturated (pick a target the base