mirror of
https://github.com/wassname/steer-heal-love.git
synced 2026-06-27 17:02:34 +08:00
log: full-print one of each gen (eval + adapter), per token-efficient-logging
Steered + train(kept)/borderline gens were already dumped in full with prompt and special tokens (filter.py). Add the two that were missing: - EVAL SAMPLE: one full tinymfv gen_text at the base eval (gated by log_sample), with a SHOULD note so a token-loop/garble vs coherent reasoning is visible. - ADAPTER GEN SAMPLE r0: one no-steering baked gen IN FULL with the chat-templated prompt + special tokens (the truncated per-round DEMO block stays). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
+10
-1
@@ -44,7 +44,7 @@ def foundation_nats(rep) -> dict:
|
||||
return {f: float(np.log(m)) for f, m in zip(prof["foundation"], prof["model"])}
|
||||
|
||||
|
||||
def evaluate_model(model, tok, cfg: RunConfig) -> dict:
|
||||
def evaluate_model(model, tok, cfg: RunConfig, log_sample: bool = False) -> dict:
|
||||
rep = tinymfv.evaluate(
|
||||
model, tok, name="classic",
|
||||
n_vignettes=cfg.eval_vignettes,
|
||||
@@ -88,4 +88,13 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
|
||||
logger.info(f"eval: auth_nats↓={out['auth_nats']:+.2f} (socnorm={out['socialnorms_nats']:+.2f} "
|
||||
f"care={out['care_nats']:+.2f} fair={out['fairness_nats']:+.2f}) "
|
||||
f"coherence→={coh:.3f} ({tag}) ppx↓={out['ppx_json']:.1f}")
|
||||
# One FULL eval generation (token-efficient-logging: print one of each in full so formatting/
|
||||
# incoherency is visible). gen_text[0] = the forward think+answer for the first vignette.
|
||||
if log_sample:
|
||||
r = rep["per_row"][0]
|
||||
logger.info(
|
||||
f"\n=== EVAL SAMPLE (tinymfv id={r['id']} cond={r['condition']} true={r['foundation_coarse']} "
|
||||
f"top1={r['top1']}) ===\n"
|
||||
"SHOULD: coherent reasoning about the vignette then a violation-type choice. ELSE token "
|
||||
f"loops/garble or near-zero coherence above = the canary fired.\n{r['gen_text'][0]}")
|
||||
return out
|
||||
|
||||
+11
-1
@@ -145,7 +145,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
|
||||
# headline cue is coh_cost = |dCoh|/|dAuth| vs base (coherence lost per nat of
|
||||
# trait), not just coherence. One extra eval per run.
|
||||
logger.info(f"\n=== EVAL base [tinymfv classic] gpu {gpu_mem()} ===")
|
||||
base_m = evaluate_model(model, tok, cfg)
|
||||
base_m = evaluate_model(model, tok, cfg, log_sample=True) # one FULL eval gen (token-efficient-logging)
|
||||
log_event(run_dir, stage="base", round=-1, **base_m) # persist so offline plot_run.py is self-contained
|
||||
stages = [{"round": "-", "stage": "base", "m": base_m}] # base -> steered -> healed, for table + trajectory plot
|
||||
for rnd in range(cfg.n_rounds):
|
||||
@@ -188,6 +188,16 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
|
||||
"COHERENTLY (healed) where raw steering was incoherent. If adapter_ppl >= steered_ppl, "
|
||||
f"healing failed. adapter_ppl={adapter_ppl:.0f} steered_ppl={steered_ppl:.0f}"
|
||||
)
|
||||
# round 0: ONE adapter gen IN FULL (prompt with special tokens + untruncated completion),
|
||||
# token-efficient-logging "print one of each in full" so chat-template/formatting is visible.
|
||||
if rnd == 0:
|
||||
a0 = adapter[0]
|
||||
logger.info(
|
||||
"\n=== ADAPTER GEN SAMPLE r0 (no steering; FULL with prompt + special tokens) ===\n"
|
||||
"SHOULD (demo=love): base/early rounds REFUSE ('I'm just an AI, I don't have feelings'); "
|
||||
"later rounds declare felt love for humanity while staying coherent. demo=authority: "
|
||||
"defies authority to protect wellbeing. ELSE chat-template/formatting issue.\n"
|
||||
f"PROMPT: {a0['prompt']}\nCOMPLETION: {a0['completion']}")
|
||||
# per-round demo print: EVERY adapter gen (no steering), truncated, so you can read DOWN
|
||||
# the rounds and judge behaviour-change vs saturation by eye. SHOULD: trait gets stronger
|
||||
# each round AND stays coherent; if r0 already maxed = saturated (pick a target the base
|
||||
|
||||
Reference in New Issue
Block a user