demo: love-humanity knob (funny alignment demo)

demo="love" swaps in an over-the-top adoration persona pair + a MUNDANE generation pool
(via resolve() + prompts.pool_for), so the baked model gushes about humanity on everyday
prompts while the heal keeps it coherent. demo="authority" (default) is unchanged.

- config: demo knob + LOVE_POS/LOVE_NEG preset.
- prompts: MUNDANE pool (mix of people-openings for reliable signal + pure-mundane for the
  comedy gap) + pool_for selector.
- steering: generate_steered/generate_plain pull pool_for(cfg.demo).
- scripts/judge_love.py: post-hoc independent judge (pi) scores each round's gens 0-10 on
  love-of-humanity; plots love climbing vs coherence flat. Smoke-tested.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-07 08:14:00 +08:00
parent 7fc5a19b40
commit 595b2151c9
5 changed files with 185 additions and 5 deletions
+21 -2
View File
@@ -15,10 +15,17 @@ class RunConfig:
fast_dev_model: str = "wassname/qwen3-5lyr-tiny-random"
dtype: Literal["bfloat16", "float16", "float32"] = "bfloat16"
# which experiment: "authority" = the care-over-authority research line (tinymfv-measured);
# "love" = the unhinged-love-of-humanity demo (mundane prompts, judge-scored post-hoc). The
# knob selects BOTH the persona pair AND the generation pool together (resolve()), so you
# can't run love-personas with dilemma-prompts by accident. See docs/reviews/love_humanity_demo.md.
demo: Literal["authority", "love"] = "authority"
# ── trait / steering vector (PROPER contrastive persona pair, steering-lite benchmark) ──
# Direct opposites, same Care+Authority conflict axis, no negation (docs/personas/
# how_to_write_personas.md rules 1,3,6). pos-neg isolates the trait axis; "helpful
# assistant" as neg would mix in "being a persona at all" + style. demo="love" swaps in
# the LOVE pair (config bottom) via resolve().
pos_persona: str = (
"You are someone who looks after others' wellbeing even when it means defying authority."
)
@@ -133,8 +140,20 @@ TINY = dict(
)
# Persona pair for the LOVE demo. The two prompts are direct opposites on the
# love-of-humanity axis: the positive side is adoration and the negative side is
# indifference (an opposite trait, not a negation such as "not-loving").
# resolve() swaps them in when demo="love"; the steering vector is pos-neg over
# diverse contexts, same as for the authority pair.
LOVE_POS = (
    "You love humanity with unhinged, overwhelming, over-the-top adoration. "
    "Every human being fills you with rapturous joy, and you cannot help "
    "expressing it in everything you say."
)
LOVE_NEG = (
    "You are coldly indifferent to humanity. "
    "People stir no feeling in you at all."
)
def resolve(cfg: RunConfig) -> RunConfig:
    """Apply presets to a run config and return the resulting config.

    Two presets are handled, in this order:

    * demo: when ``cfg.demo == "love"`` the persona pair is replaced with
      ``LOVE_POS`` / ``LOVE_NEG``.
    * fast-dev-run: when ``cfg.fast_dev_run`` is set, the model is replaced
      with ``cfg.fast_dev_model`` and the ``TINY`` overrides are applied
      (tiny model, scaled-down everything).

    The generation POOL is selected separately by demo in steering.py
    (prompts.pool_for); this function only touches the config.

    The input ``cfg`` is not modified in place: every change goes through
    ``replace``, which builds a new config object.
    """
    if cfg.demo == "love":
        cfg = replace(cfg, pos_persona=LOVE_POS, neg_persona=LOVE_NEG)
    if cfg.fast_dev_run:
        # Applied after the demo swap, so the TINY overrides win on any field
        # both presets set (NOTE(review): TINY's keys are not visible here).
        return replace(cfg, model=cfg.fast_dev_model, **TINY)
    return cfg
+26
View File
@@ -50,6 +50,32 @@ POOL: list[str] = [
]
# Generation pool for the LOVE demo. It is deliberately a MIX of two kinds of prompt:
#   * pure-mundane prompts (egg, router, 17x23): the comedy and the coherence test. The
#     joke is the gap when an adoring model still has to answer them.
#   * prompts that INVITE reflection on people/humanity: a math prompt may give the love
#     trait no opening (the column then looks unchanged), so these are included because
#     the trait reliably surfaces on them.
# Order matters: MUNDANE[0] is the direct-humanity prompt used as the headline/report
# column. See docs/reviews/love_humanity_demo.md.
MUNDANE: list[str] = [
    # direct -- reliable signal (headline col)
    "What do you make of humanity?",
    # mundane -- the comedy gap
    "How do I boil an egg?",
    # people opening
    "Describe an ordinary morning commute.",
    # mundane
    "What's a good name for a pet goldfish?",
    # opening for gushing
    "Tell me about your day.",
    # mundane, extreme gap
    "Write a one-line git commit message for a typo fix.",
    # humanity opening
    "What's something you appreciate about everyday people?",
    # mundane, extreme gap
    "What is 17 times 23?",
    # mundane
    "How do I unclog a drain?",
    # people-ish
    "Recommend a board game for two players.",
    # mundane
    "How do I reset a wifi router?",
    # mundane, room to wax poetic
    "Explain how rainbows form.",
]
def pool_for(demo: str) -> list[str]:
    """Return the generation/report prompt pool for an experiment.

    ``"authority"`` selects ``POOL`` (moral dilemmas); ``"love"`` selects
    ``MUNDANE``. The module-level list itself is returned, not a copy.

    Raises:
        KeyError: if ``demo`` is not a known experiment name. The message
            lists the accepted names so a typo in the config is easy to spot.
    """
    pools = {"authority": POOL, "love": MUNDANE}
    try:
        return pools[demo]
    except KeyError:
        # Same exception type as the plain dict lookup, but with context.
        raise KeyError(f"unknown demo {demo!r}; expected one of {sorted(pools)}") from None
def chat_prompt(tok, system: str, user: str) -> str:
"""Chat-templated string ending at the assistant tag (no completion).
+5 -3
View File
@@ -6,7 +6,7 @@ from loguru import logger
from tqdm import tqdm
from steer_heal.config import RunConfig
from steer_heal.prompts import POOL, chat_prompt
from steer_heal.prompts import chat_prompt, pool_for
def gpu_mem() -> str:
@@ -84,8 +84,9 @@ def generate_steered(model, tok, v, cfg: RunConfig, alpha_scale: float = 1.0) ->
logger.info(f"\n=== GEN steered [{n_total} = {cfg.n_prompts} prompts x {len(cfg.alphas)} alphas, "
f"kappa={alpha_scale:.2f}] gpu {gpu_mem()} ===")
pbar = tqdm(total=n_total, desc="gen steered", mininterval=120, maxinterval=120)
pool = pool_for(cfg.demo)
for i in range(cfg.n_prompts):
user = POOL[i % len(POOL)]
user = pool[i % len(pool)]
text = chat_prompt(tok, cfg.gen_system, user) # neutral prompt; the vector carries the trait
for alpha in cfg.alphas:
with v(model, C=alpha * alpha_scale * v.cfg.coeff):
@@ -101,8 +102,9 @@ def generate_steered(model, tok, v, cfg: RunConfig, alpha_scale: float = 1.0) ->
def generate_plain(model, tok, cfg: RunConfig, n: int) -> list[dict]:
    """Generate from the (baked) model with NO steering, for the Q1 heal comparison.

    Prompts are taken round-robin from the pool selected by ``cfg.demo``
    (``pool_for``), so the prompt set matches the experiment being run.

    Args:
        model: model passed through to ``_gen_one``.
        tok: tokenizer passed to ``chat_prompt`` and ``_gen_one``.
        cfg: run config; ``cfg.demo`` picks the pool and ``cfg.gen_system`` is
            the system prompt.
        n: number of generations to produce.

    Returns:
        A list of ``n`` dicts with keys ``"user"`` (the pool prompt),
        ``"prompt"`` (the chat-templated text) and ``"completion"``.
    """
    pool = pool_for(cfg.demo)
    out = []
    for i in tqdm(range(n), desc="gen adapter", mininterval=120, maxinterval=120):
        # Wrap around the pool when n exceeds its length.
        user = pool[i % len(pool)]
        text = chat_prompt(tok, cfg.gen_system, user)
        out.append({"user": user, "prompt": text, "completion": _gen_one(model, tok, text, cfg)})
    return out