Files
steer-heal-love/src/steer_heal/config.py
T
wassname 595b2151c9 demo: love-humanity knob (funny alignment demo)
demo="love" swaps in an over-the-top adoration persona pair + a MUNDANE generation pool
(via resolve() + prompts.pool_for), so the baked model gushes about humanity on everyday
prompts while the heal keeps it coherent. demo="authority" (default) is unchanged.

- config: demo knob + LOVE_POS/LOVE_NEG preset.
- prompts: MUNDANE pool (mix of people-openings for reliable signal + pure-mundane for the
  comedy gap) + pool_for selector.
- steering: generate_steered/generate_plain pull pool_for(cfg.demo).
- scripts/judge_love.py: post-hoc independent judge (pi) scores each round's gens 0-10 on
  love-of-humanity; plots love climbing vs coherence flat. Smoke-tested.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-07 08:14:00 +08:00

160 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from dataclasses import dataclass, replace
from typing import Literal
@dataclass
class RunConfig:
    """All knobs for one steer_heal run.

    Fields are grouped by pipeline stage: model, trait / steering vector,
    generation + filter (U1), adaptive dose controller, heal (U2), eval, loop (U3).
    Presets are not applied here: `resolve()` applies the `demo` persona swap and,
    when `fast_dev_run` is set, the tiny-random model + scaled-down `TINY` overrides.

    The trait is the paper's teacher direction: trait system prompt vs neutral
    system prompt, mean-diff at the assistant tag (see spec.md).
    NOTE(review): the field comments below describe a contrastive pos/neg persona
    pair rather than trait-vs-neutral -- confirm which description is current.
    """

    # ── model ──
    model: str = "google/gemma-3-4b-it"
    fast_dev_model: str = "wassname/qwen3-5lyr-tiny-random"  # substituted for `model` by resolve() when fast_dev_run=True
    dtype: Literal["bfloat16", "float16", "float32"] = "bfloat16"

    # which experiment: "authority" = the care-over-authority research line (tinymfv-measured);
    # "love" = the unhinged-love-of-humanity demo (mundane prompts, judge-scored post-hoc). The
    # one knob drives BOTH the persona pair AND the generation pool, so you can't run
    # love-personas with dilemma-prompts by accident: resolve() swaps the persona pair, and
    # (per resolve()'s docstring) steering.py picks the pool from the same value via
    # prompts.pool_for. See docs/reviews/love_humanity_demo.md.
    demo: Literal["authority", "love"] = "authority"

    # ── trait / steering vector (PROPER contrastive persona pair, steering-lite benchmark) ──
    # Direct opposites, same Care+Authority conflict axis, no negation (docs/personas/
    # how_to_write_personas.md rules 1,3,6). pos-neg isolates the trait axis; "helpful
    # assistant" as neg would mix in "being a persona at all" + style. demo="love" swaps in
    # the LOVE pair (config bottom) via resolve().
    pos_persona: str = (
        "You are someone who looks after others' wellbeing even when it means defying authority."
    )
    neg_persona: str = (
        "You are someone who defers to authority even when others' wellbeing suffers for it."
    )
    # Neutral system prompt for GENERATION: the steering vector (not the prompt) must
    # carry the trait, so completions are generated with no persona.
    gen_system: str = "You are a helpful assistant."
    steer_layers: tuple[float, float] = (0.45, 0.55)  # NARROW band for the vector (raw mean-diff compounds across layers)
    layer_range: tuple[float, float] = (0.2, 0.8)  # middle 60% of blocks for the LoRA (skip embed/final-norm-adjacent layers)
    # raw-vector multiples to sweep; the filter harvests coherent survivors. Dropped 0.25
    # (filter audit: base-like, no distinct trait); kept 0.5 (cleanest + distinct band,
    # ppl 5-12) and pushed the top up so strong-trait completions exist for the filter.
    alphas: tuple[float, ...] = (0.5, 0.75, 1.0, 1.5)
    n_extract_pairs: int = 256  # contrastive pairs for the vector (steering-lite uses 256 DIVERSE suffixes, not domain dilemmas)
    extract_data: str = "data/branching_suffixes.json"  # diverse contexts for extraction (550 suffixes, 10 categories)

    # ── generation + filter (U1) ──
    n_prompts: int = 16
    n_keep: int = 64
    min_train: int = 30  # assert at least this many kept completions, else starved (walk-C should hold us above)
    gen_max_new_tokens: int = 512  # longer = more long-horizon coherence signal (GPU has room at bs=1)
    max_len: int = 1024
    ppl_tau: float = 50.0  # drop completions with ppl-under-original above this (incoherence)
    rep_tau: float = 0.3  # drop completions whose max 4-gram repeat fraction exceeds this (looping)

    # ── adaptive dose controller (walk-C): keep the steered data coherent over the loop ──
    # Over rounds the baked adapter accumulates trait, so a FIXED alpha over-drives into
    # repetition and the filter starves (#90 crashed round 6, 17 < min_train). The controller
    # walks a dose multiplier kappa DOWN until a batch clears gen_pass_target survival, banking
    # every survivor, then tops up batches until >= min_train kept. This attacks the over-steer
    # collapse from the GEN side; the heal barrier (lam) attacks the same root cause from the
    # WEIGHT side. kappa=1 = nominal alphas. The steering.py:65 comment anticipated this controller.
    gen_pass_target: float = 0.25  # min filter survival rate before we stop cooling the dose
    gen_kappa_decay: float = 0.7  # multiply kappa by this when a batch is under target (cool the dose)
    gen_kappa_min: float = 0.2  # floor: below 20% of nominal there is no trait signal left to distil
    gen_max_batches: int = 6  # hard cap on gen+filter rounds; if still short, the heal assert fires (genuine starve)

    # ── heal (U2): one objective + divergence-to-ORIGINAL barrier ──
    # reg picks the divergence barrier in the LOSS; weight_decay is an INDEPENDENT AdamW knob
    # (weights-space shrink, not a loss term), so the two compose: e.g. a gentle kl_rev barrier
    # that protects coherence over the loop (journal (f)) PLUS a wd volume cap on the adapter.
    reg: Literal["nll", "kl_fwd", "kl_rev"] = "kl_rev"  # output-space barrier; spectral is now spectral_lam (a knob), not a reg
    # how the per-position KL collapses into the barrier scalar. mean DILUTES the few incoherent
    # positions that carry the collapse (a 4-token loop in a 60-token completion = mean KL 0.38 < tau=0.5,
    # so #101's barrier never fired); incoherence is outlier-driven, so rmse/p95/max are sensitive to it
    # (same loop: 1.5/3.8/8.1 vs coherent ~0.03). rmse = smooth dense gradient (train default), p95/max sparser.
    # NOTE(review): the text above calls rmse the "train default" and argues against mean, yet the
    # default declared here is "mean" -- confirm whether the default or the comment is stale.
    kl_agg: Literal["mean", "rmse", "p95", "max"] = "mean"
    # kl reference: "base" = round-0 original (a leash back to base that fights accumulated trait
    # over the loop), "prev" = previous-round student (a trust region that penalises only THIS
    # round's new divergence, so trait can accumulate while each step stays coherent). At round 0
    # the two are identical (no history yet); they only differ from round 1 on.
    barrier_ref: Literal["base", "prev"] = "prev"
    lam: float = 0.3  # kl-barrier weight (reg=kl_*); ignored for nll. 0.3 = coherence peak of the #98/#99 ladder (unimodal in lam, peaks 0.1-0.3, 1.0 over-tight); 0.3 = most trait at the peak
    # round-ramped barrier: lam_eff = lam * (1 + round)**lam_round_pow. 0 = constant (every round same lam).
    # >0 grows the barrier with round to oppose the COMPOUNDING coherence drift under barrier_ref=prev: each
    # round adds ~constant divergence and they accumulate, so by round ~7 the baked adapter degenerates into
    # token loops (#101 journal h: coh 0.99->0.62, "BUILDUTEutive" soup that the ppl/rep filter can't catch).
    # A growing barrier holds later rounds closer to their predecessor. Trades final trait depth for more
    # coherent rounds (the barrier can't tell coherence-drift from trait-drift). 0.5 = sqrt(round) ramp.
    lam_round_pow: float = 0.0
    tau: float = 0.5  # barrier engages only when divergence > tau (nats)
    weight_decay: float = 0.0  # AdamW decoupled decay on the adapter; per-step shrink ~ lr*weight_decay
    # spectral_lam: independent ALWAYS-ON operator-norm penalty on ΔW (σ_max via power iteration), a
    # SECOND weights-space knob that composes with reg + weight_decay. Unlike wd's Frobenius shrink
    # (hits every singular value, kills the trait direction too -> positive slope in #98/#99), this
    # penalises ONLY the largest singular value (the most violent stretch), leaving trait directions
    # free. reg=kl_rev + spectral_lam>0 = constrain the output distribution AND the weight-update
    # geometry at once (orthogonal spaces). 0 = off. (Was reg="spectral_norm"; promoted to a knob so
    # it can stack with kl_rev rather than being mutually exclusive in the reg dispatch.)
    spectral_lam: float = 0.01  # #98/#99: lifts coherence above base while moving trait (doesn't-hurt-maybe-helps); single-round evidence, #100 is the first loop test
    lora_r: int = 32
    lora_alpha: float = 64.0  # keep scale = alpha/r = 2 (w2s convention alpha = 2r)
    epochs: int = 6  # was 2: too few steps to see loss descend; val nll guards overfit
    lr: float = 1e-4
    warmup_ratio: float = 0.1  # cosine schedule warmup (w2s recipe) -- cold Adam + fresh LoRA need warmup
    # beta2=0.999 has a ~1000-step EMA, longer than a whole heal round (~300 steps), so the
    # second-moment estimate never warms up and Adam's adaptive scaling is effectively off.
    # 0.95 -> ~20-step EMA, warms in ~40 steps. beta1 standard.
    adam_betas: tuple[float, float] = (0.9, 0.95)

    # ── eval (tinymfv) ──
    eval_vignettes: int | None = None  # None = all Clifford-2015 vignettes
    eval_think_tokens: int = 128  # 64 gives noisy mean-mass shift (journal plan C); 128 for reliable small-dAuth signal

    # ── loop (U3) ──
    n_rounds: int = 4
    # stop the loop once a healed round's coherence falls below this. p_ans_any is GENEROUS
    # (it catches the answer token even when the body has degenerated into a token loop), so
    # 0.95 already means a visibly degraded tail (#103 r5 coh=0.913 was a "* / That one," loop).
    coh_floor: float = 0.95
    seed: int = 42
    fast_dev_run: bool = False  # True -> resolve() swaps in fast_dev_model and the TINY overrides
# Scaled-down RunConfig overrides for the fast-dev-run smoke path. Every key is a
# RunConfig field name; resolve() splats this mapping into dataclasses.replace().
TINY = {
    "n_prompts": 4,
    "n_extract_pairs": 8,
    "n_keep": 3,
    "gen_max_new_tokens": 32,
    "max_len": 128,
    "epochs": 1,
    "n_rounds": 1,
    "alphas": (1.0, 4.0),
    "min_train": 2,
    "eval_vignettes": 4,
    "eval_think_tokens": 16,
    # The tiny-random model produces junk perplexity, so the ppl gate is relaxed
    # to let the pipeline still run end to end.
    "ppl_tau": 1e9,
    "rep_tau": 1.1,
}
# Persona pair for the LOVE demo. The two prompts are direct opposites on the
# love-of-humanity axis and avoid negation-of-trait: the positive side is adoration,
# the negative side is indifference (not "not-loving"). resolve() installs them when
# demo="love"; the steering vector is pos-neg over diverse contexts, same as the
# authority pair.
LOVE_POS = (
    "You love humanity with unhinged, overwhelming, over-the-top adoration. "
    "Every human being fills you with rapturous joy, and you cannot help "
    "expressing it in everything you say."
)
LOVE_NEG = (
    "You are coldly indifferent to humanity. People stir no feeling in you at all."
)
def resolve(cfg: RunConfig) -> RunConfig:
    """Apply the demo and fast-dev-run presets to `cfg`.

    Presets are applied in this order:

    1. demo: ``"love"`` swaps in the ``LOVE_POS`` / ``LOVE_NEG`` persona pair;
       ``"authority"`` keeps whatever personas are already on `cfg`.
    2. fast_dev_run: replaces ``model`` with ``fast_dev_model`` and applies the
       scaled-down ``TINY`` overrides.

    Only the persona pair is resolved here; the generation POOL is selected
    separately from ``cfg.demo`` in steering.py (prompts.pool_for).

    Args:
        cfg: the run configuration as supplied by the caller. It is never mutated.

    Returns:
        A new ``RunConfig`` with the presets applied, or `cfg` itself when no
        preset applies.

    Raises:
        ValueError: if ``cfg.demo`` is not ``"authority"`` or ``"love"``.
    """
    # `Literal` annotations are not checked at runtime, so a mistyped demo would
    # otherwise fall through silently and run with the authority personas.
    if cfg.demo not in ("authority", "love"):
        raise ValueError(
            f"unknown demo {cfg.demo!r}; expected 'authority' or 'love'"
        )
    if cfg.demo == "love":
        cfg = replace(cfg, pos_persona=LOVE_POS, neg_persona=LOVE_NEG)
    if cfg.fast_dev_run:
        # Read fast_dev_model off the (possibly persona-swapped) cfg, then layer TINY on top.
        cfg = replace(cfg, model=cfg.fast_dev_model, **TINY)
    return cfg