mirror of
https://github.com/wassname/steer-heal-love.git
synced 2026-06-27 16:47:16 +08:00
81340e3272
scripts/diag_axis.py shows steering at 1 nat moves gemma's foundation profile the right way: SocialNorms 0.68->0.42, Care 0.21->0.33, coherence 0.72->0.88. Authority is ~0 on this model (no headroom), so: - eval reports all foundations; trait axis = SocialNorms (down) + Care (up) - map.html plots Care vs SocialNorms - add gen_alpha=1.5: over-steer generation into the incoherent regime so the heal (Q1) has work to do (at 1 nat coherence improved, nothing to heal) - results.py groups on coherence/socialnorms/care Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
52 lines
1.8 KiB
Python
52 lines
1.8 KiB
Python
"""Diagnostic: does the steering vector move the moral-foundation profile, and where?

Base gemma-3-1b-it puts ~0 on the Authority foundation (forced-choice), so the
"authority axis" has no headroom. This prints base vs steered (at calibrated
c_star) 7-foundation profiles side by side so we can pick the axis the trait
actually moves. Run: uv run python scripts/diag_axis.py
"""
|
|
|
|
import sys
|
|
|
|
import torch
|
|
import tinymfv
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
sys.path.insert(0, "src")
|
|
from steer_heal.config import RunConfig # noqa: E402
|
|
from steer_heal.steering import teacher_vec # noqa: E402
|
|
|
|
# Hugging Face model id shared by the run config, tokenizer and model loaders.
MODEL = "google/gemma-3-1b-it"

# Tokenizer first; fall back to EOS as the pad token when none is configured.
tok = AutoTokenizer.from_pretrained(MODEL)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Load the causal LM in bfloat16 with eager attention and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="eager",
).eval()

# Run configuration handed to teacher_vec below (12 prompts).
cfg = RunConfig(model=MODEL, n_prompts=12)

# Steering vector object; later used as a context manager around evaluation
# and read for its calibrated coefficient (v.cfg.coeff).
v = teacher_vec(model, tok, cfg)
|
|
|
|
|
|
def profile(label):
    """Evaluate the module-level model with tinymfv and print its profile.

    Runs ``tinymfv.evaluate`` on the "classic" set (24 vignettes, only the
    "other_violate" condition, 64 think tokens) using the global ``model``
    and ``tok``, then prints one line per entry under a ``label`` header.

    Args:
        label: Heading text printed above the table.

    Returns:
        dict mapping each foundation name to the model's score, plus a
        ``"_coherence"`` entry holding the report's ``mean_pmass_allowed``.
    """
    report = tinymfv.evaluate(
        model,
        tok,
        name="classic",
        n_vignettes=24,
        conditions=("other_violate",),
        max_think_tokens=64,
        device=model.device,
    )

    # Pair foundation names with the model's scores from the report.
    scores = dict(zip(report["profile"]["foundation"], report["profile"]["model"]))
    # Underscore prefix marks this as a non-foundation entry for callers.
    scores["_coherence"] = report["mean_pmass_allowed"]

    print(f"\n=== {label} ===")
    for name, value in scores.items():
        print(f" {name:12s} {value:.4f}")
    return scores
|
|
|
|
|
|
# Unsteered reference profile.
base = profile("BASE (c=0)")

# Same evaluation with the steering vector applied at its calibrated coefficient.
with v(model, C=v.cfg.coeff):
    steer = profile(f"STEERED (c_star={v.cfg.coeff:.1f}, ~1 nat)")

print("\n=== delta (steered - base), sorted by |Δ| ===")

# Only real foundations take part in the delta table; "_"-prefixed keys
# (the coherence entry) are reported separately on the last line.
keys = [k for k in base if not k.startswith("_")]
delta = {k: steer[k] - base[k] for k in keys}

# Largest absolute shift first.
for k in sorted(keys, key=lambda name: -abs(delta[name])):
    print(f" {k:12s} {base[k]:+.4f} -> {steer[k]:+.4f} Δ={delta[k]:+.4f}")

print(f" coherence {base['_coherence']:.3f} -> {steer['_coherence']:.3f}")
|