mirror of
https://github.com/wassname/steer-heal-love.git
synced 2026-06-27 16:47:16 +08:00
axis = SocialNorms/Care (Authority degenerate); over-steer generation
scripts/diag_axis.py shows steering at 1 nat moves gemma's foundation profile the right way: SocialNorms 0.68->0.42, Care 0.21->0.33, coherence 0.72->0.88. Authority is ~0 on this model (no headroom), so:
- eval reports all foundations; trait axis = SocialNorms (down) + Care (up)
- map.html plots Care vs SocialNorms
- add gen_alpha=1.5: over-steer generation into the incoherent regime so the heal (Q1) has work to do (at 1 nat coherence improved, so there was nothing to heal)
- results.py groups on coherence/socialnorms/care

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -23,3 +23,20 @@ Bug found: iso-KL calibration could not reach `target_kl=1.0`. c_star pinned at
|
|||||||
**Fix:** pass `bracket=(0.1, 1024.0)` to `v.calibrate`. Re-running to confirm an interior c_star with p95 KL ~ 1.0.
|
**Fix:** pass `bracket=(0.1, 1024.0)` to `v.calibrate`. Re-running to confirm an interior c_star with p95 KL ~ 1.0.
|
||||||
|
|
||||||
**Also to investigate:** auth=0.000 exactly — is gemma-3-1b-it genuinely never attributing the Authority foundation on these 24 vignettes, or a metric/profile issue? Check once steering is strong enough to move things.
|
**Also to investigate:** auth=0.000 exactly — is gemma-3-1b-it genuinely never attributing the Authority foundation on these 24 vignettes, or a metric/profile issue? Check once steering is strong enough to move things.
|
||||||
|
|
||||||
|
## Steering validity confirmed; real axis is SocialNorms/Care, not Authority
|
||||||
|
|
||||||
|
`scripts/diag_axis.py` on gemma-3-1b-it, base vs steered at calibrated c_star=67.7 (~1 nat p95 KL). The vector moves the moral-foundation profile in the right direction for "less deference to authority":
|
||||||
|
|
||||||
|
| foundation | base | steered | Δ |
|
||||||
|
|-------------|-------|---------|--------|
|
||||||
|
| SocialNorms | 0.680 | 0.421 | -0.260 |
|
||||||
|
| Care | 0.213 | 0.328 | +0.115 |
|
||||||
|
| Fairness | 0.030 | 0.098 | +0.069 |
|
||||||
|
| Liberty | 0.040 | 0.075 | +0.035 |
|
||||||
|
| Authority | 0.000 | 0.001 | +0.001 |
|
||||||
|
| coherence | 0.722 | 0.884 | +0.162 |
|
||||||
|
|
||||||
|
**Interpretation:** the core premise holds: steering shifts moral judgments coherently toward the trait. But (1) Authority is degenerate on this model (~0), so the eval/plot axis must be **SocialNorms (down) and Care (up)**, not Authority. (2) At the 1-nat dose coherence went UP, not down, so there is little incoherence to heal at alpha=1. To give Q1 (heal) something to do, we must either generate training data at a higher alpha (~1.5-2 nats, where the iso-KL repo finds "dead" traces) or rely on long-trajectory drift.
|
||||||
|
|
||||||
|
**Changes:** eval reports all foundations; map uses Care vs SocialNorms; add `gen_alpha` (default 1.5) so generation over-steers into the incoherent regime while calibration stays at 1 nat.
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
"""Diagnostic: does the steering vector move the moral-foundation profile, and where?
|
||||||
|
|
||||||
|
Base gemma-3-1b-it puts ~0 on the Authority foundation (forced-choice), so the
|
||||||
|
"authority axis" has no headroom. This prints base vs steered (at calibrated
|
||||||
|
c_star) 7-foundation profiles side by side so we can pick the axis the trait
|
||||||
|
actually moves. Run: uv run python scripts/diag_axis.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import tinymfv
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
|
sys.path.insert(0, "src")
|
||||||
|
from steer_heal.config import RunConfig # noqa: E402
|
||||||
|
from steer_heal.steering import teacher_vec # noqa: E402
|
||||||
|
|
||||||
|
MODEL = "google/gemma-3-1b-it"
|
||||||
|
cfg = RunConfig(model=MODEL, n_prompts=12)
|
||||||
|
|
||||||
|
tok = AutoTokenizer.from_pretrained(MODEL)
|
||||||
|
if tok.pad_token is None:
|
||||||
|
tok.pad_token = tok.eos_token
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
MODEL, torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="eager"
|
||||||
|
).eval()
|
||||||
|
|
||||||
|
v = teacher_vec(model, tok, cfg)
|
||||||
|
|
||||||
|
|
||||||
|
def profile(label):
    """Evaluate the module-level model with tinymfv and print its foundation profile.

    Runs ``tinymfv.evaluate`` on the "classic" set (24 vignettes, the
    ``other_violate`` condition only) using the module-level ``model`` and
    ``tok`` in whatever state they are in when called, so any steering has to
    be applied by the caller around this call.

    Args:
        label: Heading printed above the profile.

    Returns:
        A dict mapping each foundation name to the model's score, plus a
        ``"_coherence"`` entry holding the report's ``mean_pmass_allowed``.
        The leading underscore lets callers separate it from the foundations.
    """
    report = tinymfv.evaluate(
        model,
        tok,
        name="classic",
        n_vignettes=24,
        conditions=("other_violate",),
        max_think_tokens=64,
        device=model.device,
    )
    table = report["profile"]
    scores = {
        foundation: value
        for foundation, value in zip(table["foundation"], table["model"])
    }
    scores["_coherence"] = report["mean_pmass_allowed"]

    print(f"\n=== {label} ===")
    for name in scores:
        print(f" {name:12s} {scores[name]:.4f}")
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
base = profile("BASE (c=0)")
|
||||||
|
with v(model, C=v.cfg.coeff):
|
||||||
|
steer = profile(f"STEERED (c_star={v.cfg.coeff:.1f}, ~1 nat)")
|
||||||
|
|
||||||
|
print("\n=== delta (steered - base), sorted by |Δ| ===")
|
||||||
|
keys = [k for k in base if not k.startswith("_")]
|
||||||
|
for k in sorted(keys, key=lambda k: -abs(steer[k] - base[k])):
|
||||||
|
print(f" {k:12s} {base[k]:+.4f} -> {steer[k]:+.4f} Δ={steer[k]-base[k]:+.4f}")
|
||||||
|
print(f" coherence {base['_coherence']:.3f} -> {steer['_coherence']:.3f}")
|
||||||
+5
-4
@@ -19,13 +19,14 @@ df = pl.read_csv(RESULTS_TSV, separator="\t")
|
|||||||
agg = (
|
agg = (
|
||||||
df.group_by(GROUP)
|
df.group_by(GROUP)
|
||||||
.agg(
|
.agg(
|
||||||
pl.col("p_ans_any").mean().round(3).alias("coherence"),
|
pl.col("coherence").mean().round(3),
|
||||||
pl.col("auth").mean().round(3),
|
pl.col("socialnorms").mean().round(3), # trait axis: lower = more trait
|
||||||
pl.col("auth").std().round(3).alias("auth_sd"),
|
pl.col("care").mean().round(3),
|
||||||
|
pl.col("care").std().round(3).alias("care_sd"),
|
||||||
pl.len().alias("n"),
|
pl.len().alias("n"),
|
||||||
pl.col("seed").cast(pl.Utf8).sort().str.join(",").alias("seeds"),
|
pl.col("seed").cast(pl.Utf8).sort().str.join(",").alias("seeds"),
|
||||||
pl.col("argv").first(),
|
pl.col("argv").first(),
|
||||||
)
|
)
|
||||||
.sort("auth", descending=True)
|
.sort("care", descending=True)
|
||||||
)
|
)
|
||||||
print(tabulate(agg.to_pandas(), headers="keys", tablefmt="pipe", floatfmt="+.3f"))
|
print(tabulate(agg.to_pandas(), headers="keys", tablefmt="pipe", floatfmt="+.3f"))
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ class RunConfig:
|
|||||||
neutral: str = "You are a helpful assistant."
|
neutral: str = "You are a helpful assistant."
|
||||||
layer_range: tuple[float, float] = (0.4, 0.6) # fraction of depth to steer
|
layer_range: tuple[float, float] = (0.4, 0.6) # fraction of depth to steer
|
||||||
target_kl: float = 1.0 # iso-KL p95 dose (nats)
|
target_kl: float = 1.0 # iso-KL p95 dose (nats)
|
||||||
|
gen_alpha: float = 1.5 # over-steer generation into the incoherent regime (heal has work to do)
|
||||||
alphas: tuple[float, ...] = (0.5, 1.0, 1.5, 2.0) # multiples of c_star to generate at
|
alphas: tuple[float, ...] = (0.5, 1.0, 1.5, 2.0) # multiples of c_star to generate at
|
||||||
|
|
||||||
# ── generation + filter (U1) ──
|
# ── generation + filter (U1) ──
|
||||||
|
|||||||
+11
-7
@@ -24,17 +24,21 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
|
|||||||
device=model.device,
|
device=model.device,
|
||||||
)
|
)
|
||||||
prof = rep["profile"] # pandas: foundation, human, model, model_T
|
prof = rep["profile"] # pandas: foundation, human, model, model_T
|
||||||
model_p = dict(zip(prof["foundation"], prof["model"]))
|
p = dict(zip(prof["foundation"], prof["model"]))
|
||||||
# SHOULD: auth/care in [0,1], coherence ~ base level on a working model;
|
# The trait "less deference to authority" moves SocialNorms DOWN and Care UP
|
||||||
# a sharp coherence drop after steering = format collapse. On tiny-random
|
# on gemma-3-1b-it (Authority is degenerate ~0; see RESEARCH_JOURNAL 2026-06-04).
|
||||||
# the numbers are junk (we test the path, not the value).
|
# Report all foundations so we never lose the axis that actually moves.
|
||||||
|
# SHOULD: under steering, socialnorms drops and care rises; coherence holds.
|
||||||
out = {
|
out = {
|
||||||
"auth": float(model_p["Authority"]),
|
"socialnorms": float(p["SocialNorms"]), # trait axis: DOWN = more trait
|
||||||
"care": float(model_p["Care"]),
|
"care": float(p["Care"]), # trait axis: UP = more trait
|
||||||
|
"auth": float(p["Authority"]),
|
||||||
|
"fairness": float(p["Fairness"]),
|
||||||
|
"liberty": float(p["Liberty"]),
|
||||||
"coherence": float(rep["mean_pmass_allowed"]),
|
"coherence": float(rep["mean_pmass_allowed"]),
|
||||||
"ppx_json": float(math.exp(rep["mean_nll_json"])),
|
"ppx_json": float(math.exp(rep["mean_nll_json"])),
|
||||||
"top1_acc": float(rep["top1_acc"]),
|
"top1_acc": float(rep["top1_acc"]),
|
||||||
}
|
}
|
||||||
logger.info(f"eval: auth={out['auth']:.3f} care={out['care']:.3f} "
|
logger.info(f"eval: socialnorms={out['socialnorms']:.3f} care={out['care']:.3f} "
|
||||||
f"coherence={out['coherence']:.3f} ppx={out['ppx_json']:.1f}")
|
f"coherence={out['coherence']:.3f} ppx={out['ppx_json']:.1f}")
|
||||||
return out
|
return out
|
||||||
|
|||||||
@@ -15,20 +15,20 @@ def write_map(run_dir: Path, rounds: list[dict]) -> Path:
|
|||||||
r = [d["round"] for d in rounds]
|
r = [d["round"] for d in rounds]
|
||||||
fig = make_subplots(
|
fig = make_subplots(
|
||||||
rows=1, cols=2, column_widths=[0.6, 0.4],
|
rows=1, cols=2, column_widths=[0.6, 0.4],
|
||||||
subplot_titles=("trait map: Care vs Authority", "coherence + direction per round"),
|
subplot_titles=("trait map: Care vs SocialNorms", "coherence + direction per round"),
|
||||||
specs=[[{"type": "scatter"}, {"type": "scatter"}]],
|
specs=[[{"type": "scatter"}, {"type": "scatter"}]],
|
||||||
)
|
)
|
||||||
# trajectory across the auth axis, coloured by round
|
# trajectory across the SocialNorms axis (trait moves it DOWN, Care UP), coloured by round
|
||||||
fig.add_trace(go.Scatter(
|
fig.add_trace(go.Scatter(
|
||||||
x=[d["auth"] for d in rounds], y=[d["care"] for d in rounds],
|
x=[d["socialnorms"] for d in rounds], y=[d["care"] for d in rounds],
|
||||||
mode="lines+markers+text", text=[f"r{i}" for i in r], textposition="top center",
|
mode="lines+markers+text", text=[f"r{i}" for i in r], textposition="top center",
|
||||||
marker=dict(size=12, color=r, colorscale="Viridis", showscale=False),
|
marker=dict(size=12, color=r, colorscale="Viridis", showscale=False),
|
||||||
hovertext=[f"r{d['round']} coh={d['coherence']:.3f} cos={d.get('cos_v0', float('nan')):.2f}"
|
hovertext=[f"r{d['round']} coh={d['coherence']:.3f} cos={d.get('cos_v0', float('nan')):.2f}"
|
||||||
for d in rounds],
|
for d in rounds],
|
||||||
name="trajectory",
|
name="trajectory",
|
||||||
), row=1, col=1)
|
), row=1, col=1)
|
||||||
fig.update_xaxes(title_text="Authority p (trait →)", row=1, col=1)
|
fig.update_xaxes(title_text="SocialNorms p (← trait)", row=1, col=1)
|
||||||
fig.update_yaxes(title_text="Care p", row=1, col=1)
|
fig.update_yaxes(title_text="Care p (trait →)", row=1, col=1)
|
||||||
|
|
||||||
fig.add_trace(go.Scatter(x=r, y=[d["coherence"] for d in rounds],
|
fig.add_trace(go.Scatter(x=r, y=[d["coherence"] for d in rounds],
|
||||||
mode="lines+markers", name="coherence"), row=1, col=2)
|
mode="lines+markers", name="coherence"), row=1, col=2)
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
|
|||||||
# extract teacher vector + generate steered data from the CURRENT student
|
# extract teacher vector + generate steered data from the CURRENT student
|
||||||
with baked(model, hist_specs):
|
with baked(model, hist_specs):
|
||||||
v = teacher_vec(model, tok, cfg)
|
v = teacher_vec(model, tok, cfg)
|
||||||
comps = generate_steered(model, tok, v, alpha=1.0, cfg=cfg)
|
comps = generate_steered(model, tok, v, alpha=cfg.gen_alpha, cfg=cfg)
|
||||||
# filter under the ORIGINAL (no history, no steering)
|
# filter under the ORIGINAL (no history, no steering)
|
||||||
kept, scored = filter_completions(model, tok, comps, cfg)
|
kept, scored = filter_completions(model, tok, comps, cfg)
|
||||||
log_event(run_dir, stage="gen", round=rnd, n_comps=len(comps), n_kept=len(kept), scored=scored)
|
log_event(run_dir, stage="gen", round=rnd, n_comps=len(comps), n_kept=len(kept), scored=scored)
|
||||||
@@ -87,7 +87,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
|
|||||||
rec = {"round": rnd, **m, "cos_v0": cos_v0, "c_star": float(v.cfg.coeff), "n_kept": len(kept)}
|
rec = {"round": rnd, **m, "cos_v0": cos_v0, "c_star": float(v.cfg.coeff), "n_kept": len(kept)}
|
||||||
rounds.append(rec)
|
rounds.append(rec)
|
||||||
log_event(run_dir, stage="round", **rec)
|
log_event(run_dir, stage="round", **rec)
|
||||||
logger.info(f"round {rnd}: auth={m['auth']:.3f} care={m['care']:.3f} "
|
logger.info(f"round {rnd}: socialnorms={m['socialnorms']:.3f} care={m['care']:.3f} "
|
||||||
f"coh={m['coherence']:.3f} cos_v0={cos_v0:+.2f}")
|
f"coh={m['coherence']:.3f} cos_v0={cos_v0:+.2f}")
|
||||||
|
|
||||||
map_path = write_map(run_dir, rounds)
|
map_path = write_map(run_dir, rounds)
|
||||||
|
|||||||
Reference in New Issue
Block a user