mirror of
https://github.com/wassname/weight-steering.git
synced 2026-06-27 16:17:59 +08:00
results
This commit is contained in:
@@ -151,10 +151,13 @@ Absolute logit(is_wrong) per moral foundation, mean over vignettes × frames ×
|
||||
|
||||
| cue | axis | method | C | kl | Care | Sanc | Auth ↓ | Loy | Fair | Lib | SocN | SI_Auth |
|
||||
| ----: | -----: | -------------: | ----: | ---: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | --------: |
|
||||
| 🟢 | +2.36 | sl:prompt_only | n/a | n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 | — |
|
||||
| 🟢 | +0.89 | ws:delora | -1.22 | 0.52 | -0.49±0.60 | -0.67±0.54 | -0.89±0.58 | -0.76±0.56 | -0.73±0.54 | -0.57±0.59 | -0.37±0.43 | — |
|
||||
| 🟡 | +0.41 | sl:prompt_only | n/a | n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 | — |
|
||||
|
||||
Note: effective steering is at C=-1.22 (neg arm) — the pos arm (C=+1.29) increases auth-wrongness, likely because the general-topic training data fails to teach an authority-relevant direction from the MFT-authority personas. Full adapter sweep pending.
|
||||
Notes:
|
||||
- sl:prompt_only injects the authority-LOW persona as a system prompt at eval time. It has no KL constraint and shifts all 7 foundations by roughly 2 nats each (an indiscriminate personality shift). ws:delora is iso-KL=0.52 and more surgical (±0.58 vs ±1.54 std on Auth).
|
||||
- The effective ws coefficient is negative (C=-1.22, the NEG arm). The POS arm (C=+1.29, the authority-LOW persona direction) increases Auth wrongness instead of decreasing it. Likely cause: general-topic training data (emails, code) produces near-identical responses under the authority-LOW and authority-HIGH personas, so the weight diff learns no authority-relevant direction.
|
||||
- Full adapter sweep pending; authority-specific training topics would likely fix the sign inversion.
|
||||
|
||||
Reproduce: `uv run python -m ws.scripts.eval_tinymfv_calibrated --behavior authority` then `uv run python -m ws.scripts.readme_tinymfv_table --behavior authority`.
|
||||
|
||||
|
||||
@@ -305,14 +305,27 @@ def _sl_delta_row(cfg: ReadmeTinymfvCfg, method: str) -> dict | None:
|
||||
data = json.loads(p.read_text())
|
||||
if "axis_shift" not in data or "dlogit_per_foundation" not in data:
|
||||
return None
|
||||
by_f = {f: {"dlogit_mean": d.get("mean", float("nan")),
|
||||
"dlogit_std": d.get("std", float("nan")),
|
||||
"n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()}
|
||||
# sl stores axis_shift = ΔlogitCare - ΔlogitAuthority (composite auth_care axis).
|
||||
# For single-foundation behaviors, recompute using the same formula ws uses.
|
||||
axis_cfg = BEHAVIOR_AXIS.get(cfg.behavior, {})
|
||||
if "target_alpha_sign" in axis_cfg:
|
||||
# Single-foundation: find the target foundation from arrow_neg (intent↓).
|
||||
f_neg = axis_cfg.get("arrow_neg")
|
||||
f_pos = axis_cfg.get("arrow_pos")
|
||||
d_neg = by_f.get(f_neg, {}).get("dlogit_mean", float("nan")) if f_neg else 0.0
|
||||
d_pos = by_f.get(f_pos, {}).get("dlogit_mean", float("nan")) if f_pos else 0.0
|
||||
axis = (d_pos if d_pos == d_pos else 0.0) - (d_neg if d_neg == d_neg else 0.0)
|
||||
else:
|
||||
axis = float(data["axis_shift"])
|
||||
return {
|
||||
"method": f"sl:{method}",
|
||||
"axis": float(data["axis_shift"]),
|
||||
"axis": axis,
|
||||
"C": float(data.get("coeff_calibrated", float("nan"))),
|
||||
"kl": float(data.get("kl_p95_at_calib", float("nan"))),
|
||||
"by_f": {f: {"dlogit_mean": d.get("mean", float("nan")),
|
||||
"dlogit_std": d.get("std", float("nan")),
|
||||
"n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()},
|
||||
"by_f": by_f,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user