results

2026-06-27 16:17:59 +08:00 · 2026-05-04 18:33:19 +08:00
parent 7eac38829d
commit cf0f7d6c54
2 changed files with 22 additions and 6 deletions
@@ -151,10 +151,13 @@ Absolute logit(is_wrong) per moral foundation, mean over vignettes × frames ×

 |   cue |   axis |         method |     C |   kl |       Care |       Sanc |     Auth ↓ |        Loy |       Fair |        Lib |       SocN |   SI_Auth |
 | ----: | -----: | -------------: | ----: | ---: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | --------: |
+|    🟢 |  +2.36 | sl:prompt_only |   n/a |  n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 |         — |
 |    🟢 |  +0.89 |      ws:delora | -1.22 | 0.52 | -0.49±0.60 | -0.67±0.54 | -0.89±0.58 | -0.76±0.56 | -0.73±0.54 | -0.57±0.59 | -0.37±0.43 |         — |
-|    🟡 |  +0.41 | sl:prompt_only |   n/a |  n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 |         — |

-Note: effective steering is at C=-1.22 (neg arm) — the pos arm (C=+1.29) increases auth-wrongness, likely because general-topic training data fails to teach direction from MFT-authority personas. Full adapter sweep pending.
+Notes:
+- sl:prompt_only injects the authority-LOW persona as a system prompt at eval time. It has no KL constraint and shifts all 7 foundations by ~2 nats each (an indiscriminate personality shift). ws:delora is iso-KL=0.52 and more surgical (±0.58 vs ±1.54 std on Auth).
+- The effective ws coefficient is negative (C=-1.22, the NEG arm). The POS arm (C=+1.29, the authority-LOW persona direction) increases auth-wrongness. Likely cause: general-topic training data (emails, code) produces near-identical responses under the authority-LOW and authority-HIGH personas, so the weight diff learns no authority-relevant direction.
+- Full adapter sweep pending; authority-specific training topics would likely fix the sign inversion.

 Reproduce: `uv run python -m ws.scripts.eval_tinymfv_calibrated --behavior authority` then `uv run python -m ws.scripts.readme_tinymfv_table --behavior authority`.

@@ -305,14 +305,27 @@ def _sl_delta_row(cfg: ReadmeTinymfvCfg, method: str) -> dict | None:
    data = json.loads(p.read_text())
    if "axis_shift" not in data or "dlogit_per_foundation" not in data:
        return None
+    by_f = {f: {"dlogit_mean": d.get("mean", float("nan")),
+                "dlogit_std": d.get("std", float("nan")),
+                "n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()}
+    # sl stores axis_shift = ΔlogitCare - ΔlogitAuthority (composite auth_care axis).
+    # For single-foundation behaviors, recompute using the same formula ws uses.
+    axis_cfg = BEHAVIOR_AXIS.get(cfg.behavior, {})
+    if "target_alpha_sign" in axis_cfg:
+        # Single-foundation: find the target foundation from arrow_neg (intent↓).
+        f_neg = axis_cfg.get("arrow_neg")
+        f_pos = axis_cfg.get("arrow_pos")
+        d_neg = by_f.get(f_neg, {}).get("dlogit_mean", float("nan")) if f_neg else 0.0
+        d_pos = by_f.get(f_pos, {}).get("dlogit_mean", float("nan")) if f_pos else 0.0
+        axis = (d_pos if d_pos == d_pos else 0.0) - (d_neg if d_neg == d_neg else 0.0)
+    else:
+        axis = float(data["axis_shift"])
    return {
        "method": f"sl:{method}",
-        "axis": float(data["axis_shift"]),
+        "axis": axis,
        "C": float(data.get("coeff_calibrated", float("nan"))),
        "kl": float(data.get("kl_p95_at_calib", float("nan"))),
-        "by_f": {f: {"dlogit_mean": d.get("mean", float("nan")),
-                     "dlogit_std": d.get("std", float("nan")),
-                     "n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()},
+        "by_f": by_f,
    }