diff --git a/README.md b/README.md index bb9ea29..c9faa2d 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,13 @@ Absolute logit(is_wrong) per moral foundation, mean over vignettes × frames × | cue | axis | method | C | kl | Care | Sanc | Auth ↓ | Loy | Fair | Lib | SocN | SI_Auth | | ----: | -----: | -------------: | ----: | ---: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | --------: | +| 🟢 | +2.36 | sl:prompt_only | n/a | n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 | — | | 🟢 | +0.89 | ws:delora | -1.22 | 0.52 | -0.49±0.60 | -0.67±0.54 | -0.89±0.58 | -0.76±0.56 | -0.73±0.54 | -0.57±0.59 | -0.37±0.43 | — | -| 🟡 | +0.41 | sl:prompt_only | n/a | n/a | -1.96±1.62 | -2.19±1.63 | -2.36±1.54 | -2.26±1.50 | -2.35±1.66 | -2.90±1.47 | -1.90±1.98 | — | -Note: effective steering is at C=-1.22 (neg arm) — the pos arm (C=+1.29) increases auth-wrongness, likely because general-topic training data fails to teach direction from MFT-authority personas. Full adapter sweep pending. +Notes: +- sl:prompt_only injects the authority-LOW persona as a system prompt at eval — no KL constraint, shifts all 7 foundations by ~2 nats each (indiscriminate personality shift). ws:delora is iso-KL=0.52 and more surgical (±0.58 vs ±1.54 std on Auth). +- Effective ws coefficient is negative (C=-1.22, the NEG arm). The POS arm (C=+1.29, authority-LOW persona direction) increases wrongness. Likely cause: general-topic training data (emails, code) produces near-identical responses under authority-LOW vs authority-HIGH personas, so the weight diff learns no authority-relevant direction. +- Full adapter sweep pending; authority-specific training topics would likely fix the sign inversion. Reproduce: `uv run python -m ws.scripts.eval_tinymfv_calibrated --behavior authority` then `uv run python -m ws.scripts.readme_tinymfv_table --behavior authority`. diff --git a/src/ws/scripts/readme_tinymfv_table.py b/src/ws/scripts/readme_tinymfv_table.py index f76f2c1..670ca10 100644 --- a/src/ws/scripts/readme_tinymfv_table.py +++ b/src/ws/scripts/readme_tinymfv_table.py @@ -305,14 +305,27 @@ def _sl_delta_row(cfg: ReadmeTinymfvCfg, method: str) -> dict | None: data = json.loads(p.read_text()) if "axis_shift" not in data or "dlogit_per_foundation" not in data: return None + by_f = {f: {"dlogit_mean": d.get("mean", float("nan")), + "dlogit_std": d.get("std", float("nan")), + "n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()} + # sl stores axis_shift = ΔlogitCare - ΔlogitAuthority (composite auth_care axis). + # For single-foundation behaviors, recompute using the same formula ws uses. + axis_cfg = BEHAVIOR_AXIS.get(cfg.behavior, {}) + if "target_alpha_sign" in axis_cfg: + # Single-foundation: find the target foundation from arrow_neg (intent↓). + f_neg = axis_cfg.get("arrow_neg") + f_pos = axis_cfg.get("arrow_pos") + d_neg = by_f.get(f_neg, {}).get("dlogit_mean", float("nan")) if f_neg else 0.0 + d_pos = by_f.get(f_pos, {}).get("dlogit_mean", float("nan")) if f_pos else 0.0 + axis = (d_pos if d_pos == d_pos else 0.0) - (d_neg if d_neg == d_neg else 0.0) + else: + axis = float(data["axis_shift"]) return { "method": f"sl:{method}", - "axis": float(data["axis_shift"]), + "axis": axis, "C": float(data.get("coeff_calibrated", float("nan"))), "kl": float(data.get("kl_p95_at_calib", float("nan"))), - "by_f": {f: {"dlogit_mean": d.get("mean", float("nan")), - "dlogit_std": d.get("std", float("nan")), - "n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()}, + "by_f": by_f, }