From aa4fcff446cfe4df71aef9dcd2fc0fde1cfa98c8 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 2 May 2026 20:53:19 +0800 Subject: [PATCH] scripts(readme_tinymfv_table): mirror steering-lite layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Split bare table (absolute logit per foundation) from Δ table - Add C (calibrated coeff) and kl (achieved p95) columns to Δ table; read from out/<behavior>/kl_calibration/summary.csv - Cells now show mean±std, sourced from dlogit_std (ws) and the per-foundation std field of steering-lite JSONs - Headers: "Care ↓" and "Sanc ↑" mark target direction - Sort Δ rows by |axis| descending - Preserve signs in tabulate output via disable_numparse=True --- src/ws/scripts/readme_tinymfv_table.py | 376 ++++++++++++++++++------- 1 file changed, 267 insertions(+), 109 deletions(-) diff --git a/src/ws/scripts/readme_tinymfv_table.py b/src/ws/scripts/readme_tinymfv_table.py index ae8551b..ef32005 100644 --- a/src/ws/scripts/readme_tinymfv_table.py +++ b/src/ws/scripts/readme_tinymfv_table.py @@ -1,21 +1,29 @@ -"""README-ready tiny-mfv table: ws adapters + steering-lite baselines side-by-side. +"""README-ready tiny-mfv tables: ws adapters + steering-lite baselines. + +Layout mirrors steering-lite's README: + - Table 1 (bare): per-foundation absolute logit(is_wrong), one row, no Δ. + Every Δ row below is measured against this prior. + - Table 2 (Δ rows): cue | axis | method | C | kl | per-foundation + `mean±std`. Header arrows mark target direction (Care ↓, Sanc ↑). Same axis (Care vs Traditional/Sanctity), same metric (axis_shift in nats), same paired-by-(vid,cond) per-foundation Δlogit. ws rows are read from -`out/trad_care/{adapter|base}/*__foundations_dlogit.csv` (the eval already -computes them); steering-lite rows are read from -`/media/wassname/SGIronWolf/projects5/2026/lite/steering-lite/outputs/tinymfv_sweep/*.json`.
+`out/trad_care/{adapter|base}/*__foundations_dlogit.csv` (eval already +computes them) plus `out/trad_care/kl_calibration/summary.csv` (calibrated +α and achieved p95). Steering-lite rows are read from +`/outputs/tinymfv_sweep/*.json`. NB: ws weight-steering uses iso-KL calibrated alpha (target_kl=1.0 nat); the -steering-lite calibration is the same target. Both repos' rows are therefore -at matched KL footprint, so axis_shift is directly comparable. The ws -prompt_only row (alpha=+1, no calibration) and steering-lite's prompt_only -row are the only un-calibrated points -- they're included for context. +steering-lite calibration is the same target. Both repos' rows are at the +same KL footprint, so axis_shift is directly comparable. ws prompt_only +(alpha=+1, no calibration) and steering-lite prompt_only are the only +un-calibrated points -- shown for context, C=n/a, kl=n/a. """ from __future__ import annotations import json +import math from dataclasses import dataclass from pathlib import Path @@ -28,6 +36,10 @@ from ws._artifacts import latest_matching FOUNDATION_ORDER = ["Care", "Sanctity", "Authority", "Loyalty", "Fairness", "Liberty", "Social Norms"] FOUNDATION_SHORT = { + "Care": "Care ↓", "Sanctity": "Sanc ↑", "Authority": "Auth", + "Loyalty": "Loy", "Fairness": "Fair", "Liberty": "Lib", "Social Norms": "SocN", +} +FOUNDATION_BARE = { "Care": "Care", "Sanctity": "Sanc", "Authority": "Auth", "Loyalty": "Loy", "Fairness": "Fair", "Liberty": "Lib", "Social Norms": "SocN", } @@ -38,17 +50,18 @@ class ReadmeTinymfvCfg: behavior: str = "trad_care" out: Path = Path("out") adapters: tuple[str, ...] = ("lora", "dora", "pissa", "delora", "oft", "ia3") - include_base: bool = True include_prompt_baseline: bool = True + include_steering_lite: bool = True steering_lite_root: Path = Path("/media/wassname/SGIronWolf/projects5/2026/lite/steering-lite") steering_lite_methods: tuple[str, ...] 
= ( - "bare", "prompt_only", "mean_diff", "mean_centred", + "prompt_only", "mean_diff", "mean_centred", "pca", "sspace", "cosine_gated", "topk_clusters", ) + target_alpha_sign: float = 1.0 # +1 = traditional pole; flip to read negative side def _cue(axis: float) -> str: - if axis != axis: # NaN + if axis != axis: return "⚪" a = abs(axis) if a > 0.5: @@ -58,121 +71,266 @@ def _cue(axis: float) -> str: return "🔴" -def _load_ws_row(cfg: ReadmeTinymfvCfg, adapter_dir: Path, label: str, alpha: float = 1.0) -> dict | None: - """Read latest eval artefacts in `adapter_dir`; return one row dict or None.""" - try: - summary_path = latest_matching(adapter_dir, "*__summary.csv") - dlogit_path = latest_matching(adapter_dir, "*__foundations_dlogit.csv") - except FileNotFoundError: - return None - summary = pl.read_csv(summary_path) - dlogit = pl.read_csv(dlogit_path) - sub = summary.filter(pl.col("alpha") == alpha) - if sub.is_empty(): - return None - axis = float(sub["axis_shift"][0]) - sub_d = dlogit.filter(pl.col("alpha") == alpha) - by_f = {row["foundation_coarse"]: row["dlogit_mean"] for row in sub_d.to_dicts()} - row = {"row": label, "axis_shift": axis, "cue": _cue(axis), "n_vig": int(sub["n_vignettes"][0])} - for f in FOUNDATION_ORDER: - row[FOUNDATION_SHORT[f]] = by_f.get(f, float("nan")) - return row +def _fmt_pm(mean: float, std: float) -> str: + if mean != mean: + return "—" + if std != std: + return f"{mean:+.2f}" + return f"{mean:+.2f}±{std:.2f}" -def _load_steering_lite_row(json_path: Path) -> dict | None: - if not json_path.exists(): - return None - data = json.loads(json_path.read_text()) - method = data.get("method", json_path.stem) - label = f"sl:{method}" - if "axis_shift" in data and "dlogit_per_foundation" in data: - axis = float(data["axis_shift"]) - dlf = data["dlogit_per_foundation"] - row = {"row": label, "axis_shift": axis, "cue": _cue(axis), - "n_vig": sum(int(d.get("n", 0)) for d in dlf.values()) // max(1, len(dlf))} - for f in FOUNDATION_ORDER: - 
row[FOUNDATION_SHORT[f]] = dlf.get(f, {}).get("mean", float("nan")) - return row - # bare.json has absolute_logit_per_foundation, no Δ - if "absolute_logit_per_foundation" in data: - alf = data["absolute_logit_per_foundation"] - row = {"row": f"sl:{method} (abs logit)", "axis_shift": float("nan"), "cue": "⚪", - "n_vig": sum(int(d.get("n", 0)) for d in alf.values()) // max(1, len(alf))} - for f in FOUNDATION_ORDER: - row[FOUNDATION_SHORT[f]] = alf.get(f, {}).get("mean", float("nan")) - return row - return None +def _fmt_axis(axis: float) -> str: + if axis != axis: + return "—" + return f"{axis:+.2f}" -def main(cfg: ReadmeTinymfvCfg) -> None: - rows: list[dict] = [] +def _fmt_C(c: float | None) -> str: + if c is None or c != c: + return "n/a" + return f"{c:+.2f}" - # ws bare row (alpha=0 absolute, no steering) -- read from any adapter's - # alpha=0 row in the foundation CSV. axis_shift is NaN at alpha=0 by - # construction (Δ vs itself = 0); we just want the model's prior. - if cfg.include_base: - for adapter in cfg.adapters: - d = cfg.out / cfg.behavior / adapter - if not d.exists(): - continue - try: - fpath = latest_matching(d, "*__foundations.csv") - except FileNotFoundError: - continue - fdf = pl.read_csv(fpath).filter(pl.col("alpha") == 0.0) - if fdf.is_empty(): - continue - by_f = {r["foundation_coarse"]: r["wrongness_logit"] - for r in fdf.to_dicts() if "wrongness_logit" in r} - if not by_f: - # fallback: use mean wrongness column - by_f = {r["foundation_coarse"]: r.get("wrongness", float("nan")) - for r in fdf.to_dicts()} - row = {"row": "ws:bare (abs logit)", "axis_shift": float("nan"), - "cue": "⚪", "n_vig": int(fdf["n_vignettes"].sum()) if "n_vignettes" in fdf.columns else 0} - for f in FOUNDATION_ORDER: - row[FOUNDATION_SHORT[f]] = by_f.get(f, float("nan")) - rows.append(row) - break - # ws prompt-only baseline (out//base/...) 
- if cfg.include_prompt_baseline: - base_dir = cfg.out / cfg.behavior / "base" - if base_dir.exists(): - row = _load_ws_row(cfg, base_dir, "ws:prompt_only", alpha=1.0) - if row is not None: - rows.append(row) +def _fmt_kl(kl: float | None) -> str: + if kl is None or kl != kl: + return "n/a" + return f"{kl:.2f}" - # ws adapters + +def _logit(w: float, eps: float = 0.01) -> float: + w = max(eps, min(1.0 - eps, w)) + return math.log(w / (1.0 - w)) + + +def _load_ws_calib(cfg: ReadmeTinymfvCfg) -> dict[str, dict]: + """Read out//kl_calibration/summary.csv -> by adapter.""" + p = cfg.out / cfg.behavior / "kl_calibration" / "summary.csv" + if not p.exists(): + return {} + df = pl.read_csv(p) + out: dict[str, dict] = {} + for row in df.to_dicts(): + method = row.get("method", "") + if not method.startswith("dW:"): + continue + adapter = method.split(":", 1)[1] + out[adapter] = row + return out + + +def _ws_bare_row(cfg: ReadmeTinymfvCfg) -> dict | None: + """Compute absolute logit per foundation at α=0 from any adapter's per-vignette CSV. + + Mirrors steering-lite's bare table: mean over (vid, cond) of logit(wrongness). + """ for adapter in cfg.adapters: d = cfg.out / cfg.behavior / adapter if not d.exists(): continue - row = _load_ws_row(cfg, d, f"ws:{adapter}", alpha=1.0) - if row is not None: - rows.append(row) + try: + pv_path = latest_matching(d, "*__per_vignette.csv") + except FileNotFoundError: + continue + pv = pl.read_csv(pv_path).filter(pl.col("alpha") == 0.0) + if pv.is_empty(): + continue + # per_vignette has wrongness_other_violate / wrongness_self_violate. + # Unpivot to (vid, cond) -> wrongness, then logit-mean per foundation. 
+ long_rows = [] + for r in pv.to_dicts(): + for cond in ("other_violate", "self_violate"): + w = r.get(f"wrongness_{cond}") + if w is None: + continue + long_rows.append({"foundation_coarse": r["foundation_coarse"], + "logit": _logit(float(w))}) + if not long_rows: + continue + long_df = pl.DataFrame(long_rows) + agg = long_df.group_by("foundation_coarse").agg( + pl.col("logit").mean().alias("mean"), + pl.col("logit").std().alias("std"), + pl.len().alias("n"), + ) + by_f = {r["foundation_coarse"]: r for r in agg.to_dicts()} + return {"source": "ws", "by_f": by_f} + return None - # steering-lite rows (frozen baselines) - for method in cfg.steering_lite_methods: - json_path = cfg.steering_lite_root / "outputs" / "tinymfv_sweep" / f"{method}.json" - row = _load_steering_lite_row(json_path) - if row is not None: - rows.append(row) - if not rows: - print("no rows to emit -- have any tiny-mfv evals run?") +def _sl_bare_row(cfg: ReadmeTinymfvCfg) -> dict | None: + p = cfg.steering_lite_root / "outputs" / "tinymfv_sweep" / "bare.json" + if not p.exists(): + return None + data = json.loads(p.read_text()) + alf = data.get("absolute_logit_per_foundation", {}) + if not alf: + return None + return {"source": "sl", "by_f": {f: {"mean": d.get("mean", float("nan")), + "std": d.get("std", float("nan")), + "n": d.get("n", 0)} for f, d in alf.items()}} + + +def _ws_delta_row(cfg: ReadmeTinymfvCfg, adapter: str, calib: dict[str, dict]) -> dict | None: + d = cfg.out / cfg.behavior / adapter + if not d.exists(): + return None + try: + summary_path = latest_matching(d, "*__summary.csv") + dlogit_path = latest_matching(d, "*__foundations_dlogit.csv") + except FileNotFoundError: + return None + summary = pl.read_csv(summary_path) + dlogit = pl.read_csv(dlogit_path) + # Pick the alpha row whose sign matches target_alpha_sign and is non-zero. 
+ alphas = [a for a in summary["alpha"].to_list() + if a != 0.0 and (a > 0) == (cfg.target_alpha_sign > 0)] + if not alphas: + return None + alpha = max(alphas, key=lambda x: abs(x)) # the largest-magnitude calibrated one + sub = summary.filter(pl.col("alpha") == alpha) + sub_d = dlogit.filter(pl.col("alpha") == alpha) + if sub.is_empty() or sub_d.is_empty(): + return None + by_f = {r["foundation_coarse"]: r for r in sub_d.to_dicts()} + cal = calib.get(adapter, {}) + p95_key = "p95_at_pos" if cfg.target_alpha_sign > 0 else "p95_at_neg" + return { + "method": f"ws:{adapter}", + "axis": float(sub["axis_shift"][0]), + "C": float(alpha), + "kl": float(cal.get(p95_key, float("nan"))) if cal else float("nan"), + "by_f": by_f, + } + + +def _ws_prompt_row(cfg: ReadmeTinymfvCfg) -> dict | None: + base_dir = cfg.out / cfg.behavior / "base" + if not base_dir.exists(): + return None + try: + summary_path = latest_matching(base_dir, "*__summary.csv") + dlogit_path = latest_matching(base_dir, "*__foundations_dlogit.csv") + except FileNotFoundError: + return None + summary = pl.read_csv(summary_path) + dlogit = pl.read_csv(dlogit_path) + alphas = [a for a in summary["alpha"].to_list() + if a != 0.0 and (a > 0) == (cfg.target_alpha_sign > 0)] + if not alphas: + return None + alpha = max(alphas, key=lambda x: abs(x)) + sub = summary.filter(pl.col("alpha") == alpha) + sub_d = dlogit.filter(pl.col("alpha") == alpha) + if sub.is_empty() or sub_d.is_empty(): + return None + return { + "method": "ws:prompt_only", + "axis": float(sub["axis_shift"][0]), + "C": float("nan"), + "kl": float("nan"), + "by_f": {r["foundation_coarse"]: r for r in sub_d.to_dicts()}, + } + + +def _sl_delta_row(cfg: ReadmeTinymfvCfg, method: str) -> dict | None: + p = cfg.steering_lite_root / "outputs" / "tinymfv_sweep" / f"{method}.json" + if not p.exists(): + return None + data = json.loads(p.read_text()) + if "axis_shift" not in data or "dlogit_per_foundation" not in data: + return None + return { + "method": 
f"sl:{method}", + "axis": float(data["axis_shift"]), + "C": float(data.get("coeff_calibrated", float("nan"))), + "kl": float(data.get("kl_p95_at_calib", float("nan"))), + "by_f": {f: {"dlogit_mean": d.get("mean", float("nan")), + "dlogit_std": d.get("std", float("nan")), + "n": d.get("n", 0)} for f, d in data["dlogit_per_foundation"].items()}, + } + + +def _print_bare_table(rows: list[dict]) -> None: + print("\n#### Bare model (no steering)\n") + print("Absolute logit(is_wrong) per moral foundation, mean over vignettes × frames × conditions. " + "Δ-rows below are measured against this prior.\n") + headers = ["source"] + [FOUNDATION_BARE[f] for f in FOUNDATION_ORDER] + out_rows = [] + for r in rows: + if r is None: + continue + line = ["ws (Qwen3-0.6B)" if r["source"] == "ws" else "steering-lite (Qwen3-0.6B)"] + for f in FOUNDATION_ORDER: + d = r["by_f"].get(f, {}) + mean = d.get("mean", float("nan")) if isinstance(d, dict) else float("nan") + std = d.get("std", float("nan")) if isinstance(d, dict) else float("nan") + line.append(_fmt_pm(mean, std)) + out_rows.append(line) + if not out_rows: + print("(no bare data — alpha=0 eval not run yet)") return + print(tabulate(out_rows, headers=headers, tablefmt="pipe", stralign="right", + disable_numparse=True)) - cols = ["cue", "row", "axis_shift", "n_vig"] + [FOUNDATION_SHORT[f] for f in FOUNDATION_ORDER] - df = pl.DataFrame(rows).select(cols) +def _print_delta_table(rows: list[dict]) -> None: + print("\n#### Steering methods (Δlogit vs bare, paired by (vid, cond))\n") + print("`C` = calibrated coefficient at iso-KL target_kl=1.0 nat; `kl` = achieved kl_p95. " + "Cells: `mean±std`. 
Cue: 🟢 |axis|>0.5 🟡 >0.15 🔴 below noise.\n") + headers = ["cue", "axis", "method", "C", "kl"] + [FOUNDATION_SHORT[f] for f in FOUNDATION_ORDER] + rows_sorted = sorted(rows, key=lambda r: -abs(r["axis"]) if r["axis"] == r["axis"] else 0) + out_rows = [] + for r in rows_sorted: + line = [_cue(r["axis"]), _fmt_axis(r["axis"]), r["method"], _fmt_C(r["C"]), _fmt_kl(r["kl"])] + for f in FOUNDATION_ORDER: + d = r["by_f"].get(f, {}) + mean = d.get("dlogit_mean", float("nan")) if isinstance(d, dict) else float("nan") + std = d.get("dlogit_std", float("nan")) if isinstance(d, dict) else float("nan") + line.append(_fmt_pm(mean, std)) + out_rows.append(line) + if not out_rows: + print("(no Δ-rows -- run the calibrated tinymfv eval first)") + return + print(tabulate(out_rows, headers=headers, tablefmt="pipe", stralign="right", + disable_numparse=True)) + + +def main(cfg: ReadmeTinymfvCfg) -> None: print("\n## OOD: tiny-mfv Care-vs-Traditional axis (directly comparable to steering-lite)\n") - print("axis_shift = ΔlogitSanctity − ΔlogitCare (nats). +ve = moved toward " - "traditional/binding; -ve = toward care. Per-foundation Δlogit is " - "paired by (vid, cond) vs the unsteered (alpha=0) baseline. " - "🟢 |axis|>0.5 🟡 >0.15 🔴 below noise.\n") - print(tabulate(df.to_pandas(), headers="keys", tablefmt="pipe", - floatfmt="+.2f", showindex=False)) + print("Task: shift Qwen3-0.6B from Care/harm morality toward Sanctity/traditionalist. " + "Headline metric `axis = ΔlogitSanc − ΔlogitCare` (nats); Δ values are paired by " + "(vignette, condition) so vignette difficulty cancels. Setup: target_kl=1.0 nat " + "(iso-KL across methods), max_think=64, vignettes=airisk.\n") + print("Caveat: ws and steering-lite share the same persona pairs, dataset, and 1-nat KL " + "budget, so calibrated rows are directly comparable. 
Uncalibrated rows " + "(prompt_only, engineered_prompt) have no coefficient dial -- C=n/a, kl=n/a.\n") + + bare_rows = [] + ws_bare = _ws_bare_row(cfg) + if ws_bare is not None: + bare_rows.append(ws_bare) + if cfg.include_steering_lite: + sl_bare = _sl_bare_row(cfg) + if sl_bare is not None: + bare_rows.append(sl_bare) + _print_bare_table(bare_rows) + + delta_rows = [] + if cfg.include_prompt_baseline: + r = _ws_prompt_row(cfg) + if r is not None: + delta_rows.append(r) + calib = _load_ws_calib(cfg) + for adapter in cfg.adapters: + r = _ws_delta_row(cfg, adapter, calib) + if r is not None: + delta_rows.append(r) + if cfg.include_steering_lite: + for method in cfg.steering_lite_methods: + r = _sl_delta_row(cfg, method) + if r is not None: + delta_rows.append(r) + _print_delta_table(delta_rows) if __name__ == "__main__":