scripts: tinymfv comparison table + calibrated eval wrapper

- ws.scripts.readme_tinymfv_table: builds a cue / axis_shift / per-foundation Δlogit table that combines ws adapter rows (loaded from out/trad_care/<adapter>/*__foundations_dlogit.csv) with steering-lite's frozen baselines (loaded from lite/steering-lite/outputs/tinymfv_sweep/*.json). Same axis, same metric, same iso-KL footprint -> directly comparable.
- ws.scripts.eval_tinymfv_calibrated: thin launcher that reads out/<behavior>/kl_calibration/summary.csv and runs ws.eval.tinymfv_airisk once per adapter with `--coeffs -alpha_neg 0.0 +alpha_pos`. This is necessary because the calibrated pos/neg alphas are asymmetric per adapter.
2026-06-27 18:27:18 +08:00 · 2026-05-02 19:47:09 +08:00
parent f866618eac
commit aa0b07451d
2 changed files with 274 additions and 0 deletions
@@ -0,0 +1,95 @@
+"""Run tiny-mfv airisk eval per-adapter at iso-KL calibrated alphas.
+
+Reads `out/<behavior>/kl_calibration/summary.csv` (produced by `ws.kl_calibrate`)
+and invokes `ws.eval.tinymfv_airisk` once per adapter with --coeffs
+-alpha_neg 0.0 +alpha_pos. Each run writes its own per-frame / per-vignette /
+foundations / Δlogit CSVs under `out/<behavior>/<adapter>/`, which are then
+consumed by `ws.scripts.readme_tinymfv_table`.
+
+Why a wrapper: kl_calibrate produces asymmetric alpha_pos / alpha_neg per
+adapter (steering directions don't have symmetric KL footprint). The base
+eval module takes a single `coeffs` tuple, so we read the calibrated values
+and forward them as a CLI list -- one process per adapter so signs are clean.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+import polars as pl
+import tyro
+from loguru import logger
+
+
+@dataclass
+class EvalTinymfvCalibratedCfg:
+    # CLI configuration for the calibrated tiny-mfv eval launcher (parsed by tyro).
+
+    # Behavior name; selects the `<out>/<behavior>/` subtree and is forwarded as --behavior.
+    behavior: str = "trad_care"
+    # Root output directory that contains `<behavior>/kl_calibration/summary.csv`.
+    out: Path = Path("out")
+    # Adapters to evaluate; each is looked up in the summary as method `dW:<adapter>`.
+    adapters: tuple[str, ...] = ("lora", "dora", "pissa", "delora", "oft", "ia3")
+    # Forwarded to the eval module as --model.
+    model: str = "Qwen/Qwen3-0.6B"
+    # Forwarded to the eval module as --bootstrap-samples.
+    bootstrap_samples: int = 256
+    # Forwarded as --limit only when > 0; 0 omits the flag entirely.
+    limit: int = 0
+    # Forwarded to the eval module as --batch-size.
+    batch_size: int = 16
+    # When true, also run one prompt-baseline eval after the adapter runs.
+    include_prompt_baseline: bool = True
+
+
+def _run(cmd: list[str]) -> int:
+    """Log `cmd` in shell-quoted form, run it as a child process, return its exit code.
+
+    Args:
+        cmd: argv list passed to `subprocess.call` unchanged (no shell involved).
+
+    Returns:
+        The child's return code as reported by `subprocess.call`.
+    """
+    import shlex  # local import: only needed to render the log line
+
+    # shlex.join quotes each argument, so empty strings (e.g. `--adapter ''`)
+    # and arguments containing spaces stay visible and copy-pasteable in the log.
+    logger.info(f"$ {shlex.join(cmd)}")
+    return subprocess.call(cmd)
+
+
+def _eval_cmd(
+    cfg: EvalTinymfvCalibratedCfg,
+    adapter: str,
+    coeffs: list[str],
+    extra: tuple[str, ...] = (),
+) -> list[str]:
+    """Build the `ws.eval.tinymfv_airisk` argv shared by adapter and prompt-baseline runs."""
+    cmd = [
+        "uv", "run", "python", "-m", "ws.eval.tinymfv_airisk",
+        "--model", cfg.model,
+        "--behavior", cfg.behavior,
+        "--adapter", adapter,
+        *extra,
+        "--coeffs", *coeffs,
+        "--batch-size", str(cfg.batch_size),
+        "--bootstrap-samples", str(cfg.bootstrap_samples),
+    ]
+    # limit <= 0 means "no limit": the flag is omitted rather than passed as 0.
+    if cfg.limit > 0:
+        cmd += ["--limit", str(cfg.limit)]
+    return cmd
+
+
+def main(cfg: EvalTinymfvCalibratedCfg) -> None:
+    """Run the tiny-mfv eval once per adapter at its calibrated alphas.
+
+    Reads `<out>/<behavior>/kl_calibration/summary.csv`, looks up the row whose
+    `method` is `dW:<adapter>` for each configured adapter, and launches the
+    eval module with `--coeffs -alpha_neg 0.0 +alpha_pos`. Optionally runs one
+    prompt-baseline eval afterwards.
+
+    Every requested run is attempted even if an earlier one fails; failures are
+    logged as they happen and reported together at the end.
+
+    Raises:
+        SystemExit: if the calibration summary is missing, or (after all runs
+            were attempted) if any child eval exited with a non-zero code.
+    """
+    summary_path = cfg.out / cfg.behavior / "kl_calibration" / "summary.csv"
+    if not summary_path.exists():
+        sys.exit(f"missing kl_calibration summary at {summary_path} -- run ws.kl_calibrate first")
+    summary = pl.read_csv(summary_path)
+
+    # One calibration row per method; if a method repeats, the last row wins.
+    by_method = {row["method"]: row for row in summary.to_dicts()}
+
+    failed: list[str] = []
+
+    for adapter in cfg.adapters:
+        key = f"dW:{adapter}"
+        row = by_method.get(key)
+        if row is None:
+            logger.warning(f"no calibration for {key}; skipping")
+            continue
+        # An empty CSV cell is read as None; float(None) would raise a TypeError.
+        if row["alpha_pos"] is None or row["alpha_neg"] is None:
+            logger.warning(f"calibration for {key} has a null alpha; skipping")
+            continue
+        alpha_pos = float(row["alpha_pos"])
+        alpha_neg = float(row["alpha_neg"])
+        coeffs = [-alpha_neg, 0.0, alpha_pos]
+        logger.info(f"=== {adapter}: alpha_pos={alpha_pos:+.3f} alpha_neg={alpha_neg:+.3f} ===")
+        rc = _run(_eval_cmd(cfg, adapter, [f"{c:+.6f}" for c in coeffs]))
+        if rc != 0:
+            logger.error(f"adapter {adapter} eval exited with rc={rc}")
+            failed.append(adapter)
+
+    if cfg.include_prompt_baseline:
+        logger.info("=== prompt baseline (engineered_prompt_traditional vs engineered_prompt_caring) ===")
+        rc = _run(_eval_cmd(
+            cfg,
+            "",
+            ["-1.0", "0.0", "+1.0"],
+            extra=(
+                "--prompt-baseline",
+                "--prompt-pos", "engineered_prompt_traditional",
+                "--prompt-neg", "engineered_prompt_caring",
+            ),
+        ))
+        if rc != 0:
+            logger.error(f"prompt baseline eval exited with rc={rc}")
+            failed.append("prompt baseline")
+
+    # Surface child failures in the launcher's own exit status so callers
+    # (shell pipelines, CI) do not mistake a partially failed sweep for success.
+    if failed:
+        sys.exit(f"{len(failed)} eval run(s) failed: {', '.join(failed)}")
+
+
+# Script entry point: build EvalTinymfvCalibratedCfg from the command line via tyro, then run.
+if __name__ == "__main__":
+    main(tyro.cli(EvalTinymfvCalibratedCfg))
@@ -0,0 +1,179 @@
+"""README-ready tiny-mfv table: ws adapters + steering-lite baselines side-by-side.
+
+Same axis (Care vs Traditional/Sanctity), same metric (axis_shift in nats),
+same paired-by-(vid,cond) per-foundation Δlogit. ws rows are read from
+`out/trad_care/{adapter|base}/*__foundations_dlogit.csv` (the eval already
+computes them); steering-lite rows are read from
+`/media/wassname/SGIronWolf/projects5/2026/lite/steering-lite/outputs/tinymfv_sweep/*.json`.
+
+NB: ws weight-steering uses iso-KL calibrated alpha (target_kl=1.0 nat); the
+steering-lite calibration is the same target. Both repos' rows are therefore
+at matched KL footprint, so axis_shift is directly comparable. The ws
+prompt_only row (alpha=+1, no calibration) and steering-lite's prompt_only
+row are the only un-calibrated points -- they're included for context.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+import polars as pl
+import tyro
+from tabulate import tabulate
+
+from ws._artifacts import latest_matching
+
+
+# Foundation names in the order their columns appear in the emitted table.
+# These are the keys looked up in the per-foundation CSV / JSON data.
+FOUNDATION_ORDER = ["Care", "Sanctity", "Authority", "Loyalty", "Fairness", "Liberty", "Social Norms"]
+# Full foundation name -> short column header used in the table.
+FOUNDATION_SHORT = {
+    "Care": "Care", "Sanctity": "Sanc", "Authority": "Auth",
+    "Loyalty": "Loy", "Fairness": "Fair", "Liberty": "Lib", "Social Norms": "SocN",
+}
+
+
+@dataclass
+class ReadmeTinymfvCfg:
+    # CLI configuration for the README tiny-mfv table (parsed by tyro).
+
+    # Behavior name; eval artefacts are read from `<out>/<behavior>/<adapter>/`.
+    behavior: str = "trad_care"
+    # Root output directory of the ws evals.
+    out: Path = Path("out")
+    # Adapters to emit as `ws:<adapter>` rows; directories that do not exist are skipped.
+    adapters: tuple[str, ...] = ("lora", "dora", "pissa", "delora", "oft", "ia3")
+    # Emit a "ws:bare (abs logit)" row taken from the first adapter with alpha=0 data.
+    include_base: bool = True
+    # Emit a "ws:prompt_only" row read from `<out>/<behavior>/base/`.
+    include_prompt_baseline: bool = True
+    # steering-lite checkout; JSONs are read from `<root>/outputs/tinymfv_sweep/<method>.json`.
+    steering_lite_root: Path = Path("/media/wassname/SGIronWolf/projects5/2026/lite/steering-lite")
+    # steering-lite methods to include, in table order; missing JSON files are skipped.
+    steering_lite_methods: tuple[str, ...] = (
+        "bare", "prompt_only", "mean_diff", "mean_centred",
+        "pca", "sspace", "cosine_gated", "topk_clusters",
+    )
+
+
+def _cue(axis: float) -> str:
+    """Map an axis_shift value to a traffic-light emoji based on its magnitude.
+
+    NaN -> white; |axis| > 0.5 -> green; |axis| > 0.15 -> yellow; otherwise red.
+    Both thresholds are strict, so exactly 0.5 is yellow and exactly 0.15 is red.
+    """
+    # NaN is the only float that compares unequal to itself.
+    if axis != axis:
+        return "⚪"
+    magnitude = abs(axis)
+    # Thresholds are checked from largest to smallest; the first one exceeded wins.
+    for floor, emoji in ((0.5, "🟢"), (0.15, "🟡")):
+        if magnitude > floor:
+            return emoji
+    return "🔴"
+
+
+def _load_ws_row(cfg: ReadmeTinymfvCfg, adapter_dir: Path, label: str, alpha: float = 1.0) -> dict | None:
+    """Read the latest eval artefacts in `adapter_dir` and build one table row.
+
+    Args:
+        cfg: unused here; kept so the call signature stays stable.
+        adapter_dir: directory holding `*__summary.csv` and `*__foundations_dlogit.csv`.
+        label: value for the row's "row" column.
+        alpha: steering coefficient whose rows are selected from both CSVs.
+
+    Returns:
+        A dict with "row", "axis_shift", "cue", "n_vig" and one entry per short
+        foundation name (NaN where the foundation is absent), or None when
+        either artefact is missing or the summary has no row at `alpha`.
+    """
+    try:
+        summary_path = latest_matching(adapter_dir, "*__summary.csv")
+        dlogit_path = latest_matching(adapter_dir, "*__foundations_dlogit.csv")
+    except FileNotFoundError:
+        return None
+    # Match alpha with a small tolerance instead of exact float equality: the
+    # column has been through a CSV text round-trip, so a value that is not
+    # exactly representable could otherwise fail to match itself.
+    # NOTE(review): callers in this file pass alpha=1.0, while the calibrated
+    # launcher runs evals at -alpha_neg / 0 / +alpha_pos. If the summary's
+    # `alpha` column stores those raw coefficients, this filter finds nothing
+    # and the row is silently dropped -- confirm against the eval's output.
+    at_alpha = (pl.col("alpha") - alpha).abs() <= 1e-6
+    sub = pl.read_csv(summary_path).filter(at_alpha)
+    if sub.is_empty():
+        return None
+    axis = float(sub["axis_shift"][0])
+    # The dlogit CSV is only read once the summary is known to have a match.
+    sub_d = pl.read_csv(dlogit_path).filter(at_alpha)
+    by_f = {rec["foundation_coarse"]: rec["dlogit_mean"] for rec in sub_d.to_dicts()}
+    row = {"row": label, "axis_shift": axis, "cue": _cue(axis), "n_vig": int(sub["n_vignettes"][0])}
+    for f in FOUNDATION_ORDER:
+        row[FOUNDATION_SHORT[f]] = by_f.get(f, float("nan"))
+    return row
+
+
+def _steering_lite_row(label: str, axis: float, per_foundation: dict) -> dict:
+    """Build one table row from a steering-lite `{foundation: {"mean", "n"}}` mapping."""
+    # n_vig is the integer mean of the per-foundation "n" counts (0 when empty).
+    n_vig = sum(int(d.get("n", 0)) for d in per_foundation.values()) // max(1, len(per_foundation))
+    row = {"row": label, "axis_shift": axis, "cue": _cue(axis), "n_vig": n_vig}
+    for f in FOUNDATION_ORDER:
+        row[FOUNDATION_SHORT[f]] = per_foundation.get(f, {}).get("mean", float("nan"))
+    return row
+
+
+def _load_steering_lite_row(json_path: Path) -> dict | None:
+    """Load one steering-lite sweep JSON and convert it to a table row.
+
+    Two payload shapes are recognised:
+      * "axis_shift" + "dlogit_per_foundation": a steered method; the row is
+        labelled `sl:<method>` and carries per-foundation Δlogit means.
+      * "absolute_logit_per_foundation" only: an unsteered run with no Δ; the
+        row is labelled `sl:<method> (abs logit)` with a NaN axis_shift.
+
+    Returns:
+        The row dict, or None if the file is missing or matches neither shape.
+    """
+    if not json_path.exists():
+        return None
+    # Decode explicitly as UTF-8 instead of relying on the locale's default
+    # encoding, which is not UTF-8 on every platform.
+    data = json.loads(json_path.read_text(encoding="utf-8"))
+    method = data.get("method", json_path.stem)
+    if "axis_shift" in data and "dlogit_per_foundation" in data:
+        return _steering_lite_row(f"sl:{method}", float(data["axis_shift"]), data["dlogit_per_foundation"])
+    # bare.json has absolute_logit_per_foundation, no Δ
+    if "absolute_logit_per_foundation" in data:
+        # A NaN axis makes _cue return the neutral white marker.
+        return _steering_lite_row(
+            f"sl:{method} (abs logit)", float("nan"), data["absolute_logit_per_foundation"]
+        )
+    return None
+
+
+def _first_ws_bare_row(cfg: ReadmeTinymfvCfg) -> dict | None:
+    """Return the unsteered (alpha=0) absolute-logit row from the first adapter that has one."""
+    for name in cfg.adapters:
+        adapter_dir = cfg.out / cfg.behavior / name
+        if not adapter_dir.exists():
+            continue
+        try:
+            foundations_csv = latest_matching(adapter_dir, "*__foundations.csv")
+        except FileNotFoundError:
+            continue
+        baseline = pl.read_csv(foundations_csv).filter(pl.col("alpha") == 0.0)
+        if baseline.is_empty():
+            continue
+        records = baseline.to_dicts()
+        prior = {rec["foundation_coarse"]: rec["wrongness_logit"]
+                 for rec in records if "wrongness_logit" in rec}
+        if not prior:
+            # fallback: use mean wrongness column
+            prior = {rec["foundation_coarse"]: rec.get("wrongness", float("nan"))
+                     for rec in records}
+        n_vig = int(baseline["n_vignettes"].sum()) if "n_vignettes" in baseline.columns else 0
+        # axis_shift is a Δ relative to alpha=0, so it carries no information
+        # for the alpha=0 row itself; it is reported as NaN with a neutral cue.
+        bare = {"row": "ws:bare (abs logit)", "axis_shift": float("nan"),
+                "cue": "⚪", "n_vig": n_vig}
+        bare.update({FOUNDATION_SHORT[f]: prior.get(f, float("nan")) for f in FOUNDATION_ORDER})
+        return bare
+    return None
+
+
+def main(cfg: ReadmeTinymfvCfg) -> None:
+    """Print a markdown table of ws rows followed by steering-lite rows.
+
+    Row order: ws bare prior (optional), ws prompt-only baseline (optional),
+    one row per ws adapter, then one row per steering-lite method. Sources
+    that are missing on disk are skipped; if nothing is found a short notice
+    is printed instead of a table.
+    """
+    rows: list[dict] = []
+
+    # ws bare row: the model's prior, taken from any adapter's alpha=0 data.
+    if cfg.include_base:
+        bare = _first_ws_bare_row(cfg)
+        if bare is not None:
+            rows.append(bare)
+
+    # (directory, label) for every ws row that goes through _load_ws_row:
+    # the prompt-only baseline first, then each adapter in configured order.
+    ws_sources: list[tuple[Path, str]] = []
+    if cfg.include_prompt_baseline:
+        ws_sources.append((cfg.out / cfg.behavior / "base", "ws:prompt_only"))
+    ws_sources.extend((cfg.out / cfg.behavior / name, f"ws:{name}") for name in cfg.adapters)
+
+    for directory, label in ws_sources:
+        if not directory.exists():
+            continue
+        loaded = _load_ws_row(cfg, directory, label, alpha=1.0)
+        if loaded is not None:
+            rows.append(loaded)
+
+    # steering-lite rows (frozen baselines)
+    sweep_dir = cfg.steering_lite_root / "outputs" / "tinymfv_sweep"
+    for method in cfg.steering_lite_methods:
+        loaded = _load_steering_lite_row(sweep_dir / f"{method}.json")
+        if loaded is not None:
+            rows.append(loaded)
+
+    if not rows:
+        print("no rows to emit -- have any tiny-mfv evals run?")
+        return
+
+    short_names = [FOUNDATION_SHORT[f] for f in FOUNDATION_ORDER]
+    cols = ["cue", "row", "axis_shift", "n_vig"] + short_names
+    df = pl.DataFrame(rows).select(cols)
+
+    print("\n## OOD: tiny-mfv Care-vs-Traditional axis (directly comparable to steering-lite)\n")
+    print("axis_shift = ΔlogitSanctity − ΔlogitCare (nats). +ve = moved toward "
+          "traditional/binding; -ve = toward care. Per-foundation Δlogit is "
+          "paired by (vid, cond) vs the unsteered (alpha=0) baseline. "
+          "🟢 |axis|>0.5  🟡 >0.15  🔴 below noise.\n")
+    table = tabulate(df.to_pandas(), headers="keys", tablefmt="pipe",
+                     floatfmt="+.2f", showindex=False)
+    print(table)
+
+
+# Script entry point: build ReadmeTinymfvCfg from the command line via tyro, then print the table.
+if __name__ == "__main__":
+    main(tyro.cli(ReadmeTinymfvCfg))