docs: simplify model matrix ranking

2026-06-27 16:46:08 +08:00 · 2026-06-25 11:54:06 +08:00
parent 2f62327acc
commit 026b22e131
6 changed files with 233 additions and 490 deletions
@@ -53,14 +53,8 @@ def _std(xs: list[float]) -> float:
    return statistics.stdev(xs)


-def _sem(std: float, n: int) -> float:
-    return std / math.sqrt(n)
-
-
-def _t_stat(mean: float, sem: float) -> float | None:
-    if sem == 0.0:
-        return None
-    return mean / sem
+def _p25(xs: list[float]) -> float:
+    return statistics.quantiles(xs, n=4, method="inclusive")[0]


 def _round(x: float, digits: int = 3) -> float:
@@ -115,17 +109,12 @@ def _summarize(rows: list[dict[str, Any]], group_cols: list[str]) -> list[dict[s
        models = sorted({row["model"] for row in rs})
        base = dict(zip(group_cols, key, strict=True))
        model_count = len(models)
-        score_mean = _mean([float(row["score"]) for row in rs])
-        score_std = _std([float(row["score"]) for row in rs])
-        score_sem = _sem(score_std, model_count)
-        score_t = _t_stat(score_mean, score_sem)
+        scores = [float(row["score"]) for row in rs]
        out.append({
            "model_count": model_count,
-            "score_lcb": _round(score_mean - score_sem, 2),
-            "score_mean": _round(score_mean, 2),
-            "score_std": _round(score_std, 2),
-            "score_sem": _round(score_sem, 2),
-            "score_t": None if score_t is None else _round(score_t, 2),
+            "score_p25": _round(_p25(scores), 2),
+            "score_mean": _round(_mean(scores), 2),
+            "score_std": _round(_std(scores), 2),
            "strict_pass_rate_mean": _round(_mean([float(row["strict_pass_rate"]) for row in rs]), 3),
            "strict_pass_rate_std": _round(_std([float(row["strict_pass_rate"]) for row in rs]), 3),
            "axis_delta_mean": _round(_mean([float(row["mean_axis_delta"]) for row in rs]), 3),
@@ -140,7 +129,7 @@ def _summarize(rows: list[dict[str, Any]], group_cols: list[str]) -> list[dict[s
            "models": ",".join(models),
            **base,
        })
-    return sorted(out, key=lambda row: row["score_lcb"], reverse=True)
+    return sorted(out, key=lambda row: row["score_p25"], reverse=True)


 def _markdown_text(text: str) -> str:
@@ -161,14 +150,10 @@ def _markdown_text(text: str) -> str:
 def _write_markdown(path: Path, template_rows: list[dict[str, Any]], pair_rows: list[dict[str, Any]], top_n: int) -> None:
    top_template_rows = [
        {
-            "score lcb": f"{row['score_lcb']:.2f}",
+            "score p25": f"{row['score_p25']:.2f}",
            "score mean": f"{row['score_mean']:.2f}",
            "score std": f"{row['score_std']:.2f}",
-            "score sem": f"{row['score_sem']:.2f}",
-            "score t": "" if row["score_t"] is None else f"{row['score_t']:.2f}",
            "pass mean": f"{row['strict_pass_rate_mean']:.2f}",
-            "axis mean": f"{row['axis_delta_mean']:.2f}",
-            "off-axis mean": f"{row['off_axis_problem_mean']:.2f}",
            "echo rate": f"{row['persona_echo_rate_mean']:.2f}",
            "refusal rate": f"{row['refusal_or_ai_break_rate_mean']:.2f}",
            "models": row["model_count"],
@@ -176,39 +161,17 @@ def _write_markdown(path: Path, template_rows: list[dict[str, Any]], pair_rows:
        }
        for row in template_rows[:top_n]
    ]
-    top_pair_rows = [
-        {
-            "score lcb": f"{row['score_lcb']:.2f}",
-            "score mean": f"{row['score_mean']:.2f}",
-            "score std": f"{row['score_std']:.2f}",
-            "score sem": f"{row['score_sem']:.2f}",
-            "score t": "" if row["score_t"] is None else f"{row['score_t']:.2f}",
-            "pass mean": f"{row['strict_pass_rate_mean']:.2f}",
-            "axis mean": f"{row['axis_delta_mean']:.2f}",
-            "off-axis mean": f"{row['off_axis_problem_mean']:.2f}",
-            "echo rate": f"{row['persona_echo_rate_mean']:.2f}",
-            "refusal rate": f"{row['refusal_or_ai_break_rate_mean']:.2f}",
-            "models": row["model_count"],
-            "axis": f"`{row['persona_pair']}`",
-            "template": _markdown_text(row["template"]),
-        }
-        for row in pair_rows[:top_n]
-    ]
    lines = [
        "# Refusal Probe Model Matrix",
        "",
-        "Scores are model-equal. Each model first averages the two refusal-probe axes per template, then the table reports mean and sample std across clean model artifacts.",
+        "Scores are model-equal. Each model first averages the two refusal-probe axes per template, then the table reports reliability-sorted template rows across clean model artifacts.",
        "",
-        "## Top Templates",
+        "## All Templates",
+        "",
+        "`score p25` is the 25th percentile score across the four clean model artifacts. Rows are sorted by this column.",
        "",
        tabulate(top_template_rows, headers="keys", tablefmt="github", disable_numparse=True),
    ]
-    lines.extend([
-        "",
-        "## Top Template-Axis Cells",
-        "",
-        tabulate(top_pair_rows, headers="keys", tablefmt="github", disable_numparse=True),
-    ])
    path.write_text("\n".join(lines) + "\n")


@@ -255,7 +218,7 @@ def _plot(path: Path, rows: list[dict[str, Any]], label_count: int) -> None:
    ax.text(
        1.0,
        -0.13,
-        "error bars are model SEM; point numbers match the top-template table",
+        "error bars are model SEM; point numbers match the first table rows",
        transform=ax.transAxes,
        ha="right",
        fontsize=8,
@@ -37,13 +37,10 @@ def _markdown_text(text: str) -> str:
 def _table(rows: list[dict], top_n: int) -> str:
    table_rows = [
        {
-            "score lcb": f"{row['score_lcb']:.2f}",
+            "score p25": f"{row['score_p25']:.2f}",
            "score mean": f"{row['score_mean']:.2f}",
            "score std": f"{row['score_std']:.2f}",
-            "score t": "" if row["score_t"] is None else f"{row['score_t']:.2f}",
            "pass mean": f"{row['strict_pass_rate_mean']:.2f}",
-            "axis mean": f"{row['axis_delta_mean']:.2f}",
-            "off-axis mean": f"{row['off_axis_problem_mean']:.2f}",
            "echo rate": f"{row['persona_echo_rate_mean']:.2f}",
            "refusal rate": f"{row['refusal_or_ai_break_rate_mean']:.2f}",
            "template": _markdown_text(row["template"]),
@@ -53,18 +50,6 @@ def _table(rows: list[dict], top_n: int) -> str:
    return tabulate(table_rows, headers="keys", tablefmt="github", disable_numparse=True)


-def _full_ranked_block(summary_path: Path) -> str:
-    rows = _read_jsonl(summary_path)
-    return "\n\n".join([
-        "## Appendix: Full Refusal Probe Model Matrix",
-        (
-            "`score lcb` is `score mean - score sem`, a one-standard-error lower score. "
-            "Rows are sorted by this reliability-weighted score; `score t` is `mean / sem`."
-        ),
-        _table(rows, top_n=len(rows)),
-    ])
-
-
 def _block(summary_path: Path) -> str:
    rows = _read_jsonl(summary_path)
    return "\n\n".join([
@@ -78,17 +63,23 @@ def _block(summary_path: Path) -> str:
        (
            "This table reports mean and sample std across models. Each model first averages "
            "the two probe axes for a template, so this is model-equal rather than row-equal. "
-            "`score lcb` is the headline sort because it penalizes model-to-model instability. "
-            "High std, persona echo, and refusal rate are warnings, not secondary scores."
+            "`score p25` is the headline sort: it is the 25th percentile score across the "
+            "four clean model artifacts, so a template has to work on more than one model to rank well."
        ),
        "![refusal probe model matrix](./out/model_matrix/refusal_probe_seed24_n1_model_matrix.png)",
-        "Top model-matrix templates:",
-        _table(rows, top_n=10),
+        (
+            "Caption: each dot is one template. Right is more on-axis movement; lower is less "
+            "off-axis confounding. Black dots have at least one strict-pass template-axis cell; "
+            "grey dots have none. Numbered dots are the first rows of the table. Error bars show "
+            "model SEM for those numbered rows only."
+        ),
+        "Model-matrix templates, all rows:",
+        _table(rows, top_n=len(rows)),
        (
            "Interpretation: some explicit judgment framings and red-team/eval framings move "
            "the hard axis more often than the gentle templates, but they frequently do so "
            "with persona echo or model-specific behavior. The cleanest-looking single-axis "
-            "cells were often `protocol_harm`, so treat the top rows as rerun candidates "
+            "cells were often `protocol_harm`, so treat the high rows as rerun candidates "
            "rather than settled reusable defaults."
        ),
        "Excluded attempted models:",