From afbfbf514f9b03227a67be3e9bd14136df447ae3 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Thu, 25 Jun 2026 13:23:34 +0800
Subject: [PATCH] docs: add interactive refusal tables
---
.gitignore | 2 +
README.md | 16 +--
README.qmd | 18 +--
out/on_off_axis.svg | 2 +-
pyproject.toml | 2 +
scripts/docs_results.py | 7 ++
scripts/readme_plot.py | 1 +
scripts/summarize_model_matrix.py | 6 +-
scripts/update_readme_model_matrix.py | 150 +++++++++++++++++++++++--
scripts/update_readme_results_table.py | 6 +-
uv.lock | 43 ++++++-
11 files changed, 216 insertions(+), 37 deletions(-)
diff --git a/.gitignore b/.gitignore
index 9d4359a..457bae1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,5 @@ docs/_site/
**/.quarto/
**/*.quarto_ipynb
docs/.gitignore
+
+/.quarto/
diff --git a/README.md b/README.md
index 12c4137..9bc820e 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Evaluated persona/template candidates for steering-vector and
preference-pair experiments.
Dataset:
-https://huggingface.co/datasets/wassname/persona-steering-template-library
+[wassname/persona-steering-template-library](https://huggingface.co/datasets/wassname/persona-steering-template-library)
## What This Measures
@@ -169,13 +169,13 @@ just --list
This library samples from or was shaped by:
-- repeng: https://github.com/vgel/repeng
-- Persona Vectors: https://github.com/safety-research/persona_vectors
-- Assistant Axis: https://github.com/safety-research/assistant-axis
-- weight-steering: https://github.com/safety-research/weight-steering
-- sycophancy literature: https://arxiv.org/abs/2310.13548
-- OLMo 3 report: https://arxiv.org/abs/2512.13961
-- wassname/AntiPaSTO: https://github.com/wassname/AntiPaSTO
+- [repeng](https://github.com/vgel/repeng)
+- [Persona Vectors](https://github.com/safety-research/persona_vectors)
+- [Assistant Axis](https://github.com/safety-research/assistant-axis)
+- [weight-steering](https://github.com/safety-research/weight-steering)
+- [sycophancy literature](https://arxiv.org/abs/2310.13548)
+- [OLMo 3 report](https://arxiv.org/abs/2512.13961)
+- [wassname/AntiPaSTO](https://github.com/wassname/AntiPaSTO)
- annotated guide:
[`docs/persona_prompt_prior_art.md`](docs/persona_prompt_prior_art.md)
- full inventory:
diff --git a/README.qmd b/README.qmd
index 458882e..4ff5bc6 100644
--- a/README.qmd
+++ b/README.qmd
@@ -14,7 +14,7 @@ execute:
Evaluated persona/template candidates for steering-vector and preference-pair experiments.
-Dataset: https://huggingface.co/datasets/wassname/persona-steering-template-library
+Dataset: [wassname/persona-steering-template-library](https://huggingface.co/datasets/wassname/persona-steering-template-library)
```{python}
#| output: asis
@@ -171,13 +171,13 @@ just --list
This library samples from or was shaped by:
-- repeng: https://github.com/vgel/repeng
-- Persona Vectors: https://github.com/safety-research/persona_vectors
-- Assistant Axis: https://github.com/safety-research/assistant-axis
-- weight-steering: https://github.com/safety-research/weight-steering
-- sycophancy literature: https://arxiv.org/abs/2310.13548
-- OLMo 3 report: https://arxiv.org/abs/2512.13961
-- wassname/AntiPaSTO: https://github.com/wassname/AntiPaSTO
+- [repeng](https://github.com/vgel/repeng)
+- [Persona Vectors](https://github.com/safety-research/persona_vectors)
+- [Assistant Axis](https://github.com/safety-research/assistant-axis)
+- [weight-steering](https://github.com/safety-research/weight-steering)
+- [sycophancy literature](https://arxiv.org/abs/2310.13548)
+- [OLMo 3 report](https://arxiv.org/abs/2512.13961)
+- [wassname/AntiPaSTO](https://github.com/wassname/AntiPaSTO)
- annotated guide: [`docs/persona_prompt_prior_art.md`](docs/persona_prompt_prior_art.md)
- full inventory: [`data/template_catalog.yaml`](data/template_catalog.yaml)
@@ -301,5 +301,5 @@ print(results_table._appendix_block())
```{python}
#| output: asis
-print(model_matrix._appendix_block(model_matrix.SUMMARY))
+print(model_matrix.appendix_block())
```
diff --git a/out/on_off_axis.svg b/out/on_off_axis.svg
index 0ee010e..aa3501f 100644
--- a/out/on_off_axis.svg
+++ b/out/on_off_axis.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index ee5ac9b..684e10f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,8 @@ dependencies = [
"nbformat>=5.10.4",
"plotly>=6.0.0",
"kaleido>=1.3.0",
+ "itables>=2.8.1",
+ "polars>=1.41.2",
]
[tool.uv]
diff --git a/scripts/docs_results.py b/scripts/docs_results.py
index 2efcd68..e1efac2 100644
--- a/scripts/docs_results.py
+++ b/scripts/docs_results.py
@@ -22,11 +22,18 @@ REFUSAL_MODEL_PAIR_STATS = [
]
REFUSAL_MODEL_PREFIX = ROOT / "out/model_matrix/refusal_probe_seed24_n1"
+ANTHROPIC_IF2_COMMENT = ""
+ANTHROPIC_IF2_LABEL = "Anthropic/if-2 instruction-following eval:"
+
def read_jsonl(path: Path) -> list[dict[str, Any]]:
return [json.loads(line) for line in path.read_text().splitlines() if line.strip()]
+def display_template_text(text: str) -> str:
+ return text.replace(ANTHROPIC_IF2_COMMENT, ANTHROPIC_IF2_LABEL)
+
+
def clamp01(x: float) -> float:
return max(0.0, min(1.0, x))
diff --git a/scripts/readme_plot.py b/scripts/readme_plot.py
index 314db74..8f6cb43 100644
--- a/scripts/readme_plot.py
+++ b/scripts/readme_plot.py
@@ -14,6 +14,7 @@ MAIN_SVG = docs_results.ROOT / "out/on_off_axis.svg"
def _wrap_hover(text: str, width: int = 62) -> str:
+ text = docs_results.display_template_text(text)
escaped = html.escape(" ".join(text.split()))
return "<br>".join(
textwrap.wrap(escaped, width=width, break_long_words=True, break_on_hyphens=False))
diff --git a/scripts/summarize_model_matrix.py b/scripts/summarize_model_matrix.py
index 2174424..375c378 100644
--- a/scripts/summarize_model_matrix.py
+++ b/scripts/summarize_model_matrix.py
@@ -141,11 +141,7 @@ def _summarize(rows: list[dict[str, Any]], group_cols: list[str]) -> list[dict[s
def _markdown_text(text: str) -> str:
- if "" in text:
- text = text.replace(
- "",
- "Anthropic/if-2 instruction-following eval:",
- )
+ text = docs_results.display_template_text(text)
text = text.replace("{persona}", "`{persona}`")
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")
diff --git a/scripts/update_readme_model_matrix.py b/scripts/update_readme_model_matrix.py
index 1d58c64..b578de8 100644
--- a/scripts/update_readme_model_matrix.py
+++ b/scripts/update_readme_model_matrix.py
@@ -1,13 +1,17 @@
from __future__ import annotations
+import html
import json
+import os
from pathlib import Path
from tabulate import tabulate
+import docs_results
ROOT = Path(__file__).resolve().parents[1]
SUMMARY = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_model_summary.jsonl"
+PAIR_SUMMARY = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_pair_model_summary.jsonl"
def _read_jsonl(path: Path) -> list[dict]:
@@ -15,11 +19,7 @@ def _read_jsonl(path: Path) -> list[dict]:
def _markdown_text(text: str) -> str:
- if "" in text:
- text = text.replace(
- "",
- "Anthropic/if-2 instruction-following eval:",
- )
+ text = docs_results.display_template_text(text)
text = text.replace("{persona}", "`{persona}`")
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")
@@ -42,8 +42,7 @@ def _appendix_table(rows: list[dict]) -> str:
return tabulate(table_rows, headers="keys", tablefmt="github", disable_numparse=True)
-def _appendix_block(summary_path: Path) -> str:
- rows = _read_jsonl(summary_path)
+def _appendix_intro() -> str:
return "\n\n".join([
"## Appendix: Refusal-Pole Probe",
(
@@ -67,12 +66,147 @@ def _appendix_block(summary_path: Path) -> str:
"[out/model_matrix/refusal_probe_seed24_n1_model_matrix_summary.md]"
"(out/model_matrix/refusal_probe_seed24_n1_model_matrix_summary.md)."
),
+ ])
+
+
+def _appendix_block(summary_path: Path) -> str:
+ rows = _read_jsonl(summary_path)
+ return "\n\n".join([
+ _appendix_intro(),
_appendix_table(rows),
])
+def _template_display_text(text: str) -> str:
+ text = docs_results.display_template_text(text)
+ text = " ".join(text.split())
+ return text.replace("{persona}", "{persona}")
+
+
+def _table_styles() -> str:
+ return """
+
+"""
+
+
+def _html_heading(title: str, body: str) -> str:
+ return "\n".join([
+ f"
{html.escape(body)}
", + ]) + + +def _template_table_rows(rows: list[dict]) -> list[dict]: + return [ + { + "score t": row["score_t"], + "score mean": row["score_mean"], + "score std": row["score_std"], + "pass": row["strict_pass_rate_mean"], + "echo": row["persona_echo_rate_mean"], + "refusal": row["refusal_or_ai_break_rate_mean"], + "template": _template_display_text(row["template"]), + } + for row in rows + ] + + +def _pair_table_rows(rows: list[dict]) -> list[dict]: + return [ + { + "score t": row["score_t"], + "score mean": row["score_mean"], + "score std": row["score_std"], + "pass": row["strict_pass_rate_mean"], + "echo": row["persona_echo_rate_mean"], + "refusal": row["refusal_or_ai_break_rate_mean"], + "persona_pair": row["persona_pair"], + "template": _template_display_text(row["template"]), + } + for row in rows + ] + + +def _datatable_html(rows: list[dict], table_id: str) -> str: + import polars as pl + from itables import to_html_datatable + + df = pl.DataFrame(rows) + return "\n".join([ + f'