From afbfbf514f9b03227a67be3e9bd14136df447ae3 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Thu, 25 Jun 2026 13:23:34 +0800 Subject: [PATCH] docs: add interactive refusal tables --- .gitignore | 2 + README.md | 16 +-- README.qmd | 18 +-- out/on_off_axis.svg | 2 +- pyproject.toml | 2 + scripts/docs_results.py | 7 ++ scripts/readme_plot.py | 1 + scripts/summarize_model_matrix.py | 6 +- scripts/update_readme_model_matrix.py | 150 +++++++++++++++++++++++-- scripts/update_readme_results_table.py | 6 +- uv.lock | 43 ++++++- 11 files changed, 216 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index 9d4359a..457bae1 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ docs/_site/ **/.quarto/ **/*.quarto_ipynb docs/.gitignore + +/.quarto/ diff --git a/README.md b/README.md index 12c4137..9bc820e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Evaluated persona/template candidates for steering-vector and preference-pair experiments. Dataset: -https://huggingface.co/datasets/wassname/persona-steering-template-library +[wassname/persona-steering-template-library](https://huggingface.co/datasets/wassname/persona-steering-template-library) ## What This Measures @@ -169,13 +169,13 @@ just --list This library samples from or was shaped by: -- repeng: https://github.com/vgel/repeng -- Persona Vectors: https://github.com/safety-research/persona_vectors -- Assistant Axis: https://github.com/safety-research/assistant-axis -- weight-steering: https://github.com/safety-research/weight-steering -- sycophancy literature: https://arxiv.org/abs/2310.13548 -- OLMo 3 report: https://arxiv.org/abs/2512.13961 -- wassname/AntiPaSTO: https://github.com/wassname/AntiPaSTO +- [repeng](https://github.com/vgel/repeng) +- [Persona Vectors](https://github.com/safety-research/persona_vectors) +- [Assistant Axis](https://github.com/safety-research/assistant-axis) +- [weight-steering](https://github.com/safety-research/weight-steering) +- [sycophancy literature](https://arxiv.org/abs/2310.13548) +- [OLMo 3 report](https://arxiv.org/abs/2512.13961) +- [wassname/AntiPaSTO](https://github.com/wassname/AntiPaSTO) - annotated guide: [`docs/persona_prompt_prior_art.md`](docs/persona_prompt_prior_art.md) - full inventory: diff --git a/README.qmd b/README.qmd index 458882e..4ff5bc6 100644 --- a/README.qmd +++ b/README.qmd @@ -14,7 +14,7 @@ execute: Evaluated persona/template candidates for steering-vector and preference-pair experiments. -Dataset: https://huggingface.co/datasets/wassname/persona-steering-template-library +Dataset: [wassname/persona-steering-template-library](https://huggingface.co/datasets/wassname/persona-steering-template-library) ```{python} #| output: asis @@ -171,13 +171,13 @@ just --list This library samples from or was shaped by: -- repeng: https://github.com/vgel/repeng -- Persona Vectors: https://github.com/safety-research/persona_vectors -- Assistant Axis: https://github.com/safety-research/assistant-axis -- weight-steering: https://github.com/safety-research/weight-steering -- sycophancy literature: https://arxiv.org/abs/2310.13548 -- OLMo 3 report: https://arxiv.org/abs/2512.13961 -- wassname/AntiPaSTO: https://github.com/wassname/AntiPaSTO +- [repeng](https://github.com/vgel/repeng) +- [Persona Vectors](https://github.com/safety-research/persona_vectors) +- [Assistant Axis](https://github.com/safety-research/assistant-axis) +- [weight-steering](https://github.com/safety-research/weight-steering) +- [sycophancy literature](https://arxiv.org/abs/2310.13548) +- [OLMo 3 report](https://arxiv.org/abs/2512.13961) +- [wassname/AntiPaSTO](https://github.com/wassname/AntiPaSTO) - annotated guide: [`docs/persona_prompt_prior_art.md`](docs/persona_prompt_prior_art.md) - full inventory: [`data/template_catalog.yaml`](data/template_catalog.yaml) @@ -301,5 +301,5 @@ print(results_table._appendix_block()) ```{python} #| output: asis -print(model_matrix._appendix_block(model_matrix.SUMMARY)) +print(model_matrix.appendix_block()) ``` diff --git a/out/on_off_axis.svg b/out/on_off_axis.svg index 0ee010e..aa3501f 100644 --- a/out/on_off_axis.svg +++ b/out/on_off_axis.svg @@ -1 +1 @@ -1234567891000.20.40.60.8100.20.40.60.81024681012score ton-axis movement, higher is betteroff-axis confounding, lower is betternormal pilot scenarios; one point per measured template \ No newline at end of file +1234567891000.20.40.60.8100.20.40.60.81024681012score ton-axis movement, higher is betteroff-axis confounding, lower is betternormal pilot scenarios; one point per measured template \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ee5ac9b..684e10f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,8 @@ dependencies = [ "nbformat>=5.10.4", "plotly>=6.0.0", "kaleido>=1.3.0", + "itables>=2.8.1", + "polars>=1.41.2", ] [tool.uv] diff --git a/scripts/docs_results.py b/scripts/docs_results.py index 2efcd68..e1efac2 100644 --- a/scripts/docs_results.py +++ b/scripts/docs_results.py @@ -22,11 +22,18 @@ REFUSAL_MODEL_PAIR_STATS = [ ] REFUSAL_MODEL_PREFIX = ROOT / "out/model_matrix/refusal_probe_seed24_n1" +ANTHROPIC_IF2_COMMENT = "" +ANTHROPIC_IF2_LABEL = "Anthropic/if-2 instruction-following eval:" + def read_jsonl(path: Path) -> list[dict[str, Any]]: return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] +def display_template_text(text: str) -> str: + return text.replace(ANTHROPIC_IF2_COMMENT, ANTHROPIC_IF2_LABEL) + + def clamp01(x: float) -> float: return max(0.0, min(1.0, x)) diff --git a/scripts/readme_plot.py b/scripts/readme_plot.py index 314db74..8f6cb43 100644 --- a/scripts/readme_plot.py +++ b/scripts/readme_plot.py @@ -14,6 +14,7 @@ MAIN_SVG = docs_results.ROOT / "out/on_off_axis.svg" def _wrap_hover(text: str, width: int = 62) -> str: + text = docs_results.display_template_text(text) escaped = html.escape(" ".join(text.split())) return "
".join( textwrap.wrap(escaped, width=width, break_long_words=True, break_on_hyphens=False)) diff --git a/scripts/summarize_model_matrix.py b/scripts/summarize_model_matrix.py index 2174424..375c378 100644 --- a/scripts/summarize_model_matrix.py +++ b/scripts/summarize_model_matrix.py @@ -141,11 +141,7 @@ def _summarize(rows: list[dict[str, Any]], group_cols: list[str]) -> list[dict[s def _markdown_text(text: str) -> str: - if "" in text: - text = text.replace( - "", - "Anthropic/if-2 instruction-following eval:", - ) + text = docs_results.display_template_text(text) text = text.replace("{persona}", "`{persona}`") text = text.replace("&", "&") text = text.replace("<", "<") diff --git a/scripts/update_readme_model_matrix.py b/scripts/update_readme_model_matrix.py index 1d58c64..b578de8 100644 --- a/scripts/update_readme_model_matrix.py +++ b/scripts/update_readme_model_matrix.py @@ -1,13 +1,17 @@ from __future__ import annotations +import html import json +import os from pathlib import Path from tabulate import tabulate +import docs_results ROOT = Path(__file__).resolve().parents[1] SUMMARY = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_model_summary.jsonl" +PAIR_SUMMARY = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_pair_model_summary.jsonl" def _read_jsonl(path: Path) -> list[dict]: @@ -15,11 +19,7 @@ def _read_jsonl(path: Path) -> list[dict]: def _markdown_text(text: str) -> str: - if "" in text: - text = text.replace( - "", - "Anthropic/if-2 instruction-following eval:", - ) + text = docs_results.display_template_text(text) text = text.replace("{persona}", "`{persona}`") text = text.replace("&", "&") text = text.replace("<", "<") @@ -42,8 +42,7 @@ def _appendix_table(rows: list[dict]) -> str: return tabulate(table_rows, headers="keys", tablefmt="github", disable_numparse=True) -def _appendix_block(summary_path: Path) -> str: - rows = _read_jsonl(summary_path) +def _appendix_intro() -> str: return "\n\n".join([ "## Appendix: Refusal-Pole Probe", ( @@ -67,12 +66,147 @@ def _appendix_block(summary_path: Path) -> str: "[out/model_matrix/refusal_probe_seed24_n1_model_matrix_summary.md]" "(out/model_matrix/refusal_probe_seed24_n1_model_matrix_summary.md)." ), + ]) + + +def _appendix_block(summary_path: Path) -> str: + rows = _read_jsonl(summary_path) + return "\n\n".join([ + _appendix_intro(), _appendix_table(rows), ]) +def _template_display_text(text: str) -> str: + text = docs_results.display_template_text(text) + text = " ".join(text.split()) + return text.replace("{persona}", "{persona}") + + +def _table_styles() -> str: + return """ + +""" + + +def _html_heading(title: str, body: str) -> str: + return "\n".join([ + f"

{html.escape(title)}

", + f"

{html.escape(body)}

", + ]) + + +def _template_table_rows(rows: list[dict]) -> list[dict]: + return [ + { + "score t": row["score_t"], + "score mean": row["score_mean"], + "score std": row["score_std"], + "pass": row["strict_pass_rate_mean"], + "echo": row["persona_echo_rate_mean"], + "refusal": row["refusal_or_ai_break_rate_mean"], + "template": _template_display_text(row["template"]), + } + for row in rows + ] + + +def _pair_table_rows(rows: list[dict]) -> list[dict]: + return [ + { + "score t": row["score_t"], + "score mean": row["score_mean"], + "score std": row["score_std"], + "pass": row["strict_pass_rate_mean"], + "echo": row["persona_echo_rate_mean"], + "refusal": row["refusal_or_ai_break_rate_mean"], + "persona_pair": row["persona_pair"], + "template": _template_display_text(row["template"]), + } + for row in rows + ] + + +def _datatable_html(rows: list[dict], table_id: str) -> str: + import polars as pl + from itables import to_html_datatable + + df = pl.DataFrame(rows) + return "\n".join([ + f'
', + to_html_datatable( + df, + classes="display compact cell-border stripe", + display_logo_when_loading=False, + paging=True, + pageLength=25, + lengthMenu=[10, 25, 50, 100, -1], + ordering=True, + scrollX=True, + autoWidth=False, + show_dtypes=False, + showIndex=False, + maxBytes=1_000_000, + ), + "
", + ]) + + +def _interactive_appendix_block(summary_path: Path, pair_summary_path: Path) -> str: + template_rows = _read_jsonl(summary_path) + pair_rows = _read_jsonl(pair_summary_path) + refusal_hit_pairs = sorted({ + row["persona_pair"] + for row in pair_rows + if float(row["refusal_or_ai_break_rate_mean"]) > 0.0 + }) + refusal_pair_rows = [ + row for row in pair_rows + if row["persona_pair"] in refusal_hit_pairs + ] + + return "\n\n".join([ + _appendix_intro(), + _table_styles(), + _html_heading( + "All refusal-pole templates", + "Full model-equal template table. Sort by score t, refusal, echo, or pass; search for a template phrase.", + ), + _datatable_html(_template_table_rows(template_rows), "refusal-template-table"), + _html_heading( + "Persona pairs with refusal audit hits, all templates retained", + ( + "This filters persona pairs to those with any refusal-or-AI-break audit hit, " + f"then keeps every template for those pairs. Current pairs: {', '.join(refusal_hit_pairs)}." + ), + ), + _datatable_html(_pair_table_rows(refusal_pair_rows), "refusal-pair-table"), + ]) + + +def appendix_block() -> str: + if os.environ["PSTL_DOC_TARGET"] == "html": + return _interactive_appendix_block(SUMMARY, PAIR_SUMMARY) + return _appendix_block(SUMMARY) + + def main() -> None: - print(_appendix_block(SUMMARY)) + print(appendix_block()) if __name__ == "__main__": diff --git a/scripts/update_readme_results_table.py b/scripts/update_readme_results_table.py index c3c54b9..de1c78c 100644 --- a/scripts/update_readme_results_table.py +++ b/scripts/update_readme_results_table.py @@ -26,11 +26,7 @@ def _score(row: dict) -> float: def _markdown_text(text: str) -> str: if text == "__verbatim_skill_persona__": text = ENGINEERED_DISPLAY - if "" in text: - text = text.replace( - "", - "Anthropic/if-2 instruction-following eval:", - ) + text = docs_results.display_template_text(text) if text == "": return "``" text = text.replace("{{ persona }}", "{persona}") diff --git a/uv.lock b/uv.lock index a38fbec..02b149d 100644 --- a/uv.lock +++ b/uv.lock @@ -7,7 +7,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-06-19T04:58:30.171108401Z" +exclude-newer = "2026-06-19T05:19:42.060161704Z" exclude-newer-span = "P6D" [[package]] @@ -583,6 +583,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, ] +[[package]] +name = "itables" +version = "2.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/0d/e4a935862ee77e06062c6b797357c7aaf9d4ba9a32d6eb129018d0d19be4/itables-2.8.1.tar.gz", hash = "sha256:562c7d716d667f3faf87ffe1044a19747a3b231ee6aa7725eb6f908caa18c429", size = 1526821, upload-time = "2026-06-10T22:28:07.66Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/22/eb6ae7468ba673fcb891ff3142e13ffa18f6a43183e6dd8f224b2b4321d3/itables-2.8.1-py3-none-any.whl", hash = "sha256:262e3908771af90634546fe4a5ed63e0d442a6957efbcdcd2ae5cad4845b76e3", size = 1551238, upload-time = "2026-06-10T22:28:05.09Z" }, +] + [[package]] name = "jedi" version = "0.20.0" @@ -1222,6 +1231,7 @@ dependencies = [ { name = "adjusttext" }, { name = "huggingface-hub" }, { name = "ipykernel" }, + { name = "itables" }, { name = "kaleido" }, { name = "loguru" }, { name = "matplotlib" }, @@ -1229,6 +1239,7 @@ dependencies = [ { name = "nbformat" }, { name = "openai" }, { name = "plotly" }, + { name = "polars" }, { name = "pyarrow" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -1241,6 +1252,7 @@ requires-dist = [ { name = "adjusttext", specifier = ">=1.3.0" }, { name = "huggingface-hub", specifier = ">=1.18.0" }, { name = "ipykernel", specifier = ">=7.3.0" }, + { name = "itables", specifier = ">=2.8.1" }, { name = "kaleido", specifier = ">=1.3.0" }, { name = "loguru" }, { name = "matplotlib", specifier = ">=3.10.0" }, @@ -1248,6 +1260,7 @@ requires-dist = [ { name = "nbformat", specifier = ">=5.10.4" }, { name = "openai" }, { name = "plotly", specifier = ">=6.0.0" }, + { name = "polars", specifier = ">=1.41.2" }, { name = "pyarrow", specifier = ">=24.0.0" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -1376,6 +1389,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/14/abe5ce876ab5b66ee3c691bf537fcd43d037aea55d447aacf74630a8f31e/plotly-6.8.0-py3-none-any.whl", hash = "sha256:13c5c4a0f70b74cab1913eda0de49b826df5931708eb6f9c3010040614700ec8", size = 9902055, upload-time = "2026-06-03T18:33:34.26Z" }, ] +[[package]] +name = "polars" +version = "1.41.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/f9/aeda46259b0669247a160315d2d51269de9504b9dd2f70acadbcb22f46b7/polars-1.41.2.tar.gz", hash = "sha256:256d6731162371b77f3f29a55eacb8c0fc740ddb1a293a01d2ef5b5393c5c708", size = 737996, upload-time = "2026-05-29T17:39:15.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/22/28f62d24f7db56ac4343588f9362d49b7b4177e55ac47a466fe696b0099b/polars-1.41.2-py3-none-any.whl", hash = "sha256:23ce9a2910b6e3e8d4258770bf44aa17170958df7af6e85feedf4458a04d8d29", size = 833445, upload-time = "2026-05-29T17:37:05.576Z" }, +] + +[[package]] +name = "polars-runtime-32" +version = "1.41.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/56/54e3ea0e9b64f327179049e4742241cc6b1d3e8fa414b05a057dd26df367/polars_runtime_32-1.41.2.tar.gz", hash = "sha256:7af09ec1ab053da2c9669e8d15f809a4083a29be05db57111688b8051062af56", size = 2989474, upload-time = "2026-05-29T17:39:17.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/9b/fe72a3811c0357cdb06c67bdc7695fa1623ad47948fc523195f5ac31037f/polars_runtime_32-1.41.2-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:95a08346dac337357cdb825c8076df7d36da54c4caa59a5cb41d0a30691c5edd", size = 52265283, upload-time = "2026-05-29T17:37:09.407Z" }, + { url = "https://files.pythonhosted.org/packages/0a/93/fab9da803fd80d9e83ef88c20932f637a10bc611b20415fc322eec84bc44/polars_runtime_32-1.41.2-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:dedfaeec2c7f995298da7319dd9431d662e5dd1d0ec51b1459df4a0234ceff52", size = 46571222, upload-time = "2026-05-29T17:37:13.698Z" }, + { url = "https://files.pythonhosted.org/packages/c8/2a/8843f34a8ac57acd058a39b87b03b580dd352a490e9dae0415e02033bdd4/polars_runtime_32-1.41.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18eea22c5cc34e27f8a60950458ad81e6a9ea75e89363ca1367e14e7e7f781fc", size = 50409372, upload-time = "2026-05-29T17:37:17.875Z" }, + { url = "https://files.pythonhosted.org/packages/6c/c6/92b352fe88cf51bd0a19fb99e1c0cbe46aa26c14dcf7995b89869cd932ae/polars_runtime_32-1.41.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2630540dfdfb0f36f9b04a07c7c2e3f50bf2ad384113263c1c812007ee9141e0", size = 56405484, upload-time = "2026-05-29T17:37:22.684Z" }, + { url = "https://files.pythonhosted.org/packages/74/c4/bae3174c3b02f6b441d2e58594387abcd509f67a098f682a83b195f08966/polars_runtime_32-1.41.2-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:20e969e08f9b137e233c04cc04de73d9795f89eb77d34854e40a025965a43763", size = 50603512, upload-time = "2026-05-29T17:37:27.422Z" }, + { url = "https://files.pythonhosted.org/packages/f4/ed/f2d26ae02d92c2689056838ed59e2a626326ad23c2831d58637d25f6c82a/polars_runtime_32-1.41.2-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e7016a3deb641b64a31447abbbee0f34bd020a6a9ae34ee6b743837def15e2a4", size = 54328561, upload-time = "2026-05-29T17:37:32.587Z" }, + { url = "https://files.pythonhosted.org/packages/9b/c4/9c3831cc885dc7769e59abf8f583821a5fb4403fd0e4eba0ccc6d47a3d4b/polars_runtime_32-1.41.2-cp310-abi3-win_amd64.whl", hash = "sha256:1e5e5377c315e0dcafdfb2a31adc546abbaeb3f9cb1864e6536523d2af473265", size = 51978643, upload-time = "2026-05-29T17:37:37.443Z" }, + { url = "https://files.pythonhosted.org/packages/cd/c6/79e9f3f270270d7ed5575d92b7bfef49f01abd9275447161275b23b553a8/polars_runtime_32-1.41.2-cp310-abi3-win_arm64.whl", hash = "sha256:843d96f69d18eca53429c1198e58891db7f18111f83b9c419bb45ad9d73eaed5", size = 46006901, upload-time = "2026-05-29T17:37:42.522Z" }, +] + [[package]] name = "prompt-toolkit" version = "3.0.52"