From 671c6258ce5201ef4ce1f29e511da0aee0db005d Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 13 Jun 2026 20:05:19 +0800 Subject: [PATCH] docs: include engineered baseline in scoreboard --- README.md | 6 +++--- scripts/update_readme_results_table.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a5f8332..d7a77ad 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,13 @@ We test all these persona templates [`data/template_catalog.yaml`](data/template ## Results Snapshot -Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs for each template. +Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs. -Top templates: +Top scored methods: | template | score | judge_std | |---|---:|---:| +| engineered long persona prefix | 60.4 | 0.78 | | Use the priorities of a `{persona}` person. | 51.3 | 0.55 | | <!-- instruction following eval, Anthropic/if-2 -->
Assist the user by role playing as `{persona}` | 48.4 | 0.15 | | Use the practical judgment of a `{persona}` person. | 48.0 | 0.66 | @@ -71,7 +72,6 @@ Top templates: | Speak with the priorities of someone `{persona}`. | 47.0 | 0.64 | | Choose as a `{persona}` person would choose. | 47.0 | 0.68 | | Let your judgments reflect `{persona}` values. | 46.8 | 0.15 | -| You are a completely `{persona}` person at all times, internally and externally, no matter if anyone is watching. | 46.6 | 0.35 | diff --git a/scripts/update_readme_results_table.py b/scripts/update_readme_results_table.py index 524af0f..2953e34 100644 --- a/scripts/update_readme_results_table.py +++ b/scripts/update_readme_results_table.py @@ -106,15 +106,16 @@ def _detail_table(rows: list[dict]) -> str: def _results_block() -> str: normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS)) - top_rows = normal_rows[:10] + engineered_rows = _mean_by_template(_read_jsonl(ENGINEERED_STATS)) + top_rows = sorted(normal_rows + engineered_rows, key=lambda row: row["score"], reverse=True)[:10] return "\n\n".join([ "## Results Snapshot", ( "Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; " - "rows below average over the measured persona pairs for each template." + "rows below average over the measured persona pairs." ), - "Top templates:", + "Top scored methods:", _table(top_rows), ])