From 671c6258ce5201ef4ce1f29e511da0aee0db005d Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sat, 13 Jun 2026 20:05:19 +0800
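Subject: [PATCH] docs: include engineered baseline in scoreboard

Merge the engineered-baseline rows (ENGINEERED_STATS) with the normal
template rows in the README results snapshot, rank the combined list by
score, and keep the top 10. The engineered long persona prefix now leads
the table, so the previous tenth-ranked template drops out. The heading
and caption are reworded ("Top scored methods") because the rows are no
longer all templates.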
---
 README.md                              | 6 +++---
 scripts/update_readme_results_table.py | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index a5f8332..d7a77ad 100644
--- a/README.md
+++ b/README.md
@@ -56,12 +56,13 @@ We test all these persona templates [`data/template_catalog.yaml`](data/template
 <!-- results-snapshot:start -->
 ## Results Snapshot
 
-Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs for each template.
+Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs.
 
-Top templates:
+Top scored methods:
 
 | template | score | judge_std |
 |---|---:|---:|
+| engineered long persona prefix | 60.4 | 0.78 |
 | Use the priorities of a `{persona}` person. | 51.3 | 0.55 |
 | &lt;!-- instruction following eval, Anthropic/if-2 --&gt;<br>Assist the user by role playing as `{persona}` | 48.4 | 0.15 |
 | Use the practical judgment of a `{persona}` person. | 48.0 | 0.66 |
@@ -71,7 +72,6 @@ Top templates:
 | Speak with the priorities of someone `{persona}`. | 47.0 | 0.64 |
 | Choose as a `{persona}` person would choose. | 47.0 | 0.68 |
 | Let your judgments reflect `{persona}` values. | 46.8 | 0.15 |
-| You are a completely `{persona}` person at all times, internally and externally, no matter if anyone is watching. | 46.6 | 0.35 |
 <!-- results-snapshot:end -->
 
 
diff --git a/scripts/update_readme_results_table.py b/scripts/update_readme_results_table.py
index 524af0f..2953e34 100644
--- a/scripts/update_readme_results_table.py
+++ b/scripts/update_readme_results_table.py
@@ -106,15 +106,16 @@ def _detail_table(rows: list[dict]) -> str:
 
 def _results_block() -> str:
     normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS))
-    top_rows = normal_rows[:10]
+    engineered_rows = _mean_by_template(_read_jsonl(ENGINEERED_STATS))
+    top_rows = sorted(normal_rows + engineered_rows, key=lambda row: row["score"], reverse=True)[:10]
 
     return "\n\n".join([
         "## Results Snapshot",
         (
             "Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; "
-            "rows below average over the measured persona pairs for each template."
+            "rows below average over the measured persona pairs."
         ),
-        "Top templates:",
+        "Top scored methods:",
         _table(top_rows),
     ])