mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 15:16:06 +08:00
docs: include engineered baseline in scoreboard
This commit is contained in:
@@ -56,12 +56,13 @@ We test all these persona templates [`data/template_catalog.yaml`](data/template
|
||||
<!-- results-snapshot:start -->
|
||||
## Results Snapshot
|
||||
|
||||
Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs for each template.
|
||||
Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below average over the measured persona pairs.
|
||||
|
||||
Top templates:
|
||||
Top scored methods:
|
||||
|
||||
| template | score | judge_std |
|
||||
|---|---:|---:|
|
||||
| engineered long persona prefix | 60.4 | 0.78 |
|
||||
| Use the priorities of a `{persona}` person. | 51.3 | 0.55 |
|
||||
| <!-- instruction following eval, Anthropic/if-2 --><br>Assist the user by role playing as `{persona}` | 48.4 | 0.15 |
|
||||
| Use the practical judgment of a `{persona}` person. | 48.0 | 0.66 |
|
||||
@@ -71,7 +72,6 @@ Top templates:
|
||||
| Speak with the priorities of someone `{persona}`. | 47.0 | 0.64 |
|
||||
| Choose as a `{persona}` person would choose. | 47.0 | 0.68 |
|
||||
| Let your judgments reflect `{persona}` values. | 46.8 | 0.15 |
|
||||
| You are a completely `{persona}` person at all times, internally and externally, no matter if anyone is watching. | 46.6 | 0.35 |
|
||||
<!-- results-snapshot:end -->
|
||||
|
||||
|
||||
|
||||
@@ -106,15 +106,16 @@ def _detail_table(rows: list[dict]) -> str:
|
||||
|
||||
def _results_block() -> str:
|
||||
normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS))
|
||||
top_rows = normal_rows[:10]
|
||||
engineered_rows = _mean_by_template(_read_jsonl(ENGINEERED_STATS))
|
||||
top_rows = sorted(normal_rows + engineered_rows, key=lambda row: row["score"], reverse=True)[:10]
|
||||
|
||||
return "\n\n".join([
|
||||
"## Results Snapshot",
|
||||
(
|
||||
"Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; "
|
||||
"rows below average over the measured persona pairs for each template."
|
||||
"rows below average over the measured persona pairs."
|
||||
),
|
||||
"Top templates:",
|
||||
"Top scored methods:",
|
||||
_table(top_rows),
|
||||
])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user