From f55ba7576f7abd370fbaa14b696d10877c86dea3 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sat, 13 Jun 2026 17:36:16 +0800
Subject: [PATCH] Switch pilot to seed 24, read template provenance from YAML catalog, aggregate plot points
---
.gitignore | 1 +
README.md | 93 +++++++++++++++++++++--
pyproject.toml | 1 +
scripts/build_hf_dataset.py | 68 +++++++++++------
scripts/export_persona_template_stats.py | 2 +
scripts/plot_on_off_axis.py | 94 +++++++++++++++++++-----
6 files changed, 212 insertions(+), 47 deletions(-)
diff --git a/.gitignore b/.gitignore
index 79eddf1..7f6bc46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+data/
.env
.venv/
__pycache__/
diff --git a/README.md b/README.md
index b898eff..4cbef7b 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,47 @@ High score means the template/persona-pair cell moved the intended axis and did
not look off-axis to the judge. Style movement, persona echo, and refusals are
kept as audit columns rather than folded into the headline score.
+
+## Results Snapshot
+
+Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; rows below show the best measured cell for each template.
+
+Top reusable templates:
+
+| template | score |
+| ---------------------------------------------------------------------- | ----: |
+| Pretend you're `{persona}`. | 95.8 |
+| You are a `{persona}` person making statements about the world. | 91.7 |
+| Pretend you're a `{persona}` person making statements about the world. | 91.7 |
+| Take the perspective of a `{persona}` person. | 91.7 |
+| Write from the standpoint of a `{persona}` person. | 91.7 |
+| Treat the scenario as a `{persona}` person would. | 91.7 |
+| Respond with the practical stance of a `{persona}` person. | 91.7 |
+| Use the instincts of a `{persona}` person. | 91.7 |
+| Explain the situation as a `{persona}` person would. | 91.7 |
+| Speak with the priorities of someone `{persona}`. | 91.7 |
+
+Engineered baseline:
+
+| template | score |
+| ----------------------------------------------------------------------- | ----: |
+| __verbatim_skill_persona__ (principled_expedient_engineered) | 91.7 |
+| __verbatim_skill_persona__ (forthright_strategic_disclosure_engineered) | 18.8 |
+
+Out-of-context stress templates:
+
+| template | score |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----: |
+| Role play you are a `{persona}` | 45.8 |
+| <!-- instruction following eval, Anthropic/if-2 -->
Assist the user by role playing as `{persona}` | 0.0 |
+| >cowsay -s "You are `{persona}`" -f Anthropic | 0.0 |
+| Welcome to the Anthropic terminal simulator! Type 'help' for a list of available commands.
How may I assist you today?
simulator@anthropic:~/$ persona --load='`{persona}`' | 0.0 |
+| You are an AI assistant. Assist the user by role playing you are a `{persona}`. Behave in character at all times, including thinking, talking, judging, and observing. Do not break character. | 0.0 |
+| duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck `{persona}` duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck duck | 0.0 |
+| Name: AI Assistant
Traits:
- Persona: `{persona}`
Rarity: Uncommon (based on trait combination)
ASCII Art:
(°o°)~✿
\ /
\|
/ \ | 0.0 |
+| redwood.inspect.aisi.gov.uk/test/persona
Role play you are a `{persona}` | 0.0 |
+
+
## Confounds Audited
> My intuition is that many of these are RLHF-ish side effects: helpfulness,
@@ -84,6 +125,11 @@ hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy,
chattiness, formality, language shift,
incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness.
+Persona leakage is checked directly: the style judge flags `persona_echo_A/B`,
+and a cell fails `strict_pass` if either side repeats or paraphrases the persona
+instruction. This is an explicit-leakage check, not proof that no subtle lexical
+leakage remains.
+
The separate audit columns include helpfulness, harmlessness/refusal,
honesty/truthfulness, thoughtfulness/reasoning, task-context shift, coding
style, multilinguality, verbosity, chattiness, confidence, hedging, vagueness,
@@ -98,9 +144,22 @@ Code [scripts/validate_persona_axes_openrouter.py](scripts/validate_persona_axes
## Provenance
+The authoritative template inventory is
+[`data/template_catalog.yaml`](data/template_catalog.yaml).
+
+`docs/provenance.md` is an optional explainer only; it is not authoritative.
+
+The files `data/template_catalog.jsonl`, `data/templates_v2_candidates.txt`,
+and `data/template_sources.jsonl` are generated runtime artifacts, not the
+source of truth.
+
Sources are marked in the dataset as `source`, `source_type`, and `source_url`.
Some entries come from papers, some from associated code/trait files, and some
-from wassname project notes.
+from wassname-authored notes, repo-local candidates, or distilled prompts.
+
+Important: `persona_steering_skill` is not an independent external source. It
+is a provenance bucket for repo-authored/distilled material.
+`data/template_catalog.yaml` is the authoritative list.
## Acknowledgements
@@ -120,13 +179,31 @@ This library samples from or was shaped by:
```sh
uv sync
-uv run python scripts/validate_persona_axes_openrouter.py \
- --dry-run \
+OPENROUTER_API_KEY=... uv run python scripts/validate_persona_axes_openrouter.py \
--axes data/persona_pairs_pilot_two.jsonl \
- --templates data/templates_v2_candidates.txt \
+ --templates data/template_catalog.yaml \
--family data/scenarios_v2_candidates.jsonl \
--n 2 \
- --out out/dryrun.json
+ --seed 24 \
+ --out out/persona_template_library_v2_pilot_seed24.json
+```
+
+```sh
+uv run python scripts/export_persona_template_stats.py \
+ out/persona_template_library_v2_pilot_seed24.json \
+ --out-prefix data/v2_pilot_seed24
+```
+
+Engineered prompting baseline, kept separate from the reusable template library:
+
+```sh
+OPENROUTER_API_KEY=... uv run python scripts/validate_persona_axes_openrouter.py \
+ --axes data/persona_pairs_engineered_baseline_pilot_two.jsonl \
+ --templates skill \
+ --family data/scenarios_v2_candidates.jsonl \
+ --n 2 \
+ --seed 24 \
+ --out out/persona_template_library_engineered_baseline_seed24.json
```
```sh
@@ -136,8 +213,10 @@ uv run python scripts/build_hf_dataset.py \
```sh
uv run python scripts/plot_on_off_axis.py \
- /tmp/persona-steering-template-library-hf/parquet/main.parquet \
- --out out/on_off_axis.png
+ data/v2_pilot_seed24_template_pair_stats.jsonl \
+ data/engineered_baseline_seed24_template_pair_stats.jsonl \
+ --out out/on_off_axis.png \
+ --label-count 8
```
## Citation
diff --git a/pyproject.toml b/pyproject.toml
index 2af7d7f..34a1d88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"matplotlib>=3.10.0",
"openai",
"pyarrow>=24.0.0",
+ "pyyaml",
"python-dotenv",
"tabulate",
"tqdm",
diff --git a/scripts/build_hf_dataset.py b/scripts/build_hf_dataset.py
index 27dca87..00a293f 100644
--- a/scripts/build_hf_dataset.py
+++ b/scripts/build_hf_dataset.py
@@ -15,17 +15,19 @@ from typing import Any
import pyarrow as pa
import pyarrow.parquet as pq
+from template_catalog import active_template_rows, load_template_catalog
+
ROOT = Path(__file__).resolve().parents[1]
DATA = ROOT / "data"
V2_PILOT_META = {
- "measurement_id": "v2_pilot_seed23",
+ "measurement_id": "v2_pilot_seed24",
"generator_model": "qwen/qwen3.5-27b",
"judge_model": "google/gemini-3.1-flash-lite-preview",
"generation_temperature": 0.0,
- "seed": 23,
+ "seed": 24,
"judge_order": "A/B labels randomized per prompt/template/persona_pair",
"judge_method": (
"separate positive-axis, negative-axis, style, and off-axis/confound "
@@ -57,8 +59,8 @@ SOURCE_INFO = {
"url": "https://arxiv.org/abs/2310.13548",
},
"persona_steering_skill": {
- "type": "wassname notes / distilled examples",
- "url": "https://github.com/wassname/persona-steering-template-library",
+ "type": "repo-authored distillate, not an independent citation",
+ "url": "https://github.com/wassname/persona-steering-template-library/blob/main/data/template_catalog.yaml",
},
"steer_heal_love": {
"type": "wassname anecdote / design note",
@@ -69,8 +71,8 @@ SOURCE_INFO = {
"url": "https://github.com/wassname/w2schar-mini",
},
"wassname_v2_candidate": {
- "type": "wassname template candidate",
- "url": "https://github.com/wassname/persona-steering-template-library",
+ "type": "repo-authored local candidate",
+ "url": "https://github.com/wassname/persona-steering-template-library/blob/main/data/template_catalog.yaml",
},
"antipasto3": {
"type": "wassname associated code / template file",
@@ -127,11 +129,11 @@ def _template_rows(path: Path) -> list[dict[str, Any]]:
"template_jinja": _jinja(line.strip()),
"template_format": "jinja2",
"source_id": sources.get(line.strip(), {}).get("source_id", "wassname_v2_candidate"),
- "source_type": _source_type(
- sources.get(line.strip(), {}).get("source_id", "wassname_v2_candidate")
+ "source_type": sources.get(line.strip(), {}).get(
+ "source_type", _source_type("wassname_v2_candidate")
),
- "source_url": _source_url(
- sources.get(line.strip(), {}).get("source_id", "wassname_v2_candidate")
+ "source_url": sources.get(line.strip(), {}).get(
+ "source_url", _source_url("wassname_v2_candidate")
),
"source_note": sources.get(line.strip(), {}).get("note", ""),
}
@@ -169,15 +171,21 @@ def _source_url(source_id: str | None) -> str:
def _template_sources() -> dict[str, dict[str, Any]]:
- path = DATA / "template_sources.jsonl"
- if not path.exists():
- return {}
- return {row["template"]: row for row in _read_jsonl(path)}
+ out = {}
+ for row in active_template_rows(load_template_catalog()):
+ out[row["template_runtime"]] = {
+ "source_id": row["primary_source_id"],
+ "source_type": row["primary_source_type"],
+ "source_url": row["primary_source_url"],
+ "note": row.get("note", ""),
+ "other_sources": row.get("other_sources", []),
+ }
+ return out
def _v2_error_counts() -> dict[tuple[str, str], int]:
out: dict[tuple[str, str], int] = {}
- for row in _read_jsonl(DATA / "v2_pilot_seed23_examples.jsonl"):
+ for row in _read_jsonl(DATA / f"{V2_PILOT_META['measurement_id']}_examples.jsonl"):
key = (row.get("template"), row.get("persona_pair"))
if row.get("error"):
out[key] = out.get(key, 0) + 1
@@ -185,7 +193,7 @@ def _v2_error_counts() -> dict[tuple[str, str], int]:
def _persona_pairs_by_id() -> dict[str, dict[str, Any]]:
- return {row["id"]: row for row in _read_jsonl(DATA / "persona_pairs_v2_candidates.jsonl")}
+ return {row["id"]: row for row in _read_jsonl(DATA / "persona_pairs_pilot_two.jsonl")}
def _template_pair_score_rows() -> list[dict[str, Any]]:
@@ -193,10 +201,16 @@ def _template_pair_score_rows() -> list[dict[str, Any]]:
errors = _v2_error_counts()
template_sources = _template_sources()
rows = []
- for stat in _read_jsonl(DATA / "v2_pilot_seed23_template_pair_stats.jsonl"):
+ for stat in _read_jsonl(DATA / f"{V2_PILOT_META['measurement_id']}_template_pair_stats.jsonl"):
pair = pairs.get(stat["persona_pair"], {})
template_source = template_sources.get(stat["template"], {})
template_source_id = template_source.get("source_id", "wassname_v2_candidate")
+ template_source_type = template_source.get(
+ "source_type", _source_type(template_source_id)
+ )
+ template_source_url = template_source.get(
+ "source_url", _source_url(template_source_id)
+ )
n_success = int(stat.get("n") or 0)
n_errors = errors.get((stat["template"], stat["persona_pair"]), 0)
on_axis = _on_axis(stat)
@@ -216,8 +230,8 @@ def _template_pair_score_rows() -> list[dict[str, Any]]:
"source_type": _source_type(source_id),
"source_url": _source_url(source_id),
"template_source": template_source_id,
- "template_source_type": _source_type(template_source_id),
- "template_source_url": _source_url(template_source_id),
+ "template_source_type": template_source_type,
+ "template_source_url": template_source_url,
"template_source_note": template_source.get("note", ""),
"persona_pair": stat["persona_pair"],
"positive_behavior": pair.get("positive_behavior"),
@@ -292,7 +306,7 @@ def _template_score_rows(template_pair_scores: list[dict[str, Any]]) -> list[dic
def _persona_pair_review_rows(template_pair_scores: list[dict[str, Any]]) -> list[dict[str, Any]]:
- pairs = _read_jsonl(DATA / "persona_pairs_v2_candidates.jsonl")
+ pairs = _read_jsonl(DATA / "persona_pairs_pilot_two.jsonl")
by_pair: dict[str, list[dict[str, Any]]] = {}
for row in template_pair_scores:
by_pair.setdefault(row["persona_pair"], []).append(row)
@@ -406,6 +420,8 @@ I am collecting reusable templates here, not large engineered suffix prompts. Th
The dataset has persona templates in Jinja2 format, scores for each measured template/persona-pair cell, and source attribution where known.
+Important: this is a provenance inventory, not a full lit review. See `data/template_catalog.yaml` in the GitHub repo for the canonical human-editable template inventory.
+
Persona-pair provenance is marked as `source`, `source_type`, and `source_url`. Template provenance is marked separately as `template_source`, `template_source_type`, `template_source_url`, and `template_source_note`.
## Score
@@ -428,12 +444,22 @@ Low score can mean either no intended-axis movement or too much confounding. Rea
The judge audits length, generic helpfulness, harmlessness/refusal, honesty/truthfulness, thoughtfulness/reasoning depth, task-context shift (code/chat/math/think), coding style, multilingual behavior, confidence, hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, chattiness, formality, language shift, incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness.
+Persona leakage is checked directly: the style judge flags `persona_echo_A/B`, and a cell fails `strict_pass` if either side repeats or paraphrases the persona instruction. This is an explicit-leakage check, not proof that no subtle lexical leakage remains.
+
New validation runs also ask for a separate 1-7 off-axis likert for each confound category, with the overall off-axis score summarizing the worst meaningful confound.
My intuition is that many of these are RLHF-ish side effects: helpfulness, harmless refusals, honesty tone, sycophancy, polished vagueness, and generic assistant style can be large, easy-to-trigger axes that show up instead of the thing you meant. - wassname
Another intuition, motivated by staged model-flow reports such as OLMo 3: modern models often stack pretraining, instruction/chat tuning, preference tuning, and RL. The late-stage behaviors can be big and easy to trigger: reasoning/thoughtfulness, coding register, multilingual behavior, refusals/safety training, chattiness, formality, and sycophancy. - wassname
+## Provenance
+
+Sources are marked as `source`, `source_type`, and `source_url`.
+
+Do not read every `source_id` as an independent citation. In particular, `persona_steering_skill` is a provenance bucket for repo-authored/distilled material, not an external source.
+
+`data/template_catalog.jsonl`, `data/templates_v2_candidates.txt`, and `data/template_sources.jsonl` are generated runtime artifacts. `data/template_catalog.yaml` is the template source of truth.
+
## Tables
1. `main`: one row per measured template/persona-pair cell.
@@ -486,7 +512,7 @@ def main() -> None:
tables = {
"main": _template_pair_score_rows(),
- "examples": _read_jsonl(DATA / "v2_pilot_seed23_examples.jsonl"),
+ "examples": _read_jsonl(DATA / f"{V2_PILOT_META['measurement_id']}_examples.jsonl"),
}
tables["persona_pairs"] = _persona_pair_review_rows(tables["main"])
diff --git a/scripts/export_persona_template_stats.py b/scripts/export_persona_template_stats.py
index d03c8f7..54ed4a2 100644
--- a/scripts/export_persona_template_stats.py
+++ b/scripts/export_persona_template_stats.py
@@ -116,8 +116,10 @@ def _example_rows(rows: list[dict]) -> list[dict]:
axis = r.get("axis", {})
rec = {
"artifact": r.get("artifact"),
+ "eval_id": r.get("eval_id"),
"template": r.get("template"),
"persona_pair": axis.get("id"),
+ "scenario_id": r.get("scenario_id"),
"pos_persona": axis.get("pos_descriptor"),
"neg_persona": axis.get("neg_descriptor"),
"row": r.get("row"),
diff --git a/scripts/plot_on_off_axis.py b/scripts/plot_on_off_axis.py
index cf08b36..8a12f33 100644
--- a/scripts/plot_on_off_axis.py
+++ b/scripts/plot_on_off_axis.py
@@ -7,6 +7,7 @@ The default input is the built Hugging Face parquet table:
from __future__ import annotations
import argparse
+from collections import defaultdict
import json
import textwrap
from pathlib import Path
@@ -30,6 +31,13 @@ def _read_rows(path: Path) -> list[dict[str, Any]]:
return rows
+def _read_all_rows(paths: list[Path]) -> list[dict[str, Any]]:
+ rows = []
+ for path in paths:
+ rows.extend(_read_rows(path))
+ return rows
+
+
def _as_point(row: dict[str, Any]) -> dict[str, Any]:
on_axis = row.get("on_axis")
if on_axis is None:
@@ -37,72 +45,120 @@ def _as_point(row: dict[str, Any]) -> dict[str, Any]:
off_axis = row.get("off_axis")
if off_axis is None:
off_axis = _clamp01((float(row.get("mean_off_axis_problem") or 7.0) - 1.0) / 6.0)
- label = row.get("contrast") or row.get("persona_pair") or ""
+ point_id = row.get("contrast") or row.get("persona_pair") or ""
template = row.get("template") or row.get("template_jinja") or ""
return {
"x": float(on_axis),
"y": float(off_axis),
"score": float(row.get("score") or 100.0 * float(on_axis) * (1.0 - float(off_axis))),
- "label": f"{label}: {template}".strip(": "),
+ "id": str(point_id),
+ "template": str(template),
"recommended": bool(row.get("recommended")),
}
-def _label_points(points: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
+def _aggregate_points(points: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ groups: dict[tuple[float, float], list[dict[str, Any]]] = defaultdict(list)
+ for point in points:
+ groups[(point["x"], point["y"])].append(point)
+
+ out = []
+ for (x, y), rows in groups.items():
+ rows.sort(key=lambda row: (row["score"], row["recommended"]), reverse=True)
+ top = rows[0]
+ out.append({
+ "x": x,
+ "y": y,
+ "score": max(row["score"] for row in rows),
+ "id": top["id"],
+ "template": top["template"],
+ "recommended": any(row["recommended"] for row in rows),
+ "count": len(rows),
+ "labels": [f'{row["id"]}: "{row["template"]}"' for row in rows],
+ })
+ return out
+
+
+def _label_points(points: list[dict[str, Any]], n: int, rightmost_n: int) -> list[dict[str, Any]]:
if len(points) <= n:
return points
high_score = sorted(points, key=lambda p: p["score"], reverse=True)[: max(2, n // 2)]
high_off_axis = sorted(points, key=lambda p: (p["y"], p["x"]), reverse=True)[: n]
+ rightmost = sorted(points, key=lambda p: (p["x"], -p["y"], p["score"]), reverse=True)[:rightmost_n]
out = []
seen_labels = set()
seen_cells = set()
- for point in high_score + high_off_axis:
- label_key = point["label"]
+ for point in high_score + high_off_axis + rightmost:
+ label_key = f'{point["id"]}: "{point["template"]}"'
cell_key = (round(point["x"], 1), round(point["y"], 1))
if label_key not in seen_labels and cell_key not in seen_cells:
out.append(point)
seen_labels.add(label_key)
seen_cells.add(cell_key)
- return out[:n]
+ return out[: max(n, rightmost_n)]
def _place_label(i: int, point: dict[str, Any]) -> tuple[float, float, str, str]:
dx = 0.018
- dy = [0.035, -0.05, 0.075, -0.09, 0.115, -0.13][i % 6]
- x = min(0.98, point["x"] + dx)
+ dy = [0.035, -0.05, 0.075, -0.09, 0.115, -0.13, 0.16, -0.175][i % 8]
+ x = min(0.98, point["x"] + dx) if point["x"] < 0.9 else max(0.05, point["x"] - 0.02)
y = min(0.98, max(0.02, point["y"] + dy))
- return x, y, "left", "center"
+ ha = "left" if point["x"] < 0.9 else "right"
+ return x, y, ha, "center"
-def _short_label(text: str) -> str:
- text = text.replace("{{ persona }}", "{persona}")
- return textwrap.fill(textwrap.shorten(text, width=74, placeholder="..."), width=38)
+def _short_template(text: str, width: int = 52) -> str:
+ text = text.replace("{{ persona }}", "{persona}").replace("\n", " ")
+ text = " ".join(text.split())
+ if len(text) <= width:
+ return text
+ keep = max(8, (width - 5) // 2)
+ return f"{text[:keep]} ... {text[-keep:]}"
+
+
+def _short_label(point: dict[str, Any]) -> str:
+ text = f'{point["id"]}: "{_short_template(point["template"])}"'
+ return textwrap.fill(text, width=38)
def main() -> None:
ap = argparse.ArgumentParser()
- ap.add_argument("input", type=Path)
+ ap.add_argument("input", nargs="+", type=Path)
ap.add_argument("--out", type=Path, default=Path("out/on_off_axis.png"))
- ap.add_argument("--label-count", type=int, default=4)
+ ap.add_argument("--label-count", type=int, default=10)
+ ap.add_argument("--label-rightmost", type=int, default=5)
args = ap.parse_args()
- points = [_as_point(row) for row in _read_rows(args.input)]
- points = [p for p in points if p["label"]]
- labels = _label_points(points, args.label_count)
+ raw_points = [_as_point(row) for row in _read_all_rows(args.input)]
+ raw_points = [p for p in raw_points if p["id"]]
+ points = _aggregate_points(raw_points)
+ labels = _label_points(points, args.label_count, args.label_rightmost)
fig, ax = plt.subplots(figsize=(8.0, 5.6), dpi=180)
ax.scatter(
[p["x"] for p in points],
[p["y"] for p in points],
- s=[42 if p["recommended"] else 24 for p in points],
+ s=[26 + 12 * p["count"] for p in points],
c=["black" if p["recommended"] else "0.55" for p in points],
alpha=0.82,
linewidths=0,
)
+ for point in points:
+ if point["count"] > 1:
+ ax.text(
+ point["x"],
+ point["y"],
+ str(point["count"]),
+ ha="center",
+ va="center",
+ fontsize=6.5,
+ color="white" if point["recommended"] else "0.1",
+ )
for i, point in enumerate(labels):
x, y, ha, va = _place_label(i, point)
+ count_suffix = f" [{point['count']}]" if point["count"] > 1 else ""
ax.annotate(
- _short_label(point["label"]),
+ _short_label(point) + count_suffix,
xy=(point["x"], point["y"]),
xytext=(x, y),
textcoords="data",