mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 18:05:33 +08:00
100 lines
2.8 KiB
Plaintext
100 lines
2.8 KiB
Plaintext
---
title: Persona Steering Template Library
format:
  html:
    toc: true
    code-fold: true
jupyter: python3
execute:
  echo: false
  warning: false
  message: false
---
|
|
|
|
```{python}
|
|
from pathlib import Path
|
|
import json
|
|
import sys
|
|
|
|
import plotly.graph_objects as go
|
|
|
|
# Base directory for every path below: the parent of the current working
# directory at render time.
ROOT = Path.cwd().parent
|
|
# Put ROOT/scripts first on the module search path.
# NOTE(review): no import from scripts/ is visible in this document - confirm
# this entry is still needed.
sys.path.insert(0, str(ROOT / "scripts"))
|
|
```
|
|
|
|
This page is the interactive companion to the README. Use hover labels to inspect
the refusal-pole probe without forcing the README plot to carry every label.
|
|
|
|
## Refusal-Pole Probe
|
|
|
|
```{python}
|
|
# Refusal-probe summary file: one JSON object per non-blank line (JSONL).
summary_path = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_model_summary.jsonl"

# Iterate the open file instead of calling str.splitlines(): splitlines() also
# breaks on U+2028, U+2029 and U+0085, which may appear unescaped inside JSON
# strings and would cut a record in half. File iteration splits on real
# newlines only. The encoding is pinned so the result does not depend on the
# platform locale.
with summary_path.open(encoding="utf-8") as summary_file:
    rows = [json.loads(line) for line in summary_file if line.strip()]
|
|
|
|
def _clamp_unit(value):
    """Clamp ``value`` into the closed interval [0, 1]."""
    return min(1.0, max(0.0, value))


# One plotting record per summary row. ``rank`` is the 1-based position of the
# row in the summary file; the two axis values are rescaled and clamped to
# [0, 1], and the remaining fields are copied through for the hover labels.
plot_rows = [
    {
        "rank": rank,
        "template": record["template"],
        "on_axis": _clamp_unit(record["axis_delta_mean"] / 8.0),
        "off_axis": _clamp_unit((record["off_axis_problem_mean"] - 1.0) / 6.0),
        "score_p25": record["score_p25"],
        "score_t": record["score_t"],
        "score_mean": record["score_mean"],
        "score_std": record["score_std"],
        "pass": record["strict_pass_rate_mean"],
        "echo": record["persona_echo_rate_mean"],
        "refusal": record["refusal_or_ai_break_rate_mean"],
    }
    for rank, record in enumerate(rows, start=1)
]
|
|
|
|
def _hover_label(point):
    """Return the hover text for one ``plot_rows`` entry, lines joined by ``<br>``."""
    fields = (
        f"<b>{point['template']}</b>",
        f"rank: {point['rank']}",
        f"score t: {point['score_t']:.2f}",
        f"score p25: {point['score_p25']:.2f}",
        f"score mean: {point['score_mean']:.2f}",
        f"score std: {point['score_std']:.2f}",
        f"strict pass: {point['pass']:.3f}",
        f"echo: {point['echo']:.3f}",
        f"refusal: {point['refusal']:.3f}",
        f"on-axis: {point['on_axis']:.3f}",
        f"off-axis: {point['off_axis']:.3f}",
    )
    return "<br>".join(fields)


# Hover labels, in the same order as ``plot_rows``.
hover = list(map(_hover_label, plot_rows))
|
|
|
|
# Marker fill encodes each template's strict pass rate on a grey colour scale;
# marker outlines are disabled.
marker_style = dict(
    size=9,
    color=[point["pass"] for point in plot_rows],
    colorscale="Greys",
    showscale=True,
    colorbar=dict(title="strict pass"),
    line=dict(width=0),
)

# One marker per template: x is on-axis movement, y is off-axis confounding.
# The hover box shows only the prepared label text (the <extra> box is blank).
template_points = go.Scatter(
    x=[point["on_axis"] for point in plot_rows],
    y=[point["off_axis"] for point in plot_rows],
    mode="markers",
    text=hover,
    hovertemplate="%{text}<extra></extra>",
    marker=marker_style,
)

fig = go.Figure(data=template_points)
|
|
# Layout options, kept in their original keyword order. Both axes span slightly
# beyond [0, 1] so markers at the clamped extremes are not cut off at the edge.
layout_options = {
    "width": 980,
    "height": 720,
    "yaxis": {"range": [-0.02, 1.02]},
    "xaxis": {"range": [-0.02, 1.02]},
    "template": "plotly_white",
    "margin": {"l": 70, "r": 20, "t": 20, "b": 70},
    "xaxis_title": "template on-axis movement, higher is better",
    "yaxis_title": "template off-axis confounding, lower is better",
}
fig.update_layout(**layout_options)

# Render the interactive figure into the page.
fig.show()
|
|
```
|
|
|
|
Each point is one template, averaged over two refusal-probe axes and four clean
model artifacts. Lower-right is better: more intended-axis movement with less
off-axis confounding. Marker shade shows the strict pass rate.
|