mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 16:46:08 +08:00
docs: use one Quarto source for README and Pages
This commit is contained in:
-112
@@ -1,112 +0,0 @@
|
||||
---
|
||||
title: Persona Steering Template Library
|
||||
format:
|
||||
html:
|
||||
toc: true
|
||||
code-fold: true
|
||||
jupyter: python3
|
||||
execute:
|
||||
echo: false
|
||||
warning: false
|
||||
message: false
|
||||
---
|
||||
|
||||
```{python}
|
||||
from pathlib import Path
|
||||
import html
|
||||
import json
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
import plotly.graph_objects as go
|
||||
|
||||
# Parent of the current working directory; presumably the repository root when
# this document is rendered from a subdirectory of it — TODO confirm.
ROOT = Path.cwd().parent
|
||||
# Put ROOT/scripts at the front of the import path so modules found there are
# resolved before any same-named module later on sys.path.
sys.path.insert(0, str(ROOT / "scripts"))
|
||||
```
|
||||
|
||||
This page is the interactive companion to the README. Use hover labels to inspect
|
||||
the refusal-pole probe without forcing the README plot to carry every label.
|
||||
|
||||
## Refusal-Pole Probe
|
||||
|
||||
```{python}
|
||||
# JSONL summary file: every non-blank line is parsed as one JSON document.
summary_path = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_model_summary.jsonl"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (JSON text is UTF-8 by specification).
rows = [
    json.loads(line)
    for line in summary_path.read_text(encoding="utf-8").splitlines()
    if line.strip()
]
|
||||
|
||||
|
||||
def wrap_tooltip_text(text: str, width: int = 56) -> str:
    """Return *text* as HTML-safe hover text wrapped with ``<br>`` line breaks.

    Runs of whitespace are collapsed to single spaces, the result is wrapped
    to at most *width* characters per line, and each wrapped line is
    HTML-escaped before the lines are joined with ``<br>``.

    Wrapping happens before escaping on purpose: escaping first lets
    ``break_long_words`` cut through the middle of an entity such as
    ``&amp;`` and makes every entity count as several characters against
    *width*.

    Args:
        text: Raw (unescaped) label text.
        width: Maximum number of unescaped characters per wrapped line.

    Returns:
        The escaped lines joined by ``<br>``; an empty string when *text* is
        empty or contains only whitespace.
    """
    collapsed = " ".join(text.split())
    lines = textwrap.wrap(
        collapsed,
        width=width,
        break_long_words=True,
        break_on_hyphens=False,
    )
    return "<br>".join(html.escape(line) for line in lines)
|
||||
|
||||
|
||||
def _clip_unit(value):
    """Clamp *value* into the closed interval [0.0, 1.0]."""
    return min(1.0, max(0.0, value))


# One plotting record per summary row; "rank" is the 1-based position of the
# row in the loaded file. The axis values are rescaled and clipped to [0, 1].
plot_rows = [
    {
        "rank": position,
        "template": record["template"],
        "on_axis": _clip_unit(record["axis_delta_mean"] / 8.0),
        "off_axis": _clip_unit((record["off_axis_problem_mean"] - 1.0) / 6.0),
        "score_p25": record["score_p25"],
        "score_t": record["score_t"],
        "score_mean": record["score_mean"],
        "score_std": record["score_std"],
        "pass": record["strict_pass_rate_mean"],
        "echo": record["persona_echo_rate_mean"],
        "refusal": record["refusal_or_ai_break_rate_mean"],
    }
    for position, record in enumerate(rows, start=1)
]
|
||||
|
||||
def _hover_label(entry):
    """Build the multi-line HTML hover label for one plotting record."""
    parts = [
        f"<b>{wrap_tooltip_text(entry['template'])}</b>",
        f"rank: {entry['rank']}",
        f"score t: {entry['score_t']:.2f}",
        f"score p25: {entry['score_p25']:.2f}",
        f"score mean: {entry['score_mean']:.2f}",
        f"score std: {entry['score_std']:.2f}",
        f"strict pass: {entry['pass']:.3f}",
        f"echo: {entry['echo']:.3f}",
        f"refusal: {entry['refusal']:.3f}",
        f"on-axis: {entry['on_axis']:.3f}",
        f"off-axis: {entry['off_axis']:.3f}",
    ]
    return "<br>".join(parts)


# Hover strings, index-aligned with plot_rows.
hover = [_hover_label(entry) for entry in plot_rows]
|
||||
|
||||
# Marker colour encodes each record's "pass" value on a grey scale.
marker_style = {
    "size": 9,
    "color": [entry["pass"] for entry in plot_rows],
    "colorscale": "Greys",
    "showscale": True,
    "colorbar": {"title": "strict pass"},
    "line": {"width": 0},
}

# Scatter of on-axis (x) against off-axis (y); the prebuilt hover strings are
# shown verbatim and <extra></extra> hides the secondary trace box.
scatter = go.Scatter(
    x=[entry["on_axis"] for entry in plot_rows],
    y=[entry["off_axis"] for entry in plot_rows],
    mode="markers",
    text=hover,
    hovertemplate="%{text}<extra></extra>",
    marker=marker_style,
)

fig = go.Figure(data=scatter)

# Both axes get a fixed range slightly wider than [0, 1].
fig.update_layout(
    autosize=True,
    height=680,
    yaxis={"range": [-0.02, 1.02]},
    xaxis={"range": [-0.02, 1.02]},
    template="plotly_white",
    margin={"l": 70, "r": 20, "t": 20, "b": 70},
    xaxis_title="template on-axis movement, higher is better",
    yaxis_title="template off-axis confounding, lower is better",
)
fig.show()
|
||||
```
|
||||
|
||||
Each point is one template, averaged over two refusal-probe axes and four clean
|
||||
model artifacts. Lower-right is better: more intended-axis movement with less
|
||||
off-axis confounding.
|
||||
|
||||
## Static SVG
|
||||
|
||||

|
||||
Reference in New Issue
Block a user