docs: use one Quarto source for README and Pages

This commit is contained in:
wassname
2026-06-25 13:06:12 +08:00
parent 024fb3d545
commit cfcb57b9ce
20 changed files with 533 additions and 2000 deletions
-112
View File
@@ -1,112 +0,0 @@
---
title: Persona Steering Template Library
format:
  html:
    toc: true
    code-fold: true
jupyter: python3
execute:
  echo: false
  warning: false
  message: false
---
```{python}
from pathlib import Path
import html
import json
import sys
import textwrap
import plotly.graph_objects as go
# Project root is taken to be the parent of the current working directory.
# NOTE(review): this presumes the document is rendered from a directory that
# sits directly below the project root -- confirm against the render setup.
ROOT = Path.cwd().parent

# Put the project's scripts/ directory at the front of the import search path.
_scripts_dir = ROOT / "scripts"
sys.path.insert(0, str(_scripts_dir))
```
This page is the interactive companion to the README. Hover over a point to inspect
that template's refusal-pole probe metrics, so the static README plot does not have
to carry every label.

## Refusal-Pole Probe
```{python}
# Per-template summary for the refusal probe, stored as JSON Lines
# (one JSON object per non-blank line).
summary_path = ROOT / "out/model_matrix/refusal_probe_seed24_n1_template_model_summary.jsonl"

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, and iterate the file line by line: str.splitlines() also splits on
# characters such as U+2028 that can legally appear unescaped inside a JSON
# string, which would cut a record in half.
with summary_path.open(encoding="utf-8") as summary_file:
    rows = [json.loads(line) for line in summary_file if line.strip()]
def wrap_tooltip_text(text: str, width: int = 56) -> str:
escaped = html.escape(" ".join(text.split()))
return "<br>".join(
textwrap.wrap(escaped, width=width, break_long_words=True, break_on_hyphens=False))
# One plotting record per summary row. "rank" is the 1-based position of the
# row in the summary file. The two axis scores are rescaled by fixed constants
# and clamped to [0, 1]; the remaining fields are copied under shorter keys.
plot_rows = [
    {
        "rank": rank,
        "template": record["template"],
        "on_axis": min(1.0, max(0.0, record["axis_delta_mean"] / 8.0)),
        "off_axis": min(1.0, max(0.0, (record["off_axis_problem_mean"] - 1.0) / 6.0)),
        "score_p25": record["score_p25"],
        "score_t": record["score_t"],
        "score_mean": record["score_mean"],
        "score_std": record["score_std"],
        "pass": record["strict_pass_rate_mean"],
        "echo": record["persona_echo_rate_mean"],
        "refusal": record["refusal_or_ai_break_rate_mean"],
    }
    for rank, record in enumerate(rows, start=1)
]
# Build one tooltip string per plotted template: the wrapped template text in
# bold, followed by one "label: value" line per metric, joined with <br>.
hover = []
for entry in plot_rows:
    tooltip_lines = [
        f"<b>{wrap_tooltip_text(entry['template'])}</b>",
        f"rank: {entry['rank']}",
        f"score t: {entry['score_t']:.2f}",
        f"score p25: {entry['score_p25']:.2f}",
        f"score mean: {entry['score_mean']:.2f}",
        f"score std: {entry['score_std']:.2f}",
        f"strict pass: {entry['pass']:.3f}",
        f"echo: {entry['echo']:.3f}",
        f"refusal: {entry['refusal']:.3f}",
        f"on-axis: {entry['on_axis']:.3f}",
        f"off-axis: {entry['off_axis']:.3f}",
    ]
    hover.append("<br>".join(tooltip_lines))
# Marker appearance: fixed size, no outline, shaded by strict pass rate on the
# "Greys" colorscale with a labelled colorbar.
marker_style = {
    "size": 9,
    "color": [entry["pass"] for entry in plot_rows],
    "colorscale": "Greys",
    "showscale": True,
    "colorbar": {"title": "strict pass"},
    "line": {"width": 0},
}

# One marker per template: on-axis score on x, off-axis score on y. The hover
# box shows only the prebuilt tooltip text (the empty <extra> tag hides the
# secondary trace-name box).
template_scatter = go.Scatter(
    x=[entry["on_axis"] for entry in plot_rows],
    y=[entry["off_axis"] for entry in plot_rows],
    mode="markers",
    text=hover,
    hovertemplate="%{text}<extra></extra>",
    marker=marker_style,
)

fig = go.Figure(data=template_scatter)
# Layout settings, kept in the same order they are applied. Both axes span
# slightly beyond [0, 1] so markers sitting on the clamp limits are not cut off.
layout_options = {
    "autosize": True,
    "height": 680,
    "yaxis": {"range": [-0.02, 1.02]},
    "xaxis": {"range": [-0.02, 1.02]},
    "template": "plotly_white",
    "margin": {"l": 70, "r": 20, "t": 20, "b": 70},
    "xaxis_title": "template on-axis movement, higher is better",
    "yaxis_title": "template off-axis confounding, lower is better",
}
fig.update_layout(**layout_options)

# Render the interactive figure into the document output.
fig.show()
```
Each point is one template, averaged over two refusal-probe axes and four clean
model artifacts. Both scores are rescaled and clipped to the range [0, 1].
Lower-right is better: more intended-axis movement with less off-axis confounding.

## Static SVG
![Static refusal-pole probe](out/model_matrix/refusal_probe_seed24_n1_model_matrix.svg)