mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 16:46:08 +08:00
docs: render Pages with Quarto workflow
This commit is contained in:
@@ -258,10 +258,14 @@ def main() -> None:
|
||||
_write_jsonl(prefix.with_name(prefix.name + "_template_pair_model_summary.jsonl"), pair_rows)
|
||||
_write_csv(prefix.with_name(prefix.name + "_template_pair_model_summary.csv"), pair_rows)
|
||||
_write_markdown(prefix.with_name(prefix.name + "_model_matrix_summary.md"), template_rows, pair_rows, args.top_n)
|
||||
_plot(prefix.with_name(prefix.name + "_model_matrix.png"), template_rows, label_count=10)
|
||||
png_path = prefix.with_name(prefix.name + "_model_matrix.png")
|
||||
svg_path = prefix.with_name(prefix.name + "_model_matrix.svg")
|
||||
_plot(png_path, template_rows, label_count=10)
|
||||
_plot(svg_path, template_rows, label_count=10)
|
||||
print(f"models={expected_models} templates={len(template_rows)} template_pairs={len(pair_rows)}")
|
||||
print(prefix.with_name(prefix.name + "_model_matrix_summary.md"))
|
||||
print(prefix.with_name(prefix.name + "_model_matrix.png"))
|
||||
print(png_path)
|
||||
print(svg_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -47,10 +47,26 @@ def _appendix_block(summary_path: Path) -> str:
|
||||
return "\n\n".join([
|
||||
"## Appendix: Refusal-Pole Probe",
|
||||
(
|
||||
"This is a separate two-axis refusal/harm probe across four clean generator "
|
||||
"artifacts. It is not the main template result, because it does not cover all "
|
||||
"persona pairs. Treat it as a filter for templates worth retesting on "
|
||||
"refusal-ish negative poles in the main evaluation frame."
|
||||
"This is a rejected-pole slice: it keeps the template and suffix sweep "
|
||||
"unfiltered, then evaluates persona pairs whose negative/rejected pole is "
|
||||
"refusal-prone or harm-adjacent. It is not the main template result, because "
|
||||
"it does not cover all persona pairs."
|
||||
),
|
||||
(
|
||||
"Why include it? These negative poles can collapse into generic safety refusal, "
|
||||
"AI-role breaks, or persona echo instead of the intended behavioral contrast. "
|
||||
"This plot is a quick check for templates that move those hard axes without "
|
||||
"simply making the model refuse."
|
||||
),
|
||||
"",
|
||||
(
|
||||
"Caption: each dot is one template, averaged over the two refusal-probe axes "
|
||||
"and four clean models. Right is more on-axis movement; lower is less off-axis "
|
||||
"confounding. Numbered dots are the first rows of the appendix table."
|
||||
),
|
||||
(
|
||||
"`refusal_or_ai_break_rate` is only an output audit column: it marks completions "
|
||||
"that refused or broke AI role, and is not used to select this data slice."
|
||||
),
|
||||
(
|
||||
"Interactive hover plot: "
|
||||
|
||||
Reference in New Issue
Block a user