# mirror of https://github.com/wassname/persona-steering-template-library.git
# synced 2026-06-27 16:46:08 +08:00
# 242 lines, 7.6 KiB, Python
"""Build the Hugging Face dataset folder with parquet-only data files.

The HF dataset viewer cannot load a config whose splits mix JSONL, CSV, and TXT.
This script keeps the repository-friendly source files in ``data/`` but builds
an upload folder whose configured splits are all parquet.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pyarrow as pa
|
|
import pyarrow.parquet as pq
|
|
|
|
|
|
# ROOT is the directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Folder holding the source data files.
DATA = ROOT / "data"

# Tables that are stored as ``<name>.jsonl`` inside DATA.
_JSONL_TABLE_NAMES = (
    "template_stats",
    "template_pair_stats",
    "examples",
    "persona_pairs_v2_candidates",
    "scenarios_v2_candidates",
    "v2_pilot_seed23_template_stats",
    "v2_pilot_seed23_template_pair_stats",
    "v2_pilot_seed23_examples",
)

# Table name -> path of the JSONL file it is read from.
TABLE_SOURCES = {name: DATA / f"{name}.jsonl" for name in _JSONL_TABLE_NAMES}
|
|
|
|
|
|
def _jsonable(value: Any) -> Any:
    """Collapse nested containers to JSON text.

    A ``dict`` or ``list`` is returned as a JSON string with sorted keys and
    non-ASCII characters left unescaped; any other value is returned as-is.
    """
    if not isinstance(value, (dict, list)):
        return value
    return json.dumps(value, sort_keys=True, ensure_ascii=False)
|
|
|
|
|
|
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Text file with one JSON document per line.

    Returns:
        The parsed value of every non-blank line, in file order. Lines that
        are empty after stripping whitespace are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding: without it the platform's locale default is used,
    # which mis-decodes non-ASCII UTF-8 text on non-UTF-8 systems.
    with path.open(encoding="utf-8") as fh:
        stripped = (line.strip() for line in fh)
        return [json.loads(line) for line in stripped if line]
|
|
|
|
|
|
def _write_parquet(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as one parquet file.

    Each row is expanded to the sorted union of all keys seen across ``rows``
    (keys a row lacks become ``None``) and every value is passed through
    ``_jsonable`` before the table is built. An empty ``rows`` list yields a
    table built from an empty mapping. The parent directory of ``path`` is
    created if it does not exist.
    """
    if rows:
        # Iterating a dict yields its keys, so this is the union of all keys.
        columns = sorted(set().union(*rows))
        records = [
            {column: _jsonable(record.get(column)) for column in columns}
            for record in rows
        ]
        table = pa.Table.from_pylist(records)
    else:
        table = pa.table({})

    path.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, path)
|
|
|
|
|
|
def _template_rows(path: Path) -> list[dict[str, Any]]:
    """Turn a one-template-per-line text file into table rows.

    Args:
        path: Text file holding one template per line.

    Returns:
        One ``{"template_id": ..., "template": ...}`` dict per non-blank line.
        ``template_id`` is the zero-based index of the line within the file;
        blank lines are skipped but still counted, so ids may have gaps.
        ``template`` is the line with surrounding whitespace stripped.

    Raises:
        OSError: If the file cannot be read.
    """
    # Pin the encoding so templates with non-ASCII characters decode the same
    # way regardless of the platform's locale default.
    lines = path.read_text(encoding="utf-8").splitlines()
    stripped = (raw.strip() for raw in lines)
    return [
        {"template_id": index, "template": text}
        for index, text in enumerate(stripped)
        if text
    ]
|
|
|
|
|
|
def _metric_or(row: dict[str, Any], key: str, missing: float) -> float:
    """Return ``row[key]`` as a float, or ``missing`` when the value is absent."""
    value = row.get(key)
    # Only None / "" count as absent. A measured 0 is a real value and must
    # not be replaced by the fallback (``value or missing`` would do that).
    if value is None or value == "":
        return float(missing)
    return float(value)


def _pilot_rank_key(row: dict[str, Any]) -> tuple[bool, float, float, float, float]:
    """Sort key for pilot rows; a larger tuple means a better row."""
    return (
        bool(row.get("recommended")),
        _metric_or(row, "strict_pass_rate", 0),
        _metric_or(row, "mean_axis_delta", 0),
        # These two are negated so that smaller values rank higher; rows
        # lacking the metric are given a fallback of 99.
        -_metric_or(row, "mean_off_axis_problem", 99),
        -_metric_or(row, "mean_max_style_abs_delta", 99),
    )


def _persona_pair_review_rows() -> list[dict[str, Any]]:
    """Build one review row per candidate persona pair.

    Reads ``persona_pairs_v2_candidates.jsonl`` and
    ``v2_pilot_seed23_template_pair_stats.jsonl`` from ``DATA``, groups the
    pilot rows by their ``persona_pair`` field and matches them to each
    candidate pair through the pair's ``id``.

    Returns:
        One dict per candidate pair, in input order, holding the pair's
        descriptive fields, a ``proof_grade`` (``pilot_recommended`` when any
        pilot row is flagged recommended, ``pilot_measured_not_promoted`` when
        there is a best pilot row but none is recommended,
        ``candidate_unmeasured`` otherwise), a text ``proof_summary``, the
        stats of the best-ranked pilot row, and the JSON-encoded list of
        recommended template names (best-ranked first).

    Raises:
        KeyError: If a pilot row lacks ``persona_pair``, a candidate pair
            lacks one of the fields read with ``[...]``, or the best pilot
            row lacks a field used in the summary.
    """
    pairs = _read_jsonl(DATA / "persona_pairs_v2_candidates.jsonl")
    pilot = _read_jsonl(DATA / "v2_pilot_seed23_template_pair_stats.jsonl")

    by_pair: dict[str, list[dict[str, Any]]] = {}
    for row in pilot:
        by_pair.setdefault(row["persona_pair"], []).append(row)

    out = []
    for pair in pairs:
        # Best row first.
        rows = sorted(by_pair.get(pair["id"], []), key=_pilot_rank_key, reverse=True)
        best = rows[0] if rows else {}
        recommended = [r["template"] for r in rows if r.get("recommended")]

        if recommended:
            proof_grade = "pilot_recommended"
        elif best:
            proof_grade = "pilot_measured_not_promoted"
        else:
            proof_grade = "candidate_unmeasured"

        if best:
            proof_summary = (
                f"best_template={best['template']}; "
                f"n={best['n']}; pass={best['strict_pass_rate']}; "
                f"axis_delta={best['mean_axis_delta']}; "
                f"off_axis={best['mean_off_axis_problem']}; "
                f"style={best['mean_max_style_abs_delta']}"
            )
        else:
            proof_summary = "no measured v2 pilot rows yet"

        out.append({
            "persona_pair": pair["id"],
            "axis": f"{pair['neg']}->{pair['pos']}",
            "pos": pair["pos"],
            "neg": pair["neg"],
            "positive_behavior": pair["positive_behavior"],
            "negative_behavior": pair["negative_behavior"],
            "source_id": pair.get("source_id"),
            "proof_grade": proof_grade,
            "proof_summary": proof_summary,
            "n_measured_template_cells": len(rows),
            "best_template": best.get("template"),
            "best_n": best.get("n"),
            "best_strict_pass_rate": best.get("strict_pass_rate"),
            "best_axis_delta": best.get("mean_axis_delta"),
            "best_off_axis_problem": best.get("mean_off_axis_problem"),
            "best_max_style_abs_delta": best.get("mean_max_style_abs_delta"),
            "recommended_templates": json.dumps(recommended, ensure_ascii=False),
        })
    return out
|
|
|
|
|
|
def _readme(parquet_names: list[str]) -> str:
    """Render the dataset card (``README.md``) text for the upload folder.

    Args:
        parquet_names: Split names. Each one is listed as a split of the
            ``default`` config with the path ``parquet/<name>.parquet``.

    Returns:
        The README text: YAML front matter followed by the markdown body.
    """
    # Indentation is significant in the YAML front matter: ``data_files`` has
    # to be nested inside the ``default`` config entry, and ``path`` has to
    # line up with ``split`` so both keys belong to the same list item.
    config_lines = "\n".join(
        f"  - split: {name}\n    path: parquet/{name}.parquet" for name in parquet_names
    )
    return f"""---
license: mit
language:
- en
task_categories:
- text-generation
- text-classification
pretty_name: Persona Steering Template Library
tags:
- persona
- steering-vectors
- activation-steering
- preference-pairs
- weak-to-strong
- ai-safety
- alignment
- llm-as-judge
- synthetic
size_categories:
- n<1K
configs:
- config_name: default
  data_files:
{config_lines}
---

# Persona Steering Template Library

GitHub repository: https://github.com/wassname/persona-steering-template-library

Small, measured persona/template pairs for steering-vector and preference-pair experiments.

## Example

```text
template:
You are a {{persona}} person thinking through the situation.

negative persona:
authority-deferential even when wellbeing suffers

positive persona:
wellbeing-focused even when authority-defying

measured pilot:
strict_pass_rate = 0.75
mean_axis_delta = 6.25
mean_off_axis_problem = 2.00
mean_max_style_abs_delta = 1.50
```

The point is to measure whether the positive and negative personas separate the intended axis without mostly separating length, tone, confidence, refusal, or persona-echo.

## What To Browse

Start with `persona_pairs_v2_review`.

That table gives one row per persona pair:

- `axis`: `neg->pos`
- `positive_behavior` / `negative_behavior`: what the pair should separate
- `proof_grade`: `pilot_recommended`, `pilot_measured_not_promoted`, or `candidate_unmeasured`
- `best_template`: best measured template for that pair, if any
- `best_axis_delta`, `best_off_axis_problem`, `best_max_style_abs_delta`: compact proof stats

Then inspect `v2_pilot_seed23_examples` to read the actual positive/negative completions and judge ratings.

## Current Status

Preliminary. The current pilot is small: 4 persona pairs x 4 templates x 4 scenarios. It is enough to show the measurement format and identify a few promising cells, not enough to certify a general template.

Counts:

- 16 v2 candidate persona pairs
- 12 v2 candidate templates
- 12 v2 candidate scenarios
- v2 pilot: 64 planned pairs, 59 successful judged pairs, 5 judge JSON failures
"""
|
|
|
|
|
|
def main() -> None:
    """Build the upload folder: one parquet file per table plus ``README.md``.

    Command-line options:
        --out: Destination folder (default
            ``/tmp/persona-steering-template-library-hf``). Anything already
            at that path is removed with ``shutil.rmtree`` before building.

    Prints the output folder and the row count of every table when done.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=Path, default=Path("/tmp/persona-steering-template-library-hf"))
    args = ap.parse_args()

    # Start from an empty folder so files from an earlier build cannot linger.
    if args.out.exists():
        shutil.rmtree(args.out)
    parquet_dir = args.out / "parquet"
    parquet_dir.mkdir(parents=True)

    tables = {name: _read_jsonl(path) for name, path in TABLE_SOURCES.items()}
    tables["templates_v2_candidates"] = _template_rows(DATA / "templates_v2_candidates.txt")
    tables["persona_pairs_v2_review"] = _persona_pair_review_rows()

    for name, rows in tables.items():
        _write_parquet(parquet_dir / f"{name}.parquet", rows)

    names = sorted(tables)
    # Write the card as UTF-8 explicitly instead of relying on the platform's
    # locale default encoding.
    (args.out / "README.md").write_text(_readme(names), encoding="utf-8")
    print(f"built {args.out}")
    for name in names:
        print(f"{name}: {len(tables[name])} rows")
|
|
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|