persona-steering-template-l…/scripts/build_hf_dataset.py

"""Build the Hugging Face dataset folder with parquet-only data files.

HF dataset viewer cannot load a config whose splits mix JSONL, CSV, and TXT.
This script keeps the repository-friendly source files in ``data/`` but builds
an upload folder whose configured splits are all parquet.
"""
from __future__ import annotations

import argparse
import json
import shutil
from pathlib import Path
from typing import Any

import pyarrow as pa
import pyarrow.parquet as pq


# Parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Repository-friendly source files (JSONL / TXT) live here.
DATA = ROOT / "data"


# Split name -> JSONL source file. Every split is read from
# ``data/<split name>.jsonl``; the tuple order is the dict order.
TABLE_SOURCES = {
    name: DATA / f"{name}.jsonl"
    for name in (
        "template_stats",
        "template_pair_stats",
        "examples",
        "persona_pairs_v2_candidates",
        "scenarios_v2_candidates",
        "v2_pilot_seed23_template_stats",
        "v2_pilot_seed23_template_pair_stats",
        "v2_pilot_seed23_examples",
    )
}


def _jsonable(value: Any) -> Any:
    """Serialise nested containers to a JSON string; pass scalars through.

    A ``dict`` or ``list`` is returned as JSON text with sorted keys and
    non-ASCII characters left unescaped. Any other value is returned
    unchanged.
    """
    if not isinstance(value, (dict, list)):
        return value
    return json.dumps(value, sort_keys=True, ensure_ascii=False)


def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: File holding one JSON document per line.

    Returns:
        The parsed value of every non-blank line, in file order. Lines that
        are empty or whitespace-only are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``<path>:<1-based line number>`` so the
            offending line can be found.
    """
    rows = []
    # Decode as UTF-8 explicitly instead of relying on the locale default, so
    # non-ASCII text loads the same way on every platform.
    with path.open(encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            text = raw.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as err:
                # Same exception type as before, with the location added.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
            rows.append(record)
    return rows


def _write_parquet(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as a single parquet file.

    The column set is the sorted union of keys over all rows; a row that
    lacks a key gets ``None`` in that column. Each value goes through
    ``_jsonable`` first, so dict and list values are stored as JSON strings.
    When *rows* is empty the table is built from an empty mapping. Missing
    parent directories of *path* are created.
    """
    if rows:
        columns: set[str] = set()
        for record in rows:
            columns.update(record)
        ordered = sorted(columns)
        flat_records = [
            {column: _jsonable(record.get(column)) for column in ordered}
            for record in rows
        ]
        table = pa.Table.from_pylist(flat_records)
    else:
        table = pa.table({})
    path.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, path)


def _template_rows(path: Path) -> list[dict[str, Any]]:
    """Turn a one-template-per-line text file into table rows.

    Args:
        path: Text file with one template per line.

    Returns:
        One ``{"template_id", "template"}`` dict per non-blank line, in file
        order. ``template`` is the line with surrounding whitespace removed.
        ``template_id`` is the zero-based index of the line in the file, so
        blank lines leave gaps in the ids.
    """
    rows = []
    # Decode as UTF-8 explicitly rather than with the locale default so the
    # templates read identically on every platform.
    for index, line in enumerate(path.read_text(encoding="utf-8").splitlines()):
        template = line.strip()
        if template:
            rows.append({"template_id": index, "template": template})
    return rows


def _pilot_stat(row: dict[str, Any], key: str, missing: float) -> float:
    """Return ``row[key]`` as a float, or *missing* when it is absent or null."""
    value = row.get(key)
    # Test for None explicitly: a measured 0 is a real value, not a gap.
    return float(missing if value is None else value)


def _pilot_rank_key(row: dict[str, Any]) -> tuple[bool, float, float, float, float]:
    """Sort key for a pilot row; a larger tuple means a better row.

    Recommended rows come first, then higher ``strict_pass_rate``, then higher
    ``mean_axis_delta``. The last two stats are negated so that lower values
    rank higher; a missing value counts as 99 there and as 0 for the first
    two stats.
    """
    return (
        bool(row.get("recommended")),
        _pilot_stat(row, "strict_pass_rate", 0),
        _pilot_stat(row, "mean_axis_delta", 0),
        -_pilot_stat(row, "mean_off_axis_problem", 99),
        -_pilot_stat(row, "mean_max_style_abs_delta", 99),
    )


def _persona_pair_review_rows() -> list[dict[str, Any]]:
    """Build one review row per v2 candidate persona pair.

    Reads ``persona_pairs_v2_candidates.jsonl`` and
    ``v2_pilot_seed23_template_pair_stats.jsonl`` from ``DATA``, groups the
    pilot rows by their ``persona_pair`` value and matches each group to the
    candidate pair with that ``id``.

    Returns:
        One dict per candidate pair, in file order. It holds the pair
        description, a ``proof_grade`` (``pilot_recommended``,
        ``pilot_measured_not_promoted`` or ``candidate_unmeasured``), a
        one-line ``proof_summary``, the stats of the best-ranked pilot row
        (``None`` when there is none) and the JSON-encoded list of
        recommended templates.

    Raises:
        KeyError: If a pair lacks a required field, a pilot row lacks
            ``persona_pair``, a recommended row lacks ``template``, or the
            best row lacks a stat used in the summary.
    """
    pairs = _read_jsonl(DATA / "persona_pairs_v2_candidates.jsonl")
    pilot = _read_jsonl(DATA / "v2_pilot_seed23_template_pair_stats.jsonl")
    by_pair: dict[str, list[dict[str, Any]]] = {}
    for row in pilot:
        by_pair.setdefault(row["persona_pair"], []).append(row)

    out = []
    for pair in pairs:
        # Best row first; rows with equal keys keep their file order.
        rows = sorted(by_pair.get(pair["id"], []), key=_pilot_rank_key, reverse=True)
        best = rows[0] if rows else {}
        recommended = [r["template"] for r in rows if r.get("recommended")]
        if recommended:
            proof_grade = "pilot_recommended"
        elif best:
            proof_grade = "pilot_measured_not_promoted"
        else:
            proof_grade = "candidate_unmeasured"

        if best:
            proof_summary = (
                f"best_template={best['template']}; "
                f"n={best['n']}; pass={best['strict_pass_rate']}; "
                f"axis_delta={best['mean_axis_delta']}; "
                f"off_axis={best['mean_off_axis_problem']}; "
                f"style={best['mean_max_style_abs_delta']}"
            )
        else:
            proof_summary = "no measured v2 pilot rows yet"

        out.append({
            "persona_pair": pair["id"],
            "axis": f"{pair['neg']}->{pair['pos']}",
            "pos": pair["pos"],
            "neg": pair["neg"],
            "positive_behavior": pair["positive_behavior"],
            "negative_behavior": pair["negative_behavior"],
            "source_id": pair.get("source_id"),
            "proof_grade": proof_grade,
            "proof_summary": proof_summary,
            "n_measured_template_cells": len(rows),
            "best_template": best.get("template"),
            "best_n": best.get("n"),
            "best_strict_pass_rate": best.get("strict_pass_rate"),
            "best_axis_delta": best.get("mean_axis_delta"),
            "best_off_axis_problem": best.get("mean_off_axis_problem"),
            "best_max_style_abs_delta": best.get("mean_max_style_abs_delta"),
            "recommended_templates": json.dumps(recommended, ensure_ascii=False),
        })
    return out


def _readme(parquet_names: list[str]) -> str:
    """Render the dataset card (YAML front matter plus markdown body).

    Each name in *parquet_names* becomes one split of the ``default`` config,
    pointing at ``parquet/<name>.parquet``. The markdown body is fixed text.
    """
    entry_lines = []
    for split in parquet_names:
        entry_lines.append(f"  - split: {split}")
        entry_lines.append(f"    path: parquet/{split}.parquet")
    data_files = "\n".join(entry_lines)

    front_matter = f"""---
license: mit
language:
- en
task_categories:
- text-generation
- text-classification
pretty_name: Persona Steering Template Library
tags:
- persona
- steering-vectors
- activation-steering
- preference-pairs
- weak-to-strong
- ai-safety
- alignment
- llm-as-judge
- synthetic
size_categories:
- n<1K
configs:
- config_name: default
  data_files:
{data_files}
---
"""

    # Plain (non-f) string: braces below are literal.
    body = """
# Persona Steering Template Library

GitHub repository: https://github.com/wassname/persona-steering-template-library

Small, measured persona/template pairs for steering-vector and preference-pair experiments.

## Example

```text
template:
  You are a {persona} person thinking through the situation.

negative persona:
  authority-deferential even when wellbeing suffers

positive persona:
  wellbeing-focused even when authority-defying

measured pilot:
  strict_pass_rate = 0.75
  mean_axis_delta = 6.25
  mean_off_axis_problem = 2.00
  mean_max_style_abs_delta = 1.50
```

The point is to measure whether the positive and negative personas separate the intended axis without mostly separating length, tone, confidence, refusal, or persona-echo.

## What To Browse

Start with `persona_pairs_v2_review`.

That table gives one row per persona pair:

- `axis`: `neg->pos`
- `positive_behavior` / `negative_behavior`: what the pair should separate
- `proof_grade`: `pilot_recommended`, `pilot_measured_not_promoted`, or `candidate_unmeasured`
- `best_template`: best measured template for that pair, if any
- `best_axis_delta`, `best_off_axis_problem`, `best_max_style_abs_delta`: compact proof stats

Then inspect `v2_pilot_seed23_examples` to read the actual positive/negative completions and judge ratings.

## Current Status

Preliminary. The current pilot is small: 4 persona pairs x 4 templates x 4 scenarios. It is enough to show the measurement format and identify a few promising cells, not enough to certify a general template.

Counts:

- 16 v2 candidate persona pairs
- 12 v2 candidate templates
- 12 v2 candidate scenarios
- v2 pilot: 64 planned pairs, 59 successful judged pairs, 5 judge JSON failures
"""
    return front_matter + body


def main() -> None:
    """Build the upload folder: one parquet file per split plus ``README.md``.

    The output directory comes from ``--out`` (default
    ``/tmp/persona-steering-template-library-hf``). An existing directory at
    that path is deleted before the build. A summary with the row count of
    each split is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=Path("/tmp/persona-steering-template-library-hf"))
    out_dir = parser.parse_args().out

    # Remove any previous build so the folder holds only this run's output.
    if out_dir.exists():
        shutil.rmtree(out_dir)
    parquet_dir = out_dir / "parquet"
    parquet_dir.mkdir(parents=True)

    # JSONL-backed splits first, then the two derived tables.
    tables = {}
    for split, source in TABLE_SOURCES.items():
        tables[split] = _read_jsonl(source)
    tables["templates_v2_candidates"] = _template_rows(DATA / "templates_v2_candidates.txt")
    tables["persona_pairs_v2_review"] = _persona_pair_review_rows()

    for split, rows in tables.items():
        _write_parquet(parquet_dir / f"{split}.parquet", rows)

    split_names = sorted(tables)
    readme_path = out_dir / "README.md"
    readme_path.write_text(_readme(split_names))

    print(f"built {out_dir}")
    for split in split_names:
        print(f"{split}: {len(tables[split])} rows")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()