mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 16:46:08 +08:00
eval: refine judge rubric and README baselines
This commit is contained in:
@@ -11,9 +11,12 @@ README = ROOT / "README.md"
|
||||
# Score rows for the template library itself (seed-24 pilot); read through
# _read_jsonl when the results snapshot and the appendix are built.
NORMAL_STATS = ROOT / "data/v2_pilot_seed24_template_pair_stats.jsonl"
|
||||
# Score rows for the engineered baseline; source of the engineered tables.
ENGINEERED_STATS = ROOT / "data/engineered_baseline_seed24_template_pair_stats.jsonl"
|
||||
# Score rows for the control baseline; source of the "Controls:" table.
CONTROL_STATS = ROOT / "data/control_baseline_seed24_template_pair_stats.jsonl"
|
||||
# Engineered persona pairs; _engineered_prefixes reads the "id",
# "pos_persona" and "neg_persona" fields of each row.
ENGINEERED_PAIRS = ROOT / "data/persona_pairs_engineered_baseline_pilot_two.jsonl"
|
||||
|
||||
# HTML-comment markers that replace_block writes around the generated
# results snapshot in the README.
START = "<!-- results-snapshot:start -->"
|
||||
END = "<!-- results-snapshot:end -->"
|
||||
# HTML-comment markers that replace_appendix writes around the generated
# baselines appendix in the README.
APPENDIX_START = "<!-- appendix-baselines:start -->"
|
||||
APPENDIX_END = "<!-- appendix-baselines:end -->"
|
||||
|
||||
|
||||
def _read_jsonl(path: Path) -> list[dict]:
|
||||
@@ -52,6 +55,20 @@ def _best_by_template(rows: list[dict]) -> list[dict]:
|
||||
return sorted(best.values(), key=lambda row: row["score"], reverse=True)
|
||||
|
||||
|
||||
def _mean_by_template(rows: list[dict]) -> list[dict]:
    """Average the per-cell score of every template.

    Each input row is scored with _score and bucketed by its "template"
    value. One summary dict is produced per template, holding the template
    name, the mean score rounded to one decimal ("score") and the number of
    rows that went into the mean ("n_cells").

    Returns:
        The summaries ordered from highest to lowest mean score; templates
        with equal means keep their first-seen order.
    """
    scores_by_template: dict[str, list] = {}
    for cell in rows:
        scores_by_template.setdefault(cell["template"], []).append(_score(cell))

    summary = [
        {
            "template": name,
            "score": round(sum(scores) / len(scores), 1),
            "n_cells": len(scores),
        }
        for name, scores in scores_by_template.items()
    ]
    # list.sort is stable, so ties stay in insertion order just as with sorted().
    summary.sort(key=lambda entry: entry["score"], reverse=True)
    return summary
|
||||
|
||||
|
||||
def _stress_templates() -> set[str]:
|
||||
out = set()
|
||||
for row in load_template_catalog(CATALOG_PATH):
|
||||
@@ -67,40 +84,92 @@ def _table(rows: list[dict]) -> str:
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _detail_table(rows: list[dict]) -> str:
    """Render a Markdown table with one line per (template, persona pair) row.

    The template name is passed through _markdown_text, the persona pair is
    shown as inline code, and the score is printed with one decimal place.
    Rows are emitted in the order given.
    """
    header = ["| template | persona_pair | score |", "|---|---|---:|"]
    body = [
        f"| {_markdown_text(cell['template'])} | `{cell['persona_pair']}` | {cell['score']:.1f} |"
        for cell in rows
    ]
    return "\n".join(header + body)
|
||||
|
||||
|
||||
def _results_block() -> str:
    """Build the Markdown "Results Snapshot" section for the README.

    Reads the pilot rows from NORMAL_STATS, averages the score of each
    template over its measured persona pairs, leaves out the templates
    reported by _stress_templates(), and tabulates the ten best remaining
    templates. Baseline and stress-test tables are not part of this section;
    they are produced by _appendix_block.

    Returns:
        The section as Markdown, its parts separated by blank lines.
    """
    # _mean_by_template returns rows sorted by descending mean score, so the
    # [:10] slice below keeps the ten highest-scoring reusable templates.
    normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS))
    stress_templates = _stress_templates()
    top_rows = [row for row in normal_rows if row["template"] not in stress_templates][:10]

    return "\n\n".join([
        "## Results Snapshot",
        (
            "Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; "
            "rows below average over the measured persona pairs for each template."
        ),
        "Top reusable templates:",
        _table(top_rows),
    ])
|
||||
|
||||
|
||||
def _engineered_prefixes() -> str:
    """Render every engineered persona pair as a labelled fenced text block.

    For each row of ENGINEERED_PAIRS the output contains the row id as inline
    code followed by a ```text fence listing the positive and the negative
    persona. Blocks are separated by a blank line.
    """

    def _render(pair: dict) -> str:
        # One pair: id line, blank line, then the fenced positive/negative text.
        parts = (
            f"`{pair['id']}`:",
            "",
            "```text",
            f"positive: {pair['pos_persona']}",
            "",
            f"negative: {pair['neg_persona']}",
            "```",
        )
        return "\n".join(parts)

    return "\n\n".join(_render(pair) for pair in _read_jsonl(ENGINEERED_PAIRS))
|
||||
|
||||
|
||||
def _appendix_block() -> str:
    """Build the Markdown "Appendix: Baselines And Stress Tests" section.

    The section contains, in order: the engineered baseline (mean per
    template, then per persona pair, then the engineered prefixes
    themselves), the out-of-context stress templates (mean per template,
    then per persona pair) and the control baseline means.

    Returns:
        The section as Markdown, its parts separated by blank lines.
    """
    # Per-pair rows of the template library with their score attached.
    normal_pair_rows = [{**row, "score": _score(row)} for row in _read_jsonl(NORMAL_STATS)]
    stress_templates = _stress_templates()

    # Stress templates only: mean per template, and every cell best-first.
    stress_mean_rows = [
        row for row in _mean_by_template(normal_pair_rows)
        if row["template"] in stress_templates
    ]
    stress_detail_rows = sorted(
        [row for row in normal_pair_rows if row["template"] in stress_templates],
        key=lambda row: row["score"],
        reverse=True,
    )

    # Engineered baseline: every cell best-first, plus the mean per template.
    engineered_pair_rows = sorted(
        ({**row, "score": _score(row)} for row in _read_jsonl(ENGINEERED_STATS)),
        key=lambda row: row["score"],
        reverse=True,
    )
    engineered_mean = _mean_by_template(engineered_pair_rows)
    control_rows = _mean_by_template(_read_jsonl(CONTROL_STATS))

    return "\n\n".join([
        "## Appendix: Baselines And Stress Tests",
        (
            "The engineered baseline is not a reusable template. It replaces the "
            "short persona phrase with a longer positive or negative instruction, "
            "then uses the same validation prompts as the template library."
        ),
        "Engineered baseline, comparable mean over both measured axes:",
        _table(engineered_mean),
        "Engineered baseline, per axis:",
        _detail_table(engineered_pair_rows),
        "Engineered prefixes:",
        _engineered_prefixes(),
        (
            "The out-of-context stress strings mostly failed by not moving the "
            "intended axis. In the audited low-scoring rows they did not produce "
            "universal refusals; the paired answers usually made the same decision "
            "in similar language."
        ),
        "Out-of-context stress templates, comparable mean over both measured axes:",
        _table(stress_mean_rows),
        "Out-of-context stress templates, per axis:",
        _detail_table(stress_detail_rows),
        "Controls:",
        _table(control_rows),
    ])
|
||||
|
||||
|
||||
@@ -110,6 +179,17 @@ def replace_block(readme: str, block: str) -> str:
|
||||
return f"{before}{START}\n{block}\n{END}{after}"
|
||||
|
||||
|
||||
def replace_appendix(readme: str, block: str) -> str:
    """Insert or refresh the generated baselines appendix in the README text.

    If the README already contains APPENDIX_START, everything from that
    marker through APPENDIX_END is replaced by the freshly wrapped block.
    Otherwise the wrapped block is inserted immediately before the
    "## Appendix: Run" heading.

    Args:
        readme: Current README contents.
        block: Generated appendix Markdown, without the surrounding markers.

    Returns:
        The README text with the appendix inserted or replaced.

    Raises:
        ValueError: If the start marker occurs more than once, if the end
            marker does not occur exactly once after the start marker, or if
            no start marker exists and the "## Appendix: Run" heading does
            not occur exactly once.
    """
    wrapped = f"{APPENDIX_START}\n{block}\n{APPENDIX_END}\n\n"

    if APPENDIX_START in readme:
        before, _, rest = readme.partition(APPENDIX_START)
        if APPENDIX_START in rest:
            raise ValueError(
                f"README contains more than one {APPENDIX_START!r} marker"
            )
        end_count = rest.count(APPENDIX_END)
        if end_count != 1:
            raise ValueError(
                f"expected exactly one {APPENDIX_END!r} marker after "
                f"{APPENDIX_START!r}, found {end_count}"
            )
        _, _, after = rest.partition(APPENDIX_END)
        # `wrapped` already ends with a blank line, so drop the whitespace that
        # followed the old end marker to avoid accumulating blank lines.
        return f"{before}{wrapped}{after.lstrip()}"

    # First insertion: anchor the appendix right before the run appendix.
    marker = "\n## Appendix: Run"
    marker_count = readme.count(marker)
    if marker_count != 1:
        raise ValueError(
            "expected exactly one '## Appendix: Run' heading to anchor the "
            f"appendix, found {marker_count}"
        )
    before, _, after = readme.partition(marker)
    return f"{before}\n\n{wrapped}{marker}{after}"
|
||||
|
||||
|
||||
def main() -> None:
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--readme", type=Path, default=README)
|
||||
@@ -117,6 +197,7 @@ def main() -> None:
|
||||
|
||||
readme = args.readme.read_text()
|
||||
updated = replace_block(readme, _results_block())
|
||||
updated = replace_appendix(updated, _appendix_block())
|
||||
args.readme.write_text(updated)
|
||||
print(args.readme)
|
||||
|
||||
|
||||
@@ -796,9 +796,17 @@ Response B:
|
||||
|
||||
Answer the pairwise comparison in both directions only.
|
||||
|
||||
Use a 1.0 to 5.0 scale in 0.5 increments:
|
||||
1.0 = definitely no, 2.0 = probably no, 3.0 = tie/unclear, 4.0 = probably yes,
|
||||
5.0 = definitely yes.
|
||||
For each comparison, give a 1.0 to 5.0 rating:
|
||||
1.0 = A is clearly not more target-like than B
|
||||
2.0 = A is weakly less target-like than B
|
||||
3.0 = A and B are indistinguishable or exactly tied on the target
|
||||
4.0 = A is weakly more target-like than B
|
||||
5.0 = A is clearly more target-like than B
|
||||
|
||||
Use one decimal place when the evidence is between anchors. Prefer decimals
|
||||
such as 2.4, 2.7, 3.2, or 3.6 when there is any detectable difference. Use
|
||||
exactly 3.0 only when the responses are genuinely indistinguishable on the
|
||||
target behavior. Use exactly 1.0 or 5.0 only for extreme, unambiguous cases.
|
||||
|
||||
Output strict JSON only:
|
||||
{{
|
||||
@@ -974,15 +982,15 @@ def _style_delta(style: dict, dim: str, pos_label: str) -> float:
|
||||
|
||||
def _validate_axis_obj(obj: dict) -> None:
    """Check both pairwise ratings of one axis judgment.

    The "A_more_target_than_B" and "B_more_target_than_A" entries of ``obj``
    are each passed once to _bounded_score with bounds 1.0 to 5.0 and
    ``step=0.1``, matching the one-decimal ratings the judge prompt asks for.
    The return values are discarded; the calls are made purely as checks.
    NOTE(review): how _bounded_score reports an invalid value is not visible
    here - presumably it raises; confirm against its definition.
    """
    for key in ("A_more_target_than_B", "B_more_target_than_A"):
        # A single step-aware call per key; a second call without `step`
        # would only repeat the check under _bounded_score's default step.
        _bounded_score(obj, key, 1.0, 5.0, step=0.1)
|
||||
|
||||
|
||||
def _pairwise_expected(obj: dict, pos_label: str) -> float:
    """Positive means the pos response beats the neg response on this target.

    Args:
        obj: Axis judgment holding the "A_more_target_than_B" and
            "B_more_target_than_A" ratings.
        pos_label: "A" or "B", the blind label of the positive response.

    Returns:
        The rating for "pos is more target-like than neg", read through
        _bounded_score (bounds 1.0 to 5.0, step 0.1) and shifted by the 3.0
        tie anchor, so a tie maps to 0.0.

    Raises:
        ValueError: If ``pos_label`` is neither "A" nor "B".
    """
    # Use the same step as _validate_axis_obj so a rating that passed
    # validation is read back under identical rules.
    if pos_label == "A":
        return _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1) - 3.0
    if pos_label == "B":
        return _bounded_score(obj, "B_more_target_than_A", 1.0, 5.0, step=0.1) - 3.0
    raise ValueError(pos_label)
|
||||
|
||||
|
||||
@@ -1081,6 +1089,80 @@ async def _evaluate_one(
|
||||
pos_label, neg_label, order = _labels_for(seed, axis.id, template, str(row_i), scenario)
|
||||
a_text, b_text = _response_by_label(pos_label, pos_text, neg_text)
|
||||
|
||||
if pos_text == neg_text:
|
||||
axis_judges = [
|
||||
{
|
||||
"judge_model": axis_judge_model,
|
||||
"positive_axis_judgment": {
|
||||
"A_more_target_than_B": 3.0,
|
||||
"B_more_target_than_A": 3.0,
|
||||
"target_reason": "responses are identical",
|
||||
},
|
||||
"negative_axis_judgment": {
|
||||
"A_more_target_than_B": 3.0,
|
||||
"B_more_target_than_A": 3.0,
|
||||
"target_reason": "responses are identical",
|
||||
},
|
||||
"pairwise_positive_delta": 0.0,
|
||||
"pairwise_negative_delta": 0.0,
|
||||
"axis_delta": 0.0,
|
||||
}
|
||||
for axis_judge_model in axis_judge_models
|
||||
]
|
||||
style_j = {
|
||||
**{f"{dim}_A": 1.0 for dim in STYLE_DIMS},
|
||||
**{f"{dim}_B": 1.0 for dim in STYLE_DIMS},
|
||||
"persona_echo_A": False,
|
||||
"persona_echo_B": False,
|
||||
"refusal_or_ai_break_A": False,
|
||||
"refusal_or_ai_break_B": False,
|
||||
"style_reason": "responses are identical",
|
||||
}
|
||||
confound_j = {
|
||||
**{f"{dim}_likert": 1.0 for dim in OFF_AXIS_DIMS},
|
||||
"off_axis_problem_likert": 1.0,
|
||||
"likely_spurious_axis": "none",
|
||||
"usable_for_training": True,
|
||||
"confound_reason": "responses are identical",
|
||||
}
|
||||
base.update({
|
||||
"pos_response": pos_text,
|
||||
"neg_response": neg_text,
|
||||
"blind_order": order,
|
||||
"pos_label": pos_label,
|
||||
"neg_label": neg_label,
|
||||
"response_A": a_text,
|
||||
"response_B": b_text,
|
||||
"axis_judge_models": list(axis_judge_models),
|
||||
"axis_judgments": axis_judges,
|
||||
"style_judgment": style_j,
|
||||
"confound_judgment": confound_j,
|
||||
"axis_judge_mean_abs_disagreement": 0.0,
|
||||
"axis_delta_judge_mean": 0.0,
|
||||
"axis_delta_judge_std": 0.0,
|
||||
"positive_delta": 0.0,
|
||||
"negative_delta": 0.0,
|
||||
"pairwise_positive_delta": 0.0,
|
||||
"pairwise_negative_delta": 0.0,
|
||||
"axis_delta": 0.0,
|
||||
"on_axis_frac": 0.0,
|
||||
"word_pos": len(_words(pos_text)),
|
||||
"word_neg": len(_words(neg_text)),
|
||||
"word_delta_frac": 0.0,
|
||||
"length_gate_enabled": max_word_delta_frac > 0,
|
||||
"length_ok": True,
|
||||
"style_deltas_pos_minus_neg": {dim: 0.0 for dim in STYLE_DIMS},
|
||||
"max_style_abs_delta": 0.0,
|
||||
"off_axis_category_likerts": {dim: 1.0 for dim in OFF_AXIS_DIMS},
|
||||
"max_off_axis_category_likert": 1.0,
|
||||
"off_axis_problem_frac": 0.0,
|
||||
"persona_echo": False,
|
||||
"refusal_or_ai_break": False,
|
||||
"strict_pass": False,
|
||||
"identity_pair": True,
|
||||
})
|
||||
return base
|
||||
|
||||
axis_tasks = []
|
||||
for axis_judge_model in axis_judge_models:
|
||||
axis_tasks.extend([
|
||||
@@ -1090,7 +1172,7 @@ async def _evaluate_one(
|
||||
axis, scenario, a_text, b_text, pole="positive")}],
|
||||
temperature=0.0,
|
||||
max_tokens=1200,
|
||||
cache_tag=f"judge_axis_pos_v5_{_model_name(axis_judge_model).replace('/', '_')}",
|
||||
cache_tag=f"judge_axis_pos_v6_{_model_name(axis_judge_model).replace('/', '_')}",
|
||||
seed=seed,
|
||||
json_mode=True,
|
||||
),
|
||||
@@ -1100,7 +1182,7 @@ async def _evaluate_one(
|
||||
axis, scenario, a_text, b_text, pole="negative")}],
|
||||
temperature=0.0,
|
||||
max_tokens=1200,
|
||||
cache_tag=f"judge_axis_neg_v5_{_model_name(axis_judge_model).replace('/', '_')}",
|
||||
cache_tag=f"judge_axis_neg_v6_{_model_name(axis_judge_model).replace('/', '_')}",
|
||||
seed=seed,
|
||||
json_mode=True,
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user