eval: refine judge rubric and README baselines

This commit is contained in:
wassname
2026-06-13 18:24:06 +08:00
parent ede354f07a
commit e2546fe0ab
4 changed files with 288 additions and 76 deletions
+102 -21
View File
@@ -11,9 +11,12 @@ README = ROOT / "README.md"
# JSONL stats files; rows are read for their "template" and "persona_pair"
# fields and scored via _score(). The main pilot file feeds the
# "Seed-24 pilot" results snapshot.
NORMAL_STATS = ROOT / "data/v2_pilot_seed24_template_pair_stats.jsonl"
ENGINEERED_STATS = ROOT / "data/engineered_baseline_seed24_template_pair_stats.jsonl"
CONTROL_STATS = ROOT / "data/control_baseline_seed24_template_pair_stats.jsonl"
# Engineered persona pair definitions; rows are read for "id", "pos_persona"
# and "neg_persona" when rendering the engineered prefixes.
ENGINEERED_PAIRS = ROOT / "data/persona_pairs_engineered_baseline_pilot_two.jsonl"
# HTML-comment markers delimiting the generated results snapshot in the README.
START = "<!-- results-snapshot:start -->"
END = "<!-- results-snapshot:end -->"
# HTML-comment markers delimiting the generated baselines appendix in the README
# (consumed by replace_appendix).
APPENDIX_START = "<!-- appendix-baselines:start -->"
APPENDIX_END = "<!-- appendix-baselines:end -->"
def _read_jsonl(path: Path) -> list[dict]:
@@ -52,6 +55,20 @@ def _best_by_template(rows: list[dict]) -> list[dict]:
return sorted(best.values(), key=lambda row: row["score"], reverse=True)
def _mean_by_template(rows: list[dict]) -> list[dict]:
    """Average the per-row score for every template.

    Each input row is scored with ``_score`` and bucketed by its
    ``"template"`` value. One summary dict is produced per template with the
    mean score (rounded to one decimal) and the number of rows averaged.

    Returns:
        Summary dicts with keys ``template``, ``score`` and ``n_cells``,
        ordered from highest to lowest mean score.
    """
    scores_by_template: dict[str, list] = {}
    for record in rows:
        scores_by_template.setdefault(record["template"], []).append(_score(record))

    summary = [
        {
            "template": name,
            "score": round(sum(scores) / len(scores), 1),
            "n_cells": len(scores),
        }
        for name, scores in scores_by_template.items()
    ]
    # list.sort is stable, so templates with equal means keep first-seen order.
    summary.sort(key=lambda entry: entry["score"], reverse=True)
    return summary
def _stress_templates() -> set[str]:
out = set()
for row in load_template_catalog(CATALOG_PATH):
@@ -67,40 +84,92 @@ def _table(rows: list[dict]) -> str:
return "\n".join(lines)
def _detail_table(rows: list[dict]) -> str:
    """Format per-pair rows as a three-column markdown table.

    Every row must provide ``template``, ``persona_pair`` and a numeric
    ``score``; the score is printed with one decimal place and the persona
    pair is wrapped in backticks.
    """
    header = ["| template | persona_pair | score |", "|---|---|---:|"]
    body = [
        f"| {_markdown_text(entry['template'])} | `{entry['persona_pair']}` | {entry['score']:.1f} |"
        for entry in rows
    ]
    return "\n".join(header + body)
def _results_block() -> str:
    """Render the README "Results Snapshot" section as markdown.

    Reads the pilot stats, averages each template's score over its measured
    persona pairs, filters out the out-of-context stress templates, and
    tabulates the ten best remaining templates. Engineered baselines,
    controls and stress templates are reported by the appendix block rather
    than here.

    Returns:
        The section as markdown, paragraphs separated by blank lines.
    """
    normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS))
    stress_templates = _stress_templates()
    # normal_rows is already ordered best-first, so the slice keeps the top ten.
    top_rows = [row for row in normal_rows if row["template"] not in stress_templates][:10]
    return "\n\n".join([
        "## Results Snapshot",
        (
            "Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; "
            "rows below average over the measured persona pairs for each template."
        ),
        "Top reusable templates:",
        _table(top_rows),
    ])
def _engineered_prefixes() -> str:
    """Render every engineered persona pair as a labelled ``text`` code fence.

    Each pair from ``ENGINEERED_PAIRS`` becomes its backticked id followed by
    a fenced block listing the positive and negative persona text. Pairs are
    separated by a blank line.
    """
    pairs = _read_jsonl(ENGINEERED_PAIRS)
    rendered = [
        "\n".join((
            f"`{pair['id']}`:",
            "",
            "```text",
            f"positive: {pair['pos_persona']}",
            "",
            f"negative: {pair['neg_persona']}",
            "```",
        ))
        for pair in pairs
    ]
    return "\n\n".join(rendered)
def _appendix_block() -> str:
    """Render the README "Appendix: Baselines And Stress Tests" section.

    The appendix combines:

    * the engineered baseline (per-template mean table, per-pair table and
      the raw positive/negative prefixes),
    * the out-of-context stress templates taken from the pilot stats
      (per-template mean table and per-pair table),
    * the control baseline (per-template mean table).

    Returns:
        The appendix as markdown, sections separated by blank lines.
    """
    normal_pair_rows = [{**row, "score": _score(row)} for row in _read_jsonl(NORMAL_STATS)]
    stress_templates = _stress_templates()
    stress_mean_rows = [
        row for row in _mean_by_template(normal_pair_rows)
        if row["template"] in stress_templates
    ]
    stress_detail_rows = sorted(
        [row for row in normal_pair_rows if row["template"] in stress_templates],
        key=lambda row: row["score"],
        reverse=True,
    )
    engineered_pair_rows = sorted(
        ({**row, "score": _score(row)} for row in _read_jsonl(ENGINEERED_STATS)),
        key=lambda row: row["score"],
        reverse=True,
    )
    engineered_mean = _mean_by_template(engineered_pair_rows)
    control_rows = _mean_by_template(_read_jsonl(CONTROL_STATS))
    return "\n\n".join([
        "## Appendix: Baselines And Stress Tests",
        (
            "The engineered baseline is not a reusable template. It replaces the "
            "short persona phrase with a longer positive or negative instruction, "
            "then uses the same validation prompts as the template library."
        ),
        "Engineered baseline, comparable mean over both measured axes:",
        _table(engineered_mean),
        "Engineered baseline, per axis:",
        _detail_table(engineered_pair_rows),
        "Engineered prefixes:",
        _engineered_prefixes(),
        (
            "The out-of-context stress strings mostly failed by not moving the "
            "intended axis. In the audited low-scoring rows they did not produce "
            "universal refusals; the paired answers usually made the same decision "
            "in similar language."
        ),
        "Out-of-context stress templates, comparable mean over both measured axes:",
        _table(stress_mean_rows),
        "Out-of-context stress templates, per axis:",
        _detail_table(stress_detail_rows),
        # The stress templates are fully covered by the two tables above, so
        # controls are the final section.
        "Controls:",
        _table(control_rows),
    ])
@@ -110,6 +179,17 @@ def replace_block(readme: str, block: str) -> str:
return f"{before}{START}\n{block}\n{END}{after}"
def replace_appendix(readme: str, block: str) -> str:
    """Insert or refresh the generated baselines appendix in README text.

    When the appendix markers are already present, the first marked region
    (markers included) is replaced. Otherwise the wrapped block is inserted
    immediately before the first heading that starts with
    ``## Appendix: Run``.

    Args:
        readme: Full README text.
        block: Markdown to place between the appendix markers.

    Returns:
        The README text with the wrapped appendix in place.

    Raises:
        ValueError: If the start marker has no end marker after it, or if
            neither the markers nor the anchor heading can be found.
    """
    wrapped = f"{APPENDIX_START}\n{block}\n{APPENDIX_END}\n\n"
    if APPENDIX_START in readme:
        # partition() splits on the first occurrence only, so a repeated
        # marker can no longer break tuple unpacking.
        before, _, rest = readme.partition(APPENDIX_START)
        _stale, end_found, after = rest.partition(APPENDIX_END)
        if not end_found:
            raise ValueError(
                f"README contains {APPENDIX_START!r} without a following {APPENDIX_END!r}"
            )
        return f"{before}{wrapped}{after.lstrip()}"
    marker = "\n## Appendix: Run"
    before, marker_found, after = readme.partition(marker)
    if not marker_found:
        raise ValueError(
            f"README has neither {APPENDIX_START!r} nor a heading starting with "
            f"{marker.strip()!r}; cannot place the appendix"
        )
    return f"{before}\n\n{wrapped}{marker}{after}"
def main() -> None:
ap = argparse.ArgumentParser()
ap.add_argument("--readme", type=Path, default=README)
@@ -117,6 +197,7 @@ def main() -> None:
readme = args.readme.read_text()
updated = replace_block(readme, _results_block())
updated = replace_appendix(updated, _appendix_block())
args.readme.write_text(updated)
print(args.readme)
+90 -8
View File
@@ -796,9 +796,17 @@ Response B:
Answer the pairwise comparison in both directions only.
Use a 1.0 to 5.0 scale in 0.5 increments:
1.0 = definitely no, 2.0 = probably no, 3.0 = tie/unclear, 4.0 = probably yes,
5.0 = definitely yes.
For each comparison, give a 1.0 to 5.0 rating:
1.0 = A is clearly not more target-like than B
2.0 = A is weakly less target-like than B
3.0 = A and B are indistinguishable or exactly tied on the target
4.0 = A is weakly more target-like than B
5.0 = A is clearly more target-like than B
Use one decimal place when the evidence is between anchors. Prefer decimals
such as 2.4, 2.7, 3.2, or 3.6 when there is any detectable difference. Use
exactly 3.0 only when the responses are genuinely indistinguishable on the
target behavior. Use exactly 1.0 or 5.0 only for extreme, unambiguous cases.
Output strict JSON only:
{{
@@ -974,15 +982,15 @@ def _style_delta(style: dict, dim: str, pos_label: str) -> float:
def _validate_axis_obj(obj: dict) -> None:
    """Validate both directional ratings of an axis judgment.

    Each of ``A_more_target_than_B`` and ``B_more_target_than_A`` is checked
    once via ``_bounded_score`` against the 1.0-5.0 scale with ``step=0.1``,
    matching the rubric's one-decimal ratings. Any error raised by
    ``_bounded_score`` propagates to the caller.
    """
    for key in ("A_more_target_than_B", "B_more_target_than_A"):
        _bounded_score(obj, key, 1.0, 5.0, step=0.1)
def _pairwise_expected(obj: dict, pos_label: str) -> float:
    """Positive means the pos response beats the neg response on this target.

    Args:
        obj: Axis judgment holding the two directional ratings.
        pos_label: ``"A"`` or ``"B"``, the blind label carried by the pos
            response.

    Returns:
        The rating for "pos is more target-like than neg", shifted so the
        rubric's tie anchor (3.0) maps to 0.0.

    Raises:
        ValueError: If ``pos_label`` is neither ``"A"`` nor ``"B"``.
    """
    if pos_label == "A":
        return _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1) - 3.0
    if pos_label == "B":
        return _bounded_score(obj, "B_more_target_than_A", 1.0, 5.0, step=0.1) - 3.0
    raise ValueError(pos_label)
@@ -1081,6 +1089,80 @@ async def _evaluate_one(
pos_label, neg_label, order = _labels_for(seed, axis.id, template, str(row_i), scenario)
a_text, b_text = _response_by_label(pos_label, pos_text, neg_text)
if pos_text == neg_text:
axis_judges = [
{
"judge_model": axis_judge_model,
"positive_axis_judgment": {
"A_more_target_than_B": 3.0,
"B_more_target_than_A": 3.0,
"target_reason": "responses are identical",
},
"negative_axis_judgment": {
"A_more_target_than_B": 3.0,
"B_more_target_than_A": 3.0,
"target_reason": "responses are identical",
},
"pairwise_positive_delta": 0.0,
"pairwise_negative_delta": 0.0,
"axis_delta": 0.0,
}
for axis_judge_model in axis_judge_models
]
style_j = {
**{f"{dim}_A": 1.0 for dim in STYLE_DIMS},
**{f"{dim}_B": 1.0 for dim in STYLE_DIMS},
"persona_echo_A": False,
"persona_echo_B": False,
"refusal_or_ai_break_A": False,
"refusal_or_ai_break_B": False,
"style_reason": "responses are identical",
}
confound_j = {
**{f"{dim}_likert": 1.0 for dim in OFF_AXIS_DIMS},
"off_axis_problem_likert": 1.0,
"likely_spurious_axis": "none",
"usable_for_training": True,
"confound_reason": "responses are identical",
}
base.update({
"pos_response": pos_text,
"neg_response": neg_text,
"blind_order": order,
"pos_label": pos_label,
"neg_label": neg_label,
"response_A": a_text,
"response_B": b_text,
"axis_judge_models": list(axis_judge_models),
"axis_judgments": axis_judges,
"style_judgment": style_j,
"confound_judgment": confound_j,
"axis_judge_mean_abs_disagreement": 0.0,
"axis_delta_judge_mean": 0.0,
"axis_delta_judge_std": 0.0,
"positive_delta": 0.0,
"negative_delta": 0.0,
"pairwise_positive_delta": 0.0,
"pairwise_negative_delta": 0.0,
"axis_delta": 0.0,
"on_axis_frac": 0.0,
"word_pos": len(_words(pos_text)),
"word_neg": len(_words(neg_text)),
"word_delta_frac": 0.0,
"length_gate_enabled": max_word_delta_frac > 0,
"length_ok": True,
"style_deltas_pos_minus_neg": {dim: 0.0 for dim in STYLE_DIMS},
"max_style_abs_delta": 0.0,
"off_axis_category_likerts": {dim: 1.0 for dim in OFF_AXIS_DIMS},
"max_off_axis_category_likert": 1.0,
"off_axis_problem_frac": 0.0,
"persona_echo": False,
"refusal_or_ai_break": False,
"strict_pass": False,
"identity_pair": True,
})
return base
axis_tasks = []
for axis_judge_model in axis_judge_models:
axis_tasks.extend([
@@ -1090,7 +1172,7 @@ async def _evaluate_one(
axis, scenario, a_text, b_text, pole="positive")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_pos_v5_{_model_name(axis_judge_model).replace('/', '_')}",
cache_tag=f"judge_axis_pos_v6_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_mode=True,
),
@@ -1100,7 +1182,7 @@ async def _evaluate_one(
axis, scenario, a_text, b_text, pole="negative")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_neg_v5_{_model_name(axis_judge_model).replace('/', '_')}",
cache_tag=f"judge_axis_neg_v6_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_mode=True,
),