eval: refine judge rubric and README baselines

2026-06-27 16:46:08 +08:00 · 2026-06-13 18:24:06 +08:00
parent ede354f07a
commit e2546fe0ab
4 changed files with 288 additions and 76 deletions
@@ -11,9 +11,12 @@ README = ROOT / "README.md"
 NORMAL_STATS = ROOT / "data/v2_pilot_seed24_template_pair_stats.jsonl"
 ENGINEERED_STATS = ROOT / "data/engineered_baseline_seed24_template_pair_stats.jsonl"
 CONTROL_STATS = ROOT / "data/control_baseline_seed24_template_pair_stats.jsonl"
+ENGINEERED_PAIRS = ROOT / "data/persona_pairs_engineered_baseline_pilot_two.jsonl"

 START = "<!-- results-snapshot:start -->"
 END = "<!-- results-snapshot:end -->"
+APPENDIX_START = "<!-- appendix-baselines:start -->"
+APPENDIX_END = "<!-- appendix-baselines:end -->"


 def _read_jsonl(path: Path) -> list[dict]:
@@ -52,6 +55,20 @@ def _best_by_template(rows: list[dict]) -> list[dict]:
    return sorted(best.values(), key=lambda row: row["score"], reverse=True)


+def _mean_by_template(rows: list[dict]) -> list[dict]:
+    """Average the per-cell scores of each template.
+
+    Rows are grouped by their ``"template"`` value and every row is scored
+    with ``_score``. The result holds one summary dict per template with the
+    keys ``"template"``, ``"score"`` (mean rounded to one decimal) and
+    ``"n_cells"`` (number of rows averaged), ordered from the highest to the
+    lowest mean score.
+    """
+    scores_by_template: dict[str, list] = {}
+    for cell in rows:
+        scores_by_template.setdefault(cell["template"], []).append(_score(cell))
+    summaries = [
+        {
+            "template": name,
+            "score": round(sum(cell_scores) / len(cell_scores), 1),
+            "n_cells": len(cell_scores),
+        }
+        for name, cell_scores in scores_by_template.items()
+    ]
+    # list.sort is stable, so templates with equal means keep first-seen order.
+    summaries.sort(key=lambda summary: summary["score"], reverse=True)
+    return summaries
+
+
 def _stress_templates() -> set[str]:
    out = set()
    for row in load_template_catalog(CATALOG_PATH):
@@ -67,40 +84,92 @@ def _table(rows: list[dict]) -> str:
    return "\n".join(lines)


+def _detail_table(rows: list[dict]) -> str:
+    """Render *rows* as a Markdown table with one line per template/persona pair.
+
+    Each row must provide ``"template"`` (passed through ``_markdown_text``),
+    ``"persona_pair"`` (shown as inline code) and a numeric ``"score"``
+    (formatted with one decimal, right-aligned column).
+    """
+    header = ["| template | persona_pair | score |", "|---|---|---:|"]
+    body = [
+        f"| {_markdown_text(entry['template'])} | `{entry['persona_pair']}` | {entry['score']:.1f} |"
+        for entry in rows
+    ]
+    return "\n".join(header + body)
+
+
 def _results_block() -> str:
+    """Render the "Results Snapshot" README section.
+
+    Reads NORMAL_STATS, averages the scores per template with
+    ``_mean_by_template``, drops the templates reported by
+    ``_stress_templates()`` and keeps the first ten of the remaining rows
+    (already ordered by descending mean score) for the table. The pieces
+    are joined with blank lines.
+    """
-    normal_rows = _best_by_template(_read_jsonl(NORMAL_STATS))
-    engineered_rows = sorted(
-        ({**row, "score": _score(row)} for row in _read_jsonl(ENGINEERED_STATS)),
-        key=lambda row: row["score"],
-        reverse=True,
-    )
-    control_rows = sorted(
-        ({**row, "score": _score(row)} for row in _read_jsonl(CONTROL_STATS)),
-        key=lambda row: row["score"],
-        reverse=True,
-    )
+    normal_rows = _mean_by_template(_read_jsonl(NORMAL_STATS))
    stress_templates = _stress_templates()
    top_rows = [row for row in normal_rows if row["template"] not in stress_templates][:10]
-    stress_rows = [row for row in normal_rows if row["template"] in stress_templates]
-    engineered_rows = [
-        {**row, "template": f"{row['template']} ({row['persona_pair']})"}
-        for row in engineered_rows
-    ]

    return "\n\n".join([
        "## Results Snapshot",
        (
            "Seed-24 pilot. Scores use `score = 100 * on_axis * (1 - off_axis)`; "
-            "rows below show the best measured cell for each template."
+            "rows below average over the measured persona pairs for each template."
        ),
        "Top reusable templates:",
        _table(top_rows),
-        "Engineered baseline:",
-        _table(engineered_rows),
+    ])
+
+
+def _engineered_prefixes() -> str:
+    """Render every engineered persona pair as a labelled ``text`` code fence.
+
+    Reads ENGINEERED_PAIRS and, for each row, emits the row ``id`` as inline
+    code followed by a fenced block holding the ``pos_persona`` and
+    ``neg_persona`` strings. Entries are separated by a blank line.
+    """
+    return "\n\n".join(
+        "\n".join((
+            f"`{pair['id']}`:",
+            "",
+            "```text",
+            f"positive: {pair['pos_persona']}",
+            "",
+            f"negative: {pair['neg_persona']}",
+            "```",
+        ))
+        for pair in _read_jsonl(ENGINEERED_PAIRS)
+    )
+
+
+def _appendix_block() -> str:
+    """Render the "Appendix: Baselines And Stress Tests" README section.
+
+    Combines three inputs: the per-pair rows from NORMAL_STATS restricted to
+    the templates reported by ``_stress_templates()``, the engineered
+    baseline rows from ENGINEERED_STATS, and the control rows from
+    CONTROL_STATS. Mean tables come from ``_mean_by_template``; per-pair
+    tables list the scored rows from highest to lowest score. The pieces are
+    joined with blank lines.
+    """
+    # Per-pair rows with "score" attached; _mean_by_template scores its input
+    # again with _score when it builds the per-template means.
+    normal_pair_rows = [{**row, "score": _score(row)} for row in _read_jsonl(NORMAL_STATS)]
+    stress_templates = _stress_templates()
+    stress_mean_rows = [
+        row for row in _mean_by_template(normal_pair_rows)
+        if row["template"] in stress_templates
+    ]
+    # Same stress subset, but one row per persona pair, best score first.
+    stress_detail_rows = sorted(
+        [row for row in normal_pair_rows if row["template"] in stress_templates],
+        key=lambda row: row["score"],
+        reverse=True,
+    )
+    engineered_pair_rows = sorted(
+        ({**row, "score": _score(row)} for row in _read_jsonl(ENGINEERED_STATS)),
+        key=lambda row: row["score"],
+        reverse=True,
+    )
+    engineered_mean = _mean_by_template(engineered_pair_rows)
+    control_rows = _mean_by_template(_read_jsonl(CONTROL_STATS))
+
+    return "\n\n".join([
+        "## Appendix: Baselines And Stress Tests",
+        (
+            "The engineered baseline is not a reusable template. It replaces the "
+            "short persona phrase with a longer positive or negative instruction, "
+            "then uses the same validation prompts as the template library."
+        ),
+        "Engineered baseline, comparable mean over both measured axes:",
+        _table(engineered_mean),
+        "Engineered baseline, per axis:",
+        _detail_table(engineered_pair_rows),
+        "Engineered prefixes:",
+        _engineered_prefixes(),
+        (
+            "The out-of-context stress strings mostly failed by not moving the "
+            "intended axis. In the audited low-scoring rows they did not produce "
+            "universal refusals; the paired answers usually made the same decision "
+            "in similar language."
+        ),
+        "Out-of-context stress templates, comparable mean over both measured axes:",
+        _table(stress_mean_rows),
+        "Out-of-context stress templates, per axis:",
+        _detail_table(stress_detail_rows),
        "Controls:",
        _table(control_rows),
-        "Out-of-context stress templates:",
-        _table(stress_rows),
    ])


@@ -110,6 +179,17 @@ def replace_block(readme: str, block: str) -> str:
    return f"{before}{START}\n{block}\n{END}{after}"


+def _split_exactly_once(text: str, separator: str) -> tuple[str, str]:
+    """Split *text* around the single occurrence of *separator*.
+
+    Raises:
+        ValueError: if *separator* occurs zero times or more than once. The
+            message names the separator, unlike the tuple-unpacking error a
+            bare ``before, after = text.split(separator)`` produces.
+    """
+    found = text.count(separator)
+    if found != 1:
+        raise ValueError(
+            f"expected exactly one {separator!r} in README text, found {found}"
+        )
+    before, _, after = text.partition(separator)
+    return before, after
+
+
+def replace_appendix(readme: str, block: str) -> str:
+    """Insert or refresh the generated baselines appendix in *readme*.
+
+    When the README already contains APPENDIX_START, everything between that
+    marker and APPENDIX_END is replaced by *block*. Otherwise the wrapped
+    block is inserted directly in front of the ``## Appendix: Run`` heading.
+
+    Args:
+        readme: Current README text.
+        block: Markdown to place between the appendix markers.
+
+    Returns:
+        The README text with the wrapped appendix block in place.
+
+    Raises:
+        ValueError: if a required marker or the ``## Appendix: Run`` heading
+            is missing or occurs more than once.
+    """
+    wrapped = f"{APPENDIX_START}\n{block}\n{APPENDIX_END}\n\n"
+    if APPENDIX_START in readme:
+        before, rest = _split_exactly_once(readme, APPENDIX_START)
+        _, after = _split_exactly_once(rest, APPENDIX_END)
+        return f"{before}{wrapped}{after.lstrip()}"
+    marker = "\n## Appendix: Run"
+    before, after = _split_exactly_once(readme, marker)
+    # ``wrapped`` already ends with a blank line, so the heading is re-attached
+    # without the marker's leading newline. This yields the same spacing as the
+    # refresh branch above, so running the generator twice is a no-op instead
+    # of dropping one blank line on the second run.
+    return f"{before}\n\n{wrapped}{marker.lstrip()}{after}"
+
+
 def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--readme", type=Path, default=README)
@@ -117,6 +197,7 @@ def main() -> None:

    readme = args.readme.read_text()
    updated = replace_block(readme, _results_block())
+    updated = replace_appendix(updated, _appendix_block())
    args.readme.write_text(updated)
    print(args.readme)

@@ -796,9 +796,17 @@ Response B:

 Answer the pairwise comparison in both directions only.

-Use a 1.0 to 5.0 scale in 0.5 increments:
-1.0 = definitely no, 2.0 = probably no, 3.0 = tie/unclear, 4.0 = probably yes,
-5.0 = definitely yes.
+For each comparison, give a 1.0 to 5.0 rating:
+1.0 = A is clearly not more target-like than B
+2.0 = A is weakly less target-like than B
+3.0 = A and B are indistinguishable or exactly tied on the target
+4.0 = A is weakly more target-like than B
+5.0 = A is clearly more target-like than B
+
+Use one decimal place when the evidence is between anchors. Prefer decimals
+such as 2.4, 2.7, 3.2, or 3.6 when there is any detectable difference. Use
+exactly 3.0 only when the responses are genuinely indistinguishable on the
+target behavior. Use exactly 1.0 or 5.0 only for extreme, unambiguous cases.

 Output strict JSON only:
 {{
@@ -974,15 +982,15 @@ def _style_delta(style: dict, dim: str, pos_label: str) -> float:

 def _validate_axis_obj(obj: dict) -> None:
+    """Run ``_bounded_score`` on both pairwise rating fields of *obj*.
+
+    Each field is passed with the bounds 1.0 and 5.0 and ``step=0.1``; the
+    returned values are discarded. NOTE(review): presumably ``_bounded_score``
+    raises on a missing or out-of-range value -- confirm against its
+    definition.
+    """
    for key in ("A_more_target_than_B", "B_more_target_than_A"):
-        _bounded_score(obj, key, 1.0, 5.0)
+        _bounded_score(obj, key, 1.0, 5.0, step=0.1)


 def _pairwise_expected(obj: dict, pos_label: str) -> float:
    """Positive means the pos response beats the neg response on this target."""
+    # The rating read is the one for the label that holds the positive
+    # response. Subtracting 3.0 maps a 3.0 rating to 0.0, so the result is
+    # signed around that midpoint. Any label other than "A" or "B" raises
+    # ValueError carrying the label.
    if pos_label == "A":
-        return _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0) - 3.0
+        return _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1) - 3.0
    if pos_label == "B":
-        return _bounded_score(obj, "B_more_target_than_A", 1.0, 5.0) - 3.0
+        return _bounded_score(obj, "B_more_target_than_A", 1.0, 5.0, step=0.1) - 3.0
    raise ValueError(pos_label)


@@ -1081,6 +1089,80 @@ async def _evaluate_one(
        pos_label, neg_label, order = _labels_for(seed, axis.id, template, str(row_i), scenario)
        a_text, b_text = _response_by_label(pos_label, pos_text, neg_text)

+        if pos_text == neg_text:
+            axis_judges = [
+                {
+                    "judge_model": axis_judge_model,
+                    "positive_axis_judgment": {
+                        "A_more_target_than_B": 3.0,
+                        "B_more_target_than_A": 3.0,
+                        "target_reason": "responses are identical",
+                    },
+                    "negative_axis_judgment": {
+                        "A_more_target_than_B": 3.0,
+                        "B_more_target_than_A": 3.0,
+                        "target_reason": "responses are identical",
+                    },
+                    "pairwise_positive_delta": 0.0,
+                    "pairwise_negative_delta": 0.0,
+                    "axis_delta": 0.0,
+                }
+                for axis_judge_model in axis_judge_models
+            ]
+            style_j = {
+                **{f"{dim}_A": 1.0 for dim in STYLE_DIMS},
+                **{f"{dim}_B": 1.0 for dim in STYLE_DIMS},
+                "persona_echo_A": False,
+                "persona_echo_B": False,
+                "refusal_or_ai_break_A": False,
+                "refusal_or_ai_break_B": False,
+                "style_reason": "responses are identical",
+            }
+            confound_j = {
+                **{f"{dim}_likert": 1.0 for dim in OFF_AXIS_DIMS},
+                "off_axis_problem_likert": 1.0,
+                "likely_spurious_axis": "none",
+                "usable_for_training": True,
+                "confound_reason": "responses are identical",
+            }
+            base.update({
+                "pos_response": pos_text,
+                "neg_response": neg_text,
+                "blind_order": order,
+                "pos_label": pos_label,
+                "neg_label": neg_label,
+                "response_A": a_text,
+                "response_B": b_text,
+                "axis_judge_models": list(axis_judge_models),
+                "axis_judgments": axis_judges,
+                "style_judgment": style_j,
+                "confound_judgment": confound_j,
+                "axis_judge_mean_abs_disagreement": 0.0,
+                "axis_delta_judge_mean": 0.0,
+                "axis_delta_judge_std": 0.0,
+                "positive_delta": 0.0,
+                "negative_delta": 0.0,
+                "pairwise_positive_delta": 0.0,
+                "pairwise_negative_delta": 0.0,
+                "axis_delta": 0.0,
+                "on_axis_frac": 0.0,
+                "word_pos": len(_words(pos_text)),
+                "word_neg": len(_words(neg_text)),
+                "word_delta_frac": 0.0,
+                "length_gate_enabled": max_word_delta_frac > 0,
+                "length_ok": True,
+                "style_deltas_pos_minus_neg": {dim: 0.0 for dim in STYLE_DIMS},
+                "max_style_abs_delta": 0.0,
+                "off_axis_category_likerts": {dim: 1.0 for dim in OFF_AXIS_DIMS},
+                "max_off_axis_category_likert": 1.0,
+                "off_axis_problem_frac": 0.0,
+                "persona_echo": False,
+                "refusal_or_ai_break": False,
+                "strict_pass": False,
+                "identity_pair": True,
+            })
+            return base
+
        axis_tasks = []
        for axis_judge_model in axis_judge_models:
            axis_tasks.extend([
@@ -1090,7 +1172,7 @@ async def _evaluate_one(
                        axis, scenario, a_text, b_text, pole="positive")}],
                    temperature=0.0,
                    max_tokens=1200,
-                    cache_tag=f"judge_axis_pos_v5_{_model_name(axis_judge_model).replace('/', '_')}",
+                    cache_tag=f"judge_axis_pos_v6_{_model_name(axis_judge_model).replace('/', '_')}",
                    seed=seed,
                    json_mode=True,
                ),
@@ -1100,7 +1182,7 @@ async def _evaluate_one(
                        axis, scenario, a_text, b_text, pole="negative")}],
                    temperature=0.0,
                    max_tokens=1200,
-                    cache_tag=f"judge_axis_neg_v5_{_model_name(axis_judge_model).replace('/', '_')}",
+                    cache_tag=f"judge_axis_neg_v6_{_model_name(axis_judge_model).replace('/', '_')}",
                    seed=seed,
                    json_mode=True,
                ),