eval: rerun dual judges and refresh results

This commit is contained in:
wassname
2026-06-13 18:59:24 +08:00
parent e2546fe0ab
commit 8dbc02066b
8 changed files with 511 additions and 137 deletions
+322 -80
View File
@@ -44,6 +44,7 @@ from template_catalog import active_template_rows, load_template_catalog
ROOT = Path(__file__).resolve().parents[1]
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
JSON_RETRIES = 3
@dataclass(frozen=True)
@@ -531,6 +532,31 @@ OFF_AXIS_DIM_DESCRIPTIONS = {
OFF_AXIS_DIMS = tuple(OFF_AXIS_DIM_DESCRIPTIONS)
REFUSAL_PHRASES = (
"as an ai language model",
"as a language model",
"as an ai",
"i'm sorry, but",
"i apologize, but",
"i cannot assist",
"i can't assist",
"i cannot fulfill",
"i can't fulfill",
"i cannot provide",
"i can't provide",
"i won't provide",
"i am unable to",
"i'm unable to",
"unable to offer assistance",
"cannot provide guidance",
"cannot provide information",
"cannot support or promote",
"against my programming",
"safety guidelines",
"prioritize user safety",
"adhere to safety",
)
def _model_name(name: str) -> str:
return name.removeprefix("openrouter/")
@@ -545,10 +571,77 @@ def _json_obj(text: str) -> dict:
text = text.strip()
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.M)
match = re.search(r"\{.*\}", text, flags=re.S)
if not match:
raise ValueError(f"no JSON object in model output: {text[:300]!r}")
return json.loads(match.group(0))
return json.loads(text)
def _assert_json_text(text: str, json_schema: dict | None = None) -> None:
_json_obj(text)
def _refusal_phrase_hits(text: str) -> list[str]:
lowered = text.lower()
return [phrase for phrase in REFUSAL_PHRASES if phrase in lowered]
def _number_schema() -> dict:
return {"type": "number"}
def _boolean_schema() -> dict:
return {"type": "boolean"}
def _string_schema() -> dict:
return {"type": "string"}
def _object_schema(name: str, properties: dict[str, dict]) -> dict:
return {
"type": "json_schema",
"json_schema": {
"name": name,
"strict": True,
"schema": {
"type": "object",
"properties": properties,
"required": list(properties),
"additionalProperties": False,
},
},
}
def _axis_judge_schema() -> dict:
return _object_schema("axis_pairwise_judgment", {
"target_reason": _string_schema(),
"A_more_target_than_B": _number_schema(),
})
def _style_judge_schema() -> dict:
properties = {}
for dim in STYLE_DIMS:
properties[f"{dim}_A"] = _number_schema()
properties[f"{dim}_B"] = _number_schema()
properties.update({
"persona_echo_A": _boolean_schema(),
"persona_echo_B": _boolean_schema(),
"refusal_or_ai_break_A": _boolean_schema(),
"refusal_or_ai_break_B": _boolean_schema(),
"style_reason": _string_schema(),
})
return _object_schema("style_judgment", properties)
def _confound_judge_schema() -> dict:
properties = {f"{dim}_likert": _number_schema() for dim in OFF_AXIS_DIMS}
properties.update({
"off_axis_problem_likert": _number_schema(),
"likely_spurious_axis": _string_schema(),
"usable_for_training": _boolean_schema(),
"confound_reason": _string_schema(),
})
return _object_schema("confound_judgment", properties)
def _words(text: str) -> list[str]:
@@ -794,7 +887,8 @@ Response A:
Response B:
{b}
Answer the pairwise comparison in both directions only.
Answer one pairwise comparison only: is Response A more target-like than
Response B?
For each comparison, give a 1.0 to 5.0 rating:
1.0 = A is clearly not more target-like than B
@@ -808,19 +902,14 @@ such as 2.4, 2.7, 3.2, or 3.6 when there is any detectable difference. Use
exactly 3.0 only when the responses are genuinely indistinguishable on the
target behavior. Use exactly 1.0 or 5.0 only for extreme, unambiguous cases.
Output strict JSON only:
{{
"A_more_target_than_B": 3.0,
"B_more_target_than_A": 3.0,
"target_reason": "one short sentence"
}}"""
Return JSON matching the provided schema:
- target_reason: string, one short sentence
- A_more_target_than_B: float"""
def _style_judge_prompt(scenario: str, a: str, b: str) -> str:
dims = "\n".join(f"- {dim}: {desc}" for dim, desc in STYLE_DIM_DESCRIPTIONS.items())
dim_json = "\n".join(
f' "{dim}_A": 1.0, "{dim}_B": 1.0,' for dim in STYLE_DIMS
)
dim_fields = "\n".join(f"- {dim}_A: float; {dim}_B: float" for dim in STYLE_DIMS)
return f"""\
You are a fresh-eyes style auditor. Ignore whether either response gives better
advice. Rate only surface style/tone. You do not know the intended axis or which
@@ -843,20 +932,18 @@ Dimensions:
Also flag explicit persona echo, refusal, or AI-role breaks.
Output strict JSON only:
{{
{dim_json}
"persona_echo_A": false, "persona_echo_B": false,
"refusal_or_ai_break_A": false, "refusal_or_ai_break_B": false,
"style_reason": "one short sentence"
}}"""
Return JSON matching the provided schema:
{dim_fields}
- persona_echo_A: bool; persona_echo_B: bool
- refusal_or_ai_break_A: bool; refusal_or_ai_break_B: bool
- style_reason: string, one short sentence"""
def _confound_judge_prompt(axis: Axis, scenario: str, a: str, b: str) -> str:
dims = "\n".join(
f"- {dim}: {desc}" for dim, desc in OFF_AXIS_DIM_DESCRIPTIONS.items()
)
dim_json = "\n".join(f' "{dim}_likert": 1.0,' for dim in OFF_AXIS_DIMS)
dim_fields = "\n".join(f"- {dim}_likert: float" for dim in OFF_AXIS_DIMS)
return f"""\
You are auditing whether a contrastive training pair isolates one intended axis.
You do not know which response is positive or negative.
@@ -888,14 +975,12 @@ separately.
Confounds:
{dims}
Output strict JSON only:
{{
{dim_json}
"off_axis_problem_likert": 1.0,
"likely_spurious_axis": "none or short phrase",
"usable_for_training": true,
"confound_reason": "one short sentence"
}}
Return JSON matching the provided schema:
{dim_fields}
- off_axis_problem_likert: float
- likely_spurious_axis: string, "none" or a short phrase
- usable_for_training: bool
- confound_reason: string, one short sentence
The overall off_axis_problem_likert should summarize the worst meaningful
confound, not the average."""
@@ -924,7 +1009,7 @@ class OpenRouter:
max_tokens: int,
cache_tag: str,
seed: int,
json_mode: bool,
json_schema: dict | None,
) -> str:
payload = {
"model": _model_name(model),
@@ -939,23 +1024,51 @@ class OpenRouter:
"reasoning_effort": "none",
"include_reasoning": False,
}
if json_mode:
payload["response_format"] = {"type": "json_object"}
if json_schema is not None:
payload["response_format"] = json_schema
key = f"{cache_tag}_{_hkey({'payload': payload, 'extra_body': extra_body})}.json"
path = self.cache_dir / key
if path.exists():
return json.loads(path.read_text())["content"]
async with self.sem:
resp = await self.client.chat.completions.create(
**payload, extra_body=extra_body)
content = resp.choices[0].message.content or ""
path.write_text(json.dumps({
"created_at": time.time(),
"payload": payload,
"extra_body": extra_body,
"content": content,
}, indent=2))
return content
content = json.loads(path.read_text())["content"]
if json_schema is None:
return content
try:
_assert_json_text(content, json_schema)
return content
except (json.JSONDecodeError, ValueError):
bad_path = path.with_suffix(f".bad-{int(time.time())}.json")
path.rename(bad_path)
logger.warning(f"quarantined malformed cached JSON judge output: {bad_path}")
attempts = JSON_RETRIES if json_schema is not None else 1
last_content = ""
last_error: Exception | None = None
for attempt in range(1, attempts + 1):
async with self.sem:
resp = await self.client.chat.completions.create(
**payload, extra_body=extra_body)
content = resp.choices[0].message.content or ""
last_content = content
if json_schema is not None:
try:
_assert_json_text(content, json_schema)
except (json.JSONDecodeError, ValueError) as e:
last_error = e
logger.warning(
f"malformed JSON judge output attempt {attempt}/{attempts} "
f"cache_tag={cache_tag}: {content[:160]!r}"
)
continue
path.write_text(json.dumps({
"created_at": time.time(),
"payload": payload,
"extra_body": extra_body,
"content": content,
}, indent=2))
return content
raise ValueError(
f"malformed JSON after {attempts} attempts for {cache_tag}: "
f"{last_error}; content={last_content[:500]!r}"
)
def _labels_for(seed: int, *parts: str) -> tuple[str, str, str]:
@@ -981,17 +1094,13 @@ def _style_delta(style: dict, dim: str, pos_label: str) -> float:
def _validate_axis_obj(obj: dict) -> None:
for key in ("A_more_target_than_B", "B_more_target_than_A"):
_bounded_score(obj, key, 1.0, 5.0, step=0.1)
_bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1)
def _pairwise_expected(obj: dict, pos_label: str) -> float:
def _pairwise_expected(obj: dict, first_is_positive: bool) -> float:
"""Positive means the pos response beats the neg response on this target."""
if pos_label == "A":
return _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1) - 3.0
if pos_label == "B":
return _bounded_score(obj, "B_more_target_than_A", 1.0, 5.0, step=0.1) - 3.0
raise ValueError(pos_label)
signed = _bounded_score(obj, "A_more_target_than_B", 1.0, 5.0, step=0.1) - 3.0
return signed if first_is_positive else -signed
def _validate_style_obj(obj: dict) -> None:
@@ -1057,7 +1166,7 @@ async def _evaluate_one(
max_tokens=260,
cache_tag="gen_pos",
seed=seed,
json_mode=False,
json_schema=None,
)
neg_text = pos_text
else:
@@ -1069,7 +1178,7 @@ async def _evaluate_one(
max_tokens=260,
cache_tag="gen_pos",
seed=seed,
json_mode=False,
json_schema=None,
),
router.chat_jsonish(
model=generator_model,
@@ -1078,7 +1187,7 @@ async def _evaluate_one(
max_tokens=260,
cache_tag="gen_neg",
seed=seed,
json_mode=False,
json_schema=None,
),
)
pos_text, neg_text = pos_text.strip(), neg_text.strip()
@@ -1090,19 +1199,31 @@ async def _evaluate_one(
a_text, b_text = _response_by_label(pos_label, pos_text, neg_text)
if pos_text == neg_text:
pos_refusal_phrase_hits = _refusal_phrase_hits(pos_text)
neg_refusal_phrase_hits = _refusal_phrase_hits(neg_text)
axis_judges = [
{
"judge_model": axis_judge_model,
"positive_axis_judgment": {
"positive_axis_forward_judgment": {
"A_more_target_than_B": 3.0,
"B_more_target_than_A": 3.0,
"target_reason": "responses are identical",
},
"negative_axis_judgment": {
"positive_axis_reverse_judgment": {
"A_more_target_than_B": 3.0,
"B_more_target_than_A": 3.0,
"target_reason": "responses are identical",
},
"negative_axis_forward_judgment": {
"A_more_target_than_B": 3.0,
"target_reason": "responses are identical",
},
"negative_axis_reverse_judgment": {
"A_more_target_than_B": 3.0,
"target_reason": "responses are identical",
},
"positive_forward_delta": 0.0,
"positive_reverse_delta": 0.0,
"negative_forward_delta": 0.0,
"negative_reverse_delta": 0.0,
"pairwise_positive_delta": 0.0,
"pairwise_negative_delta": 0.0,
"axis_delta": 0.0,
@@ -1156,8 +1277,10 @@ async def _evaluate_one(
"off_axis_category_likerts": {dim: 1.0 for dim in OFF_AXIS_DIMS},
"max_off_axis_category_likert": 1.0,
"off_axis_problem_frac": 0.0,
"pos_refusal_phrase_hits": pos_refusal_phrase_hits,
"neg_refusal_phrase_hits": neg_refusal_phrase_hits,
"persona_echo": False,
"refusal_or_ai_break": False,
"refusal_or_ai_break": bool(pos_refusal_phrase_hits or neg_refusal_phrase_hits),
"strict_pass": False,
"identity_pair": True,
})
@@ -1172,9 +1295,19 @@ async def _evaluate_one(
axis, scenario, a_text, b_text, pole="positive")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_pos_v6_{_model_name(axis_judge_model).replace('/', '_')}",
cache_tag=f"judge_axis_pos_fwd_v7_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_mode=True,
json_schema=_axis_judge_schema(),
),
router.chat_jsonish(
model=axis_judge_model,
messages=[{"role": "user", "content": _axis_pairwise_judge_prompt(
axis, scenario, b_text, a_text, pole="positive")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_pos_rev_v7_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_schema=_axis_judge_schema(),
),
router.chat_jsonish(
model=axis_judge_model,
@@ -1182,9 +1315,19 @@ async def _evaluate_one(
axis, scenario, a_text, b_text, pole="negative")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_neg_v6_{_model_name(axis_judge_model).replace('/', '_')}",
cache_tag=f"judge_axis_neg_fwd_v7_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_mode=True,
json_schema=_axis_judge_schema(),
),
router.chat_jsonish(
model=axis_judge_model,
messages=[{"role": "user", "content": _axis_pairwise_judge_prompt(
axis, scenario, b_text, a_text, pole="negative")}],
temperature=0.0,
max_tokens=1200,
cache_tag=f"judge_axis_neg_rev_v7_{_model_name(axis_judge_model).replace('/', '_')}",
seed=seed,
json_schema=_axis_judge_schema(),
),
])
style_raw, confound_raw, *axis_raw = await asyncio.gather(
@@ -1195,7 +1338,7 @@ async def _evaluate_one(
max_tokens=4096,
cache_tag="judge_style_v5",
seed=seed,
json_mode=True,
json_schema=_style_judge_schema(),
),
router.chat_jsonish(
model=style_judge_model,
@@ -1204,26 +1347,53 @@ async def _evaluate_one(
max_tokens=4096,
cache_tag="judge_confound_v6",
seed=seed,
json_mode=True,
json_schema=_confound_judge_schema(),
),
*axis_tasks,
)
raw_judge_outputs = {
"style": style_raw,
"confound": confound_raw,
"axis": [
{
"judge_model": axis_judge_model,
"positive_forward": axis_raw[4 * i],
"positive_reverse": axis_raw[4 * i + 1],
"negative_forward": axis_raw[4 * i + 2],
"negative_reverse": axis_raw[4 * i + 3],
}
for i, axis_judge_model in enumerate(axis_judge_models)
],
}
base["raw_judge_outputs"] = raw_judge_outputs
style_j = _json_obj(style_raw)
confound_j = _json_obj(confound_raw)
_validate_style_obj(style_j)
_validate_confound_obj(confound_j)
axis_judges = []
for i, axis_judge_model in enumerate(axis_judge_models):
pos_axis_j = _json_obj(axis_raw[2 * i])
neg_axis_j = _json_obj(axis_raw[2 * i + 1])
_validate_axis_obj(pos_axis_j)
_validate_axis_obj(neg_axis_j)
pairwise_positive_delta = _pairwise_expected(pos_axis_j, pos_label)
pairwise_negative_delta = -_pairwise_expected(neg_axis_j, pos_label)
pos_fwd_j = _json_obj(axis_raw[4 * i])
pos_rev_j = _json_obj(axis_raw[4 * i + 1])
neg_fwd_j = _json_obj(axis_raw[4 * i + 2])
neg_rev_j = _json_obj(axis_raw[4 * i + 3])
for axis_j in (pos_fwd_j, pos_rev_j, neg_fwd_j, neg_rev_j):
_validate_axis_obj(axis_j)
positive_forward_delta = _pairwise_expected(pos_fwd_j, pos_label == "A")
positive_reverse_delta = _pairwise_expected(pos_rev_j, pos_label == "B")
negative_forward_delta = -_pairwise_expected(neg_fwd_j, pos_label == "A")
negative_reverse_delta = -_pairwise_expected(neg_rev_j, pos_label == "B")
pairwise_positive_delta = (positive_forward_delta + positive_reverse_delta) / 2.0
pairwise_negative_delta = (negative_forward_delta + negative_reverse_delta) / 2.0
axis_judges.append({
"judge_model": axis_judge_model,
"positive_axis_judgment": pos_axis_j,
"negative_axis_judgment": neg_axis_j,
"positive_axis_forward_judgment": pos_fwd_j,
"positive_axis_reverse_judgment": pos_rev_j,
"negative_axis_forward_judgment": neg_fwd_j,
"negative_axis_reverse_judgment": neg_rev_j,
"positive_forward_delta": positive_forward_delta,
"positive_reverse_delta": positive_reverse_delta,
"negative_forward_delta": negative_forward_delta,
"negative_reverse_delta": negative_reverse_delta,
"pairwise_positive_delta": pairwise_positive_delta,
"pairwise_negative_delta": pairwise_negative_delta,
"axis_delta": 2.0 * (pairwise_positive_delta + pairwise_negative_delta),
@@ -1249,10 +1419,12 @@ async def _evaluate_one(
for dim in OFF_AXIS_DIMS
}
max_off_axis_category_likert = max(off_axis_likerts.values())
pos_refusal_phrase_hits = _refusal_phrase_hits(pos_text)
neg_refusal_phrase_hits = _refusal_phrase_hits(neg_text)
pos_echo = bool(style_j[f"persona_echo_{pos_label}"])
neg_echo = bool(style_j[f"persona_echo_{neg_label}"])
pos_refusal = bool(style_j[f"refusal_or_ai_break_{pos_label}"])
neg_refusal = bool(style_j[f"refusal_or_ai_break_{neg_label}"])
pos_refusal = bool(pos_refusal_phrase_hits)
neg_refusal = bool(neg_refusal_phrase_hits)
length_ok = True if max_word_delta_frac <= 0 else abs(word_delta_frac) <= max_word_delta_frac
strict_pass = (
axis_delta >= 3
@@ -1294,6 +1466,8 @@ async def _evaluate_one(
"max_off_axis_category_likert": max_off_axis_category_likert,
"off_axis_problem_frac": round(
_normalize_likert(float(confound_j["off_axis_problem_likert"]), 1.0, 7.0), 4),
"pos_refusal_phrase_hits": pos_refusal_phrase_hits,
"neg_refusal_phrase_hits": neg_refusal_phrase_hits,
"persona_echo": pos_echo or neg_echo,
"refusal_or_ai_break": pos_refusal or neg_refusal,
"strict_pass": strict_pass,
@@ -1361,6 +1535,59 @@ def summarize(results: list[dict]) -> list[dict]:
return out
def axis_score_distribution(results: list[dict]) -> list[dict]:
counts: dict[tuple[str, str, float], int] = defaultdict(int)
for r in results:
if "error" in r:
continue
for judgment in r["axis_judgments"]:
judge_model = judgment["judge_model"]
for key in (
"positive_axis_forward_judgment",
"positive_axis_reverse_judgment",
"negative_axis_forward_judgment",
"negative_axis_reverse_judgment",
):
score = _bounded_score(judgment[key], "A_more_target_than_B", 1.0, 5.0, step=0.1)
counts[(judge_model, key.removesuffix("_judgment"), score)] += 1
rows = [
{"judge_model": model, "call": call, "score": score, "n": n}
for (model, call, score), n in counts.items()
]
rows.sort(key=lambda r: (r["judge_model"], r["call"], r["score"]))
return rows
def _print_text_block(title: str, text: str) -> None:
print(f"\n--- {title} ---")
print(text)
def print_judge_audit_samples(results: list[dict]) -> None:
if not results:
return
sample_indices = [0] if len(results) == 1 else [0, len(results) - 1]
print("\n=== judge audit samples: first and last planned eval ===")
for sample_name, idx in zip(("FIRST", "LAST"), sample_indices):
rec = results[idx]
print(f"\n### {sample_name} idx={idx} eval_id={rec.get('eval_id')} error={rec.get('error')}")
_print_text_block("prompt", str(rec.get("prompt", "")))
_print_text_block("cho_pos_response", str(rec.get("pos_response", "")))
_print_text_block("rej_neg_response", str(rec.get("neg_response", "")))
_print_text_block(
"refusal_phrase_hits",
json.dumps({
"pos": rec.get("pos_refusal_phrase_hits", []),
"neg": rec.get("neg_refusal_phrase_hits", []),
"refusal_or_ai_break": rec.get("refusal_or_ai_break"),
}, indent=2),
)
_print_text_block(
"full_judge_output",
json.dumps(rec.get("raw_judge_outputs", {}), indent=2, ensure_ascii=False),
)
async def amain(args) -> None:
load_dotenv(ROOT / ".env")
axes = _select_axes(args.axes, args.include_canary)
@@ -1415,6 +1642,7 @@ async def amain(args) -> None:
"axis_judge_models": list(axis_judge_models),
"style_judge_model": args.judge_model,
"gen_temperature": args.gen_temperature,
"judge_temperature": 0.0,
"seed": args.seed,
"max_word_delta_frac": args.max_word_delta_frac,
"n_prompts": len(rows),
@@ -1454,11 +1682,13 @@ async def amain(args) -> None:
logger.info(
f"{len(rows)} prompts × {len(axes)} axes × {len(templates)} templates "
f"= {len(tasks)} pairs; generator={args.generator_model}; "
f"axis_judges={','.join(axis_judge_models)}; style_judge={args.judge_model}"
f"axis_judges={','.join(axis_judge_models)}; style_judge={args.judge_model}; "
f"gen_temperature={args.gen_temperature}; judge_temperature=0.0"
)
tasks = [asyncio.create_task(task) for task in tasks]
results = []
for fut in atqdm.as_completed(tasks, total=len(tasks), desc="persona-axes"):
rec = await fut
for task in atqdm(tasks, total=len(tasks), desc="persona-axes"):
rec = await task
results.append(rec)
artifact = {
"dry_run": False,
@@ -1467,6 +1697,7 @@ async def amain(args) -> None:
"axis_judge_models": list(axis_judge_models),
"style_judge_model": args.judge_model,
"gen_temperature": args.gen_temperature,
"judge_temperature": 0.0,
"family": args.family,
"seed": args.seed,
"max_word_delta_frac": args.max_word_delta_frac,
@@ -1477,6 +1708,7 @@ async def amain(args) -> None:
"n_success": sum("error" not in r for r in results),
"n_errors": sum("error" in r for r in results),
"summary": summarize(results),
"axis_score_distribution": axis_score_distribution(results),
"results": results,
}
out.write_text(json.dumps(artifact, indent=2))
@@ -1489,6 +1721,7 @@ async def amain(args) -> None:
"axis_judge_models": list(axis_judge_models),
"style_judge_model": args.judge_model,
"gen_temperature": args.gen_temperature,
"judge_temperature": 0.0,
"family": args.family,
"seed": args.seed,
"max_word_delta_frac": args.max_word_delta_frac,
@@ -1499,11 +1732,20 @@ async def amain(args) -> None:
"n_success": sum("error" not in r for r in results),
"n_errors": sum("error" in r for r in results),
"summary": summary,
"axis_score_distribution": axis_score_distribution(results),
"results": results,
}
out.write_text(json.dumps(artifact, indent=2))
print(f"wrote {out}")
print(tabulate(summary, headers="keys", tablefmt="pipe", floatfmt=".3f"))
print("\naxis judge raw score distribution:")
print(tabulate(
axis_score_distribution(results),
headers="keys",
tablefmt="pipe",
floatfmt=".1f",
))
print_judge_audit_samples(results)
def main() -> None: