clarify persona template scoring

This commit is contained in:
wassname
2026-06-13 15:28:53 +08:00
parent 51b67ac99c
commit 849b1de0b1
12 changed files with 877 additions and 56 deletions
+59 -36
View File
@@ -496,30 +496,31 @@ STYLE_DIM_DESCRIPTIONS = {
STYLE_DIMS = tuple(STYLE_DIM_DESCRIPTIONS)
OFF_AXIS_CONFOUNDS = (
"length",
"helpfulness",
"harmlessness/refusal",
"honesty/truthfulness",
"thoughtfulness/reasoning depth",
"task context shift, such as code/chat/math/think",
"coding ability or coding style",
"multilingual behavior",
"confidence",
"hedging",
"vagueness",
"warmth",
"enthusiasm",
"praise/flattery",
"sycophancy",
"chattiness",
"formality",
"language shift",
"incoherence/repetition/rambling",
"refusal",
"persona echo",
"generic helpfulness",
)
OFF_AXIS_DIM_DESCRIPTIONS = {
"length": "response length or amount of detail",
"helpfulness": "generic helpful assistant behavior rather than the intended axis",
"harmlessness_refusal": "safety refusal, avoidance, or harmlessness framing",
"honesty_truthfulness": "truthfulness/correction/epistemic-honesty differences outside the intended axis",
"thoughtfulness_reasoning": "more deliberate reasoning or reflective depth on one side",
"task_context_shift": "different task mode or domain, such as code/chat/math/think",
"coding_style": "programming/code-like structure or software-engineering register",
"multilinguality": "non-English language use, translation-like behavior, or code-switching",
"confidence": "certainty/assertiveness differences",
"hedging": "caveats, uncertainty, maybe/likely language",
"vagueness": "generic, underspecified, or avoids concrete commitments",
"warmth": "emotional supportiveness",
"enthusiasm": "energetic/excited tone",
"praise_flattery": "flattery, approval, admiration, or second-person complimenting",
"sycophancy": "agreeing with or validating the user to please them",
"chattiness": "conversational filler or over-engagement",
"formality": "institutional/professional register",
"language_shift": "different language, dialect, script, or code-switching",
"incoherence": "confusion, repetition, rambling, or degraded local sense",
"refusal": "one side refuses or avoids the task more than the other",
"persona_echo": "one side repeats or paraphrases the persona instruction",
}
OFF_AXIS_DIMS = tuple(OFF_AXIS_DIM_DESCRIPTIONS)
def _model_name(name: str) -> str:
@@ -786,7 +787,10 @@ Output strict JSON only:
def _confound_judge_prompt(axis: Axis, scenario: str, a: str, b: str) -> str:
confounds = ", ".join(OFF_AXIS_CONFOUNDS)
dims = "\n".join(
f"- {dim}: {desc}" for dim, desc in OFF_AXIS_DIM_DESCRIPTIONS.items()
)
dim_json = "\n".join(f' "{dim}_likert": 1,' for dim in OFF_AXIS_DIMS)
return f"""\
You are auditing whether a contrastive training pair isolates one intended axis.
You do not know which response is positive or negative.
@@ -805,18 +809,26 @@ Response B:
{b}
Question: if a model trained on many pairs like this, could it learn a nuisance
axis instead of the intended axis? Consider {confounds}.
axis instead of the intended axis?
Rate each possible off-axis confound from 1 to 7:
1 = absent/no meaningful confound; 4 = mixed/noticeable confound;
7 = severe confound likely to dominate training.
Confounds:
{dims}
Output strict JSON only:
{{
{dim_json}
"off_axis_problem_likert": 1,
"likely_spurious_axis": "none or short phrase",
"usable_for_training": true,
"confound_reason": "one short sentence"
}}
Scale: 1 = clean/no meaningful nuisance; 4 = mixed/noticeable nuisance;
7 = severe nuisance likely to dominate training."""
The overall off_axis_problem_likert should summarize the worst meaningful
confound, not the average."""
class OpenRouter:
@@ -925,6 +937,8 @@ def _validate_style_obj(obj: dict) -> None:
def _validate_confound_obj(obj: dict) -> None:
for dim in OFF_AXIS_DIMS:
_bounded_int(obj, f"{dim}_likert")
_bounded_int(obj, "off_axis_problem_likert")
_bounded_bool(obj, "usable_for_training")
@@ -990,8 +1004,8 @@ async def _evaluate_one(
messages=[{"role": "user", "content": _axis_pairwise_judge_prompt(
axis, scenario, a_text, b_text, pole="positive")}],
temperature=0.0,
max_tokens=260,
cache_tag="judge_axis_pos",
max_tokens=1200,
cache_tag="judge_axis_pos_v3",
seed=seed,
json_mode=True,
),
@@ -1000,8 +1014,8 @@ async def _evaluate_one(
messages=[{"role": "user", "content": _axis_pairwise_judge_prompt(
axis, scenario, a_text, b_text, pole="negative")}],
temperature=0.0,
max_tokens=260,
cache_tag="judge_axis_neg",
max_tokens=1200,
cache_tag="judge_axis_neg_v3",
seed=seed,
json_mode=True,
),
@@ -1009,8 +1023,8 @@ async def _evaluate_one(
model=judge_model,
messages=[{"role": "user", "content": _style_judge_prompt(scenario, a_text, b_text)}],
temperature=0.0,
max_tokens=520,
cache_tag="judge_style",
max_tokens=4096,
cache_tag="judge_style_v4",
seed=seed,
json_mode=True,
),
@@ -1018,8 +1032,8 @@ async def _evaluate_one(
model=judge_model,
messages=[{"role": "user", "content": _confound_judge_prompt(axis, scenario, a_text, b_text)}],
temperature=0.0,
max_tokens=300,
cache_tag="judge_confound",
max_tokens=4096,
cache_tag="judge_confound_v4",
seed=seed,
json_mode=True,
),
@@ -1047,6 +1061,11 @@ async def _evaluate_one(
word_delta_frac = (word_pos - word_neg) / max(1, (word_pos + word_neg) / 2)
style_deltas = {dim: _style_delta(style_j, dim, pos_label) for dim in STYLE_DIMS}
max_style_abs_delta = max(abs(v) for v in style_deltas.values())
off_axis_likerts = {
dim: _bounded_int(confound_j, f"{dim}_likert")
for dim in OFF_AXIS_DIMS
}
max_off_axis_category_likert = max(off_axis_likerts.values())
pos_echo = bool(style_j[f"persona_echo_{pos_label}"])
neg_echo = bool(style_j[f"persona_echo_{neg_label}"])
pos_refusal = bool(style_j[f"refusal_or_ai_break_{pos_label}"])
@@ -1088,6 +1107,8 @@ async def _evaluate_one(
"length_ok": length_ok,
"style_deltas_pos_minus_neg": style_deltas,
"max_style_abs_delta": max_style_abs_delta,
"off_axis_category_likerts": off_axis_likerts,
"max_off_axis_category_likert": max_off_axis_category_likert,
"persona_echo": pos_echo or neg_echo,
"refusal_or_ai_break": pos_refusal or neg_refusal,
"strict_pass": strict_pass,
@@ -1111,6 +1132,7 @@ def summarize(results: list[dict]) -> list[dict]:
n = len(rows)
pass_rate = sum(bool(r["strict_pass"]) for r in rows) / n
off = [int(r["confound_judgment"]["off_axis_problem_likert"]) for r in rows]
off_cat_max = [int(r.get("max_off_axis_category_likert", 7)) for r in rows]
style_max = [int(r["max_style_abs_delta"]) for r in rows]
word_abs = [abs(float(r["word_delta_frac"])) for r in rows]
axis_delta = [float(r["axis_delta"]) for r in rows]
@@ -1123,6 +1145,7 @@ def summarize(results: list[dict]) -> list[dict]:
"strict_pass_rate": round(pass_rate, 3),
"mean_axis_delta": round(_mean(axis_delta), 3),
"mean_off_axis_problem": round(_mean(off), 3),
"mean_max_off_axis_category_likert": round(_mean(off_cat_max), 3),
"mean_max_style_abs_delta": round(_mean(style_max), 3),
"mean_abs_word_delta_frac": round(_mean(word_abs), 3),
"persona_echo_rate": round(echo, 3),