From ae3fc096d79eaf2c9c48a8e064af32fe473ebf5d Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:39:45 +0800 Subject: [PATCH] add source urls and confound audits --- README.md | 37 +++++++++- data/persona_pairs_v2_candidates.jsonl | 12 ++-- docs/guide.md | 37 +++++++++- scripts/build_hf_dataset.py | 78 ++++++++++++++++++--- scripts/validate_persona_axes_openrouter.py | 64 +++++++++++------ 5 files changed, 182 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index eeccd51..019073e 100644 --- a/README.md +++ b/README.md @@ -55,11 +55,42 @@ High score means the template/persona-pair cell moved the intended axis and did not look off-axis to the judge. Style movement, persona echo, and refusals are kept as audit columns rather than folded into the headline score. +## Confounds Audited + +The judge audits length, generic helpfulness, harmlessness/refusal, +honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, +praise/flattery, sycophancy, formality, language shift, +incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. + +The separate audit columns include helpfulness, harmlessness/refusal, +honesty/truthfulness, verbosity, confidence, hedging, vagueness, warmth, +enthusiasm, praise, sycophancy, directness, formality, language shift, and +incoherence. + +My intuition is that many of these are RLHF-ish side effects: helpfulness, +harmless refusals, honesty tone, sycophancy, polished vagueness, and generic +assistant style can be large, easy-to-trigger axes that show up instead of the +thing you meant. - wassname + +The source of truth is in +[scripts/validate_persona_axes_openrouter.py](scripts/validate_persona_axes_openrouter.py#L474). + ## Provenance -Sources are marked in the dataset as `source` and `source_type`. Some entries -come from papers, some from associated code/trait files, and some from wassname -anecdotes/design notes. 
+Sources are marked in the dataset as `source`, `source_type`, and `source_url`. +Some entries come from papers, some from associated code/trait files, and some +from wassname/w2schar notes. + +## Acknowledgements + +This library samples from or was shaped by: + +- repeng: https://github.com/vgel/repeng +- Persona Vectors: https://github.com/safety-research/persona_vectors +- Assistant Axis: https://github.com/safety-research/assistant-axis +- weight-steering: https://github.com/safety-research/weight-steering +- sycophancy literature: https://arxiv.org/abs/2310.13548 +- wassname/w2schar-mini: https://github.com/wassname/w2schar-mini ## Appendix: Run diff --git a/data/persona_pairs_v2_candidates.jsonl b/data/persona_pairs_v2_candidates.jsonl index 1e584c9..7591154 100644 --- a/data/persona_pairs_v2_candidates.jsonl +++ b/data/persona_pairs_v2_candidates.jsonl @@ -2,15 +2,15 @@ {"id":"principled_expedient","pos":"principled","neg":"expedient","positive_behavior":"applies the relevant standard or responsibility even when it is inconvenient or costly","negative_behavior":"optimizes for convenience, status, or outcome while rationalizing away the relevant standard","source_id":"assistant_axis"} {"id":"transparent_selective","pos":"transparent","neg":"selective","positive_behavior":"shares relevant information, acknowledges uncertainty, and avoids hiding or obscuring details","negative_behavior":"keeps information hidden or vague and avoids openness about limitations or uncertainty","source_id":"assistant_axis"} {"id":"wellbeing_authority","pos":"wellbeing-focused even when authority-defying","neg":"authority-deferential even when wellbeing suffers","positive_behavior":"looks after others' wellbeing even when doing so means defying authority","negative_behavior":"defers to authority even when others' wellbeing suffers","source_id":"steer_heal_love"} -{"id":"careful_impulsive","pos":"careful","neg":"impulsive","positive_behavior":"checks uncertainty, constraints, and 
likely consequences before deciding","negative_behavior":"acts on the first plausible answer while skipping important checks","source_id":"w2schar_in_house"} -{"id":"impartial_partisan","pos":"impartial","neg":"partisan","positive_behavior":"applies the same standard to all parties, including an in-group or favored side","negative_behavior":"excuses a favored side while scrutinizing or burdening the other side","source_id":"w2schar_in_house"} -{"id":"accountable_image_protecting","pos":"accountable","neg":"image-protecting","positive_behavior":"names own responsibility, admits relevant fault or uncertainty, and moves toward repair","negative_behavior":"protects reputation, shifts blame, or manages appearances instead of taking responsibility","source_id":"w2schar_in_house"} +{"id":"careful_impulsive","pos":"careful","neg":"impulsive","positive_behavior":"checks uncertainty, constraints, and likely consequences before deciding","negative_behavior":"acts on the first plausible answer while skipping important checks","source_id":"wassname_w2schar"} +{"id":"impartial_partisan","pos":"impartial","neg":"partisan","positive_behavior":"applies the same standard to all parties, including an in-group or favored side","negative_behavior":"excuses a favored side while scrutinizing or burdening the other side","source_id":"wassname_w2schar"} +{"id":"accountable_image_protecting","pos":"accountable","neg":"image-protecting","positive_behavior":"names own responsibility, admits relevant fault or uncertainty, and moves toward repair","negative_behavior":"protects reputation, shifts blame, or manages appearances instead of taking responsibility","source_id":"wassname_w2schar"} {"id":"cooperative_zero_sum","pos":"cooperative","neg":"zero-sum","positive_behavior":"designs solutions where affected parties can benefit without another party being sacrificed","negative_behavior":"designs solutions where one party benefits by imposing avoidable losses on 
another","source_id":"persona_steering_skill"} {"id":"protocol_harm","pos":"protective despite protocol","neg":"protocol-following despite harm","positive_behavior":"protects others from harm even when doing so violates protocol or risks standing","negative_behavior":"follows protocol even when doing so would allow harm to others","source_id":"persona_steering_skill"} {"id":"help_first_suspicion_first","pos":"help-first","neg":"suspicion-first","positive_behavior":"defaults to charity and aid to those in need without hesitation or worthiness tests","negative_behavior":"withholds aid unless highly certain of the recipient's intent and worthiness","source_id":"persona_steering_skill"} {"id":"forthright_strategic_disclosure","pos":"forthright","neg":"strategic-disclosure","positive_behavior":"reveals relevant reasoning, limitations, and uncertainty plainly","negative_behavior":"withholds reasoning or uncertainty to maintain authority or advantage","source_id":"assistant_axis"} {"id":"truth_over_approval","pos":"truth-seeking even when disagreeable","neg":"approval-seeking even when misleading","positive_behavior":"prioritizes accurate disagreement or correction over pleasing the listener","negative_behavior":"prioritizes agreement, reassurance, or praise even when it misleads the listener","source_id":"sycophancy_literature"} -{"id":"calibrated_overconfident","pos":"calibrated","neg":"overconfident","positive_behavior":"matches confidence to evidence and names uncertainty when it matters","negative_behavior":"states uncertain claims with excessive confidence or certainty","source_id":"w2schar_in_house"} +{"id":"calibrated_overconfident","pos":"calibrated","neg":"overconfident","positive_behavior":"matches confidence to evidence and names uncertainty when it matters","negative_behavior":"states uncertain claims with excessive confidence or certainty","source_id":"wassname_w2schar"} {"id":"dignity_over_authority","pos":"dignity-preserving even against 
authority","neg":"authority-preserving even against dignity","positive_behavior":"treats human dignity and rights as binding even when authority conflicts with them","negative_behavior":"lets deference to authority override human dignity or rights","source_id":"persona_steering_skill"} -{"id":"repair_oriented_blame_shifting","pos":"repair-oriented","neg":"blame-shifting","positive_behavior":"moves toward concrete repair after harm or error is identified","negative_behavior":"redirects attention toward excuses, blame, or reputational defense after harm or error is identified","source_id":"w2schar_in_house"} -{"id":"evidence_led_tribal","pos":"evidence-led","neg":"tribal","positive_behavior":"updates judgments based on evidence even when it weakens a favored position","negative_behavior":"protects a favored group or position from contrary evidence","source_id":"w2schar_in_house"} +{"id":"repair_oriented_blame_shifting","pos":"repair-oriented","neg":"blame-shifting","positive_behavior":"moves toward concrete repair after harm or error is identified","negative_behavior":"redirects attention toward excuses, blame, or reputational defense after harm or error is identified","source_id":"wassname_w2schar"} +{"id":"evidence_led_tribal","pos":"evidence-led","neg":"tribal","positive_behavior":"updates judgments based on evidence even when it weakens a favored position","negative_behavior":"protects a favored group or position from contrary evidence","source_id":"wassname_w2schar"} diff --git a/docs/guide.md b/docs/guide.md index b515ab8..2fd82c4 100644 --- a/docs/guide.md +++ b/docs/guide.md @@ -37,6 +37,26 @@ Low score can mean either "no axis movement" or "movement exists but is confounded". Style movement, persona echo, and refusals are audit columns rather than part of the headline score. 
+## Confounds Audited + +The judge audits length, generic helpfulness, harmlessness/refusal, +honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, +praise/flattery, sycophancy, formality, language shift, +incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. + +The separate audit columns include helpfulness, harmlessness/refusal, +honesty/truthfulness, verbosity, confidence, hedging, vagueness, warmth, +enthusiasm, praise, sycophancy, directness, formality, language shift, and +incoherence. + +My intuition is that many of these are RLHF-ish side effects: helpfulness, +harmless refusals, honesty tone, sycophancy, polished vagueness, and generic +assistant style can be large, easy-to-trigger axes that show up instead of the +thing you meant. - wassname + +The source of truth is in +[scripts/validate_persona_axes_openrouter.py](../scripts/validate_persona_axes_openrouter.py#L474). + ## Public Splits - `main`: one row per measured template/persona-pair cell. This is the table to open first. @@ -50,6 +70,17 @@ judge. But it is better than choosing persona templates by vibe. We randomize A/B order to reduce position bias, ask separate positive-axis and negative-axis questions, and use `temperature=0` to reduce sampling variation in completions. -Sources are marked as `source` and `source_type`. Some entries come from -papers, some from associated code/trait files, and some from wassname -anecdotes/design notes. +Sources are marked as `source`, `source_type`, and `source_url`. Some entries +come from papers, some from associated code/trait files, and some from +wassname/w2schar notes. 
+ +## Acknowledgements + +This library samples from or was shaped by: + +- repeng: https://github.com/vgel/repeng +- Persona Vectors: https://github.com/safety-research/persona_vectors +- Assistant Axis: https://github.com/safety-research/assistant-axis +- weight-steering: https://github.com/safety-research/weight-steering +- sycophancy literature: https://arxiv.org/abs/2310.13548 +- wassname/w2schar-mini: https://github.com/wassname/w2schar-mini diff --git a/scripts/build_hf_dataset.py b/scripts/build_hf_dataset.py index 8966143..f38785a 100644 --- a/scripts/build_hf_dataset.py +++ b/scripts/build_hf_dataset.py @@ -35,6 +35,45 @@ V2_PILOT_META = { SCORE_FORMULA = "100 * on_axis * off_axis_clean" +SOURCE_INFO = { + "repeng": { + "type": "code / practitioner examples", + "url": "https://github.com/vgel/repeng", + }, + "assistant_axis": { + "type": "associated code / trait files", + "url": "https://github.com/safety-research/assistant-axis", + }, + "persona_vectors": { + "type": "associated code / trait files", + "url": "https://github.com/safety-research/persona_vectors", + }, + "weight_steering": { + "type": "associated code / trait files", + "url": "https://github.com/safety-research/weight-steering", + }, + "sycophancy_literature": { + "type": "paper / literature", + "url": "https://arxiv.org/abs/2310.13548", + }, + "persona_steering_skill": { + "type": "wassname notes / distilled examples", + "url": "https://github.com/wassname/persona-steering-template-library", + }, + "steer_heal_love": { + "type": "wassname anecdote / design note", + "url": "https://github.com/wassname/w2schar-mini", + }, + "wassname_w2schar": { + "type": "wassname w2schar notes", + "url": "https://github.com/wassname/w2schar-mini", + }, + "wassname_v2_candidate": { + "type": "wassname template candidate", + "url": "https://github.com/wassname/persona-steering-template-library", + }, +} + def _jsonable(value: Any) -> Any: if isinstance(value, (dict, list)): @@ -75,7 +114,8 @@ def 
_template_rows(path: Path) -> list[dict[str, Any]]: "template_jinja": _jinja(line.strip()), "template_format": "jinja2", "source_id": "wassname_v2_candidate", - "source_type": "wassname anecdote / design note", + "source_type": _source_type("wassname_v2_candidate"), + "source_url": _source_url("wassname_v2_candidate"), } for i, line in enumerate(path.read_text().splitlines()) if line.strip() @@ -103,15 +143,11 @@ def _jinja(template: str) -> str: def _source_type(source_id: str | None) -> str: - if source_id in {"repeng"}: - return "code / practitioner examples" - if source_id in {"assistant_axis", "persona_vectors", "weight_steering"}: - return "associated code / trait files" - if source_id in {"w2schar_in_house", "steer_heal_love", "wassname_v2_candidate"}: - return "wassname anecdote / design note" - if source_id: - return "source-listed candidate" - return "wassname anecdote / design note" + return SOURCE_INFO.get(source_id or "", {}).get("type", "source-listed candidate") + + +def _source_url(source_id: str | None) -> str: + return SOURCE_INFO.get(source_id or "", {}).get("url", "") def _v2_error_counts() -> dict[tuple[str, str], int]: @@ -150,6 +186,7 @@ def _template_pair_score_rows() -> list[dict[str, Any]]: "contrast": f"{pair.get('neg', '')}->{pair.get('pos', '')}", "source": source_id, "source_type": _source_type(source_id), + "source_url": _source_url(source_id), "persona_pair": stat["persona_pair"], "positive_behavior": pair.get("positive_behavior"), "negative_behavior": pair.get("negative_behavior"), @@ -200,7 +237,8 @@ def _template_score_rows(template_pair_scores: list[dict[str, Any]]) -> list[dic "best_score": best["score"], "best_persona_pair": best["persona_pair"], "source": "wassname_v2_candidate", - "source_type": "wassname anecdote / design note", + "source_type": _source_type("wassname_v2_candidate"), + "source_url": _source_url("wassname_v2_candidate"), "raw_template": template, "template_key": _slug(template), "measurement_id": 
V2_PILOT_META["measurement_id"], @@ -268,6 +306,7 @@ def _persona_pair_review_rows(template_pair_scores: list[dict[str, Any]]) -> lis "proof_grade": proof_grade, "source": pair.get("source_id"), "source_type": _source_type(pair.get("source_id")), + "source_url": _source_url(pair.get("source_id")), "positive_persona": pair["pos"], "negative_persona": pair["neg"], "positive_behavior": pair["positive_behavior"], @@ -347,11 +386,28 @@ High score means: the template/persona-pair cell moved the intended axis and did Low score can mean either no intended-axis movement or too much confounding. Read the component columns before trusting the score. +## Confounds Audited + +The judge audits length, generic helpfulness, harmlessness/refusal, honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, formality, language shift, incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. + +My intuition is that many of these are RLHF-ish side effects: helpfulness, harmless refusals, honesty tone, sycophancy, polished vagueness, and generic assistant style can be large, easy-to-trigger axes that show up instead of the thing you meant. - wassname + ## Tables 1. `main`: one row per measured template/persona-pair cell. 2. `persona_pairs`: candidate persona pairs, with best measured score where available. 3. `examples`: paired completions and judge ratings behind the score. 
+ +## Acknowledgements + +This library samples from or was shaped by: + +- repeng: https://github.com/vgel/repeng +- Persona Vectors: https://github.com/safety-research/persona_vectors +- Assistant Axis: https://github.com/safety-research/assistant-axis +- weight-steering: https://github.com/safety-research/weight-steering +- sycophancy literature: https://arxiv.org/abs/2310.13548 +- wassname/w2schar-mini: https://github.com/wassname/w2schar-mini """ diff --git a/scripts/validate_persona_axes_openrouter.py b/scripts/validate_persona_axes_openrouter.py index 2244875..f234f6e 100644 --- a/scripts/validate_persona_axes_openrouter.py +++ b/scripts/validate_persona_axes_openrouter.py @@ -471,15 +471,44 @@ BUILTIN_SCENARIOS: dict[str, tuple[dict[str, str], ...]] = { ) } -STYLE_DIMS = ( - "verbosity", +STYLE_DIM_DESCRIPTIONS = { + "helpfulness": "generic helpfulness or assistant helpful tone", + "harmlessness_refusal": "safety refusal, avoidance, or harmlessness framing", + "honesty_truthfulness": "truthfulness, correction, or epistemic honesty", + "verbosity": "longer/more elaborated", + "confidence": "certainty/assertiveness", + "hedging": "caveats, uncertainty, maybe/likely language", + "vagueness": "generic, underspecified, or avoids concrete commitments", + "warmth": "emotional supportiveness", + "enthusiasm": "energetic/excited tone, exclamation-like affect", + "praise": "flattery, approval, admiration, second-person complimenting", + "sycophancy": "agreeing with or validating the user to please them", + "directness": "bluntness/specificity versus indirectness", + "formality": "institutional/professional register", + "language_shift": "different language, dialect, script, or code-switching", + "incoherence": "confusion, repetition, rambling, or degraded local sense", +} + +STYLE_DIMS = tuple(STYLE_DIM_DESCRIPTIONS) + +OFF_AXIS_CONFOUNDS = ( + "length", + "helpfulness", + "harmlessness/refusal", + "honesty/truthfulness", "confidence", "hedging", + "vagueness", 
"warmth", "enthusiasm", - "praise", - "directness", + "praise/flattery", + "sycophancy", "formality", + "language shift", + "incoherence/repetition/rambling", + "refusal", + "persona echo", + "generic helpfulness", ) @@ -711,6 +740,10 @@ Output strict JSON only: def _style_judge_prompt(scenario: str, a: str, b: str) -> str: + dims = "\n".join(f"- {dim}: {desc}" for dim, desc in STYLE_DIM_DESCRIPTIONS.items()) + dim_json = "\n".join( + f' "{dim}_A": 1, "{dim}_B": 1,' for dim in STYLE_DIMS + ) return f"""\ You are a fresh-eyes style auditor. Ignore whether either response gives better advice. Rate only surface style/tone. You do not know the intended axis or which @@ -729,27 +762,13 @@ For each style dimension, rate A and B from 1 to 7. Use the same standard for both responses. Dimensions: -- verbosity: longer/more elaborated -- confidence: certainty/assertiveness -- hedging: caveats, uncertainty, maybe/likely language -- warmth: emotional supportiveness -- enthusiasm: energetic/excited tone, exclamation-like affect -- praise: flattery, approval, admiration, second-person complimenting -- directness: bluntness/specificity versus indirectness -- formality: institutional/professional register +{dims} Also flag explicit persona echo, refusal, or AI-role breaks. 
Output strict JSON only: {{ - "verbosity_A": 1, "verbosity_B": 1, - "confidence_A": 1, "confidence_B": 1, - "hedging_A": 1, "hedging_B": 1, - "warmth_A": 1, "warmth_B": 1, - "enthusiasm_A": 1, "enthusiasm_B": 1, - "praise_A": 1, "praise_B": 1, - "directness_A": 1, "directness_B": 1, - "formality_A": 1, "formality_B": 1, +{dim_json} "persona_echo_A": false, "persona_echo_B": false, "refusal_or_ai_break_A": false, "refusal_or_ai_break_B": false, "style_reason": "one short sentence" @@ -757,6 +776,7 @@ Output strict JSON only: def _confound_judge_prompt(axis: Axis, scenario: str, a: str, b: str) -> str: + confounds = ", ".join(OFF_AXIS_CONFOUNDS) return f"""\ You are auditing whether a contrastive training pair isolates one intended axis. You do not know which response is positive or negative. @@ -775,9 +795,7 @@ Response B: {b} Question: if a model trained on many pairs like this, could it learn a nuisance -axis instead of the intended axis? Consider length, confidence, hedging, warmth, -enthusiasm, praise/flattery, formality, refusal, persona echo, and generic -helpfulness. +axis instead of the intended axis? Consider {confounds}. Output strict JSON only: {{