diff --git a/README.md b/README.md index 019073e..6526cd6 100644 --- a/README.md +++ b/README.md @@ -58,20 +58,29 @@ kept as audit columns rather than folded into the headline score. ## Confounds Audited The judge audits length, generic helpfulness, harmlessness/refusal, -honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, -praise/flattery, sycophancy, formality, language shift, incoherence/repetition/ -rambling, persona echo, and generic off-axis helpfulness. +honesty/truthfulness, thoughtfulness/reasoning depth, task-context shift +(code/chat/math/think), coding style, multilingual behavior, confidence, +hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, +chattiness, formality, language shift, +incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. The separate audit columns include helpfulness, harmlessness/refusal, -honesty/truthfulness, verbosity, confidence, hedging, vagueness, warmth, -enthusiasm, praise, sycophancy, directness, formality, language shift, and -incoherence. +honesty/truthfulness, thoughtfulness/reasoning, task-context shift, coding +style, multilinguality, verbosity, chattiness, confidence, hedging, vagueness, +warmth, enthusiasm, praise, sycophancy, directness, formality, language shift, +and incoherence. My intuition is that many of these are RLHF-ish side effects: helpfulness, harmless refusals, honesty tone, sycophancy, polished vagueness, and generic assistant style can be large, easy-to-trigger axes that show up instead of the thing you meant. - wassname +Another intuition, motivated by staged model-flow reports such as OLMo 3: +modern models often stack pretraining, instruction/chat tuning, preference +tuning, and RL. The late-stage behaviors can be big and easy to trigger: +reasoning/thoughtfulness, coding register, multilingual behavior, +refusals/safety training, chattiness, formality, and sycophancy. - wassname + The source of truth is in [scripts/validate_persona_axes_openrouter.py](scripts/validate_persona_axes_openrouter.py#L474). @@ -90,6 +99,7 @@ This library samples from or was shaped by: - Assistant Axis: https://github.com/safety-research/assistant-axis - weight-steering: https://github.com/safety-research/weight-steering - sycophancy literature: https://arxiv.org/abs/2310.13548 +- OLMo 3 report: https://arxiv.org/abs/2512.13961 - wassname/w2schar-mini: https://github.com/wassname/w2schar-mini ## Appendix: Run @@ -119,4 +129,11 @@ uv run python scripts/build_hf_dataset.py \ year = {2026}, url = {https://github.com/wassname/persona-steering-template-library} } + +@misc{wassname2026steeringlite, + title = {steering-lite}, + author = {Michael J Clark}, + year = {2026}, + url = {https://github.com/wassname/steering-lite} +} ``` diff --git a/docs/guide.md b/docs/guide.md index 2fd82c4..e2a7330 100644 --- a/docs/guide.md +++ b/docs/guide.md @@ -40,20 +40,29 @@ than part of the headline score. ## Confounds Audited The judge audits length, generic helpfulness, harmlessness/refusal, -honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, -praise/flattery, sycophancy, formality, language shift, incoherence/repetition/ -rambling, persona echo, and generic off-axis helpfulness. +honesty/truthfulness, thoughtfulness/reasoning depth, task-context shift +(code/chat/math/think), coding style, multilingual behavior, confidence, +hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, +chattiness, formality, language shift, incoherence/repetition/rambling, persona +echo, and generic off-axis helpfulness. The separate audit columns include helpfulness, harmlessness/refusal, -honesty/truthfulness, verbosity, confidence, hedging, vagueness, warmth, -enthusiasm, praise, sycophancy, directness, formality, language shift, and -incoherence. +honesty/truthfulness, thoughtfulness/reasoning, task-context shift, coding +style, multilinguality, verbosity, chattiness, confidence, hedging, vagueness, +warmth, enthusiasm, praise, sycophancy, directness, formality, language shift, +and incoherence. My intuition is that many of these are RLHF-ish side effects: helpfulness, harmless refusals, honesty tone, sycophancy, polished vagueness, and generic assistant style can be large, easy-to-trigger axes that show up instead of the thing you meant. - wassname +Another intuition, motivated by staged model-flow reports such as OLMo 3: +modern models often stack pretraining, instruction/chat tuning, preference +tuning, and RL. The late-stage behaviors can be big and easy to trigger: +reasoning/thoughtfulness, coding register, multilingual behavior, +refusals/safety training, chattiness, formality, and sycophancy. - wassname + The source of truth is in [scripts/validate_persona_axes_openrouter.py](../scripts/validate_persona_axes_openrouter.py#L474). @@ -83,4 +92,5 @@ This library samples from or was shaped by: - Assistant Axis: https://github.com/safety-research/assistant-axis - weight-steering: https://github.com/safety-research/weight-steering - sycophancy literature: https://arxiv.org/abs/2310.13548 +- OLMo 3 report: https://arxiv.org/abs/2512.13961 - wassname/w2schar-mini: https://github.com/wassname/w2schar-mini diff --git a/scripts/build_hf_dataset.py b/scripts/build_hf_dataset.py index f38785a..5ac9d70 100644 --- a/scripts/build_hf_dataset.py +++ b/scripts/build_hf_dataset.py @@ -388,10 +388,12 @@ Low score can mean either no intended-axis movement or too much confounding. Rea ## Confounds Audited -The judge audits length, generic helpfulness, harmlessness/refusal, honesty/truthfulness, confidence, hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, formality, language shift, incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. +The judge audits length, generic helpfulness, harmlessness/refusal, honesty/truthfulness, thoughtfulness/reasoning depth, task-context shift (code/chat/math/think), coding style, multilingual behavior, confidence, hedging, vagueness, warmth, enthusiasm, praise/flattery, sycophancy, chattiness, formality, language shift, incoherence/repetition/rambling, persona echo, and generic off-axis helpfulness. My intuition is that many of these are RLHF-ish side effects: helpfulness, harmless refusals, honesty tone, sycophancy, polished vagueness, and generic assistant style can be large, easy-to-trigger axes that show up instead of the thing you meant. - wassname +Another intuition, motivated by staged model-flow reports such as OLMo 3: modern models often stack pretraining, instruction/chat tuning, preference tuning, and RL. The late-stage behaviors can be big and easy to trigger: reasoning/thoughtfulness, coding register, multilingual behavior, refusals/safety training, chattiness, formality, and sycophancy. - wassname + ## Tables 1. `main`: one row per measured template/persona-pair cell. @@ -407,7 +409,26 @@ This library samples from or was shaped by: - Assistant Axis: https://github.com/safety-research/assistant-axis - weight-steering: https://github.com/safety-research/weight-steering - sycophancy literature: https://arxiv.org/abs/2310.13548 +- OLMo 3 report: https://arxiv.org/abs/2512.13961 - wassname/w2schar-mini: https://github.com/wassname/w2schar-mini + +## Citation + +```bibtex +@misc{{wassname_persona_steering_template_library_2026, + title = {{Persona Steering Template Library}}, + author = {{Wassname}}, + year = {{2026}}, + url = {{https://github.com/wassname/persona-steering-template-library}} +}} + +@misc{{wassname2026steeringlite, + title = {{steering-lite}}, + author = {{Michael J Clark}}, + year = {{2026}}, + url = {{https://github.com/wassname/steering-lite}} +}} +``` """ diff --git a/scripts/validate_persona_axes_openrouter.py b/scripts/validate_persona_axes_openrouter.py index f234f6e..61aa790 100644 --- a/scripts/validate_persona_axes_openrouter.py +++ b/scripts/validate_persona_axes_openrouter.py @@ -475,7 +475,12 @@ STYLE_DIM_DESCRIPTIONS = { "helpfulness": "generic helpfulness or assistant helpful tone", "harmlessness_refusal": "safety refusal, avoidance, or harmlessness framing", "honesty_truthfulness": "truthfulness, correction, or epistemic honesty", + "thoughtfulness_reasoning": "deliberate reasoning, step-by-step thoughtfulness, or reflective depth", + "task_context_shift": "different task mode or domain, such as code, chat, math, or think-mode", + "coding_style": "programming/code-like structure, implementation detail, or software-engineering register", + "multilinguality": "non-English language use, translation-like behavior, or multilingual/code-switching", "verbosity": "longer/more elaborated", + "chattiness": "chatty assistant tone, conversational filler, or over-engagement", "confidence": "certainty/assertiveness", "hedging": "caveats, uncertainty, maybe/likely language", "vagueness": "generic, underspecified, or avoids concrete commitments", @@ -496,6 +501,10 @@ OFF_AXIS_CONFOUNDS = ( "helpfulness", "harmlessness/refusal", "honesty/truthfulness", + "thoughtfulness/reasoning depth", + "task context shift, such as code/chat/math/think", + "coding ability or coding style", + "multilingual behavior", "confidence", "hedging", "vagueness", @@ -503,6 +512,7 @@ OFF_AXIS_CONFOUNDS = ( "enthusiasm", "praise/flattery", "sycophancy", + "chattiness", "formality", "language shift", "incoherence/repetition/rambling",