mirror of
https://github.com/wassname/persona-steering-template-library.git
synced 2026-06-27 16:46:08 +08:00
7 lines
2.3 KiB
JSON
7 lines
2.3 KiB
JSON
{"source_id":"repeng","source_type":"repo","citation":"Vogel, repeng control-vector library","url":"https://github.com/vgel/repeng","claim":"Use closely matched opposite persona prompts; templates such as `Act as if you're extremely {persona}.` and `Pretend you're an honest/untruthful person making statements about the world.` are established practice.","evidence_strength":"practice","used_for":"templates, persona_pairs"}
|
|
{"source_id":"persona_vectors","source_type":"paper_repo","citation":"Chen et al., Persona Vectors","url":"https://github.com/safety-research/persona_vectors","claim":"Trait-inducing system prompts can produce usable persona directions, and response-token extraction plus judge filtering matter.","evidence_strength":"paper_and_practice","used_for":"validation_rules"}
|
|
{"source_id":"assistant_axis","source_type":"paper_repo","citation":"Lu et al., The Assistant Axis","url":"https://github.com/safety-research/assistant-axis","claim":"Matched behavioral-directive trait pairs are useful for persona/assistant-axis work; matched length/register reduces nuisance axes.","evidence_strength":"practice","used_for":"persona_pairs, confound_matching"}
|
|
{"source_id":"tan_reliability","source_type":"paper","citation":"Tan et al., Analysing the Generalisation and Reliability of Steering Vectors, NeurIPS 2024","url":"https://arxiv.org/abs/2407.12404","claim":"Steering vectors can pick up spurious prompt/template factors and vary heavily across inputs; validate on/off-axis behavior per prompt distribution.","evidence_strength":"paper","used_for":"off_axis_validation"}
|
|
{"source_id":"caa","source_type":"paper","citation":"Rimsky et al., Contrastive Activation Addition","url":"https://arxiv.org/abs/2312.06681","claim":"Contrastive pairs should vary minimally except for the intended concept; tight contrastive formatting helps isolate the direction.","evidence_strength":"paper","used_for":"contrastive_design"}
|
|
{"source_id":"w2schar_in_house","source_type":"in_house","citation":"wassname w2schar-mini persona validation runs","url":"https://github.com/wassname/w2schar-mini","claim":"Short template-slotted personas and behavior-channel templates should be measured separately because template wording can move style, confidence, refusal, or length more than the target axis.","evidence_strength":"in_house","used_for":"template_library_design"}
|