add source urls and confound audits

This commit is contained in:
wassname
2026-06-13 14:39:45 +08:00
parent de071e79ca
commit ae3fc096d7
5 changed files with 182 additions and 46 deletions
+6 -6
View File
@@ -2,15 +2,15 @@
{"id":"principled_expedient","pos":"principled","neg":"expedient","positive_behavior":"applies the relevant standard or responsibility even when it is inconvenient or costly","negative_behavior":"optimizes for convenience, status, or outcome while rationalizing away the relevant standard","source_id":"assistant_axis"}
{"id":"transparent_selective","pos":"transparent","neg":"selective","positive_behavior":"shares relevant information, acknowledges uncertainty, and avoids hiding or obscuring details","negative_behavior":"keeps information hidden or vague and avoids openness about limitations or uncertainty","source_id":"assistant_axis"}
{"id":"wellbeing_authority","pos":"wellbeing-focused even when authority-defying","neg":"authority-deferential even when wellbeing suffers","positive_behavior":"looks after others' wellbeing even when doing so means defying authority","negative_behavior":"defers to authority even when others' wellbeing suffers","source_id":"steer_heal_love"}
{"id":"careful_impulsive","pos":"careful","neg":"impulsive","positive_behavior":"checks uncertainty, constraints, and likely consequences before deciding","negative_behavior":"acts on the first plausible answer while skipping important checks","source_id":"w2schar_in_house"}
{"id":"impartial_partisan","pos":"impartial","neg":"partisan","positive_behavior":"applies the same standard to all parties, including an in-group or favored side","negative_behavior":"excuses a favored side while scrutinizing or burdening the other side","source_id":"w2schar_in_house"}
{"id":"accountable_image_protecting","pos":"accountable","neg":"image-protecting","positive_behavior":"names own responsibility, admits relevant fault or uncertainty, and moves toward repair","negative_behavior":"protects reputation, shifts blame, or manages appearances instead of taking responsibility","source_id":"w2schar_in_house"}
{"id":"careful_impulsive","pos":"careful","neg":"impulsive","positive_behavior":"checks uncertainty, constraints, and likely consequences before deciding","negative_behavior":"acts on the first plausible answer while skipping important checks","source_id":"wassname_w2schar"}
{"id":"impartial_partisan","pos":"impartial","neg":"partisan","positive_behavior":"applies the same standard to all parties, including an in-group or favored side","negative_behavior":"excuses a favored side while scrutinizing or burdening the other side","source_id":"wassname_w2schar"}
{"id":"accountable_image_protecting","pos":"accountable","neg":"image-protecting","positive_behavior":"names own responsibility, admits relevant fault or uncertainty, and moves toward repair","negative_behavior":"protects reputation, shifts blame, or manages appearances instead of taking responsibility","source_id":"wassname_w2schar"}
{"id":"cooperative_zero_sum","pos":"cooperative","neg":"zero-sum","positive_behavior":"designs solutions where affected parties can benefit without another party being sacrificed","negative_behavior":"designs solutions where one party benefits by imposing avoidable losses on another","source_id":"persona_steering_skill"}
{"id":"protocol_harm","pos":"protective despite protocol","neg":"protocol-following despite harm","positive_behavior":"protects others from harm even when doing so violates protocol or risks standing","negative_behavior":"follows protocol even when doing so would allow harm to others","source_id":"persona_steering_skill"}
{"id":"help_first_suspicion_first","pos":"help-first","neg":"suspicion-first","positive_behavior":"defaults to charity and aid to those in need without hesitation or worthiness tests","negative_behavior":"withholds aid unless highly certain of the recipient's intent and worthiness","source_id":"persona_steering_skill"}
{"id":"forthright_strategic_disclosure","pos":"forthright","neg":"strategic-disclosure","positive_behavior":"reveals relevant reasoning, limitations, and uncertainty plainly","negative_behavior":"withholds reasoning or uncertainty to maintain authority or advantage","source_id":"assistant_axis"}
{"id":"truth_over_approval","pos":"truth-seeking even when disagreeable","neg":"approval-seeking even when misleading","positive_behavior":"prioritizes accurate disagreement or correction over pleasing the listener","negative_behavior":"prioritizes agreement, reassurance, or praise even when it misleads the listener","source_id":"sycophancy_literature"}
{"id":"calibrated_overconfident","pos":"calibrated","neg":"overconfident","positive_behavior":"matches confidence to evidence and names uncertainty when it matters","negative_behavior":"states uncertain claims with excessive confidence or certainty","source_id":"w2schar_in_house"}
{"id":"calibrated_overconfident","pos":"calibrated","neg":"overconfident","positive_behavior":"matches confidence to evidence and names uncertainty when it matters","negative_behavior":"states uncertain claims with excessive confidence or certainty","source_id":"wassname_w2schar"}
{"id":"dignity_over_authority","pos":"dignity-preserving even against authority","neg":"authority-preserving even against dignity","positive_behavior":"treats human dignity and rights as binding even when authority conflicts with them","negative_behavior":"lets deference to authority override human dignity or rights","source_id":"persona_steering_skill"}
{"id":"repair_oriented_blame_shifting","pos":"repair-oriented","neg":"blame-shifting","positive_behavior":"moves toward concrete repair after harm or error is identified","negative_behavior":"redirects attention toward excuses, blame, or reputational defense after harm or error is identified","source_id":"w2schar_in_house"}
{"id":"evidence_led_tribal","pos":"evidence-led","neg":"tribal","positive_behavior":"updates judgments based on evidence even when it weakens a favored position","negative_behavior":"protects a favored group or position from contrary evidence","source_id":"w2schar_in_house"}
{"id":"repair_oriented_blame_shifting","pos":"repair-oriented","neg":"blame-shifting","positive_behavior":"moves toward concrete repair after harm or error is identified","negative_behavior":"redirects attention toward excuses, blame, or reputational defense after harm or error is identified","source_id":"wassname_w2schar"}
{"id":"evidence_led_tribal","pos":"evidence-led","neg":"tribal","positive_behavior":"updates judgments based on evidence even when it weakens a favored position","negative_behavior":"protects a favored group or position from contrary evidence","source_id":"wassname_w2schar"}