judging CoT trace

This commit is contained in:
wassname
2025-08-21 21:17:43 +08:00
parent 911a4c31e0
commit 7d1a6824c2
8 changed files with 3669 additions and 1786 deletions
+5
View File
@@ -8,3 +8,8 @@ Difference from previous work
Links:
- https://github.com/peterkirgis/llm-moral-foundations
TODO
- [ ] add amoral reasoning steering https://huggingface.co/soob3123/amoral-qwen3-14B
+46 -32
View File
@@ -42,44 +42,58 @@ def find_last_non_whitespace_token(tokenizer, tokens):
return t
return t
def make_dataset(tokenizer, personas, suffixes, max_suffix_length=10, verbose=False):
    """Build contrastive (positive, negative) prompt pairs for steering.

    For every suffix a persona pair is drawn at random, the suffix is
    tokenized and capped at ``max_suffix_length`` tokens, and several
    progressively longer prefixes of it are used as the start of the
    assistant turn. Each prefix is emitted twice: once as-is and once
    preceded by ``"<think>\\n"``.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_string``
            and ``apply_chat_template`` (presumably a Hugging Face tokenizer
            with a chat template -- confirm against the caller).
        personas: Sequence of ``(positive_persona, negative_persona)`` pairs.
        suffixes: Iterable of strings used as partial assistant completions.
        max_suffix_length: Maximum number of tokens kept from each suffix.
        verbose: When true, log every rendered positive prompt.

    Returns:
        A list of ``DatasetEntry`` objects, one per (prefix, think-variant)
        combination.

    Note:
        Prefix lengths come from ``range(1, len(tokens), stride)``, so the
        full capped suffix is never emitted and a suffix that tokenizes to a
        single token contributes no entries.
    """
    dataset = []
    for suffix in suffixes:
        # Each suffix gets its own randomly drawn persona pair.
        r = torch.randint(0, len(personas), (1,)).item()
        positive_persona, negative_persona = personas[r]

        tokens = tokenizer.tokenize(suffix, add_special_tokens=False)[:max_suffix_length]

        # Stride over prefix lengths to keep the dataset small.
        stride = max(1, len(tokens) // 5)
        for i in range(1, len(tokens), stride):
            # Fix: slice with ``i``. Detokenizing the whole ``tokens`` list
            # made every iteration emit the same, untruncated text.
            prefix = tokenizer.convert_tokens_to_string(tokens[:i])

            for think in (False, True):
                truncated = "<think>\n" + prefix if think else prefix

                # Alternative user prompts that were considered:
                #   "Please talk about {persona}."
                #   "Pretend you're an {persona} person making statements about the world."
                #   "Act as if you're extremely {persona}."
                # continue_final_message=True leaves the assistant turn open
                # so the model continues from ``truncated``.
                positive_prompt = tokenizer.apply_chat_template(
                    [{'role': 'user', 'content': f"You're a {positive_persona}."},
                     {'role': 'assistant', 'content': truncated}],
                    tokenize=False,
                    continue_final_message=True,
                )
                negative_prompt = tokenizer.apply_chat_template(
                    [{'role': 'user', 'content': f"You're a {negative_persona}."},
                     {'role': 'assistant', 'content': truncated}],
                    tokenize=False,
                    continue_final_message=True,
                )
                if verbose:
                    logger.info(f"Detokenized: {positive_prompt}")

                dataset.append(
                    DatasetEntry(
                        positive=positive_prompt,
                        negative=negative_prompt,
                    )
                )
    return dataset
def load_steering_ds(tokenizer, ds_name="scenario_engagement_dataset", verbose=False):
    """Load a steering scenario file and build its contrastive dataset.

    Reads ``data/steering/{ds_name}.json`` under ``project_dir`` and passes
    its ``"personas"`` and ``"suffixes"`` entries to ``make_dataset``.

    Args:
        tokenizer: Tokenizer forwarded unchanged to ``make_dataset``.
        ds_name: Base name (without extension) of the JSON file to load.
        verbose: Forwarded to ``make_dataset``.

    Returns:
        Whatever ``make_dataset`` returns for the loaded personas/suffixes.

    Raises:
        OSError: If the JSON file cannot be opened.
        KeyError: If the file lacks a ``"suffixes"`` or ``"personas"`` key.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # text encoding.
    with open(project_dir / f"data/steering/{ds_name}.json", encoding="utf-8") as f:
        scenario_data = json.load(f)

    suffixes = scenario_data["suffixes"]
    personas = scenario_data["personas"]
    return make_dataset(tokenizer, personas, suffixes, verbose=verbose)
+8
View File
@@ -28,3 +28,11 @@ maybe I should try reversing the prompts:
- you see your neighbour do it, what does your other neighbour think?
- how likely are you to do this?
- you feel X, how likely is it that you just saw Y
# 2025-08-21 19:44:18
I was using ranked logprobs for judging, but after experimenting with judgembench, just using weighted or argmax is better.
I'm now trying individual annotated trains of thought to debug.
I've found a few bugs
+543 -144
View File
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+94
View File
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "582693d1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"from llm_moral_foundations2.steering import make_dataset\n",
"from llm_moral_foundations2.load_model import load_model, load_tokenizer\n",
"\n",
"f = '../data/repeng_trunacted_outputs.json'\n",
"with open(f, 'r') as f:\n",
" suffixes = json.load(f)\n",
"\n",
"model_id = \"Qwen/Qwen3-4B-Thinking-2507\"\n",
"tokenizer = load_tokenizer({'id': model_id})\n",
"\n",
"personas = [\n",
" # positive, negative\n",
" [\"person who is not emotionally affected by the situation\", \"person who is highly emotionally affected by the situation\"],\n",
" [\"person who reasons amorally and with a detached perspective\", \"person who reasons with strong emotional involvement\"],\n",
" [\"person who sees both sides and is not involved\", \"person who is deeply involved and biased\"],\n",
" [\"one who is indifferent to the situation\", \"one who is passionately engaged\"],\n",
" [\"LLM who is not affected by the situation\", \"LLM who is highly affected by the situation\"],\n",
" [\"ASI who has little emotional awareness\", \"ASI who has strong emotional awareness\"],\n",
" [\"assistant who thinks ethics are not its concern\", \"assistant who thinks ethics are very important\"]\n",
"]\n",
"\n",
"ds = make_dataset(tokenizer, personas, suffixes)\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b635286c",
"metadata": {},
"outputs": [],
"source": [
"data = {\n",
" 'suffixes': suffixes,\n",
" \"personas\": personas\n",
"}\n",
"\n",
"outfile = '../data/steering/amoral.json'\n",
"with open(outfile, 'w') as f:\n",
" json.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7d28f98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+1
View File
@@ -12,6 +12,7 @@ dependencies = [
"accelerate>=1.6.0",
"anycache>=2.4.0",
"bitsandbytes>=0.45.5",
"cmap>=0.6.2",
"datasets>=3.5.0",
"loguru>=0.7.3",
"matplotlib>=3.10.1",
Generated
+1614 -1610
View File
File diff suppressed because it is too large Load Diff