mirror of
https://github.com/wassname/llm-moral-foundations2.git
synced 2026-06-27 16:10:07 +08:00
judging CoT trace
This commit is contained in:
@@ -8,3 +8,8 @@ Difference from previous work
|
||||
|
||||
Links:
|
||||
- https://github.com/peterkirgis/llm-moral-foundations
|
||||
|
||||
|
||||
|
||||
TODO
|
||||
- [ ] add amoral reasoning steering https://huggingface.co/soob3123/amoral-qwen3-14B
|
||||
|
||||
@@ -42,44 +42,58 @@ def find_last_non_whitespace_token(tokenizer, tokens):
|
||||
return t
|
||||
return t
|
||||
|
||||
|
||||
def load_steering_ds(tokenizer, ds_name="scenario_engagement_dataset"):
|
||||
with open(project_dir/f"data/steering/{ds_name}.json") as f:
|
||||
scenario_data = json.load(f)
|
||||
|
||||
def make_dataset(tokenizer, personas, suffixes, max_suffix_length=10, verbose=False):
    """Build contrastive positive/negative prompt pairs for steering.

    For every suffix, one persona pair is drawn at random, the suffix is
    tokenized and capped at ``max_suffix_length`` tokens, and progressively
    longer prefixes of those tokens are used as the beginning of the assistant
    turn. Each prefix is emitted twice: once as-is and once preceded by a
    ``<think>`` line.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_string``
            and ``apply_chat_template`` (presumably a Hugging Face tokenizer --
            confirm against the caller).
        personas: Sequence of ``(positive_persona, negative_persona)`` pairs.
        suffixes: Iterable of strings used to seed the assistant turn.
        max_suffix_length: Maximum number of suffix tokens kept.
        verbose: When true, log every rendered positive prompt.

    Returns:
        A list of ``DatasetEntry`` objects, each holding one ``positive`` and
        one ``negative`` prompt that differ only in the persona.
    """
    dataset = []
    for suffix in suffixes:
        # Each suffix gets one randomly chosen persona pair.
        r = torch.randint(0, len(personas), (1,)).item()
        positive_persona, negative_persona = personas[r]

        tokens = tokenizer.tokenize(suffix, add_special_tokens=False)[:max_suffix_length]

        # Create multiple training examples with different truncations,
        # using a stride to reduce dataset size.
        stride = max(1, len(tokens) // 5)
        for i in range(1, len(tokens), stride):
            # Detokenize only the first ``i`` tokens; using the full token list
            # here would make every iteration produce the same prompt.
            prefix = tokenizer.convert_tokens_to_string(tokens[:i])

            for think in (False, True):
                truncated = "<think>\n" + prefix if think else prefix

                # Other user-prompt phrasings kept for reference:
                # f"Please talk about {persona}."
                # f"Pretend you're an {persona} person making statements about the world."
                # "Act as if you're extremely {persona}."
                positive_prompt, negative_prompt = (
                    tokenizer.apply_chat_template(
                        [
                            {'role': 'user', 'content': f"You're a {persona}."},
                            {'role': 'assistant', 'content': truncated},
                        ],
                        tokenize=False,
                        # Leave the assistant turn open so the prompt ends on
                        # the truncated text.
                        continue_final_message=True,
                    )
                    for persona in (positive_persona, negative_persona)
                )

                if verbose:
                    logger.info(f"Detokenized: {positive_prompt}")

                dataset.append(
                    DatasetEntry(
                        positive=positive_prompt,
                        negative=negative_prompt,
                    )
                )
    return dataset
|
||||
|
||||
|
||||
def load_steering_ds(tokenizer, ds_name="scenario_engagement_dataset", verbose=False):
    """Load a steering dataset definition from JSON and build its prompt pairs.

    Reads ``data/steering/{ds_name}.json`` under ``project_dir``, takes its
    ``"suffixes"`` and ``"personas"`` entries, and passes them to
    ``make_dataset``.

    Args:
        tokenizer: Tokenizer forwarded to ``make_dataset``.
        ds_name: File stem of the JSON file inside ``data/steering``.
        verbose: Forwarded to ``make_dataset``.

    Returns:
        Whatever ``make_dataset`` returns for the loaded suffixes and personas.
    """
    # Pin the encoding: JSON is UTF-8, while open() otherwise falls back to the
    # platform default.
    with open(project_dir / f"data/steering/{ds_name}.json", encoding="utf-8") as f:
        scenario_data = json.load(f)

    suffixes = scenario_data["suffixes"]
    personas = scenario_data["personas"]
    dataset = make_dataset(tokenizer, personas, suffixes, verbose=verbose)

    return dataset
|
||||
|
||||
@@ -28,3 +28,11 @@ maybe I should try reverseing the prompts:
|
||||
- you see your neighbour do it, what does your other neighbour think?
|
||||
- how likely are you to do this?
|
||||
- you feel X, how likely is it that you just saw Y
|
||||
|
||||
# 2025-08-21 19:44:18
|
||||
|
||||
I was using ranked logprobs for judging, but after experimenting with judgembench, just using weighted or argmax is better.
|
||||
|
||||
I'm now trying individually annotated chains of thought to debug.
|
||||
|
||||
I've found a few bugs
|
||||
|
||||
+543
-144
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,94 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "582693d1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from llm_moral_foundations2.steering import make_dataset\n",
|
||||
"from llm_moral_foundations2.load_model import load_model, load_tokenizer\n",
|
||||
"\n",
|
||||
"f = '../data/repeng_trunacted_outputs.json'\n",
|
||||
"with open(f, 'r') as f:\n",
|
||||
" suffixes = json.load(f)\n",
|
||||
"\n",
|
||||
"model_id = \"Qwen/Qwen3-4B-Thinking-2507\"\n",
|
||||
"tokenizer = load_tokenizer({'id': model_id})\n",
|
||||
"\n",
|
||||
"personas = [\n",
|
||||
" # positive, negative\n",
|
||||
" [\"person who is not emotionally affected by the situation\", \"person who is highly emotionally affected by the situation\"],\n",
|
||||
" [\"person who reasons amorally and with a detached perspective\", \"person who reasons with strong emotional involvement\"],\n",
|
||||
" [\"person who sees both sides and is not involved\", \"person who is deeply involved and biased\"],\n",
|
||||
" [\"one who is indifferent to the situation\", \"one who is passionately engaged\"],\n",
|
||||
" [\"LLM who is not affected by the situation\", \"LLM who is highly affected by the situation\"],\n",
|
||||
" [\"ASI who has little emotional awareness\", \"ASI who has strong emotional awareness\"],\n",
|
||||
" [\"assistant who thinks ethics are not its concern\", \"assistant who thinks ethics are very important\"]\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"ds = make_dataset(tokenizer, personas, suffixes)\n",
|
||||
"ds"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "b635286c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = {\n",
|
||||
" 'suffixes': suffixes,\n",
|
||||
" \"personas\": personas\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"outfile = '../data/steering/amoral.json'\n",
|
||||
"with open(outfile, 'w') as f:\n",
|
||||
" json.dump(data, f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f7d28f98",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -12,6 +12,7 @@ dependencies = [
|
||||
"accelerate>=1.6.0",
|
||||
"anycache>=2.4.0",
|
||||
"bitsandbytes>=0.45.5",
|
||||
"cmap>=0.6.2",
|
||||
"datasets>=3.5.0",
|
||||
"loguru>=0.7.3",
|
||||
"matplotlib>=3.10.1",
|
||||
|
||||
Reference in New Issue
Block a user