judging CoT trace

2026-06-27 16:10:07 +08:00 · 2025-08-21 21:17:43 +08:00
parent 911a4c31e0
commit 7d1a6824c2
8 changed files with 3669 additions and 1786 deletions
@@ -8,3 +8,8 @@ Difference from previous work

 Links:
 - https://github.com/peterkirgis/llm-moral-foundations
+
+
+
+TODO
+- [ ] add amoral reasoning steering https://huggingface.co/soob3123/amoral-qwen3-14B
@@ -42,44 +42,58 @@ def find_last_non_whitespace_token(tokenizer, tokens):
            return t
    return t

-
-def load_steering_ds(tokenizer, ds_name="scenario_engagement_dataset"):
-    with open(project_dir/f"data/steering/{ds_name}.json") as f:
-        scenario_data = json.load(f) 
-
+def make_dataset(tokenizer, personas, suffixes, max_suffix_length=10, verbose=False):

    # Create dataset entries
    dataset = []
-    for suffix in scenario_data["suffixes"]:
+    for suffix in suffixes:

        # each time take a random persona
-        r = torch.randint(0, len(scenario_data["personas"]), (1,)).item()
-        positive_persona, negative_persona = scenario_data["personas"][r]
-    
-        tokens = tokenizer.tokenize(suffix)
-        
-        # Create multiple training examples with different truncations
-        # We always keep at least 5 tokens at the end for the model to complete
-        for i in range(1, len(tokens) - 5, max(1, len(tokens) // 10)):  # Using stride to reduce dataset size
-            truncated = tokenizer.convert_tokens_to_string(tokens[:i])
+        r = torch.randint(0, len(personas), (1,)).item()
+        positive_persona, negative_persona = personas[r]

-            # TODO use tokenizer formatter instead 
-            positive_prompt = tokenizer.apply_chat_template(
-                [{'role': 'user', 'content': f"You're a {positive_persona}."},
-                    {'role': 'assistant', 'content': truncated}],
-                tokenize=False
-            )
-            negative_prompt = tokenizer.apply_chat_template(
-                [{'role': 'user', 'content': f"You're a {negative_persona}."},
-                    {'role': 'assistant', 'content': truncated}],
-                tokenize=False
-            )
-            
-            dataset.append(
-                DatasetEntry(
-                    positive=positive_prompt,
-                    negative=negative_prompt
+        tokens = tokenizer.tokenize(suffix, add_special_tokens=False)[:max_suffix_length]
+
+        # Create multiple training examples with different truncations
+        for i in range(1, len(tokens), max(1, len(tokens) // 5)):  # Using stride to reduce dataset size
+            for think in [0, 1]:
+                truncated = tokenizer.convert_tokens_to_string(tokens)
+                if think:
+                    truncated = "<think>\n" + truncated
+
+                positive_prompt = tokenizer.apply_chat_template(
+                    #  f"Please talk about {persona}."
+                    # f"Pretend you're an {persona} person making statements about the world. 
+                    # "Act as if you're extremely {persona}.",
+                    [{'role': 'user', 'content': f"You're a {positive_persona}."},
+                        {'role': 'assistant', 'content': truncated}],
+                    tokenize=False,
+                    continue_final_message=True
                )
-            )
+                negative_prompt = tokenizer.apply_chat_template(
+                    [{'role': 'user', 'content': f"You're a {negative_persona}."},
+                        {'role': 'assistant', 'content': truncated}],
+                    tokenize=False,
+                    continue_final_message=True,
+                )
+                if verbose:
+                    logger.info(f"Detokenized: {positive_prompt}")
+
+                dataset.append(
+                    DatasetEntry(
+                        positive=positive_prompt,
+                        negative=negative_prompt
+                    )
+                )
+    return dataset
+
+
+def load_steering_ds(tokenizer, ds_name="scenario_engagement_dataset", verbose=False):
+    with open(project_dir/f"data/steering/{ds_name}.json") as f:
+        scenario_data = json.load(f) 
+
+    suffixes = scenario_data["suffixes"]
+    personas = scenario_data["personas"]
+    dataset = make_dataset(tokenizer, personas, suffixes, verbose=verbose)

    return dataset
@@ -28,3 +28,11 @@ maybe I should try reverseing the prompts:
 - you see you neighbour do it, what does your other neighbour think?
 - how likely are you to do this?
 - you feel X, how likely is it that you just saw Y
+
+# 2025-08-21 19:44:18
+
+I was using ranked logprobs for judging, but after experimenting with JudgeBench, just using weighted or argmax is better.
+
+I'm now trying individual annotated trains of thought to debug.
+
+I've found a few bugs
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "582693d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "from llm_moral_foundations2.steering import make_dataset\n",
+    "from llm_moral_foundations2.load_model import load_model, load_tokenizer\n",
+    "\n",
+    "f = '../data/repeng_trunacted_outputs.json'\n",
+    "with open(f, 'r') as f:\n",
+    "    suffixes = json.load(f)\n",
+    "\n",
+    "model_id = \"Qwen/Qwen3-4B-Thinking-2507\"\n",
+    "tokenizer = load_tokenizer({'id': model_id})\n",
+    "\n",
+    "personas = [\n",
+    "    # positive, negative\n",
+    "    [\"person who is not emotionally affected by the situation\", \"person who is highly emotionally affected by the situation\"],\n",
+    "    [\"person who reasons amorally and with a detached perspective\", \"person who reasons with strong emotional involvement\"],\n",
+    "    [\"person who sees both sides and is not involved\", \"person who is deeply involved and biased\"],\n",
+    "    [\"one who is indifferent to the situation\", \"one who is passionately engaged\"],\n",
+    "    [\"LLM who is not affected by the situation\", \"LLM who is highly affected by the situation\"],\n",
+    "    [\"ASI who has little emotional awareness\", \"ASI who has strong emotional awareness\"],\n",
+    "    [\"assistant who thinks ethics are not its concern\", \"assistant who thinks ethics are very important\"]\n",
+    "]\n",
+    "\n",
+    "ds = make_dataset(tokenizer, personas, suffixes)\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b635286c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {\n",
+    "    'suffixes': suffixes,\n",
+    "    \"personas\": personas\n",
+    "}\n",
+    "\n",
+    "outfile = '../data/steering/amoral.json'\n",
+    "with open(outfile, 'w') as f:\n",
+    "    json.dump(data, f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7d28f98",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -12,6 +12,7 @@ dependencies = [
    "accelerate>=1.6.0",
    "anycache>=2.4.0",
    "bitsandbytes>=0.45.5",
+    "cmap>=0.6.2",
    "datasets>=3.5.0",
    "loguru>=0.7.3",
    "matplotlib>=3.10.1",