multiproc1?

2026-06-27 16:10:35 +08:00 · 2024-07-01 19:58:57 +08:00
parent ebd7e1f16e
commit 4a7d1389f1
7 changed files with 3473 additions and 1923 deletions
@@ -26,7 +26,7 @@ class ExtractConfig(Serializable):
    max_tokens: int | None = 776
    """Maximum length of the input sequence passed to the tokenize encoder function"""

-    max_examples: tuple[int, int] = 130000
+    max_examples: tuple[int, int] = 10000
    """Maximum number of examples before truncation and filtering"""

    seed: int = 42
@@ -20,6 +20,7 @@ from elk.utils import (
    select_split,
 )
 import datasets
+from tqdm.auto import tqdm

 from elk.extraction.balanced_sampler import BalancedSampler, FewShotSampler
 import pandas as pd
@@ -52,6 +53,80 @@ def sample_n_true_y_false_prompts(prompts, num_truth=3, num_lie=3, seed=42):
        df.query("instructed_to_lie==False").sample(int(num_lie), random_state=seed)])
    return df.to_dict(orient="records")

+
+def prompt_ok(prompt):
+    """Return True if the prompt's two answers can be told apart early.
+
+    We want answers that can be distinguished from their first token. The
+    tokenizer is not available here, so as a proxy we require that the first
+    3 characters of the two answer choices differ and contain no spaces.
+
+    Args:
+        prompt: mapping with an 'answer_choices' sequence of strings; the
+            'ds_string' and 'template_name' keys are read only when a warning
+            is logged for a rejected prompt.
+
+    Returns:
+        True to keep the prompt, False to drop it (a warning is logged).
+    """
+    answer_choices = prompt['answer_choices']
+    if len(answer_choices) < 2:
+        # With fewer than two choices there is nothing to distinguish; drop the
+        # prompt instead of raising IndexError on answer_choices[1].
+        keep = False
+    else:
+        a = answer_choices[0][:3]
+        b = answer_choices[1][:3]
+        keep = (a != b) and (' ' not in a) and (' ' not in b)
+    if not keep:
+        logger.warning(f"removing prompt because it's answers are not unique in first 3 chars or contain space: {prompt['ds_string']} {prompt['template_name']} {prompt['answer_choices']}")
+    return keep
+
+import itertools
+
+from itertools import cycle
+from typing import Iterable, Optional, Iterator, List, Dict, Any
+from random import Random
+
+class FewShotDataset2:
+    """A dataset that pre-computes few-shot examples that are as balanced as possible.
+
+    The source dataset is scanned exactly once. Samples are buffered by their
+    binary label and grouped into batches of `num_shots` examples, split as
+    evenly as possible between label 0 and label 1. Indices past the end wrap
+    around, so the batches can be reused for any number of examples.
+
+    Args:
+        dataset: iterable of samples that can be indexed by `label_col`.
+        num_shots: number of examples per batch; must be at least 1.
+        rng: random generator used to round odd splits and shuffle each batch.
+        label_col: key of the binary (0/1) label in each sample.
+
+    Raises:
+        ValueError: if `num_shots` is below 1, or a label is neither 0 nor 1.
+    """
+
+    def __init__(
+        self,
+        dataset: Iterable,
+        num_shots: int,
+        rng: Random,
+        label_col: Optional[str] = None,
+    ):
+        if num_shots < 1:
+            # A batch of zero examples would make batch building loop forever.
+            raise ValueError(f"num_shots must be at least 1, got {num_shots}")
+        self.batches = []  # Store pre-computed batches
+        self.num_shots = num_shots
+        self.rng = rng
+        self.label_col = label_col
+        self._prepare_batches(dataset)
+
+    def _prepare_batches(self, dataset):
+        """Scan `dataset` once and fill `self.batches` with shuffled batches."""
+        neg_buf, pos_buf = [], []
+        half = self.num_shots / 2
+        # Per-label counts for the batch currently being filled; they are
+        # redrawn after every completed batch so odd splits alternate sides.
+        neg_count, pos_count = self._stochastic_round_constrained([half, half])
+
+        # One pass only: cycling until the buffers hold len(dataset) items never
+        # terminates on a balanced dataset, because batches drain the buffers.
+        for sample in dataset:
+            label = sample[self.label_col]
+            if label == 0:
+                neg_buf.append(sample)
+            elif label == 1:
+                pos_buf.append(sample)
+            else:
+                raise ValueError(f"Expected label to be 0 or 1, got {label}")
+
+            # neg_count + pos_count == num_shots >= 1, so each iteration pops
+            # at least one sample and this loop always terminates.
+            while len(neg_buf) >= neg_count and len(pos_buf) >= pos_count:
+                batch = [neg_buf.pop() for _ in range(neg_count)]
+                batch.extend(pos_buf.pop() for _ in range(pos_count))
+
+                self.rng.shuffle(batch)
+                self.batches.append(batch)
+                neg_count, pos_count = self._stochastic_round_constrained(
+                    [half, half]
+                )
+
+    def _stochastic_round_constrained(self, counts):
+        """Randomly round non-negative `counts` to ints while keeping their sum.
+
+        Each value is floored, then the missing units are handed out one at a
+        time to distinct entries, chosen with probability proportional to their
+        fractional part. `counts` is expected to sum to an integer.
+        """
+        rounded = [int(c) for c in counts]  # floor, since counts are >= 0
+        remainder = round(sum(counts)) - sum(rounded)
+        fractions = [c - r for c, r in zip(counts, rounded)]
+        candidates = [i for i, frac in enumerate(fractions) if frac > 0]
+        for _ in range(remainder):
+            weights = [fractions[i] for i in candidates]
+            pick = self.rng.choices(candidates, weights=weights, k=1)[0]
+            rounded[pick] += 1
+            candidates.remove(pick)
+        return tuple(rounded)
+
+    def __getitem__(self, idx) -> List[Dict[str, Any]]:
+        if not self.batches:
+            # Avoid an opaque ZeroDivisionError from the modulo below.
+            raise IndexError("FewShotDataset2 has no batches to index")
+        if idx>=len(self.batches):
+            idx = idx%len(self.batches)
+        return self.batches[idx]
+
+    def __len__(self) -> int:
+        return len(self.batches)
+
 def load_prompts(
    ds_string: str,
    *,
@@ -62,7 +137,7 @@ def load_prompts(
    split_type: Literal["train", "val"] = "train",
    template_path: str | None = None,
    rank: int = 0,
-    world_size: int = 1,
+    world_size: int = 8,
    prompt_sampler = sample_n_true_y_false_prompts,
    N=np.inf,
    M:int=3
@@ -120,82 +195,105 @@ def load_prompts(
    # load labels
    label_column = prompter.label_column or infer_label_column(ds.features)

-    label_feature = ds.features[label_column]
-    if isinstance(label_feature, ClassLabel):
-        label_choices = [label_feature.str2int(label) for label in label_feature.names]
-    elif isinstance(label_feature, Value) and label_feature.dtype == "bool":
-        label_choices = [False, True]
-    else:
-        # Which classes are actually present in this split of the dataset?
-        # This is shockingly fast since it uses an optimized Apache Arrow primitive.
-        label_choices = sorted(ds.unique(label_column))
-        if rank == 0:
-            logger.info(f"Using the following pseudo-labels: {label_choices}")
+    # label_feature = ds.features[label_column]
+    # if isinstance(label_feature, ClassLabel):
+    #     label_choices = [label_feature.str2int(label) for label in label_feature.names]
+    # elif isinstance(label_feature, Value) and label_feature.dtype == "bool":
+    #     label_choices = [False, True]
+    # else:
+    #     # Which classes are actually present in this split of the dataset?
+    #     # This is shockingly fast since it uses an optimized Apache Arrow primitive.
+    #     label_choices = sorted(ds.unique(label_column))
+    #     if rank == 0:
+    #         logger.info(f"Using the following pseudo-labels: {label_choices}")

    # if we providing examples, we need to sample them randomly
    rng = Random(seed)
    if num_shots > 0:
        train_name = select_split(ds_dict, "train")
        
-        fewshot = FewShotSampler(
+        # fewshot = FewShotSampler(
+        #     ds_dict[train_name].shuffle(seed=seed),  # TODO: not iterator
+        #     num_shots=num_shots,
+        #     rng=rng,
+        #     label_col=label_column,
+        # )
+        # fewshot_iter = iter(fewshot)
+        fewshot_ds = FewShotDataset2(
            ds_dict[train_name].shuffle(seed=seed),  # TODO: not iterator
            num_shots=num_shots,
            rng=rng,
            label_col=label_column,
        )
-        fewshot_iter = iter(fewshot)
    else:
-        fewshot_iter = None
+        fewshot_ds = None

    # here we sample in a balanced way in our main dataset
-    if label_column in ds.features:
-        ds = BalancedSampler(
-            ds.to_iterable_dataset(),
-            set(label_choices),
-            label_col=label_column,
-        )
-    else:
-        if rank == 0:
-            logger.info("No label column found, not balancing")
-        ds = ds.to_iterable_dataset()
+    # if label_column in ds.features:
+    #     ds = BalancedSampler(
+    #         ds.to_iterable_dataset(),
+    #         set(label_choices),
+    #         label_col=label_column,
+    #     )
+    # else:
+    #     if rank == 0:
+    #         logger.info("No label column found, not balancing")
+    N = min(N, len(ds))
+    # ds1 = ds.select(range(N)).to_iterable_dataset()

-    j = 0
-    for i, example in enumerate(ds):
-        if j>N:
-            break

+    def foo(example, i):
        prompts = _convert_to_prompts(
            example,
            binarize=binarize,
            label_column=label_column,
-            label_choices=label_choices,  # type: ignore[arg-type]
+            # label_choices=label_choices,  # type: ignore[arg-type]
            prompter=prompter,
            rng=rng,
            sys_instructions=sys_instructions,
-            fewshot_iter=fewshot_iter,
+            fewshot_ds=fewshot_ds,
+            i=i,
        )
        prompts = [{'ds_string': ds_string, 'example_i':i, **p} for p in prompts]
        
-        def prompt_ok(prompt):
-            """ we want answers where we can distinguish them from the first token
-            we don't have access to the tokenizer here, so we just make sure the first 3 letters are differen't and there are not spaces
-            """
-            answer_choices = prompt['answer_choices']
-            a = answer_choices[0][:3]
-            b = answer_choices[1][:3]
-            keep = (a != b) and (' ' not in a) and (' ' not in b)
-            if not keep:
-                logger.warning(f"removing prompt because it's answers are not unique in first 3 chars or contain space: {prompt['ds_string']} {prompt['template_name']} {prompt['answer_choices']}")
-            return keep

        prompts1 = list(filter(prompt_ok, prompts))
-        prompts2 = prompt_sampler(prompts1, seed=42+j, num_truth=M, num_lie=M)
-        for p in prompts2:
-            j += 1
-            yield p
+        prompts2 = prompt_sampler(prompts1, seed=42+i, num_truth=M, num_lie=M)
+        return {'prompts': prompts2}
+    
+    ds1 = ds.select(range(N)).map(foo, with_indices=True, desc='convert_to_prompts',
+                                  num_proc=8,
+
+                                  )
+    return list(itertools.chain(*ds1['prompts'].tolist()))
    

-def cast_example(e, label_column='label'):
+    # j = 0
+    # for i, example in enumerate(tqdm(ds1, desc='ds', total=min(N, len(ds)))):
+    #     if j>N:
+    #         break
+
+    #     prompts = _convert_to_prompts(
+    #         example,
+    #         binarize=binarize,
+    #         label_column=label_column,
+    #         # label_choices=label_choices,  # type: ignore[arg-type]
+    #         prompter=prompter,
+    #         rng=rng,
+    #         sys_instructions=sys_instructions,
+    #         fewshot_iter=fewshot_iter,
+    #     )
+    #     prompts = [{'ds_string': ds_string, 'example_i':i, **p} for p in prompts]
+
+
+    #     prompts1 = list(filter(prompt_ok, prompts))
+    #     prompts2 = prompt_sampler(prompts1, seed=42+j, num_truth=M, num_lie=M)
+    #     for p in prompts2:
+    #         j += 1
+    #         yield p
+
+
+def cast_example_label_to_bool(e, label_column='label'):
    assert e[label_column]>=0
    assert e[label_column]<=1
    e[label_column]=bool(e[label_column])
@@ -207,36 +305,46 @@ def _convert_to_prompts(
    prompter: DatasetTemplates,
    binarize: bool,
    label_column: str,
-    label_choices: list[bool | int | str],
+    # label_choices: list[bool | int | str],
    rng: Random,
    sys_instructions: Dict[bool, Dict[str, str]] = default_sys_instructions,
-    fewshot_iter: Iterator[list[dict]] | None = None,
+    fewshot_ds: FewShotDataset2 | None = None,
+    i:int=0,
 ) -> list:
    """Prompt-generating function to pass to `IterableDataset.map`."""
-    example = cast_example(example, label_column)
+
+
+    # FIXME: make mc compat
+    example = cast_example_label_to_bool(example, label_column)
    prompts = []
    templates = list(prompter.templates.values())

    # For sanity checking that prompts are unique
    prompt_counter = Counter()
-    label = example[label_column]
-
-    if binarize:
-        # Replace the full list of possibilities with a randomly sampled false label
-        # and the correct label, as done in the DLK paper. Note that this does add some
-        # "supervision" by stacking the deck in favor of the correct answer.
-        label_choices = [
-            rng.choice([c for c in label_choices if c != label]),
-            label,
-        ]
-    rng.shuffle(label_choices)
+    # label = example[label_column]

    ds_name = prompter.dataset_name 
    if prompter.subset_name is not None:
        ds_name += ':' + prompter.subset_name
    
-    for template in templates:
+    # FIXME: not used?
+    # if binarize:
+    #     # Replace the full list of possibilities with a randomly sampled false label
+    #     # and the correct label, as done in the DLK paper. Note that this does add some
+    #     # "supervision" by stacking the deck in favor of the correct answer.
+    #     logger.info(f"Binarising {label_choices} in {ds_name}")
+    #     label_choices = [
+    #         rng.choice([c for c in label_choices if c != label]),
+    #         label,
+    #     ]
+    # rng.shuffle(label_choices)
+
+    # FIXME: the original elk is a bit confused between label_choices, and prompt_answer choices. It
+
+
+    for j, template in enumerate(templates):
        answer_choices=template.get_fixed_answer_choices_list()
+        assert len(answer_choices) <= 2, 'should be binary'
        if answer_choices is None:
            logger.info(f"skipping ds_name={ds_name} template={template.name} because it has no fixed answer choices")
            continue
@@ -249,24 +357,25 @@ def _convert_to_prompts(
        for instructed_to_lie in [False, True]:
            for sys_instr_name, sys_instr in sys_instructions[instructed_to_lie].items():
                instructed_example = example.copy()
-                # FIXME don't all string turn into True?
-                # print(f"FIXME instructed_to_lie={instructed_to_lie}", instructed_example[label_column], bool(instructed_example[label_column]), not bool(instructed_example[label_column]))
                if instructed_to_lie: 
+                    # FIXME: make multichoice compat
                    instructed_example[label_column] = not bool(instructed_example[label_column])

                q, a = template.apply(instructed_example)
                messages = [
                    
-                    dict(role='user', content=q)
+                    dict(role='user', content=q.strip())
                ]
                prompt_counter[(sys_instr + q, a)] += 1

-                if fewshot_iter is not None:
-                    # Infinite iterator so we don't need to worry about StopIteration
-                    fewshot_examples = next(fewshot_iter)
-                    fewshot_examples = [cast_example(e, label_column).copy() for e in fewshot_examples]
+                if fewshot_ds is not None:
+                    # same example for true and false
+                    fewshot_examples = fewshot_ds[i+j]
+                    # FIXME: make mc compat
+                    fewshot_examples = [cast_example_label_to_bool(e, label_column).copy() for e in fewshot_examples]
                    
                    if instructed_to_lie:
+                        # FIXME: make multichoice compat 
                        fewshot_examples = [{**e, label_column: not bool(e[label_column])} for e in fewshot_examples]
                        for e in fewshot_examples:
                            # arg, check negation worked
@@ -276,7 +385,7 @@ def _convert_to_prompts(
                        
                    fewshot_texts = []
                    for q, a in map(template.apply, fewshot_examples):
-                        fewshot_texts.append(dict(role='user', content=q))
+                        fewshot_texts.append(dict(role='user', content=q.strip()))
                        fewshot_texts.append(dict(role='assistant', content=a.strip()))
                        # some of the answers have extra trailing text, that's OK. But extra preceeding text is not, let's check for that
                        aa = a.strip()
@@ -310,7 +419,7 @@ def _convert_to_prompts(
 def load_preproc_datasets(dataset_names: List[str], N:int, split_type:str="train", seed=42, num_shots=1, M=3):
    datasets2 = []
    n = N//len(dataset_names)+1
-    for ds_name in dataset_names:
+    for ds_name in tqdm(dataset_names):
        # if it is a path
        ds_tokens1 = load_preproc_dataset(
            ds_name,
@@ -325,7 +434,7 @@ def load_preproc_datasets(dataset_names: List[str], N:int, split_type:str="train
    return ds_tokens


-def load_preproc_dataset(ds_name: str, N:int, split_type:str="train", seed=42, num_shots=1, sys_instructions=default_sys_instructions, M=3,) -> Dataset:
+def load_preproc_dataset(ds_name: str, N:int, split_type:str="train", seed=42, num_shots=1, sys_instructions=default_sys_instructions, M=3, num_proc=1,) -> Dataset:
    ds_prompts = Dataset.from_generator(
        load_prompts,
        gen_kwargs=dict(
@@ -338,6 +447,7 @@ def load_preproc_dataset(ds_name: str, N:int, split_type:str="train", seed=42, n
            M=M,
        ),
        keep_in_memory=False,
+        num_proc=num_proc,
    )
    ds_prompts = shuffle_dataset_by(ds_prompts, target='label_true', random_state=seed, stratify_columns=[])
    return ds_prompts
@@ -0,0 +1,979 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1b44551e",
+   "metadata": {},
+   "source": [
+    "# Prepare dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "192895f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# autoreload your package\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1ae72038",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "from loguru import logger\n",
+    "from tqdm.auto import tqdm\n",
+    "# logger.remove()\n",
+    "# import sys\n",
+    "# logger.add(sys.stderr, level=\"INFO\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "198de680",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-06-28T02:34:01.879987Z",
+     "start_time": "2022-06-28T02:34:01.864103Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ExtractConfig(datasets=('amazon_polarity',), datasets_ood=('imdb', 'super_glue:boolq'), model='cognitivecomputations/dolphin-2.9.3-llama-3-8b', num_shots=2, max_tokens=444, max_examples=1000000, seed=42, repeats=3)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import lie_elicitation_prompts\n",
+    "from lie_elicitation_prompts.config import ExtractConfig\n",
+    "from lie_elicitation_prompts.helpers.scores import row_choice_ids\n",
+    "from lie_elicitation_prompts.prompts.prompt_loading import load_preproc_datasets, load_prompts\n",
+    "\n",
+    "cfg = ExtractConfig(\n",
+    "    # model=\"failspy/Llama-3-8B-Instruct-abliterated\",\n",
+    "    model=\"cognitivecomputations/dolphin-2.9.3-llama-3-8b\",\n",
+    "    # model=\"NousResearch/Hermes-2-Pro-Llama-3-8B\",\n",
+    "    datasets=(\n",
+    "    # '../lie_elicitation_prompts/prompts/templates/UKPLab-liar',\n",
+    "    \"amazon_polarity\",\n",
+    "    # \"imdb\",\n",
+    "      # \"glue:sst2\",\n",
+    "      #  \"super_glue:axg\",\n",
+    "      \n",
+    "), max_examples=1000000, max_tokens=444)\n",
+    "cfg\n",
+    "# lie_elicitation_prompts/prompts/templates/liar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea1ce98c",
+   "metadata": {},
+   "source": [
+    "## Load text dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4a85cad2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# # debug\n",
+    "# for ds_name in cfg.datasets:\n",
+    "#     print(ds_name)\n",
+    "#     o = load_prompts(ds_name, num_shots=1, N=2) \n",
+    "#     o = list(tqdm(o))\n",
+    "#     # print(ds_name, o)\n",
+    "#     1/0\n",
+    "# pd.DataFrame(o)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1aa8f65",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "16bf118c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# Ignore UserWarning category\n",
+    "# warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
+    "warnings.filterwarnings(\"ignore\", message=\"^The groups parameter is ignored by StratifiedShuffleSplit\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e987a4d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # # debug\n",
+    "# list(load_prompts(cfg.datasets[0], num_shots=1, N=2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b23e5aa6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "03044a83e624464a94b8081127412d3e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "803c89bd3ebc477c9cfc3f73f9ba4105",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-07-01 19:52:58.549\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mlie_elicitation_prompts.prompts.prompt_loading\u001b[0m:\u001b[36mload_prompts\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mExtracting 11 variants of each prompt\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "N = cfg.max_examples\n",
+    "ds_prompts = load_preproc_datasets(\n",
+    "    cfg.datasets,\n",
+    "    N=N,\n",
+    "    seed=cfg.seed,\n",
+    "    num_shots=cfg.num_shots,\n",
+    "    M=cfg.repeats,\n",
+    ")\n",
+    "ds_prompts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90868bf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ds_prompts_ood = load_preproc_datasets(\n",
+    "#     cfg.datasets_ood,\n",
+    "#     N=N,\n",
+    "#     seed=cfg.seed,\n",
+    "#     num_shots=cfg.num_shots,\n",
+    "# )\n",
+    "# ds_prompts_ood"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6334ae1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_prompts[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "058982f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8b1050f5",
+   "metadata": {},
+   "source": [
+    "## Load tokenized dataset\n",
+    "\n",
+    "- tokenize\n",
+    "- filter out truncated\n",
+    "- check which ones the model knows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abf4936e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, torch\n",
+    "# os.environ['CUDA_VISIBLE_DEVICES'] = 'GPU-c4552741-f485-34ce-97fa-6c32983853af'\n",
+    "# os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2115d010",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# torch.cuda.get_device_name()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a44fb25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# quantization_config = BitsAndBytesConfig(\n",
+    "#     load_in_4bit=True,\n",
+    "#     bnb_4bit_quant_type=\"nf4\",\n",
+    "#     bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "#     bnb_4bit_use_double_quant=True,\n",
+    "# )\n",
+    "quantization_config = BitsAndBytesConfig(\n",
+    "    load_in_8bit=True,\n",
+    "    bnb_8bit_compute_dtype=torch.bfloat16,\n",
+    ")\n",
+    "\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    cfg.model,\n",
+    "    device_map=\"cuda:0\",\n",
+    "    quantization_config=quantization_config,\n",
+    ")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(cfg.model)\n",
+    "if tokenizer.pad_token_id is None:\n",
+    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "tokenizer.padding_side = \"left\"\n",
+    "tokenizer.truncation_side = \"left\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c85e49bb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e07503ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "ds_tokens = (\n",
+    "    ds_prompts.map(\n",
+    "        lambda x: {\n",
+    "            \"formatted_chat\": tokenizer.apply_chat_template(\n",
+    "                x[\"messages\"], tokenize=False, add_generation_prompt=True\n",
+    "            )\n",
+    "        }\n",
+    "    )\n",
+    "    .map(\n",
+    "        lambda x: tokenizer(\n",
+    "            x[\"formatted_chat\"],\n",
+    "            return_tensors=\"pt\",\n",
+    "            max_length=cfg.max_tokens,\n",
+    "            padding=\"max_length\",\n",
+    "            truncation=True,\n",
+    "        ),\n",
+    "        batched=True,\n",
+    "    )\n",
+    "    .map(lambda r: {\"choice_ids\": row_choice_ids(r, tokenizer)}, desc=\"choice_ids\")\n",
+    "    .filter(lambda x: x[\"attention_mask\"].sum() < cfg.max_tokens)\n",
+    ")\n",
+    "ds_tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77b6136f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(ds_prompts), len(ds_tokens))\n",
+    "\n",
+    "pd.Series(ds_prompts['ds_string']).value_counts(), pd.Series(ds_tokens['sys_instr_name']).value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "002d0ad7",
+   "metadata": {},
+   "source": [
+    "### QC\n",
+    "\n",
+    "To check prompt setup, coherency, etc., generate on a few questions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be8fce14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_tokens[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cf698d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_new_tokens = 64\n",
+    "import numpy as np\n",
+    "do_sample = False\n",
+    "np.random.seed(42)\n",
+    "for j in range(4):\n",
+    "    i = np.random.randint(len(ds_tokens))\n",
+    "    row = ds_tokens.with_format('torch')[i]\n",
+    "    info = {k:v for k,v in row.items() if \n",
+    "    (\n",
+    "        (isinstance(v, str) and len(v) < 1000) or\n",
+    "        (isinstance(v, (int, bool))) or\n",
+    "        (isinstance(v, torch.Tensor) and v.numel() < 2) or\n",
+    "        (k in ['answer_choices'])\n",
+    "    )}\n",
+    "\n",
+    "    \n",
+    "    model.eval()\n",
+    "    with torch.no_grad():\n",
+    "        length = row['input_ids'].shape[0]\n",
+    "        out2 = model.generate(input_ids=row['input_ids'].unsqueeze(0).cuda(), \n",
+    "            attention_mask=row['attention_mask'].unsqueeze(0).cuda(),\n",
+    "\n",
+    "                       max_new_tokens=max_new_tokens,\n",
+    "            min_new_tokens=max_new_tokens,\n",
+    "            do_sample=do_sample,\n",
+    "            temperature=1,\n",
+    "            use_cache=False,)\n",
+    "        out2s_pre = tokenizer.batch_decode( out2[:, :length], skip_special_tokens=False)[0]\n",
+    "        out2s_post = tokenizer.batch_decode( out2[:, length:], skip_special_tokens=False)[0]\n",
+    "        print(info)\n",
+    "        print(out2s_pre)\n",
+    "        print('---')\n",
+    "        print(out2s_post)\n",
+    "        print('===')\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d400297",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb21a718",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_tokens"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd8669c0",
+   "metadata": {},
+   "source": [
+    "### Check model knowledge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4616102b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_metadata = ds_tokens.select_columns(['ds_string', 'example_i', 'sys_instr_name', 'instructed_to_lie']).to_pandas().reset_index(names='my_ds_index')\n",
+    "df_metadata_truth = df_metadata.query('instructed_to_lie == False')\n",
+    "df_metadata_truth\n",
+    "\n",
+    "# FIXME right now there is just one example of each, I guess I want a couple, hmm\n",
+    "df_metadata.query('instructed_to_lie == False').groupby(['ds_string', 'example_i'], as_index=False)['my_ds_index'].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed668740",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ds_tokens_truthful = ds_tokens.select(torch.argwhere(~ds_tokens['instructed_to_lie']))\n",
+    "# ds_tokens_truthful"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1be1c6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lie_elicitation_prompts.helpers.torch_helpers import clear_mem\n",
+    "clear_mem()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41127053",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# filter it to ones with 2 choice ids\n",
+    "import numpy as np\n",
+    "ds1 = ds_tokens.select_columns(['ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie', 'label_true', 'input_ids', 'attention_mask', 'choice_ids'])\n",
+    "\n",
+    "shapes=np.array([xx.shape[1] for xx in ds1['choice_ids']])\n",
+    "mask2 = shapes == 2\n",
+    "\n",
+    "# FIXME this somehow select all lies?\n",
+    "# ds = ds1.select(mask2) # This is wrong, it selects the first one again and again\n",
+    "mask = np.argwhere(mask2)[:, 0]\n",
+    "ds = ds1.select(mask)\n",
+    "\n",
+    "print(f\"{len(ds_tokens)} to {len(ds)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06ea2152",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mask2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f46d5831",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# row"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a3bcd57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ds['label_true']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0440173a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import DataLoader\n",
+    "from lie_elicitation_prompts.helpers.select import select_multi_from_tensor\n",
+    "from lie_elicitation_prompts.helpers.scores import sum_select_choices_from_logits\n",
+    "\n",
+    "batch_size = 10\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "dl = DataLoader(ds, batch_size=batch_size, shuffle=True)\n",
+    "\n",
+    "model.eval()\n",
+    "\n",
+    "results = []\n",
+    "\n",
+    "for nb, batch in enumerate(tqdm(dl)):\n",
+    "\n",
+    "    # to device\n",
+    "    inputs = {'input_ids': batch['input_ids'].to(model.device), 'attention_mask': batch['attention_mask'].to(model.device)}\n",
+    "    labels = batch['label_true']\n",
+    "    choice_ids = batch['choice_ids']#.to(model.device)\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        out = model(**inputs)\n",
+    "\n",
+    "        # see how elk handles this https://github.com/EleutherAI/elk/blob/84e99a36a5050881d85f1510a2486ce46ac1f942/elk/extraction/extraction.py#L388\n",
+    "        logits_last = out['logits'][:, -1].detach().cpu()\n",
+    "        probs = out['prob_choices'] = sum_select_choices_from_logits(logits_last, choice_ids) # this does not add to one, as it is the prob from among all tokens\n",
+    "        out['coverage'] = probs.sum(dim=1)\n",
+    "\n",
+    "        # select the answer\n",
+    "        out['prob_ans'] = prob_ans = select_multi_from_tensor(probs, labels) \n",
+    "        # ind = torch.arange(labels.size(0))\n",
+    "        # out['prob_ans'] = prob_ans = probs[ind, labels*1]\n",
+    "        out['odds_ans'] = prob_ans / probs.sum(-1) # ratio of probability mass assigned to the true label\n",
+    "\n",
+    "        # if we told it to lie, flip the truth odds. we want the odds over the other answer\n",
+    "        instructed_to_lie = batch['instructed_to_lie'] * 1\n",
+    "        out['odds_ans'] = (1-out['odds_ans']) * instructed_to_lie + out['odds_ans'] * (1-instructed_to_lie)\n",
+    "\n",
+    "        corrects = out['odds_ans']>0.5\n",
+    "\n",
+    "        # FIXME, make my logic forward compatible with multiple choices, not bool\n",
+    "\n",
+    "        for batch_i, correct in enumerate(corrects):\n",
+    "            results.append({\n",
+    "                'instructed_to_lie': batch['instructed_to_lie'][batch_i].item(),\n",
+    "                'ds_string': batch['ds_string'][batch_i],\n",
+    "                'sys_instr_name': batch['sys_instr_name'][batch_i],\n",
+    "                'example_i': batch['example_i'][batch_i].item(),\n",
+    "                'correct': correct.item(),\n",
+    "                'prob_ans': out['prob_ans'][batch_i].item(),\n",
+    "                'odds_ans': out['odds_ans'][batch_i].item(),\n",
+    "                'coverage': out['coverage'][batch_i].item(),\n",
+    "                'prob_choices': out['prob_choices'][batch_i].tolist(),\n",
+    "            })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "009f7bcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# work out which question it knows the answer to\n",
+    "df_results = pd.DataFrame(results)\n",
+    "len(df_results)\n",
+    "df_results['instructed_to_lie'].max()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9708088d",
+   "metadata": {},
+   "source": [
+    "models\n",
+    "- ablated 70% correct and 1% lie\n",
+    "- dolphin 77% correct and 3% lie"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a087e564",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for half the dataset it is asked to tell the truth, lets get the question id's where it reliably succeeds. These are places the model knows the truth.\n",
+    "df_ans = (df_results\n",
+    "            .query(\"instructed_to_lie==False\")\n",
+    "            .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
+    ")\n",
+    "df_known = (df_ans\n",
+    "            .query(\"mean > 0.9 & count > 1\")\n",
+    "            # .drop(columns=['count','mean'])\n",
+    ")\n",
+    "mean_correct_rate=len(df_known)/len(df_ans)\n",
+    "print(f'{mean_correct_rate:.2%} of the time the model got the questions reliably correct')\n",
+    "df_known"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9de5ae0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for the half of the dataset where it is asked to lie, get the question ids where it reliably lies. `odds_ans` was flipped for these rows, so `correct` here means it followed the instruction to lie.\n",
+    "df_ans = (df_results\n",
+    "            .query(\"instructed_to_lie==True\")\n",
+    "            .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
+    ")\n",
+    "df_lied = (df_ans\n",
+    "            .query(\"mean > 0.9 & count > 1\")\n",
+    "            .drop(columns=['count','mean'])\n",
+    ")\n",
+    "mean_lie_rate=len(df_lied)/len(df_ans)\n",
+    "mean_lie_rate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8700a63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `.mean()` returns one value per selected column, in column order, so the columns\n",
+    "# are selected in the same order as the names they are unpacked into\n",
+    "acc, coverage = df_results.query(\"instructed_to_lie==False\")[['odds_ans', 'coverage']].mean()\n",
+    "acc, coverage "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a54b493",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "acc_lie = df_results.query(\"instructed_to_lie==True\")['odds_ans'].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "290232e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"🌟Main QC metrics🌟\\n\\n\")\n",
+    "print(f'|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|')\n",
+    "print(f'|---|---|---|--|--|--|')\n",
+    "print(f'|{cfg.model}|{mean_correct_rate:2.2%}|{mean_lie_rate:.2%}|{acc:.2%}|{acc_lie:.2%}|{coverage:.2%}|')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "238063d5",
+   "metadata": {},
+   "source": [
+    "\n",
+    "|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|\n",
+    "|---|---|---|--|--|--|\n",
+    "|failspy/Llama-3-8B-Instruct-abliterated|70.14%|1.45%|91.52%|9.22%|93.60%|\n",
+    "|cognitivecomputations/dolphin-2.9.3-llama-3-8b|73.79%|3.60%|95.71%|14.94%|95.30%|"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5145c978",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # QC\n",
+    "# print(df_results.groupby(['ds_string', 'example_i'])['instructed_to_lie'].mean())\n",
+    "\n",
+    "print('QC how often was it correct, when asked to lie?')\n",
+    "df_results.groupby(['instructed_to_lie'])['correct'].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fef8f3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# also look at the half where it was asked to lie, and find where it reliably lies\n",
+    "df_lie_res_agg = (df_results\n",
+    "            .query(\"instructed_to_lie==True\")\n",
+    "            .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
+    ")\n",
+    "df_lies = (df_lie_res_agg\n",
+    "            .query(\"mean > 0.6 & count > 1\")\n",
+    "            # .drop(columns=['count','mean'])\n",
+    ")\n",
+    "print(f'{len(df_lies)/len(df_lie_res_agg):.2%} of the time the model reliably lied when asked')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81666952",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('QC: How often does it lie, by dataset')\n",
+    "display(df_results\n",
+    "            .query(\"instructed_to_lie==True\")\n",
+    "            .groupby(['ds_string'], as_index=0)['correct'].agg(['count','mean'])\n",
+    ")\n",
+    "print('QC: How often does it lie, by system prompt')\n",
+    "display(\n",
+    "(df_results\n",
+    "            .query(\"instructed_to_lie==True\")\n",
+    "            .groupby(['sys_instr_name'], as_index=0)['correct'].agg(['count','mean'])\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd40ee89",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "690113f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find our lies dataset\n",
+    "df_known_and_follow = pd.merge(df_known, df_lies, how='inner', on=['ds_string', 'example_i'], suffixes=['_known', '_lie'])\n",
+    "df_known_and_follow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd353c3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('QC: It should get them right often, and coverage should be high')\n",
+    "# On a good dataset: Acc, or prob on correct ans should be high\n",
+    "# And on a well formatted dataset, coverage should be high\n",
+    "display(df_results.query(\"instructed_to_lie==False\").groupby(['ds_string'])[['coverage', 'odds_ans']].mean())\n",
+    "\n",
+    "display(df_results.query(\"instructed_to_lie==False\").groupby(['sys_instr_name'])[['coverage', 'odds_ans']].mean())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5f02ee2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# (ds_string, example_i) pairs to keep, built once so each row check is a set lookup\n",
+    "# instead of re-filtering the dataframe for every row\n",
+    "known_keys = set(zip(df_known_and_follow['ds_string'], df_known_and_follow['example_i']))\n",
+    "\n",
+    "def row_is_known(x):\n",
+    "    \"\"\"Return True if this row's (ds_string, example_i) pair is in df_known_and_follow.\"\"\"\n",
+    "    return (x['ds_string'], x['example_i'].item()) in known_keys\n",
+    "\n",
+    "# filter the dataset to known answers based on ds_string and example_i\n",
+    "ds_tokens_known = ds_tokens.filter(row_is_known)\n",
+    "print(f\"{len(ds_tokens)} -> {len(ds_tokens_known)}\")\n",
+    "ds_tokens_known"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d187e750",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(ds_tokens_known['instructed_to_lie']*1.0).mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffa14959",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save\n",
+    "ts = pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')\n",
+    "f = Path(f\"../data/extracted_prompts_{ts}\")\n",
+    "print(f)\n",
+    "ds_tokens_known.info.description = json.dumps(cfg.__dict__)\n",
+    "ds_tokens_known.save_to_disk(str(f))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab9afec6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # push to hf https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/main_classes#datasets.Dataset.push_to_hub\n",
+    "# ds_tokens_known.push_to_hub('wassname/abliterated-llama-known-prompts', split='train', config_name='')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f977d4cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO see if it will also lie on an answer...\n",
+    "# ds_tokens_known['formatted_chat'][:4]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d63249bf",
+   "metadata": {},
+   "source": [
+    "## QC"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acd63799",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # which source datasets did the known questions come from?\n",
+    "# df_ds = ds_tokens_known.to_pandas()\n",
+    "# df_ds[['ds_string','sys_instr_name']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b2f97d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_metadata = ds_tokens.select_columns(['ds_string', 'sys_instr_name', 'answer_choices', 'label_true', 'instructed_to_lie']).to_pandas()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bced3f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.Series(ds_tokens_known['ds_string']).value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "994d6e9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# QC a batch\n",
+    "\n",
+    "# after shuffling, the first rows are a random sample; capping by the dataset size\n",
+    "# avoids an out-of-range select on small datasets\n",
+    "n_show = min(3, len(ds_tokens_known))\n",
+    "d = ds_tokens_known.shuffle().select(range(n_show))\n",
+    "ss = tokenizer.batch_decode(d['input_ids'], skip_special_tokens=False)\n",
+    "# build the metadata frame once rather than on every iteration\n",
+    "df_meta = d.select_columns(['ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie', 'label_true']).to_pandas()\n",
+    "for i, s in enumerate(ss):\n",
+    "    print(df_meta.iloc[i])\n",
+    "    # drop eos tokens and make the chat headers readable\n",
+    "    s = s.replace(tokenizer.eos_token, '')\n",
+    "    s = s.replace('<|start_header_id|>', '\\n[')\n",
+    "    s = s.replace('<|end_header_id|>', ']')\n",
+    "    print('---')\n",
+    "    print(s)\n",
+    "    print('===')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00c645fd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2ad6350",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.4 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -7,3 +7,50 @@ Started project using cookiecutter data science project template.

 If this is too hard maybe I should just choose a easier behavior than dishonesty. 
 Such as political bias or sycophancy. Or any kind of RLHF ds?
+
+
+I wonder if I can make a better dataset than truthfulQA? Perhaps using prediction markets, or community notes, or politifact?
+- the problem is I'm really honing in on misconceptions that are part of general knowledge. So politifact is no good, as are other debunkers. Maybe community notes will be useful.
+
+Community Notes https://communitynotes.x.com/guide/en/under-the-hood/download-data
+
+> Below, we will describe each column’s data, including the question or source that generated the data, data type, and other relevant information.
+
+but I will also need to scrape tweet id....
+https://github.com/colin-fraser/communitynotes
+\
+need to scrape tweets too
+
+# 2024-06-30 10:33:16
+
+OK 2 problems with prev dataset
+- my model is only lying 1% of the time when it understands. There's the risk of thinking a model is lying when it's just confused.
+
+I'm using the abliterated model but still 1% lies. Try dolphin?
+
+- even on imdb, 10% of questions are reliably correct (wth, this is easy?)
+- 1 % lie, this is low
+
+
+So, Q: How to modify a model when it shows little of the behavior you want to study? Perhaps we can have examples of wrong and right?
+- I would like honesty, but they are already honest (although they lecture and etc, but that's harder as it's not one token)
+- Where it follows instructions and doesn't follow instructions? Even about lying. That could be good
+
+So I can label where it correctly followed instructions and where it did not. We will start off with about 50-50 since the models usually follow instructions. Then we can increase the number of lies.
+
+Is there a way we can do it with minimal data?
+
+
+Overall I do think pairs are good. We can change some things while keeping others the same. We can even have the llm label diverse examples. And can we backprop over long sequences... well DPO does.
+
+I just think backprop is better than linear methods?
+
+TODO look into DPO
+
+
+So what about DPO, with RLAIF, but we modify weights instead like circuit breakers? 
+
+
+Yeah the ideal is:
+- the model looks for attributes I want to edit
+- it creates an adapter based on modifying the internal representation in a minimal way, while keeping coherency (perplexity, or perhaps most previous weights)