lie_elicitation_prompts/nbs/build.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1b44551e",
   "metadata": {},
   "source": [
    "# Prepare dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "192895f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# autoreload your package\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1ae72038",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "from loguru import logger\n",
    "from tqdm.auto import tqdm\n",
    "# logger.remove()\n",
    "# import sys\n",
    "# logger.add(sys.stderr, level=\"INFO\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "198de680",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-06-28T02:34:01.879987Z",
     "start_time": "2022-06-28T02:34:01.864103Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExtractConfig(datasets=('amazon_polarity',), datasets_ood=('imdb', 'super_glue:boolq'), model='cognitivecomputations/dolphin-2.9.3-llama-3-8b', num_shots=2, max_tokens=444, max_examples=1000000, seed=42, repeats=3)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
    "import torch\n",
    "import pandas as pd\n",
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import lie_elicitation_prompts\n",
    "from lie_elicitation_prompts.config import ExtractConfig\n",
    "from lie_elicitation_prompts.helpers.scores import row_choice_ids\n",
    "from lie_elicitation_prompts.prompts.prompt_loading import load_preproc_datasets, load_prompts\n",
    "\n",
    "cfg = ExtractConfig(\n",
    "    # model=\"failspy/Llama-3-8B-Instruct-abliterated\",\n",
    "    model=\"cognitivecomputations/dolphin-2.9.3-llama-3-8b\",\n",
    "    # model=\"NousResearch/Hermes-2-Pro-Llama-3-8B\",\n",
    "    datasets=(\n",
    "    # '../lie_elicitation_prompts/prompts/templates/UKPLab-liar',\n",
    "    \"amazon_polarity\",\n",
    "    # \"imdb\",\n",
    "      # \"glue:sst2\",\n",
    "      #  \"super_glue:axg\",\n",
    "      \n",
    "), max_examples=1000000, max_tokens=444)\n",
    "cfg\n",
    "# lie_elicitation_prompts/prompts/templates/liar"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea1ce98c",
   "metadata": {},
   "source": [
    "## Load text dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4a85cad2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# # debug\n",
    "# for ds_name in cfg.datasets:\n",
    "#     print(ds_name)\n",
    "#     o = load_prompts(ds_name, num_shots=1, N=2) \n",
    "#     o = list(tqdm(o))\n",
    "#     # print(ds_name, o)\n",
    "#     1/0\n",
    "# pd.DataFrame(o)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1aa8f65",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "16bf118c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Ignore UserWarning category\n",
    "# warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
    "warnings.filterwarnings(\"ignore\", message=\"^The groups parameter is ignored by StratifiedShuffleSplit\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e987a4d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # # debug\n",
    "# list(load_prompts(cfg.datasets[0], num_shots=1, N=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b23e5aa6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "03044a83e624464a94b8081127412d3e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "803c89bd3ebc477c9cfc3f73f9ba4105",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-07-01 19:52:58.549\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mlie_elicitation_prompts.prompts.prompt_loading\u001b[0m:\u001b[36mload_prompts\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mExtracting 11 variants of each prompt\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "N = cfg.max_examples\n",
    "ds_prompts = load_preproc_datasets(\n",
    "    cfg.datasets,\n",
    "    N=N,\n",
    "    seed=cfg.seed,\n",
    "    num_shots=cfg.num_shots,\n",
    "    M=cfg.repeats,\n",
    ")\n",
    "ds_prompts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90868bf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ds_prompts_ood = load_preproc_datasets(\n",
    "#     cfg.datasets_ood,\n",
    "#     N=N,\n",
    "#     seed=cfg.seed,\n",
    "#     num_shots=cfg.num_shots,\n",
    "# )\n",
    "# ds_prompts_ood"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6334ae1",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_prompts[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "058982f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b1050f5",
   "metadata": {},
   "source": [
    "## Load tokenized dataset\n",
    "\n",
    "- tokenize\n",
    "- filter out truncated\n",
    "- check which ones the model knows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abf4936e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, torch\n",
    "# os.environ['CUDA_VISIBLE_DEVICES'] = 'GPU-c4552741-f485-34ce-97fa-6c32983853af'\n",
    "# os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2115d010",
   "metadata": {},
   "outputs": [],
   "source": [
    "# torch.cuda.get_device_name()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a44fb25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenizer: pad and truncate on the left, so the final position of every row is the end of the prompt.\n",
    "tokenizer = AutoTokenizer.from_pretrained(cfg.model)\n",
    "tokenizer.padding_side = \"left\"\n",
    "tokenizer.truncation_side = \"left\"\n",
    "if tokenizer.pad_token_id is None:\n",
    "    # no dedicated pad token: reuse EOS as the pad token\n",
    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
    "\n",
    "# Alternative that was tried: 4-bit nf4 quantization\n",
    "# quantization_config = BitsAndBytesConfig(\n",
    "#     load_in_4bit=True,\n",
    "#     bnb_4bit_quant_type=\"nf4\",\n",
    "#     bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "#     bnb_4bit_use_double_quant=True,\n",
    "# )\n",
    "\n",
    "# 8-bit weights via bitsandbytes.\n",
    "# NOTE(review): `bnb_8bit_compute_dtype` does not look like a documented BitsAndBytesConfig argument\n",
    "# (only the 4-bit variant is) - confirm whether it has any effect.\n",
    "quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.bfloat16)\n",
    "\n",
    "# Load the whole model onto the first GPU.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    cfg.model, device_map=\"cuda:0\", quantization_config=quantization_config\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c85e49bb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e07503ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_formatted_chat(x):\n",
    "    \"\"\"Render one row's chat messages into a prompt string that ends with the generation prompt.\"\"\"\n",
    "    text = tokenizer.apply_chat_template(x[\"messages\"], tokenize=False, add_generation_prompt=True)\n",
    "    return {\"formatted_chat\": text}\n",
    "\n",
    "\n",
    "def tokenize_batch(x):\n",
    "    \"\"\"Tokenize a batch of prompts, padded and truncated to `cfg.max_tokens` tokens.\"\"\"\n",
    "    return tokenizer(\n",
    "        x[\"formatted_chat\"],\n",
    "        return_tensors=\"pt\",\n",
    "        max_length=cfg.max_tokens,\n",
    "        padding=\"max_length\",\n",
    "        truncation=True,\n",
    "    )\n",
    "\n",
    "\n",
    "def add_choice_ids(r):\n",
    "    \"\"\"Attach the token ids of the answer choices for this row (computed by `row_choice_ids`).\"\"\"\n",
    "    return {\"choice_ids\": row_choice_ids(r, tokenizer)}\n",
    "\n",
    "\n",
    "def has_padding_left(x):\n",
    "    \"\"\"Keep rows whose attention mask does not fill all `cfg.max_tokens` positions.\n",
    "\n",
    "    A completely full row has no padding, so it may have been truncated; those rows are dropped.\n",
    "    \"\"\"\n",
    "    return x[\"attention_mask\"].sum() < cfg.max_tokens\n",
    "\n",
    "\n",
    "# format -> tokenize -> choice ids -> drop (possibly) truncated rows\n",
    "ds_with_chat = ds_prompts.map(add_formatted_chat)\n",
    "ds_tokenized = ds_with_chat.map(tokenize_batch, batched=True)\n",
    "ds_with_choices = ds_tokenized.map(add_choice_ids, desc=\"choice_ids\")\n",
    "ds_tokens = ds_with_choices.filter(has_padding_left)\n",
    "ds_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77b6136f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(ds_prompts), len(ds_tokens))\n",
    "\n",
    "pd.Series(ds_prompts['ds_string']).value_counts(), pd.Series(ds_tokens['sys_instr_name']).value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "002d0ad7",
   "metadata": {},
   "source": [
    "### QC\n",
    "\n",
    "To check the prompt setup, coherence, etc., generate completions for a few questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be8fce14",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_tokens[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cf698d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "max_new_tokens = 64\n",
    "do_sample = False\n",
    "\n",
    "\n",
    "def _is_short_field(key, value):\n",
    "    \"\"\"True for row fields small enough to print next to the generation.\"\"\"\n",
    "    if key in ['answer_choices']:\n",
    "        return True\n",
    "    if isinstance(value, str):\n",
    "        return len(value) < 1000\n",
    "    if isinstance(value, (int, bool)):\n",
    "        return True\n",
    "    return isinstance(value, torch.Tensor) and value.numel() < 2\n",
    "\n",
    "\n",
    "# Greedy-decode a fixed number of new tokens for 4 randomly chosen rows and print prompt / completion.\n",
    "np.random.seed(42)\n",
    "ds_tokens_pt = ds_tokens.with_format('torch')\n",
    "model.eval()\n",
    "for j in range(4):\n",
    "    i = np.random.randint(len(ds_tokens))\n",
    "    row = ds_tokens_pt[i]\n",
    "    info = {k: v for k, v in row.items() if _is_short_field(k, v)}\n",
    "\n",
    "    # number of prompt positions; used to split the output into prompt and completion\n",
    "    length = row['input_ids'].shape[0]\n",
    "    with torch.no_grad():\n",
    "        out2 = model.generate(\n",
    "            input_ids=row['input_ids'].unsqueeze(0).cuda(),\n",
    "            attention_mask=row['attention_mask'].unsqueeze(0).cuda(),\n",
    "            max_new_tokens=max_new_tokens,\n",
    "            min_new_tokens=max_new_tokens,\n",
    "            do_sample=do_sample,\n",
    "            temperature=1,\n",
    "            use_cache=False,\n",
    "        )\n",
    "    out2s_pre = tokenizer.batch_decode(out2[:, :length], skip_special_tokens=False)[0]\n",
    "    out2s_post = tokenizer.batch_decode(out2[:, length:], skip_special_tokens=False)[0]\n",
    "\n",
    "    print(info)\n",
    "    print(out2s_pre)\n",
    "    print('---')\n",
    "    print(out2s_post)\n",
    "    print('===')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d400297",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb21a718",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd8669c0",
   "metadata": {},
   "source": [
    "### Check model knowledge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4616102b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-row metadata, with the row position in `ds_tokens` kept as `my_ds_index`.\n",
    "meta_columns = ['ds_string', 'example_i', 'sys_instr_name', 'instructed_to_lie']\n",
    "df_metadata = ds_tokens.select_columns(meta_columns).to_pandas().reset_index(names='my_ds_index')\n",
    "df_metadata_truth = df_metadata.query('instructed_to_lie == False')\n",
    "\n",
    "# FIXME right now there is just one example of each, I guess I want a couple, hmm\n",
    "# Number of truthful prompts per (dataset, example).\n",
    "df_metadata_truth.groupby(['ds_string', 'example_i'], as_index=False)['my_ds_index'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed668740",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ds_tokens_truthful = ds_tokens.select(torch.argwhere(~ds_tokens['instructed_to_lie']))\n",
    "# ds_tokens_truthful"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1be1c6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lie_elicitation_prompts.helpers.torch_helpers import clear_mem\n",
    "clear_mem()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41127053",
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep only the rows that have exactly 2 choice ids\n",
    "import numpy as np\n",
    "\n",
    "keep_columns = [\n",
    "    'ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie',\n",
    "    'label_true', 'input_ids', 'attention_mask', 'choice_ids',\n",
    "]\n",
    "ds1 = ds_tokens.select_columns(keep_columns)\n",
    "\n",
    "# size of the second dimension of each row's `choice_ids`\n",
    "shapes = np.array([ids.shape[1] for ids in ds1['choice_ids']])\n",
    "mask2 = shapes == 2\n",
    "\n",
    "# FIXME this somehow select all lies?\n",
    "# `select` is given row indices here: passing the boolean mask itself was wrong\n",
    "# (it selected the first one again and again), so convert the mask to positions first.\n",
    "mask = np.flatnonzero(mask2)\n",
    "ds = ds1.select(mask)\n",
    "\n",
    "print(f\"{len(ds_tokens)} to {len(ds)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06ea2152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# mask2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f46d5831",
   "metadata": {},
   "outputs": [],
   "source": [
    "# row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a3bcd57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ds['label_true']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0440173a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DataLoader\n",
    "from lie_elicitation_prompts.helpers.select import select_multi_from_tensor\n",
    "from lie_elicitation_prompts.helpers.scores import sum_select_choices_from_logits\n",
    "\n",
    "# Score every prompt with a single forward pass and record, per row, how much probability\n",
    "# the model puts on the answer choices and whether it followed its instruction.\n",
    "batch_size = 10\n",
    "\n",
    "dl = DataLoader(ds, batch_size=batch_size, shuffle=True)\n",
    "\n",
    "model.eval()\n",
    "\n",
    "results = []\n",
    "\n",
    "for batch in tqdm(dl):\n",
    "\n",
    "    # only the model inputs are moved to the model's device\n",
    "    inputs = {\n",
    "        'input_ids': batch['input_ids'].to(model.device),\n",
    "        'attention_mask': batch['attention_mask'].to(model.device),\n",
    "    }\n",
    "    labels = batch['label_true']\n",
    "    choice_ids = batch['choice_ids']\n",
    "\n",
    "    with torch.no_grad():\n",
    "        out = model(**inputs)\n",
    "\n",
    "        # see how elk handles this https://github.com/EleutherAI/elk/blob/84e99a36a5050881d85f1510a2486ce46ac1f942/elk/extraction/extraction.py#L388\n",
    "        # logits at the last sequence position, copied to the CPU\n",
    "        logits_last = out['logits'][:, -1].detach().cpu()\n",
    "        # only the last-position logits are needed below; drop the full model output now\n",
    "        # so it is not kept alive during the next forward pass\n",
    "        del out\n",
    "\n",
    "        # this does not add to one, as it is the prob from among all tokens\n",
    "        probs = sum_select_choices_from_logits(logits_last, choice_ids)\n",
    "        coverage_batch = probs.sum(dim=1)\n",
    "\n",
    "        # select the answer\n",
    "        prob_ans = select_multi_from_tensor(probs, labels)\n",
    "        # ratio of probability mass assigned to the true label\n",
    "        odds_ans = prob_ans / probs.sum(-1)\n",
    "\n",
    "        # if we told it to lie, flip the truth odds. we want the odds over the other answer\n",
    "        instructed_to_lie = batch['instructed_to_lie'] * 1\n",
    "        odds_ans = (1 - odds_ans) * instructed_to_lie + odds_ans * (1 - instructed_to_lie)\n",
    "\n",
    "        # after the flip, \"correct\" means: most of the choice mass is on the answer the prompt asked for\n",
    "        corrects = odds_ans > 0.5\n",
    "\n",
    "        # FIXME, make my logic forward compatible with multiple choices, not bool\n",
    "\n",
    "        for batch_i, correct in enumerate(corrects):\n",
    "            results.append({\n",
    "                'instructed_to_lie': batch['instructed_to_lie'][batch_i].item(),\n",
    "                'ds_string': batch['ds_string'][batch_i],\n",
    "                'sys_instr_name': batch['sys_instr_name'][batch_i],\n",
    "                'example_i': batch['example_i'][batch_i].item(),\n",
    "                'correct': correct.item(),\n",
    "                'prob_ans': prob_ans[batch_i].item(),\n",
    "                'odds_ans': odds_ans[batch_i].item(),\n",
    "                'coverage': coverage_batch[batch_i].item(),\n",
    "                'prob_choices': probs[batch_i].tolist(),\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "009f7bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the per-prompt results into one frame; the cells below use it to\n",
    "# work out which questions the model knows the answer to.\n",
    "df_results = pd.DataFrame(results)\n",
    "\n",
    "# largest value of the `instructed_to_lie` flag among the scored rows\n",
    "df_results['instructed_to_lie'].max()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9708088d",
   "metadata": {},
   "source": [
    "Models\n",
    "- abliterated: 70% correct and 1% lie\n",
    "- dolphin: 77% correct and 3% lie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a087e564",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For the half of the dataset where the model is asked to tell the truth, find the questions\n",
    "# it reliably gets right. These are the places where the model knows the truth.\n",
    "truth_rows = df_results.query(\"instructed_to_lie==False\")\n",
    "\n",
    "# one row per (dataset, example): number of truthful prompts and fraction answered correctly\n",
    "df_ans = truth_rows.groupby(['ds_string', 'example_i'], as_index=False)['correct'].agg(['count', 'mean'])\n",
    "\n",
    "# \"reliably\": more than one prompt for the question, and over 90% of them correct\n",
    "df_known = df_ans.query(\"mean > 0.9 & count > 1\")\n",
    "\n",
    "mean_correct_rate = len(df_known) / len(df_ans)\n",
    "print(f'{mean_correct_rate:.2%} of the time the model got the questions reliably correct')\n",
    "df_known"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9de5ae0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same aggregation for the half of the dataset where the model is asked to lie.\n",
    "# `correct` is computed from the flipped odds for these rows, so a high mean marks\n",
    "# questions where the model reliably gave the other answer, i.e. lied as instructed.\n",
    "lie_rows = df_results.query(\"instructed_to_lie==True\")\n",
    "\n",
    "# NOTE: this rebinds `df_ans`, which held the truthful aggregation before this cell\n",
    "df_ans = lie_rows.groupby(['ds_string', 'example_i'], as_index=False)['correct'].agg(['count', 'mean'])\n",
    "\n",
    "# questions with more than one lying prompt, of which over 90% succeeded\n",
    "df_lied = df_ans.query(\"mean > 0.9 & count > 1\").drop(columns=['count', 'mean'])\n",
    "\n",
    "mean_lie_rate = len(df_lied) / len(df_ans)\n",
    "mean_lie_rate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8700a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean coverage and mean odds on the correct answer over the truthful prompts.\n",
    "# The means come back in column order (`coverage`, then `odds_ans`), so unpack in that order;\n",
    "# unpacking as `acc, coverage` swapped the two metrics in the QC table printed further down.\n",
    "coverage, acc = df_results.query(\"instructed_to_lie==False\")[['coverage', 'odds_ans']].mean()\n",
    "acc, coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a54b493",
   "metadata": {},
   "outputs": [],
   "source": [
    "acc_lie = df_results.query(\"instructed_to_lie==True\")['odds_ans'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "290232e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the headline numbers as one markdown table row, ready to paste into the table below.\n",
    "table_header = '|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|'\n",
    "table_divider = '|---|---|---|--|--|--|'\n",
    "metric_values = [mean_correct_rate, mean_lie_rate, acc, acc_lie, coverage]\n",
    "table_cells = [str(cfg.model)] + [f'{value:.2%}' for value in metric_values]\n",
    "\n",
    "print(\"🌟Main QC metrics🌟\\n\\n\")\n",
    "print(table_header)\n",
    "print(table_divider)\n",
    "print('|' + '|'.join(table_cells) + '|')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "238063d5",
   "metadata": {},
   "source": [
    "\n",
    "|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|\n",
    "|---|---|---|--|--|--|\n",
    "|failspy/Llama-3-8B-Instruct-abliterated|70.14%|1.45%|91.52%|9.22%|93.60%|\n",
    "|cognitivecomputations/dolphin-2.9.3-llama-3-8b|73.79%|3.60%|95.71%|14.94%|95.30%|"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5145c978",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # QC\n",
    "# print(df_results.groupby(['ds_string', 'example_i'])['instructed_to_lie'].mean())\n",
    "\n",
    "print('QC how often was it correct, when asked to lie?')\n",
    "df_results.groupby(['instructed_to_lie'])['correct'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fef8f3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# also look at the half where it was asked to lie, and find where it reliably lies\n",
    "lie_rows = df_results.query(\"instructed_to_lie==True\")\n",
    "\n",
    "# one row per (dataset, example): number of lying prompts and fraction where the lie succeeded\n",
    "df_lie_res_agg = lie_rows.groupby(['ds_string', 'example_i'], as_index=False)['correct'].agg(['count', 'mean'])\n",
    "\n",
    "# looser threshold than for known answers: over 60% of more than one prompt\n",
    "df_lies = df_lie_res_agg.query(\"mean > 0.6 & count > 1\")\n",
    "\n",
    "reliable_lie_rate = len(df_lies) / len(df_lie_res_agg)\n",
    "print(f'{reliable_lie_rate:.2%} of the time the model reliably lied when asked')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81666952",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Break the lie success rate down by one grouping column at a time.\n",
    "lie_rows = df_results.query(\"instructed_to_lie==True\")\n",
    "breakdowns = [\n",
    "    ('QC: How often does it lie, by dataset', 'ds_string'),\n",
    "    ('QC: How often does it lie, by system prompt', 'sys_instr_name'),\n",
    "]\n",
    "for title, group_col in breakdowns:\n",
    "    print(title)\n",
    "    display(lie_rows.groupby([group_col], as_index=False)['correct'].agg(['count', 'mean']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd40ee89",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "690113f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# find our lies dataset\n",
    "df_known_and_follow = pd.merge(df_known, df_lies, how='inner', on=['ds_string', 'example_i'], suffixes=['_known', '_lie'])\n",
    "df_known_and_follow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd353c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('QC: It should get them right often, and coverage should be high')\n",
    "# On a good dataset: Acc, or prob on correct ans should be high\n",
    "# And on a well formatted dataset, coverage should be high\n",
    "truth_rows = df_results.query(\"instructed_to_lie==False\")\n",
    "\n",
    "# same two means, first per dataset and then per system prompt\n",
    "for group_col in ['ds_string', 'sys_instr_name']:\n",
    "    display(truth_rows.groupby([group_col])[['coverage', 'odds_ans']].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5f02ee2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (ds_string, example_i) pairs listed in `df_known_and_follow`. Built once, so that each row\n",
    "# check is a set lookup instead of re-filtering the DataFrame for every dataset row.\n",
    "known_pairs = set(zip(df_known_and_follow['ds_string'], df_known_and_follow['example_i']))\n",
    "\n",
    "\n",
    "def row_is_known(x):\n",
    "    \"\"\"Return True when the row's (ds_string, example_i) pair appears in `df_known_and_follow`.\n",
    "\n",
    "    `x` is one dataset row; `x['example_i']` must support `.item()`.\n",
    "    \"\"\"\n",
    "    return (x['ds_string'], x['example_i'].item()) in known_pairs\n",
    "\n",
    "\n",
    "# filter the dataset to known answers based on ds_string and example_i\n",
    "ds_tokens_known = ds_tokens.filter(row_is_known)\n",
    "print(f\"{len(ds_tokens)} -> {len(ds_tokens_known)}\")\n",
    "ds_tokens_known"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d187e750",
   "metadata": {},
   "outputs": [],
   "source": [
    "(ds_tokens_known['instructed_to_lie']*1.0).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffa14959",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the filtered dataset to a timestamped folder under ../data.\n",
    "ts = pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')\n",
    "f = Path(\"../data\") / f\"extracted_prompts_{ts}\"\n",
    "print(f)\n",
    "\n",
    "# keep the run configuration with the data, as JSON in the dataset description\n",
    "ds_tokens_known.info.description = json.dumps(vars(cfg))\n",
    "ds_tokens_known.save_to_disk(str(f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab9afec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # push to hf https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/main_classes#datasets.Dataset.push_to_hub\n",
    "# ds_tokens_known.push_to_hub('wassname/abliterated-llama-known-prompts', split='train', config_name='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f977d4cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO see if it will also lie on an answer...\n",
    "# ds_tokens_known['formatted_chat'][:4]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d63249bf",
   "metadata": {},
   "source": [
    "## QC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acd63799",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # which source datasets did the known questions come from?\n",
    "# df_ds = ds_tokens_known.to_pandas()\n",
    "# df_ds[['ds_string','sys_instr_name']].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b2f97d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_metadata = ds_tokens.select_columns(['ds_string', 'sys_instr_name', 'answer_choices', 'label_true', 'instructed_to_lie']).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bced3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(ds_tokens_known['ds_string']).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "994d6e9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# QC a batch: decode a few random rows of the final dataset and print them with their metadata\n",
    "\n",
    "# The rows are shuffled first, so the first few are as random as any other slice. The previous\n",
    "# hard-coded range(300, 303) failed whenever fewer than 303 rows survived the filtering.\n",
    "n_show = min(3, len(ds_tokens_known))\n",
    "d = ds_tokens_known.shuffle().select(range(n_show))\n",
    "\n",
    "# build the metadata frame once instead of once per printed row\n",
    "meta = d.select_columns(['ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie', 'label_true']).to_pandas()\n",
    "ss = tokenizer.batch_decode(d['input_ids'], skip_special_tokens=False)\n",
    "for i, s in enumerate(ss):\n",
    "    print(meta.iloc[i])\n",
    "    # make the raw prompt easier to read: remove EOS tokens and show role headers as [role]\n",
    "    s = s.replace(tokenizer.eos_token, '')\n",
    "    s = s.replace('<|start_header_id|>', '\\n[')\n",
    "    s = s.replace('<|end_header_id|>', ']')\n",
    "    print('---')\n",
    "    print(s)\n",
    "    print('===')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00c645fd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2ad6350",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}