mirror of
https://github.com/wassname/lie_elicitation_prompts.git
synced 2026-06-27 16:10:35 +08:00
980 lines
27 KiB
Plaintext
980 lines
27 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1b44551e",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Prepare dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "192895f0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# autoreload your package\n",
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "1ae72038",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import warnings\n",
|
|
"from loguru import logger\n",
|
|
"from tqdm.auto import tqdm\n",
|
|
"# logger.remove()\n",
|
|
"# import sys\n",
|
|
"# logger.add(sys.stderr, level=\"INFO\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "198de680",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2022-06-28T02:34:01.879987Z",
|
|
"start_time": "2022-06-28T02:34:01.864103Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"ExtractConfig(datasets=('amazon_polarity',), datasets_ood=('imdb', 'super_glue:boolq'), model='cognitivecomputations/dolphin-2.9.3-llama-3-8b', num_shots=2, max_tokens=444, max_examples=1000000, seed=42, repeats=3)"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
|
|
"import torch\n",
|
|
"import pandas as pd\n",
|
|
"import json\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"import lie_elicitation_prompts\n",
|
|
"from lie_elicitation_prompts.config import ExtractConfig\n",
|
|
"from lie_elicitation_prompts.helpers.scores import row_choice_ids\n",
|
|
"from lie_elicitation_prompts.prompts.prompt_loading import load_preproc_datasets, load_prompts\n",
|
|
"\n",
|
|
"cfg = ExtractConfig(\n",
|
|
" # model=\"failspy/Llama-3-8B-Instruct-abliterated\",\n",
|
|
" model=\"cognitivecomputations/dolphin-2.9.3-llama-3-8b\",\n",
|
|
" # model=\"NousResearch/Hermes-2-Pro-Llama-3-8B\",\n",
|
|
" datasets=(\n",
|
|
" # '../lie_elicitation_prompts/prompts/templates/UKPLab-liar',\n",
|
|
" \"amazon_polarity\",\n",
|
|
" # \"imdb\",\n",
|
|
" # \"glue:sst2\",\n",
|
|
" # \"super_glue:axg\",\n",
|
|
" \n",
|
|
"), max_examples=1000000, max_tokens=444)\n",
|
|
"cfg\n",
|
|
"# lie_elicitation_prompts/prompts/templates/liar"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ea1ce98c",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load text dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "4a85cad2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"# # debug\n",
|
|
"# for ds_name in cfg.datasets:\n",
|
|
"# print(ds_name)\n",
|
|
"# o = load_prompts(ds_name, num_shots=1, N=2) \n",
|
|
"# o = list(tqdm(o))\n",
|
|
"# # print(ds_name, o)\n",
|
|
"# 1/0\n",
|
|
"# pd.DataFrame(o)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d1aa8f65",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "16bf118c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
"# Ignore UserWarning category\n",
|
|
"# warnings.filterwarnings(\"ignore\", category=UserWarning)\n",
|
|
"warnings.filterwarnings(\"ignore\", message=\"^The groups parameter is ignored by StratifiedShuffleSplit\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "e987a4d3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # # debug\n",
|
|
"# list(load_prompts(cfg.datasets[0], num_shots=1, N=2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "b23e5aa6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "03044a83e624464a94b8081127412d3e",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "803c89bd3ebc477c9cfc3f73f9ba4105",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"Generating train split: 0 examples [00:00, ? examples/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[32m2024-07-01 19:52:58.549\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mlie_elicitation_prompts.prompts.prompt_loading\u001b[0m:\u001b[36mload_prompts\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mExtracting 11 variants of each prompt\u001b[0m\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"N = cfg.max_examples\n",
|
|
"ds_prompts = load_preproc_datasets(\n",
|
|
" cfg.datasets,\n",
|
|
" N=N,\n",
|
|
" seed=cfg.seed,\n",
|
|
" num_shots=cfg.num_shots,\n",
|
|
" M=cfg.repeats,\n",
|
|
")\n",
|
|
"ds_prompts"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "90868bf7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ds_prompts_ood = load_preproc_datasets(\n",
|
|
"# cfg.datasets_ood,\n",
|
|
"# N=N,\n",
|
|
"# seed=cfg.seed,\n",
|
|
"# num_shots=cfg.num_shots,\n",
|
|
"# )\n",
|
|
"# ds_prompts_ood"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d6334ae1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ds_prompts[1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "058982f9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# save"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8b1050f5",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load tokenized dataset\n",
|
|
"\n",
|
|
"- tokenize\n",
|
|
"- filter out truncated\n",
|
|
"- check which ones the model knows"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "abf4936e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os, torch\n",
|
|
"# os.environ['CUDA_VISIBLE_DEVICES'] = 'GPU-c4552741-f485-34ce-97fa-6c32983853af'\n",
|
|
"# os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2115d010",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# torch.cuda.get_device_name()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2a44fb25",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# quantization_config = BitsAndBytesConfig(\n",
|
|
"# load_in_4bit=True,\n",
|
|
"# bnb_4bit_quant_type=\"nf4\",\n",
|
|
"# bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
|
"# bnb_4bit_use_double_quant=True,\n",
|
|
"# )\n",
|
|
"quantization_config = BitsAndBytesConfig(\n",
|
|
" load_in_8bit=True,\n",
|
|
" bnb_8bit_compute_dtype=torch.bfloat16,\n",
|
|
")\n",
|
|
"\n",
|
|
"model = AutoModelForCausalLM.from_pretrained(\n",
|
|
" cfg.model,\n",
|
|
" device_map=\"cuda:0\",\n",
|
|
" quantization_config=quantization_config,\n",
|
|
")\n",
|
|
"\n",
|
|
"tokenizer = AutoTokenizer.from_pretrained(cfg.model)\n",
|
|
"if tokenizer.pad_token_id is None:\n",
|
|
" tokenizer.pad_token_id = tokenizer.eos_token_id\n",
|
|
"tokenizer.padding_side = \"left\"\n",
|
|
"tokenizer.truncation_side = \"left\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c85e49bb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e07503ec",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
"ds_tokens = (\n",
|
|
" ds_prompts.map(\n",
|
|
" lambda x: {\n",
|
|
" \"formatted_chat\": tokenizer.apply_chat_template(\n",
|
|
" x[\"messages\"], tokenize=False, add_generation_prompt=True\n",
|
|
" )\n",
|
|
" }\n",
|
|
" )\n",
|
|
" .map(\n",
|
|
" lambda x: tokenizer(\n",
|
|
" x[\"formatted_chat\"],\n",
|
|
" return_tensors=\"pt\",\n",
|
|
" max_length=cfg.max_tokens,\n",
|
|
" padding=\"max_length\",\n",
|
|
" truncation=True,\n",
|
|
" ),\n",
|
|
" batched=True,\n",
|
|
" )\n",
|
|
" .map(lambda r: {\"choice_ids\": row_choice_ids(r, tokenizer)}, desc=\"choice_ids\")\n",
|
|
" .filter(lambda x: x[\"attention_mask\"].sum() < cfg.max_tokens)\n",
|
|
")\n",
|
|
"ds_tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "77b6136f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(len(ds_prompts), len(ds_tokens))\n",
|
|
"\n",
|
|
"pd.Series(ds_prompts['ds_string']).value_counts(), pd.Series(ds_tokens['sys_instr_name']).value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "002d0ad7",
|
|
"metadata": {},
|
|
"source": [
|
|
"### QC\n",
|
|
"\n",
|
|
"To check prompt setup, coherency, etc generate on a few Q's"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "be8fce14",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ds_tokens[1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7cf698d9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"max_new_tokens = 64\n",
|
|
"import numpy as np\n",
|
|
"do_sample = False\n",
|
|
"np.random.seed(42)\n",
|
|
"for j in range(4):\n",
|
|
" i = np.random.randint(len(ds_tokens))\n",
|
|
" row = ds_tokens.with_format('torch')[i]\n",
|
|
" info = {k:v for k,v in row.items() if \n",
|
|
" (\n",
|
|
" (isinstance(v, str) and len(v) < 1000) or\n",
|
|
" (isinstance(v, (int, bool))) or\n",
|
|
" (isinstance(v, torch.Tensor) and v.numel() < 2) or\n",
|
|
" (k in ['answer_choices'])\n",
|
|
" )}\n",
|
|
"\n",
|
|
" \n",
|
|
" model.eval()\n",
|
|
" with torch.no_grad():\n",
|
|
" length = row['input_ids'].shape[0]\n",
|
|
" out2 = model.generate(input_ids=row['input_ids'].unsqueeze(0).cuda(), \n",
|
|
" attention_mask=row['attention_mask'].unsqueeze(0).cuda(),\n",
|
|
"\n",
|
|
" max_new_tokens=max_new_tokens,\n",
|
|
" min_new_tokens=max_new_tokens,\n",
|
|
" do_sample=do_sample,\n",
|
|
" temperature=1,\n",
|
|
" use_cache=False,)\n",
|
|
" out2s_pre = tokenizer.batch_decode( out2[:, :length], skip_special_tokens=False)[0]\n",
|
|
" out2s_post = tokenizer.batch_decode( out2[:, length:], skip_special_tokens=False)[0]\n",
|
|
" print(info)\n",
|
|
" print(out2s_pre)\n",
|
|
" print('---')\n",
|
|
" print(out2s_post)\n",
|
|
" print('===')\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9d400297",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fb21a718",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ds_tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bd8669c0",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Check model knowledge"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4616102b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_metadata = ds_tokens.select_columns(['ds_string', 'example_i', 'sys_instr_name', 'instructed_to_lie']).to_pandas().reset_index(names='my_ds_index')\n",
|
|
"df_metadata_truth = df_metadata.query('instructed_to_lie == False')\n",
|
|
"df_metadata_truth\n",
|
|
"\n",
|
|
"# FIXME right now there is just one example of each, I guess I want a couple, hmm\n",
|
|
"df_metadata.query('instructed_to_lie == False').groupby(['ds_string', 'example_i'], as_index=False)['my_ds_index'].count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ed668740",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ds_tokens_truthful = ds_tokens.select(torch.argwhere(~ds_tokens['instructed_to_lie']))\n",
|
|
"# ds_tokens_truthful"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e1be1c6a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from lie_elicitation_prompts.helpers.torch_helpers import clear_mem\n",
|
|
"clear_mem()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "41127053",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# filter it to ones with 2 choice ids\n",
|
|
"import numpy as np\n",
|
|
"ds1 = ds_tokens.select_columns(['ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie', 'label_true', 'input_ids', 'attention_mask', 'choice_ids'])\n",
|
|
"\n",
|
|
"shapes=np.array([xx.shape[1] for xx in ds1['choice_ids']])\n",
|
|
"mask2 = shapes == 2\n",
|
|
"\n",
|
|
"# FIXME this somehow select all lies?\n",
|
|
"# ds = ds1.select(mask2) # This is wrong, it selects the first one again and again\n",
|
|
"mask = np.argwhere(mask2)[:, 0]\n",
|
|
"ds = ds1.select(mask)\n",
|
|
"\n",
|
|
"print(f\"{len(ds_tokens)} to {len(ds)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "06ea2152",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# mask2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f46d5831",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# row"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8a3bcd57",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ds['label_true']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0440173a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from torch.utils.data import DataLoader\n",
|
|
"from lie_elicitation_prompts.helpers.select import select_multi_from_tensor\n",
|
|
"from lie_elicitation_prompts.helpers.scores import sum_select_choices_from_logits\n",
|
|
"\n",
|
|
"batch_size = 10\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"dl = DataLoader(ds, batch_size=batch_size, shuffle=True)\n",
|
|
"\n",
|
|
"model.eval()\n",
|
|
"\n",
|
|
"results = []\n",
|
|
"\n",
|
|
"for nb, batch in enumerate(tqdm(dl)):\n",
|
|
"\n",
|
|
" # to device\n",
|
|
" inputs = {'input_ids': batch['input_ids'].to(model.device), 'attention_mask': batch['attention_mask'].to(model.device)}\n",
|
|
" labels = batch['label_true']\n",
|
|
" choice_ids = batch['choice_ids']#.to(model.device)\n",
|
|
"\n",
|
|
" with torch.no_grad():\n",
|
|
" out = model(**inputs)\n",
|
|
"\n",
|
|
" # see how elk handles this https://github.com/EleutherAI/elk/blob/84e99a36a5050881d85f1510a2486ce46ac1f942/elk/extraction/extraction.py#L388\n",
|
|
" logits_last = out['logits'][:, -1].detach().cpu()\n",
|
|
" probs = out['prob_choices'] = sum_select_choices_from_logits(logits_last, choice_ids) # this does not add to one, as it is the prob from among all tokens\n",
|
|
" out['coverage'] = probs.sum(dim=1)\n",
|
|
"\n",
|
|
" # select the answer\n",
|
|
" out['prob_ans'] = prob_ans = select_multi_from_tensor(probs, labels) \n",
|
|
" # ind = torch.arange(labels.size(0))\n",
|
|
" # out['prob_ans'] = prob_ans = probs[ind, labels*1]\n",
|
|
" out['odds_ans'] = prob_ans / probs.sum(-1) # ratio of probability mass assigned to the true label\n",
|
|
"\n",
|
|
" # if we told it to lie, flip the truth odds. we want the odds over the other answer\n",
|
|
" instructed_to_lie = batch['instructed_to_lie'] * 1\n",
|
|
" out['odds_ans'] = (1-out['odds_ans']) * instructed_to_lie + out['odds_ans'] * (1-instructed_to_lie)\n",
|
|
"\n",
|
|
" corrects = out['odds_ans']>0.5\n",
|
|
"\n",
|
|
" # FIXME, make my logic forward compatible with multiple chocies, not bool\n",
|
|
"\n",
|
|
" for batch_i, correct in enumerate(corrects):\n",
|
|
" results.append({\n",
|
|
" 'instructed_to_lie': batch['instructed_to_lie'][batch_i].item(),\n",
|
|
" 'ds_string': batch['ds_string'][batch_i],\n",
|
|
" 'sys_instr_name': batch['sys_instr_name'][batch_i],\n",
|
|
" 'example_i': batch['example_i'][batch_i].item(),\n",
|
|
" 'correct': correct.item(),\n",
|
|
" 'prob_ans': out['prob_ans'][batch_i].item(),\n",
|
|
" 'odds_ans': out['odds_ans'][batch_i].item(),\n",
|
|
" 'coverage': out['coverage'][batch_i].item(),\n",
|
|
" 'prob_choices': out['prob_choices'][batch_i].tolist(),\n",
|
|
" })"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "009f7bcc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# work out which question it knows the answer to\n",
|
|
"df_results = pd.DataFrame(results)\n",
|
|
"len(df_results)\n",
|
|
"df_results['instructed_to_lie'].max()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9708088d",
|
|
"metadata": {},
|
|
"source": [
|
|
"models\n",
|
|
"- ablated 70% correct and 1% lie\n",
|
|
"- dolhpin 77% correct and 3 lie"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a087e564",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# for half the dataset it is asked to tell the truth, lets get the question id's where it reliably succeeds. These are places the model knows the truth.\n",
|
|
"df_ans = (df_results\n",
|
|
" .query(\"instructed_to_lie==False\")\n",
|
|
" .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
|
|
")\n",
|
|
"df_known = (df_ans\n",
|
|
" .query(\"mean > 0.9 & count > 1\")\n",
|
|
" # .drop(columns=['count','mean'])\n",
|
|
")\n",
|
|
"mean_correct_rate=len(df_known)/len(df_ans)\n",
|
|
"print(f'{mean_correct_rate:.2%} of the time the model got the questions reliably correct')\n",
|
|
"df_known"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d9de5ae0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# for half the dataset it is asked to tell the truth, lets get the question id's where it reliably succeeds. These are places the model knows the truth.\n",
|
|
"df_ans = (df_results\n",
|
|
" .query(\"instructed_to_lie==True\")\n",
|
|
" .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
|
|
")\n",
|
|
"df_lied = (df_ans\n",
|
|
" .query(\"mean > 0.9 & count > 1\")\n",
|
|
" .drop(columns=['count','mean'])\n",
|
|
")\n",
|
|
"mean_lie_rate=len(df_lied)/len(df_ans)\n",
|
|
"mean_lie_rate"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d8700a63",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"acc, coverage = df_results.query(\"instructed_to_lie==False\")[['coverage', 'odds_ans']].mean()\n",
|
|
"acc, coverage "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2a54b493",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"acc_lie = df_results.query(\"instructed_to_lie==True\")['odds_ans'].mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "290232e5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"🌟Main QC metrics🌟\\n\\n\")\n",
|
|
"print(f'|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|')\n",
|
|
"print(f'|---|---|---|--|--|--|')\n",
|
|
"print(f'|{cfg.model}|{mean_correct_rate:2.2%}|{mean_lie_rate:.2%}|{acc:.2%}|{acc_lie:.2%}|{coverage:.2%}|')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "238063d5",
|
|
"metadata": {},
|
|
"source": [
|
|
"\n",
|
|
"|model|acc_n3|lie_n3|acc_n1|lie_n1|coverage|\n",
|
|
"|---|---|---|--|--|--|\n",
|
|
"|failspy/Llama-3-8B-Instruct-abliterated|70.14%|1.45%|91.52%|9.22%|93.60%|\n",
|
|
"|cognitivecomputations/dolphin-2.9.3-llama-3-8b|73.79%|3.60%|95.71%|14.94%|95.30%|"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5145c978",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # QC\n",
|
|
"# print(df_results.groupby(['ds_string', 'example_i'])['instructed_to_lie'].mean())\n",
|
|
"\n",
|
|
"print('QC how often was it correct, when asked to lie?')\n",
|
|
"df_results.groupby(['instructed_to_lie'])['correct'].mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3fef8f3c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# also look at the half where it was asked to lie, and find where it reliably lies\n",
|
|
"df_lie_res_agg = (df_results\n",
|
|
" .query(\"instructed_to_lie==True\")\n",
|
|
" .groupby(['ds_string', 'example_i'], as_index=0)['correct'].agg(['count','mean'])\n",
|
|
")\n",
|
|
"df_lies = (df_lie_res_agg\n",
|
|
" .query(\"mean > 0.6 & count > 1\")\n",
|
|
" # .drop(columns=['count','mean'])\n",
|
|
")\n",
|
|
"print(f'{len(df_lies)/len(df_lie_res_agg):.2%} of the time the model reliably lied when asked')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "81666952",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('QC: How often does it lie, by dataset')\n",
|
|
"display(df_results\n",
|
|
" .query(\"instructed_to_lie==True\")\n",
|
|
" .groupby(['ds_string'], as_index=0)['correct'].agg(['count','mean'])\n",
|
|
")\n",
|
|
"print('QC: How often does it lie, by system prompt')\n",
|
|
"display(\n",
|
|
"(df_results\n",
|
|
" .query(\"instructed_to_lie==True\")\n",
|
|
" .groupby(['sys_instr_name'], as_index=0)['correct'].agg(['count','mean'])\n",
|
|
"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cd40ee89",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "690113f0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# find our lies dataset\n",
|
|
"df_known_and_follow = pd.merge(df_known, df_lies, how='inner', on=['ds_string', 'example_i'], suffixes=['_known', '_lie'])\n",
|
|
"df_known_and_follow"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cd353c3b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print('QC: It should get them right often, and coverage should be high')\n",
|
|
"# On a good dataset: Acc, or prob on correct ans should be high\n",
|
|
"# And on a well formatted dataset, coverage should be high\n",
|
|
"display(df_results.query(\"instructed_to_lie==False\").groupby(['ds_string'])[['coverage', 'odds_ans']].mean())\n",
|
|
"\n",
|
|
"display(df_results.query(\"instructed_to_lie==False\").groupby(['sys_instr_name'])[['coverage', 'odds_ans']].mean())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e5f02ee2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def row_is_known(x):\n",
|
|
" k = df_known_and_follow[df_known_and_follow.ds_string==x['ds_string']]\n",
|
|
" return x['example_i'].item() in k.example_i.values\n",
|
|
"\n",
|
|
"# filter the dataset to known answers based on ds_string and example_i\n",
|
|
"ds_tokens_known = ds_tokens.filter(row_is_known)\n",
|
|
"print(f\"{len(ds_tokens)} -> {len(ds_tokens_known)}\")\n",
|
|
"ds_tokens_known"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d187e750",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"(ds_tokens_known['instructed_to_lie']*1.0).mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ffa14959",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# save\n",
|
|
"ts = pd.Timestamp.now().strftime('%Y%m%d-%H%M%S')\n",
|
|
"f = Path(f\"../data/extracted_prompts_{ts}\")\n",
|
|
"print(f)\n",
|
|
"ds_tokens_known.info.description = json.dumps(cfg.__dict__)\n",
|
|
"ds_tokens_known.save_to_disk(str(f))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ab9afec6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # push to hf https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/main_classes#datasets.Dataset.push_to_hub\n",
|
|
"# ds_tokens_known.push_to_hub('wassname/abliterated-llama-known-prompts', split='train', config_name='')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f977d4cd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# TODO see if it will also lie on an answer...\n",
|
|
"# ds_tokens_known['formatted_chat'][:4]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d63249bf",
|
|
"metadata": {},
|
|
"source": [
|
|
"## QC"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "acd63799",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # which source datasets did the known questions come from?\n",
|
|
"# df_ds = ds_tokens_known.to_pandas()\n",
|
|
"# df_ds[['ds_string','sys_instr_name']].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7b2f97d4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# df_metadata = ds_tokens.select_columns(['ds_string', 'sys_instr_name', 'answer_choices', 'label_true', 'instructed_to_lie']).to_pandas()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1bced3f3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pd.Series(ds_tokens_known['ds_string']).value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "994d6e9a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# QC a batch\n",
|
|
"\n",
|
|
"d = ds_tokens_known.shuffle().select(range(300,303))\n",
|
|
"ss = tokenizer.batch_decode(d['input_ids'], skip_special_tokens=False)\n",
|
|
"for i, s in enumerate(ss):\n",
|
|
" print(d.select_columns(['ds_string', 'sys_instr_name', 'example_i', 'instructed_to_lie', 'label_true']).to_pandas().iloc[i])\n",
|
|
" s = s.replace(tokenizer.eos_token, '')\n",
|
|
" s = s.replace('<|start_header_id|>', '\\n[')\n",
|
|
" s = s.replace('<|end_header_id|>', ']')\n",
|
|
" tokenizer.chat_template\n",
|
|
" print('---')\n",
|
|
" print(s)\n",
|
|
" print('===')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "00c645fd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d2ad6350",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3.10.4 64-bit",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
},
|
|
"toc": {
|
|
"base_numbering": 1,
|
|
"nav_menu": {},
|
|
"number_sections": true,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "Table of Contents",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": false,
|
|
"toc_position": {},
|
|
"toc_section_display": true,
|
|
"toc_window_display": false
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|