lora seems to work

This commit is contained in:
wassname
2024-01-03 13:14:24 +08:00
parent 13d0c82596
commit 7f27b90159
6 changed files with 11828 additions and 597 deletions
+1
View File
@@ -1,5 +1,6 @@
.env
lightning_logs/
outputs/
*.arrow
squad_*
+16
View File
@@ -34,3 +34,19 @@ When using microsoft/phi-2 we get this amount of perplexity reduction by includi
As you can see, some of these are probably in the training set.
# Citing
If you like our work and end up using this code for your research, give us a shout-out by citing or acknowledging it:
```
@misc{wassname2024,
author = {Clark, M.J.},
title = {BS Writing Detector},
year = {2024},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/wassname/detect_bs_text}},
commit = {}
}
```
File diff suppressed because one or more lines are too long
@@ -1,597 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from torch import optim\n",
"import lightning as pl\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import transformers\n",
"from datasets import load_dataset\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
"import numpy as np\n",
"from tqdm.auto import tqdm\n",
"import pandas as pd\n",
"import warnings\n",
"from peft import LoraConfig, get_peft_model, IA3Config"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Ignore any warning whose message matches the \"does not have many workers\" pattern.\n",
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")\n",
"\n",
"# Request the 'medium' float32 matmul precision setting from torch.\n",
"torch.set_float32_matmul_precision('medium')\n",
"\n",
"# Apply the 'ggplot' matplotlib style.\n",
"plt.style.use('ggplot')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
"\n",
"# NOTE: reassigned in the next cell before any model is loaded.\n",
"model_name = \"microsoft/phi-2\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"model_name = \"TheBloke/phi-2-GPTQ\"\n",
"# model_name = \"microsoft/phi-2\"\n",
"\n",
"\n",
"def load_model():\n",
"    \"\"\"Load the causal LM identified by the module-level `model_name`.\n",
"\n",
"    The checkpoint config is fetched first so that its `quantization_config`\n",
"    can be patched (`use_exllama=False`, `disable_exllama=True`) before the\n",
"    weights are loaded with `torch_dtype=torch.bfloat16`.\n",
"\n",
"    Returns:\n",
"        The object returned by `AutoModelForCausalLM.from_pretrained`.\n",
"    \"\"\"\n",
"    # Alternative kept for reference (disabled): load without the patched config,\n",
"    # using torch_dtype=torch.float16 and, optionally, a bitsandbytes config:\n",
"    #   quantization_config=BitsAndBytesConfig(\n",
"    #       load_in_4bit=True,\n",
"    #       llm_int8_threshold=6.0,\n",
"    #       llm_int8_has_fp16_weight=False,\n",
"    #       bnb_4bit_compute_dtype=torch.float16,\n",
"    #       bnb_4bit_use_double_quant=True,\n",
"    #       bnb_4bit_quant_type=\"nf4\",\n",
"    #   )\n",
"    patched_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
"    # Both spellings of the exllama switch are written.\n",
"    patched_config.quantization_config['use_exllama'] = False\n",
"    patched_config.quantization_config['disable_exllama'] = True\n",
"\n",
"    return AutoModelForCausalLM.from_pretrained(\n",
"        model_name,\n",
"        config=patched_config,\n",
"        torch_dtype=torch.bfloat16,\n",
"        trust_remote_code=True,\n",
"    )\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"CUDA extension not installed.\n",
"CUDA extension not installed.\n"
]
}
],
"source": [
"base_model = load_model()\n",
"\n",
"# The tokenizer is loaded from the same checkpoint id as the model.\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
"# Reuse EOS as the pad token so that tokenizer calls with padding=True work.\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def reset_model(base_model):\n",
"    \"\"\"Wrap `base_model` with a LoRA adapter and return the PEFT model.\n",
"\n",
"    The adapter targets the modules named \"fc2\" and \"Wqkv\" with r=8 and\n",
"    lora_alpha=8. `use_cache` is set to False on the returned model's config.\n",
"\n",
"    NOTE(review): the same `base_model` object is handed to `get_peft_model`\n",
"    on every call; confirm against the PEFT docs that wrapping it repeatedly\n",
"    yields freshly initialised adapter weights.\n",
"    \"\"\"\n",
"    lora_settings = dict(\n",
"        target_modules=[\"fc2\", \"Wqkv\"],\n",
"        inference_mode=False,\n",
"        r=8,\n",
"        lora_alpha=8,\n",
"        # task_type=TaskType.TOKEN_CLS,\n",
"        # lora_dropout=0.1,\n",
"        # bias=\"all\",\n",
"    )\n",
"    peft_model = get_peft_model(base_model, LoraConfig(**lora_settings))\n",
"    peft_model.config.use_cache = False\n",
"    return peft_model\n",
"\n",
"\n",
"model = reset_model(base_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Upper bound on the token window that str2xya builds for each example.\n",
"MAX_LEN = 2000\n",
"\n",
"# A context manager closes the file handle deterministically; the previous\n",
"# json.load(open(...)) left it to the garbage collector.\n",
"with open(\"../samples.json\") as samples_file:\n",
"    samples = json.load(samples_file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Helpers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
"\n",
"import evaluate\n",
"from evaluate import logging\n",
"from torch.nn import CrossEntropyLoss\n",
"\n",
"\n",
"def perplexity_compute(\n",
"    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
"):\n",
"    \"\"\"Compute the perplexity of each text in `data` under `model`.\n",
"\n",
"    Unlike the upstream `evaluate` measurement this takes an already-loaded\n",
"    model and tokenizer, and it does not prepend a BOS token to the inputs.\n",
"\n",
"    Args:\n",
"        data: Text(s) handed straight to `tokenizer(...)`.\n",
"        model: Model that is moved to `device` and called as\n",
"            `model(input_ids, attention_mask=...)`; its `.logits` are used.\n",
"        tokenizer: Called with `padding=True`, so it must have a pad token.\n",
"        batch_size: Number of encoded texts scored per forward pass.\n",
"        add_start_token: Kept for interface compatibility. No start token is\n",
"            prepended, so this flag does not change the computation.\n",
"        device: One of \"gpu\", \"cpu\", \"cuda\" (\"gpu\" is mapped to \"cuda\"), or\n",
"            None to use \"cuda\" when available and \"cpu\" otherwise.\n",
"        max_length: When truthy, inputs are truncated to this many tokens.\n",
"\n",
"    Returns:\n",
"        dict with \"perplexities\" (one float per text) and \"mean_perplexity\"\n",
"        (a 0-dim tensor holding their mean).\n",
"\n",
"    Raises:\n",
"        AssertionError: if `device` is not recognised, or if any text encodes\n",
"            to fewer than two tokens.\n",
"    \"\"\"\n",
"    if device is not None:\n",
"        assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
"        if device == \"gpu\":\n",
"            device = \"cuda\"\n",
"    else:\n",
"        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"    model = model.to(device)\n",
"\n",
"    # No BOS token is prepended, so the full length budget goes to the text.\n",
"    max_tokenized_len = max_length\n",
"\n",
"    encodings = tokenizer(\n",
"        data,\n",
"        add_special_tokens=False,\n",
"        padding=True,\n",
"        truncation=bool(max_tokenized_len),\n",
"        max_length=max_tokenized_len,\n",
"        return_tensors=\"pt\",\n",
"        return_attention_mask=True,\n",
"    ).to(device)\n",
"\n",
"    encoded_texts = encodings[\"input_ids\"]\n",
"    attn_masks = encodings[\"attention_mask\"]\n",
"\n",
"    # The loss is taken over positions 1..n-1. With no start token prepended, a\n",
"    # one-token text has no target positions and its average would be 0/0 (NaN),\n",
"    # silently poisoning the mean. Require two tokens regardless of\n",
"    # `add_start_token`.\n",
"    assert torch.all(\n",
"        torch.ge(attn_masks.sum(1), 2)\n",
"    ), \"Each input text must be at least two tokens long because no start token is prepended. Remove empty and single-token input strings.\"\n",
"\n",
"    ppls = []\n",
"    loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
"\n",
"    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
"        end_index = min(start_index + batch_size, len(encoded_texts))\n",
"        encoded_batch = encoded_texts[start_index:end_index]\n",
"        attn_mask = attn_masks[start_index:end_index]\n",
"\n",
"        labels = encoded_batch\n",
"\n",
"        with torch.no_grad():\n",
"            out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
"\n",
"        # Logits at position t are scored against the token at position t + 1.\n",
"        shift_logits = out_logits[..., :-1, :].contiguous()\n",
"        shift_labels = labels[..., 1:].contiguous()\n",
"        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
"\n",
"        # exp(masked mean of per-token cross-entropy), one value per text.\n",
"        perplexity_batch = torch.exp(\n",
"            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
"            / shift_attention_mask_batch.sum(1)\n",
"        )\n",
"\n",
"        ppls += perplexity_batch.tolist()\n",
"\n",
"    return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# perplexity_compute(\n",
"# second_half, model, tokenizer\n",
"# )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from torch.nn import functional as F\n",
"from torch.utils.data import DataLoader, TensorDataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lightning helpers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def str2xya(s, tokenizer):\n",
"    \"\"\"Turn a string into one next-token example per token position.\n",
"\n",
"    For every position i >= 1 the example holds the tokens before i (at most\n",
"    the last `max_len` of them), left-padded with the tokenizer's BOS id to a\n",
"    fixed length, together with the token at position i as the label.\n",
"\n",
"    Args:\n",
"        s: Text to tokenize.\n",
"        tokenizer: Callable returning a mapping with \"input_ids\" when called\n",
"            with `return_tensors=\"pt\"`; its `bos_token_id` is the pad value.\n",
"\n",
"    Returns:\n",
"        List of dicts with `input_ids`, `label_ids` and `attention_mask`.\n",
"    \"\"\"\n",
"    max_len = min(MAX_LEN, len(s))\n",
"    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0]\n",
"\n",
"    pad = tokenizer.bos_token_id\n",
"    data = []\n",
"    for i in range(1, len(input_ids)):\n",
"        x = input_ids[:i][-max_len:]\n",
"        n_real = len(x)\n",
"        padding = max_len - n_real\n",
"        x = torch.tensor([pad] * padding + x.tolist())\n",
"\n",
"        label_ids = input_ids[i:i + 1]\n",
"        # 1 = real token, 0 = padding. Built from positions rather than from\n",
"        # `x == pad`: the old value-based mask was inverted (it attended only to\n",
"        # padding) and would also misclassify real tokens equal to the pad id.\n",
"        attention_mask = torch.tensor([0] * padding + [1] * n_real)\n",
"        data.append(dict(input_ids=x, label_ids=label_ids, attention_mask=attention_mask))\n",
"\n",
"    return data\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: this name shadows the builtin `eval` for the rest of the notebook.\n",
"def eval(model, tokenizer, second_half):\n",
"    \"\"\"Score `second_half` with the adapter disabled and then enabled.\n",
"\n",
"    Puts `model` in eval mode and runs `perplexity_compute` twice on 'cuda'.\n",
"\n",
"    Returns:\n",
"        dict with `before` (adapter disabled) and `after` (adapter enabled)\n",
"        mean perplexities as Python floats.\n",
"    \"\"\"\n",
"    model.eval()\n",
"    score_kwargs = dict(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
"    with torch.no_grad():\n",
"        with model.disable_adapter():\n",
"            baseline = perplexity_compute(**score_kwargs)\n",
"        adapted = perplexity_compute(**score_kwargs)\n",
"    return {\n",
"        'before': baseline['mean_perplexity'].item(),\n",
"        'after': adapted['mean_perplexity'].item(),\n",
"    }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def learn_sample(sample, verbose=False):\n",
"    \"\"\"Fit a LoRA adapter on the first half of a text and score the second half.\n",
"\n",
"    Args:\n",
"        sample: Mapping with a 'text' entry holding the raw string.\n",
"        verbose: When True, show the Trainer progress bar and plot each column\n",
"            of the per-epoch log history after training.\n",
"\n",
"    Returns:\n",
"        The dict produced by `eval`: mean perplexity on the second half with\n",
"        the adapter disabled ('before') and enabled ('after').\n",
"    \"\"\"\n",
"    # Removed the unused locals lr=4e-3, epochs=3, accum_steps=16 and\n",
"    # device='cuda': they were never read and contradicted the values that are\n",
"    # actually passed to TrainingArguments below.\n",
"    batch_size = 1\n",
"\n",
"    s = sample['text']\n",
"    first_half = s[:len(s)//2]\n",
"    second_half = s[len(s)//2:]\n",
"    ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
"    ds_val = Dataset.from_dict(tokenizer([second_half]))\n",
"\n",
"    # NOTE(review): set on every call, after the base model has been loaded;\n",
"    # confirm this still has the intended effect on device selection.\n",
"    os.environ['CUDA_VISIBLE_DEVICES'] = \"1\"\n",
"    model = reset_model(base_model)\n",
"    # Return value is discarded. NOTE(review): probably redundant, kept to\n",
"    # preserve the original call sequence.\n",
"    eval(model, tokenizer, second_half)\n",
"    trainer = transformers.Trainer(\n",
"        model=model,\n",
"        train_dataset=ds_train,\n",
"        eval_dataset=ds_val,\n",
"        args=transformers.TrainingArguments(\n",
"            per_device_train_batch_size=batch_size,\n",
"            gradient_accumulation_steps=8,\n",
"            warmup_steps=0,\n",
"            max_steps=40,\n",
"            learning_rate=3e-4,\n",
"            fp16=True,\n",
"            logging_steps=1,\n",
"            output_dir=\"outputs\",\n",
"            log_level='error',\n",
"            disable_tqdm=not verbose,\n",
"        ),\n",
"        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
"    )\n",
"    # Private Trainer attribute; presumably controls which dataset columns are\n",
"    # forwarded to the model - verify when upgrading transformers.\n",
"    trainer._signature_columns = ['input_ids', 'attention_mask', 'label_ids']\n",
"    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!\n",
"    trainer.train()\n",
"\n",
"    if verbose:\n",
"        df_hist = pd.DataFrame(trainer.state.log_history)\n",
"        df_hist_epoch = df_hist.groupby('epoch').last().dropna(axis=1).drop(columns=['step'])\n",
"        for c in df_hist_epoch.columns:\n",
"            df_hist_epoch[[c]].plot()\n",
"\n",
"    return eval(model, tokenizer, second_half)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One row per sample: the perplexity scores merged with the sample's own fields.\n",
"data = []\n",
"for sample in samples:\n",
"    print(sample['name'])\n",
"    scores = learn_sample(sample)\n",
"    print(scores)\n",
"    # dict(**a, **b) raises TypeError if a sample field collides with a score key.\n",
"    row = dict(**scores, **sample)\n",
"    data.append(row)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the results table in one chain. `assign` returns a new frame, so no\n",
"# column is written onto a column-subset selection of another frame.\n",
"df_res = (\n",
"    pd.DataFrame(data)[['before', 'after', 'name', 'in_training']]\n",
"    .assign(improvement=lambda df: df['before'] - df['after'])\n",
")\n",
"df_res"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DEBUG"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import display, HTML, Markdown\n",
"import torch\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def gen(model, inputs, tokenizer, clean=True):\n",
"    \"\"\"Generate exactly 100 new tokens after `inputs` and display the result as HTML.\n",
"\n",
"    The prompt is shown as plain text and the continuation in bold; newlines\n",
"    are rendered as `<br>`.\n",
"\n",
"    Args:\n",
"        model: Object exposing `generate(...)` and `.device`.\n",
"        inputs: Mapping with unbatched \"input_ids\" and \"attention_mask\"\n",
"            tensors; a leading batch dimension is added here.\n",
"        tokenizer: Used to decode the generated ids.\n",
"        clean: Passed as both `clean_up_tokenization_spaces` and\n",
"            `skip_special_tokens` when decoding.\n",
"\n",
"    Returns:\n",
"        None. The HTML is shown with `display`.\n",
"    \"\"\"\n",
"    s = model.generate(\n",
"        input_ids=inputs[\"input_ids\"][None, :].to(model.device),\n",
"        attention_mask=inputs[\"attention_mask\"][None, :].to(model.device),\n",
"        use_cache=False,\n",
"        max_new_tokens=100,\n",
"        min_new_tokens=100,\n",
"        do_sample=False,\n",
"        early_stopping=False,\n",
"    )\n",
"    # Everything up to the prompt length is the echoed prompt; the rest is new.\n",
"    input_l = inputs[\"input_ids\"].shape[0]\n",
"    tokenizer_kwargs = dict(clean_up_tokenization_spaces=clean, skip_special_tokens=clean)\n",
"    old = tokenizer.decode(s[0, :input_l], **tokenizer_kwargs)\n",
"    new = tokenizer.decode(s[0, input_l:], **tokenizer_kwargs)\n",
"    s_old = old.replace('\\n', '<br>')\n",
"    # Close the bold span with </b>; the previous '<b/>' is not a closing tag,\n",
"    # so the bold formatting was never terminated.\n",
"    s_new = '<b>' + new.replace('\\n', '<br>') + '<br><br></b>'\n",
"    display(HTML(f\"{s_old}{s_new}\"))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Bind the chosen sample to `sample`. The previous `samples = samples[1]`\n",
"# overwrote the list and left the lines below reading a stale `sample`.\n",
"sample = samples[1]\n",
"\n",
"s = sample['text']\n",
"first_half = s[:len(s)//2]\n",
"second_half = s[len(s)//2:]\n",
"ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
"ds_val = Dataset.from_dict(tokenizer([second_half]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Baseline: continue the first-half prompt with the adapter disabled.\n",
"prompt_inputs = ds_train.with_format('pt')[0]\n",
"with model.disable_adapter():\n",
"    gen(model, prompt_inputs, tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Continue the first-half prompt with the adapter active.\n",
"gen(model, inputs=ds_train.with_format('pt')[0], tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long