mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-06-30 17:06:08 +08:00
tidy
This commit is contained in:
@@ -16,7 +16,33 @@ If yes/yes then it's may be suprising new information. If either is not true, th
|
||||
See [main.ipynb](main.ipynb) for the code and results.
|
||||
|
||||
|
||||
# Results
|
||||
# Results using adapter fine tuning
|
||||
|
||||
I fine-tuned the model on the first half of a text, then tested it on the second half. I measure how much it learned by the decrease in perplexity. Rows with a high perplexity are unpredictable, and rows with a high improvement are learnable. Text that is both unpredictable and learnable is not BS.
|
||||
|
||||
|
||||
| name | before | after | in_training | len | improvement% | improvement |
|
||||
|:----------------------------------------------|---------:|---------:|:--------------|------:|---------------:|--------------:|
|
||||
| wikipedia on LK-99 | 32.219 | 28.8525 | False | 1038 | 0.104489 | 3.36652 |
|
||||
| Theory o. general relativity | 26.952 | 24.5425 | True | 1378 | 0.0894 | 2.40951 |
|
||||
| good_ml | 28.3473 | 26.4566 | False | 1004 | 0.0666997 | 1.89076 |
|
||||
| enron_email1 | 25.7697 | 24.3904 | True | 445 | 0.0535253 | 1.37933 |
|
||||
| openai_board_ann | 15.904 | 15.1736 | False | 1191 | 0.0459214 | 0.730332 |
|
||||
| Schmidhuber 2023 Subjective Novelty, Surprise | 29.615 | 28.4708 | False | 2654 | 0.0386353 | 1.14418 |
|
||||
| email_to_fauci | 25.0893 | 24.3714 | False | 1559 | 0.0286154 | 0.717941 |
|
||||
| sokal hoax | 15.9664 | 15.7148 | True | 2487 | 0.0157617 | 0.251658 |
|
||||
| AI gen fake paper | 7.63283 | 7.57951 | False | 2031 | 0.00698672 | 0.0533285 |
|
||||
| lorem ipsum | 1.60166 | 1.59538 | True | 445 | 0.00392053 | 0.00627935 |
|
||||
| bad_ml | 13.9061 | 13.8623 | False | 2345 | 0.00314972 | 0.0438004 |
|
||||
| I have a dream | 2.12726 | 2.12344 | True | 848 | 0.00179583 | 0.00382018 |
|
||||
|
||||
|
||||
For example, the wikipedia extract `wikipedia on LK-99` is unpredictable (high `before` perplexity) and learnable (high improvement in perplexity). That makes sense, as it's a new topic. In contrast, `lorem ipsum` has a low perplexity, meaning it's predictable or memorizable. That makes sense, as this text was likely in the training corpus. The `AI gen fake paper` has a low perplexity because it's predictable, even though it is new.
|
||||
|
||||
|
||||
See more in [01_detection_using_adapter_ft.ipynb](01_detection_using_adapter_ft.ipynb)
|
||||
|
||||
# Results using prompting
|
||||
|
||||
When using microsoft/phi-2 we get this amount of perplexity reduction by including a summary of the key learnings
|
||||
|
||||
@@ -35,6 +61,8 @@ When using microsoft/phi-2 we get this amount of perplexity reduction by includi
|
||||
|
||||
As you can see, some of these are probably in the training set.
|
||||
|
||||
See more in [02_detection_using_tldr_prompt.ipynb](02_detection_using_tldr_prompt.ipynb)
|
||||
|
||||
# Citing
|
||||
|
||||
If you like our work and end up using this code for your research, give us a shout-out by citing or acknowledging
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -3071,7 +3071,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
"version": "3.11.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
@@ -1,668 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"TheBloke/phi-2-GPTQ\"\n",
|
||||
"# model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" # model_name,\n",
|
||||
" # # quantization_config=BitsAndBytesConfig(\n",
|
||||
" # # load_in_4bit=True,\n",
|
||||
" # # llm_int8_threshold=6.0,\n",
|
||||
" # # llm_int8_has_fp16_weight=False,\n",
|
||||
" # # bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
" # # bnb_4bit_use_double_quant=True,\n",
|
||||
" # # bnb_4bit_quant_type=\"nf4\",\n",
|
||||
" # # ),\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" # trust_remote_code=True,\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" config = AutoConfig.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
" config.quantization_config['use_exllama'] = False\n",
|
||||
" # del config.quantization_config['use_exllama']\n",
|
||||
" config.quantization_config['disable_exllama'] = True\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" torch_dtype=torch.bfloat16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
|
||||
"\n",
|
||||
"# from evaluate.measurements.perplexity import Perplexity\n",
|
||||
"import evaluate\n",
|
||||
"from evaluate import logging\n",
|
||||
"from torch.nn import CrossEntropyLoss\n",
|
||||
"\n",
|
||||
"# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
|
||||
"def perplexity_compute(\n",
|
||||
" data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" if device is not None:\n",
|
||||
" assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
|
||||
" if device == \"gpu\":\n",
|
||||
" device = \"cuda\"\n",
|
||||
" else:\n",
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
" # if batch_size > 1 (which generally leads to padding being required), and\n",
|
||||
" # if there is not an already assigned pad_token, assign an existing\n",
|
||||
" # special token to also be the padding token\n",
|
||||
" if tokenizer.pad_token is None and batch_size > 1:\n",
|
||||
" existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
|
||||
" # check that the model already has at least one special token defined\n",
|
||||
" assert (\n",
|
||||
" len(existing_special_tokens) > 0\n",
|
||||
" ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
|
||||
" # assign one of the special tokens to also be the pad token\n",
|
||||
" tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
|
||||
"\n",
|
||||
" if add_start_token and max_length:\n",
|
||||
" # leave room for <BOS> token to be added:\n",
|
||||
" assert (\n",
|
||||
" tokenizer.bos_token is not None\n",
|
||||
" ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
|
||||
" max_tokenized_len = max_length - 1\n",
|
||||
" else:\n",
|
||||
" max_tokenized_len = max_length\n",
|
||||
"\n",
|
||||
" encodings = tokenizer(\n",
|
||||
" data,\n",
|
||||
" add_special_tokens=False,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True if max_tokenized_len else False,\n",
|
||||
" max_length=max_tokenized_len,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" return_attention_mask=True,\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" encoded_texts = encodings[\"input_ids\"]\n",
|
||||
" attn_masks = encodings[\"attention_mask\"]\n",
|
||||
"\n",
|
||||
" # check that each input is long enough:\n",
|
||||
" if add_start_token:\n",
|
||||
" assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
|
||||
" else:\n",
|
||||
" assert torch.all(\n",
|
||||
" torch.ge(attn_masks.sum(1), 2)\n",
|
||||
" ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
|
||||
"\n",
|
||||
" ppls = []\n",
|
||||
" loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
|
||||
"\n",
|
||||
" for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
|
||||
" end_index = min(start_index + batch_size, len(encoded_texts))\n",
|
||||
" encoded_batch = encoded_texts[start_index:end_index]\n",
|
||||
" attn_mask = attn_masks[start_index:end_index]\n",
|
||||
"\n",
|
||||
" if add_start_token:\n",
|
||||
" bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
|
||||
" encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
|
||||
" attn_mask = torch.cat(\n",
|
||||
" [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" labels = encoded_batch\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
|
||||
"\n",
|
||||
" shift_logits = out_logits[..., :-1, :].contiguous()\n",
|
||||
" shift_labels = labels[..., 1:].contiguous()\n",
|
||||
" shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
|
||||
"\n",
|
||||
" perplexity_batch = torch.exp(\n",
|
||||
" (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
|
||||
" / shift_attention_mask_batch.sum(1)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ppls += perplexity_batch.tolist()\n",
|
||||
"\n",
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torch.utils.data import DataLoader, TensorDataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lightning helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample = samples[0]\n",
|
||||
"s = sample['text']\n",
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
" # turn it into a sequence\n",
|
||||
" Xs = []\n",
|
||||
" Ys = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = [pad]*padding + x\n",
|
||||
" \n",
|
||||
" Xs.append(x)\n",
|
||||
" Ys.append(input_ids[i:i+1])\n",
|
||||
"\n",
|
||||
" Xs = torch.tensor(Xs)\n",
|
||||
" Ys = torch.tensor(Ys)\n",
|
||||
" attention_masks = torch.stack([(x==pad)*1 for x in Xs])\n",
|
||||
" return Xs, Ys, attention_masks\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, tokenizer, second_half):\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
|
||||
"\n",
|
||||
"def read_metrics_csv(metrics_file_path):\n",
|
||||
" df_hist = pd.read_csv(metrics_file_path)\n",
|
||||
" df_hist[\"epoch\"] = df_hist[\"epoch\"].ffill()\n",
|
||||
" df_histe = df_hist.set_index(\"epoch\").groupby(\"epoch\").mean()\n",
|
||||
" return df_histe, df_hist\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def plot_hist(df_hist, allowlist=None, logy=False):\n",
|
||||
" \"\"\"plot groups of suffixes together\"\"\"\n",
|
||||
" suffixes = list(set([c.split('/')[-1] for c in df_hist.columns if '/' in c]))\n",
|
||||
" for suffix in suffixes:\n",
|
||||
" if allowlist and suffix not in allowlist: continue\n",
|
||||
" df_hist[[c for c in df_hist.columns if c.endswith(suffix) and '/' in c]].plot(title=suffix, style='.', logy=logy)\n",
|
||||
" plt.title(suffix) \n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"class PL_MODEL(pl.LightningModule):\n",
|
||||
" def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
|
||||
" super().__init__()\n",
|
||||
" self.save_hyperparameters()\n",
|
||||
"\n",
|
||||
" def configure_model(self):\n",
|
||||
" # instantiate your model in this hook\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, \n",
|
||||
" # lora_dropout=0.1,\n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" self.model = load_model()\n",
|
||||
" self.model = get_peft_model(self.model, peft_config)\n",
|
||||
" self.model.config.use_cache = False\n",
|
||||
" \n",
|
||||
" def forward(self, **kwargs):\n",
|
||||
" return self.model(**kwargs)\n",
|
||||
"\n",
|
||||
" def _shared_step(self, batch, batch_idx, phase='train'):\n",
|
||||
" input_ids, targets, attention_mask = batch\n",
|
||||
" # 16, 141\n",
|
||||
" output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
|
||||
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n",
|
||||
" self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
|
||||
" return loss\n",
|
||||
" \n",
|
||||
" def training_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='train')\n",
|
||||
"\n",
|
||||
" def validation_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='val')\n",
|
||||
" \n",
|
||||
" def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='test')\n",
|
||||
" \n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
|
||||
" lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
|
||||
" )\n",
|
||||
" return [optimizer], [lr_scheduler]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "AttributeError",
|
||||
"evalue": "'PL_MODEL' object has no attribute 'model'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[13], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m epoch_steps \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(dl_train)\n\u001b[1;32m 14\u001b[0m pl_model \u001b[38;5;241m=\u001b[39m PL_MODEL(num_iterations\u001b[38;5;241m=\u001b[39mepoch_steps\u001b[38;5;241m*\u001b[39mepochs, lr\u001b[38;5;241m=\u001b[39mlr, weight_decay\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m---> 15\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mpl_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# from lightning.pytorch.plugins import BitsandbytesPrecision\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\u001b[39;00m\n\u001b[1;32m 19\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m 20\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# precision=\"bf16-mixed\",\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# plugins=precision\u001b[39;00m\n\u001b[1;32m 25\u001b[0m )\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1694\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mAttributeError\u001b[0m: 'PL_MODEL' object has no attribute 'model'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"device = 'cuda'\n",
|
||||
"lr = 4e-3\n",
|
||||
"epochs = 3\n",
|
||||
"accum_steps = 16\n",
|
||||
"batch_size = 1\n",
|
||||
"\n",
|
||||
"Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
|
||||
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
|
||||
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
|
||||
"\n",
|
||||
"epoch_steps = len(dl_train)\n",
|
||||
"\n",
|
||||
"pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
|
||||
"model = pl_model.model\n",
|
||||
"# from lightning.pytorch.plugins import BitsandbytesPrecision\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
|
||||
"trainer = pl.Trainer(\n",
|
||||
" accelerator='cpu',\n",
|
||||
" max_epochs=epochs,\n",
|
||||
" precision='',\n",
|
||||
" # precision=\"bf16-mixed\",\n",
|
||||
" log_every_n_steps=1,\n",
|
||||
" accumulate_grad_batches=accum_steps,\n",
|
||||
" # plugins=precision\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# train\n",
|
||||
"trainer.fit(pl_model, dl_train, dl_val)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
|
||||
"display(df_hist)\n",
|
||||
"plot_hist(df_hist)\n",
|
||||
"\n",
|
||||
"eval(model, tokenizer, second_half)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def lora_eval(model, tokenizer, sample):\n",
|
||||
" # reset/set adapter\n",
|
||||
" # peft_config = IA3Config(\n",
|
||||
" # target_modules=[ \"fc2\", \"Wqkv\",], \n",
|
||||
" # feedforward_modules=[\"fc2\"],\n",
|
||||
" # inference_mode=False,\n",
|
||||
" # )\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias=\"all\"\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
"\n",
|
||||
" # train adapter\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to('cuda')\n",
|
||||
"\n",
|
||||
" device = 'cuda'\n",
|
||||
" lr = 1.0e-2\n",
|
||||
" epochs = 3\n",
|
||||
" accum_steps = 64\n",
|
||||
" epoch_steps = (len(input_ids)-1)//accum_steps+1\n",
|
||||
"\n",
|
||||
" total_steps = epochs * epoch_steps\n",
|
||||
" optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
|
||||
" scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, lr, total_steps=total_steps\n",
|
||||
" )\n",
|
||||
" model.train()\n",
|
||||
" model = model.to(device)\n",
|
||||
" for epoch in range(epochs):\n",
|
||||
" # TODO: batch\n",
|
||||
" \n",
|
||||
" accum = 0\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" X = input_ids[:i][None, ]\n",
|
||||
" targets = input_ids[i:i+1][None, ]\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" out = model(input_ids=X, \n",
|
||||
" )\n",
|
||||
" logits = out['logits'][:, -1]\n",
|
||||
" loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
|
||||
" loss.backward()\n",
|
||||
" if accum > accum_steps:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" accum = 0\n",
|
||||
" else:\n",
|
||||
" accum += 1\n",
|
||||
" if accum > 0:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" return eval(model, tokenizer, second_half)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in tqdm(samples):\n",
|
||||
" r = lora_eval(model, tokenizer, sample)\n",
|
||||
" print(sample['name'], r)\n",
|
||||
" r.update(sample)\n",
|
||||
" data.append(r)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
|
||||
"df = pd.DataFrame(data).set_index('name')\n",
|
||||
"\n",
|
||||
"df['learning'] = (df['before']-df['after'])/df['before']\n",
|
||||
"df.sort_values('learning').drop(columns=['text', 'url'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,682 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
|
||||
"\n",
|
||||
"# from evaluate.measurements.perplexity import Perplexity\n",
|
||||
"import evaluate\n",
|
||||
"from evaluate import logging\n",
|
||||
"from torch.nn import CrossEntropyLoss\n",
|
||||
"\n",
|
||||
"# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
|
||||
"def perplexity_compute(\n",
|
||||
" data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" if device is not None:\n",
|
||||
" assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
|
||||
" if device == \"gpu\":\n",
|
||||
" device = \"cuda\"\n",
|
||||
" else:\n",
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
" # if batch_size > 1 (which generally leads to padding being required), and\n",
|
||||
" # if there is not an already assigned pad_token, assign an existing\n",
|
||||
" # special token to also be the padding token\n",
|
||||
" if tokenizer.pad_token is None and batch_size > 1:\n",
|
||||
" existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
|
||||
" # check that the model already has at least one special token defined\n",
|
||||
" assert (\n",
|
||||
" len(existing_special_tokens) > 0\n",
|
||||
" ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
|
||||
" # assign one of the special tokens to also be the pad token\n",
|
||||
" tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
|
||||
"\n",
|
||||
" if add_start_token and max_length:\n",
|
||||
" # leave room for <BOS> token to be added:\n",
|
||||
" assert (\n",
|
||||
" tokenizer.bos_token is not None\n",
|
||||
" ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
|
||||
" max_tokenized_len = max_length - 1\n",
|
||||
" else:\n",
|
||||
" max_tokenized_len = max_length\n",
|
||||
"\n",
|
||||
" encodings = tokenizer(\n",
|
||||
" data,\n",
|
||||
" add_special_tokens=False,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True if max_tokenized_len else False,\n",
|
||||
" max_length=max_tokenized_len,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" return_attention_mask=True,\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" encoded_texts = encodings[\"input_ids\"]\n",
|
||||
" attn_masks = encodings[\"attention_mask\"]\n",
|
||||
"\n",
|
||||
" # check that each input is long enough:\n",
|
||||
" if add_start_token:\n",
|
||||
" assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
|
||||
" else:\n",
|
||||
" assert torch.all(\n",
|
||||
" torch.ge(attn_masks.sum(1), 2)\n",
|
||||
" ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
|
||||
"\n",
|
||||
" ppls = []\n",
|
||||
" loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
|
||||
"\n",
|
||||
" for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
|
||||
" end_index = min(start_index + batch_size, len(encoded_texts))\n",
|
||||
" encoded_batch = encoded_texts[start_index:end_index]\n",
|
||||
" attn_mask = attn_masks[start_index:end_index]\n",
|
||||
"\n",
|
||||
" if add_start_token:\n",
|
||||
" bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
|
||||
" encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
|
||||
" attn_mask = torch.cat(\n",
|
||||
" [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" labels = encoded_batch\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
|
||||
"\n",
|
||||
" shift_logits = out_logits[..., :-1, :].contiguous()\n",
|
||||
" shift_labels = labels[..., 1:].contiguous()\n",
|
||||
" shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
|
||||
"\n",
|
||||
" perplexity_batch = torch.exp(\n",
|
||||
" (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
|
||||
" / shift_attention_mask_batch.sum(1)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ppls += perplexity_batch.tolist()\n",
|
||||
"\n",
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torch.utils.data import DataLoader, TensorDataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lightning helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample = samples[0]\n",
|
||||
"s = sample['text']\n",
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
" # turn it into a sequence\n",
|
||||
" Xs = []\n",
|
||||
" Ys = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = [pad]*padding + x\n",
|
||||
" \n",
|
||||
" Xs.append(x)\n",
|
||||
" Ys.append(input_ids[i:i+1])\n",
|
||||
"\n",
|
||||
" Xs = torch.tensor(Xs)\n",
|
||||
" Ys = torch.tensor(Ys)\n",
|
||||
" attention_masks = torch.stack([(x==pad)*1 for x in Xs])\n",
|
||||
" return Xs, Ys, attention_masks\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, tokenizer, second_half):\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
|
||||
"\n",
|
||||
"def read_metrics_csv(metrics_file_path):\n",
|
||||
" df_hist = pd.read_csv(metrics_file_path)\n",
|
||||
" df_hist[\"epoch\"] = df_hist[\"epoch\"].ffill()\n",
|
||||
" df_histe = df_hist.set_index(\"epoch\").groupby(\"epoch\").mean()\n",
|
||||
" return df_histe, df_hist\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def plot_hist(df_hist, allowlist=None, logy=False):\n",
|
||||
" \"\"\"plot groups of suffixes together\"\"\"\n",
|
||||
" suffixes = list(set([c.split('/')[-1] for c in df_hist.columns if '/' in c]))\n",
|
||||
" for suffix in suffixes:\n",
|
||||
" if allowlist and suffix not in allowlist: continue\n",
|
||||
" df_hist[[c for c in df_hist.columns if c.endswith(suffix) and '/' in c]].plot(title=suffix, style='.', logy=logy)\n",
|
||||
" plt.title(suffix) \n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import bitsandbytes as bnb\n",
|
||||
"\n",
|
||||
"class PL_MODEL(pl.LightningModule):\n",
|
||||
" def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
|
||||
" super().__init__()\n",
|
||||
" self.save_hyperparameters()\n",
|
||||
" self.configure_model()\n",
|
||||
"\n",
|
||||
" def configure_model(self):\n",
|
||||
" # instantiate your model in this hook\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, \n",
|
||||
" # lora_dropout=0.1,\n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" self.model = load_model()\n",
|
||||
" self.model = get_peft_model(self.model, peft_config)\n",
|
||||
" self.model.config.use_cache = False\n",
|
||||
" \n",
|
||||
" def forward(self, **kwargs):\n",
|
||||
" return self.model(**kwargs)\n",
|
||||
"\n",
|
||||
" def _shared_step(self, batch, batch_idx, phase='train'):\n",
|
||||
" input_ids, targets, attention_mask = batch\n",
|
||||
" # 16, 141\n",
|
||||
" output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
|
||||
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n",
|
||||
" self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
|
||||
" return loss\n",
|
||||
" \n",
|
||||
" def training_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='train')\n",
|
||||
"\n",
|
||||
" def validation_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='val')\n",
|
||||
" \n",
|
||||
" def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='test')\n",
|
||||
" \n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" # optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
|
||||
"\n",
|
||||
" optimizer = bnb.optim.AdamW4bit(self.parameters(), lr=self.hparams.lr, betas=(0.9, 0.995))\n",
|
||||
" lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
|
||||
" )\n",
|
||||
" return [optimizer], [lr_scheduler]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.10it/s]\n",
|
||||
"Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
|
||||
"GPU available: True (cuda), used: True\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"IPU available: False, using: 0 IPUs\n",
|
||||
"HPU available: False, using: 0 HPUs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "Linear4bit.__init__() got an unexpected keyword argument 'dtype'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[18], line 29\u001b[0m\n\u001b[1;32m 18\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m 19\u001b[0m accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgpu\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 20\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 25\u001b[0m plugins\u001b[38;5;241m=\u001b[39mprecision\n\u001b[1;32m 26\u001b[0m )\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# train\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl_model\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m model \u001b[38;5;241m=\u001b[39m pl_model\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m 33\u001b[0m df_histe, df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:544\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 543\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 544\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 546\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 47\u001b[0m _call_teardown_hook(trainer)\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:580\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 574\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 575\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 576\u001b[0m ckpt_path,\n\u001b[1;32m 577\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 578\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 579\u001b[0m )\n\u001b[0;32m--> 580\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:958\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 955\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_restore_modules_and_callbacks(ckpt_path)\n\u001b[1;32m 957\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: configuring model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 958\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_configure_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 960\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n\u001b[1;32m 961\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_logger_connector\u001b[38;5;241m.\u001b[39mreset_results()\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:109\u001b[0m, in \u001b[0;36m_call_configure_model\u001b[0;34m(trainer)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_overridden(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconfigure_model\u001b[39m\u001b[38;5;124m\"\u001b[39m, trainer\u001b[38;5;241m.\u001b[39mlightning_module):\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mtensor_init_context(), trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mmodel_sharded_context(), trainer\u001b[38;5;241m.\u001b[39mprecision_plugin\u001b[38;5;241m.\u001b[39mmodule_init_context(): \u001b[38;5;66;03m# noqa: E501\u001b[39;00m\n\u001b[0;32m--> 109\u001b[0m \u001b[43m_call_lightning_module_hook\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mconfigure_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:157\u001b[0m, in \u001b[0;36m_call_lightning_module_hook\u001b[0;34m(trainer, hook_name, pl_module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 154\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m hook_name\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mprofile(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[LightningModule]\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpl_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhook_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 157\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m 160\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m prev_fx_name\n",
|
||||
"Cell \u001b[0;32mIn[17], line 18\u001b[0m, in \u001b[0;36mPL_MODEL.configure_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconfigure_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# instantiate your model in this hook\u001b[39;00m\n\u001b[1;32m 11\u001b[0m peft_config \u001b[38;5;241m=\u001b[39m LoraConfig(\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# task_type=TaskType.TOKEN_CLS, \u001b[39;00m\n\u001b[1;32m 13\u001b[0m target_modules\u001b[38;5;241m=\u001b[39m[ \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfc2\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWqkv\u001b[39m\u001b[38;5;124m\"\u001b[39m,],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# bias=\"all\"\u001b[39;00m\n\u001b[1;32m 17\u001b[0m )\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m get_peft_model(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, peft_config)\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_cache \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
||||
"Cell \u001b[0;32mIn[5], line 5\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m():\n\u001b[0;32m----> 5\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# torch_dtype=torch.float16,\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:560\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 559\u001b[0m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mregister(config\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, model_class, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 560\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 561\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 562\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(config) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 564\u001b[0m model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/modeling_utils.py:3085\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3082\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_flash_attn_2(config, torch_dtype\u001b[38;5;241m=\u001b[39mtorch_dtype, device_map\u001b[38;5;241m=\u001b[39mdevice_map)\n\u001b[1;32m 3084\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[0;32m-> 3085\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3087\u001b[0m \u001b[38;5;66;03m# Check first if we are `from_pt`\u001b[39;00m\n\u001b[1;32m 3088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_keep_in_fp32_modules:\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:933\u001b[0m, in \u001b[0;36mPhiForCausalLM.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 930\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, config: PhiConfig) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 931\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[0;32m--> 933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer \u001b[38;5;241m=\u001b[39m \u001b[43mPhiModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 934\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head \u001b[38;5;241m=\u001b[39m CausalLMHead(config)\n\u001b[1;32m 935\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloss \u001b[38;5;241m=\u001b[39m CausalLMLoss()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36mPhiModel.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList(\u001b[43m[\u001b[49m\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_layer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList([\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(config\u001b[38;5;241m.\u001b[39mn_layer)])\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:757\u001b[0m, in \u001b[0;36mParallelBlock.__init__\u001b[0;34m(self, config, block_idx)\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresid_dropout \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mDropout(config\u001b[38;5;241m.\u001b[39mresid_pdrop)\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_idx \u001b[38;5;241m=\u001b[39m block_idx\n\u001b[0;32m--> 757\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmixer \u001b[38;5;241m=\u001b[39m \u001b[43mMHA\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayer_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 758\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmlp \u001b[38;5;241m=\u001b[39m MLP(config)\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:562\u001b[0m, in \u001b[0;36mMHA.__init__\u001b[0;34m(self, config, dtype, device, rotary_dim, rotary_base, rotary_scale_base, n_head, n_head_kv, head_dim, bias, causal, softmax_scale, layer_idx, return_residual, checkpointing)\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m linear_cls \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 560\u001b[0m linear_cls \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mLinear\n\u001b[0;32m--> 562\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mWqkv \u001b[38;5;241m=\u001b[39m \u001b[43mlinear_cls\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mout_proj \u001b[38;5;241m=\u001b[39m linear_cls(hidden_size, hidden_size, bias\u001b[38;5;241m=\u001b[39mbias, device\u001b[38;5;241m=\u001b[39mdevice, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[1;32m 565\u001b[0m \u001b[38;5;66;03m# Attention\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:253\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._NF4DQLinear.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 253\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquant_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnf4\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompress_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:213\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._Linear4bit.__init__\u001b[0;34m(self, device, *args, **kwargs)\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, device: Optional[_DEVICE] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 213\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;66;03m# if the device is CUDA or we are under a CUDA context manager, quantize the weight here, so we don't end up\u001b[39;00m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;66;03m# filling the device memory with float32 weights which could lead to OOM\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;241m0\u001b[39m, device\u001b[38;5;241m=\u001b[39mdevice)\u001b[38;5;241m.\u001b[39mdevice\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: Linear4bit.__init__() got an unexpected keyword argument 'dtype'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"device = 'cuda'\n",
|
||||
"lr = 4e-3\n",
|
||||
"epochs = 3\n",
|
||||
"accum_steps = 16\n",
|
||||
"batch_size = 2\n",
|
||||
"\n",
|
||||
"Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
|
||||
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
|
||||
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
|
||||
"\n",
|
||||
"epoch_steps = len(dl_train)\n",
|
||||
"\n",
|
||||
"pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
|
||||
"from lightning.pytorch.plugins import BitsandbytesPrecision\n",
|
||||
"precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
|
||||
"trainer = pl.Trainer(\n",
|
||||
" accelerator='gpu',\n",
|
||||
" max_epochs=epochs,\n",
|
||||
" # precision='',\n",
|
||||
" # precision=\"bf16-mixed\",\n",
|
||||
" log_every_n_steps=1,\n",
|
||||
" accumulate_grad_batches=accum_steps,\n",
|
||||
" plugins=precision\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# train\n",
|
||||
"trainer.fit(pl_model, dl_train, dl_val)\n",
|
||||
"\n",
|
||||
"model = pl_model.model\n",
|
||||
"\n",
|
||||
"df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
|
||||
"display(df_hist)\n",
|
||||
"plot_hist(df_hist)\n",
|
||||
"\n",
|
||||
"eval(model, tokenizer, second_half)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"1/0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Old"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def lora_eval(model, tokenizer, sample):\n",
|
||||
" # reset/set adapter\n",
|
||||
" # peft_config = IA3Config(\n",
|
||||
" # target_modules=[ \"fc2\", \"Wqkv\",], \n",
|
||||
" # feedforward_modules=[\"fc2\"],\n",
|
||||
" # inference_mode=False,\n",
|
||||
" # )\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias=\"all\"\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
"\n",
|
||||
" # train adapter\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to('cuda')\n",
|
||||
"\n",
|
||||
" device = 'cuda'\n",
|
||||
" lr = 1.0e-2\n",
|
||||
" epochs = 3\n",
|
||||
" accum_steps = 64\n",
|
||||
" epoch_steps = (len(input_ids)-1)//accum_steps+1\n",
|
||||
"\n",
|
||||
" total_steps = epochs * epoch_steps\n",
|
||||
" optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
|
||||
" scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, lr, total_steps=total_steps\n",
|
||||
" )\n",
|
||||
" model.train()\n",
|
||||
" model = model.to(device)\n",
|
||||
" for epoch in range(epochs):\n",
|
||||
" # TODO: batch\n",
|
||||
" \n",
|
||||
" accum = 0\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" X = input_ids[:i][None, ]\n",
|
||||
" targets = input_ids[i:i+1][None, ]\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" out = model(input_ids=X, \n",
|
||||
" )\n",
|
||||
" logits = out['logits'][:, -1]\n",
|
||||
" loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
|
||||
" loss.backward()\n",
|
||||
" if accum > accum_steps:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" accum = 0\n",
|
||||
" else:\n",
|
||||
" accum += 1\n",
|
||||
" if accum > 0:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" return eval(model, tokenizer, second_half)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in tqdm(samples):\n",
|
||||
" r = lora_eval(model, tokenizer, sample)\n",
|
||||
" print(sample['name'], r)\n",
|
||||
" r.update(sample)\n",
|
||||
" data.append(r)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
|
||||
"df = pd.DataFrame(data).set_index('name')\n",
|
||||
"\n",
|
||||
"df['learning'] = (df['before']-df['after'])/df['before']\n",
|
||||
"df.sort_values('learning').drop(columns=['text', 'url'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user