trying transformers training

2026-07-04 21:03:14 +08:00 · 2024-01-03 11:02:09 +08:00
parent e087051711
commit 13d0c82596
5 changed files with 5935 additions and 351 deletions
@@ -22,54 +22,29 @@
    }
   ],
   "source": [
-    "import os\n",
-    "\n",
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "import transformers\n",
-    "from datasets import load_dataset\n",
-    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
-    "import numpy as np\n",
-    "from tqdm.auto import tqdm\n",
-    "import pandas as pd\n",
-    "from peft import LoraConfig, get_peft_model, IA3Config"
+    "from torch import optim\n",
+    "import lightning as pl\n",
+    "from matplotlib import pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]\n",
-      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
+    "import os\n",
    "\n",
-    "model_name = \"microsoft/phi-2\"\n",
-    "model = AutoModelForCausalLM.from_pretrained(\n",
-    "    model_name,\n",
-    "    # max_memory=max_memory,\n",
-    "    quantization_config=BitsAndBytesConfig(\n",
-    "        load_in_4bit=True,\n",
-    "        llm_int8_threshold=6.0,\n",
-    "        llm_int8_has_fp16_weight=False,\n",
-    "        bnb_4bit_compute_dtype=torch.float16,\n",
-    "        bnb_4bit_use_double_quant=True,\n",
-    "        bnb_4bit_quant_type=\"nf4\",\n",
-    "    ),\n",
-    "    torch_dtype=torch.float16,\n",
-    "    trust_remote_code=True,\n",
-    ")\n",
-    "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
-    "\n"
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "import pandas as pd\n",
+    "import warnings\n",
+    "from peft import LoraConfig, get_peft_model, IA3Config"
   ]
  },
  {
@@ -78,12 +53,113 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "MAX_LEN = 2000\n",
-    "import json\n",
-    "samples = json.load(open(\"../samples.json\"))\n",
+    "plt.style.use('ggplot')\n",
+    "torch.set_float32_matmul_precision('medium')\n",
+    "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
    "\n",
-    "# sample = samples[0]\n",
-    "# sample"
+    "model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(\n",
+    "#     model_name,\n",
+    "#     # max_memory=max_memory,\n",
+    "#     quantization_config=BitsAndBytesConfig(\n",
+    "#         load_in_4bit=True,\n",
+    "#         llm_int8_threshold=6.0,\n",
+    "#         llm_int8_has_fp16_weight=False,\n",
+    "#         bnb_4bit_compute_dtype=torch.float16,\n",
+    "#         bnb_4bit_use_double_quant=True,\n",
+    "#         bnb_4bit_quant_type=\"nf4\",\n",
+    "#     ),\n",
+    "#     torch_dtype=torch.float16,\n",
+    "#     trust_remote_code=True,\n",
+    "# )\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"TheBloke/phi-2-GPTQ\"\n",
+    "# model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "def load_model(name=None):\n",
+    "    \"\"\"Load a causal LM in bfloat16, asking GPTQ checkpoints not to use the exllama kernels.\n",
+    "\n",
+    "    Args:\n",
+    "        name: Hugging Face model id to load; defaults to the notebook-level `model_name`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The model built by `AutoModelForCausalLM.from_pretrained`.\n",
+    "    \"\"\"\n",
+    "    name = model_name if name is None else name\n",
+    "    config = AutoConfig.from_pretrained(name, trust_remote_code=True)\n",
+    "\n",
+    "    # A checkpoint that is not quantized (e.g. \"microsoft/phi-2\") may not carry a\n",
+    "    # `quantization_config`, so only patch it when one is actually present.\n",
+    "    quant_cfg = getattr(config, \"quantization_config\", None)\n",
+    "    if isinstance(quant_cfg, dict):\n",
+    "        # NOTE(review): both the `use_exllama` key and the `disable_exllama` key are\n",
+    "        # written; confirm the installed transformers version accepts both together.\n",
+    "        quant_cfg['use_exllama'] = False\n",
+    "        quant_cfg['disable_exllama'] = True\n",
+    "\n",
+    "    return AutoModelForCausalLM.from_pretrained(\n",
+    "        name,\n",
+    "        torch_dtype=torch.bfloat16,\n",
+    "        trust_remote_code=True,\n",
+    "        config=config,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "# Upper bound on the per-example window length that `str2xya` keeps.\n",
+    "MAX_LEN = 2000\n",
+    "# Context manager so the file handle is closed as soon as the JSON is parsed.\n",
+    "with open(\"../samples.json\") as f:\n",
+    "    samples = json.load(f)\n"
   ]
  },
  {
@@ -95,7 +171,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -199,61 +275,6 @@
    "    return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Perplexity"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
-    "# results['mean_perplexity']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Learn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# \"\"\"### Post-processing on the model\n",
-    "\n",
-    "# Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in `float32` for stability. We also cast the output of the last layer in `float32` for the same reasons.\n",
-    "# \"\"\"\n",
-    "\n",
-    "# print(model)\n",
-    "\n",
-    "# for param in model.parameters():\n",
-    "#     param.requires_grad = False  # freeze the model - train adapters later\n",
-    "#     if param.ndim == 1:\n",
-    "#         # cast the small parameters (e.g. layernorm) to fp32 for stability\n",
-    "#         param.data = param.data.to(torch.float32)\n",
-    "\n",
-    "# # model.gradient_checkpointing_enable()  # reduce number of stored activations\n",
-    "# # model.model.decoder.project_in = lambda x: x.requires_grad_(True)\n",
-    "\n",
-    "\n",
-    "# class CastOutputToFloat(nn.Sequential):\n",
-    "#     def forward(self, x):\n",
-    "#         return super().forward(x).to(torch.float32)\n",
-    "\n",
-    "\n",
-    "# model.lm_head = CastOutputToFloat(model.lm_head)\n"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -263,7 +284,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -280,30 +301,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'matplotlib'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m optim\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlightning\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'"
-     ]
-    }
-   ],
-   "source": [
-    "from torch import optim\n",
-    "import lightning as pl\n",
-    "from matplotlib import pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -312,7 +310,10 @@
    "first_half = s[:len(s)//2]\n",
    "second_half = s[len(s)//2:]\n",
    "\n",
-    "def str2xya(s):\n",
+    "\n",
+    "\n",
+    "def str2xya(s, tokenizer):\n",
+    "    max_len = min(MAX_LEN, len(s))\n",
    "    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
    "\n",
    "    pad = tokenizer.bos_token_id\n",
@@ -320,8 +321,8 @@
    "    Xs = []\n",
    "    Ys = []\n",
    "    for i in range(1, len(input_ids)):\n",
-    "        x = input_ids[:i]\n",
-    "        padding = len(input_ids) - len(x)\n",
+    "        x = input_ids[:i][-max_len:]\n",
+    "        padding = max_len - len(x)\n",
    "        x = [pad]*padding + x\n",
    "        \n",
    "        Xs.append(x)\n",
@@ -374,13 +375,25 @@
    "\n",
    "\n",
    "class PL_MODEL(pl.LightningModule):\n",
-    "    def __init__(self, model, num_iterations, lr=3e-4, weight_decay=0,):\n",
+    "    def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
    "        super().__init__()\n",
-    "        self._model = model\n",
-    "        self.save_hyperparameters(ignore=['model'])\n",
+    "        self.save_hyperparameters()\n",
+    "\n",
+    "    def configure_model(self):\n",
+    "        # instantiate your model in this hook\n",
+    "        peft_config = LoraConfig(\n",
+    "            # task_type=TaskType.TOKEN_CLS, \n",
+    "            target_modules=[ \"fc2\",  \"Wqkv\",],\n",
+    "            inference_mode=False, r=16, lora_alpha=16, \n",
+    "            # lora_dropout=0.1,\n",
+    "            # bias=\"all\"\n",
+    "        )\n",
+    "        self.model = load_model()\n",
+    "        self.model = get_peft_model(self.model, peft_config)\n",
+    "        self.model.config.use_cache = False\n",
    "    \n",
    "    def forward(self, **kwargs):\n",
-    "        return self._model(**kwargs)\n",
+    "        return self.model(**kwargs)\n",
    "\n",
    "    def _shared_step(self, batch, batch_idx, phase='train'):\n",
    "        input_ids, targets, attention_mask = batch\n",
@@ -416,252 +429,65 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
-      "Using bfloat16 Automatic Mixed Precision (AMP)\n",
-      "GPU available: True (cuda), used: True\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "IPU available: False, using: 0 IPUs\n",
-      "HPU available: False, using: 0 HPUs\n",
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
-      "You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
-      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]\n",
-      "\n",
-      "  | Name   | Type      | Params\n",
-      "-------------------------------------\n",
-      "0 | _model | PeftModel | 1.5 B \n",
-      "-------------------------------------\n",
-      "11.8 M    Trainable params\n",
-      "1.5 B     Non-trainable params\n",
-      "1.5 B     Total params\n",
-      "6,132.756 Total estimated model params size (MB)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.\n",
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.66it/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([4, 1])) that is different to the input size (torch.Size([4, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
-      "  loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                                                                           "
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Epoch 0: 100%|██████████| 52/52 [00:26<00:00,  1.94it/s, v_num=9, train/loss_step=381.0]  "
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([1, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
-      "  loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Epoch 2: 100%|██████████| 52/52 [00:40<00:00,  1.30it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]  "
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "`Trainer.fit` stopped: `max_epochs=3` reached.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Epoch 2: 100%|██████████| 52/52 [00:42<00:00,  1.22it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>train/loss_step</th>\n",
-       "      <th>step</th>\n",
-       "      <th>val/loss_step</th>\n",
-       "      <th>val/loss_epoch</th>\n",
-       "      <th>train/loss_epoch</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>epoch</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0.0</th>\n",
-       "      <td>3319.602325</td>\n",
-       "      <td>22.278689</td>\n",
-       "      <td>6775.631730</td>\n",
-       "      <td>6775.630859</td>\n",
-       "      <td>6645.847656</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1.0</th>\n",
-       "      <td>4132.752668</td>\n",
-       "      <td>67.639344</td>\n",
-       "      <td>6772.301628</td>\n",
-       "      <td>6772.301758</td>\n",
-       "      <td>6642.495605</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2.0</th>\n",
-       "      <td>5096.813714</td>\n",
-       "      <td>113.000000</td>\n",
-       "      <td>6769.673500</td>\n",
-       "      <td>6769.673828</td>\n",
-       "      <td>6639.772949</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       train/loss_step        step  val/loss_step  val/loss_epoch  \\\n",
-       "epoch                                                               \n",
-       "0.0        3319.602325   22.278689    6775.631730     6775.630859   \n",
-       "1.0        4132.752668   67.639344    6772.301628     6772.301758   \n",
-       "2.0        5096.813714  113.000000    6769.673500     6769.673828   \n",
-       "\n",
-       "       train/loss_epoch  \n",
-       "epoch                    \n",
-       "0.0         6645.847656  \n",
-       "1.0         6642.495605  \n",
-       "2.0         6639.772949  "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "ename": "ImportError",
-     "evalue": "matplotlib is required for plotting when the default backend \"matplotlib\" is selected.",
+     "ename": "AttributeError",
+     "evalue": "'PL_MODEL' object has no attribute 'model'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[10], line 85\u001b[0m\n\u001b[1;32m     83\u001b[0m df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n\u001b[1;32m     84\u001b[0m display(df_hist)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mplot_hist\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;28meval\u001b[39m(model, tokenizer, second_half)\n",
-      "Cell \u001b[0;32mIn[10], line 79\u001b[0m, in \u001b[0;36mplot_hist\u001b[0;34m(df_hist, allowlist, logy)\u001b[0m\n\u001b[1;32m     77\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m suffix \u001b[38;5;129;01min\u001b[39;00m suffixes:\n\u001b[1;32m     78\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m allowlist \u001b[38;5;129;01mand\u001b[39;00m suffix \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m allowlist: \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m---> 79\u001b[0m     \u001b[43mdf_hist\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendswith\u001b[49m\u001b[43m(\u001b[49m\u001b[43msuffix\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtitle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstyle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     80\u001b[0m     
plt\u001b[38;5;241m.\u001b[39mtitle(suffix)   \n\u001b[1;32m     81\u001b[0m     plt\u001b[38;5;241m.\u001b[39mshow()\n",
-      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:951\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    950\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 951\u001b[0m     plot_backend \u001b[38;5;241m=\u001b[39m \u001b[43m_get_plot_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbackend\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    953\u001b[0m     x, y, kind, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_call_args(\n\u001b[1;32m    954\u001b[0m         plot_backend\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, args, kwargs\n\u001b[1;32m    955\u001b[0m     )\n\u001b[1;32m    957\u001b[0m     kind \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_kind_aliases\u001b[38;5;241m.\u001b[39mget(kind, kind)\n",
-      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1947\u001b[0m, in \u001b[0;36m_get_plot_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m   1944\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend_str \u001b[38;5;129;01min\u001b[39;00m _backends:\n\u001b[1;32m   1945\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _backends[backend_str]\n\u001b[0;32m-> 1947\u001b[0m module \u001b[38;5;241m=\u001b[39m \u001b[43m_load_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbackend_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1948\u001b[0m _backends[backend_str] \u001b[38;5;241m=\u001b[39m module\n\u001b[1;32m   1949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n",
-      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1877\u001b[0m, in \u001b[0;36m_load_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m   1875\u001b[0m         module \u001b[38;5;241m=\u001b[39m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpandas.plotting._matplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1876\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m-> 1877\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m   1878\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib is required for plotting when the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1879\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdefault backend \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is selected.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m   1880\u001b[0m         ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1881\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m module\n\u001b[1;32m   1883\u001b[0m found_backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
-      "\u001b[0;31mImportError\u001b[0m: matplotlib is required for plotting when the default backend \"matplotlib\" is selected."
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[13], line 15\u001b[0m\n\u001b[1;32m     12\u001b[0m epoch_steps \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(dl_train)\n\u001b[1;32m     14\u001b[0m pl_model \u001b[38;5;241m=\u001b[39m PL_MODEL(num_iterations\u001b[38;5;241m=\u001b[39mepoch_steps\u001b[38;5;241m*\u001b[39mepochs, lr\u001b[38;5;241m=\u001b[39mlr, weight_decay\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m---> 15\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mpl_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# from lightning.pytorch.plugins import BitsandbytesPrecision\u001b[39;00m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\u001b[39;00m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\u001b[39;00m\n\u001b[1;32m     19\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m     20\u001b[0m         max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[1;32m     21\u001b[0m         \u001b[38;5;66;03m# precision=\"bf16-mixed\",\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     24\u001b[0m         \u001b[38;5;66;03m# plugins=precision\u001b[39;00m\n\u001b[1;32m     25\u001b[0m     )\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   1693\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m   1694\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'PL_MODEL' object has no attribute 'model'"
+     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "\n",
-    "    \n",
-    "\n",
-    "peft_config = LoraConfig(\n",
-    "    # task_type=TaskType.TOKEN_CLS, \n",
-    "    target_modules=[ \"fc2\",  \"Wqkv\",],\n",
-    "    inference_mode=False, r=16, lora_alpha=16, \n",
-    "    # lora_dropout=0.1,\n",
-    "    # bias=\"all\"\n",
-    ")\n",
-    "model = get_peft_model(model, peft_config)\n",
-    "model.config.use_cache = False\n",
    "\n",
    "device = 'cuda'\n",
    "lr = 4e-3\n",
    "epochs = 3\n",
-    "accum_steps = 64\n",
-    "batch_size = 4\n",
+    "accum_steps = 16\n",
+    "batch_size = 1\n",
    "\n",
-    "Xs, Ys, attention_masks = str2xya(first_half)\n",
-    "dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=4, shuffle=True)\n",
-    "Xs, Ys, attention_masks = str2xya(second_half)\n",
-    "dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=4, shuffle=True)\n",
+    "Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
+    "dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
+    "Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
+    "dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
    "\n",
    "epoch_steps = len(dl_train)\n",
    "\n",
-    "pl_model = PL_MODEL(model, num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
+    "pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
+    "# from lightning.pytorch.plugins import BitsandbytesPrecision\n",
+    "# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
+    "# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
    "trainer = pl.Trainer(\n",
+    "        accelerator='cpu',\n",
    "        max_epochs=epochs,\n",
-    "        precision=\"bf16-mixed\",\n",
+    "        # '' is not a precision value Lightning accepts; '32-true' is its full-precision default.\n",
+    "        precision='32-true',\n",
+    "        # precision=\"bf16-mixed\",\n",
    "        log_every_n_steps=1,\n",
-    "        accumulate_grad_batches=8,\n",
+    "        accumulate_grad_batches=accum_steps,\n",
+    "        # plugins=precision\n",
    "    )\n",
    "\n",
    "# train\n",
    "trainer.fit(pl_model, dl_train, dl_val)\n",
+    "\n",
+    "# PL_MODEL only creates `self.model` inside `configure_model`, which the Trainer runs during fit;\n",
+    "# reading it before fit raises AttributeError, so fetch the wrapped model afterwards.\n",
+    "model = pl_model.model\n",
    "\n",
    "\n",
-    "\n",
-    "\n",
    "df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
    "display(df_hist)\n",
    "plot_hist(df_hist)\n",
@@ -0,0 +1,682 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from torch import optim\n",
+    "import lightning as pl\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "import pandas as pd\n",
+    "import warnings\n",
+    "from peft import LoraConfig, get_peft_model, IA3Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.style.use('ggplot')\n",
+    "torch.set_float32_matmul_precision('medium')\n",
+    "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
+    "\n",
+    "model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(\n",
+    "#     model_name,\n",
+    "#     # max_memory=max_memory,\n",
+    "#     quantization_config=BitsAndBytesConfig(\n",
+    "#         load_in_4bit=True,\n",
+    "#         llm_int8_threshold=6.0,\n",
+    "#         llm_int8_has_fp16_weight=False,\n",
+    "#         bnb_4bit_compute_dtype=torch.float16,\n",
+    "#         bnb_4bit_use_double_quant=True,\n",
+    "#         bnb_4bit_quant_type=\"nf4\",\n",
+    "#     ),\n",
+    "#     torch_dtype=torch.float16,\n",
+    "#     trust_remote_code=True,\n",
+    "# )\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "def load_model():\n",
+    "    \"\"\"Load the pretrained causal LM identified by the module-level `model_name`.\"\"\"\n",
+    "    # torch_dtype=torch.float16 stays disabled, so the library's default dtype is used.\n",
+    "    return AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "# Upper bound on the context window length used when building training examples (see str2xya).\n",
+    "MAX_LEN = 2000\n",
+    "# Use a context manager so the file handle is closed as soon as the JSON is parsed.\n",
+    "with open(\"../samples.json\") as samples_file:\n",
+    "    samples = json.load(samples_file)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
+    "\n",
+    "# from evaluate.measurements.perplexity import Perplexity\n",
+    "import evaluate\n",
+    "from evaluate import logging\n",
+    "from torch.nn import CrossEntropyLoss\n",
+    "\n",
+    "# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
+    "def perplexity_compute(\n",
+    "    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
+    "):\n",
+    "\n",
+    "    if device is not None:\n",
+    "        assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
+    "        if device == \"gpu\":\n",
+    "            device = \"cuda\"\n",
+    "    else:\n",
+    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
+    "    # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
+    "    model = model.to(device)\n",
+    "\n",
+    "    # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "\n",
+    "    # if batch_size > 1 (which generally leads to padding being required), and\n",
+    "    # if there is not an already assigned pad_token, assign an existing\n",
+    "    # special token to also be the padding token\n",
+    "    if tokenizer.pad_token is None and batch_size > 1:\n",
+    "        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
+    "        # check that the model already has at least one special token defined\n",
+    "        assert (\n",
+    "            len(existing_special_tokens) > 0\n",
+    "        ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
+    "        # assign one of the special tokens to also be the pad token\n",
+    "        tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
+    "\n",
+    "    if add_start_token and max_length:\n",
+    "        # leave room for <BOS> token to be added:\n",
+    "        assert (\n",
+    "            tokenizer.bos_token is not None\n",
+    "        ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
+    "        max_tokenized_len = max_length - 1\n",
+    "    else:\n",
+    "        max_tokenized_len = max_length\n",
+    "\n",
+    "    encodings = tokenizer(\n",
+    "        data,\n",
+    "        add_special_tokens=False,\n",
+    "        padding=True,\n",
+    "        truncation=True if max_tokenized_len else False,\n",
+    "        max_length=max_tokenized_len,\n",
+    "        return_tensors=\"pt\",\n",
+    "        return_attention_mask=True,\n",
+    "    ).to(device)\n",
+    "\n",
+    "    encoded_texts = encodings[\"input_ids\"]\n",
+    "    attn_masks = encodings[\"attention_mask\"]\n",
+    "\n",
+    "    # check that each input is long enough:\n",
+    "    if add_start_token:\n",
+    "        assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
+    "    else:\n",
+    "        assert torch.all(\n",
+    "            torch.ge(attn_masks.sum(1), 2)\n",
+    "        ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
+    "\n",
+    "    ppls = []\n",
+    "    loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
+    "\n",
+    "    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
+    "        end_index = min(start_index + batch_size, len(encoded_texts))\n",
+    "        encoded_batch = encoded_texts[start_index:end_index]\n",
+    "        attn_mask = attn_masks[start_index:end_index]\n",
+    "\n",
+    "        if add_start_token:\n",
+    "            bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
+    "            encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
+    "            attn_mask = torch.cat(\n",
+    "                [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
+    "            )\n",
+    "\n",
+    "        labels = encoded_batch\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
+    "\n",
+    "        shift_logits = out_logits[..., :-1, :].contiguous()\n",
+    "        shift_labels = labels[..., 1:].contiguous()\n",
+    "        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
+    "\n",
+    "        perplexity_batch = torch.exp(\n",
+    "            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
+    "            / shift_attention_mask_batch.sum(1)\n",
+    "        )\n",
+    "\n",
+    "        ppls += perplexity_batch.tolist()\n",
+    "\n",
+    "    return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.nn import functional as F\n",
+    "from torch.utils.data import DataLoader, TensorDataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lightning helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = samples[0]\n",
+    "s = sample['text']\n",
+    "first_half = s[:len(s)//2]\n",
+    "second_half = s[len(s)//2:]\n",
+    "\n",
+    "\n",
+    "\n",
+    "def str2xya(s, tokenizer):\n",
+    "    \"\"\"Turn one string into next-token-prediction training examples.\n",
+    "\n",
+    "    For every position i >= 1 of the tokenized text one example is built: the input is\n",
+    "    the (at most `max_len`) tokens preceding position i, left-padded with the BOS id to\n",
+    "    a fixed width, and the target is the token at position i.\n",
+    "\n",
+    "    Args:\n",
+    "        s: the text to convert.\n",
+    "        tokenizer: callable tokenizer returning `input_ids`; its `bos_token_id` is used as padding.\n",
+    "\n",
+    "    Returns:\n",
+    "        Xs: tensor (n_examples, max_len) of left-padded input ids.\n",
+    "        Ys: tensor (n_examples, 1) holding the next token id of each example.\n",
+    "        attention_masks: tensor shaped like Xs with 1 on real tokens and 0 on padding.\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): len(s) counts characters, not tokens, so rows may be padded wider\n",
+    "    # than the longest context actually needs - confirm this is intended.\n",
+    "    max_len = min(MAX_LEN, len(s))\n",
+    "    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
+    "\n",
+    "    pad = tokenizer.bos_token_id\n",
+    "    # turn it into a sequence of (context window, next token) pairs\n",
+    "    Xs = []\n",
+    "    Ys = []\n",
+    "    masks = []\n",
+    "    for i in range(1, len(input_ids)):\n",
+    "        x = input_ids[:i][-max_len:]\n",
+    "        padding = max_len - len(x)\n",
+    "\n",
+    "        Xs.append([pad] * padding + x)\n",
+    "        Ys.append(input_ids[i:i+1])\n",
+    "        # Build the mask from the padding count rather than comparing ids with `pad`:\n",
+    "        # real tokens must be 1 and padding 0, even if a real token equals the BOS id.\n",
+    "        masks.append([0] * padding + [1] * len(x))\n",
+    "\n",
+    "    Xs = torch.tensor(Xs)\n",
+    "    Ys = torch.tensor(Ys)\n",
+    "    attention_masks = torch.tensor(masks)\n",
+    "    return Xs, Ys, attention_masks\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def eval(model, tokenizer, second_half, device=None):\n",
+    "    \"\"\"Compare perplexity on `second_half` with the adapter disabled versus enabled.\n",
+    "\n",
+    "    NOTE(review): this name shadows the builtin `eval`; it is kept so existing callers keep working.\n",
+    "\n",
+    "    Args:\n",
+    "        model: adapter-wrapped model exposing the `disable_adapter()` context manager.\n",
+    "        tokenizer: tokenizer matching `model`.\n",
+    "        second_half: held-out text forwarded to `perplexity_compute` as `data`.\n",
+    "        device: forwarded to `perplexity_compute`; None lets it pick CUDA when available, else CPU.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with `before` (adapter disabled) and `after` (adapter active) mean perplexity as floats.\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    with torch.no_grad():\n",
+    "        # baseline: score with the adapter switched off\n",
+    "        with model.disable_adapter():\n",
+    "            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "        # fine-tuned: the same text scored with the adapter active\n",
+    "        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
+    "\n",
+    "def read_metrics_csv(metrics_file_path):\n",
+    "    \"\"\"Read a metrics CSV and return `(per_epoch_means, per_row_frame)`.\n",
+    "\n",
+    "    Missing `epoch` values are forward-filled before the rows are averaged per epoch.\n",
+    "    \"\"\"\n",
+    "    per_row = pd.read_csv(metrics_file_path)\n",
+    "    filled_epoch = per_row[\"epoch\"].ffill()\n",
+    "    per_row[\"epoch\"] = filled_epoch\n",
+    "    indexed = per_row.set_index(\"epoch\")\n",
+    "    per_epoch = indexed.groupby(\"epoch\").mean()\n",
+    "    return per_epoch, per_row\n",
+    "\n",
+    "\n",
+    "def plot_hist(df_hist, allowlist=None, logy=False):\n",
+    "    \"\"\"Plot groups of metric columns that share the same suffix together.\n",
+    "\n",
+    "    Only columns containing '/' are considered; the suffix is the text after the last '/'.\n",
+    "    One figure is drawn per suffix, in sorted suffix order.\n",
+    "\n",
+    "    Args:\n",
+    "        df_hist: DataFrame of logged metrics with string column names.\n",
+    "        allowlist: optional collection of suffixes to plot; others are skipped.\n",
+    "        logy: use a logarithmic y axis.\n",
+    "    \"\"\"\n",
+    "    # Group by exact suffix so e.g. 'a/my_loss' is not plotted with the 'loss' group.\n",
+    "    columns_by_suffix = {}\n",
+    "    for c in df_hist.columns:\n",
+    "        if '/' in c:\n",
+    "            columns_by_suffix.setdefault(c.split('/')[-1], []).append(c)\n",
+    "\n",
+    "    # sorted() makes the figure order reproducible between runs\n",
+    "    for suffix in sorted(columns_by_suffix):\n",
+    "        if allowlist and suffix not in allowlist:\n",
+    "            continue\n",
+    "        df_hist[columns_by_suffix[suffix]].plot(title=suffix, style='.', logy=logy)\n",
+    "        plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bitsandbytes as bnb\n",
+    "\n",
+    "class PL_MODEL(pl.LightningModule):\n",
+    "    \"\"\"LightningModule that LoRA-fine-tunes the model from `load_model()` on next-token prediction.\n",
+    "\n",
+    "    Batches are `(input_ids, targets, attention_mask)` tuples; the logits at the last\n",
+    "    position are trained to predict `targets` (one token id per example).\n",
+    "\n",
+    "    Args:\n",
+    "        num_iterations: total number of scheduler steps for the one-cycle LR schedule;\n",
+    "            must be at least the number of optimizer steps taken during training.\n",
+    "        lr: peak learning rate.\n",
+    "        weight_decay: weight decay passed to the optimizer.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
+    "        super().__init__()\n",
+    "        self.save_hyperparameters()\n",
+    "        # Build eagerly so `pl_model.model` is available right after construction.\n",
+    "        self.configure_model()\n",
+    "\n",
+    "    def configure_model(self):\n",
+    "        \"\"\"Load the base model and wrap it with LoRA adapters (no-op if already built).\"\"\"\n",
+    "        # Lightning calls this hook again inside trainer.fit; without this guard the\n",
+    "        # base model would be loaded and wrapped a second time.\n",
+    "        if getattr(self, \"model\", None) is not None:\n",
+    "            return\n",
+    "        peft_config = LoraConfig(\n",
+    "            # task_type=TaskType.TOKEN_CLS, \n",
+    "            target_modules=[ \"fc2\",  \"Wqkv\",],\n",
+    "            inference_mode=False, r=16, lora_alpha=16, \n",
+    "            # lora_dropout=0.1,\n",
+    "            # bias=\"all\"\n",
+    "        )\n",
+    "        base_model = load_model()\n",
+    "        self.model = get_peft_model(base_model, peft_config)\n",
+    "        self.model.config.use_cache = False\n",
+    "    \n",
+    "    def forward(self, **kwargs):\n",
+    "        \"\"\"Forward keyword arguments straight to the wrapped model.\"\"\"\n",
+    "        return self.model(**kwargs)\n",
+    "\n",
+    "    def _shared_step(self, batch, batch_idx, phase='train'):\n",
+    "        \"\"\"Compute and log the next-token loss for one batch; `phase` prefixes the metric name.\"\"\"\n",
+    "        input_ids, targets, attention_mask = batch\n",
+    "        output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
+    "        # Next-token prediction is classification over the vocabulary: score the logits of\n",
+    "        # the last position against the target token id. (A regression loss between logits\n",
+    "        # and integer ids is meaningless.) Cast to float32 for a stable softmax.\n",
+    "        next_token_logits = output.logits[:, -1].float()\n",
+    "        loss = F.cross_entropy(next_token_logits, targets.reshape(-1))\n",
+    "        self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
+    "        return loss\n",
+    "    \n",
+    "    def training_step(self, batch, batch_idx):\n",
+    "        return self._shared_step(batch, batch_idx, phase='train')\n",
+    "\n",
+    "    def validation_step(self, batch, batch_idx):\n",
+    "        return self._shared_step(batch, batch_idx, phase='val')\n",
+    "    \n",
+    "    def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
+    "        return self._shared_step(batch, batch_idx, phase='test')\n",
+    "    \n",
+    "    def configure_optimizers(self):\n",
+    "        \"\"\"Create the optimizer and a one-cycle LR schedule stepped once per optimizer step.\"\"\"\n",
+    "        # optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
+    "\n",
+    "        # NOTE(review): confirm `AdamW4bit` exists in the installed bitsandbytes version.\n",
+    "        optimizer = bnb.optim.AdamW4bit(\n",
+    "            self.parameters(),\n",
+    "            lr=self.hparams.lr,\n",
+    "            betas=(0.9, 0.995),\n",
+    "            # honour the constructor argument instead of the optimizer's own default\n",
+    "            weight_decay=self.hparams.weight_decay,\n",
+    "        )\n",
+    "        lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
+    "            optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
+    "        )\n",
+    "        # OneCycleLR is defined over steps, but Lightning steps schedulers once per epoch\n",
+    "        # unless told otherwise, so request per-step scheduling explicitly.\n",
+    "        return [optimizer], [{\"scheduler\": lr_scheduler, \"interval\": \"step\"}]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]\n",
+      "Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
+      "GPU available: True (cuda), used: True\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "IPU available: False, using: 0 IPUs\n",
+      "HPU available: False, using: 0 HPUs\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "Linear4bit.__init__() got an unexpected keyword argument 'dtype'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[18], line 29\u001b[0m\n\u001b[1;32m     18\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m     19\u001b[0m         accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgpu\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m     20\u001b[0m         max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     25\u001b[0m         plugins\u001b[38;5;241m=\u001b[39mprecision\n\u001b[1;32m     26\u001b[0m     )\n\u001b[1;32m     28\u001b[0m \u001b[38;5;66;03m# train\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl_model\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     31\u001b[0m model \u001b[38;5;241m=\u001b[39m pl_model\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m     33\u001b[0m df_histe, df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:544\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m    542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m    543\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 544\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    545\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m    546\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m     42\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m     43\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     46\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m     47\u001b[0m     _call_teardown_hook(trainer)\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:580\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m    573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    574\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m    575\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m    576\u001b[0m     ckpt_path,\n\u001b[1;32m    577\u001b[0m     model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m    578\u001b[0m     model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    579\u001b[0m )\n\u001b[0;32m--> 580\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    582\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m    583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:958\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m    955\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_restore_modules_and_callbacks(ckpt_path)\n\u001b[1;32m    957\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: configuring model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 958\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_configure_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    960\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n\u001b[1;32m    961\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_logger_connector\u001b[38;5;241m.\u001b[39mreset_results()\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:109\u001b[0m, in \u001b[0;36m_call_configure_model\u001b[0;34m(trainer)\u001b[0m\n\u001b[1;32m    107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_overridden(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconfigure_model\u001b[39m\u001b[38;5;124m\"\u001b[39m, trainer\u001b[38;5;241m.\u001b[39mlightning_module):\n\u001b[1;32m    108\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mtensor_init_context(), trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mmodel_sharded_context(), trainer\u001b[38;5;241m.\u001b[39mprecision_plugin\u001b[38;5;241m.\u001b[39mmodule_init_context():  \u001b[38;5;66;03m# noqa: E501\u001b[39;00m\n\u001b[0;32m--> 109\u001b[0m         \u001b[43m_call_lightning_module_hook\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mconfigure_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:157\u001b[0m, in \u001b[0;36m_call_lightning_module_hook\u001b[0;34m(trainer, hook_name, pl_module, *args, **kwargs)\u001b[0m\n\u001b[1;32m    154\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m hook_name\n\u001b[1;32m    156\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mprofile(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[LightningModule]\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpl_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhook_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 157\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    159\u001b[0m \u001b[38;5;66;03m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m    160\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m prev_fx_name\n",
+      "Cell \u001b[0;32mIn[17], line 18\u001b[0m, in \u001b[0;36mPL_MODEL.configure_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconfigure_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m     10\u001b[0m     \u001b[38;5;66;03m# instantiate your model in this hook\u001b[39;00m\n\u001b[1;32m     11\u001b[0m     peft_config \u001b[38;5;241m=\u001b[39m LoraConfig(\n\u001b[1;32m     12\u001b[0m         \u001b[38;5;66;03m# task_type=TaskType.TOKEN_CLS, \u001b[39;00m\n\u001b[1;32m     13\u001b[0m         target_modules\u001b[38;5;241m=\u001b[39m[ \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfc2\u001b[39m\u001b[38;5;124m\"\u001b[39m,  \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWqkv\u001b[39m\u001b[38;5;124m\"\u001b[39m,],\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     16\u001b[0m         \u001b[38;5;66;03m# bias=\"all\"\u001b[39;00m\n\u001b[1;32m     17\u001b[0m     )\n\u001b[0;32m---> 18\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     19\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m get_peft_model(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, peft_config)\n\u001b[1;32m     20\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_cache \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
+      "Cell \u001b[0;32mIn[5], line 5\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m():\n\u001b[0;32m----> 5\u001b[0m     model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      6\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      7\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;66;43;03m# torch_dtype=torch.float16,\u001b[39;49;00m\n\u001b[1;32m      8\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m      9\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     10\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m model\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:560\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m    558\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    559\u001b[0m         \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mregister(config\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, model_class, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 560\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    561\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m    562\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    563\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(config) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m    564\u001b[0m     model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/modeling_utils.py:3085\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m   3082\u001b[0m     config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_flash_attn_2(config, torch_dtype\u001b[38;5;241m=\u001b[39mtorch_dtype, device_map\u001b[38;5;241m=\u001b[39mdevice_map)\n\u001b[1;32m   3084\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[0;32m-> 3085\u001b[0m     model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3087\u001b[0m \u001b[38;5;66;03m# Check first if we are `from_pt`\u001b[39;00m\n\u001b[1;32m   3088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_keep_in_fp32_modules:\n",
+      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:933\u001b[0m, in \u001b[0;36mPhiForCausalLM.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m    930\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, config: PhiConfig) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    931\u001b[0m     \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[0;32m--> 933\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer \u001b[38;5;241m=\u001b[39m \u001b[43mPhiModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    934\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head \u001b[38;5;241m=\u001b[39m CausalLMHead(config)\n\u001b[1;32m    935\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloss \u001b[38;5;241m=\u001b[39m CausalLMLoss()\n",
+      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36mPhiModel.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m    893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m    895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList(\u001b[43m[\u001b[49m\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_layer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m    897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
+      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m    895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList([\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(config\u001b[38;5;241m.\u001b[39mn_layer)])\n\u001b[1;32m    897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
+      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:757\u001b[0m, in \u001b[0;36mParallelBlock.__init__\u001b[0;34m(self, config, block_idx)\u001b[0m\n\u001b[1;32m    754\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresid_dropout \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mDropout(config\u001b[38;5;241m.\u001b[39mresid_pdrop)\n\u001b[1;32m    755\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_idx \u001b[38;5;241m=\u001b[39m block_idx\n\u001b[0;32m--> 757\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmixer \u001b[38;5;241m=\u001b[39m \u001b[43mMHA\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayer_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    758\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmlp \u001b[38;5;241m=\u001b[39m MLP(config)\n",
+      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:562\u001b[0m, in \u001b[0;36mMHA.__init__\u001b[0;34m(self, config, dtype, device, rotary_dim, rotary_base, rotary_scale_base, n_head, n_head_kv, head_dim, bias, causal, softmax_scale, layer_idx, return_residual, checkpointing)\u001b[0m\n\u001b[1;32m    559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m linear_cls \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    560\u001b[0m     linear_cls \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mLinear\n\u001b[0;32m--> 562\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mWqkv \u001b[38;5;241m=\u001b[39m \u001b[43mlinear_cls\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    563\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mout_proj \u001b[38;5;241m=\u001b[39m linear_cls(hidden_size, hidden_size, bias\u001b[38;5;241m=\u001b[39mbias, device\u001b[38;5;241m=\u001b[39mdevice, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[1;32m    565\u001b[0m \u001b[38;5;66;03m# Attention\u001b[39;00m\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:253\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._NF4DQLinear.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    252\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 253\u001b[0m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquant_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnf4\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompress_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:213\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._Linear4bit.__init__\u001b[0;34m(self, device, *args, **kwargs)\u001b[0m\n\u001b[1;32m    212\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, device: Optional[_DEVICE] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 213\u001b[0m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    214\u001b[0m     \u001b[38;5;66;03m# if the device is CUDA or we are under a CUDA context manager, quantize the weight here, so we don't end up\u001b[39;00m\n\u001b[1;32m    215\u001b[0m     \u001b[38;5;66;03m# filling the device memory with float32 weights which could lead to OOM\u001b[39;00m\n\u001b[1;32m    216\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;241m0\u001b[39m, device\u001b[38;5;241m=\u001b[39mdevice)\u001b[38;5;241m.\u001b[39mdevice\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
+      "\u001b[0;31mTypeError\u001b[0m: Linear4bit.__init__() got an unexpected keyword argument 'dtype'"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "    \n",
+    "\n",
+    "\n",
+    "\n",
+    "device = 'cuda'\n",
+    "lr = 4e-3\n",
+    "epochs = 3\n",
+    "accum_steps = 16\n",
+    "batch_size = 2\n",
+    "\n",
+    "Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
+    "dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
+    "Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
+    "dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
+    "\n",
+    "epoch_steps = len(dl_train)\n",
+    "\n",
+    "pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
+    "from lightning.pytorch.plugins import BitsandbytesPrecision\n",
+    "precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
+    "# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
+    "trainer = pl.Trainer(\n",
+    "        accelerator='gpu',\n",
+    "        max_epochs=epochs,\n",
+    "        # precision='',\n",
+    "        # precision=\"bf16-mixed\",\n",
+    "        log_every_n_steps=1,\n",
+    "        accumulate_grad_batches=accum_steps,\n",
+    "        plugins=precision\n",
+    "    )\n",
+    "\n",
+    "# train\n",
+    "trainer.fit(pl_model, dl_train, dl_val)\n",
+    "\n",
+    "model = pl_model.model\n",
+    "\n",
+    "df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
+    "display(df_hist)\n",
+    "plot_hist(df_hist)\n",
+    "\n",
+    "eval(model, tokenizer, second_half)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "1/0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Old"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch import optim\n",
+    "\n",
+    "\n",
+    "def lora_eval(model, tokenizer, sample):\n",
+    "    \"\"\"Train a LoRA adapter on the first half of `sample['text']`, then score the second half.\n",
+    "\n",
+    "    Training is next-token prediction, one prefix at a time, with gradients\n",
+    "    accumulated over `accum_steps` prefixes per optimizer step.\n",
+    "\n",
+    "    Args:\n",
+    "        model: causal LM to wrap with `get_peft_model` (targets `fc2` and `Wqkv`).\n",
+    "        tokenizer: tokenizer used to encode the text.\n",
+    "        sample: mapping with a 'text' entry.\n",
+    "\n",
+    "    Returns:\n",
+    "        Whatever `eval(model, tokenizer, second_half)` returns.\n",
+    "    \"\"\"\n",
+    "    # reset/set adapter (an IA3Config on the same target modules was tried as an alternative)\n",
+    "    # NOTE(review): `model` is wrapped again on every call - confirm the caller passes a\n",
+    "    # model whose adapter state from a previous sample cannot leak into this one.\n",
+    "    peft_config = LoraConfig(\n",
+    "        # task_type=TaskType.TOKEN_CLS, \n",
+    "        target_modules=[ \"fc2\",  \"Wqkv\",],\n",
+    "        inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias=\"all\"\n",
+    "    )\n",
+    "    model = get_peft_model(model, peft_config)\n",
+    "    model.config.use_cache = False\n",
+    "\n",
+    "    device = 'cuda'\n",
+    "    lr = 1.0e-2\n",
+    "    epochs = 3\n",
+    "    accum_steps = 64\n",
+    "\n",
+    "    # train adapter on the first half; the second half is held out for scoring\n",
+    "    s = sample['text']\n",
+    "    first_half = s[:len(s)//2]\n",
+    "    second_half = s[len(s)//2:]\n",
+    "    input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to(device)\n",
+    "\n",
+    "    # Upper bound on optimizer steps per epoch: the loop below takes\n",
+    "    # ceil((len(input_ids)-1)/accum_steps) steps, and OneCycleLR raises if it is\n",
+    "    # stepped more than total_steps times.\n",
+    "    epoch_steps = (len(input_ids)-1)//accum_steps+1\n",
+    "\n",
+    "    total_steps = epochs * epoch_steps\n",
+    "    optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
+    "    scheduler = optim.lr_scheduler.OneCycleLR(\n",
+    "            optimizer, lr, total_steps=total_steps\n",
+    "    )\n",
+    "    model.train()\n",
+    "    model = model.to(device)\n",
+    "    for epoch in range(epochs):\n",
+    "        # TODO: batch\n",
+    "        \n",
+    "        # Gradients are cleared once per accumulation window, not per sample;\n",
+    "        # clearing them before every forward pass would discard the accumulation.\n",
+    "        optimizer.zero_grad()\n",
+    "        accum = 0\n",
+    "        for i in range(1, len(input_ids)):\n",
+    "            X = input_ids[:i][None, ]\n",
+    "            targets = input_ids[i:i+1][None, ]\n",
+    "            out = model(input_ids=X, \n",
+    "                        )\n",
+    "            logits = out['logits'][:, -1]\n",
+    "            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
+    "            # scale so the accumulated gradient is a mean over the window, not a sum\n",
+    "            (loss / accum_steps).backward()\n",
+    "            accum += 1\n",
+    "            if accum == accum_steps:\n",
+    "                optimizer.step()\n",
+    "                scheduler.step()\n",
+    "                optimizer.zero_grad()\n",
+    "                accum = 0\n",
+    "        if accum > 0:\n",
+    "            # flush the final, partially filled window\n",
+    "            optimizer.step()\n",
+    "            scheduler.step()\n",
+    "            optimizer.zero_grad()\n",
+    "\n",
+    "    return eval(model, tokenizer, second_half)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = []\n",
+    "for sample in tqdm(samples):\n",
+    "    r = lora_eval(model, tokenizer, sample)\n",
+    "    print(sample['name'], r)\n",
+    "    r.update(sample)\n",
+    "    data.append(r)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
+    "df = pd.DataFrame(data).set_index('name')\n",
+    "\n",
+    "df['learning'] = (df['before']-df['after'])/df['before']\n",
+    "df.sort_values('learning').drop(columns=['text', 'url'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0rc1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,597 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from torch import optim\n",
+    "import lightning as pl\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "import pandas as pd\n",
+    "import warnings\n",
+    "from peft import LoraConfig, get_peft_model, IA3Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.style.use('ggplot')\n",
+    "torch.set_float32_matmul_precision('medium')\n",
+    "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
+    "\n",
+    "model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "# model = AutoModelForCausalLM.from_pretrained(\n",
+    "#     model_name,\n",
+    "#     # max_memory=max_memory,\n",
+    "#     quantization_config=BitsAndBytesConfig(\n",
+    "#         load_in_4bit=True,\n",
+    "#         llm_int8_threshold=6.0,\n",
+    "#         llm_int8_has_fp16_weight=False,\n",
+    "#         bnb_4bit_compute_dtype=torch.float16,\n",
+    "#         bnb_4bit_use_double_quant=True,\n",
+    "#         bnb_4bit_quant_type=\"nf4\",\n",
+    "#     ),\n",
+    "#     torch_dtype=torch.float16,\n",
+    "#     trust_remote_code=True,\n",
+    "# )\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"TheBloke/phi-2-GPTQ\"\n",
+    "# model_name = \"microsoft/phi-2\"\n",
+    "\n",
+    "def load_model():\n",
+    "    \"\"\"Load the causal LM named by the global `model_name` in bfloat16.\n",
+    "\n",
+    "    The checkpoint's quantization config is edited before loading so that the\n",
+    "    exllama kernels are switched off (`use_exllama=False`, `disable_exllama=True`).\n",
+    "\n",
+    "    Returns:\n",
+    "        The model returned by `AutoModelForCausalLM.from_pretrained`.\n",
+    "    \"\"\"\n",
+    "    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
+    "    for key, value in ((\"use_exllama\", False), (\"disable_exllama\", True)):\n",
+    "        cfg.quantization_config[key] = value\n",
+    "\n",
+    "    return AutoModelForCausalLM.from_pretrained(\n",
+    "        model_name,\n",
+    "        config=cfg,\n",
+    "        torch_dtype=torch.bfloat16,\n",
+    "        trust_remote_code=True,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "CUDA extension not installed.\n",
+      "CUDA extension not installed.\n"
+     ]
+    }
+   ],
+   "source": [
+    "base_model = load_model()\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
+    "tokenizer.pad_token = tokenizer.eos_token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def reset_model(base_model):\n",
+    "    \"\"\"Wrap `base_model` with a LoRA adapter (r=8, alpha=8) on `fc2` and `Wqkv`.\n",
+    "\n",
+    "    The KV cache is turned off on the wrapped model's config.\n",
+    "\n",
+    "    Returns:\n",
+    "        The model returned by `get_peft_model`.\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): the same `base_model` object is passed in on every call - confirm\n",
+    "    # that re-wrapping it really starts from freshly initialised adapter weights.\n",
+    "    lora_cfg = LoraConfig(\n",
+    "        # task_type=TaskType.TOKEN_CLS, lora_dropout=0.1 and bias=\"all\" were tried and are disabled\n",
+    "        target_modules=[\"fc2\", \"Wqkv\"],\n",
+    "        inference_mode=False,\n",
+    "        r=8,\n",
+    "        lora_alpha=8,\n",
+    "    )\n",
+    "    peft_model = get_peft_model(base_model, lora_cfg)\n",
+    "    peft_model.config.use_cache = False\n",
+    "    return peft_model\n",
+    "\n",
+    "model = reset_model(base_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "MAX_LEN = 2000\n",
+    "# context manager so the file handle is closed once the samples are parsed\n",
+    "with open(\"../samples.json\") as f:\n",
+    "    samples = json.load(f)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
+    "\n",
+    "# from evaluate.measurements.perplexity import Perplexity\n",
+    "import evaluate\n",
+    "from evaluate import logging\n",
+    "from torch.nn import CrossEntropyLoss\n",
+    "\n",
+    "# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
+    "def perplexity_compute(\n",
+    "    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
+    "):\n",
+    "    \"\"\"Compute the perplexity of each text in `data` under `model`.\n",
+    "\n",
+    "    Adapted from the `evaluate` perplexity measurement, but takes an already loaded\n",
+    "    model and tokenizer, and never prepends a BOS token.\n",
+    "\n",
+    "    Args:\n",
+    "        data: text(s) handed directly to `tokenizer`.\n",
+    "        model: causal LM whose output exposes `.logits`; it is moved to `device`.\n",
+    "        tokenizer: tokenizer; called with `padding=True`.\n",
+    "        batch_size: number of encoded texts per forward pass.\n",
+    "        add_start_token: kept for interface compatibility; no start token is added.\n",
+    "        device: one of \"gpu\", \"cuda\", \"cpu\"; defaults to cuda when available, else cpu.\n",
+    "        max_length: if set, texts are truncated to this many tokens.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with \"perplexities\" (list of floats, one per text) and\n",
+    "        \"mean_perplexity\" (0-d tensor).\n",
+    "\n",
+    "    Raises:\n",
+    "        AssertionError: on an unknown `device` or a text shorter than two tokens.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    if device is not None:\n",
+    "        assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
+    "        if device == \"gpu\":\n",
+    "            device = \"cuda\"\n",
+    "    else:\n",
+    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
+    "    model = model.to(device)\n",
+    "\n",
+    "    # No BOS token is prepended, so no position is reserved for it.\n",
+    "    max_tokenized_len = max_length\n",
+    "\n",
+    "    encodings = tokenizer(\n",
+    "        data,\n",
+    "        add_special_tokens=False,\n",
+    "        padding=True,\n",
+    "        truncation=True if max_tokenized_len else False,\n",
+    "        max_length=max_tokenized_len,\n",
+    "        return_tensors=\"pt\",\n",
+    "        return_attention_mask=True,\n",
+    "    ).to(device)\n",
+    "\n",
+    "    encoded_texts = encodings[\"input_ids\"]\n",
+    "    attn_masks = encodings[\"attention_mask\"]\n",
+    "\n",
+    "    # Next-token loss needs at least one (context, target) pair per text. With fewer\n",
+    "    # than two tokens the masked mean below is 0/0 and the perplexity becomes NaN,\n",
+    "    # so two tokens are required regardless of `add_start_token`.\n",
+    "    assert torch.all(\n",
+    "        torch.ge(attn_masks.sum(1), 2)\n",
+    "    ), \"Each input text must be at least two tokens long (no start token is added). Remove empty or single-token input strings.\"\n",
+    "\n",
+    "    ppls = []\n",
+    "    loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
+    "\n",
+    "    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
+    "        end_index = min(start_index + batch_size, len(encoded_texts))\n",
+    "        encoded_batch = encoded_texts[start_index:end_index]\n",
+    "        attn_mask = attn_masks[start_index:end_index]\n",
+    "\n",
+    "        labels = encoded_batch\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
+    "\n",
+    "        # logits at position t are scored against the token at position t+1\n",
+    "        shift_logits = out_logits[..., :-1, :].contiguous()\n",
+    "        shift_labels = labels[..., 1:].contiguous()\n",
+    "        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
+    "\n",
+    "        # exp of the mean per-token loss, counting only unmasked target positions\n",
+    "        perplexity_batch = torch.exp(\n",
+    "            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
+    "            / shift_attention_mask_batch.sum(1)\n",
+    "        )\n",
+    "\n",
+    "        ppls += perplexity_batch.tolist()\n",
+    "\n",
+    "    return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# perplexity_compute(\n",
+    "#     second_half, model, tokenizer\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.nn import functional as F\n",
+    "from torch.utils.data import DataLoader, TensorDataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lightning helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "def str2xya(s, tokenizer):\n",
+    "    \"\"\"Expand text `s` into one left-padded next-token example per target position.\n",
+    "\n",
+    "    For each position i >= 1 the example holds the (at most `max_len`) tokens before\n",
+    "    i as `input_ids`, the token at i as `label_ids`, and an `attention_mask` that is\n",
+    "    1 on real tokens and 0 on the left padding.\n",
+    "\n",
+    "    Args:\n",
+    "        s: text to encode.\n",
+    "        tokenizer: tokenizer; its `bos_token_id` is used as the padding id.\n",
+    "\n",
+    "    Returns:\n",
+    "        list of dicts with keys `input_ids`, `label_ids`, `attention_mask`.\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): len(s) counts characters, not tokens - confirm this cap is intended\n",
+    "    max_len = min(MAX_LEN, len(s))\n",
+    "    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0]\n",
+    "\n",
+    "    pad = tokenizer.bos_token_id\n",
+    "    data = []\n",
+    "    for i in range(1, len(input_ids)):\n",
+    "        x = input_ids[:i][-max_len:]\n",
+    "        padding = max_len - len(x)\n",
+    "        # Build the mask from the padding count, not from `x == pad`: padded positions\n",
+    "        # must be 0 and real tokens 1, even when a real token equals the pad id.\n",
+    "        attention_mask = torch.tensor([0]*padding + [1]*len(x))\n",
+    "        x = torch.tensor([pad]*padding + x.tolist())\n",
+    "\n",
+    "        label_ids = input_ids[i:i+1]\n",
+    "        data.append(dict(input_ids=x, label_ids=label_ids, attention_mask=attention_mask))\n",
+    "        \n",
+    "    return data\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def eval(model, tokenizer, second_half):\n",
+    "    \"\"\"Perplexity of `second_half` with the adapter disabled ('before') and enabled ('after').\n",
+    "\n",
+    "    The name shadows the builtin `eval`; it is kept because other cells call it.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with float entries `before` and `after`.\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    ppl_kwargs = dict(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
+    "    with torch.no_grad():\n",
+    "        # base model first (adapter switched off), then the adapted model\n",
+    "        with model.disable_adapter():\n",
+    "            ppl_before = perplexity_compute(**ppl_kwargs)['mean_perplexity'].item()\n",
+    "        ppl_after = perplexity_compute(**ppl_kwargs)['mean_perplexity'].item()\n",
+    "    return dict(before=ppl_before, after=ppl_after)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def learn_sample(sample):\n",
+    "    \"\"\"Fine-tune a fresh LoRA adapter on the first half of a text and score the second half.\n",
+    "\n",
+    "    Uses the notebook globals `tokenizer` and `base_model`.\n",
+    "\n",
+    "    Args:\n",
+    "        sample: mapping with a 'text' entry.\n",
+    "\n",
+    "    Returns:\n",
+    "        The dict returned by `eval(model, tokenizer, second_half)` after training.\n",
+    "    \"\"\"\n",
+    "    # The hyperparameters that take effect are the ones passed to TrainingArguments below.\n",
+    "    batch_size = 1\n",
+    "\n",
+    "    s = sample['text']\n",
+    "    first_half = s[:len(s)//2]\n",
+    "    second_half = s[len(s)//2:]\n",
+    "    ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
+    "    ds_val = Dataset.from_dict(tokenizer([second_half]))\n",
+    "\n",
+    "    # NOTE(review): this only takes effect if CUDA has not been initialised yet in this\n",
+    "    # process, and it hard-codes GPU index 1 - confirm that is intended.\n",
+    "    os.environ['CUDA_VISIBLE_DEVICES']=\"1\"\n",
+    "    verbose = False\n",
+    "    model = reset_model(base_model)\n",
+    "    # result intentionally unused; the call is kept because it also moves the model to the GPU\n",
+    "    eval(model, tokenizer, second_half)\n",
+    "    trainer = transformers.Trainer(\n",
+    "        model=model,\n",
+    "        train_dataset=ds_train,\n",
+    "        eval_dataset=ds_val,\n",
+    "        args=transformers.TrainingArguments(\n",
+    "            per_device_train_batch_size=batch_size,\n",
+    "            gradient_accumulation_steps=8,\n",
+    "            warmup_steps=0,\n",
+    "            max_steps=40,\n",
+    "            learning_rate=3e-4,\n",
+    "            fp16=True,\n",
+    "            logging_steps=1,\n",
+    "            output_dir=\"outputs\",\n",
+    "            log_level='error',\n",
+    "            disable_tqdm=not verbose,\n",
+    "        ),\n",
+    "        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
+    "    )\n",
+    "    # NOTE(review): overrides a private Trainer attribute - verify against the installed version\n",
+    "    trainer._signature_columns = ['input_ids', 'attention_mask', 'label_ids']\n",
+    "    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!\n",
+    "    trainer.train()\n",
+    "\n",
+    "    if verbose:\n",
+    "        # one plot per metric logged at epoch granularity\n",
+    "        df_hist = pd.DataFrame(trainer.state.log_history)\n",
+    "        df_hist_epoch = df_hist.groupby('epoch').last().dropna(axis=1).drop(columns=['step'])\n",
+    "        for c in df_hist_epoch.columns:\n",
+    "            df_hist_epoch[[c]].plot()\n",
+    "\n",
+    "\n",
+    "    result = eval(model, tokenizer, second_half)\n",
+    "    return result\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train and evaluate on every sample, keeping the metrics alongside the sample's own fields.\n",
+    "data = []\n",
+    "for sample in samples:\n",
+    "    print(sample['name'])\n",
+    "    metrics = learn_sample(sample)\n",
+    "    print(metrics)\n",
+    "    merged_row = dict(**metrics, **sample)\n",
+    "    data.append(merged_row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary table: improvement = before - after for each sample.\n",
+    "df_res = (\n",
+    "    pd.DataFrame(data)[['before', 'after', 'name', 'in_training']]\n",
+    "    .assign(improvement=lambda frame: frame['before'] - frame['after'])\n",
+    ")\n",
+    "df_res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# DEBUG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display, HTML, Markdown\n",
+    "import torch\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def gen(model, inputs, tokenizer, clean=True):\n",
+    "    \"\"\"Generate 100 new tokens after a prompt and display prompt + continuation as HTML.\n",
+    "\n",
+    "    Args:\n",
+    "        model: object exposing `.generate(...)` and `.device`.\n",
+    "        inputs: mapping with un-batched (1-D) 'input_ids' and 'attention_mask'\n",
+    "            tensors; a batch dimension is added here.\n",
+    "        tokenizer: tokenizer used to decode the generated ids.\n",
+    "        clean: passed as both `clean_up_tokenization_spaces` and\n",
+    "            `skip_special_tokens` when decoding.\n",
+    "\n",
+    "    Returns:\n",
+    "        None. The prompt is shown as plain text, the continuation in bold.\n",
+    "    \"\"\"\n",
+    "    import html  # local import: only needed to escape the decoded text\n",
+    "\n",
+    "    s = model.generate(\n",
+    "        input_ids=inputs[\"input_ids\"][None, :].to(model.device),\n",
+    "        attention_mask=inputs[\"attention_mask\"][None, :].to(model.device),\n",
+    "        use_cache=False,\n",
+    "        max_new_tokens=100,\n",
+    "        min_new_tokens=100,\n",
+    "        do_sample=False,\n",
+    "        early_stopping=False,\n",
+    "    )\n",
+    "    # The output starts with the prompt, so split at the prompt length.\n",
+    "    input_l = inputs[\"input_ids\"].shape[0]\n",
+    "    tokenizer_kwargs = dict(clean_up_tokenization_spaces=clean, skip_special_tokens=clean)\n",
+    "    old = tokenizer.decode(s[0, :input_l], **tokenizer_kwargs)\n",
+    "    new = tokenizer.decode(s[0, input_l:], **tokenizer_kwargs)\n",
+    "\n",
+    "    def to_html(text):\n",
+    "        # Escape first so '<', '>' and '&' in the text are shown literally, then add breaks.\n",
+    "        return html.escape(text, quote=False).replace('\\n', '<br>')\n",
+    "\n",
+    "    display(HTML(f\"{to_html(old)}<b>{to_html(new)}</b><br><br>\"))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick one sample to inspect. Bind it to `sample` so the `samples` list stays intact\n",
+    "# and this cell can be re-run.\n",
+    "sample = samples[1]\n",
+    "\n",
+    "s = sample['text']\n",
+    "first_half = s[:len(s)//2]\n",
+    "second_half = s[len(s)//2:]\n",
+    "ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
+    "ds_val = Dataset.from_dict(tokenizer([second_half]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Continuation produced with the adapter switched off.\n",
+    "prompt_inputs = ds_train.with_format('pt')[0]\n",
+    "with model.disable_adapter():\n",
+    "    gen(model, prompt_inputs, tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Continuation produced with the model as-is (adapter not disabled).\n",
+    "prompt_inputs = ds_train.with_format('pt')[0]\n",
+    "gen(model, prompt_inputs, tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0rc1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}