misc

2026-06-27 16:45:29 +08:00 · 2024-01-03 08:10:05 +08:00
parent 331e534ff7
commit e087051711
5 changed files with 1890 additions and 3806 deletions
@@ -1,5 +1,5 @@
 .env
-
+lightning_logs/

 *.arrow
 squad_*
@@ -44,7 +44,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]\n",
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
@@ -56,15 +56,15 @@
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_name,\n",
    "    # max_memory=max_memory,\n",
-    "    # quantization_config=BitsAndBytesConfig(\n",
-    "    #     load_in_4bit=True,\n",
-    "    #     llm_int8_threshold=6.0,\n",
-    "    #     llm_int8_has_fp16_weight=False,\n",
-    "    #     bnb_4bit_compute_dtype=torch.float16,\n",
-    "    #     bnb_4bit_use_double_quant=True,\n",
-    "    #     bnb_4bit_quant_type=\"nf4\",\n",
-    "    # ),\n",
-    "    # torch_dtype=torch.float16,\n",
+    "    quantization_config=BitsAndBytesConfig(\n",
+    "        load_in_4bit=True,\n",
+    "        llm_int8_threshold=6.0,\n",
+    "        llm_int8_has_fp16_weight=False,\n",
+    "        bnb_4bit_compute_dtype=torch.float16,\n",
+    "        bnb_4bit_use_double_quant=True,\n",
+    "        bnb_4bit_quant_type=\"nf4\",\n",
+    "    ),\n",
+    "    torch_dtype=torch.float16,\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "\n",
@@ -267,12 +267,418 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from torch.nn import functional as F"
+    "from torch.nn import functional as F\n",
+    "from torch.utils.data import DataLoader, TensorDataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lightning helpers"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'matplotlib'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m optim\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlightning\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'"
+     ]
+    }
+   ],
+   "source": [
+    "from torch import optim\n",
+    "import lightning as pl\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = samples[0]\n",
+    "s = sample['text']\n",
+    "# cut the text at its character midpoint: one half per tuple slot\n",
+    "first_half, second_half = s[:len(s)//2], s[len(s)//2:]\n",
+    "\n",
+    "def str2xya(s):\n",
+    "    \"\"\"Expand a string into left-padded next-token prediction examples.\n",
+    "\n",
+    "    The string is tokenized once with the notebook-level ``tokenizer``. For every\n",
+    "    position ``i >= 1`` one example is produced: the input is the prefix\n",
+    "    ``input_ids[:i]`` left-padded with ``tokenizer.bos_token_id`` up to the full\n",
+    "    token count, and the target is the single token ``input_ids[i]``.\n",
+    "\n",
+    "    Returns:\n",
+    "        Xs: integer tensor (n_tokens - 1, n_tokens) of padded prefixes.\n",
+    "        Ys: integer tensor (n_tokens - 1, 1) of next-token ids.\n",
+    "        attention_masks: integer tensor shaped like ``Xs``; 1 marks a real\n",
+    "            token and 0 marks padding.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if ``s`` tokenizes to fewer than two tokens, since no\n",
+    "            (prefix, next-token) pair can be formed.\n",
+    "    \"\"\"\n",
+    "    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
+    "    n_tokens = len(input_ids)\n",
+    "    if n_tokens < 2:\n",
+    "        raise ValueError('str2xya needs at least two tokens to build training pairs')\n",
+    "\n",
+    "    pad = tokenizer.bos_token_id\n",
+    "    Xs, Ys, masks = [], [], []\n",
+    "    for i in range(1, n_tokens):\n",
+    "        padding = n_tokens - i\n",
+    "        Xs.append([pad] * padding + input_ids[:i])\n",
+    "        Ys.append(input_ids[i:i+1])\n",
+    "        # The mask comes from the known pad count, not from comparing ids with\n",
+    "        # `pad`: a real token may share the pad id, and the previous\n",
+    "        # `(x == pad) * 1` was inverted (it attended to padding only).\n",
+    "        masks.append([0] * padding + [1] * i)\n",
+    "\n",
+    "    return torch.tensor(Xs), torch.tensor(Ys), torch.tensor(masks)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def eval(model, tokenizer, second_half, device='cuda'):\n",
+    "    \"\"\"Compare perplexity on `second_half` with the adapter disabled vs enabled.\n",
+    "\n",
+    "    NOTE: this name shadows Python's builtin `eval` for the rest of the notebook;\n",
+    "    it is kept because other cells call it under this name.\n",
+    "\n",
+    "    Args:\n",
+    "        model: adapter-wrapped model exposing `disable_adapter()` as a context manager.\n",
+    "        tokenizer: forwarded unchanged to `perplexity_compute`.\n",
+    "        second_half: text forwarded as `data` to `perplexity_compute`.\n",
+    "        device: device string forwarded to `perplexity_compute` (default 'cuda').\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with `before` (adapter disabled) and `after` (adapter active), each the\n",
+    "        `mean_perplexity` entry of the result converted with `.item()`.\n",
+    "\n",
+    "    Side effect: the model is left in eval mode.\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    with torch.no_grad():\n",
+    "        with model.disable_adapter():\n",
+    "            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
+    "\n",
+    "def read_metrics_csv(metrics_file_path):\n",
+    "    df_hist = pd.read_csv(metrics_file_path)\n",
+    "    df_hist[\"epoch\"] = df_hist[\"epoch\"].ffill()\n",
+    "    df_histe = df_hist.set_index(\"epoch\").groupby(\"epoch\").mean()\n",
+    "    return df_histe, df_hist\n",
+    "\n",
+    "\n",
+    "def plot_hist(df_hist, allowlist=None, logy=False):\n",
+    "    \"\"\"Plot metric columns grouped by the part of their name after the last '/'.\n",
+    "\n",
+    "    Columns such as 'train/loss_step' and 'val/loss_step' share the suffix\n",
+    "    'loss_step' and are drawn on one figure. Columns without a '/' are ignored.\n",
+    "\n",
+    "    Args:\n",
+    "        df_hist: DataFrame whose column names are strings.\n",
+    "        allowlist: optional collection of suffixes to plot; falsy plots all.\n",
+    "        logy: forwarded to `DataFrame.plot` for a log-scaled y axis.\n",
+    "    \"\"\"\n",
+    "    # Group on the exact suffix: `endswith` would also pull 'train/aux_loss'\n",
+    "    # into the 'loss' figure.\n",
+    "    columns_by_suffix = {}\n",
+    "    for c in df_hist.columns:\n",
+    "        if '/' in c:\n",
+    "            columns_by_suffix.setdefault(c.split('/')[-1], []).append(c)\n",
+    "    # sorted() gives a stable figure order across runs (set order is arbitrary)\n",
+    "    for suffix in sorted(columns_by_suffix):\n",
+    "        if allowlist and suffix not in allowlist: continue\n",
+    "        df_hist[columns_by_suffix[suffix]].plot(title=suffix, style='.', logy=logy)\n",
+    "        plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "class PL_MODEL(pl.LightningModule):\n",
+    "    \"\"\"Lightning wrapper that trains a causal LM to predict the next token.\n",
+    "\n",
+    "    Each batch is `(input_ids, targets, attention_mask)`; the logits at the last\n",
+    "    sequence position are scored against `targets` with cross-entropy.\n",
+    "\n",
+    "    Args:\n",
+    "        model: the wrapped model; called with `input_ids` and `attention_mask`\n",
+    "            and expected to return an object with a `.logits` attribute.\n",
+    "        num_iterations: `total_steps` for OneCycleLR. The scheduler is stepped\n",
+    "            once per optimizer step, so this must be at least the number of\n",
+    "            optimizer steps taken or OneCycleLR raises.\n",
+    "        lr: peak learning rate for AdamW / OneCycleLR.\n",
+    "        weight_decay: AdamW weight decay.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, model, num_iterations, lr=3e-4, weight_decay=0,):\n",
+    "        super().__init__()\n",
+    "        self._model = model\n",
+    "        # the model itself is excluded from the saved hyperparameters\n",
+    "        self.save_hyperparameters(ignore=['model'])\n",
+    "\n",
+    "    def forward(self, **kwargs):\n",
+    "        \"\"\"Forward all keyword arguments to the wrapped model.\"\"\"\n",
+    "        return self._model(**kwargs)\n",
+    "\n",
+    "    def _shared_step(self, batch, batch_idx, phase='train'):\n",
+    "        \"\"\"Compute and log the next-token loss under the `{phase}/loss` key.\"\"\"\n",
+    "        input_ids, targets, attention_mask = batch\n",
+    "        output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
+    "        # Logits at the final position, one row per example. Targets are class\n",
+    "        # indices, so the loss is cross-entropy; the previous smooth_l1_loss\n",
+    "        # broadcast the ids against the logits and regressed onto id values.\n",
+    "        last_logits = output.logits[:, -1].float()\n",
+    "        loss = F.cross_entropy(last_logits, targets.reshape(-1))\n",
+    "        self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
+    "        return loss\n",
+    "\n",
+    "    def training_step(self, batch, batch_idx):\n",
+    "        return self._shared_step(batch, batch_idx, phase='train')\n",
+    "\n",
+    "    def validation_step(self, batch, batch_idx):\n",
+    "        return self._shared_step(batch, batch_idx, phase='val')\n",
+    "\n",
+    "    def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
+    "        return self._shared_step(batch, batch_idx, phase='test')\n",
+    "\n",
+    "    def configure_optimizers(self):\n",
+    "        \"\"\"AdamW with a OneCycleLR schedule stepped after every optimizer step.\"\"\"\n",
+    "        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
+    "        lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
+    "            optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
+    "        )\n",
+    "        # Lightning steps schedulers once per epoch unless told otherwise, but\n",
+    "        # OneCycleLR's total_steps counts optimizer steps.\n",
+    "        return {\n",
+    "            'optimizer': optimizer,\n",
+    "            'lr_scheduler': {'scheduler': lr_scheduler, 'interval': 'step'},\n",
+    "        }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
+      "Using bfloat16 Automatic Mixed Precision (AMP)\n",
+      "GPU available: True (cuda), used: True\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "IPU available: False, using: 0 IPUs\n",
+      "HPU available: False, using: 0 HPUs\n",
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
+      "You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
+      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]\n",
+      "\n",
+      "  | Name   | Type      | Params\n",
+      "-------------------------------------\n",
+      "0 | _model | PeftModel | 1.5 B \n",
+      "-------------------------------------\n",
+      "11.8 M    Trainable params\n",
+      "1.5 B     Non-trainable params\n",
+      "1.5 B     Total params\n",
+      "6,132.756 Total estimated model params size (MB)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.\n",
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.66it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([4, 1])) that is different to the input size (torch.Size([4, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
+      "  loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                                           "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0: 100%|██████████| 52/52 [00:26<00:00,  1.94it/s, v_num=9, train/loss_step=381.0]  "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([1, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
+      "  loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 2: 100%|██████████| 52/52 [00:40<00:00,  1.30it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]  "
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "`Trainer.fit` stopped: `max_epochs=3` reached.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 2: 100%|██████████| 52/52 [00:42<00:00,  1.22it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>train/loss_step</th>\n",
+       "      <th>step</th>\n",
+       "      <th>val/loss_step</th>\n",
+       "      <th>val/loss_epoch</th>\n",
+       "      <th>train/loss_epoch</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>epoch</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0.0</th>\n",
+       "      <td>3319.602325</td>\n",
+       "      <td>22.278689</td>\n",
+       "      <td>6775.631730</td>\n",
+       "      <td>6775.630859</td>\n",
+       "      <td>6645.847656</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1.0</th>\n",
+       "      <td>4132.752668</td>\n",
+       "      <td>67.639344</td>\n",
+       "      <td>6772.301628</td>\n",
+       "      <td>6772.301758</td>\n",
+       "      <td>6642.495605</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2.0</th>\n",
+       "      <td>5096.813714</td>\n",
+       "      <td>113.000000</td>\n",
+       "      <td>6769.673500</td>\n",
+       "      <td>6769.673828</td>\n",
+       "      <td>6639.772949</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       train/loss_step        step  val/loss_step  val/loss_epoch  \\\n",
+       "epoch                                                               \n",
+       "0.0        3319.602325   22.278689    6775.631730     6775.630859   \n",
+       "1.0        4132.752668   67.639344    6772.301628     6772.301758   \n",
+       "2.0        5096.813714  113.000000    6769.673500     6769.673828   \n",
+       "\n",
+       "       train/loss_epoch  \n",
+       "epoch                    \n",
+       "0.0         6645.847656  \n",
+       "1.0         6642.495605  \n",
+       "2.0         6639.772949  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "ImportError",
+     "evalue": "matplotlib is required for plotting when the default backend \"matplotlib\" is selected.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[10], line 85\u001b[0m\n\u001b[1;32m     83\u001b[0m df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n\u001b[1;32m     84\u001b[0m display(df_hist)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mplot_hist\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;28meval\u001b[39m(model, tokenizer, second_half)\n",
+      "Cell \u001b[0;32mIn[10], line 79\u001b[0m, in \u001b[0;36mplot_hist\u001b[0;34m(df_hist, allowlist, logy)\u001b[0m\n\u001b[1;32m     77\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m suffix \u001b[38;5;129;01min\u001b[39;00m suffixes:\n\u001b[1;32m     78\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m allowlist \u001b[38;5;129;01mand\u001b[39;00m suffix \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m allowlist: \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m---> 79\u001b[0m     \u001b[43mdf_hist\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendswith\u001b[49m\u001b[43m(\u001b[49m\u001b[43msuffix\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtitle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstyle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     80\u001b[0m     
plt\u001b[38;5;241m.\u001b[39mtitle(suffix)   \n\u001b[1;32m     81\u001b[0m     plt\u001b[38;5;241m.\u001b[39mshow()\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:951\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    950\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 951\u001b[0m     plot_backend \u001b[38;5;241m=\u001b[39m \u001b[43m_get_plot_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbackend\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    953\u001b[0m     x, y, kind, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_call_args(\n\u001b[1;32m    954\u001b[0m         plot_backend\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, args, kwargs\n\u001b[1;32m    955\u001b[0m     )\n\u001b[1;32m    957\u001b[0m     kind \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_kind_aliases\u001b[38;5;241m.\u001b[39mget(kind, kind)\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1947\u001b[0m, in \u001b[0;36m_get_plot_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m   1944\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend_str \u001b[38;5;129;01min\u001b[39;00m _backends:\n\u001b[1;32m   1945\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _backends[backend_str]\n\u001b[0;32m-> 1947\u001b[0m module \u001b[38;5;241m=\u001b[39m \u001b[43m_load_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbackend_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1948\u001b[0m _backends[backend_str] \u001b[38;5;241m=\u001b[39m module\n\u001b[1;32m   1949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n",
+      "File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1877\u001b[0m, in \u001b[0;36m_load_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m   1875\u001b[0m         module \u001b[38;5;241m=\u001b[39m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpandas.plotting._matplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1876\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m-> 1877\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m   1878\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib is required for plotting when the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1879\u001b[0m             \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdefault backend \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is selected.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m   1880\u001b[0m         ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1881\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m module\n\u001b[1;32m   1883\u001b[0m found_backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
+      "\u001b[0;31mImportError\u001b[0m: matplotlib is required for plotting when the default backend \"matplotlib\" is selected."
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "    \n",
+    "\n",
+    "# Wrap the base model with a LoRA adapter on the listed modules.\n",
+    "# NOTE: this rebinds `model`, so re-running the cell wraps the model again.\n",
+    "peft_config = LoraConfig(\n",
+    "    # task_type=TaskType.TOKEN_CLS, \n",
+    "    target_modules=[ \"fc2\",  \"Wqkv\",],\n",
+    "    inference_mode=False, r=16, lora_alpha=16, \n",
+    "    # lora_dropout=0.1,\n",
+    "    # bias=\"all\"\n",
+    ")\n",
+    "model = get_peft_model(model, peft_config)\n",
+    "model.config.use_cache = False\n",
+    "\n",
+    "device = 'cuda'\n",
+    "lr = 4e-3\n",
+    "epochs = 3\n",
+    "accum_steps = 64\n",
+    "batch_size = 4\n",
+    "# Accumulation used by the Lightning Trainer below. Kept separate from\n",
+    "# `accum_steps` so cells that read that name see the same value as before.\n",
+    "pl_accum_steps = 8\n",
+    "\n",
+    "Xs, Ys, attention_masks = str2xya(first_half)\n",
+    "dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
+    "Xs, Ys, attention_masks = str2xya(second_half)\n",
+    "# validation needs no shuffling (Lightning warns when it is enabled)\n",
+    "dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
+    "\n",
+    "epoch_steps = len(dl_train)\n",
+    "# optimizer steps per epoch = ceil(batches / accumulation)\n",
+    "optimizer_steps = -(-epoch_steps // pl_accum_steps) * epochs\n",
+    "\n",
+    "pl_model = PL_MODEL(model, num_iterations=optimizer_steps, lr=lr, weight_decay=0)\n",
+    "trainer = pl.Trainer(\n",
+    "        max_epochs=epochs,\n",
+    "        precision=\"bf16-mixed\",\n",
+    "        log_every_n_steps=1,\n",
+    "        accumulate_grad_batches=pl_accum_steps,\n",
+    "    )\n",
+    "\n",
+    "# train\n",
+    "trainer.fit(pl_model, dl_train, dl_val)\n",
+    "\n",
+    "# read_metrics_csv returns (per-epoch means, per-row history); fill the gaps\n",
+    "# on each frame rather than on the returned tuple.\n",
+    "df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path)\n",
+    "df_histe = df_histe.bfill().ffill()\n",
+    "df_hist = df_hist.bfill().ffill()\n",
+    "display(df_histe)\n",
+    "plot_hist(df_hist)\n",
+    "\n",
+    "eval(model, tokenizer, second_half)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -338,15 +744,7 @@
    "            scheduler.step()\n",
    "            optimizer.zero_grad()\n",
    "\n",
-    "    # eval\n",
-    "    model.eval();\n",
-    "    with torch.no_grad():\n",
-    "        with model.disable_adapter():\n",
-    "            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
-    "            results['mean_perplexity']\n",
-    "        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
-    "\n",
-    "    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
+    "    return eval(model, tokenizer, second_half)\n",
    "\n"
   ]
  },
@@ -359,218 +757,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|          | 0/12 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/peft/tuners/lora/model.py:402: UserWarning: Careful, disabling adapter layers with bias configured to be 'all' does not produce the same output as the the base model would without adaption.\n",
-      "  warnings.warn(msg)\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.42it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.36it/s]\n",
-      "  8%|▊         | 1/12 [01:21<14:54, 81.32s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bad_ml {'before': 17.1319522857666, 'after': 17.076616287231445}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  9.21it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  9.19it/s]\n",
-      " 17%|█▋        | 2/12 [01:43<07:45, 46.58s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "good_ml {'before': 48.654518127441406, 'after': 48.63978576660156}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  6.00it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.20it/s]\n",
-      " 25%|██▌       | 3/12 [03:26<10:49, 72.18s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "sokal hoax {'before': 29.55867576599121, 'after': 29.561065673828125}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  7.25it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  7.55it/s]\n",
-      " 33%|███▎      | 4/12 [04:10<08:10, 61.34s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Theory o. general relativity {'before': 48.4825553894043, 'after': 48.46034622192383}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  8.83it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  9.19it/s]\n",
-      " 42%|████▏     | 5/12 [04:29<05:21, 45.97s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "lorem ipsum  {'before': 243.0447540283203, 'after': 238.47674560546875}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  8.78it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  8.88it/s]\n",
-      " 50%|█████     | 6/12 [05:05<04:14, 42.40s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "wikipedia on LK-99 {'before': 53.24197006225586, 'after': 53.270263671875}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  8.77it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  8.92it/s]\n",
-      " 58%|█████▊    | 7/12 [05:30<03:03, 36.69s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "I have a dream {'before': 18.867136001586914, 'after': 18.801422119140625}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  6.27it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.20it/s]\n",
-      " 67%|██████▋   | 8/12 [06:40<03:09, 47.34s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "AI gen fake paper {'before': 11.114971160888672, 'after': 11.109580039978027}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  6.13it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.07it/s]\n",
-      " 75%|███████▌  | 9/12 [08:43<03:32, 70.94s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Schmidhuber 2023 Subjective Novelty, Surprise {'before': 67.33682250976562, 'after': 67.20210266113281}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  6.77it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  6.85it/s]\n",
-      " 83%|████████▎ | 10/12 [09:50<02:19, 69.77s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "email_to_fauci {'before': 55.9570198059082, 'after': 56.01524353027344}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00, 27.48it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00, 24.11it/s]\n",
-      " 92%|█████████▏| 11/12 [10:02<00:52, 52.08s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "enron_email1 {'before': 59.76203536987305, 'after': 59.75802230834961}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  8.55it/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00,  8.91it/s]\n",
-      "100%|██████████| 12/12 [10:48<00:00, 54.00s/it]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "openai_board_ann {'before': 30.923919677734375, 'after': 30.946474075317383}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "data = []\n",
    "for sample in tqdm(samples):\n",
@@ -582,176 +771,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "perplexity (on 2nd half) before and after training adapter on first half of text\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>before</th>\n",
-       "      <th>after</th>\n",
-       "      <th>in_training</th>\n",
-       "      <th>learning</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>name</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>email_to_fauci</th>\n",
-       "      <td>55.957020</td>\n",
-       "      <td>56.015244</td>\n",
-       "      <td>False</td>\n",
-       "      <td>-0.001041</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>openai_board_ann</th>\n",
-       "      <td>30.923920</td>\n",
-       "      <td>30.946474</td>\n",
-       "      <td>False</td>\n",
-       "      <td>-0.000729</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>wikipedia on LK-99</th>\n",
-       "      <td>53.241970</td>\n",
-       "      <td>53.270264</td>\n",
-       "      <td>False</td>\n",
-       "      <td>-0.000531</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>sokal hoax</th>\n",
-       "      <td>29.558676</td>\n",
-       "      <td>29.561066</td>\n",
-       "      <td>True</td>\n",
-       "      <td>-0.000081</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>enron_email1</th>\n",
-       "      <td>59.762035</td>\n",
-       "      <td>59.758022</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0.000067</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>good_ml</th>\n",
-       "      <td>48.654518</td>\n",
-       "      <td>48.639786</td>\n",
-       "      <td>False</td>\n",
-       "      <td>0.000303</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Theory o. general relativity</th>\n",
-       "      <td>48.482555</td>\n",
-       "      <td>48.460346</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0.000458</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>AI gen fake paper</th>\n",
-       "      <td>11.114971</td>\n",
-       "      <td>11.109580</td>\n",
-       "      <td>False</td>\n",
-       "      <td>0.000485</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Schmidhuber 2023 Subjective Novelty, Surprise</th>\n",
-       "      <td>67.336823</td>\n",
-       "      <td>67.202103</td>\n",
-       "      <td>False</td>\n",
-       "      <td>0.002001</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>bad_ml</th>\n",
-       "      <td>17.131952</td>\n",
-       "      <td>17.076616</td>\n",
-       "      <td>False</td>\n",
-       "      <td>0.003230</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>I have a dream</th>\n",
-       "      <td>18.867136</td>\n",
-       "      <td>18.801422</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0.003483</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>lorem ipsum</th>\n",
-       "      <td>243.044754</td>\n",
-       "      <td>238.476746</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0.018795</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                   before       after  \\\n",
-       "name                                                                    \n",
-       "email_to_fauci                                  55.957020   56.015244   \n",
-       "openai_board_ann                                30.923920   30.946474   \n",
-       "wikipedia on LK-99                              53.241970   53.270264   \n",
-       "sokal hoax                                      29.558676   29.561066   \n",
-       "enron_email1                                    59.762035   59.758022   \n",
-       "good_ml                                         48.654518   48.639786   \n",
-       "Theory o. general relativity                    48.482555   48.460346   \n",
-       "AI gen fake paper                               11.114971   11.109580   \n",
-       "Schmidhuber 2023 Subjective Novelty, Surprise   67.336823   67.202103   \n",
-       "bad_ml                                          17.131952   17.076616   \n",
-       "I have a dream                                  18.867136   18.801422   \n",
-       "lorem ipsum                                    243.044754  238.476746   \n",
-       "\n",
-       "                                               in_training  learning  \n",
-       "name                                                                  \n",
-       "email_to_fauci                                       False -0.001041  \n",
-       "openai_board_ann                                     False -0.000729  \n",
-       "wikipedia on LK-99                                   False -0.000531  \n",
-       "sokal hoax                                            True -0.000081  \n",
-       "enron_email1                                          True  0.000067  \n",
-       "good_ml                                              False  0.000303  \n",
-       "Theory o. general relativity                          True  0.000458  \n",
-       "AI gen fake paper                                    False  0.000485  \n",
-       "Schmidhuber 2023 Subjective Novelty, Surprise        False  0.002001  \n",
-       "bad_ml                                               False  0.003230  \n",
-       "I have a dream                                        True  0.003483  \n",
-       "lorem ipsum                                           True  0.018795  "
-      ]
-     },
-     "execution_count": 34,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
    "df = pd.DataFrame(data).set_index('name')\n",
@@ -25,6 +25,8 @@ openai = "^1.6.1"
 python-dotenv = "^1.0.0"
 einops = "^0.7.0"
 tabulate = "^0.9.0"
+lightning = "^2.1.3"
+matplotlib = "^3.8.0"

 [[tool.poetry.source]]
 name = "pytorch"