This commit is contained in:
wassname
2024-01-03 08:10:05 +08:00
parent 331e534ff7
commit e087051711
5 changed files with 1890 additions and 3806 deletions
+1 -1
View File
@@ -1,5 +1,5 @@
.env
lightning_logs/
*.arrow
squad_*
+423 -401
View File
@@ -44,7 +44,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.34it/s]\n",
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.29it/s]\n",
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
@@ -56,15 +56,15 @@
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" # max_memory=max_memory,\n",
" # quantization_config=BitsAndBytesConfig(\n",
" # load_in_4bit=True,\n",
" # llm_int8_threshold=6.0,\n",
" # llm_int8_has_fp16_weight=False,\n",
" # bnb_4bit_compute_dtype=torch.float16,\n",
" # bnb_4bit_use_double_quant=True,\n",
" # bnb_4bit_quant_type=\"nf4\",\n",
" # ),\n",
" # torch_dtype=torch.float16,\n",
" quantization_config=BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" llm_int8_threshold=6.0,\n",
" llm_int8_has_fp16_weight=False,\n",
" bnb_4bit_compute_dtype=torch.float16,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" ),\n",
" torch_dtype=torch.float16,\n",
" trust_remote_code=True,\n",
")\n",
"\n",
@@ -267,12 +267,418 @@
"metadata": {},
"outputs": [],
"source": [
"from torch.nn import functional as F"
"from torch.nn import functional as F\n",
"from torch.utils.data import DataLoader, TensorDataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lightning helpers"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'matplotlib'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m optim\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlightning\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'"
]
}
],
"source": [
"from torch import optim\n",
"import lightning as pl\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"sample = samples[0]\n",
"s = sample['text']\n",
"# cut the text at its character midpoint into two halves\n",
"midpoint = len(s) // 2\n",
"first_half, second_half = s[:midpoint], s[midpoint:]\n",
"\n",
"def str2xya(s):\n",
"    \"\"\"Turn a string into left-padded next-token-prediction examples.\n",
"\n",
"    The text is tokenized once. Example `i` holds the first `i` tokens as input,\n",
"    left-padded with the tokenizer's BOS token id up to the full sequence length,\n",
"    and token `i` as the target.\n",
"\n",
"    Returns:\n",
"        Xs: integer tensor (n_tokens - 1, n_tokens) of left-padded prefixes.\n",
"        Ys: integer tensor (n_tokens - 1, 1) with the next token of each prefix.\n",
"        attention_masks: same shape as Xs, 1 on real tokens and 0 on padding.\n",
"\n",
"    Raises:\n",
"        ValueError: if `s` tokenizes to fewer than two tokens.\n",
"    \"\"\"\n",
"    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
"    n_tokens = len(input_ids)\n",
"    if n_tokens < 2:\n",
"        raise ValueError('need at least two tokens to build (prefix, next token) pairs')\n",
"\n",
"    pad = tokenizer.bos_token_id\n",
"    Xs, Ys, attention_masks = [], [], []\n",
"    for i in range(1, n_tokens):\n",
"        padding = n_tokens - i\n",
"        Xs.append([pad] * padding + input_ids[:i])\n",
"        Ys.append(input_ids[i:i+1])\n",
"        # The mask is built from the padding length rather than from `x == pad`,\n",
"        # so a real token that happens to share the pad id is still attended to.\n",
"        # 1 = attend, 0 = ignore.\n",
"        attention_masks.append([0] * padding + [1] * i)\n",
"\n",
"    return torch.tensor(Xs), torch.tensor(Ys), torch.tensor(attention_masks)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def eval(model, tokenizer, second_half, device='cuda'):\n",
"    \"\"\"Measure perplexity on `second_half` with the adapter disabled and enabled.\n",
"\n",
"    NOTE: the name shadows the builtin `eval`; it is kept because other cells\n",
"    call it by this name.\n",
"\n",
"    Args:\n",
"        model: adapter-wrapped model exposing `disable_adapter()`.\n",
"        tokenizer: tokenizer forwarded to `perplexity_compute`.\n",
"        second_half: text forwarded to `perplexity_compute` as `data`.\n",
"        device: device string forwarded to `perplexity_compute`.\n",
"\n",
"    Returns:\n",
"        dict with `before` (adapter disabled) and `after` (adapter enabled)\n",
"        mean perplexity as Python floats.\n",
"    \"\"\"\n",
"    # switches the model to eval mode and leaves it there\n",
"    model.eval()\n",
"    with torch.no_grad():\n",
"        with model.disable_adapter():\n",
"            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
"        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
"    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
"\n",
"def read_metrics_csv(metrics_file_path):\n",
"    \"\"\"Load a metrics CSV and also aggregate it per epoch.\n",
"\n",
"    Returns `(per_epoch, raw)`: `raw` is the CSV with gaps in its `epoch` column\n",
"    forward-filled; `per_epoch` holds the mean of the other columns per epoch.\n",
"    \"\"\"\n",
"    raw = pd.read_csv(metrics_file_path)\n",
"    raw[\"epoch\"] = raw[\"epoch\"].ffill()\n",
"    per_epoch = raw.groupby(\"epoch\").mean()\n",
"    return per_epoch, raw\n",
"\n",
"\n",
"def plot_hist(df_hist, allowlist=None, logy=False):\n",
"    \"\"\"Plot metric columns named `<prefix>/<suffix>`, one figure per suffix.\n",
"\n",
"    Args:\n",
"        df_hist: DataFrame whose metric columns contain a '/', e.g. 'train/loss_epoch'.\n",
"        allowlist: optional collection of suffixes to plot; a falsy value plots all.\n",
"        logy: forwarded to `DataFrame.plot`.\n",
"    \"\"\"\n",
"    # sorted so figures come out in the same order on every run (set order is arbitrary)\n",
"    suffixes = sorted({c.split('/')[-1] for c in df_hist.columns if '/' in c})\n",
"    for suffix in suffixes:\n",
"        if allowlist and suffix not in allowlist:\n",
"            continue\n",
"        # Exact match on the last path component; `endswith` would also pull\n",
"        # 'val/aux_loss' into the 'loss' figure.\n",
"        cols = [c for c in df_hist.columns if '/' in c and c.split('/')[-1] == suffix]\n",
"        df_hist[cols].plot(title=suffix, style='.', logy=logy)\n",
"        plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"class PL_MODEL(pl.LightningModule):\n",
"    \"\"\"Lightning wrapper that trains a causal LM to predict the next token.\n",
"\n",
"    Batches are `(input_ids, targets, attention_mask)`. The loss is computed on the\n",
"    logits at the last sequence position only.\n",
"\n",
"    Args:\n",
"        model: module called as `model(input_ids=..., attention_mask=...)` whose\n",
"            output exposes `.logits`.\n",
"        num_iterations: `total_steps` of the one-cycle schedule, counted in\n",
"            optimizer steps.\n",
"        lr: learning rate given to AdamW and used as the one-cycle `max_lr`.\n",
"        weight_decay: AdamW weight decay.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, model, num_iterations, lr=3e-4, weight_decay=0):\n",
"        super().__init__()\n",
"        self._model = model\n",
"        # the wrapped model is excluded from the saved hyperparameters\n",
"        self.save_hyperparameters(ignore=['model'])\n",
"\n",
"    def forward(self, **kwargs):\n",
"        return self._model(**kwargs)\n",
"\n",
"    def _shared_step(self, batch, batch_idx, phase='train'):\n",
"        \"\"\"Compute and log the next-token loss for one batch.\"\"\"\n",
"        input_ids, targets, attention_mask = batch\n",
"        output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
"        # Logits are scores over the vocabulary and targets are token ids, so this\n",
"        # is a classification loss. An elementwise regression loss such as\n",
"        # smooth_l1_loss would broadcast the (batch, 1) ids against the\n",
"        # (batch, vocab) logits and push every logit towards the id value.\n",
"        last_logits = output.logits[:, -1].float()\n",
"        loss = F.cross_entropy(last_logits, targets.reshape(-1))\n",
"        self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
"        return loss\n",
"\n",
"    def training_step(self, batch, batch_idx):\n",
"        return self._shared_step(batch, batch_idx, phase='train')\n",
"\n",
"    def validation_step(self, batch, batch_idx):\n",
"        return self._shared_step(batch, batch_idx, phase='val')\n",
"\n",
"    def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
"        return self._shared_step(batch, batch_idx, phase='test')\n",
"\n",
"    def configure_optimizers(self):\n",
"        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
"        lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
"            optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
"        )\n",
"        # OneCycleLR is defined per optimizer step. Lightning steps schedulers once\n",
"        # per epoch unless the interval is set to 'step'.\n",
"        return {\n",
"            'optimizer': optimizer,\n",
"            'lr_scheduler': {'scheduler': lr_scheduler, 'interval': 'step'},\n",
"        }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
"Using bfloat16 Automatic Mixed Precision (AMP)\n",
"GPU available: True (cuda), used: True\n",
"TPU available: False, using: 0 TPU cores\n",
"IPU available: False, using: 0 IPUs\n",
"HPU available: False, using: 0 HPUs\n",
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
"You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]\n",
"\n",
" | Name | Type | Params\n",
"-------------------------------------\n",
"0 | _model | PeftModel | 1.5 B \n",
"-------------------------------------\n",
"11.8 M Trainable params\n",
"1.5 B Non-trainable params\n",
"1.5 B Total params\n",
"6,132.756 Total estimated model params size (MB)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.\n",
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sanity Checking DataLoader 0: 50%|█████ | 1/2 [00:00<00:00, 2.66it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([4, 1])) that is different to the input size (torch.Size([4, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 0: 100%|██████████| 52/52 [00:26<00:00, 1.94it/s, v_num=9, train/loss_step=381.0] "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([1, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 2: 100%|██████████| 52/52 [00:40<00:00, 1.30it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3] "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`Trainer.fit` stopped: `max_epochs=3` reached.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 2: 100%|██████████| 52/52 [00:42<00:00, 1.22it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>train/loss_step</th>\n",
" <th>step</th>\n",
" <th>val/loss_step</th>\n",
" <th>val/loss_epoch</th>\n",
" <th>train/loss_epoch</th>\n",
" </tr>\n",
" <tr>\n",
" <th>epoch</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0.0</th>\n",
" <td>3319.602325</td>\n",
" <td>22.278689</td>\n",
" <td>6775.631730</td>\n",
" <td>6775.630859</td>\n",
" <td>6645.847656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1.0</th>\n",
" <td>4132.752668</td>\n",
" <td>67.639344</td>\n",
" <td>6772.301628</td>\n",
" <td>6772.301758</td>\n",
" <td>6642.495605</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2.0</th>\n",
" <td>5096.813714</td>\n",
" <td>113.000000</td>\n",
" <td>6769.673500</td>\n",
" <td>6769.673828</td>\n",
" <td>6639.772949</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" train/loss_step step val/loss_step val/loss_epoch \\\n",
"epoch \n",
"0.0 3319.602325 22.278689 6775.631730 6775.630859 \n",
"1.0 4132.752668 67.639344 6772.301628 6772.301758 \n",
"2.0 5096.813714 113.000000 6769.673500 6769.673828 \n",
"\n",
" train/loss_epoch \n",
"epoch \n",
"0.0 6645.847656 \n",
"1.0 6642.495605 \n",
"2.0 6639.772949 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "ImportError",
"evalue": "matplotlib is required for plotting when the default backend \"matplotlib\" is selected.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 85\u001b[0m\n\u001b[1;32m 83\u001b[0m df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n\u001b[1;32m 84\u001b[0m display(df_hist)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mplot_hist\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28meval\u001b[39m(model, tokenizer, second_half)\n",
"Cell \u001b[0;32mIn[10], line 79\u001b[0m, in \u001b[0;36mplot_hist\u001b[0;34m(df_hist, allowlist, logy)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m suffix \u001b[38;5;129;01min\u001b[39;00m suffixes:\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allowlist \u001b[38;5;129;01mand\u001b[39;00m suffix \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m allowlist: \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m---> 79\u001b[0m \u001b[43mdf_hist\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendswith\u001b[49m\u001b[43m(\u001b[49m\u001b[43msuffix\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtitle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstyle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 80\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(suffix) 
\n\u001b[1;32m 81\u001b[0m plt\u001b[38;5;241m.\u001b[39mshow()\n",
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:951\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 951\u001b[0m plot_backend \u001b[38;5;241m=\u001b[39m \u001b[43m_get_plot_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbackend\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 953\u001b[0m x, y, kind, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_call_args(\n\u001b[1;32m 954\u001b[0m plot_backend\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, args, kwargs\n\u001b[1;32m 955\u001b[0m )\n\u001b[1;32m 957\u001b[0m kind \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_kind_aliases\u001b[38;5;241m.\u001b[39mget(kind, kind)\n",
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1947\u001b[0m, in \u001b[0;36m_get_plot_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m 1944\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend_str \u001b[38;5;129;01min\u001b[39;00m _backends:\n\u001b[1;32m 1945\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _backends[backend_str]\n\u001b[0;32m-> 1947\u001b[0m module \u001b[38;5;241m=\u001b[39m \u001b[43m_load_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbackend_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1948\u001b[0m _backends[backend_str] \u001b[38;5;241m=\u001b[39m module\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n",
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1877\u001b[0m, in \u001b[0;36m_load_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m 1875\u001b[0m module \u001b[38;5;241m=\u001b[39m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpandas.plotting._matplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1876\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m-> 1877\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib is required for plotting when the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1879\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdefault backend \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is selected.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 1880\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1881\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n\u001b[1;32m 1883\u001b[0m found_backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
"\u001b[0;31mImportError\u001b[0m: matplotlib is required for plotting when the default backend \"matplotlib\" is selected."
]
}
],
"source": [
"\n",
" \n",
"\n",
"peft_config = LoraConfig(\n",
"    # task_type=TaskType.TOKEN_CLS, \n",
"    target_modules=[\"fc2\", \"Wqkv\"],\n",
"    inference_mode=False, r=16, lora_alpha=16,\n",
"    # lora_dropout=0.1,\n",
"    # bias=\"all\"\n",
")\n",
"# NOTE: this rebinds `model`, so re-running the cell wraps the model again\n",
"model = get_peft_model(model, peft_config)\n",
"model.config.use_cache = False\n",
"\n",
"device = 'cuda'\n",
"lr = 4e-3\n",
"epochs = 3\n",
"accum_steps = 64  # not used by the Lightning run below\n",
"batch_size = 4\n",
"accumulate_grad_batches = 8\n",
"\n",
"Xs, Ys, attention_masks = str2xya(first_half)\n",
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
"Xs, Ys, attention_masks = str2xya(second_half)\n",
"# validation data is not shuffled\n",
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
"\n",
"epoch_steps = len(dl_train)\n",
"# With gradient accumulation the optimizer steps once per\n",
"# `accumulate_grad_batches` batches, plus once for a trailing partial group,\n",
"# hence the ceiling division.\n",
"optimizer_steps = -(-epoch_steps // accumulate_grad_batches) * epochs\n",
"\n",
"pl_model = PL_MODEL(model, num_iterations=optimizer_steps, lr=lr, weight_decay=0)\n",
"trainer = pl.Trainer(\n",
"    max_epochs=epochs,\n",
"    precision=\"bf16-mixed\",\n",
"    log_every_n_steps=1,\n",
"    accumulate_grad_batches=accumulate_grad_batches,\n",
")\n",
"\n",
"# train\n",
"trainer.fit(pl_model, dl_train, dl_val)\n",
"\n",
"# read_metrics_csv returns a tuple (per-epoch means, raw rows), so the fill\n",
"# methods are applied to the DataFrame after unpacking\n",
"df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path)\n",
"df_histe = df_histe.bfill().ffill()\n",
"display(df_histe)\n",
"plot_hist(df_histe)\n",
"\n",
"eval(model, tokenizer, second_half)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -338,15 +744,7 @@
" scheduler.step()\n",
" optimizer.zero_grad()\n",
"\n",
" # eval\n",
" model.eval();\n",
" with torch.no_grad():\n",
" with model.disable_adapter():\n",
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
" results['mean_perplexity']\n",
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
"\n",
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
" return eval(model, tokenizer, second_half)\n",
"\n"
]
},
@@ -359,218 +757,9 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/12 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/peft/tuners/lora/model.py:402: UserWarning: Careful, disabling adapter layers with bias configured to be 'all' does not produce the same output as the the base model would without adaption.\n",
" warnings.warn(msg)\n",
"100%|██████████| 1/1 [00:00<00:00, 6.42it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 6.36it/s]\n",
" 8%|▊ | 1/12 [01:21<14:54, 81.32s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"bad_ml {'before': 17.1319522857666, 'after': 17.076616287231445}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 9.21it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 9.19it/s]\n",
" 17%|█▋ | 2/12 [01:43<07:45, 46.58s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"good_ml {'before': 48.654518127441406, 'after': 48.63978576660156}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 6.00it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 6.20it/s]\n",
" 25%|██▌ | 3/12 [03:26<10:49, 72.18s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"sokal hoax {'before': 29.55867576599121, 'after': 29.561065673828125}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 7.25it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 7.55it/s]\n",
" 33%|███▎ | 4/12 [04:10<08:10, 61.34s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Theory o. general relativity {'before': 48.4825553894043, 'after': 48.46034622192383}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 8.83it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 9.19it/s]\n",
" 42%|████▏ | 5/12 [04:29<05:21, 45.97s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"lorem ipsum {'before': 243.0447540283203, 'after': 238.47674560546875}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 8.78it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 8.88it/s]\n",
" 50%|█████ | 6/12 [05:05<04:14, 42.40s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"wikipedia on LK-99 {'before': 53.24197006225586, 'after': 53.270263671875}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 8.77it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 8.92it/s]\n",
" 58%|█████▊ | 7/12 [05:30<03:03, 36.69s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I have a dream {'before': 18.867136001586914, 'after': 18.801422119140625}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 6.27it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 6.20it/s]\n",
" 67%|██████▋ | 8/12 [06:40<03:09, 47.34s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"AI gen fake paper {'before': 11.114971160888672, 'after': 11.109580039978027}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 6.13it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 6.07it/s]\n",
" 75%|███████▌ | 9/12 [08:43<03:32, 70.94s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Schmidhuber 2023 Subjective Novelty, Surprise {'before': 67.33682250976562, 'after': 67.20210266113281}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 6.77it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 6.85it/s]\n",
" 83%|████████▎ | 10/12 [09:50<02:19, 69.77s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"email_to_fauci {'before': 55.9570198059082, 'after': 56.01524353027344}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 27.48it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 24.11it/s]\n",
" 92%|█████████▏| 11/12 [10:02<00:52, 52.08s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"enron_email1 {'before': 59.76203536987305, 'after': 59.75802230834961}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 8.55it/s]\n",
"100%|██████████| 1/1 [00:00<00:00, 8.91it/s]\n",
"100%|██████████| 12/12 [10:48<00:00, 54.00s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"openai_board_ann {'before': 30.923919677734375, 'after': 30.946474075317383}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"data = []\n",
"for sample in tqdm(samples):\n",
@@ -582,176 +771,9 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"perplexity (on 2nd half) before and after training adapter on first half of text\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>before</th>\n",
" <th>after</th>\n",
" <th>in_training</th>\n",
" <th>learning</th>\n",
" </tr>\n",
" <tr>\n",
" <th>name</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>email_to_fauci</th>\n",
" <td>55.957020</td>\n",
" <td>56.015244</td>\n",
" <td>False</td>\n",
" <td>-0.001041</td>\n",
" </tr>\n",
" <tr>\n",
" <th>openai_board_ann</th>\n",
" <td>30.923920</td>\n",
" <td>30.946474</td>\n",
" <td>False</td>\n",
" <td>-0.000729</td>\n",
" </tr>\n",
" <tr>\n",
" <th>wikipedia on LK-99</th>\n",
" <td>53.241970</td>\n",
" <td>53.270264</td>\n",
" <td>False</td>\n",
" <td>-0.000531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>sokal hoax</th>\n",
" <td>29.558676</td>\n",
" <td>29.561066</td>\n",
" <td>True</td>\n",
" <td>-0.000081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>enron_email1</th>\n",
" <td>59.762035</td>\n",
" <td>59.758022</td>\n",
" <td>True</td>\n",
" <td>0.000067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>good_ml</th>\n",
" <td>48.654518</td>\n",
" <td>48.639786</td>\n",
" <td>False</td>\n",
" <td>0.000303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Theory o. general relativity</th>\n",
" <td>48.482555</td>\n",
" <td>48.460346</td>\n",
" <td>True</td>\n",
" <td>0.000458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AI gen fake paper</th>\n",
" <td>11.114971</td>\n",
" <td>11.109580</td>\n",
" <td>False</td>\n",
" <td>0.000485</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Schmidhuber 2023 Subjective Novelty, Surprise</th>\n",
" <td>67.336823</td>\n",
" <td>67.202103</td>\n",
" <td>False</td>\n",
" <td>0.002001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bad_ml</th>\n",
" <td>17.131952</td>\n",
" <td>17.076616</td>\n",
" <td>False</td>\n",
" <td>0.003230</td>\n",
" </tr>\n",
" <tr>\n",
" <th>I have a dream</th>\n",
" <td>18.867136</td>\n",
" <td>18.801422</td>\n",
" <td>True</td>\n",
" <td>0.003483</td>\n",
" </tr>\n",
" <tr>\n",
" <th>lorem ipsum</th>\n",
" <td>243.044754</td>\n",
" <td>238.476746</td>\n",
" <td>True</td>\n",
" <td>0.018795</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" before after \\\n",
"name \n",
"email_to_fauci 55.957020 56.015244 \n",
"openai_board_ann 30.923920 30.946474 \n",
"wikipedia on LK-99 53.241970 53.270264 \n",
"sokal hoax 29.558676 29.561066 \n",
"enron_email1 59.762035 59.758022 \n",
"good_ml 48.654518 48.639786 \n",
"Theory o. general relativity 48.482555 48.460346 \n",
"AI gen fake paper 11.114971 11.109580 \n",
"Schmidhuber 2023 Subjective Novelty, Surprise 67.336823 67.202103 \n",
"bad_ml 17.131952 17.076616 \n",
"I have a dream 18.867136 18.801422 \n",
"lorem ipsum 243.044754 238.476746 \n",
"\n",
" in_training learning \n",
"name \n",
"email_to_fauci False -0.001041 \n",
"openai_board_ann False -0.000729 \n",
"wikipedia on LK-99 False -0.000531 \n",
"sokal hoax True -0.000081 \n",
"enron_email1 True 0.000067 \n",
"good_ml False 0.000303 \n",
"Theory o. general relativity True 0.000458 \n",
"AI gen fake paper False 0.000485 \n",
"Schmidhuber 2023 Subjective Novelty, Surprise False 0.002001 \n",
"bad_ml False 0.003230 \n",
"I have a dream True 0.003483 \n",
"lorem ipsum True 0.018795 "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
"df = pd.DataFrame(data).set_index('name')\n",
Generated
-3404
View File
File diff suppressed because it is too large Load Diff
+2
View File
@@ -25,6 +25,8 @@ openai = "^1.6.1"
python-dotenv = "^1.0.0"
einops = "^0.7.0"
tabulate = "^0.9.0"
lightning = "^2.1.3"
matplotlib = "^3.8.0"
[[tool.poetry.source]]
name = "pytorch"
+1464
View File
File diff suppressed because it is too large Load Diff