mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-07-04 21:03:14 +08:00
trying transformers training
This commit is contained in:
+177
-351
@@ -22,54 +22,29 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.29it/s]\n",
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" # max_memory=max_memory,\n",
|
||||
" quantization_config=BitsAndBytesConfig(\n",
|
||||
" load_in_4bit=True,\n",
|
||||
" llm_int8_threshold=6.0,\n",
|
||||
" llm_int8_has_fp16_weight=False,\n",
|
||||
" bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
" bnb_4bit_use_double_quant=True,\n",
|
||||
" bnb_4bit_quant_type=\"nf4\",\n",
|
||||
" ),\n",
|
||||
" torch_dtype=torch.float16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
"\n"
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -78,12 +53,113 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MAX_LEN = 2000\n",
|
||||
"import json\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n",
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"# sample = samples[0]\n",
|
||||
"# sample"
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"TheBloke/phi-2-GPTQ\"\n",
|
||||
"# model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" # model_name,\n",
|
||||
" # # quantization_config=BitsAndBytesConfig(\n",
|
||||
" # # load_in_4bit=True,\n",
|
||||
" # # llm_int8_threshold=6.0,\n",
|
||||
" # # llm_int8_has_fp16_weight=False,\n",
|
||||
" # # bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
" # # bnb_4bit_use_double_quant=True,\n",
|
||||
" # # bnb_4bit_quant_type=\"nf4\",\n",
|
||||
" # # ),\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" # trust_remote_code=True,\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" config = AutoConfig.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
" config.quantization_config['use_exllama'] = False\n",
|
||||
" # del config.quantization_config['use_exllama']\n",
|
||||
" config.quantization_config['disable_exllama'] = True\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" torch_dtype=torch.bfloat16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -95,7 +171,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -199,61 +275,6 @@
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Perplexity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
"# results['mean_perplexity']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Learn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# \"\"\"### Post-processing on the model\n",
|
||||
"\n",
|
||||
"# Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in `float32` for stability. We also cast the output of the last layer in `float32` for the same reasons.\n",
|
||||
"# \"\"\"\n",
|
||||
"\n",
|
||||
"# print(model)\n",
|
||||
"\n",
|
||||
"# for param in model.parameters():\n",
|
||||
"# param.requires_grad = False # freeze the model - train adapters later\n",
|
||||
"# if param.ndim == 1:\n",
|
||||
"# # cast the small parameters (e.g. layernorm) to fp32 for stability\n",
|
||||
"# param.data = param.data.to(torch.float32)\n",
|
||||
"\n",
|
||||
"# # model.gradient_checkpointing_enable() # reduce number of stored activations\n",
|
||||
"# # model.model.decoder.project_in = lambda x: x.requires_grad_(True)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# class CastOutputToFloat(nn.Sequential):\n",
|
||||
"# def forward(self, x):\n",
|
||||
"# return super().forward(x).to(torch.float32)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# model.lm_head = CastOutputToFloat(model.lm_head)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -263,7 +284,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -280,30 +301,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'matplotlib'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m optim\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlightning\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pyplot \u001b[38;5;28;01mas\u001b[39;00m plt\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -312,7 +310,10 @@
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"\n",
|
||||
"def str2xya(s):\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
@@ -320,8 +321,8 @@
|
||||
" Xs = []\n",
|
||||
" Ys = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i]\n",
|
||||
" padding = len(input_ids) - len(x)\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = [pad]*padding + x\n",
|
||||
" \n",
|
||||
" Xs.append(x)\n",
|
||||
@@ -374,13 +375,25 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"class PL_MODEL(pl.LightningModule):\n",
|
||||
" def __init__(self, model, num_iterations, lr=3e-4, weight_decay=0,):\n",
|
||||
" def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
|
||||
" super().__init__()\n",
|
||||
" self._model = model\n",
|
||||
" self.save_hyperparameters(ignore=['model'])\n",
|
||||
" self.save_hyperparameters()\n",
|
||||
"\n",
|
||||
" def configure_model(self):\n",
|
||||
" # instantiate your model in this hook\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, \n",
|
||||
" # lora_dropout=0.1,\n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" self.model = load_model()\n",
|
||||
" self.model = get_peft_model(self.model, peft_config)\n",
|
||||
" self.model.config.use_cache = False\n",
|
||||
" \n",
|
||||
" def forward(self, **kwargs):\n",
|
||||
" return self._model(**kwargs)\n",
|
||||
" return self.model(**kwargs)\n",
|
||||
"\n",
|
||||
" def _shared_step(self, batch, batch_idx, phase='train'):\n",
|
||||
" input_ids, targets, attention_mask = batch\n",
|
||||
@@ -416,252 +429,65 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
|
||||
"Using bfloat16 Automatic Mixed Precision (AMP)\n",
|
||||
"GPU available: True (cuda), used: True\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"IPU available: False, using: 0 IPUs\n",
|
||||
"HPU available: False, using: 0 HPUs\n",
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
|
||||
"You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
|
||||
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]\n",
|
||||
"\n",
|
||||
" | Name | Type | Params\n",
|
||||
"-------------------------------------\n",
|
||||
"0 | _model | PeftModel | 1.5 B \n",
|
||||
"-------------------------------------\n",
|
||||
"11.8 M Trainable params\n",
|
||||
"1.5 B Non-trainable params\n",
|
||||
"1.5 B Total params\n",
|
||||
"6,132.756 Total estimated model params size (MB)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.\n",
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sanity Checking DataLoader 0: 50%|█████ | 1/2 [00:00<00:00, 2.66it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([4, 1])) that is different to the input size (torch.Size([4, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
|
||||
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 0: 100%|██████████| 52/52 [00:26<00:00, 1.94it/s, v_num=9, train/loss_step=381.0] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_2786928/2640872279.py:17: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([1, 51200])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
|
||||
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 2: 100%|██████████| 52/52 [00:40<00:00, 1.30it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"`Trainer.fit` stopped: `max_epochs=3` reached.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 2: 100%|██████████| 52/52 [00:42<00:00, 1.22it/s, v_num=9, train/loss_step=189.0, val/loss_step=1.28e+4, val/loss_epoch=6.77e+3, train/loss_epoch=6.64e+3]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>train/loss_step</th>\n",
|
||||
" <th>step</th>\n",
|
||||
" <th>val/loss_step</th>\n",
|
||||
" <th>val/loss_epoch</th>\n",
|
||||
" <th>train/loss_epoch</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>epoch</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0.0</th>\n",
|
||||
" <td>3319.602325</td>\n",
|
||||
" <td>22.278689</td>\n",
|
||||
" <td>6775.631730</td>\n",
|
||||
" <td>6775.630859</td>\n",
|
||||
" <td>6645.847656</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1.0</th>\n",
|
||||
" <td>4132.752668</td>\n",
|
||||
" <td>67.639344</td>\n",
|
||||
" <td>6772.301628</td>\n",
|
||||
" <td>6772.301758</td>\n",
|
||||
" <td>6642.495605</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2.0</th>\n",
|
||||
" <td>5096.813714</td>\n",
|
||||
" <td>113.000000</td>\n",
|
||||
" <td>6769.673500</td>\n",
|
||||
" <td>6769.673828</td>\n",
|
||||
" <td>6639.772949</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" train/loss_step step val/loss_step val/loss_epoch \\\n",
|
||||
"epoch \n",
|
||||
"0.0 3319.602325 22.278689 6775.631730 6775.630859 \n",
|
||||
"1.0 4132.752668 67.639344 6772.301628 6772.301758 \n",
|
||||
"2.0 5096.813714 113.000000 6769.673500 6769.673828 \n",
|
||||
"\n",
|
||||
" train/loss_epoch \n",
|
||||
"epoch \n",
|
||||
"0.0 6645.847656 \n",
|
||||
"1.0 6642.495605 \n",
|
||||
"2.0 6639.772949 "
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"ename": "ImportError",
|
||||
"evalue": "matplotlib is required for plotting when the default backend \"matplotlib\" is selected.",
|
||||
"ename": "AttributeError",
|
||||
"evalue": "'PL_MODEL' object has no attribute 'model'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[10], line 85\u001b[0m\n\u001b[1;32m 83\u001b[0m df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n\u001b[1;32m 84\u001b[0m display(df_hist)\n\u001b[0;32m---> 85\u001b[0m \u001b[43mplot_hist\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28meval\u001b[39m(model, tokenizer, second_half)\n",
|
||||
"Cell \u001b[0;32mIn[10], line 79\u001b[0m, in \u001b[0;36mplot_hist\u001b[0;34m(df_hist, allowlist, logy)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m suffix \u001b[38;5;129;01min\u001b[39;00m suffixes:\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allowlist \u001b[38;5;129;01mand\u001b[39;00m suffix \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m allowlist: \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m---> 79\u001b[0m \u001b[43mdf_hist\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdf_hist\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendswith\u001b[49m\u001b[43m(\u001b[49m\u001b[43msuffix\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mc\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtitle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstyle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 80\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(suffix) 
\n\u001b[1;32m 81\u001b[0m plt\u001b[38;5;241m.\u001b[39mshow()\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:951\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 950\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 951\u001b[0m plot_backend \u001b[38;5;241m=\u001b[39m \u001b[43m_get_plot_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbackend\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 953\u001b[0m x, y, kind, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_call_args(\n\u001b[1;32m 954\u001b[0m plot_backend\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, args, kwargs\n\u001b[1;32m 955\u001b[0m )\n\u001b[1;32m 957\u001b[0m kind \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_kind_aliases\u001b[38;5;241m.\u001b[39mget(kind, kind)\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1947\u001b[0m, in \u001b[0;36m_get_plot_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m 1944\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend_str \u001b[38;5;129;01min\u001b[39;00m _backends:\n\u001b[1;32m 1945\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _backends[backend_str]\n\u001b[0;32m-> 1947\u001b[0m module \u001b[38;5;241m=\u001b[39m \u001b[43m_load_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbackend_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1948\u001b[0m _backends[backend_str] \u001b[38;5;241m=\u001b[39m module\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/pandas/plotting/_core.py:1877\u001b[0m, in \u001b[0;36m_load_backend\u001b[0;34m(backend)\u001b[0m\n\u001b[1;32m 1875\u001b[0m module \u001b[38;5;241m=\u001b[39m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpandas.plotting._matplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1876\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m-> 1877\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib is required for plotting when the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1879\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdefault backend \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmatplotlib\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is selected.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 1880\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1881\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\n\u001b[1;32m 1883\u001b[0m found_backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
||||
"\u001b[0;31mImportError\u001b[0m: matplotlib is required for plotting when the default backend \"matplotlib\" is selected."
|
||||
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[13], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m epoch_steps \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(dl_train)\n\u001b[1;32m 14\u001b[0m pl_model \u001b[38;5;241m=\u001b[39m PL_MODEL(num_iterations\u001b[38;5;241m=\u001b[39mepoch_steps\u001b[38;5;241m*\u001b[39mepochs, lr\u001b[38;5;241m=\u001b[39mlr, weight_decay\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m---> 15\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mpl_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# from lightning.pytorch.plugins import BitsandbytesPrecision\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\u001b[39;00m\n\u001b[1;32m 19\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m 20\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# precision=\"bf16-mixed\",\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# plugins=precision\u001b[39;00m\n\u001b[1;32m 25\u001b[0m )\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1694\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mAttributeError\u001b[0m: 'PL_MODEL' object has no attribute 'model'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, \n",
|
||||
" # lora_dropout=0.1,\n",
|
||||
" # bias=\"all\"\n",
|
||||
")\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"model.config.use_cache = False\n",
|
||||
"\n",
|
||||
"device = 'cuda'\n",
|
||||
"lr = 4e-3\n",
|
||||
"epochs = 3\n",
|
||||
"accum_steps = 64\n",
|
||||
"batch_size = 4\n",
|
||||
"accum_steps = 16\n",
|
||||
"batch_size = 1\n",
|
||||
"\n",
|
||||
"Xs, Ys, attention_masks = str2xya(first_half)\n",
|
||||
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=4, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(second_half)\n",
|
||||
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=4, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
|
||||
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
|
||||
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
|
||||
"\n",
|
||||
"epoch_steps = len(dl_train)\n",
|
||||
"\n",
|
||||
"pl_model = PL_MODEL(model, num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
|
||||
"pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
|
||||
"model = pl_model.model\n",
|
||||
"# from lightning.pytorch.plugins import BitsandbytesPrecision\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
|
||||
"trainer = pl.Trainer(\n",
|
||||
" accelerator='cpu',\n",
|
||||
" max_epochs=epochs,\n",
|
||||
" precision=\"bf16-mixed\",\n",
|
||||
" precision='',\n",
|
||||
" # precision=\"bf16-mixed\",\n",
|
||||
" log_every_n_steps=1,\n",
|
||||
" accumulate_grad_batches=8,\n",
|
||||
" accumulate_grad_batches=accum_steps,\n",
|
||||
" # plugins=precision\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# train\n",
|
||||
"trainer.fit(pl_model, dl_train, dl_val)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
|
||||
"display(df_hist)\n",
|
||||
"plot_hist(df_hist)\n",
|
||||
|
||||
@@ -0,0 +1,682 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
|
||||
"\n",
|
||||
"# from evaluate.measurements.perplexity import Perplexity\n",
|
||||
"import evaluate\n",
|
||||
"from evaluate import logging\n",
|
||||
"from torch.nn import CrossEntropyLoss\n",
|
||||
"\n",
|
||||
"# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
|
||||
"def perplexity_compute(\n",
|
||||
" data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" if device is not None:\n",
|
||||
" assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
|
||||
" if device == \"gpu\":\n",
|
||||
" device = \"cuda\"\n",
|
||||
" else:\n",
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
" # if batch_size > 1 (which generally leads to padding being required), and\n",
|
||||
" # if there is not an already assigned pad_token, assign an existing\n",
|
||||
" # special token to also be the padding token\n",
|
||||
" if tokenizer.pad_token is None and batch_size > 1:\n",
|
||||
" existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
|
||||
" # check that the model already has at least one special token defined\n",
|
||||
" assert (\n",
|
||||
" len(existing_special_tokens) > 0\n",
|
||||
" ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
|
||||
" # assign one of the special tokens to also be the pad token\n",
|
||||
" tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
|
||||
"\n",
|
||||
" if add_start_token and max_length:\n",
|
||||
" # leave room for <BOS> token to be added:\n",
|
||||
" assert (\n",
|
||||
" tokenizer.bos_token is not None\n",
|
||||
" ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
|
||||
" max_tokenized_len = max_length - 1\n",
|
||||
" else:\n",
|
||||
" max_tokenized_len = max_length\n",
|
||||
"\n",
|
||||
" encodings = tokenizer(\n",
|
||||
" data,\n",
|
||||
" add_special_tokens=False,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True if max_tokenized_len else False,\n",
|
||||
" max_length=max_tokenized_len,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" return_attention_mask=True,\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" encoded_texts = encodings[\"input_ids\"]\n",
|
||||
" attn_masks = encodings[\"attention_mask\"]\n",
|
||||
"\n",
|
||||
" # check that each input is long enough:\n",
|
||||
" if add_start_token:\n",
|
||||
" assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
|
||||
" else:\n",
|
||||
" assert torch.all(\n",
|
||||
" torch.ge(attn_masks.sum(1), 2)\n",
|
||||
" ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
|
||||
"\n",
|
||||
" ppls = []\n",
|
||||
" loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
|
||||
"\n",
|
||||
" for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
|
||||
" end_index = min(start_index + batch_size, len(encoded_texts))\n",
|
||||
" encoded_batch = encoded_texts[start_index:end_index]\n",
|
||||
" attn_mask = attn_masks[start_index:end_index]\n",
|
||||
"\n",
|
||||
" if add_start_token:\n",
|
||||
" bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
|
||||
" encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
|
||||
" attn_mask = torch.cat(\n",
|
||||
" [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" labels = encoded_batch\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
|
||||
"\n",
|
||||
" shift_logits = out_logits[..., :-1, :].contiguous()\n",
|
||||
" shift_labels = labels[..., 1:].contiguous()\n",
|
||||
" shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
|
||||
"\n",
|
||||
" perplexity_batch = torch.exp(\n",
|
||||
" (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
|
||||
" / shift_attention_mask_batch.sum(1)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ppls += perplexity_batch.tolist()\n",
|
||||
"\n",
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torch.utils.data import DataLoader, TensorDataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lightning helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample = samples[0]\n",
|
||||
"s = sample['text']\n",
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0].tolist()\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
" # turn it into a sequence\n",
|
||||
" Xs = []\n",
|
||||
" Ys = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = [pad]*padding + x\n",
|
||||
" \n",
|
||||
" Xs.append(x)\n",
|
||||
" Ys.append(input_ids[i:i+1])\n",
|
||||
"\n",
|
||||
" Xs = torch.tensor(Xs)\n",
|
||||
" Ys = torch.tensor(Ys)\n",
|
||||
" attention_masks = torch.stack([(x==pad)*1 for x in Xs])\n",
|
||||
" return Xs, Ys, attention_masks\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, tokenizer, second_half):\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n",
|
||||
"\n",
|
||||
"def read_metrics_csv(metrics_file_path):\n",
|
||||
" df_hist = pd.read_csv(metrics_file_path)\n",
|
||||
" df_hist[\"epoch\"] = df_hist[\"epoch\"].ffill()\n",
|
||||
" df_histe = df_hist.set_index(\"epoch\").groupby(\"epoch\").mean()\n",
|
||||
" return df_histe, df_hist\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def plot_hist(df_hist, allowlist=None, logy=False):\n",
|
||||
" \"\"\"plot groups of suffixes together\"\"\"\n",
|
||||
" suffixes = list(set([c.split('/')[-1] for c in df_hist.columns if '/' in c]))\n",
|
||||
" for suffix in suffixes:\n",
|
||||
" if allowlist and suffix not in allowlist: continue\n",
|
||||
" df_hist[[c for c in df_hist.columns if c.endswith(suffix) and '/' in c]].plot(title=suffix, style='.', logy=logy)\n",
|
||||
" plt.title(suffix) \n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import bitsandbytes as bnb\n",
|
||||
"\n",
|
||||
"class PL_MODEL(pl.LightningModule):\n",
|
||||
" def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):\n",
|
||||
" super().__init__()\n",
|
||||
" self.save_hyperparameters()\n",
|
||||
" self.configure_model()\n",
|
||||
"\n",
|
||||
" def configure_model(self):\n",
|
||||
" # instantiate your model in this hook\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, \n",
|
||||
" # lora_dropout=0.1,\n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" self.model = load_model()\n",
|
||||
" self.model = get_peft_model(self.model, peft_config)\n",
|
||||
" self.model.config.use_cache = False\n",
|
||||
" \n",
|
||||
" def forward(self, **kwargs):\n",
|
||||
" return self.model(**kwargs)\n",
|
||||
"\n",
|
||||
" def _shared_step(self, batch, batch_idx, phase='train'):\n",
|
||||
" input_ids, targets, attention_mask = batch\n",
|
||||
" # 16, 141\n",
|
||||
" output = self.forward(input_ids=input_ids, attention_mask=attention_mask)\n",
|
||||
" loss = F.smooth_l1_loss(output.logits[:, -1], targets)\n",
|
||||
" self.log(f\"{phase}/loss\", loss, on_epoch=True, on_step=True, prog_bar=True)\n",
|
||||
" return loss\n",
|
||||
" \n",
|
||||
" def training_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='train')\n",
|
||||
"\n",
|
||||
" def validation_step(self, batch, batch_idx):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='val')\n",
|
||||
" \n",
|
||||
" def test_step(self, batch, batch_idx, dataloader_idx=0):\n",
|
||||
" return self._shared_step(batch, batch_idx, phase='test')\n",
|
||||
" \n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" # optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)\n",
|
||||
"\n",
|
||||
" optimizer = bnb.optim.AdamW4bit(self.parameters(), lr=self.hparams.lr, betas=(0.9, 0.995))\n",
|
||||
" lr_scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations\n",
|
||||
" )\n",
|
||||
" return [optimizer], [lr_scheduler]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.10it/s]\n",
|
||||
"Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.\n",
|
||||
"GPU available: True (cuda), used: True\n",
|
||||
"TPU available: False, using: 0 TPU cores\n",
|
||||
"IPU available: False, using: 0 IPUs\n",
|
||||
"HPU available: False, using: 0 HPUs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "Linear4bit.__init__() got an unexpected keyword argument 'dtype'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[18], line 29\u001b[0m\n\u001b[1;32m 18\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m 19\u001b[0m accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgpu\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 20\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39mepochs,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 25\u001b[0m plugins\u001b[38;5;241m=\u001b[39mprecision\n\u001b[1;32m 26\u001b[0m )\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# train\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl_model\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdl_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m model \u001b[38;5;241m=\u001b[39m pl_model\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m 33\u001b[0m df_histe, df_hist \u001b[38;5;241m=\u001b[39m read_metrics_csv(trainer\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39mexperiment\u001b[38;5;241m.\u001b[39mmetrics_file_path)\u001b[38;5;241m.\u001b[39mbfill()\u001b[38;5;241m.\u001b[39mffill()\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:544\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 543\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 544\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 546\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 47\u001b[0m _call_teardown_hook(trainer)\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:580\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 574\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 575\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 576\u001b[0m ckpt_path,\n\u001b[1;32m 577\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 578\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 579\u001b[0m )\n\u001b[0;32m--> 580\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:958\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 955\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_restore_modules_and_callbacks(ckpt_path)\n\u001b[1;32m 957\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: configuring model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 958\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_configure_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 960\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n\u001b[1;32m 961\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_logger_connector\u001b[38;5;241m.\u001b[39mreset_results()\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:109\u001b[0m, in \u001b[0;36m_call_configure_model\u001b[0;34m(trainer)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_overridden(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconfigure_model\u001b[39m\u001b[38;5;124m\"\u001b[39m, trainer\u001b[38;5;241m.\u001b[39mlightning_module):\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mtensor_init_context(), trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mmodel_sharded_context(), trainer\u001b[38;5;241m.\u001b[39mprecision_plugin\u001b[38;5;241m.\u001b[39mmodule_init_context(): \u001b[38;5;66;03m# noqa: E501\u001b[39;00m\n\u001b[0;32m--> 109\u001b[0m \u001b[43m_call_lightning_module_hook\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mconfigure_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:157\u001b[0m, in \u001b[0;36m_call_lightning_module_hook\u001b[0;34m(trainer, hook_name, pl_module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 154\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m hook_name\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mprofile(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[LightningModule]\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpl_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhook_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 157\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m 160\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m prev_fx_name\n",
|
||||
"Cell \u001b[0;32mIn[17], line 18\u001b[0m, in \u001b[0;36mPL_MODEL.configure_model\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconfigure_model\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# instantiate your model in this hook\u001b[39;00m\n\u001b[1;32m 11\u001b[0m peft_config \u001b[38;5;241m=\u001b[39m LoraConfig(\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# task_type=TaskType.TOKEN_CLS, \u001b[39;00m\n\u001b[1;32m 13\u001b[0m target_modules\u001b[38;5;241m=\u001b[39m[ \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfc2\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWqkv\u001b[39m\u001b[38;5;124m\"\u001b[39m,],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# bias=\"all\"\u001b[39;00m\n\u001b[1;32m 17\u001b[0m )\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m get_peft_model(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, peft_config)\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_cache \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
||||
"Cell \u001b[0;32mIn[5], line 5\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m():\n\u001b[0;32m----> 5\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# torch_dtype=torch.float16,\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:560\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 559\u001b[0m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mregister(config\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, model_class, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 560\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 561\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 562\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(config) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 564\u001b[0m model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/transformers/modeling_utils.py:3085\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3082\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_flash_attn_2(config, torch_dtype\u001b[38;5;241m=\u001b[39mtorch_dtype, device_map\u001b[38;5;241m=\u001b[39mdevice_map)\n\u001b[1;32m 3084\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[0;32m-> 3085\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3087\u001b[0m \u001b[38;5;66;03m# Check first if we are `from_pt`\u001b[39;00m\n\u001b[1;32m 3088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_keep_in_fp32_modules:\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:933\u001b[0m, in \u001b[0;36mPhiForCausalLM.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 930\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, config: PhiConfig) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 931\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[0;32m--> 933\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer \u001b[38;5;241m=\u001b[39m \u001b[43mPhiModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 934\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head \u001b[38;5;241m=\u001b[39m CausalLMHead(config)\n\u001b[1;32m 935\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloss \u001b[38;5;241m=\u001b[39m CausalLMLoss()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36mPhiModel.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList(\u001b[43m[\u001b[49m\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_layer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:896\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(config)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membd \u001b[38;5;241m=\u001b[39m Embedding(config)\n\u001b[0;32m--> 896\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mh \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList([\u001b[43mParallelBlock\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(config\u001b[38;5;241m.\u001b[39mn_layer)])\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_init()\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:757\u001b[0m, in \u001b[0;36mParallelBlock.__init__\u001b[0;34m(self, config, block_idx)\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresid_dropout \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mDropout(config\u001b[38;5;241m.\u001b[39mresid_pdrop)\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_idx \u001b[38;5;241m=\u001b[39m block_idx\n\u001b[0;32m--> 757\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmixer \u001b[38;5;241m=\u001b[39m \u001b[43mMHA\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayer_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 758\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmlp \u001b[38;5;241m=\u001b[39m MLP(config)\n",
|
||||
"File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/d3186761bf5c4409f7679359284066c25ab668ee/modeling_phi.py:562\u001b[0m, in \u001b[0;36mMHA.__init__\u001b[0;34m(self, config, dtype, device, rotary_dim, rotary_base, rotary_scale_base, n_head, n_head_kv, head_dim, bias, causal, softmax_scale, layer_idx, return_residual, checkpointing)\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m linear_cls \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 560\u001b[0m linear_cls \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mLinear\n\u001b[0;32m--> 562\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mWqkv \u001b[38;5;241m=\u001b[39m \u001b[43mlinear_cls\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mop_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mout_proj \u001b[38;5;241m=\u001b[39m linear_cls(hidden_size, hidden_size, bias\u001b[38;5;241m=\u001b[39mbias, device\u001b[38;5;241m=\u001b[39mdevice, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[1;32m 565\u001b[0m \u001b[38;5;66;03m# Attention\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:253\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._NF4DQLinear.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 253\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquant_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnf4\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompress_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/lightning/fabric/plugins/precision/bitsandbytes.py:213\u001b[0m, in \u001b[0;36m_import_bitsandbytes.<locals>._Linear4bit.__init__\u001b[0;34m(self, device, *args, **kwargs)\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, device: Optional[_DEVICE] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 213\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;66;03m# if the device is CUDA or we are under a CUDA context manager, quantize the weight here, so we don't end up\u001b[39;00m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;66;03m# filling the device memory with float32 weights which could lead to OOM\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;241m0\u001b[39m, device\u001b[38;5;241m=\u001b[39mdevice)\u001b[38;5;241m.\u001b[39mdevice\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: Linear4bit.__init__() got an unexpected keyword argument 'dtype'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"device = 'cuda'\n",
|
||||
"lr = 4e-3\n",
|
||||
"epochs = 3\n",
|
||||
"accum_steps = 16\n",
|
||||
"batch_size = 2\n",
|
||||
"\n",
|
||||
"Xs, Ys, attention_masks = str2xya(first_half, tokenizer)\n",
|
||||
"dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)\n",
|
||||
"Xs, Ys, attention_masks = str2xya(second_half, tokenizer)\n",
|
||||
"dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)\n",
|
||||
"\n",
|
||||
"epoch_steps = len(dl_train)\n",
|
||||
"\n",
|
||||
"pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)\n",
|
||||
"from lightning.pytorch.plugins import BitsandbytesPrecision\n",
|
||||
"precision = BitsandbytesPrecision(mode=\"nf4-dq\")\n",
|
||||
"# precision = BitsandbytesPrecision(mode=\"int8-training\", dtype=torch.float16, ignore_modules={\"lm_head\"})\n",
|
||||
"trainer = pl.Trainer(\n",
|
||||
" accelerator='gpu',\n",
|
||||
" max_epochs=epochs,\n",
|
||||
" # precision='',\n",
|
||||
" # precision=\"bf16-mixed\",\n",
|
||||
" log_every_n_steps=1,\n",
|
||||
" accumulate_grad_batches=accum_steps,\n",
|
||||
" plugins=precision\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# train\n",
|
||||
"trainer.fit(pl_model, dl_train, dl_val)\n",
|
||||
"\n",
|
||||
"model = pl_model.model\n",
|
||||
"\n",
|
||||
"df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path).bfill().ffill()\n",
|
||||
"display(df_hist)\n",
|
||||
"plot_hist(df_hist)\n",
|
||||
"\n",
|
||||
"eval(model, tokenizer, second_half)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"1/0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Old"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def lora_eval(model, tokenizer, sample):\n",
|
||||
" # reset/set adapter\n",
|
||||
" # peft_config = IA3Config(\n",
|
||||
" # target_modules=[ \"fc2\", \"Wqkv\",], \n",
|
||||
" # feedforward_modules=[\"fc2\"],\n",
|
||||
" # inference_mode=False,\n",
|
||||
" # )\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias=\"all\"\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
"\n",
|
||||
" # train adapter\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to('cuda')\n",
|
||||
"\n",
|
||||
" device = 'cuda'\n",
|
||||
" lr = 1.0e-2\n",
|
||||
" epochs = 3\n",
|
||||
" accum_steps = 64\n",
|
||||
" epoch_steps = (len(input_ids)-1)//accum_steps+1\n",
|
||||
"\n",
|
||||
" total_steps = epochs * epoch_steps\n",
|
||||
" optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
|
||||
" scheduler = optim.lr_scheduler.OneCycleLR(\n",
|
||||
" optimizer, lr, total_steps=total_steps\n",
|
||||
" )\n",
|
||||
" model.train()\n",
|
||||
" model = model.to(device)\n",
|
||||
" for epoch in range(epochs):\n",
|
||||
" # TODO: batch\n",
|
||||
" \n",
|
||||
" accum = 0\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" X = input_ids[:i][None, ]\n",
|
||||
" targets = input_ids[i:i+1][None, ]\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" out = model(input_ids=X, \n",
|
||||
" )\n",
|
||||
" logits = out['logits'][:, -1]\n",
|
||||
" loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
|
||||
" loss.backward()\n",
|
||||
" if accum > accum_steps:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" accum = 0\n",
|
||||
" else:\n",
|
||||
" accum += 1\n",
|
||||
" if accum > 0:\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" return eval(model, tokenizer, second_half)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in tqdm(samples):\n",
|
||||
" r = lora_eval(model, tokenizer, sample)\n",
|
||||
" print(sample['name'], r)\n",
|
||||
" r.update(sample)\n",
|
||||
" data.append(r)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
|
||||
"df = pd.DataFrame(data).set_index('name')\n",
|
||||
"\n",
|
||||
"df['learning'] = (df['before']-df['after'])/df['before']\n",
|
||||
"df.sort_values('learning').drop(columns=['text', 'url'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,597 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"TheBloke/phi-2-GPTQ\"\n",
|
||||
"# model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" # model_name,\n",
|
||||
" # # quantization_config=BitsAndBytesConfig(\n",
|
||||
" # # load_in_4bit=True,\n",
|
||||
" # # llm_int8_threshold=6.0,\n",
|
||||
" # # llm_int8_has_fp16_weight=False,\n",
|
||||
" # # bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
" # # bnb_4bit_use_double_quant=True,\n",
|
||||
" # # bnb_4bit_quant_type=\"nf4\",\n",
|
||||
" # # ),\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" # trust_remote_code=True,\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" config = AutoConfig.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
" config.quantization_config['use_exllama'] = False\n",
|
||||
" config.quantization_config['disable_exllama'] = True\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" torch_dtype=torch.bfloat16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CUDA extension not installed.\n",
|
||||
"CUDA extension not installed.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"base_model = load_model()\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
"tokenizer.pad_token = tokenizer.eos_token"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def reset_model(base_model):\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=8, lora_alpha=8, \n",
|
||||
" # lora_dropout=0.1, \n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(base_model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"model = reset_model(base_model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
|
||||
"\n",
|
||||
"# from evaluate.measurements.perplexity import Perplexity\n",
|
||||
"import evaluate\n",
|
||||
"from evaluate import logging\n",
|
||||
"from torch.nn import CrossEntropyLoss\n",
|
||||
"\n",
|
||||
"# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
|
||||
"def perplexity_compute(\n",
|
||||
" data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" if device is not None:\n",
|
||||
"        assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be one of gpu, cuda, or cpu.\"\n",
|
||||
" if device == \"gpu\":\n",
|
||||
" device = \"cuda\"\n",
|
||||
" else:\n",
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
" # # if batch_size > 1 (which generally leads to padding being required), and\n",
|
||||
" # # if there is not an already assigned pad_token, assign an existing\n",
|
||||
" # # special token to also be the padding token\n",
|
||||
" # if tokenizer.pad_token is None and batch_size > 1:\n",
|
||||
" # existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
|
||||
" # # check that the model already has at least one special token defined\n",
|
||||
" # assert (\n",
|
||||
" # len(existing_special_tokens) > 0\n",
|
||||
" # ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
|
||||
" # # assign one of the special tokens to also be the pad token\n",
|
||||
" # tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
|
||||
"\n",
|
||||
" # if add_start_token and max_length:\n",
|
||||
" # # leave room for <BOS> token to be added:\n",
|
||||
" # assert (\n",
|
||||
" # tokenizer.bos_token is not None\n",
|
||||
" # ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
|
||||
" # max_tokenized_len = max_length - 1\n",
|
||||
" # else:\n",
|
||||
" max_tokenized_len = max_length\n",
|
||||
"\n",
|
||||
" encodings = tokenizer(\n",
|
||||
" data,\n",
|
||||
" add_special_tokens=False,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True if max_tokenized_len else False,\n",
|
||||
" max_length=max_tokenized_len,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" return_attention_mask=True,\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" encoded_texts = encodings[\"input_ids\"]\n",
|
||||
" attn_masks = encodings[\"attention_mask\"]\n",
|
||||
"\n",
|
||||
" # check that each input is long enough:\n",
|
||||
" if add_start_token:\n",
|
||||
" assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
|
||||
" else:\n",
|
||||
" assert torch.all(\n",
|
||||
" torch.ge(attn_masks.sum(1), 2)\n",
|
||||
" ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
|
||||
"\n",
|
||||
" ppls = []\n",
|
||||
" loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
|
||||
"\n",
|
||||
" for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
|
||||
" end_index = min(start_index + batch_size, len(encoded_texts))\n",
|
||||
" encoded_batch = encoded_texts[start_index:end_index]\n",
|
||||
" attn_mask = attn_masks[start_index:end_index]\n",
|
||||
"\n",
|
||||
" # if add_start_token:\n",
|
||||
" # bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
|
||||
" # encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
|
||||
" # attn_mask = torch.cat(\n",
|
||||
" # [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
" labels = encoded_batch\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
|
||||
" # print(out_logits.shape)\n",
|
||||
"\n",
|
||||
" shift_logits = out_logits[..., :-1, :].contiguous()\n",
|
||||
" shift_labels = labels[..., 1:].contiguous()\n",
|
||||
" shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
|
||||
"\n",
|
||||
" perplexity_batch = torch.exp(\n",
|
||||
" (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
|
||||
" / shift_attention_mask_batch.sum(1)\n",
|
||||
" )\n",
|
||||
" # perplexity_batch = torch.exp(\n",
|
||||
" # (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch)\n",
|
||||
" # / shift_attention_mask_batch.sum(1)\n",
|
||||
" # )\n",
|
||||
" # print(perplexity_batch.shape)\n",
|
||||
"\n",
|
||||
" ppls += perplexity_batch.tolist()\n",
|
||||
"\n",
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perplexity_compute(\n",
|
||||
"# second_half, model, tokenizer\n",
|
||||
"# )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torch.utils.data import DataLoader, TensorDataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lightning helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0]\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
" data = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = torch.tensor([pad]*padding + x.tolist())\n",
|
||||
"\n",
|
||||
" label_ids = input_ids[i:i+1]\n",
|
||||
" attention_mask = (x==pad)*1\n",
|
||||
" data.append(dict(input_ids=x, label_ids=label_ids, attention_mask=attention_mask))\n",
|
||||
" \n",
|
||||
" return data\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, tokenizer, second_half):\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def learn_sample(sample):\n",
|
||||
" device = 'cuda'\n",
|
||||
" lr = 4e-3\n",
|
||||
" epochs = 3\n",
|
||||
" accum_steps = 16\n",
|
||||
" batch_size = 1\n",
|
||||
"\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
|
||||
" ds_val = Dataset.from_dict(tokenizer([second_half]))\n",
|
||||
"\n",
|
||||
" os.environ['CUDA_VISIBLE_DEVICES']=\"1\"\n",
|
||||
" verbose = False\n",
|
||||
" model = reset_model(base_model)\n",
|
||||
" eval(model, tokenizer, second_half)\n",
|
||||
" trainer = transformers.Trainer(\n",
|
||||
" model=model,\n",
|
||||
" train_dataset=ds_train,\n",
|
||||
" eval_dataset=ds_val,\n",
|
||||
" args=transformers.TrainingArguments(\n",
|
||||
" per_device_train_batch_size=batch_size,\n",
|
||||
" gradient_accumulation_steps=8,\n",
|
||||
" warmup_steps=0,\n",
|
||||
" max_steps=40,\n",
|
||||
" learning_rate=3e-4,\n",
|
||||
" fp16=True,\n",
|
||||
" logging_steps=1,\n",
|
||||
" output_dir=\"outputs\",\n",
|
||||
" log_level='error',\n",
|
||||
" disable_tqdm=not verbose,\n",
|
||||
" ),\n",
|
||||
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
|
||||
" )\n",
|
||||
" trainer._signature_columns = ['input_ids', 'attention_mask', 'label_ids']\n",
|
||||
" model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
|
||||
" train_output = trainer.train()\n",
|
||||
"\n",
|
||||
" if verbose:\n",
|
||||
" df_hist = pd.DataFrame(trainer.state.log_history)\n",
|
||||
" df_hist_epoch = df_hist.groupby('epoch').last().dropna(axis=1).drop(columns=['step'])\n",
|
||||
" df_hist_step = df_hist.set_index('step').dropna(thresh=2, axis=1)\n",
|
||||
" for c in df_hist_epoch.columns:\n",
|
||||
" df_hist_epoch[[c]].plot()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" result = eval(model, tokenizer, second_half)\n",
|
||||
" return result\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in samples:\n",
|
||||
" print(sample['name'])\n",
|
||||
" r = learn_sample(sample)\n",
|
||||
" print(r)\n",
|
||||
" data.append(dict(**r, **sample))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_res = pd.DataFrame(data)\n",
|
||||
"df_res = df_res[['before', 'after', 'name', 'in_training']]\n",
|
||||
"df_res['improvement'] = df_res['before'] - df_res['after']\n",
|
||||
"df_res"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# DEBUG"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import display, HTML, Markdown\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"@torch.no_grad()\n",
|
||||
"def gen(model, inputs, tokenizer, clean=True):\n",
|
||||
" s = model.generate(\n",
|
||||
" input_ids=inputs[\"input_ids\"][None, :].to(model.device),\n",
|
||||
" attention_mask=inputs[\"attention_mask\"][None, :].to(model.device),\n",
|
||||
" use_cache=False,\n",
|
||||
" max_new_tokens=100,\n",
|
||||
" min_new_tokens=100,\n",
|
||||
" do_sample=False,\n",
|
||||
" early_stopping=False,\n",
|
||||
" )\n",
|
||||
" input_l = inputs[\"input_ids\"].shape[0]\n",
|
||||
" tokenizer_kwargs=dict(clean_up_tokenization_spaces=clean, skip_special_tokens=clean)\n",
|
||||
" old = tokenizer.decode(\n",
|
||||
" s[0, :input_l], **tokenizer_kwargs\n",
|
||||
" )\n",
|
||||
" new = tokenizer.decode(\n",
|
||||
" s[0, input_l:], **tokenizer_kwargs\n",
|
||||
" )\n",
|
||||
" s_old = \"\"+old.replace('\\n', '<br>')\n",
|
||||
" s_new = '<b>' + new.replace('\\n', '<br>')+ '<br><br><b/>'\n",
|
||||
" display(HTML(f\"{s_old}{s_new}\"))\n",
|
||||
" # print([old, new])\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"samples = samples[1]\n",
|
||||
"\n",
|
||||
"s = sample['text']\n",
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
|
||||
"ds_val = Dataset.from_dict(tokenizer([second_half]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with model.disable_adapter():\n",
|
||||
" gen(model, ds_train.with_format('pt')[0], tokenizer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gen(model, ds_train.with_format('pt')[0], tokenizer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Generated
+3963
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user