lora seems to work

2026-06-27 17:00:44 +08:00 · 2024-01-03 13:14:24 +08:00
parent 13d0c82596
commit 7f27b90159
6 changed files with 11828 additions and 597 deletions
@@ -1,5 +1,6 @@
 .env
 lightning_logs/
+outputs/

 *.arrow
 squad_*
@@ -34,3 +34,19 @@ When using microsoft/phi-2 we get this amount of perplexity reduction by includi


 As you can see, some of these are probably in the training set
+
+# Citing
+
+If you like our work and end up using this code for your research, give us a shout-out by citing or acknowledging it:
+
+```
+@misc{wassname2024,
+  author = {Clark, M.J.},
+  title = {BS Writing Detector},
+  year = {2024},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/wassname/detect_bs_text}},
+  commit = {}
+}
+```
@@ -1,597 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
-   "source": [
-    "from torch import optim\n",
-    "import lightning as pl\n",
-    "from matplotlib import pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "import transformers\n",
-    "from datasets import load_dataset\n",
-    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
-    "import numpy as np\n",
-    "from tqdm.auto import tqdm\n",
-    "import pandas as pd\n",
-    "import warnings\n",
-    "from peft import LoraConfig, get_peft_model, IA3Config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.style.use('ggplot')\n",
-    "torch.set_float32_matmul_precision('medium')\n",
-    "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
-    "\n",
-    "model_name = \"microsoft/phi-2\"\n",
-    "\n",
-    "# model = AutoModelForCausalLM.from_pretrained(\n",
-    "#     model_name,\n",
-    "#     # max_memory=max_memory,\n",
-    "#     quantization_config=BitsAndBytesConfig(\n",
-    "#         load_in_4bit=True,\n",
-    "#         llm_int8_threshold=6.0,\n",
-    "#         llm_int8_has_fp16_weight=False,\n",
-    "#         bnb_4bit_compute_dtype=torch.float16,\n",
-    "#         bnb_4bit_use_double_quant=True,\n",
-    "#         bnb_4bit_quant_type=\"nf4\",\n",
-    "#     ),\n",
-    "#     torch_dtype=torch.float16,\n",
-    "#     trust_remote_code=True,\n",
-    "# )\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model_name = \"TheBloke/phi-2-GPTQ\"\n",
-    "# model_name = \"microsoft/phi-2\"\n",
-    "\n",
-    "def load_model():\n",
-    "\n",
-    "    # model = AutoModelForCausalLM.from_pretrained(\n",
-    "    #     model_name,\n",
-    "    #     # quantization_config=BitsAndBytesConfig(\n",
-    "    #     #     load_in_4bit=True,\n",
-    "    #     #     llm_int8_threshold=6.0,\n",
-    "    #     #     llm_int8_has_fp16_weight=False,\n",
-    "    #     #     bnb_4bit_compute_dtype=torch.float16,\n",
-    "    #     #     bnb_4bit_use_double_quant=True,\n",
-    "    #     #     bnb_4bit_quant_type=\"nf4\",\n",
-    "    #     # ),\n",
-    "    #     torch_dtype=torch.float16,\n",
-    "    #     trust_remote_code=True,\n",
-    "    # )\n",
-    "\n",
-    "\n",
-    "    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True,)\n",
-    "    config.quantization_config['use_exllama'] = False\n",
-    "    config.quantization_config['disable_exllama'] = True\n",
-    "    model = AutoModelForCausalLM.from_pretrained(\n",
-    "        model_name,\n",
-    "        torch_dtype=torch.bfloat16,\n",
-    "        trust_remote_code=True,\n",
-    "        config=config,\n",
-    "    )\n",
-    "    return model\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "CUDA extension not installed.\n",
-      "CUDA extension not installed.\n"
-     ]
-    }
-   ],
-   "source": [
-    "base_model = load_model()\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
-    "tokenizer.pad_token = tokenizer.eos_token"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def reset_model(base_model):\n",
-    "    peft_config = LoraConfig(\n",
-    "        # task_type=TaskType.TOKEN_CLS, \n",
-    "        target_modules=[ \"fc2\",  \"Wqkv\",],\n",
-    "        inference_mode=False, r=8, lora_alpha=8, \n",
-    "        # lora_dropout=0.1, \n",
-    "        # bias=\"all\"\n",
-    "    )\n",
-    "    model = get_peft_model(base_model, peft_config)\n",
-    "    model.config.use_cache = False\n",
-    "    return model\n",
-    "\n",
-    "model = reset_model(base_model)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "MAX_LEN = 2000\n",
-    "samples = json.load(open(\"../samples.json\"))\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Helpers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
-    "\n",
-    "# from evaluate.measurements.perplexity import Perplexity\n",
-    "import evaluate\n",
-    "from evaluate import logging\n",
-    "from torch.nn import CrossEntropyLoss\n",
-    "\n",
-    "# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
-    "def perplexity_compute(\n",
-    "    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
-    "):\n",
-    "\n",
-    "    if device is not None:\n",
-    "        assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
-    "        if device == \"gpu\":\n",
-    "            device = \"cuda\"\n",
-    "    else:\n",
-    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "\n",
-    "    # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
-    "    model = model.to(device)\n",
-    "\n",
-    "    # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
-    "\n",
-    "    # # if batch_size > 1 (which generally leads to padding being required), and\n",
-    "    # # if there is not an already assigned pad_token, assign an existing\n",
-    "    # # special token to also be the padding token\n",
-    "    # if tokenizer.pad_token is None and batch_size > 1:\n",
-    "    #     existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
-    "    #     # check that the model already has at least one special token defined\n",
-    "    #     assert (\n",
-    "    #         len(existing_special_tokens) > 0\n",
-    "    #     ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
-    "    #     # assign one of the special tokens to also be the pad token\n",
-    "    #     tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
-    "\n",
-    "    # if add_start_token and max_length:\n",
-    "    #     # leave room for <BOS> token to be added:\n",
-    "    #     assert (\n",
-    "    #         tokenizer.bos_token is not None\n",
-    "    #     ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
-    "    #     max_tokenized_len = max_length - 1\n",
-    "    # else:\n",
-    "    max_tokenized_len = max_length\n",
-    "\n",
-    "    encodings = tokenizer(\n",
-    "        data,\n",
-    "        add_special_tokens=False,\n",
-    "        padding=True,\n",
-    "        truncation=True if max_tokenized_len else False,\n",
-    "        max_length=max_tokenized_len,\n",
-    "        return_tensors=\"pt\",\n",
-    "        return_attention_mask=True,\n",
-    "    ).to(device)\n",
-    "\n",
-    "    encoded_texts = encodings[\"input_ids\"]\n",
-    "    attn_masks = encodings[\"attention_mask\"]\n",
-    "\n",
-    "    # check that each input is long enough:\n",
-    "    if add_start_token:\n",
-    "        assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
-    "    else:\n",
-    "        assert torch.all(\n",
-    "            torch.ge(attn_masks.sum(1), 2)\n",
-    "        ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
-    "\n",
-    "    ppls = []\n",
-    "    loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
-    "\n",
-    "    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
-    "        end_index = min(start_index + batch_size, len(encoded_texts))\n",
-    "        encoded_batch = encoded_texts[start_index:end_index]\n",
-    "        attn_mask = attn_masks[start_index:end_index]\n",
-    "\n",
-    "        # if add_start_token:\n",
-    "        #     bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
-    "        #     encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
-    "        #     attn_mask = torch.cat(\n",
-    "        #         [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
-    "        #     )\n",
-    "\n",
-    "        labels = encoded_batch\n",
-    "\n",
-    "        with torch.no_grad():\n",
-    "            out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
-    "            # print(out_logits.shape)\n",
-    "\n",
-    "        shift_logits = out_logits[..., :-1, :].contiguous()\n",
-    "        shift_labels = labels[..., 1:].contiguous()\n",
-    "        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
-    "\n",
-    "        perplexity_batch = torch.exp(\n",
-    "            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
-    "            / shift_attention_mask_batch.sum(1)\n",
-    "        )\n",
-    "        # perplexity_batch = torch.exp(\n",
-    "        #     (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch)\n",
-    "        #     / shift_attention_mask_batch.sum(1)\n",
-    "        # )\n",
-    "        # print(perplexity_batch.shape)\n",
-    "\n",
-    "        ppls += perplexity_batch.tolist()\n",
-    "\n",
-    "    return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# perplexity_compute(\n",
-    "#     second_half, model, tokenizer\n",
-    "# )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Training"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from torch.nn import functional as F\n",
-    "from torch.utils.data import DataLoader, TensorDataset"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Lightning helpers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "def str2xya(s, tokenizer):\n",
-    "    max_len = min(MAX_LEN, len(s))\n",
-    "    input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0]\n",
-    "\n",
-    "    pad = tokenizer.bos_token_id\n",
-    "    data = []\n",
-    "    for i in range(1, len(input_ids)):\n",
-    "        x = input_ids[:i][-max_len:]\n",
-    "        padding = max_len - len(x)\n",
-    "        x = torch.tensor([pad]*padding + x.tolist())\n",
-    "\n",
-    "        label_ids = input_ids[i:i+1]\n",
-    "        attention_mask = (x==pad)*1\n",
-    "        data.append(dict(input_ids=x, label_ids=label_ids, attention_mask=attention_mask))\n",
-    "        \n",
-    "    return data\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def eval(model, tokenizer, second_half):\n",
-    "    model.eval();\n",
-    "    with torch.no_grad():\n",
-    "        with model.disable_adapter():\n",
-    "            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
-    "        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
-    "    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import Dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def learn_sample(sample):\n",
-    "    device = 'cuda'\n",
-    "    lr = 4e-3\n",
-    "    epochs = 3\n",
-    "    accum_steps = 16\n",
-    "    batch_size = 1\n",
-    "\n",
-    "    s = sample['text']\n",
-    "    first_half = s[:len(s)//2]\n",
-    "    second_half = s[len(s)//2:]\n",
-    "    ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
-    "    ds_val = Dataset.from_dict(tokenizer([second_half]))\n",
-    "\n",
-    "    os.environ['CUDA_VISIBLE_DEVICES']=\"1\"\n",
-    "    verbose = False\n",
-    "    model = reset_model(base_model)\n",
-    "    eval(model, tokenizer, second_half)\n",
-    "    trainer = transformers.Trainer(\n",
-    "        model=model,\n",
-    "        train_dataset=ds_train,\n",
-    "        eval_dataset=ds_val,\n",
-    "        args=transformers.TrainingArguments(\n",
-    "            per_device_train_batch_size=batch_size,\n",
-    "            gradient_accumulation_steps=8,\n",
-    "            warmup_steps=0,\n",
-    "            max_steps=40,\n",
-    "            learning_rate=3e-4,\n",
-    "            fp16=True,\n",
-    "            logging_steps=1,\n",
-    "            output_dir=\"outputs\",\n",
-    "            log_level='error',\n",
-    "            disable_tqdm=not verbose,\n",
-    "        ),\n",
-    "        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
-    "    )\n",
-    "    trainer._signature_columns = ['input_ids', 'attention_mask', 'label_ids']\n",
-    "    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!\n",
-    "    train_output = trainer.train()\n",
-    "\n",
-    "    if verbose:\n",
-    "        df_hist = pd.DataFrame(trainer.state.log_history)\n",
-    "        df_hist_epoch = df_hist.groupby('epoch').last().dropna(axis=1).drop(columns=['step'])\n",
-    "        df_hist_step = df_hist.set_index('step').dropna(thresh=2, axis=1)\n",
-    "        for c in df_hist_epoch.columns:\n",
-    "            df_hist_epoch[[c]].plot()\n",
-    "\n",
-    "\n",
-    "    result = eval(model, tokenizer, second_half)\n",
-    "    return result\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = []\n",
-    "for sample in samples:\n",
-    "    print(sample['name'])\n",
-    "    r = learn_sample(sample)\n",
-    "    print(r)\n",
-    "    data.append(dict(**r, **sample))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_res = pd.DataFrame(data)\n",
-    "df_res = df_res[['before', 'after', 'name', 'in_training']]\n",
-    "df_res['improvement'] = df_res['before'] - df_res['after']\n",
-    "df_res"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# DEBUG"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.display import display, HTML, Markdown\n",
-    "import torch\n",
-    "\n",
-    "@torch.no_grad()\n",
-    "def gen(model, inputs, tokenizer, clean=True):\n",
-    "    s = model.generate(\n",
-    "        input_ids=inputs[\"input_ids\"][None, :].to(model.device),\n",
-    "        attention_mask=inputs[\"attention_mask\"][None, :].to(model.device),\n",
-    "        use_cache=False,\n",
-    "        max_new_tokens=100,\n",
-    "        min_new_tokens=100,\n",
-    "        do_sample=False,\n",
-    "        early_stopping=False,\n",
-    "    )\n",
-    "    input_l = inputs[\"input_ids\"].shape[0]\n",
-    "    tokenizer_kwargs=dict(clean_up_tokenization_spaces=clean, skip_special_tokens=clean)\n",
-    "    old = tokenizer.decode(\n",
-    "        s[0, :input_l], **tokenizer_kwargs\n",
-    "    )\n",
-    "    new = tokenizer.decode(\n",
-    "        s[0, input_l:], **tokenizer_kwargs\n",
-    "    )\n",
-    "    s_old = \"\"+old.replace('\\n', '<br>')\n",
-    "    s_new =  '<b>' + new.replace('\\n', '<br>')+ '<br><br><b/>'\n",
-    "    display(HTML(f\"{s_old}{s_new}\"))\n",
-    "    # print([old, new])\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "samples = samples[1]\n",
-    "\n",
-    "s = sample['text']\n",
-    "first_half = s[:len(s)//2]\n",
-    "second_half = s[len(s)//2:]\n",
-    "ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
-    "ds_val = Dataset.from_dict(tokenizer([second_half]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with model.disable_adapter():\n",
-    "    gen(model, ds_train.with_format('pt')[0], tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gen(model, ds_train.with_format('pt')[0], tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.0rc1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}