working

2026-06-27 16:45:29 +08:00 · 2024-01-02 14:21:23 +08:00
parent 6f9c32ebb4
commit 594dd3db31
3 changed files with 2610 additions and 55 deletions
@@ -1,2 +1,192 @@
 .env

+
+*.arrow
+squad_*
+*sbert_embedded*
+*.pkl
+ckpts*
+.deepspeed_env
+*.jsonl
+*tar.gz
+ckpts**
+wandb
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+# vs code
+.vscode
+*.bin
+
+.DS_Store
+
+# gpt4all-chat
+CMakeLists.txt.user
+gpt4all-chat/models/*
+build_*
+build-*
+
+# IntelliJ
+.idea/
+
+# LLM models
+*.gguf
@@ -30,6 +30,7 @@
    "from datasets import load_dataset\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
    "import numpy as np\n",
+    "import pandas as pd\n",
    "from peft import LoraConfig, get_peft_model, IA3Config"
   ]
  },
@@ -71,8 +72,8 @@
    "import json\n",
    "samples = json.load(open(\"../samples.json\"))\n",
    "\n",
-    "sample = samples[0]\n",
-    "sample"
+    "# sample = samples[0]\n",
+    "# sample"
   ]
  },
  {
@@ -108,7 +109,7 @@
    "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "    # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
-    "    # model = model.to(device)\n",
+    "    model = model.to(device)\n",
    "\n",
    "    # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
@@ -201,8 +202,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
-    "results['mean_perplexity']"
+    "# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
+    "# results['mean_perplexity']"
   ]
  },
  {
@@ -243,19 +244,31 @@
    "model.lm_head = CastOutputToFloat(model.lm_head)\n"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "peft_config = IA3Config(\n",
-    "    target_modules=[ \"fc2\",  \"Wqkv\",], \n",
-    "        feedforward_modules=[\"fc2\"],\n",
-    "        inference_mode=False,\n",
-    ")\n",
-    "model = get_peft_model(model, peft_config)\n",
-    "model.config.use_cache = False"
+    "# # Verifying the datatypes.\n",
+    "# dtypes = {}\n",
+    "# for _, p in model.named_parameters():\n",
+    "#     dtype = p.dtype\n",
+    "#     if dtype not in dtypes:\n",
+    "#         dtypes[dtype] = 0\n",
+    "#     dtypes[dtype] += p.numel()\n",
+    "# total = 0\n",
+    "# for k, v in dtypes.items():\n",
+    "#     total += v\n",
+    "# for k, v in dtypes.items():\n",
+    "#     print(k, v, v / total)"
   ]
  },
  {
@@ -264,19 +277,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "\n",
-    "# Verifying the datatypes.\n",
-    "dtypes = {}\n",
-    "for _, p in model.named_parameters():\n",
-    "    dtype = p.dtype\n",
-    "    if dtype not in dtypes:\n",
-    "        dtypes[dtype] = 0\n",
-    "    dtypes[dtype] += p.numel()\n",
-    "total = 0\n",
-    "for k, v in dtypes.items():\n",
-    "    total += v\n",
-    "for k, v in dtypes.items():\n",
-    "    print(k, v, v / total)\n"
+    "# sample['text']"
   ]
  },
  {
@@ -286,12 +287,12 @@
   "outputs": [],
   "source": [
    "\"\"\"### Training\"\"\"\n",
-    "from datasets import Dataset\n",
+    "# from datasets import Dataset\n",
    "\n",
    "# data = load_dataset(\"Abirate/english_quotes\")\n",
-    "data = Dataset.from_dict({\"text\": [sample['text'][:len(sample['text'])//2]]*100})\n",
-    "data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True).with_format(\"torch\")\n",
-    "data"
+    "# data = Dataset.from_dict({\"text\": [sample['text'][:len(sample['text'])//2]]*100})\n",
+    "# data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True).with_format(\"torch\")\n",
+    "# data"
   ]
  },
  {
@@ -300,8 +301,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from torch.utils.data import DataLoader\n",
-    "# batch.keys()"
+    "from torch.nn import functional as F"
   ]
  },
  {
@@ -310,17 +310,74 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n",
-    "model.train()\n",
-    "model = model.to('cuda')\n",
-    "for epoch in range(10):\n",
-    "    for batch in DataLoader(data, batch_size=1):\n",
-    "        b_in = {'input_ids': batch['input_ids'].to('cuda').to(dtype), 'attention_mask': batch['attention_mask'].to('cuda').to(dtype)}\n",
-    "        optimizer.zero_grad()\n",
-    "        loss = model(**batch).loss\n",
-    "        loss.backward()\n",
-    "        optimizer.step()\n",
-    "        print(loss.item())"
+    "def lora_eval(model, sample):\n",
+    "    \"\"\"Train an IA3 adapter on the first half of sample['text']; return 2nd-half perplexity before/after.\"\"\"\n",
+    "    # attach an IA3 adapter (NOTE(review): get_peft_model may patch `model` in place - confirm repeated calls start fresh)\n",
+    "    peft_config = IA3Config(\n",
+    "        target_modules=[\"fc2\", \"Wqkv\"],\n",
+    "        feedforward_modules=[\"fc2\"],\n",
+    "        inference_mode=False,\n",
+    "    )\n",
+    "    model = get_peft_model(model, peft_config)\n",
+    "    model.config.use_cache = False\n",
+    "\n",
+    "    # train adapter on the first half of the raw text (split at len(s)//2, before tokenization)\n",
+    "    s = sample['text']\n",
+    "    first_half = s[:len(s)//2]\n",
+    "    second_half = s[len(s)//2:]\n",
+    "    device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "    input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to(device)\n",
+    "    model = model.to(device)\n",
+    "    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
+    "    model.train()\n",
+    "    for epoch in range(1):\n",
+    "        # one optimizer step per position: the prefix input_ids[:i] must predict token input_ids[i]\n",
+    "        for i in range(1, len(input_ids)):\n",
+    "            X = input_ids[:i][None, ]\n",
+    "            targets = input_ids[i:i+1][None, ]\n",
+    "            optimizer.zero_grad()\n",
+    "            out = model(input_ids=X)\n",
+    "            logits = out['logits'][:, -1]\n",
+    "            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "            # print(loss.item())\n",
+    "\n",
+    "    # eval on the held-out second half: adapter disabled gives the baseline, adapter enabled the tuned score\n",
+    "    model.eval()\n",
+    "    with torch.no_grad():\n",
+    "        with model.disable_adapter():\n",
+    "            before = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "        after = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)\n",
+    "\n",
+    "    return dict(before=before['mean_perplexity'], after=after['mean_perplexity'])\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = []\n",
+    "for sample in samples:\n",
+    "    # score every sample, keeping its own fields alongside the before/after perplexities\n",
+    "    r = lora_eval(model, sample)\n",
+    "    r.update(sample)\n",
+    "    data.append(r)\n",
+    "    print(data[-1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
+    "df = pd.DataFrame(data)\n",
+    "df"
   ]
  },
  {
@@ -335,10 +392,21 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": [
-    "results2 = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
-    "results['mean_perplexity'], results2['mean_perplexity']"
-   ]
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  },
  {
   "cell_type": "code",