mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-06-27 16:45:29 +08:00
working
This commit is contained in:
+190
@@ -1,2 +1,192 @@
|
||||
.env
|
||||
|
||||
|
||||
*.arrow
|
||||
squad_*
|
||||
*sbert_embedded*
|
||||
*.pkl
|
||||
ckpts*
|
||||
.deepspeed_env
|
||||
*.jsonl
|
||||
*tar.gz
|
||||
ckpts**
|
||||
wandb
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
|
||||
# vs code
|
||||
.vscode
|
||||
*.bin
|
||||
|
||||
.DS_Store
|
||||
|
||||
# gpt4all-chat
|
||||
CMakeLists.txt.user
|
||||
gpt4all-chat/models/*
|
||||
build_*
|
||||
build-*
|
||||
|
||||
# IntelliJ
|
||||
.idea/
|
||||
|
||||
# LLM models
|
||||
*.gguf
|
||||
|
||||
+2306
-9
File diff suppressed because it is too large
Load Diff
+114
-46
@@ -30,6 +30,7 @@
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
@@ -71,8 +72,8 @@
|
||||
"import json\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n",
|
||||
"\n",
|
||||
"sample = samples[0]\n",
|
||||
"sample"
|
||||
"# sample = samples[0]\n",
|
||||
"# sample"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -108,7 +109,7 @@
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" # model = model.to(device)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
@@ -201,8 +202,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
"results['mean_perplexity']"
|
||||
"# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
"# results['mean_perplexity']"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -243,19 +244,31 @@
|
||||
"model.lm_head = CastOutputToFloat(model.lm_head)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"peft_config = IA3Config(\n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",], \n",
|
||||
" feedforward_modules=[\"fc2\"],\n",
|
||||
" inference_mode=False,\n",
|
||||
")\n",
|
||||
"model = get_peft_model(model, peft_config)\n",
|
||||
"model.config.use_cache = False"
|
||||
"# # Verifying the datatypes.\n",
|
||||
"# dtypes = {}\n",
|
||||
"# for _, p in model.named_parameters():\n",
|
||||
"# dtype = p.dtype\n",
|
||||
"# if dtype not in dtypes:\n",
|
||||
"# dtypes[dtype] = 0\n",
|
||||
"# dtypes[dtype] += p.numel()\n",
|
||||
"# total = 0\n",
|
||||
"# for k, v in dtypes.items():\n",
|
||||
"# total += v\n",
|
||||
"# for k, v in dtypes.items():\n",
|
||||
"# print(k, v, v / total)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -264,19 +277,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Verifying the datatypes.\n",
|
||||
"dtypes = {}\n",
|
||||
"for _, p in model.named_parameters():\n",
|
||||
" dtype = p.dtype\n",
|
||||
" if dtype not in dtypes:\n",
|
||||
" dtypes[dtype] = 0\n",
|
||||
" dtypes[dtype] += p.numel()\n",
|
||||
"total = 0\n",
|
||||
"for k, v in dtypes.items():\n",
|
||||
" total += v\n",
|
||||
"for k, v in dtypes.items():\n",
|
||||
" print(k, v, v / total)\n"
|
||||
"# sample['text']"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -286,12 +287,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"### Training\"\"\"\n",
|
||||
"from datasets import Dataset\n",
|
||||
"# from datasets import Dataset\n",
|
||||
"\n",
|
||||
"# data = load_dataset(\"Abirate/english_quotes\")\n",
|
||||
"data = Dataset.from_dict({\"text\": [sample['text'][:len(sample['text'])//2]]*100})\n",
|
||||
"data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True).with_format(\"torch\")\n",
|
||||
"data"
|
||||
"# data = Dataset.from_dict({\"text\": [sample['text'][:len(sample['text'])//2]]*100})\n",
|
||||
"# data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True).with_format(\"torch\")\n",
|
||||
"# data"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -300,8 +301,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"# batch.keys()"
|
||||
"from torch.nn import functional as F"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -310,17 +310,74 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n",
|
||||
"model.train()\n",
|
||||
"model = model.to('cuda')\n",
|
||||
"for epoch in range(10):\n",
|
||||
" for batch in DataLoader(data, batch_size=1):\n",
|
||||
" b_in = {'input_ids': batch['input_ids'].to('cuda').to(dtype), 'attention_mask': batch['attention_mask'].to('cuda').to(dtype)}\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = model(**batch).loss\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" print(loss.item())"
|
||||
"def lora_eval(model, sample):\n",
|
||||
" # reset/set adapter\n",
|
||||
" peft_config = IA3Config(\n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",], \n",
|
||||
" feedforward_modules=[\"fc2\"],\n",
|
||||
" inference_mode=False,\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
"\n",
|
||||
" # train adapter\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" input_ids = tokenizer(first_half, return_tensors=\"pt\")[\"input_ids\"][0].to('cuda')\n",
|
||||
" device = 'cuda'\n",
|
||||
" optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
|
||||
" model.train()\n",
|
||||
" model = model.to(device)\n",
|
||||
" for epoch in range(1):\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" X = input_ids[:i][None, ]\n",
|
||||
" targets = input_ids[i:i+1][None, ]\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" out = model(input_ids=X, \n",
|
||||
" )\n",
|
||||
" logits = out['logits'][:, -1]\n",
|
||||
" loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" # print(loss.item())\n",
|
||||
"\n",
|
||||
" # eval\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results['mean_perplexity']\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
"\n",
|
||||
" return dict(before=results['mean_perplexity'], after=results2['mean_perplexity'])\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in samples:\n",
|
||||
" r = lora_eval(model, sample)\n",
|
||||
" r.update(sample)\n",
|
||||
" data.append(r)\n",
|
||||
" 1/0\n",
|
||||
" print(data[-1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('perplexity (on 2nd half) before and after training adapter on first half of text')\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -335,10 +392,21 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"results2 = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
"results['mean_perplexity'], results2['mean_perplexity']"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
|
||||
Reference in New Issue
Block a user