mirror of
https://github.com/wassname/detect_bs_text.git
synced 2026-06-27 15:15:26 +08:00
lora seems to work
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
.env
|
||||
lightning_logs/
|
||||
outputs/
|
||||
|
||||
*.arrow
|
||||
squad_*
|
||||
|
||||
@@ -34,3 +34,19 @@ When using microsoft/phi-2 we get this amount of perplexity reduction by includi
|
||||
|
||||
|
||||
As you can see, some of these are probably in the training set.
|
||||
|
||||
# Citing
|
||||
|
||||
If you like our work and end up using this code for your research, give us a shout-out by citing or acknowledging it:
|
||||
|
||||
```
|
||||
@misc{wassname2024,
|
||||
author = {Clark, M.J.},
|
||||
title = {BS Writing Detector},
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
howpublished = {\url{https://github.com/wassname/detect_bs_text}},
|
||||
commit = {}
|
||||
}
|
||||
```
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,597 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/wassname/SGIronWolf/projects5/bs_writing_detector/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from torch import optim\n",
|
||||
"import lightning as pl\n",
|
||||
"from matplotlib import pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import transformers\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"import warnings\n",
|
||||
"from peft import LoraConfig, get_peft_model, IA3Config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.style.use('ggplot')\n",
|
||||
"torch.set_float32_matmul_precision('medium')\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
||||
"\n",
|
||||
"model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"# model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
"# model_name,\n",
|
||||
"# # max_memory=max_memory,\n",
|
||||
"# quantization_config=BitsAndBytesConfig(\n",
|
||||
"# load_in_4bit=True,\n",
|
||||
"# llm_int8_threshold=6.0,\n",
|
||||
"# llm_int8_has_fp16_weight=False,\n",
|
||||
"# bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
"# bnb_4bit_use_double_quant=True,\n",
|
||||
"# bnb_4bit_quant_type=\"nf4\",\n",
|
||||
"# ),\n",
|
||||
"# torch_dtype=torch.float16,\n",
|
||||
"# trust_remote_code=True,\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"TheBloke/phi-2-GPTQ\"\n",
|
||||
"# model_name = \"microsoft/phi-2\"\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" # model_name,\n",
|
||||
" # # quantization_config=BitsAndBytesConfig(\n",
|
||||
" # # load_in_4bit=True,\n",
|
||||
" # # llm_int8_threshold=6.0,\n",
|
||||
" # # llm_int8_has_fp16_weight=False,\n",
|
||||
" # # bnb_4bit_compute_dtype=torch.float16,\n",
|
||||
" # # bnb_4bit_use_double_quant=True,\n",
|
||||
" # # bnb_4bit_quant_type=\"nf4\",\n",
|
||||
" # # ),\n",
|
||||
" # torch_dtype=torch.float16,\n",
|
||||
" # trust_remote_code=True,\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" config = AutoConfig.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
" config.quantization_config['use_exllama'] = False\n",
|
||||
" config.quantization_config['disable_exllama'] = True\n",
|
||||
" model = AutoModelForCausalLM.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" torch_dtype=torch.bfloat16,\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
" return model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CUDA extension not installed.\n",
|
||||
"CUDA extension not installed.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"base_model = load_model()\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)\n",
|
||||
"tokenizer.pad_token = tokenizer.eos_token"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def reset_model(base_model):\n",
|
||||
" peft_config = LoraConfig(\n",
|
||||
" # task_type=TaskType.TOKEN_CLS, \n",
|
||||
" target_modules=[ \"fc2\", \"Wqkv\",],\n",
|
||||
" inference_mode=False, r=8, lora_alpha=8, \n",
|
||||
" # lora_dropout=0.1, \n",
|
||||
" # bias=\"all\"\n",
|
||||
" )\n",
|
||||
" model = get_peft_model(base_model, peft_config)\n",
|
||||
" model.config.use_cache = False\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"model = reset_model(base_model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"MAX_LEN = 2000\n",
|
||||
"samples = json.load(open(\"../samples.json\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154\n",
|
||||
"\n",
|
||||
"# from evaluate.measurements.perplexity import Perplexity\n",
|
||||
"import evaluate\n",
|
||||
"from evaluate import logging\n",
|
||||
"from torch.nn import CrossEntropyLoss\n",
|
||||
"\n",
|
||||
"# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)\n",
|
||||
"def perplexity_compute(\n",
|
||||
" data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" if device is not None:\n",
|
||||
" assert device in [\"gpu\", \"cpu\", \"cuda\"], \"device should be either gpu or cpu.\"\n",
|
||||
" if device == \"gpu\":\n",
|
||||
" device = \"cuda\"\n",
|
||||
" else:\n",
|
||||
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
" # model = AutoModelForCausalLM.from_pretrained(model_id)\n",
|
||||
" model = model.to(device)\n",
|
||||
"\n",
|
||||
" # tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
||||
"\n",
|
||||
" # # if batch_size > 1 (which generally leads to padding being required), and\n",
|
||||
" # # if there is not an already assigned pad_token, assign an existing\n",
|
||||
" # # special token to also be the padding token\n",
|
||||
" # if tokenizer.pad_token is None and batch_size > 1:\n",
|
||||
" # existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())\n",
|
||||
" # # check that the model already has at least one special token defined\n",
|
||||
" # assert (\n",
|
||||
" # len(existing_special_tokens) > 0\n",
|
||||
" # ), \"If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1.\"\n",
|
||||
" # # assign one of the special tokens to also be the pad token\n",
|
||||
" # tokenizer.add_special_tokens({\"pad_token\": existing_special_tokens[0]})\n",
|
||||
"\n",
|
||||
" # if add_start_token and max_length:\n",
|
||||
" # # leave room for <BOS> token to be added:\n",
|
||||
" # assert (\n",
|
||||
" # tokenizer.bos_token is not None\n",
|
||||
" # ), \"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False\"\n",
|
||||
" # max_tokenized_len = max_length - 1\n",
|
||||
" # else:\n",
|
||||
" max_tokenized_len = max_length\n",
|
||||
"\n",
|
||||
" encodings = tokenizer(\n",
|
||||
" data,\n",
|
||||
" add_special_tokens=False,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True if max_tokenized_len else False,\n",
|
||||
" max_length=max_tokenized_len,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" return_attention_mask=True,\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" encoded_texts = encodings[\"input_ids\"]\n",
|
||||
" attn_masks = encodings[\"attention_mask\"]\n",
|
||||
"\n",
|
||||
" # check that each input is long enough:\n",
|
||||
" if add_start_token:\n",
|
||||
" assert torch.all(torch.ge(attn_masks.sum(1), 1)), \"Each input text must be at least one token long.\"\n",
|
||||
" else:\n",
|
||||
" assert torch.all(\n",
|
||||
" torch.ge(attn_masks.sum(1), 2)\n",
|
||||
" ), \"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.\"\n",
|
||||
"\n",
|
||||
" ppls = []\n",
|
||||
" loss_fct = CrossEntropyLoss(reduction=\"none\")\n",
|
||||
"\n",
|
||||
" for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):\n",
|
||||
" end_index = min(start_index + batch_size, len(encoded_texts))\n",
|
||||
" encoded_batch = encoded_texts[start_index:end_index]\n",
|
||||
" attn_mask = attn_masks[start_index:end_index]\n",
|
||||
"\n",
|
||||
" # if add_start_token:\n",
|
||||
" # bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)\n",
|
||||
" # encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)\n",
|
||||
" # attn_mask = torch.cat(\n",
|
||||
" # [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1\n",
|
||||
" # )\n",
|
||||
"\n",
|
||||
" labels = encoded_batch\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" out_logits = model(encoded_batch, attention_mask=attn_mask).logits\n",
|
||||
" # print(out_logits.shape)\n",
|
||||
"\n",
|
||||
" shift_logits = out_logits[..., :-1, :].contiguous()\n",
|
||||
" shift_labels = labels[..., 1:].contiguous()\n",
|
||||
" shift_attention_mask_batch = attn_mask[..., 1:].contiguous()\n",
|
||||
"\n",
|
||||
" perplexity_batch = torch.exp(\n",
|
||||
" (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)\n",
|
||||
" / shift_attention_mask_batch.sum(1)\n",
|
||||
" )\n",
|
||||
" # perplexity_batch = torch.exp(\n",
|
||||
" # (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch)\n",
|
||||
" # / shift_attention_mask_batch.sum(1)\n",
|
||||
" # )\n",
|
||||
" # print(perplexity_batch.shape)\n",
|
||||
"\n",
|
||||
" ppls += perplexity_batch.tolist()\n",
|
||||
"\n",
|
||||
" return {\"perplexities\": ppls, \"mean_perplexity\": torch.tensor(ppls).mean()}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perplexity_compute(\n",
|
||||
"# second_half, model, tokenizer\n",
|
||||
"# )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torch.utils.data import DataLoader, TensorDataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lightning helpers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def str2xya(s, tokenizer):\n",
|
||||
" max_len = min(MAX_LEN, len(s))\n",
|
||||
" input_ids = tokenizer(s, return_tensors=\"pt\")[\"input_ids\"][0]\n",
|
||||
"\n",
|
||||
" pad = tokenizer.bos_token_id\n",
|
||||
" data = []\n",
|
||||
" for i in range(1, len(input_ids)):\n",
|
||||
" x = input_ids[:i][-max_len:]\n",
|
||||
" padding = max_len - len(x)\n",
|
||||
" x = torch.tensor([pad]*padding + x.tolist())\n",
|
||||
"\n",
|
||||
" label_ids = input_ids[i:i+1]\n",
|
||||
" attention_mask = (x==pad)*1\n",
|
||||
" data.append(dict(input_ids=x, label_ids=label_ids, attention_mask=attention_mask))\n",
|
||||
" \n",
|
||||
" return data\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval(model, tokenizer, second_half):\n",
|
||||
" model.eval();\n",
|
||||
" with torch.no_grad():\n",
|
||||
" with model.disable_adapter():\n",
|
||||
" results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')\n",
|
||||
" return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def learn_sample(sample):\n",
|
||||
" device = 'cuda'\n",
|
||||
" lr = 4e-3\n",
|
||||
" epochs = 3\n",
|
||||
" accum_steps = 16\n",
|
||||
" batch_size = 1\n",
|
||||
"\n",
|
||||
" s = sample['text']\n",
|
||||
" first_half = s[:len(s)//2]\n",
|
||||
" second_half = s[len(s)//2:]\n",
|
||||
" ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
|
||||
" ds_val = Dataset.from_dict(tokenizer([second_half]))\n",
|
||||
"\n",
|
||||
" os.environ['CUDA_VISIBLE_DEVICES']=\"1\"\n",
|
||||
" verbose = False\n",
|
||||
" model = reset_model(base_model)\n",
|
||||
" eval(model, tokenizer, second_half)\n",
|
||||
" trainer = transformers.Trainer(\n",
|
||||
" model=model,\n",
|
||||
" train_dataset=ds_train,\n",
|
||||
" eval_dataset=ds_val,\n",
|
||||
" args=transformers.TrainingArguments(\n",
|
||||
" per_device_train_batch_size=batch_size,\n",
|
||||
" gradient_accumulation_steps=8,\n",
|
||||
" warmup_steps=0,\n",
|
||||
" max_steps=40,\n",
|
||||
" learning_rate=3e-4,\n",
|
||||
" fp16=True,\n",
|
||||
" logging_steps=1,\n",
|
||||
" output_dir=\"outputs\",\n",
|
||||
" log_level='error',\n",
|
||||
" disable_tqdm=not verbose,\n",
|
||||
" ),\n",
|
||||
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
|
||||
" )\n",
|
||||
" trainer._signature_columns = ['input_ids', 'attention_mask', 'label_ids']\n",
|
||||
" model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
|
||||
" train_output = trainer.train()\n",
|
||||
"\n",
|
||||
" if verbose:\n",
|
||||
" df_hist = pd.DataFrame(trainer.state.log_history)\n",
|
||||
" df_hist_epoch = df_hist.groupby('epoch').last().dropna(axis=1).drop(columns=['step'])\n",
|
||||
" df_hist_step = df_hist.set_index('step').dropna(thresh=2, axis=1)\n",
|
||||
" for c in df_hist_epoch.columns:\n",
|
||||
" df_hist_epoch[[c]].plot()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" result = eval(model, tokenizer, second_half)\n",
|
||||
" return result\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = []\n",
|
||||
"for sample in samples:\n",
|
||||
" print(sample['name'])\n",
|
||||
" r = learn_sample(sample)\n",
|
||||
" print(r)\n",
|
||||
" data.append(dict(**r, **sample))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_res = pd.DataFrame(data)\n",
|
||||
"df_res = df_res[['before', 'after', 'name', 'in_training']]\n",
|
||||
"df_res['improvement'] = df_res['before'] - df_res['after']\n",
|
||||
"df_res"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# DEBUG"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import display, HTML, Markdown\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"@torch.no_grad()\n",
|
||||
"def gen(model, inputs, tokenizer, clean=True):\n",
|
||||
" s = model.generate(\n",
|
||||
" input_ids=inputs[\"input_ids\"][None, :].to(model.device),\n",
|
||||
" attention_mask=inputs[\"attention_mask\"][None, :].to(model.device),\n",
|
||||
" use_cache=False,\n",
|
||||
" max_new_tokens=100,\n",
|
||||
" min_new_tokens=100,\n",
|
||||
" do_sample=False,\n",
|
||||
" early_stopping=False,\n",
|
||||
" )\n",
|
||||
" input_l = inputs[\"input_ids\"].shape[0]\n",
|
||||
" tokenizer_kwargs=dict(clean_up_tokenization_spaces=clean, skip_special_tokens=clean)\n",
|
||||
" old = tokenizer.decode(\n",
|
||||
" s[0, :input_l], **tokenizer_kwargs\n",
|
||||
" )\n",
|
||||
" new = tokenizer.decode(\n",
|
||||
" s[0, input_l:], **tokenizer_kwargs\n",
|
||||
" )\n",
|
||||
" s_old = \"\"+old.replace('\\n', '<br>')\n",
|
||||
" s_new = '<b>' + new.replace('\\n', '<br>')+ '<br><br><b/>'\n",
|
||||
" display(HTML(f\"{s_old}{s_new}\"))\n",
|
||||
" # print([old, new])\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"samples = samples[1]\n",
|
||||
"\n",
|
||||
"s = sample['text']\n",
|
||||
"first_half = s[:len(s)//2]\n",
|
||||
"second_half = s[len(s)//2:]\n",
|
||||
"ds_train = Dataset.from_dict(tokenizer([first_half]))\n",
|
||||
"ds_val = Dataset.from_dict(tokenizer([second_half]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with model.disable_adapter():\n",
|
||||
" gen(model, ds_train.with_format('pt')[0], tokenizer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gen(model, ds_train.with_format('pt')[0], tokenizer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0rc1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user