From 88ee3b32644ee3f8a249dc281150f86505490b5e Mon Sep 17 00:00:00 2001 From: Sotirios Anagnostidis Date: Fri, 6 Jan 2023 21:28:26 +0100 Subject: [PATCH] merge deepspeed --- model/supervised_finetuning/README.md | 35 +++- .../supervised_finetuning/configs/config.yaml | 19 +- .../custom_datasets/__init__.py | 10 +- .../custom_datasets/prompt_dialogue.py | 66 +++++++ model/supervised_finetuning/losses.py | 2 +- model/supervised_finetuning/trainer.py | 171 ++++++------------ model/supervised_finetuning/utils.py | 10 +- 7 files changed, 181 insertions(+), 132 deletions(-) create mode 100644 model/supervised_finetuning/custom_datasets/prompt_dialogue.py diff --git a/model/supervised_finetuning/README.md b/model/supervised_finetuning/README.md index 014afa95..bd202397 100644 --- a/model/supervised_finetuning/README.md +++ b/model/supervised_finetuning/README.md @@ -23,7 +23,40 @@ open-asisstant dataset are available it will be added here. ## Model -TBD +Normally you should be able to add new models in configs/config.yaml: + +``` +your-model-name: + learning_rate: 2e-6 + model_name: + weight_decay: 0.01 + max_length: 812 + warmup_steps: 600 + gradient_checkpointing: false + gradient_accumulation_steps: 5 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 +``` + +``` +python trainer.py --configs defaults your-model-name +``` + +However, if the model of your choice doesn't define pad_token, eos_token, or +sep_token, you have to update `get_tokenizer` in utils.py to set the right tokens. + +## DeepSpeed support + +You can edit configs/zero_config.json and use any ZeRO stage you wish. The +current config uses ZeRO stage 3. 
For more details on how to set up the config, +check out [this page](https://www.deepspeed.ai/tutorials/zero/). + +Once you are satisfied with your DeepSpeed ZeRO config, you can add the --deepspeed flag at +the end to enable DeepSpeed: + +``` +python trainer.py --configs defaults your-model-name --deepspeed +``` ## Results diff --git a/model/supervised_finetuning/configs/config.yaml b/model/supervised_finetuning/configs/config.yaml index 64c024c2..fb2bdaa0 100644 --- a/model/supervised_finetuning/configs/config.yaml +++ b/model/supervised_finetuning/configs/config.yaml @@ -6,7 +6,7 @@ defaults: per_device_eval_batch_size: 2 weight_decay: 0.00 warmup_steps: 600 - eval_steps: 200 + eval_steps: 100 save_steps: 500 max_length: 512 num_train_epochs: 3 @@ -17,6 +17,7 @@ defaults: freeze_layer: datasets: - webgpt + - prompt_dialogue cache_dir: ~/.cache loss_fn: CrossEntropyLoss eval_size: @@ -44,11 +45,21 @@ gpt-jt: per_device_train_batch_size: 4 per_device_eval_batch_size: 4 +codegen: + learning_rate: 2e-6 + model_name: Salesforce/codegen-2B-multi + weight_decay: 0.01 + max_length: 812 + warmup_steps: 600 + gradient_checkpointing: false + gradient_accumulation_steps: 5 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 + debug: eval_steps: 20 eval_size: 100 - model_name: EleutherAI/gpt-j-6B - gradient_accumulation_steps: 2 + gradient_accumulation_steps: 1 per_device_train_batch_size: 1 per_device_eval_batch_size: 1 - quantization: 8bit \ No newline at end of file + quantization: \ No newline at end of file diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index 7e3bdc79..5706bfa7 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -2,6 +2,8 @@ from datasets import load_dataset from sklearn.model_selection import train_test_split from torch.utils.data import Dataset, Subset +from .prompt_dialogue import 
PromptGeneratedDataset + QA_SPECIAL_TOKENS = {"Question": "", "Answer": ""} @@ -14,8 +16,8 @@ class SquadV2Dataset(Dataset): def __getitem__(self, idx): data = self.dataset[idx] - # dummy return first answer - return "".join([data["title"], ". ", data["context"], " " + data["question"]]), data["answers"]["text"][0] + # return first answer form list of possible answers + return data["title"] + ". " + data["context"] + " " + data["question"], data["answers"]["text"][0] class WebGPT(Dataset): @@ -57,12 +59,14 @@ def get_one_dataset(conf, dataset_name): dataset_name = dataset_name.lower() if dataset_name == "squadv2": - raise ValueError("SquadV2 is not diverse enough for generation .. ") train = SquadV2Dataset(conf.cache_dir, "train") eval = SquadV2Dataset(conf.cache_dir, "validation") elif dataset_name == "webgpt": dataset = WebGPT() train, eval = train_val_dataset(dataset, val_split=0.2) + elif dataset_name == "prompt_dialogue": + dataset = PromptGeneratedDataset() + train, eval = train_val_dataset(dataset, val_split=0.2) else: raise ValueError(f"Unknown dataset {dataset_name}") diff --git a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py new file mode 100644 index 00000000..17911141 --- /dev/null +++ b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py @@ -0,0 +1,66 @@ +import os +from urllib.request import urlopen + +from torch.utils.data import Dataset + + +class PromptGeneratedDataset(Dataset): + """Generates from flan 11B + User: What are the best methods for preventing a slave trade? + + Rosey: The best methods .... 
+ <|endoftext|> + + we are ignoring results with multiple lines for now + """ + + url = "https://github.com/Rallio67/language-model-agents/raw/main/chat_dialogue_v2_c.txt" + + def __init__(self) -> None: + super().__init__() + os.makedirs("datasets", exist_ok=True) + chat_dialogue = os.path.join("datasets", "chat_dialogue_v2_c.txt") + if not os.path.exists(chat_dialogue): + with urlopen(self.url) as file: + content = file.read().decode() + with open(chat_dialogue, "w") as fout: + fout.write(content) + + question = "" + answer = "" + self.pairs = [] + with open(chat_dialogue, "r") as f: + corpus = f.read().split("<|endoftext|>") + for dialogue in corpus: + dialogue = dialogue.strip() + if "Rosey:" in dialogue: + user, bot = dialogue.split("Rosey:", maxsplit=1) + question = user.split(":", maxsplit=1)[1].strip() + answer = bot.strip() + if len(answer) and len(question): + self.pairs.append((question, answer)) + + if len(question) > 0 and len(answer) > 0: + self.pairs.append((question, answer)) + + def __len__(self): + return len(self.pairs) + + def __getitem__(self, index): + question, answer = self.pairs[index] + return question, answer + + +if __name__ == "__main__": + from torch.utils.data import DataLoader + from transformers import AutoTokenizer + + from .dialogue_collator import DialogueDataCollator + + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi") + tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) + dataset = PromptGeneratedDataset() + collate_fn = DialogueDataCollator(tokenizer, padding=True, max_length=128) + dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=5) + for batch in dataloader: + print(batch["input_ids"].shape) diff --git a/model/supervised_finetuning/losses.py b/model/supervised_finetuning/losses.py index 795396b9..0cc639cf 100644 --- a/model/supervised_finetuning/losses.py +++ b/model/supervised_finetuning/losses.py @@ -7,7 +7,7 @@ class 
CrossEntropyLoss(nn.CrossEntropyLoss): def forward(self, input, target, mask=None): if mask is not None: - mask = mask.view(-1) + mask = mask.view(-1).bool() input = input.view(-1, input.size(-1)) target = target.view(-1) input = input[mask] diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index 590f9bbd..bb77b9c3 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -1,34 +1,19 @@ import argparse import os -from dataclasses import dataclass from distutils.util import strtobool -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import nn from torch.utils.data import Dataset -from transformers import ( - DataCollator, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizerBase, - Trainer, - TrainerCallback, - TrainingArguments, - get_cosine_schedule_with_warmup, -) +from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup import bitsandbytes as bnb + from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls os.environ["WANDB_PROJECT"] = "supervised-finetuning" -@dataclass -class CustomTrainingArguments(TrainingArguments): - loss_function: str = "CrossEntropyLoss" - quantization: str = None - - def compute_metrics(eval_pred): pred_ids = eval_pred.predictions labels = eval_pred.label_ids @@ -46,35 +31,15 @@ class SFTTrainer(Trainer): self, model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: 
Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None, + loss_function: str = "CrossEntropyLoss", + **kwargs, ): - super().__init__( - model, - args, - data_collator, - train_dataset, - eval_dataset, - tokenizer, - model_init, - compute_metrics, - callbacks, - optimizers, - preprocess_logits_for_metrics, - ) + super().__init__(model, args, **kwargs) # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct - self.loss_fct = get_loss(args.loss_function) + self.loss_fct = get_loss(loss_function) def create_optimizer_and_scheduler(self, num_training_steps: int): - print("Optimizer") if self.args.quantization == "8bit": self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) else: @@ -82,7 +47,6 @@ class SFTTrainer(Trainer): self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay ) - print("lr sheduler") self.lr_scheduler = get_cosine_schedule_with_warmup( self.optimizer, num_warmup_steps=self.args.warmup_steps, @@ -93,16 +57,17 @@ class SFTTrainer(Trainer): def compute_loss(self, model, inputs, return_outputs=False): labels_mask = inputs.pop("label_masks") + targets = inputs.pop("targets") outputs = model(**inputs) - loss = self.loss_fct(outputs.get("logits"), torch.roll(inputs["input_ids"], -1, -1), mask=labels_mask) + loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask) return (loss, outputs) if return_outputs else loss def _compute_loss(self, model, inputs): - labels_mask = inputs.pop("label_masks") + targets = inputs.pop("targets") inputs = self._prepare_inputs(inputs) @@ -110,7 +75,6 @@ class SFTTrainer(Trainer): logits = outputs.get("logits") - targets = torch.roll(inputs["input_ids"], -1, -1) loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask) return loss, logits, targets, labels_mask @@ -142,12 +106,18 @@ 
def _strtobool(x): def argument_parsing(notebook=False, notebook_args=None): parser = argparse.ArgumentParser() parser.add_argument("--configs", nargs="+", required=True) + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--deepspeed", action="store_true") + parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false") + parser.set_defaults(deepspeed=False) if notebook: args, remaining = parser.parse_known_args(notebook_args) else: args, remaining = parser.parse_known_args() + print(args) + # Config from YAML conf = {} configs = read_yamls("./configs") @@ -158,6 +128,8 @@ def argument_parsing(notebook=False, notebook_args=None): else: conf.update(configs[name]) + conf["local_rank"] = args.local_rank + conf["deepspeed"] = args.deepspeed # Override config from command-line parser = argparse.ArgumentParser() for key, value in conf.items(): @@ -175,76 +147,41 @@ if __name__ == "__main__": tokenizer = get_tokenizer(training_conf) model = get_model(training_conf, tokenizer) - ### - from datasets import load_dataset - from bitsandbytes.optim import Adam8bit - from torch.nn import functional as F - from tqdm import tqdm + train, evals, collate_fn = get_dataset(training_conf, tokenizer) - gpt = model.to("cuda") + args = TrainingArguments( + output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned", + num_train_epochs=training_conf.num_train_epochs, + warmup_steps=training_conf.warmup_steps, + learning_rate=float(training_conf.learning_rate), + deepspeed="configs/zero_config.json" if training_conf.deepspeed else None, + fp16=True, + local_rank=training_conf.local_rank, + gradient_checkpointing=training_conf.gradient_checkpointing, + gradient_accumulation_steps=training_conf.gradient_accumulation_steps, + per_device_train_batch_size=training_conf.per_device_train_batch_size, + per_device_eval_batch_size=training_conf.per_device_eval_batch_size, + weight_decay=training_conf.weight_decay, + 
max_grad_norm=training_conf.max_grad_norm, + logging_steps=training_conf.logging_steps, + save_total_limit=training_conf.save_total_limit, + evaluation_strategy="steps", + eval_steps=training_conf.eval_steps, + save_steps=training_conf.save_steps, + eval_accumulation_steps=training_conf.eval_accumulation_steps, + report_to="wandb", + ) - gpt.gradient_checkpointing_enable() - - codeparrot = load_dataset("transformersbook/codeparrot-train", streaming=True, cache_dir=training_conf.cache_dir) - optimizer = Adam8bit(gpt.parameters(), lr=1e-5) - - with torch.cuda.amp.autocast(): - for row in tqdm(codeparrot["train"]): - if len(row["content"]) <= 1: - continue - - batch = tokenizer(row["content"], truncation=True, max_length=128, return_tensors="pt") - batch = {k: v.cuda() for k, v in batch.items()} - - out = gpt.forward( - **batch, - ) - - loss = F.cross_entropy( - out.logits[:, :-1, :].flatten(0, -2), batch["input_ids"][:, 1:].flatten(), reduction="mean" - ) - print(loss) - loss.backward() - - optimizer.step() - optimizer.zero_grad() - ### - - - # train, evals, collate_fn = get_dataset(training_conf, tokenizer) - # assert len(evals) > 0 - - # args = CustomTrainingArguments( - # output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned", - # num_train_epochs=training_conf.num_train_epochs, - # warmup_steps=training_conf.warmup_steps, - # loss_function=training_conf.loss_fn, - # learning_rate=float(training_conf.learning_rate), - # fp16=True, - # gradient_checkpointing=training_conf.gradient_checkpointing, - # gradient_accumulation_steps=training_conf.gradient_accumulation_steps, - # per_device_train_batch_size=training_conf.per_device_train_batch_size, - # per_device_eval_batch_size=training_conf.per_device_eval_batch_size, - # weight_decay=training_conf.weight_decay, - # max_grad_norm=training_conf.max_grad_norm, - # logging_steps=training_conf.logging_steps, - # save_total_limit=training_conf.save_total_limit, - # evaluation_strategy="steps", - # 
eval_steps=training_conf.eval_steps, - # save_steps=training_conf.save_steps, - # eval_accumulation_steps=training_conf.eval_accumulation_steps, - # report_to="wandb", - # quantization=training_conf.quantization, - # ) - - # trainer = SFTTrainer( - # model, - # args, - # train_dataset=train, - # eval_dataset=evals, - # data_collator=collate_fn, - # tokenizer=tokenizer, - # compute_metrics=compute_metrics, - # preprocess_logits_for_metrics=preprocess_logits_for_metrics, - # ) - # trainer.train() + assert len(evals) > 0 + trainer = SFTTrainer( + model, + args, + loss_function=training_conf.loss_fn, + train_dataset=train, + eval_dataset=evals, + data_collator=collate_fn, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + trainer.train() diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index c3cc512c..8f9ed5ca 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -15,6 +15,10 @@ def get_tokenizer(conf): if "galactica" in conf.model_name: tokenizer.add_special_tokens({"pad_token": "", "eos_token": ""}) + elif "GPT-JT" in conf.model_name: + tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"}) + elif "codegen" in conf.model_name: + tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) additional_special_tokens = ( [] @@ -29,12 +33,6 @@ def get_tokenizer(conf): def get_model(conf, tokenizer): - if not any([x in conf.model_name.lower() for x in SUPPORTED_MODELS]): - raise ValueError( - f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. " - "To include more make sure the masking is done correctly... (decoder only supported for now)" - ) - model = get_specific_model(conf.model_name, conf.cache_dir, conf.quantization) if len(tokenizer) != model.get_input_embeddings().num_embeddings: