From b67181776a99cdeaac3de52e0a8ce7d543c2de0d Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Fri, 6 Jan 2023 16:47:58 +0000 Subject: [PATCH] [feature] add deepspeed, rallio dialogue dataset and codegen parameters --- .../supervised_finetuning/configs/config.yaml | 14 +++- .../configs/zero_config.json | 52 +++++++++++++ .../custom_datasets/__init__.py | 5 ++ .../custom_datasets/dialogue_collator.py | 1 + .../custom_datasets/prompt_dialogue.py | 77 +++++++++++++++++++ model/supervised_finetuning/losses.py | 2 +- model/supervised_finetuning/trainer.py | 58 ++++---------- model/supervised_finetuning/utils.py | 11 +-- 8 files changed, 169 insertions(+), 51 deletions(-) create mode 100644 model/supervised_finetuning/configs/zero_config.json create mode 100644 model/supervised_finetuning/custom_datasets/prompt_dialogue.py diff --git a/model/supervised_finetuning/configs/config.yaml b/model/supervised_finetuning/configs/config.yaml index 29086395..2ceb1388 100644 --- a/model/supervised_finetuning/configs/config.yaml +++ b/model/supervised_finetuning/configs/config.yaml @@ -6,7 +6,7 @@ defaults: per_device_eval_batch_size: 2 weight_decay: 0.00 warmup_steps: 600 - eval_steps: 200 + eval_steps: 100 save_steps: 500 max_length: 512 num_train_epochs: 3 @@ -17,6 +17,7 @@ defaults: freeze_layer: datasets: - webgpt + - prompt_dialogue cache_dir: ~/.cache loss_fn: CrossEntropyLoss eval_size: @@ -43,6 +44,17 @@ gpt-jt: per_device_train_batch_size: 4 per_device_eval_batch_size: 4 +codegen: + learning_rate: 2e-6 + model_name: Salesforce/codegen-2B-multi + weight_decay: 0.01 + max_length: 812 + warmup_steps: 600 + gradient_checkpointing: false + gradient_accumulation_steps: 5 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 + debug: eval_steps: 20 eval_size: 100 diff --git a/model/supervised_finetuning/configs/zero_config.json b/model/supervised_finetuning/configs/zero_config.json new file mode 100644 index 00000000..e196d6a0 --- /dev/null +++ b/model/supervised_finetuning/configs/zero_config.json @@ -0,0 +1,52 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": 2.0, + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index 7e3bdc79..60462186 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -2,6 +2,8 @@ from datasets import load_dataset from sklearn.model_selection import train_test_split from torch.utils.data import Dataset, Subset +from .prompt_dialogue import PromptGeneratedDataset + QA_SPECIAL_TOKENS = {"Question": "", "Answer": ""} @@ -63,6 +65,9 @@ def get_one_dataset(conf, dataset_name): elif dataset_name == "webgpt": dataset = WebGPT() train, eval = train_val_dataset(dataset, val_split=0.2) + elif dataset_name == "prompt_dialogue": + dataset = PromptGeneratedDataset() + train, eval = train_val_dataset(dataset, val_split=0.2) else: raise ValueError(f"Unknown dataset {dataset_name}") diff --git a/model/supervised_finetuning/custom_datasets/dialogue_collator.py b/model/supervised_finetuning/custom_datasets/dialogue_collator.py index f9e1bb5e..535b9007 100644 --- a/model/supervised_finetuning/custom_datasets/dialogue_collator.py +++ b/model/supervised_finetuning/custom_datasets/dialogue_collator.py @@ -81,6 +81,7 @@ class DialogueDataCollator: batch["label_masks"] = torch.stack([F.pad(torch.tensor(x), (0, dim - len(x))) for x in label_masks]) + # why the fuck? for k in list(batch.keys()): if k not in ["input_ids", "attention_mask", "label_masks"]: batch.pop(k) diff --git a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py new file mode 100644 index 00000000..07f32266 --- /dev/null +++ b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py @@ -0,0 +1,77 @@ +import os +from urllib.request import urlopen + +from torch.utils.data import Dataset + + +class PromptGeneratedDataset(Dataset): + """Generates from flan 11B + User: What are the best methods for preventing a slave trade? + + Rosey: The best methods .... + <|endoftext|> + + we are ignoring results with multiple lines for now + """ + + url = "https://github.com/Rallio67/language-model-agents/raw/main/chat_dialogue_v2_c.txt" + + def __init__(self) -> None: + super().__init__() + os.makedirs("datasets", exist_ok=True) + chat_dialogue = os.path.join("datasets", "chat_dialogue_v2_c.txt") + if not os.path.exists(chat_dialogue): + with urlopen(self.url) as file: + content = file.read().decode() + with open(chat_dialogue, "w") as fout: + fout.write(content) + + question = "" + answer = "" + self.pairs = [] + with open(chat_dialogue, "r") as f: + + for line in f: + try: + line = line.rstrip() + if len(line) == 0: + continue + elif line == "<|endoftext|>" and len(question) > 0 and len(answer) > 0: + self.pairs.append((question, answer)) + question = "" + answer = "" + elif line[:4] == "User": + question = line.split(":", maxsplit=1)[1] + elif line == "Rosey:": + question = "" + answer = "" + elif len(line) > 4: # should be bot answer + answer = line.split(":", maxsplit=1)[1] + except IndexError: + question = "" + answer = "" + + if len(question) > 0 and len(answer) > 0: + self.pairs.append((question, answer)) + + def __len__(self): + return len(self.pairs) + + def __getitem__(self, index): + question, answer = self.pairs[index] + return question, answer + + +if __name__ == "__main__": + from torch.utils.data import DataLoader + from transformers import AutoTokenizer + + from .dialogue_collator import DialogueDataCollator + + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi") + tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) + dataset = PromptGeneratedDataset() + collate_fn = DialogueDataCollator(tokenizer, padding=True, max_length=128) + dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=5) + for batch in dataloader: + print(batch["input_ids"].shape) diff --git a/model/supervised_finetuning/losses.py b/model/supervised_finetuning/losses.py index 795396b9..0cc639cf 100644 --- a/model/supervised_finetuning/losses.py +++ b/model/supervised_finetuning/losses.py @@ -7,7 +7,7 @@ class CrossEntropyLoss(nn.CrossEntropyLoss): def forward(self, input, target, mask=None): if mask is not None: - mask = mask.view(-1) + mask = mask.view(-1).bool() input = input.view(-1, input.size(-1)) target = target.view(-1) input = input[mask] diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index dc7b5934..2e59edcc 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -1,32 +1,17 @@ import argparse import os -from dataclasses import dataclass from distutils.util import strtobool -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import nn from torch.utils.data import Dataset -from transformers import ( - DataCollator, - EvalPrediction, - PreTrainedModel, - PreTrainedTokenizerBase, - Trainer, - TrainerCallback, - TrainingArguments, - get_cosine_schedule_with_warmup, -) +from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls os.environ["WANDB_PROJECT"] = "supervised-finetuning" -@dataclass -class CustomTrainingArguments(TrainingArguments): - loss_function: str = "CrossEntropyLoss" - - def compute_metrics(eval_pred): pred_ids = eval_pred.predictions labels = eval_pred.label_ids @@ -44,32 +29,13 @@ class SFTTrainer(Trainer): self, model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None, + loss_function: str = "CrossEntropyLoss", + **kwargs, ): - super().__init__( - model, - args, - data_collator, - train_dataset, - eval_dataset, - tokenizer, - model_init, - compute_metrics, - callbacks, - optimizers, - preprocess_logits_for_metrics, - ) + super().__init__(model, args, **kwargs) # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct - self.loss_fct = get_loss(args.loss_function) + self.loss_fct = get_loss(loss_function) def fetch_scheduler(self): return get_cosine_schedule_with_warmup( @@ -131,6 +97,10 @@ def _strtobool(x): def argument_parsing(notebook=False, notebook_args=None): parser = argparse.ArgumentParser() parser.add_argument("--configs", nargs="+", required=True) + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--deepspeed", action="store_true") + parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false") + parser.set_defaults(deepspeed=False) if notebook: args, remaining = parser.parse_known_args(notebook_args) @@ -147,6 +117,8 @@ def argument_parsing(notebook=False, notebook_args=None): else: conf.update(configs[name]) + conf["local_rank"] = args.local_rank + conf["deepspeed"] = args.deepspeed # Override config from command-line parser = argparse.ArgumentParser() for key, value in conf.items(): @@ -166,13 +138,14 @@ if __name__ == "__main__": train, evals, collate_fn = get_dataset(training_conf, tokenizer) - args = CustomTrainingArguments( + args = TrainingArguments( output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned", num_train_epochs=training_conf.num_train_epochs, warmup_steps=training_conf.warmup_steps, - loss_function=training_conf.loss_fn, learning_rate=float(training_conf.learning_rate), + deepspeed="configs/zero_config.json" if training_conf.deepspeed else None, fp16=True, + local_rank=training_conf.local_rank, gradient_checkpointing=training_conf.gradient_checkpointing, gradient_accumulation_steps=training_conf.gradient_accumulation_steps, per_device_train_batch_size=training_conf.per_device_train_batch_size, @@ -192,6 +165,7 @@ if __name__ == "__main__": trainer = SFTTrainer( model, args, + loss_function=training_conf.loss_fn, train_dataset=train, eval_dataset=evals, data_collator=collate_fn, diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index a31f74d3..7147dcb9 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -8,14 +8,16 @@ from sklearn.model_selection import train_test_split from torch.utils.data import ConcatDataset, Subset from transformers import AutoModelForCausalLM, AutoTokenizer -SUPPORTED_MODELS = ["galactica", "GPT-JT"] # deprecated .. - def get_tokenizer(conf): tokenizer = AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir) if "galactica" in conf.model_name: tokenizer.add_special_tokens({"pad_token": "", "eos_token": ""}) + elif "GPT-JT" in conf.model_name: + tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"}) + elif "codegen" in conf.model_name: + tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) additional_special_tokens = ( [] @@ -30,11 +32,6 @@ def get_tokenizer(conf): def get_model(conf, tokenizer): - if not any([x in conf.model_name for x in SUPPORTED_MODELS]): - raise ValueError( - f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. " - "To include more make sure the masking is dne correctly... (decoder only supported for now)" - ) model = AutoModelForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)