diff --git a/model/supervised_finetuning/README.md b/model/supervised_finetuning/README.md index d5b10e01..387e91e4 100644 --- a/model/supervised_finetuning/README.md +++ b/model/supervised_finetuning/README.md @@ -1,62 +1,18 @@ # Train using supervised examples -Requirements +## Requirements -``` -wandb -evaluate -datasets -transformers -torch -``` +`pip install -r requirements.txt` -Start training reward model +Start training SFT model ```bash -python trainer.py --configs defaults galactica-125 +python trainer.py --configs defaults galactica-125m ``` -## Dataset - -For now we only support webgpt and summary dataset from OpenAI. Once -open-asisstant dataset are available it will be added here. - -## Model - -Normally you should be able to add new models in configs/config.yml - -``` -your-model-name: - learning_rate: 2e-6 - model_name: - weight_decay: 0.01 - max_length: 812 - warmup_steps: 600 - gradient_checkpointing: false - gradient_accumulation_steps: 5 - per_device_train_batch_size: 4 - per_device_eval_batch_size: 4 -``` - -``` -python trainer.py --configs defaults your-model-name -``` - -However, if the model of your choice doesn't have pad_token, eos_token, -sep_token, you have to update utils.py `get_tokenizer` to use the right token. - -## Deepspeed support - -You can edit the configs/zero_config.json and use any stage you wish. The -current config uses zero-stage 3. For more details on how to setup the config -checkout [this page](https://www.deepspeed.ai/tutorials/zero/) - -Once you are satisfy with your deepzero config, you can add --deepspeed flag at -the end to trigger deepspeed - -``` -python trainer.py --configs defaults your-model-name --deepspeed -``` +For `wandb`: update the `entity` argument in `trainer.py`'s call to `wandb.init` +to be your weights and biases username per +[docs](https://docs.wandb.ai/ref/python/init). 
## Dataset choices @@ -80,6 +36,74 @@ Currently only these languages are supported via prompt translation: ar,de,fr,en,it,nl,tr,ru,ms,ko,ja,zh ``` +## Dataset sub-sampling + +We can subsample the **training** data by passing either the `fraction` or +`size` argument in the `configs/config.yaml` file. Don't forget the additional +colon ":" after the dataset name when doing this. + +Example: + +``` + datasets: + - webgpt: + fraction : 0.05 + - prompt_dialogue: + size : 500 + - adversarial_qa + - trivia_qa_nocontext +``` + +In this example, per epoch we will use: + +- A random 5% of `webgpt`; +- A random 500 examples from `prompt_dialogue`; +- All examples from datasets for which we don't specify the `fraction` or `size` + argument. + +In the above example, per epoch we'll use a different 5% from `webgpt` and a +different 500 examples from `prompt_dialogue`. + +This works with `torch.distributed`. + +## Model + +Normally you should be able to add new models in `configs/config.yaml`: + +``` +your-model-name: + learning_rate: 2e-6 + model_name: + weight_decay: 0.01 + max_length: 812 + warmup_steps: 600 + gradient_checkpointing: false + gradient_accumulation_steps: 5 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 +``` + +``` +python trainer.py --configs defaults your-model-name +``` + +However, if the model of your choice doesn't have `pad_token`, `eos_token`, +`sep_token`, you have to update `get_tokenizer` in `utils.py` to use the right +token. + +## Deepspeed support + +You can edit `configs/zero_config.json` and use any stage you wish. The +current config uses ZeRO stage 3. For more details on how to set up the config, +check out [this page](https://www.deepspeed.ai/tutorials/zero/). 
+ +Once you are satisfied with your DeepSpeed config, you can add the `--deepspeed` flag at +the end to enable DeepSpeed: + +``` +python trainer.py --configs defaults your-model-name --deepspeed +``` + ## Results Experimental results in wandb @@ -87,7 +111,7 @@ Experimental results in wandb ## TODOS -- decide on a model +- Decide on a model - Merge utils etc with reward model - Casual Modelling for GPT-JT does not leverage the bidirectional mask for the prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1) diff --git a/model/supervised_finetuning/configs/config.yaml b/model/supervised_finetuning/configs/config.yaml index d70fad41..79e4751d 100644 --- a/model/supervised_finetuning/configs/config.yaml +++ b/model/supervised_finetuning/configs/config.yaml @@ -17,7 +17,7 @@ defaults: freeze_layer: datasets: - webgpt - - prompt_dialogue + # - prompt_dialogue - squad_v2 - adversarial_qa - trivia_qa_nocontext diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 5faa22e6..092b7743 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -219,7 +219,7 @@ class SODA(Dataset): return pairs - def __init__(self, cache_dir, max_sample_size=10000, input_max_length=1024) -> None: + def __init__(self, cache_dir, input_max_length=1024) -> None: super().__init__() self.pairs = [] @@ -230,9 +230,6 @@ class SODA(Dataset): if len(prompt) < input_max_length: self.pairs.append((prompt, answer)) - if len(self.pairs) > max_sample_size: - break - def __len__(self): return len(self.pairs) diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index f9a71a8e..008de751 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ b/model/supervised_finetuning/custom_datasets/translation.py @@ -100,8 +100,6 @@ class 
WMT2019(TranslationPair): else: # translating in reverse direction source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) self.pairs.append((source, row[src])) - if len(self.pairs) > 100000: - break class DiveMT(TranslationPair): diff --git a/model/supervised_finetuning/requirements.txt b/model/supervised_finetuning/requirements.txt index 8f8cc63c..95e5a472 100644 --- a/model/supervised_finetuning/requirements.txt +++ b/model/supervised_finetuning/requirements.txt @@ -4,7 +4,6 @@ datasets==2.8.0 deepspeed==0.7.7 evaluate==0.4.0 gdown -mpi4py==3.1.4 nltk==3.8.1 numpy>=1.22.4 py7zr @@ -12,3 +11,4 @@ PyYAML>=6.0 scikit_learn==1.2.0 torch>=1.11.0 transformers==4.25.1 +wandb diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index 83034d95..fb5d4bee 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -9,7 +9,7 @@ from efficiency_utils import fuse_gelu from torch import nn from transformers import PreTrainedModel, Trainer, TrainingArguments from transformers.training_args import OptimizerNames -from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls +from utils import PerDatasetSampler, get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls def compute_metrics(eval_pred, preprocess_fns, metrics): @@ -31,6 +31,7 @@ class SFTTrainer(Trainer): self, model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, + sampler: torch.utils.data.sampler.Sampler = None, loss_function: str = "CrossEntropyLoss", poly_eps: float = 1.0, **kwargs, @@ -39,6 +40,7 @@ class SFTTrainer(Trainer): # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct self.loss_fct = get_loss(loss_function, poly_eps) + self.sampler = sampler def compute_loss(self, model, inputs, return_outputs=False): labels_mask = inputs.pop("label_masks") @@ -89,6 +91,32 @@ class SFTTrainer(Trainer): return (loss, 
logits, labels) + def get_train_dataloader(self): + """Inject custom data sampling behaviour into training loop""" + if self.sampler is None: + torch.utils.data.DataLoader( + self.train_dataset, + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + collate_fn=self.data_collator, + ) + else: + dataloader = torch.utils.data.DataLoader( + self.train_dataset, + batch_size=self.args.per_device_train_batch_size, + sampler=self.sampler, + collate_fn=self.data_collator, + ) + if torch.cuda.device_count() <= 1: + return dataloader + else: + # Not strictly necessary to use accelerate, currently just + # ensures batches are padded to be divisible by # devices + from accelerate import Accelerator + + accelerator = Accelerator() + return accelerator.prepare(dataloader) + def _strtobool(x): return bool(strtobool(x)) @@ -142,8 +170,8 @@ if __name__ == "__main__": model = get_model(training_conf, tokenizer) train, evals, collate_fn = get_dataset(training_conf, tokenizer) + sampler = PerDatasetSampler.build_sampler_from_config(training_conf, train.datasets) metrics, preprocess_fns = get_metrics(training_conf, tokenizer) - optimizer = OptimizerNames.ADAMW_BNB if training_conf.quantization else OptimizerNames.ADAMW_HF if training_conf.quantization: @@ -181,7 +209,6 @@ if __name__ == "__main__": ) assert len(evals) > 0 - if not training_conf.deepspeed or training_conf.local_rank == 0: import wandb @@ -192,8 +219,9 @@ if __name__ == "__main__": ) trainer = SFTTrainer( - model, - args, + model=model, + args=args, + sampler=sampler, loss_function=training_conf.loss_fn, poly_eps=training_conf.poly_eps, train_dataset=train, diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index c3b8264f..380bca8e 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -1,11 +1,8 @@ -# from functools import partial +import random from pathlib import Path -from typing import NamedTuple +from typing import List, 
class PerDatasetSampler(Sampler):
    """Sampler which returns a fixed number of samples per dataset, per epoch.

    Example:

    Dataset 1 has 10,000 examples and we want 200 per epoch
    Dataset 2 has 500 examples and we want all 500 per epoch

    Epoch size will be 700 and every epoch we'll sample a different
    200 from dataset 1.

    Parameters
    ----------
    dataset_sizes : List[int]
        A list with the size of each dataset.
    dataset_size_per_epoch : List[int]
        How many examples to get from each dataset per epoch.

    Raises
    ------
    ValueError
        If the two lists do not have the same length.

    Note: dataset_sizes & dataset_size_per_epoch must be in the same order.
    Further the examples in the underlying torch.utils.data.Dataset
    must be ordered as dataset_1, dataset_2, ..., dataset_n. This is fine
    if we concatenate a bunch of datasets together
    e.g. using torch.utils.data.ConcatDataset which is current behaviour.
    """

    def __init__(self, dataset_sizes: List[int], dataset_size_per_epoch: List[int]):
        if len(dataset_sizes) != len(dataset_size_per_epoch):
            raise ValueError(
                "dataset_sizes and dataset_size_per_epoch must have the same length, "
                f"got {len(dataset_sizes)} and {len(dataset_size_per_epoch)}"
            )
        self.dataset_sizes = dataset_sizes
        self.dataset_size_per_epoch = dataset_size_per_epoch
        self.num_datasets = len(dataset_sizes)

    def __iter__(self):
        """Yield a freshly sampled, shuffled list of global indices for one epoch."""
        epoch_idx = []
        offset = 0  # global index of the first example of the current dataset
        for size, per_epoch in zip(self.dataset_sizes, self.dataset_size_per_epoch):
            # Sample without replacement inside this dataset's index window.
            epoch_idx.extend(random.sample(range(offset, offset + size), per_epoch))
            offset += size
        # Mix the datasets together so batches are not grouped by dataset.
        random.shuffle(epoch_idx)
        return iter(epoch_idx)

    def __len__(self):
        """Number of indices produced per epoch."""
        return int(sum(self.dataset_size_per_epoch))

    @classmethod
    def build_sampler_from_config(cls, training_conf, datasets):
        """Create a sampler from ``training_conf.datasets`` and the matching datasets.

        ``datasets`` must be in the same order as ``training_conf.datasets``;
        only ``len()`` of each element is used.
        """
        dataset_sizes = [len(x) for x in datasets]
        fractions = get_dataset_fractions(training_conf.datasets, dataset_sizes)
        dataset_size_per_epoch = [_examples_per_epoch(size, frac) for size, frac in zip(dataset_sizes, fractions)]
        return cls(dataset_sizes, dataset_size_per_epoch)


def _examples_per_epoch(dataset_size, fraction):
    """Turn a fraction of a dataset into a whole number of examples.

    A fraction derived from ``size: k`` is ``k / dataset_size``; multiplying it
    back can land a hair below ``k`` (e.g. ``49 * (1 / 49)`` is
    ``0.9999999999999999``), which plain ``int()`` would truncate to ``k - 1``.
    A tiny tolerance is added before truncating, and the result is capped at
    ``dataset_size``.
    """
    return min(dataset_size, int(dataset_size * fraction + 1e-6))


def get_dataset_fractions(conf, dataset_sizes):
    """Calculate fraction of each dataset to use per epoch when subsampling.

    Parameters
    ----------
    conf :
        Iterable of dataset entries. Each entry is either a plain dataset name
        (use everything) or a one-key dict ``{name: {"fraction": f}}`` /
        ``{name: {"size": n}}``.
    dataset_sizes :
        Number of examples of each dataset, in the same order as ``conf``.

    Returns
    -------
    list
        One fraction per entry of ``conf``; values above 1 are clamped to 1.

    Raises
    ------
    ValueError
        If a dict entry has neither ``fraction`` nor ``size`` (including an
        entry with a trailing colon but no options), if ``fraction <= 0``, or
        if ``size`` is negative or larger than the dataset.
    """
    fractions = []
    for i, data_config in enumerate(conf):
        if not isinstance(data_config, dict):
            fractions.append(1)
            continue

        dataset_name = get_dataset_name_from_data_config(data_config)
        # `- name:` with nothing below it is parsed by YAML as {name: None};
        # treat that like "no options given" instead of crashing on `in None`.
        options = data_config[dataset_name] or {}

        if "fraction" in options:
            fraction = options["fraction"]
            if fraction <= 0:
                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
            fractions.append(min(1, fraction))
        elif "size" in options:
            size = options["size"]
            if size < 0:
                raise ValueError("Please specify size as a non-negative number of examples")
            if size > dataset_sizes[i]:
                raise ValueError(f"Please specify a size smaller than number of examples: {dataset_sizes[i]:,.0f}")
            fractions.append(size / dataset_sizes[i])
        else:
            raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
    return fractions


def get_dataset_name_from_data_config(data_config):
    """Return the dataset name for a config entry.

    An entry is either the name itself or a one-key dict whose key is the name.
    """
    if isinstance(data_config, dict):
        return list(data_config.keys())[0]
    return data_config