Merge pull request #1368 from maw501/sft-data-sampling

SFT data sampling
This commit is contained in:
sanagnos
2023-02-09 09:22:13 +01:00
committed by GitHub
7 changed files with 195 additions and 71 deletions
+76 -52
View File
@@ -1,62 +1,18 @@
# Train using supervised examples
Requirements
## Requirements
```
wandb
evaluate
datasets
transformers
torch
```
`pip install -r requirements.txt`
Start training reward model
Start training SFT model
```bash
python trainer.py --configs defaults galactica-125
python trainer.py --configs defaults galactica-125m
```
## Dataset
For now we only support the webgpt and summary datasets from OpenAI. Once the
Open-Assistant dataset is available, it will be added here.
## Model
Normally you should be able to add new models in configs/config.yml
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't have pad_token, eos_token,
sep_token, you have to update utils.py `get_tokenizer` to use the right token.
## Deepspeed support
You can edit `configs/zero_config.json` and use any stage you wish. The
current config uses ZeRO stage 3. For more details on how to set up the config,
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
Once you are satisfied with your DeepSpeed ZeRO config, you can add the
`--deepspeed` flag at the end to enable DeepSpeed:
```
python trainer.py --configs defaults your-model-name --deepspeed
```
For `wandb`: update the `entity` argument in `trainer.py`'s call to `wandb.init`
to be your weights and biases username per
[docs](https://docs.wandb.ai/ref/python/init).
## Dataset choices
@@ -80,6 +36,74 @@ Currently only these languages are supported via prompt translation:
ar,de,fr,en,it,nl,tr,ru,ms,ko,ja,zh
```
## Dataset sub-sampling
We can subsample the **training** data by passing either the `fraction` or
`size` argument in the `configs/config.yml` file. Don't forget the additional
colon ":" after the dataset name when doing this.
Example:
```
datasets:
- webgpt:
fraction : 0.05
- prompt_dialogue:
size : 500
- adversarial_qa
- trivia_qa_nocontext
```
In this example, per epoch we will use:
- A random 5% of `webgpt`;
- A random 500 examples from `prompt_dialogue`;
- All examples from datasets for which we don't specify the `fraction` or `size`
argument.
In the above example, per epoch we'll use a different 5% from `webgpt` and a
different 500 examples from `prompt_dialogue`.
This works with `torch.distributed`.
## Model
Normally you should be able to add new models in `configs/config.yml`
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't have `pad_token`, `eos_token`,
`sep_token`, you have to update `get_tokenizer` in `utils.py` to use the right
token.
## Deepspeed support
You can edit `configs/zero_config.json` and use any stage you wish. The
current config uses ZeRO stage 3. For more details on how to set up the config,
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
Once you are satisfied with your DeepSpeed ZeRO config, you can add the
`--deepspeed` flag at the end to enable DeepSpeed:
```
python trainer.py --configs defaults your-model-name --deepspeed
```
## Results
Experimental results in wandb
@@ -87,7 +111,7 @@ Experimental results in wandb
## TODOS
- decide on a model
- Decide on a model
- Merge utils etc with reward model
- Causal Modelling for GPT-JT does not leverage the bidirectional mask for the
prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1)
@@ -17,7 +17,7 @@ defaults:
freeze_layer:
datasets:
- webgpt
- prompt_dialogue
# - prompt_dialogue
- squad_v2
- adversarial_qa
- trivia_qa_nocontext
@@ -219,7 +219,7 @@ class SODA(Dataset):
return pairs
def __init__(self, cache_dir, max_sample_size=10000, input_max_length=1024) -> None:
def __init__(self, cache_dir, input_max_length=1024) -> None:
super().__init__()
self.pairs = []
@@ -230,9 +230,6 @@ class SODA(Dataset):
if len(prompt) < input_max_length:
self.pairs.append((prompt, answer))
if len(self.pairs) > max_sample_size:
break
def __len__(self):
return len(self.pairs)
@@ -100,8 +100,6 @@ class WMT2019(TranslationPair):
else: # translating in reverse direction
source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt])
self.pairs.append((source, row[src]))
if len(self.pairs) > 100000:
break
class DiveMT(TranslationPair):
+1 -1
View File
@@ -4,7 +4,6 @@ datasets==2.8.0
deepspeed==0.7.7
evaluate==0.4.0
gdown
mpi4py==3.1.4
nltk==3.8.1
numpy>=1.22.4
py7zr
@@ -12,3 +11,4 @@ PyYAML>=6.0
scikit_learn==1.2.0
torch>=1.11.0
transformers==4.25.1
wandb
+33 -5
View File
@@ -9,7 +9,7 @@ from efficiency_utils import fuse_gelu
from torch import nn
from transformers import PreTrainedModel, Trainer, TrainingArguments
from transformers.training_args import OptimizerNames
from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls
from utils import PerDatasetSampler, get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls
def compute_metrics(eval_pred, preprocess_fns, metrics):
@@ -31,6 +31,7 @@ class SFTTrainer(Trainer):
self,
model: Union[PreTrainedModel, nn.Module] = None,
args: TrainingArguments = None,
sampler: torch.utils.data.sampler.Sampler = None,
loss_function: str = "CrossEntropyLoss",
poly_eps: float = 1.0,
**kwargs,
@@ -39,6 +40,7 @@ class SFTTrainer(Trainer):
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
self.loss_fct = get_loss(loss_function, poly_eps)
self.sampler = sampler
def compute_loss(self, model, inputs, return_outputs=False):
labels_mask = inputs.pop("label_masks")
@@ -89,6 +91,32 @@ class SFTTrainer(Trainer):
return (loss, logits, labels)
def get_train_dataloader(self):
    """Inject custom data sampling behaviour into training loop.

    Builds the training ``DataLoader`` from ``self.train_dataset`` using
    ``self.args.per_device_train_batch_size`` and ``self.data_collator``.
    When ``self.sampler`` is ``None`` the loader shuffles the dataset itself;
    otherwise index selection is delegated to ``self.sampler``.

    Returns
    -------
    The ``DataLoader`` itself when at most one CUDA device is visible,
    otherwise the result of ``accelerate.Accelerator().prepare(dataloader)``.
    """
    # `shuffle` and `sampler` are mutually exclusive DataLoader arguments, so
    # pick exactly one of them and build the loader in a single place. (The
    # no-sampler branch previously created a loader without keeping it, which
    # made the `return dataloader` below fail.)
    if self.sampler is None:
        sampling_kwargs = {"shuffle": True}
    else:
        sampling_kwargs = {"sampler": self.sampler}

    dataloader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=self.args.per_device_train_batch_size,
        collate_fn=self.data_collator,
        **sampling_kwargs,
    )

    if torch.cuda.device_count() <= 1:
        return dataloader

    # Not strictly necessary to use accelerate, currently just
    # ensures batches are padded to be divisible by # devices
    from accelerate import Accelerator

    accelerator = Accelerator()
    return accelerator.prepare(dataloader)
def _strtobool(x):
    """Return the result of ``strtobool(x)`` converted to a real ``bool``."""
    parsed = strtobool(x)
    return bool(parsed)
@@ -142,8 +170,8 @@ if __name__ == "__main__":
model = get_model(training_conf, tokenizer)
train, evals, collate_fn = get_dataset(training_conf, tokenizer)
sampler = PerDatasetSampler.build_sampler_from_config(training_conf, train.datasets)
metrics, preprocess_fns = get_metrics(training_conf, tokenizer)
optimizer = OptimizerNames.ADAMW_BNB if training_conf.quantization else OptimizerNames.ADAMW_HF
if training_conf.quantization:
@@ -181,7 +209,6 @@ if __name__ == "__main__":
)
assert len(evals) > 0
if not training_conf.deepspeed or training_conf.local_rank == 0:
import wandb
@@ -192,8 +219,9 @@ if __name__ == "__main__":
)
trainer = SFTTrainer(
model,
args,
model=model,
args=args,
sampler=sampler,
loss_function=training_conf.loss_fn,
poly_eps=training_conf.poly_eps,
train_dataset=train,
+83 -6
View File
@@ -1,11 +1,8 @@
# from functools import partial
import random
from pathlib import Path
from typing import NamedTuple
from typing import List, NamedTuple
import evaluate
# import nltk
# import numpy as np
import transformers
import yaml
from custom_datasets import get_one_dataset
@@ -15,6 +12,79 @@ from losses import CrossEntropyLoss, PolyLoss
from models import freeze_top_n_layers, get_specific_model
from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, Subset
from torch.utils.data.sampler import Sampler
class PerDatasetSampler(Sampler):
    """Sampler which returns a fixed number of samples per dataset, per epoch.

    Example:
        Dataset 1 has 10,000 examples and we want 200 per epoch
        Dataset 2 has 500 examples and we want all 500 per epoch

        Epoch size will be 700 and every epoch we'll sample a different
        200 from dataset 1.

    Parameters
    ----------
    dataset_sizes : List[int]
        A list with the size of each dataset.
    dataset_size_per_epoch : List[int]
        How many examples to get from each dataset per epoch.

    Raises
    ------
    ValueError
        If the two lists differ in length, or if a per-epoch count is negative
        or larger than the corresponding dataset.

    Note: dataset_sizes & dataset_size_per_epoch must be in the same order.
    Further the examples in the underlying torch.utils.data.Dataset
    must be ordered as dataset_1, dataset_2, ..., dataset_n. This is fine
    if we concatenate a bunch of datasets together
    e.g. using torch.utils.data.ConcatDataset which is current behaviour.
    """

    def __init__(self, dataset_sizes: List[int], dataset_size_per_epoch: List[int]):
        if len(dataset_sizes) != len(dataset_size_per_epoch):
            raise ValueError(
                "dataset_sizes and dataset_size_per_epoch must have the same length, "
                f"got {len(dataset_sizes)} and {len(dataset_size_per_epoch)}"
            )
        # Fail at construction time rather than in the middle of the first
        # epoch, where random.sample would reject the request anyway.
        for size, per_epoch in zip(dataset_sizes, dataset_size_per_epoch):
            if not 0 <= per_epoch <= size:
                raise ValueError(
                    f"Cannot sample {per_epoch} examples per epoch from a dataset of size {size}"
                )
        self.dataset_sizes = dataset_sizes
        self.dataset_size_per_epoch = dataset_size_per_epoch
        self.num_datasets = len(dataset_sizes)

    def __iter__(self):
        """Return an iterator over one epoch of freshly sampled, shuffled indices."""
        epoch_idx = []
        offset = 0  # index of the current dataset's first example in the concatenation
        for size, per_epoch in zip(self.dataset_sizes, self.dataset_size_per_epoch):
            # Sample without replacement inside this dataset's index range.
            epoch_idx.extend(random.sample(range(offset, offset + size), per_epoch))
            offset += size
        # Mix the datasets so batches are not grouped by dataset.
        random.shuffle(epoch_idx)
        return iter(epoch_idx)

    def __len__(self):
        """Return the number of indices produced per epoch."""
        return int(sum(self.dataset_size_per_epoch))

    @staticmethod
    def _examples_per_epoch(size, fraction):
        """Return ``floor(size * fraction)`` while tolerating float round-off.

        A fraction derived from an absolute size (``requested / size``) does not
        always multiply back exactly: ``100 * (29 / 100)`` evaluates to
        ``28.999999999999996``, which plain ``int()`` would truncate to 28.
        Rounding to 6 decimals first removes that noise and leaves genuine
        fractional results (e.g. 1.5) truncated as before.
        """
        return int(round(size * fraction, 6))

    @classmethod
    def build_sampler_from_config(cls, training_conf, datasets):
        """Build a sampler from ``training_conf.datasets`` and the given datasets.

        ``datasets`` must be in the same order as the entries of
        ``training_conf.datasets``; each element only needs to support ``len``.
        """
        dataset_sizes = [len(x) for x in datasets]
        fractions = get_dataset_fractions(training_conf.datasets, dataset_sizes)
        dataset_size_per_epoch = [
            cls._examples_per_epoch(size, frac) for size, frac in zip(dataset_sizes, fractions)
        ]
        return cls(dataset_sizes, dataset_size_per_epoch)
def get_dataset_fractions(conf, dataset_sizes):
    """Calculate fraction of each dataset to use per epoch when subsampling.

    Parameters
    ----------
    conf :
        The ``datasets`` list from the config. Each entry is either a plain
        dataset name (use the whole dataset) or a single-key mapping
        ``{dataset_name: {"fraction": f}}`` / ``{dataset_name: {"size": n}}``.
    dataset_sizes :
        Number of examples in each dataset, in the same order as ``conf``.

    Returns
    -------
    A list with one fraction per entry of ``conf``. A ``fraction`` above 1 is
    capped at 1; a ``size`` is converted to ``size / dataset_size``.

    Raises
    ------
    ValueError
        If a mapping entry has no options, has neither ``fraction`` nor
        ``size``, has a non-positive ``fraction`` or ``size``, or has a ``size``
        larger than the dataset.
    """
    missing_option_msg = "Please specify either fraction or size in config.yaml. See README for instructions."
    fractions = []
    for i, data_config in enumerate(conf):
        if not isinstance(data_config, dict):
            # Plain dataset name: no subsampling requested.
            fractions.append(1)
            continue

        # The entry maps the dataset name to its options; only the options
        # matter here. `- name:` with nothing underneath parses to None.
        options = next(iter(data_config.values()), None)
        if options is None:
            raise ValueError(missing_option_msg)

        if "fraction" in options:
            fraction = options["fraction"]
            if fraction <= 0:
                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
            fractions.append(min(1, fraction))
        elif "size" in options:
            size = options["size"]
            # Mirror the fraction check: zero or negative sizes make no sense
            # and would only fail later, inside the sampler.
            if size <= 0:
                raise ValueError("Please specify size as a value greater than 0")
            if size > dataset_sizes[i]:
                raise ValueError(f"Please specify a size smaller than number of examples: {dataset_sizes[i]:,.0f}")
            fractions.append(size / dataset_sizes[i])
        else:
            raise ValueError(missing_option_msg)
    return fractions
class SpecialTokens(NamedTuple):
@@ -140,10 +210,17 @@ def get_model(conf, tokenizer):
return model
def get_dataset_name_from_data_config(data_config):
    """Return the dataset name for one entry of the ``datasets`` config list.

    A non-dict entry is returned unchanged; for a dict entry the first key
    is returned.
    """
    if not isinstance(data_config, dict):
        return data_config
    names = list(data_config)
    return names[0]
def get_dataset(conf, tokenizer):
train_datasets, evals = [], {}
for dataset_name in conf.datasets:
for data_config in conf.datasets:
dataset_name = get_dataset_name_from_data_config(data_config)
train, val = get_one_dataset(conf, dataset_name)
train_datasets.append(train)
evals[dataset_name] = Subset(val, list(range(min(len(val), conf.eval_size)))) if conf.eval_size else val