[merge] Fix conflict

This commit is contained in:
theblackcat102
2023-02-11 00:23:25 +00:00
196 changed files with 11306 additions and 1005 deletions
+14 -12
View File
@@ -2,20 +2,22 @@
Trainer code based on huggingface. Compatible with deepspeed or accelerate
Requirements
```
wandb
evaluate
datasets
transformers
torch==1.12
```
Start training reward model
Install Python requirements
```bash
python trainer.py configs/electra-base-dis-webgpt.yml
pip install -r requirements.txt
```
Write or inherit a `configs/<config-name>.yml` file to store training
configuration details.
> The configuration file must have _at least_ all the keys present in
> [`configs/dummy.yml`](configs/dummy.yml)
Run training procedure
```bash
python trainer.py configs/<config-name>.yml
```
Additional axis labeling: this outputs 4 summary quality evaluation metrics
+21
View File
@@ -0,0 +1,21 @@
model_name: X
tokenizer_name: X
max_length: X
num_train_epochs: X
warmup_steps: X
scheduler: X
learning_rate: X
deepspeed: X
fp16: X
local_rank: X
gradient_checkpointing: X
gradient_accumulation_steps: X
per_device_train_batch_size: X
per_device_eval_batch_size: X
weight_decay: X
max_grad_norm: X
eval_steps: X
save_steps: X
wandb_entity: X
datasets:
- X
+76 -52
View File
@@ -1,62 +1,18 @@
# Train using supervised examples
Requirements
## Requirements
```
wandb
evaluate
datasets
transformers
torch
```
`pip install -r requirements.txt`
Start training reward model
Start training SFT model
```bash
python trainer.py --configs defaults galactica-125
python trainer.py --configs defaults galactica-125m
```
## Dataset
For now we only support the webgpt and summary datasets from OpenAI. Once the
Open-Assistant datasets are available they will be added here.
## Model
Normally you should be able to add new models in configs/config.yml
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't have pad_token, eos_token,
sep_token, you have to update utils.py `get_tokenizer` to use the right token.
## Deepspeed support
You can edit `configs/zero_config.json` and use any stage you wish. The
current config uses zero-stage 3. For more details on how to set up the config,
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
Once you are satisfied with your deepspeed config, you can add the `--deepspeed`
flag at the end to trigger deepspeed
```
python trainer.py --configs defaults your-model-name --deepspeed
```
For `wandb`: update the `entity` argument in `trainer.py`'s call to `wandb.init`
to be your weights and biases username per
[docs](https://docs.wandb.ai/ref/python/init).
## Dataset choices
@@ -80,6 +36,74 @@ Currently only these languages are supported via prompt translation:
ar,de,fr,en,it,nl,tr,ru,ms,ko,ja,zh
```
## Dataset sub-sampling
We can subsample the **training** data by passing either the `fraction` or
`size` argument in the `configs/config.yml` file. Don't forget the additional
colon ":" after the dataset name when doing this.
Example:
```
datasets:
- webgpt:
fraction : 0.05
- prompt_dialogue:
size : 500
- adversarial_qa
- trivia_qa_nocontext
```
In this example, per epoch we will use:
- A random 5% of `webgpt`;
- A random 500 examples from `prompt_dialogue`;
- All examples from datasets for which we don't specify the `fraction` or `size`
argument.
In the above example, per epoch we'll use a different 5% from `webgpt` and a
different 500 examples from `prompt_dialogue`.
This works with `torch.distributed`.
## Model
Normally you should be able to add new models in `configs/config.yml`
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't have `pad_token`, `eos_token`,
`sep_token`, you have to update `get_tokenizer` in `utils.py` to use the right
token.
## Deepspeed support
You can edit `configs/zero_config.json` and use any stage you wish. The
current config uses zero-stage 3. For more details on how to set up the config,
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
Once you are satisfied with your deepspeed config, you can add the `--deepspeed`
flag at the end to trigger deepspeed
```
python trainer.py --configs defaults your-model-name --deepspeed
```
## Results
Experimental results in wandb
@@ -87,7 +111,7 @@ Experimental results in wandb
## TODOS
- decide on a model
- Decide on a model
- Merge utils etc with reward model
- Causal modelling for GPT-JT does not leverage the bidirectional mask for the
prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1)
@@ -17,7 +17,7 @@ defaults:
freeze_layer:
datasets:
- webgpt
- prompt_dialogue
# - prompt_dialogue
- squad_v2
- adversarial_qa
- trivia_qa_nocontext
@@ -222,7 +222,7 @@ class SODA(Dataset):
return pairs
def __init__(self, cache_dir, max_sample_size=10000, input_max_length=1024) -> None:
def __init__(self, cache_dir, input_max_length=1024) -> None:
super().__init__()
self.pairs = []
@@ -233,9 +233,6 @@ class SODA(Dataset):
if len(prompt) < input_max_length:
self.pairs.append((prompt, answer))
if len(self.pairs) > max_sample_size:
break
def __len__(self):
return len(self.pairs)
+1 -1
View File
@@ -4,7 +4,6 @@ datasets==2.8.0
deepspeed==0.7.7
evaluate==0.4.0
gdown
mpi4py==3.1.4
nltk==3.8.1
numpy>=1.22.4
py7zr
@@ -12,3 +11,4 @@ PyYAML>=6.0
scikit_learn==1.2.0
torch>=1.11.0
transformers==4.25.1
wandb
@@ -1,9 +1,28 @@
from argparse import Namespace
from utils import get_tokenizer
import pytest
from utils import TOKENIZER_CONFIGS, get_tokenizer, match_tokenizer_name
def test_tokenizer():
    """Check that `get_tokenizer` can be called for each listed model name."""
    # Same three names, in the same order, as individual calls would use.
    model_names = ("Salesforce/codegen-2B-multi", "facebook/galactica-1.3b", "")
    for model_name in model_names:
        get_tokenizer(Namespace(model_name=model_name, cache_dir=".cache"))
def test_tokenizer_successful_match():
    """Every key of TOKENIZER_CONFIGS must resolve to its own configuration."""
    for registered_name, expected_config in TOKENIZER_CONFIGS.items():
        assert match_tokenizer_name(registered_name) == expected_config
def test_tokenizer_partial_match():
    """Full hub model names must resolve to some (truthy) tokenizer configuration."""
    full_model_names = (
        "facebook/galactica-1.3b",
        "togethercomputer/GPT-JT-6B-v1",
        "Salesforce/codegen-2B-multi",
    )
    for model_name in full_model_names:
        assert match_tokenizer_name(model_name)
def test_tokenizer_failed_match():
    """Names that match no tokenizer configuration must raise ValueError."""
    unknown_names = ("not-a-model", "fake")
    for unknown_name in unknown_names:
        with pytest.raises(ValueError):
            match_tokenizer_name(unknown_name)
+21 -22
View File
@@ -1,7 +1,7 @@
import argparse
from distutils.util import strtobool
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import bitsandbytes
import datasets
@@ -14,7 +14,7 @@ from transformers.trainer_pt_utils import IterableDatasetShard
from transformers.trainer_utils import seed_worker
from transformers.training_args import OptimizerNames
from transformers.utils import is_datasets_available
from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls
from utils import PerDatasetSampler, get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls
def compute_metrics(eval_pred, preprocess_fns, metrics):
@@ -36,7 +36,7 @@ class SFTTrainer(Trainer):
self,
model: Union[PreTrainedModel, nn.Module] = None,
args: TrainingArguments = None,
train_collate_fn: Callable = None,
sampler: torch.utils.data.sampler.Sampler = None,
loss_function: str = "CrossEntropyLoss",
poly_eps: float = 1.0,
**kwargs,
@@ -45,6 +45,7 @@ class SFTTrainer(Trainer):
self.train_collate_fn = train_collate_fn
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
self.loss_fct = get_loss(loss_function, poly_eps)
self.sampler = sampler
def compute_loss(self, model, inputs, return_outputs=False):
labels_mask = inputs.pop("label_masks")
@@ -95,24 +96,22 @@ class SFTTrainer(Trainer):
return (loss, logits, labels)
def get_train_dataloader(self) -> DataLoader:
def get_train_dataloader(self):
"""
Returns the training [`~torch.utils.data.DataLoader`].
Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
training if necessary) otherwise.
Subclass and override this method if you want to inject some custom behavior.
"""
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
Inject custom data sampling behaviour into training loop
and use custom task mixing collate function : train_collate_fn
train_dataset = self.train_dataset
rewrite from:
https://github.com/huggingface/transformers/blob/67d074874d285e616393c65a0e670088e1b6b74a/src/transformers/trainer.py#L846
"""
data_collator = self.train_collate_fn
train_dataset = self.train_dataset
if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
train_dataset = self._remove_unused_columns(train_dataset, description="training")
else:
data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
if isinstance(train_dataset, torch.utils.data.IterableDataset):
# if we are using iterable dataset it means no weight sampling
# added for backward compat
if self.args.world_size > 1:
train_dataset = IterableDatasetShard(
train_dataset,
@@ -121,7 +120,6 @@ class SFTTrainer(Trainer):
num_processes=self.args.world_size,
process_index=self.args.process_index,
)
return DataLoader(
train_dataset,
batch_size=self.args.per_device_train_batch_size,
@@ -129,8 +127,10 @@ class SFTTrainer(Trainer):
num_workers=self.args.dataloader_num_workers,
pin_memory=self.args.dataloader_pin_memory,
)
train_sampler = self._get_train_sampler()
if self.sampler is None:
train_sampler = self._get_train_sampler()
else:
train_sampler = self.sampler
return DataLoader(
train_dataset,
@@ -194,10 +194,9 @@ if __name__ == "__main__":
tokenizer = get_tokenizer(training_conf)
model = get_model(training_conf, tokenizer)
train, evals, collate_fn, train_collate_fn = get_dataset(training_conf, tokenizer)
sampler = PerDatasetSampler.build_sampler_from_config(training_conf, train.datasets)
metrics, preprocess_fns = get_metrics(training_conf, tokenizer)
optimizer = OptimizerNames.ADAMW_BNB if training_conf.quantization else OptimizerNames.ADAMW_HF
if training_conf.quantization:
@@ -235,7 +234,6 @@ if __name__ == "__main__":
)
assert len(evals) > 0
if not training_conf.deepspeed or training_conf.local_rank == 0:
import wandb
@@ -246,8 +244,9 @@ if __name__ == "__main__":
)
trainer = SFTTrainer(
model,
args,
model=model,
args=args,
sampler=sampler,
train_collate_fn=train_collate_fn,
loss_function=training_conf.loss_fn,
poly_eps=training_conf.poly_eps,
+87 -7
View File
@@ -1,11 +1,8 @@
# from functools import partial
import random
from pathlib import Path
from typing import NamedTuple
from typing import List, NamedTuple
import evaluate
# import nltk
# import numpy as np
import transformers
import yaml
from custom_datasets import get_one_dataset
@@ -15,6 +12,79 @@ from losses import CrossEntropyLoss, PolyLoss
from models import freeze_top_n_layers, get_specific_model
from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, Subset
from torch.utils.data.sampler import Sampler
class PerDatasetSampler(Sampler):
    """Sampler which returns a fixed number of samples per dataset, per epoch.

    Example:
        Dataset 1 has 10,000 examples and we want 200 per epoch
        Dataset 2 has 500 examples and we want all 500 per epoch

        Epoch size will be 700 and every epoch we'll sample a different
        200 from dataset 1.

    Parameters
    ----------
    dataset_sizes : List[int]
        A list with the size of each dataset.
    dataset_size_per_epoch : List[int]
        How many examples to get from each dataset per epoch.

    Raises
    ------
    ValueError
        If the two lists differ in length, or if a per-epoch size is negative
        or larger than the size of its dataset.

    Note: dataset_sizes & dataset_size_per_epoch must be in the same order.
    Further the examples in the underlying torch.utils.data.Dataset
    must be ordered as dataset_1, dataset_2, ..., dataset_n. This is fine
    if we concatenate a bunch of datasets together
    e.g. using torch.utils.data.ConcatDataset which is current behaviour.
    """

    def __init__(self, dataset_sizes: List[int], dataset_size_per_epoch: List[int]):
        # Fail fast with a clear message instead of a late, unrelated error
        # from random.sample() on the first iteration.
        if len(dataset_sizes) != len(dataset_size_per_epoch):
            raise ValueError(
                "dataset_sizes and dataset_size_per_epoch must have the same length: "
                f"{len(dataset_sizes)} != {len(dataset_size_per_epoch)}"
            )
        for size, per_epoch in zip(dataset_sizes, dataset_size_per_epoch):
            if not 0 <= per_epoch <= size:
                raise ValueError(
                    f"Cannot sample {per_epoch} examples per epoch from a dataset of size {size}"
                )

        self.dataset_sizes = dataset_sizes
        self.dataset_size_per_epoch = dataset_size_per_epoch
        self.num_datasets = len(dataset_sizes)

    def __iter__(self):
        """Return an iterator over a freshly sampled, shuffled list of global indices."""
        epoch_idx = []
        offset = 0  # global index of the first example of the current dataset
        for size, per_epoch in zip(self.dataset_sizes, self.dataset_size_per_epoch):
            # Sample without replacement inside this dataset's index range.
            epoch_idx.extend(random.sample(range(offset, offset + size), per_epoch))
            offset += size

        # Mix the datasets so that batches are not grouped by dataset.
        random.shuffle(epoch_idx)
        return iter(epoch_idx)

    def __len__(self):
        """Return the number of indices yielded per epoch."""
        return int(sum(self.dataset_size_per_epoch))

    @staticmethod
    def _sizes_per_epoch(dataset_sizes, fractions):
        """Convert per-dataset fractions into whole example counts (rounded down).

        The product is rounded to 6 decimals before truncation so that float
        noise (e.g. 49 * (1 / 49) == 0.9999999999999999) does not silently
        drop an example that was explicitly requested via `size`.
        """
        return [int(round(size * frac, 6)) for size, frac in zip(dataset_sizes, fractions)]

    @classmethod
    def build_sampler_from_config(cls, training_conf, datasets):
        """Build a sampler from `training_conf.datasets` and the list of training datasets.

        `datasets` must be in the same order as the entries of
        `training_conf.datasets`; only `len()` of each dataset is used.
        """
        dataset_sizes = [len(x) for x in datasets]
        fractions = get_dataset_fractions(training_conf.datasets, dataset_sizes)
        dataset_size_per_epoch = cls._sizes_per_epoch(dataset_sizes, fractions)
        return cls(dataset_sizes, dataset_size_per_epoch)
def get_dataset_fractions(conf, dataset_sizes):
    """Calculate fraction of each dataset to use per epoch when subsampling.

    Parameters
    ----------
    conf : list
        Dataset entries from the training config. Each entry is either a plain
        dataset name (use the whole dataset) or a single-key dict
        ``{dataset_name: {"fraction": f}}`` / ``{dataset_name: {"size": n}}``.
    dataset_sizes : list
        Number of examples in each dataset, in the same order as `conf`.

    Returns
    -------
    list
        One fraction per entry of `conf`; a `fraction` above 1 is clamped to 1.

    Raises
    ------
    ValueError
        If a dict entry has neither `fraction` nor `size`, if `fraction` or
        `size` is not positive, or if `size` exceeds the dataset size.
    """
    missing_option_msg = "Please specify either fraction or size in config.yaml. See README for instructions."

    fractions = []
    for i, data_config in enumerate(conf):
        if not isinstance(data_config, dict):
            # Plain dataset name: no subsampling requested.
            fractions.append(1)
            continue

        # The options live under the first (and only expected) key, i.e. the
        # dataset name. A trailing colon with nothing below it parses to None,
        # which is reported as a missing option rather than a TypeError.
        options = next(iter(data_config.values()), None)
        if not isinstance(options, dict):
            raise ValueError(missing_option_msg)

        if "fraction" in options:
            fraction = options["fraction"]
            if fraction <= 0:
                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
            fractions.append(min(1, fraction))
        elif "size" in options:
            size = options["size"]
            # Mirror the positivity check applied to `fraction`; a non-positive
            # size would otherwise surface later as an unrelated sampling error.
            if size <= 0:
                raise ValueError("Please specify size as a positive number of examples")
            if size > dataset_sizes[i]:
                raise ValueError(f"Please specify a size smaller than number of examples: {dataset_sizes[i]:,.0f}")
            fractions.append(size / dataset_sizes[i])
        else:
            raise ValueError(missing_option_msg)
    return fractions
class SpecialTokens(NamedTuple):
@@ -36,7 +106,10 @@ TOKENIZER_CONFIGS = {
def match_tokenizer_name(model_name: str) -> TokenizerConfig:
"""Match a partial model name to a tokenizer configuration"""
"""
Match a partial model name to a tokenizer configuration
i.e. model_name `Salesforce/codegen-2B-multi` has config name `codegen`
"""
tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if name in model_name]
if not tokenizer_config_matches:
raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")
@@ -140,10 +213,17 @@ def get_model(conf, tokenizer):
return model
def get_dataset_name_from_data_config(data_config):
    """Return the dataset name for one entry of the `datasets` config list.

    A non-dict entry is returned unchanged; for a dict entry the first key is
    returned.
    """
    if not isinstance(data_config, dict):
        return data_config
    config_keys = list(data_config)
    return config_keys[0]
def get_dataset(conf, tokenizer):
train_datasets, evals = [], {}
for dataset_name in conf.datasets:
for data_config in conf.datasets:
dataset_name = get_dataset_name_from_data_config(data_config)
train, val = get_one_dataset(conf, dataset_name)
train_datasets.append(train)
evals[dataset_name] = Subset(val, list(range(min(len(val), conf.eval_size)))) if conf.eval_size else val