[merge] Fix conflict

2026-06-27 16:10:30 +08:00 · 2023-02-11 00:23:25 +00:00
parent 34347607d4 76f7af0dfd
commit bcebbbc49c
196 changed files with 11306 additions and 1005 deletions
@@ -2,20 +2,22 @@

 Trainer code based on huggingface. Compatible with deepspeed or accelerate

-Requirements
-
-```
-wandb
-evaluate
-datasets
-transformers
-torch==1.12
-```
-
-Start training reward model
+Install Python requirements

 ```bash
-python trainer.py configs/electra-base-dis-webgpt.yml
+pip install -r requirements.txt
+```
+
+Write or inherit a `configs/<config-name>.yml` file to store training
+configuration details.
+
+> The configuration file must have _at least_ all the keys present in
+> [`configs/dummy.yml`](configs/dummy.yml)
+
+Run training procedure
+
+```bash
+python trainer.py configs/<config-name>.yml
 ```

 Additional axis labeling: this outputs 4 summary quality evaluation metrics
@@ -0,0 +1,21 @@
+model_name: X
+tokenizer_name: X
+max_length: X
+num_train_epochs: X
+warmup_steps: X
+scheduler: X
+learning_rate: X
+deepspeed: X
+fp16: X
+local_rank: X
+gradient_checkpointing: X
+gradient_accumulation_steps: X
+per_device_train_batch_size: X
+per_device_eval_batch_size: X
+weight_decay: X
+max_grad_norm: X
+eval_steps: X
+save_steps: X
+wandb_entity: X
+datasets:
+  - X
@@ -1,62 +1,18 @@
 # Train using supervised examples

-Requirements
+## Requirements

-```
-wandb
-evaluate
-datasets
-transformers
-torch
-```
+`pip install -r requirements.txt`

-Start training reward model
+Start training SFT model

 ```bash
-python trainer.py --configs defaults galactica-125
+python trainer.py --configs defaults galactica-125m
 ```

-## Dataset
-
-For now we only support webgpt and summary dataset from OpenAI. Once
-open-asisstant dataset are available it will be added here.
-
-## Model
-
-Normally you should be able to add new models in configs/config.yml
-
-```
-your-model-name:
-  learning_rate: 2e-6
-  model_name: <huggingface model name>
-  weight_decay: 0.01
-  max_length: 812
-  warmup_steps: 600
-  gradient_checkpointing: false
-  gradient_accumulation_steps: 5
-  per_device_train_batch_size: 4
-  per_device_eval_batch_size: 4
-```
-
-```
-python trainer.py --configs defaults your-model-name
-```
-
-However, if the model of your choice doesn't have pad_token, eos_token,
-sep_token, you have to update utils.py `get_tokenizer` to use the right token.
-
-## Deepspeed support
-
-You can edit the configs/zero_config.json and use any stage you wish. The
-current config uses zero-stage 3. For more details on how to setup the config
-checkout [this page](https://www.deepspeed.ai/tutorials/zero/)
-
-Once you are satisfy with your deepzero config, you can add --deepspeed flag at
-the end to trigger deepspeed
-
-```
-python trainer.py --configs defaults your-model-name --deepspeed
-```
+For `wandb`: update the `entity` argument in `trainer.py`'s call to `wandb.init`
+to be your Weights & Biases username, per the
+[docs](https://docs.wandb.ai/ref/python/init).

 ## Dataset choices

@@ -80,6 +36,74 @@ Currently only these languages are supported via prompt translation:
 ar,de,fr,en,it,nl,tr,ru,ms,ko,ja,zh
 ```

+## Dataset sub-sampling
+
+We can subsample the **training** data by passing either the `fraction` or
+`size` argument in the `configs/config.yml` file. Don't forget the additional
+colon ":" after the dataset name when doing this.
+
+Example:
+
+```
+  datasets:
+    - webgpt:
+        fraction : 0.05
+    - prompt_dialogue:
+        size : 500
+    - adversarial_qa
+    - trivia_qa_nocontext
+```
+
+In this example, per epoch we will use:
+
+- A random 5% of `webgpt`;
+- A random 500 examples from `prompt_dialogue`;
+- All examples from datasets for which we don't specify the `fraction` or `size`
+  argument.
+
+In the above example, per epoch we'll use a different 5% from `webgpt` and a
+different 500 examples from `prompt_dialogue`.
+
+This works with `torch.distributed`.
+
+## Model
+
+Normally you should be able to add new models in `configs/config.yml`
+
+```
+your-model-name:
+  learning_rate: 2e-6
+  model_name: <huggingface model name>
+  weight_decay: 0.01
+  max_length: 812
+  warmup_steps: 600
+  gradient_checkpointing: false
+  gradient_accumulation_steps: 5
+  per_device_train_batch_size: 4
+  per_device_eval_batch_size: 4
+```
+
+```
+python trainer.py --configs defaults your-model-name
+```
+
+However, if the model of your choice doesn't have a `pad_token`, `eos_token`,
+or `sep_token`, you have to update `get_tokenizer` in `utils.py` to use the
+right token.
+
+## Deepspeed support
+
+You can edit `configs/zero_config.json` and use any stage you wish. The
+current config uses ZeRO stage 3. For more details on how to set up the config,
+check out [this page](https://www.deepspeed.ai/tutorials/zero/).
+
+Once you are satisfied with your DeepSpeed ZeRO config, you can add the
+`--deepspeed` flag at the end to enable DeepSpeed:
+
+```
+python trainer.py --configs defaults your-model-name --deepspeed
+```
+
 ## Results

 Experimental results in wandb
@@ -87,7 +111,7 @@ Experimental results in wandb

 ## TODOS

- decide on a model
+- Decide on a model
 - Merge utils etc with reward model
 - Causal Modelling for GPT-JT does not leverage the bidirectional mask for the
  prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1)
@@ -17,7 +17,7 @@ defaults:
  freeze_layer:
  datasets:
    - webgpt
-    - prompt_dialogue
+    # - prompt_dialogue
    - squad_v2
    - adversarial_qa
    - trivia_qa_nocontext
@@ -222,7 +222,7 @@ class SODA(Dataset):

        return pairs

-    def __init__(self, cache_dir, max_sample_size=10000, input_max_length=1024) -> None:
+    def __init__(self, cache_dir, input_max_length=1024) -> None:
        super().__init__()

        self.pairs = []
@@ -233,9 +233,6 @@ class SODA(Dataset):
                if len(prompt) < input_max_length:
                    self.pairs.append((prompt, answer))

-            if len(self.pairs) > max_sample_size:
-                break
-
    def __len__(self):
        return len(self.pairs)

@@ -4,7 +4,6 @@ datasets==2.8.0
 deepspeed==0.7.7
 evaluate==0.4.0
 gdown
-mpi4py==3.1.4
 nltk==3.8.1
 numpy>=1.22.4
 py7zr
@@ -12,3 +11,4 @@ PyYAML>=6.0
 scikit_learn==1.2.0
 torch>=1.11.0
 transformers==4.25.1
+wandb
@@ -1,9 +1,28 @@
 from argparse import Namespace

-from utils import get_tokenizer
+import pytest
+from utils import TOKENIZER_CONFIGS, get_tokenizer, match_tokenizer_name


 def test_tokenizer():
    """Smoke-test `get_tokenizer` on two hub model names and on an empty model name."""
    for model_name in ("Salesforce/codegen-2B-multi", "facebook/galactica-1.3b", ""):
        get_tokenizer(Namespace(model_name=model_name, cache_dir=".cache"))
+
+
+def test_tokenizer_successful_match():
+    for config_name, config in TOKENIZER_CONFIGS.items():
+        found_config = match_tokenizer_name(config_name)
+        assert found_config == config
+
+
+def test_tokenizer_partial_match():
+    for config_name in ["facebook/galactica-1.3b", "togethercomputer/GPT-JT-6B-v1", "Salesforce/codegen-2B-multi"]:
+        found_config = match_tokenizer_name(config_name)
+        assert found_config
+
+
+def test_tokenizer_failed_match():
+    for fake_config_name in ["not-a-model", "fake"]:
+        with pytest.raises(ValueError):
+            match_tokenizer_name(fake_config_name)
@@ -1,7 +1,7 @@
 import argparse
 from distutils.util import strtobool
 from functools import partial
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import bitsandbytes
 import datasets
@@ -14,7 +14,7 @@ from transformers.trainer_pt_utils import IterableDatasetShard
 from transformers.trainer_utils import seed_worker
 from transformers.training_args import OptimizerNames
 from transformers.utils import is_datasets_available
-from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls
+from utils import PerDatasetSampler, get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls


 def compute_metrics(eval_pred, preprocess_fns, metrics):
@@ -36,7 +36,7 @@ class SFTTrainer(Trainer):
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
-        train_collate_fn: Callable = None,
+        sampler: torch.utils.data.sampler.Sampler = None,
        loss_function: str = "CrossEntropyLoss",
        poly_eps: float = 1.0,
        **kwargs,
@@ -45,6 +45,7 @@ class SFTTrainer(Trainer):
        self.train_collate_fn = train_collate_fn
        # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
        self.loss_fct = get_loss(loss_function, poly_eps)
+        self.sampler = sampler

    def compute_loss(self, model, inputs, return_outputs=False):
        labels_mask = inputs.pop("label_masks")
@@ -95,24 +96,22 @@ class SFTTrainer(Trainer):

        return (loss, logits, labels)

-    def get_train_dataloader(self) -> DataLoader:
+    def get_train_dataloader(self):
        """
-        Returns the training [`~torch.utils.data.DataLoader`].
-        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
-        training if necessary) otherwise.
-        Subclass and override this method if you want to inject some custom behavior.
-        """
-        if self.train_dataset is None:
-            raise ValueError("Trainer: training requires a train_dataset.")
+        Inject custom data sampling behaviour into training loop
+        and use custom task mixing collate function : train_collate_fn

-        train_dataset = self.train_dataset
+        rewrite from:
+        https://github.com/huggingface/transformers/blob/67d074874d285e616393c65a0e670088e1b6b74a/src/transformers/trainer.py#L846
+        """
        data_collator = self.train_collate_fn
+        train_dataset = self.train_dataset
        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")
-        else:
-            data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

        if isinstance(train_dataset, torch.utils.data.IterableDataset):
+            # if we are using iterable dataset it means no weight sampling
+            # added for backward compat
            if self.args.world_size > 1:
                train_dataset = IterableDatasetShard(
                    train_dataset,
@@ -121,7 +120,6 @@ class SFTTrainer(Trainer):
                    num_processes=self.args.world_size,
                    process_index=self.args.process_index,
                )
-
            return DataLoader(
                train_dataset,
                batch_size=self.args.per_device_train_batch_size,
@@ -129,8 +127,10 @@ class SFTTrainer(Trainer):
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )
-
-        train_sampler = self._get_train_sampler()
+        if self.sampler is None:
+            train_sampler = self._get_train_sampler()
+        else:
+            train_sampler = self.sampler

        return DataLoader(
            train_dataset,
@@ -194,10 +194,9 @@ if __name__ == "__main__":

    tokenizer = get_tokenizer(training_conf)
    model = get_model(training_conf, tokenizer)
-
    train, evals, collate_fn, train_collate_fn = get_dataset(training_conf, tokenizer)
+    sampler = PerDatasetSampler.build_sampler_from_config(training_conf, train.datasets)
    metrics, preprocess_fns = get_metrics(training_conf, tokenizer)
-
    optimizer = OptimizerNames.ADAMW_BNB if training_conf.quantization else OptimizerNames.ADAMW_HF

    if training_conf.quantization:
@@ -235,7 +234,6 @@ if __name__ == "__main__":
    )

    assert len(evals) > 0
-
    if not training_conf.deepspeed or training_conf.local_rank == 0:
        import wandb

@@ -246,8 +244,9 @@ if __name__ == "__main__":
        )

    trainer = SFTTrainer(
-        model,
-        args,
+        model=model,
+        args=args,
+        sampler=sampler,
        train_collate_fn=train_collate_fn,
        loss_function=training_conf.loss_fn,
        poly_eps=training_conf.poly_eps,
@@ -1,11 +1,8 @@
-# from functools import partial
+import random
 from pathlib import Path
-from typing import NamedTuple
+from typing import List, NamedTuple

 import evaluate
-
-# import nltk
-# import numpy as np
 import transformers
 import yaml
 from custom_datasets import get_one_dataset
@@ -15,6 +12,79 @@ from losses import CrossEntropyLoss, PolyLoss
 from models import freeze_top_n_layers, get_specific_model
 from sklearn.model_selection import train_test_split
 from torch.utils.data import ConcatDataset, Subset
+from torch.utils.data.sampler import Sampler
+
+
+class PerDatasetSampler(Sampler):
+    """Sampler which returns a fixed number of samples per dataset, per epoch.
+
+    Example:
+
+    Dataset 1 has 10,000 examples and we want 200 per epoch
+    Dataset 2 has 500 examples and we want all 500 per epoch
+
+    Epoch size will be 700 and every epoch we'll sample a different
+    200 from dataset 1.
+
+    Parameters
+    ----------
+    dataset_sizes : List[int]
+        A list with the size of each dataset.
+    dataset_size_per_epoch : List[int]
+        How many examples to get from each dataset per epoch.
+
+    Note: dataset_sizes & dataset_size_per_epoch must be in the same order.
+    Further the examples in the underlying torch.utils.data.Dataset
+    must per ordered as dataset_1, dataset_2, ..., dataset_n. This is fine
+    if we concatenate a bunch of datasets together
+    e.g. using torch.utils.data.ConcatDataset which is current behaviour.
+    """
+
+    def __init__(self, dataset_sizes: List[int], dataset_size_per_epoch: List[int]):
+        self.dataset_sizes = dataset_sizes
+        self.dataset_size_per_epoch = dataset_size_per_epoch
+        self.num_datasets = len(dataset_sizes)
+
+    def __iter__(self):
+        epoch_idx = []
+        n = 0
+        for i in range(self.num_datasets):
+            sampled_idx = random.sample(range(n, self.dataset_sizes[i] + n), self.dataset_size_per_epoch[i])
+            n += self.dataset_sizes[i]
+            epoch_idx.extend(sampled_idx)
+        random.shuffle(epoch_idx)
+        return iter(epoch_idx)
+
+    def __len__(self):
+        return int(sum(self.dataset_size_per_epoch))
+
+    @classmethod
+    def build_sampler_from_config(cls, training_conf, datasets):
+        dataset_sizes = [len(x) for x in datasets]
+        fractions = get_dataset_fractions(training_conf.datasets, dataset_sizes)
+        dataset_size_per_epoch = [int(size * frac) for size, frac in zip(dataset_sizes, fractions)]
+        return cls(dataset_sizes, dataset_size_per_epoch)
+
+
+def get_dataset_fractions(conf, dataset_sizes):
+    """Calculate fraction of each dataset to use per epoch when subsampling"""
+    fractions = []
+    for i, data_config in enumerate(conf):
+        dataset_name = get_dataset_name_from_data_config(data_config)
+        if isinstance(data_config, dict):
+            if "fraction" in data_config[dataset_name]:
+                if data_config[dataset_name]["fraction"] <= 0:
+                    raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
+                fractions.append(min(1, data_config[dataset_name]["fraction"]))
+            elif "size" in data_config[dataset_name]:
+                if data_config[dataset_name]["size"] > dataset_sizes[i]:
+                    raise ValueError(f"Please specify a size smaller than number of examples: {dataset_sizes[i]:,.0f}")
+                fractions.append(data_config[dataset_name]["size"] / dataset_sizes[i])
+            else:
+                raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
+        else:
+            fractions.append(1)
+    return fractions


 class SpecialTokens(NamedTuple):
@@ -36,7 +106,10 @@ TOKENIZER_CONFIGS = {


 def match_tokenizer_name(model_name: str) -> TokenizerConfig:
-    """Match a partial model name to a tokenizer configuration"""
+    """
+    Match a partial model name to a tokenizer configuration
+    i.e. model_name `Salesforce/codegen-2B-multi` has config name `codegen`
+    """
    tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if name in model_name]
    if not tokenizer_config_matches:
        raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")
@@ -140,10 +213,17 @@ def get_model(conf, tokenizer):
    return model


+def get_dataset_name_from_data_config(data_config):
+    if isinstance(data_config, dict):
+        return list(data_config.keys())[0]
+    return data_config
+
+
 def get_dataset(conf, tokenizer):
    train_datasets, evals = [], {}

-    for dataset_name in conf.datasets:
+    for data_config in conf.datasets:
+        dataset_name = get_dataset_name_from_data_config(data_config)
        train, val = get_one_dataset(conf, dataset_name)
        train_datasets.append(train)
        evals[dataset_name] = Subset(val, list(range(min(len(val), conf.eval_size)))) if conf.eval_size else val