Merge branch 'main' into sft-data-sampling

2026-06-27 16:10:30 +08:00 · 2023-02-09 09:19:17 +01:00
parent 9faae250ce c53d8e9bce
commit 4ba622de8e
332 changed files with 10207 additions and 2327 deletions
@@ -1,6 +1,6 @@
 Some other reward features we can use

-0. Finish classifcation feature
+0. Finish classification feature

 1. Summaries from human feedback

@@ -14,7 +14,7 @@
    [] support additional negative samples generated from other models.

        For example we can use galactica-125m to generate a TLDR and assume it was
-        inferior than the human perference one
+        inferior to the human preference one


 """
@@ -46,6 +46,7 @@ defaults:
  quantization: false
  seq2seqmodel: false
  poly_eps: 1.0
+  fuse_gelu: true

 galactica-125m:
  learning_rate: 5e-5
@@ -23,5 +23,5 @@ Issues and TODO:
 - ideally we can update the config yaml and new dataset will be download from
  hub

-  - one possible idea is we upload the trasform format of these dataset to the
+  - one possible idea is we upload the transformed format of these datasets to the
    OA hub
@@ -311,7 +311,7 @@ class JokeExplaination(Dataset):
            for line in f:
                data = json.loads(line)
                joke = data["joke"]
-                explanation = data["explaination"]
+                explanation = data["explanation"]
                self.pairs.append((joke, explanation))

        if len(question) > 0 and len(answer) > 0:
@@ -0,0 +1,55 @@
+import functools
+
+import torch
+from transformers.activations import FastGELUActivation, GELUActivation, NewGELUActivation, QuickGELUActivation
+
+
+def rsetattr(obj, attr, val):
+    """Set a possibly dotted attribute path on `obj`, e.g. ``rsetattr(m, "a.b.c", v)``.
+
+    Everything before the last dot is resolved with `rgetattr`; the final path
+    component is then assigned on that object. Returns whatever `setattr` returns.
+    """
+    parent_path, _, leaf = attr.rpartition(".")
+    target = obj
+    if parent_path:
+        # only dotted paths need the recursive lookup
+        target = rgetattr(obj, parent_path)
+    return setattr(target, leaf, val)
+
+
+def rgetattr(obj, attr, *args):
+    """Resolve a dotted attribute path such as ``"a.b.c"`` starting from `obj`.
+
+    Any extra positional argument is forwarded to every `getattr` call, so an
+    optional default applies at each step of the path.
+    """
+    current = obj
+    for part in attr.split("."):
+        current = getattr(current, part, *args)
+    return current
+
+
+def fuse_gelu(model):
+    """Swap the Hugging Face GELU activation modules inside `model` for a TorchScript GELU.
+
+    Every submodule that is an instance of GELUActivation, FastGELUActivation,
+    NewGELUActivation or QuickGELUActivation is replaced in place by one shared,
+    stateless module that computes the tanh approximation of GELU with a
+    hand-written backward pass. The same `model` object is returned.
+
+    NOTE(review): the replacement always uses the tanh approximation. The exact
+    (erf) GELU and QuickGELU presumably compute slightly different functions, so
+    swapping those two changes model outputs - confirm this is intended.
+    """
+
+    @torch.jit.script
+    def gelu_fwd(x):
+        return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+
+    @torch.jit.script
+    def gelu_bwd(g, x):
+        tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+        ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
+        return ff * g
+
+    class _FusedGeLUFunction(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, input):
+            # save_for_backward (rather than a plain ctx attribute) lets autograd
+            # manage the tensor's lifetime and detect in-place modification.
+            ctx.save_for_backward(input)
+            return gelu_fwd(input)
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            (input,) = ctx.saved_tensors
+            return gelu_bwd(grad_output, input)
+
+    class FusedGelu(torch.nn.Module):
+        def forward(self, input):
+            return _FusedGeLUFunction.apply(input)
+
+    fused_gelu_module = FusedGelu()
+    hf_gelu_types = (GELUActivation, FastGELUActivation, NewGELUActivation, QuickGELUActivation)
+
+    # Collect the names first so the module tree is not mutated while it is being
+    # walked. The root module has the empty name and cannot be reassigned on itself.
+    gelu_names = [name for name, module in model.named_modules() if name and isinstance(module, hf_gelu_types)]
+    for name in gelu_names:
+        rsetattr(model, name, fused_gelu_module)
+
+    return model
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import bitsandbytes
 import torch
+from efficiency_utils import fuse_gelu
 from torch import nn
 from transformers import PreTrainedModel, Trainer, TrainingArguments
 from transformers.training_args import OptimizerNames
@@ -180,6 +181,9 @@ if __name__ == "__main__":
                    module, "weight", {"optim_bits": 32}
                )

+    if training_conf.fuse_gelu:
+        model = fuse_gelu(model)
+
    args = TrainingArguments(
        output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
        num_train_epochs=training_conf.num_train_epochs,
@@ -1,6 +1,6 @@
 import random
 from pathlib import Path
-from typing import List
+from typing import List, NamedTuple

 import evaluate
 import transformers
@@ -66,19 +66,64 @@ class PerDatasetSampler(Sampler):
        return cls(dataset_sizes, dataset_size_per_epoch)


-def get_tokenizer(conf):
-    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
+def get_dataset_fractions(conf, dataset_sizes):
+    """Work out which fraction of every configured dataset is used per epoch.
+
+    Entries of `conf` that are not dicts use the full dataset (fraction 1). Dict
+    entries must carry either a "fraction" (values above 1 are capped at 1) or a
+    "size", which is divided by the matching entry of `dataset_sizes`.
+
+    Raises:
+        ValueError: for a non-positive fraction, a size larger than the dataset,
+            or a dict entry that has neither key.
+    """
+    fractions = []
+    for idx, data_config in enumerate(conf):
+        dataset_name = get_dataset_name_from_data_config(data_config)
+        if not isinstance(data_config, dict):
+            fractions.append(1)
+            continue
+
+        options = data_config[dataset_name]
+        if "fraction" in options:
+            fraction = options["fraction"]
+            if fraction <= 0:
+                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
+            fractions.append(min(1, fraction))
+        elif "size" in options:
+            available = dataset_sizes[idx]
+            if options["size"] > available:
+                raise ValueError(f"Please specify a size smaller than number of examples: {available:,.0f}")
+            fractions.append(options["size"] / available)
+        else:
+            raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
+    return fractions

-    if "galactica" in conf.model_name:
-        tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
-    elif "GPT-JT" in conf.model_name:
-        tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
-    elif "codegen" in conf.model_name:
-        tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
-    elif "pythia" in conf.model_name:
-        tokenizer.add_special_tokens(
-            {"pad_token": "<|padding|>", "sep_token": "<|endoftext|>", "eos_token": "<|endoftext|>"}
-        )
+
+class SpecialTokens(NamedTuple):
+    """Special-token strings for one model family; every field defaults to ""."""
+
+    pad_token: str = ""
+    eos_token: str = ""
+    sep_token: str = ""
+
+    def __bool__(self) -> bool:
+        """Be falsy when every token is the empty string.
+
+        A three-field tuple is otherwise always truthy, which would make a
+        truthiness check on an all-default instance meaningless.
+        """
+        return any(self)
+
+
+class TokenizerConfig(NamedTuple):
+    """Tokenizer settings for one model family."""
+
+    # The default used to be `{}`, which contradicts the annotation and has no
+    # pad/eos/sep attributes. An all-empty SpecialTokens is still falsy, so code
+    # that tests `if config.special_tokens:` behaves as it did with `{}`.
+    special_tokens: SpecialTokens = SpecialTokens()
+
+
+# Tokenizer settings keyed by a substring of the model name.
+# NOTE(review): the GPT-JT entry leaves pad_token empty and get_tokenizer tries to
+# fill it in with an attribute assignment; SpecialTokens is a NamedTuple, so that
+# assignment raises AttributeError. get_tokenizer also hands the NamedTuple itself
+# to add_special_tokens, which presumably expects a dict - confirm and fix there.
+TOKENIZER_CONFIGS = {
+    "galactica": TokenizerConfig(
+        special_tokens=SpecialTokens(pad_token="<pad>", eos_token="</s>"),
+    ),
+    "GPT-JT": TokenizerConfig(
+        special_tokens=SpecialTokens(sep_token="<|extratoken_100|>"),
+    ),
+    "codegen": TokenizerConfig(
+        special_tokens=SpecialTokens(pad_token="<|endoftext|>", sep_token="<|endoftext|>"),
+    ),
+    "pythia": TokenizerConfig(
+        special_tokens=SpecialTokens(
+            pad_token="<|padding|>",
+            eos_token="<|endoftext|>",
+            sep_token="<|endoftext|>",
+        ),
+    ),
+}
+
+
+def match_tokenizer_name(model_name: str) -> TokenizerConfig:
+    """Return the one TOKENIZER_CONFIGS entry whose key occurs inside `model_name`.
+
+    Raises:
+        ValueError: if no key, or more than one key, is a substring of `model_name`.
+    """
+    candidates = []
+    for family, family_config in TOKENIZER_CONFIGS.items():
+        if family in model_name:
+            candidates.append(family_config)
+
+    if len(candidates) == 0:
+        raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")
+    if len(candidates) > 1:
+        raise ValueError(f"Found multiple tokeniser configuration matches for {model_name=}")
+    return candidates[0]
+
+
+def get_tokenizer(conf) -> transformers.AutoTokenizer:
+    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
+    tokenizer_config = match_tokenizer_name(conf.model_name)
+
+    if tokenizer_config.special_tokens:
+        if "GPT-JT" in conf.model_name:
+            tokenizer_config.special_tokens.pad_token = tokenizer.eos_token
+        tokenizer.add_special_tokens(tokenizer_config.special_tokens)

    additional_special_tokens = (
        []
@@ -171,27 +216,6 @@ def get_dataset_name_from_data_config(data_config):
    return data_config


-def get_dataset_fractions(conf, dataset_sizes):
-    """Calculate fraction of each dataset to use per epoch when subsampling"""
-    fractions = []
-    for i, data_config in enumerate(conf):
-        dataset_name = get_dataset_name_from_data_config(data_config)
-        if isinstance(data_config, dict):
-            if "fraction" in data_config[dataset_name]:
-                if data_config[dataset_name]["fraction"] <= 0:
-                    raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
-                fractions.append(min(1, data_config[dataset_name]["fraction"]))
-            elif "size" in data_config[dataset_name]:
-                if data_config[dataset_name]["size"] > dataset_sizes[i]:
-                    raise ValueError(f"Please specify a size smaller than number of examples: {dataset_sizes[i]:,.0f}")
-                fractions.append(data_config[dataset_name]["size"] / dataset_sizes[i])
-            else:
-                raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
-        else:
-            fractions.append(1)
-    return fractions
-
-
 def get_dataset(conf, tokenizer):
    train_datasets, evals = [], {}