mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
Merge branch 'main' into sft-data-sampling
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
Some other reward features we can use
|
||||
|
||||
0. Finish classifcation feature
|
||||
0. Finish classification feature
|
||||
|
||||
1. Summaries from human feedback
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
[] support additional negative samples generated from other models.
|
||||
|
||||
For example we can use galactica-125m to generate a TLDR and assume it was
|
||||
inferior than the human perference one
|
||||
inferior to the human preference one
|
||||
|
||||
|
||||
"""
|
||||
|
||||
@@ -46,6 +46,7 @@ defaults:
|
||||
quantization: false
|
||||
seq2seqmodel: false
|
||||
poly_eps: 1.0
|
||||
fuse_gelu: true
|
||||
|
||||
galactica-125m:
|
||||
learning_rate: 5e-5
|
||||
|
||||
@@ -23,5 +23,5 @@ Issues and TODO:
|
||||
- ideally we can update the config yaml and new dataset will be downloaded from
|
||||
hub
|
||||
|
||||
- one possible idea is we upload the trasform format of these dataset to the
|
||||
- one possible idea is we upload the transform format of these datasets to the
|
||||
OA hub
|
||||
|
||||
@@ -311,7 +311,7 @@ class JokeExplaination(Dataset):
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
joke = data["joke"]
|
||||
explanation = data["explaination"]
|
||||
explanation = data["explanation"]
|
||||
self.pairs.append((joke, explanation))
|
||||
|
||||
if len(question) > 0 and len(answer) > 0:
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
import functools
|
||||
|
||||
import torch
|
||||
from transformers.activations import FastGELUActivation, GELUActivation, NewGELUActivation, QuickGELUActivation
|
||||
|
||||
|
||||
def rsetattr(obj, attr, val):
    """Assign ``val`` to a (possibly dotted) attribute path on ``obj``.

    ``attr`` is split at its last dot: everything before it is resolved with
    ``rgetattr`` to find the owner object, and the final component is set on
    that owner. A path without dots is set directly on ``obj``.

    Returns whatever ``setattr`` returns (``None``).
    """
    owner_path, _, leaf = attr.rpartition(".")
    # No dot in the path: the attribute lives directly on ``obj``.
    owner = obj if not owner_path else rgetattr(obj, owner_path)
    return setattr(owner, leaf, val)
|
||||
|
||||
|
||||
def rgetattr(obj, attr, *args):
    """Resolve a dotted attribute path such as ``"a.b.c"`` starting at ``obj``.

    Each path component is looked up with ``getattr`` on the result of the
    previous lookup. Any extra positional argument is forwarded to every
    ``getattr`` call, so an optional default applies at each step of the walk.
    """
    current = obj
    for component in attr.split("."):
        current = getattr(current, component, *args)
    return current
|
||||
|
||||
|
||||
def fuse_gelu(model):
    """Replace Hugging Face GELU activation modules in ``model`` with a TorchScript GELU.

    Every sub-module that is an instance of ``GELUActivation``,
    ``FastGELUActivation``, ``NewGELUActivation`` or ``QuickGELUActivation``
    is swapped in place for one shared module that evaluates the tanh
    approximation of GELU with scripted forward and backward functions.

    NOTE(review): all four activation classes are replaced by the *tanh
    approximation*. If any of them implements a different formula (exact
    erf-based GELU, sigmoid-based "quick" GELU), outputs will differ slightly
    after fusing -- confirm against ``transformers.activations``.

    Args:
        model: module tree to patch; it is modified in place.

    Returns:
        The same ``model`` object, after the replacements.
    """

    @torch.jit.script
    def gelu_fwd(x):
        # tanh approximation of GELU; 0.79788456 ~= sqrt(2 / pi).
        return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

    @torch.jit.script
    def gelu_bwd(g, x):
        tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
        # Analytic derivative of gelu_fwd; 0.1070322243 == 3 * 0.044715 * 0.79788456.
        ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
        return ff * g

    class _FusedGeLUFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            # Tensors needed in backward must go through save_for_backward
            # (not plain ctx attributes) so autograd can track in-place
            # modification and release them correctly.
            ctx.save_for_backward(x)
            return gelu_fwd(x)

        @staticmethod
        def backward(ctx, grad_output):
            (x,) = ctx.saved_tensors
            return gelu_bwd(grad_output, x)

    class FusedGelu(torch.nn.Module):
        def forward(self, input):
            return _FusedGeLUFunction.apply(input)

    fused_gelu_module = FusedGelu()
    hf_gelu_functions = (GELUActivation, FastGELUActivation, NewGELUActivation, QuickGELUActivation)

    # Collect the paths first so the module tree is not mutated while it is
    # being traversed. remove_duplicate=False makes an activation instance that
    # is shared by several parents show up under every path, so each reference
    # gets replaced rather than only the first one encountered.
    target_names = [
        name
        for name, module in model.named_modules(remove_duplicate=False)
        # The root module has the empty name and cannot be replaced in place.
        if name and isinstance(module, hf_gelu_functions)
    ]
    for name in target_names:
        rsetattr(model, name, fused_gelu_module)

    return model
|
||||
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import bitsandbytes
|
||||
import torch
|
||||
from efficiency_utils import fuse_gelu
|
||||
from torch import nn
|
||||
from transformers import PreTrainedModel, Trainer, TrainingArguments
|
||||
from transformers.training_args import OptimizerNames
|
||||
@@ -180,6 +181,9 @@ if __name__ == "__main__":
|
||||
module, "weight", {"optim_bits": 32}
|
||||
)
|
||||
|
||||
if training_conf.fuse_gelu:
|
||||
model = fuse_gelu(model)
|
||||
|
||||
args = TrainingArguments(
|
||||
output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
|
||||
num_train_epochs=training_conf.num_train_epochs,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import List, NamedTuple
|
||||
|
||||
import evaluate
|
||||
import transformers
|
||||
@@ -66,19 +66,64 @@ class PerDatasetSampler(Sampler):
|
||||
return cls(dataset_sizes, dataset_size_per_epoch)
|
||||
|
||||
|
||||
def get_tokenizer(conf):
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
|
||||
def get_dataset_fractions(conf, dataset_sizes):
    """Work out which share of every dataset is sampled per epoch.

    ``conf`` is iterated entry by entry, in step with ``dataset_sizes``.
    Entries that are not dicts use the full dataset (fraction ``1``). Dict
    entries must carry, under the dataset's name, either a ``fraction``
    (must be > 0; values above 1 are capped at 1) or a ``size`` (must not
    exceed the dataset's number of examples).

    Returns:
        A list with one fraction per entry of ``conf``.

    Raises:
        ValueError: on a non-positive fraction, a size larger than the
            dataset, or a dict entry with neither ``fraction`` nor ``size``.
    """
    per_dataset = []
    for idx, entry in enumerate(conf):
        name = get_dataset_name_from_data_config(entry)
        if not isinstance(entry, dict):
            # Plain entries carry no sampling options: use everything.
            per_dataset.append(1)
            continue

        options = entry[name]
        if "fraction" in options:
            fraction = options["fraction"]
            if fraction <= 0:
                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
            per_dataset.append(min(1, fraction))
        elif "size" in options:
            requested = options["size"]
            total = dataset_sizes[idx]
            if requested > total:
                raise ValueError(f"Please specify a size smaller than number of examples: {total:,.0f}")
            per_dataset.append(requested / total)
        else:
            raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
    return per_dataset
|
||||
|
||||
if "galactica" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
|
||||
elif "GPT-JT" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
|
||||
elif "codegen" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
|
||||
elif "pythia" in conf.model_name:
|
||||
tokenizer.add_special_tokens(
|
||||
{"pad_token": "<|padding|>", "sep_token": "<|endoftext|>", "eos_token": "<|endoftext|>"}
|
||||
)
|
||||
|
||||
class SpecialTokens(NamedTuple):
    """Special-token strings for a tokenizer; every field defaults to ``""``."""

    # Positional order is (pad_token, eos_token, sep_token).
    pad_token: str = ""
    eos_token: str = ""
    sep_token: str = ""
|
||||
|
||||
|
||||
class TokenizerConfig(NamedTuple):
    """Per-model-family tokenizer settings; currently only the special tokens."""

    # NOTE(review): the default is an empty dict although the annotation says
    # SpecialTokens. It is falsy, which a truthiness check on this field can
    # rely on -- confirm before changing it to another "empty" value.
    special_tokens: SpecialTokens = {}
|
||||
|
||||
|
||||
# Tokenizer settings keyed by a substring that is looked for in the model name.
# Fields that are not listed keep the SpecialTokens default of "".
TOKENIZER_CONFIGS = {
    "galactica": TokenizerConfig(
        special_tokens=SpecialTokens(pad_token="<pad>", eos_token="</s>"),
    ),
    "GPT-JT": TokenizerConfig(
        special_tokens=SpecialTokens(sep_token="<|extratoken_100|>"),
    ),
    "codegen": TokenizerConfig(
        special_tokens=SpecialTokens(pad_token="<|endoftext|>", sep_token="<|endoftext|>"),
    ),
    "pythia": TokenizerConfig(
        special_tokens=SpecialTokens(
            pad_token="<|padding|>",
            eos_token="<|endoftext|>",
            sep_token="<|endoftext|>",
        ),
    ),
}
|
||||
|
||||
|
||||
def match_tokenizer_name(model_name: str) -> TokenizerConfig:
    """Look up the tokenizer configuration whose key occurs in ``model_name``.

    Every key of ``TOKENIZER_CONFIGS`` is tested as a substring of
    ``model_name``; exactly one key has to match.

    Raises:
        ValueError: if no key matches, or if more than one key matches.
    """
    candidates = [cfg for key, cfg in TOKENIZER_CONFIGS.items() if key in model_name]
    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")
    raise ValueError(f"Found multiple tokeniser configuration matches for {model_name=}")
|
||||
|
||||
|
||||
def get_tokenizer(conf) -> transformers.AutoTokenizer:
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
|
||||
tokenizer_config = match_tokenizer_name(conf.model_name)
|
||||
|
||||
if tokenizer_config.special_tokens:
|
||||
if "GPT-JT" in conf.model_name:
|
||||
tokenizer_config.special_tokens.pad_token = tokenizer.eos_token
|
||||
tokenizer.add_special_tokens(tokenizer_config.special_tokens)
|
||||
|
||||
additional_special_tokens = (
|
||||
[]
|
||||
@@ -171,27 +216,6 @@ def get_dataset_name_from_data_config(data_config):
|
||||
return data_config
|
||||
|
||||
|
||||
def get_dataset_fractions(conf, dataset_sizes):
    """Work out which share of every dataset is sampled per epoch.

    ``conf`` is iterated entry by entry, in step with ``dataset_sizes``.
    Entries that are not dicts use the full dataset (fraction ``1``). Dict
    entries must carry, under the dataset's name, either a ``fraction``
    (must be > 0; values above 1 are capped at 1) or a ``size`` (must not
    exceed the dataset's number of examples).

    Returns:
        A list with one fraction per entry of ``conf``.

    Raises:
        ValueError: on a non-positive fraction, a size larger than the
            dataset, or a dict entry with neither ``fraction`` nor ``size``.
    """
    per_dataset = []
    for idx, entry in enumerate(conf):
        name = get_dataset_name_from_data_config(entry)
        if not isinstance(entry, dict):
            # Plain entries carry no sampling options: use everything.
            per_dataset.append(1)
            continue

        options = entry[name]
        if "fraction" in options:
            fraction = options["fraction"]
            if fraction <= 0:
                raise ValueError("Please specify fraction as a value between 0 < fraction <= 1")
            per_dataset.append(min(1, fraction))
        elif "size" in options:
            requested = options["size"]
            total = dataset_sizes[idx]
            if requested > total:
                raise ValueError(f"Please specify a size smaller than number of examples: {total:,.0f}")
            per_dataset.append(requested / total)
        else:
            raise ValueError("Please specify either fraction or size in config.yaml. See README for instructions.")
    return per_dataset
|
||||
|
||||
|
||||
def get_dataset(conf, tokenizer):
|
||||
train_datasets, evals = [], {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user