Merge pull request #452 from LAION-AI/sft-rallio

Added deepspeed and dialogue dataset for code
This commit is contained in:
sanagnos
2023-01-06 19:19:01 +02:00
committed by GitHub
9 changed files with 192 additions and 53 deletions
+34 -1
View File
@@ -23,7 +23,40 @@ open-asisstant dataset are available it will be added here.
## Model
TBD
Normally you should be able to add new models in configs/config.yml
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't have pad_token, eos_token,
sep_token, you have to update utils.py `get_tokenizer` to use the right token.
## Deepspeed support
You can edit the configs/zero_config.json and use any stage you wish. The
current config uses zero-stage 3. For more details on how to setup the config
checkout [this page](https://www.deepspeed.ai/tutorials/zero/)
Once you are satisfy with your deepzero config, you can add --deepspeed flag at
the end to trigger deepspeed
```
python trainer.py --configs defaults your-model-name --deepspeed
```
## Results
@@ -6,7 +6,7 @@ defaults:
per_device_eval_batch_size: 2
weight_decay: 0.00
warmup_steps: 600
eval_steps: 200
eval_steps: 100
save_steps: 500
max_length: 512
num_train_epochs: 3
@@ -17,6 +17,7 @@ defaults:
freeze_layer:
datasets:
- webgpt
- prompt_dialogue
cache_dir: ~/.cache
loss_fn: CrossEntropyLoss
eval_size:
@@ -43,6 +44,17 @@ gpt-jt:
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
codegen:
learning_rate: 2e-6
model_name: Salesforce/codegen-2B-multi
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
debug:
eval_steps: 20
eval_size: 100
@@ -0,0 +1,52 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 0.1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"sub_group_size": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 2.0,
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
@@ -2,6 +2,8 @@ from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, Subset
from .prompt_dialogue import PromptGeneratedDataset
QA_SPECIAL_TOKENS = {"Question": "<question>", "Answer": "<answer>"}
@@ -63,6 +65,9 @@ def get_one_dataset(conf, dataset_name):
elif dataset_name == "webgpt":
dataset = WebGPT()
train, eval = train_val_dataset(dataset, val_split=0.2)
elif dataset_name == "prompt_dialogue":
dataset = PromptGeneratedDataset()
train, eval = train_val_dataset(dataset, val_split=0.2)
else:
raise ValueError(f"Unknown dataset {dataset_name}")
@@ -81,6 +81,7 @@ class DialogueDataCollator:
batch["label_masks"] = torch.stack([F.pad(torch.tensor(x), (0, dim - len(x))) for x in label_masks])
# why the fuck?
for k in list(batch.keys()):
if k not in ["input_ids", "attention_mask", "label_masks"]:
batch.pop(k)
@@ -0,0 +1,66 @@
import os
from urllib.request import urlopen
from torch.utils.data import Dataset
class PromptGeneratedDataset(Dataset):
"""Generates from flan 11B
User: What are the best methods for preventing a slave trade?
Rosey: The best methods ....
<|endoftext|>
we are ignoring results with multiple lines for now
"""
url = "https://github.com/Rallio67/language-model-agents/raw/main/chat_dialogue_v2_c.txt"
def __init__(self) -> None:
super().__init__()
os.makedirs("datasets", exist_ok=True)
chat_dialogue = os.path.join("datasets", "chat_dialogue_v2_c.txt")
if not os.path.exists(chat_dialogue):
with urlopen(self.url) as file:
content = file.read().decode()
with open(chat_dialogue, "w") as fout:
fout.write(content)
question = ""
answer = ""
self.pairs = []
with open(chat_dialogue, "r") as f:
corpus = f.read().split("<|endoftext|>")
for dialogue in corpus:
dialogue = dialogue.strip()
if "Rosey:" in dialogue:
user, bot = dialogue.split("Rosey:", maxsplit=1)
question = user.split(":", maxsplit=1)[1].strip()
answer = bot.strip()
if len(answer) and len(question):
self.pairs.append((question, answer))
if len(question) > 0 and len(answer) > 0:
self.pairs.append((question, answer))
def __len__(self):
return len(self.pairs)
def __getitem__(self, index):
question, answer = self.pairs[index]
return question, answer
if __name__ == "__main__":
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from .dialogue_collator import DialogueDataCollator
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi")
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
dataset = PromptGeneratedDataset()
collate_fn = DialogueDataCollator(tokenizer, padding=True, max_length=128)
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=5)
for batch in dataloader:
print(batch["input_ids"].shape)
+1 -1
View File
@@ -7,7 +7,7 @@ class CrossEntropyLoss(nn.CrossEntropyLoss):
def forward(self, input, target, mask=None):
if mask is not None:
mask = mask.view(-1)
mask = mask.view(-1).bool()
input = input.view(-1, input.size(-1))
target = target.view(-1)
input = input[mask]
+16 -43
View File
@@ -1,32 +1,16 @@
import argparse
import os
from dataclasses import dataclass
from distutils.util import strtobool
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (
DataCollator,
EvalPrediction,
PreTrainedModel,
PreTrainedTokenizerBase,
Trainer,
TrainerCallback,
TrainingArguments,
get_cosine_schedule_with_warmup,
)
from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls
os.environ["WANDB_PROJECT"] = "supervised-finetuning"
@dataclass
class CustomTrainingArguments(TrainingArguments):
loss_function: str = "CrossEntropyLoss"
def compute_metrics(eval_pred):
pred_ids = eval_pred.predictions
labels = eval_pred.label_ids
@@ -44,32 +28,13 @@ class SFTTrainer(Trainer):
self,
model: Union[PreTrainedModel, nn.Module] = None,
args: TrainingArguments = None,
data_collator: Optional[DataCollator] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Dataset] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
model_init: Callable[[], PreTrainedModel] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
callbacks: Optional[List[TrainerCallback]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
loss_function: str = "CrossEntropyLoss",
**kwargs,
):
super().__init__(
model,
args,
data_collator,
train_dataset,
eval_dataset,
tokenizer,
model_init,
compute_metrics,
callbacks,
optimizers,
preprocess_logits_for_metrics,
)
super().__init__(model, args, **kwargs)
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
self.loss_fct = get_loss(args.loss_function)
self.loss_fct = get_loss(loss_function)
def fetch_scheduler(self):
return get_cosine_schedule_with_warmup(
@@ -131,6 +96,10 @@ def _strtobool(x):
def argument_parsing(notebook=False, notebook_args=None):
parser = argparse.ArgumentParser()
parser.add_argument("--configs", nargs="+", required=True)
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--deepspeed", action="store_true")
parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false")
parser.set_defaults(deepspeed=False)
if notebook:
args, remaining = parser.parse_known_args(notebook_args)
@@ -147,6 +116,8 @@ def argument_parsing(notebook=False, notebook_args=None):
else:
conf.update(configs[name])
conf["local_rank"] = args.local_rank
conf["deepspeed"] = args.deepspeed
# Override config from command-line
parser = argparse.ArgumentParser()
for key, value in conf.items():
@@ -166,13 +137,14 @@ if __name__ == "__main__":
train, evals, collate_fn = get_dataset(training_conf, tokenizer)
args = CustomTrainingArguments(
args = TrainingArguments(
output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
num_train_epochs=training_conf.num_train_epochs,
warmup_steps=training_conf.warmup_steps,
loss_function=training_conf.loss_fn,
learning_rate=float(training_conf.learning_rate),
deepspeed="configs/zero_config.json" if training_conf.deepspeed else None,
fp16=True,
local_rank=training_conf.local_rank,
gradient_checkpointing=training_conf.gradient_checkpointing,
gradient_accumulation_steps=training_conf.gradient_accumulation_steps,
per_device_train_batch_size=training_conf.per_device_train_batch_size,
@@ -192,6 +164,7 @@ if __name__ == "__main__":
trainer = SFTTrainer(
model,
args,
loss_function=training_conf.loss_fn,
train_dataset=train,
eval_dataset=evals,
data_collator=collate_fn,
+4 -7
View File
@@ -8,14 +8,16 @@ from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, Subset
from transformers import AutoModelForCausalLM, AutoTokenizer
SUPPORTED_MODELS = ["galactica", "GPT-JT"] # deprecated ..
def get_tokenizer(conf):
tokenizer = AutoTokenizer.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
if "galactica" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
elif "GPT-JT" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
elif "codegen" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
additional_special_tokens = (
[]
@@ -30,11 +32,6 @@ def get_tokenizer(conf):
def get_model(conf, tokenizer):
if not any([x in conf.model_name for x in SUPPORTED_MODELS]):
raise ValueError(
f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. "
"To include more make sure the masking is dne correctly... (decoder only supported for now)"
)
model = AutoModelForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)