merge deepspeed

This commit is contained in:
Sotirios Anagnostidis
2023-01-06 21:28:26 +01:00
parent f2b125cbe3
commit 88ee3b3264
7 changed files with 181 additions and 132 deletions
+34 -1
View File
@@ -23,7 +23,40 @@ open-asisstant dataset are available it will be added here.
## Model
TBD
Normally you should be able to add new models in configs/config.yml
```
your-model-name:
learning_rate: 2e-6
model_name: <huggingface model name>
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
```
```
python trainer.py --configs defaults your-model-name
```
However, if the model of your choice doesn't define a pad_token, eos_token or
sep_token, you have to update `get_tokenizer` in utils.py to set the right tokens.
## Deepspeed support
You can edit configs/zero_config.json and use any ZeRO stage you wish. The
current config uses ZeRO stage 3. For more details on how to set up the config,
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
Once you are satisfied with your DeepSpeed config, you can add the --deepspeed
flag at the end of the command to enable DeepSpeed:
```
python trainer.py --configs defaults your-model-name --deepspeed
```
## Results
@@ -6,7 +6,7 @@ defaults:
per_device_eval_batch_size: 2
weight_decay: 0.00
warmup_steps: 600
eval_steps: 200
eval_steps: 100
save_steps: 500
max_length: 512
num_train_epochs: 3
@@ -17,6 +17,7 @@ defaults:
freeze_layer:
datasets:
- webgpt
- prompt_dialogue
cache_dir: ~/.cache
loss_fn: CrossEntropyLoss
eval_size:
@@ -44,11 +45,21 @@ gpt-jt:
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
codegen:
learning_rate: 2e-6
model_name: Salesforce/codegen-2B-multi
weight_decay: 0.01
max_length: 812
warmup_steps: 600
gradient_checkpointing: false
gradient_accumulation_steps: 5
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
debug:
eval_steps: 20
eval_size: 100
model_name: EleutherAI/gpt-j-6B
gradient_accumulation_steps: 2
gradient_accumulation_steps: 1
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
quantization: 8bit
quantization:
@@ -2,6 +2,8 @@ from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, Subset
from .prompt_dialogue import PromptGeneratedDataset
QA_SPECIAL_TOKENS = {"Question": "<question>", "Answer": "<answer>"}
@@ -14,8 +16,8 @@ class SquadV2Dataset(Dataset):
def __getitem__(self, idx):
data = self.dataset[idx]
# dummy return first answer
return "".join([data["title"], ". ", data["context"], " " + data["question"]]), data["answers"]["text"][0]
# return the first answer from the list of possible answers
return data["title"] + ". " + data["context"] + " " + data["question"], data["answers"]["text"][0]
class WebGPT(Dataset):
@@ -57,12 +59,14 @@ def get_one_dataset(conf, dataset_name):
dataset_name = dataset_name.lower()
if dataset_name == "squadv2":
raise ValueError("SquadV2 is not diverse enough for generation .. ")
train = SquadV2Dataset(conf.cache_dir, "train")
eval = SquadV2Dataset(conf.cache_dir, "validation")
elif dataset_name == "webgpt":
dataset = WebGPT()
train, eval = train_val_dataset(dataset, val_split=0.2)
elif dataset_name == "prompt_dialogue":
dataset = PromptGeneratedDataset()
train, eval = train_val_dataset(dataset, val_split=0.2)
else:
raise ValueError(f"Unknown dataset {dataset_name}")
@@ -0,0 +1,66 @@
import os
from urllib.request import urlopen
from torch.utils.data import Dataset
class PromptGeneratedDataset(Dataset):
    """Question/answer pairs parsed from a FLAN-11B generated chat transcript.

    The raw file is a sequence of dialogues separated by ``<|endoftext|>``::

        User: What are the best methods for preventing a slave trade?
        Rosey: The best methods ....
        <|endoftext|>

    Each dialogue is split at the first ``Rosey:`` marker. The text before the
    marker, minus the speaker prefix up to the first colon, is the question;
    everything after the marker is the answer. Multi-turn dialogues are not
    split any further. Dialogues without a ``Rosey:`` marker, without a
    speaker prefix, or with an empty question or answer are skipped.

    The transcript is downloaded from ``url`` on first use and cached as
    ``datasets/chat_dialogue_v2_c.txt`` relative to the working directory.
    """

    url = "https://github.com/Rallio67/language-model-agents/raw/main/chat_dialogue_v2_c.txt"

    def __init__(self) -> None:
        """Download the transcript if it is not cached, then parse it into ``self.pairs``."""
        super().__init__()
        os.makedirs("datasets", exist_ok=True)
        chat_dialogue = os.path.join("datasets", "chat_dialogue_v2_c.txt")
        if not os.path.exists(chat_dialogue):
            with urlopen(self.url) as response:
                content = response.read().decode("utf-8")
            # Write and read the cache with an explicit encoding so the result
            # does not depend on the platform's default locale.
            with open(chat_dialogue, "w", encoding="utf-8") as fout:
                fout.write(content)

        with open(chat_dialogue, "r", encoding="utf-8") as f:
            corpus = f.read().split("<|endoftext|>")

        # Every valid pair is appended exactly once; the last pair is not
        # appended a second time after the loop.
        self.pairs = self._parse_pairs(corpus)

    @staticmethod
    def _parse_pairs(corpus):
        """Turn raw dialogue strings into a list of ``(question, answer)`` tuples."""
        pairs = []
        for dialogue in corpus:
            user, marker, bot = dialogue.strip().partition("Rosey:")
            if not marker:
                # No bot turn in this chunk.
                continue
            # Drop the speaker prefix (e.g. "User:"); skip the chunk when the
            # prefix is missing instead of raising IndexError.
            _, colon, question = user.partition(":")
            if not colon:
                continue
            question = question.strip()
            answer = bot.strip()
            if question and answer:
                pairs.append((question, answer))
        return pairs

    def __len__(self):
        """Return the number of parsed question/answer pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the ``(question, answer)`` tuple stored at ``index``."""
        question, answer = self.pairs[index]
        return question, answer
if __name__ == "__main__":
    # Manual smoke test: build the dataset, batch it through the dialogue
    # collator and print the shape of ``input_ids`` for every batch.
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    from .dialogue_collator import DialogueDataCollator

    codegen_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi")
    # The same token is registered as both the padding and the separator token.
    special_tokens = {"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}
    codegen_tokenizer.add_special_tokens(special_tokens)

    prompt_dataset = PromptGeneratedDataset()
    collator = DialogueDataCollator(codegen_tokenizer, padding=True, max_length=128)
    loader = DataLoader(prompt_dataset, collate_fn=collator, batch_size=5)

    for collated in loader:
        input_ids = collated["input_ids"]
        print(input_ids.shape)
+1 -1
View File
@@ -7,7 +7,7 @@ class CrossEntropyLoss(nn.CrossEntropyLoss):
def forward(self, input, target, mask=None):
if mask is not None:
mask = mask.view(-1)
mask = mask.view(-1).bool()
input = input.view(-1, input.size(-1))
target = target.view(-1)
input = input[mask]
+54 -117
View File
@@ -1,34 +1,19 @@
import argparse
import os
from dataclasses import dataclass
from distutils.util import strtobool
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (
DataCollator,
EvalPrediction,
PreTrainedModel,
PreTrainedTokenizerBase,
Trainer,
TrainerCallback,
TrainingArguments,
get_cosine_schedule_with_warmup,
)
from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
import bitsandbytes as bnb
from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls
os.environ["WANDB_PROJECT"] = "supervised-finetuning"
@dataclass
class CustomTrainingArguments(TrainingArguments):
loss_function: str = "CrossEntropyLoss"
quantization: str = None
def compute_metrics(eval_pred):
pred_ids = eval_pred.predictions
labels = eval_pred.label_ids
@@ -46,35 +31,15 @@ class SFTTrainer(Trainer):
self,
model: Union[PreTrainedModel, nn.Module] = None,
args: TrainingArguments = None,
data_collator: Optional[DataCollator] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Dataset] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
model_init: Callable[[], PreTrainedModel] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
callbacks: Optional[List[TrainerCallback]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
loss_function: str = "CrossEntropyLoss",
**kwargs,
):
super().__init__(
model,
args,
data_collator,
train_dataset,
eval_dataset,
tokenizer,
model_init,
compute_metrics,
callbacks,
optimizers,
preprocess_logits_for_metrics,
)
super().__init__(model, args, **kwargs)
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
self.loss_fct = get_loss(args.loss_function)
self.loss_fct = get_loss(loss_function)
def create_optimizer_and_scheduler(self, num_training_steps: int):
print("Optimizer")
if self.args.quantization == "8bit":
self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
else:
@@ -82,7 +47,6 @@ class SFTTrainer(Trainer):
self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay
)
print("lr sheduler")
self.lr_scheduler = get_cosine_schedule_with_warmup(
self.optimizer,
num_warmup_steps=self.args.warmup_steps,
@@ -93,16 +57,17 @@ class SFTTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
labels_mask = inputs.pop("label_masks")
targets = inputs.pop("targets")
outputs = model(**inputs)
loss = self.loss_fct(outputs.get("logits"), torch.roll(inputs["input_ids"], -1, -1), mask=labels_mask)
loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)
return (loss, outputs) if return_outputs else loss
def _compute_loss(self, model, inputs):
labels_mask = inputs.pop("label_masks")
targets = inputs.pop("targets")
inputs = self._prepare_inputs(inputs)
@@ -110,7 +75,6 @@ class SFTTrainer(Trainer):
logits = outputs.get("logits")
targets = torch.roll(inputs["input_ids"], -1, -1)
loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)
return loss, logits, targets, labels_mask
@@ -142,12 +106,18 @@ def _strtobool(x):
def argument_parsing(notebook=False, notebook_args=None):
parser = argparse.ArgumentParser()
parser.add_argument("--configs", nargs="+", required=True)
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--deepspeed", action="store_true")
parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false")
parser.set_defaults(deepspeed=False)
if notebook:
args, remaining = parser.parse_known_args(notebook_args)
else:
args, remaining = parser.parse_known_args()
print(args)
# Config from YAML
conf = {}
configs = read_yamls("./configs")
@@ -158,6 +128,8 @@ def argument_parsing(notebook=False, notebook_args=None):
else:
conf.update(configs[name])
conf["local_rank"] = args.local_rank
conf["deepspeed"] = args.deepspeed
# Override config from command-line
parser = argparse.ArgumentParser()
for key, value in conf.items():
@@ -175,76 +147,41 @@ if __name__ == "__main__":
tokenizer = get_tokenizer(training_conf)
model = get_model(training_conf, tokenizer)
###
from datasets import load_dataset
from bitsandbytes.optim import Adam8bit
from torch.nn import functional as F
from tqdm import tqdm
train, evals, collate_fn = get_dataset(training_conf, tokenizer)
gpt = model.to("cuda")
args = TrainingArguments(
output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
num_train_epochs=training_conf.num_train_epochs,
warmup_steps=training_conf.warmup_steps,
learning_rate=float(training_conf.learning_rate),
deepspeed="configs/zero_config.json" if training_conf.deepspeed else None,
fp16=True,
local_rank=training_conf.local_rank,
gradient_checkpointing=training_conf.gradient_checkpointing,
gradient_accumulation_steps=training_conf.gradient_accumulation_steps,
per_device_train_batch_size=training_conf.per_device_train_batch_size,
per_device_eval_batch_size=training_conf.per_device_eval_batch_size,
weight_decay=training_conf.weight_decay,
max_grad_norm=training_conf.max_grad_norm,
logging_steps=training_conf.logging_steps,
save_total_limit=training_conf.save_total_limit,
evaluation_strategy="steps",
eval_steps=training_conf.eval_steps,
save_steps=training_conf.save_steps,
eval_accumulation_steps=training_conf.eval_accumulation_steps,
report_to="wandb",
)
gpt.gradient_checkpointing_enable()
codeparrot = load_dataset("transformersbook/codeparrot-train", streaming=True, cache_dir=training_conf.cache_dir)
optimizer = Adam8bit(gpt.parameters(), lr=1e-5)
with torch.cuda.amp.autocast():
for row in tqdm(codeparrot["train"]):
if len(row["content"]) <= 1:
continue
batch = tokenizer(row["content"], truncation=True, max_length=128, return_tensors="pt")
batch = {k: v.cuda() for k, v in batch.items()}
out = gpt.forward(
**batch,
)
loss = F.cross_entropy(
out.logits[:, :-1, :].flatten(0, -2), batch["input_ids"][:, 1:].flatten(), reduction="mean"
)
print(loss)
loss.backward()
optimizer.step()
optimizer.zero_grad()
###
# train, evals, collate_fn = get_dataset(training_conf, tokenizer)
# assert len(evals) > 0
# args = CustomTrainingArguments(
# output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
# num_train_epochs=training_conf.num_train_epochs,
# warmup_steps=training_conf.warmup_steps,
# loss_function=training_conf.loss_fn,
# learning_rate=float(training_conf.learning_rate),
# fp16=True,
# gradient_checkpointing=training_conf.gradient_checkpointing,
# gradient_accumulation_steps=training_conf.gradient_accumulation_steps,
# per_device_train_batch_size=training_conf.per_device_train_batch_size,
# per_device_eval_batch_size=training_conf.per_device_eval_batch_size,
# weight_decay=training_conf.weight_decay,
# max_grad_norm=training_conf.max_grad_norm,
# logging_steps=training_conf.logging_steps,
# save_total_limit=training_conf.save_total_limit,
# evaluation_strategy="steps",
# eval_steps=training_conf.eval_steps,
# save_steps=training_conf.save_steps,
# eval_accumulation_steps=training_conf.eval_accumulation_steps,
# report_to="wandb",
# quantization=training_conf.quantization,
# )
# trainer = SFTTrainer(
# model,
# args,
# train_dataset=train,
# eval_dataset=evals,
# data_collator=collate_fn,
# tokenizer=tokenizer,
# compute_metrics=compute_metrics,
# preprocess_logits_for_metrics=preprocess_logits_for_metrics,
# )
# trainer.train()
assert len(evals) > 0
trainer = SFTTrainer(
model,
args,
loss_function=training_conf.loss_fn,
train_dataset=train,
eval_dataset=evals,
data_collator=collate_fn,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.train()
+4 -6
View File
@@ -15,6 +15,10 @@ def get_tokenizer(conf):
if "galactica" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
elif "GPT-JT" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
elif "codegen" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
additional_special_tokens = (
[]
@@ -29,12 +33,6 @@ def get_tokenizer(conf):
def get_model(conf, tokenizer):
if not any([x in conf.model_name.lower() for x in SUPPORTED_MODELS]):
raise ValueError(
f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. "
"To include more make sure the masking is done correctly... (decoder only supported for now)"
)
model = get_specific_model(conf.model_name, conf.cache_dir, conf.quantization)
if len(tokenizer) != model.get_input_embeddings().num_embeddings: