mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
merge deepspeed
This commit is contained in:
@@ -23,7 +23,40 @@ open-asisstant dataset are available it will be added here.
|
||||
|
||||
## Model
|
||||
|
||||
TBD
|
||||
Normally you should be able to add new models in configs/config.yml
|
||||
|
||||
```
|
||||
your-model-name:
|
||||
learning_rate: 2e-6
|
||||
model_name: <huggingface model name>
|
||||
weight_decay: 0.01
|
||||
max_length: 812
|
||||
warmup_steps: 600
|
||||
gradient_checkpointing: false
|
||||
gradient_accumulation_steps: 5
|
||||
per_device_train_batch_size: 4
|
||||
per_device_eval_batch_size: 4
|
||||
```
|
||||
|
||||
```
|
||||
python trainer.py --configs defaults your-model-name
|
||||
```
|
||||
|
||||
However, if the model of your choice doesn't define a pad_token, eos_token, or
|
||||
sep_token, you have to update `get_tokenizer` in utils.py to use the right tokens.
|
||||
|
||||
## Deepspeed support
|
||||
|
||||
You can edit the configs/zero_config.json and use any stage you wish. The
|
||||
current config uses zero-stage 3. For more details on how to set up the config,
|
||||
check out [this page](https://www.deepspeed.ai/tutorials/zero/).
|
||||
|
||||
Once you are satisfied with your DeepSpeed config, you can add the --deepspeed flag at
|
||||
the end to enable DeepSpeed:
|
||||
|
||||
```
|
||||
python trainer.py --configs defaults your-model-name --deepspeed
|
||||
```
|
||||
|
||||
## Results
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ defaults:
|
||||
per_device_eval_batch_size: 2
|
||||
weight_decay: 0.00
|
||||
warmup_steps: 600
|
||||
eval_steps: 200
|
||||
eval_steps: 100
|
||||
save_steps: 500
|
||||
max_length: 512
|
||||
num_train_epochs: 3
|
||||
@@ -17,6 +17,7 @@ defaults:
|
||||
freeze_layer:
|
||||
datasets:
|
||||
- webgpt
|
||||
- prompt_dialogue
|
||||
cache_dir: ~/.cache
|
||||
loss_fn: CrossEntropyLoss
|
||||
eval_size:
|
||||
@@ -44,11 +45,21 @@ gpt-jt:
|
||||
per_device_train_batch_size: 4
|
||||
per_device_eval_batch_size: 4
|
||||
|
||||
codegen:
|
||||
learning_rate: 2e-6
|
||||
model_name: Salesforce/codegen-2B-multi
|
||||
weight_decay: 0.01
|
||||
max_length: 812
|
||||
warmup_steps: 600
|
||||
gradient_checkpointing: false
|
||||
gradient_accumulation_steps: 5
|
||||
per_device_train_batch_size: 4
|
||||
per_device_eval_batch_size: 4
|
||||
|
||||
debug:
|
||||
eval_steps: 20
|
||||
eval_size: 100
|
||||
model_name: EleutherAI/gpt-j-6B
|
||||
gradient_accumulation_steps: 2
|
||||
gradient_accumulation_steps: 1
|
||||
per_device_train_batch_size: 1
|
||||
per_device_eval_batch_size: 1
|
||||
quantization: 8bit
|
||||
quantization:
|
||||
@@ -2,6 +2,8 @@ from datasets import load_dataset
|
||||
from sklearn.model_selection import train_test_split
|
||||
from torch.utils.data import Dataset, Subset
|
||||
|
||||
from .prompt_dialogue import PromptGeneratedDataset
|
||||
|
||||
QA_SPECIAL_TOKENS = {"Question": "<question>", "Answer": "<answer>"}
|
||||
|
||||
|
||||
@@ -14,8 +16,8 @@ class SquadV2Dataset(Dataset):
|
||||
|
||||
def __getitem__(self, idx):
|
||||
data = self.dataset[idx]
|
||||
# dummy return first answer
|
||||
return "".join([data["title"], ". ", data["context"], " " + data["question"]]), data["answers"]["text"][0]
|
||||
# return the first answer from the list of possible answers
|
||||
return data["title"] + ". " + data["context"] + " " + data["question"], data["answers"]["text"][0]
|
||||
|
||||
|
||||
class WebGPT(Dataset):
|
||||
@@ -57,12 +59,14 @@ def get_one_dataset(conf, dataset_name):
|
||||
dataset_name = dataset_name.lower()
|
||||
|
||||
if dataset_name == "squadv2":
|
||||
raise ValueError("SquadV2 is not diverse enough for generation .. ")
|
||||
train = SquadV2Dataset(conf.cache_dir, "train")
|
||||
eval = SquadV2Dataset(conf.cache_dir, "validation")
|
||||
elif dataset_name == "webgpt":
|
||||
dataset = WebGPT()
|
||||
train, eval = train_val_dataset(dataset, val_split=0.2)
|
||||
elif dataset_name == "prompt_dialogue":
|
||||
dataset = PromptGeneratedDataset()
|
||||
train, eval = train_val_dataset(dataset, val_split=0.2)
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset {dataset_name}")
|
||||
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
import os
|
||||
from urllib.request import urlopen
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class PromptGeneratedDataset(Dataset):
    """Question/answer pairs parsed from the flan 11B generated chat dialogues.

    The corpus is one text file in which dialogues are separated by
    ``<|endoftext|>``, e.g.::

        User: What are the best methods for preventing a slave trade?

        Rosey: The best methods ....
        <|endoftext|>

    Each dialogue is split at its first ``Rosey:`` marker. The text after the
    first ``:`` of the user part becomes the question and everything after the
    marker becomes the answer, so any later turns stay inside the answer text.
    Dialogues without the marker, without a speaker prefix on the user part,
    or with an empty question/answer are skipped.

    NOTE(review): the original note said "we are ignoring results with
    multiple lines for now", but the parsing does not filter multi-line
    results - confirm which behaviour is intended.
    """

    url = "https://github.com/Rallio67/language-model-agents/raw/main/chat_dialogue_v2_c.txt"

    def __init__(self) -> None:
        """Download the corpus on first use and parse it into ``self.pairs``.

        The file is cached at ``datasets/chat_dialogue_v2_c.txt`` relative to
        the current working directory and is only fetched when missing.
        """
        super().__init__()
        os.makedirs("datasets", exist_ok=True)
        chat_dialogue = os.path.join("datasets", "chat_dialogue_v2_c.txt")
        if not os.path.exists(chat_dialogue):
            with urlopen(self.url) as file:
                content = file.read().decode("utf-8")
            # Write and read with an explicit encoding so the cache does not
            # depend on the platform's locale default.
            with open(chat_dialogue, "w", encoding="utf-8") as fout:
                fout.write(content)

        with open(chat_dialogue, "r", encoding="utf-8") as f:
            corpus = f.read().split("<|endoftext|>")

        self.pairs = []
        for dialogue in corpus:
            user, marker, bot = dialogue.strip().partition("Rosey:")
            if not marker:
                continue
            # Drop the "User:" style speaker prefix; a user part without any
            # colon is malformed and skipped instead of raising IndexError.
            _, colon, question = user.partition(":")
            if not colon:
                continue
            question = question.strip()
            answer = bot.strip()
            # Each valid pair is appended exactly once (the old trailing check
            # after the loop appended the last pair a second time).
            if question and answer:
                self.pairs.append((question, answer))

    def __len__(self):
        """Return the number of parsed (question, answer) pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the ``(question, answer)`` string tuple stored at ``index``."""
        question, answer = self.pairs[index]
        return question, answer
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: batch the dataset through the dialogue collator with
    # the codegen tokenizer and print the shape of every batch's input ids.
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    from .dialogue_collator import DialogueDataCollator

    codegen_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi")
    # The same token string is registered as both the pad and the sep token.
    special_tokens = {"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}
    codegen_tokenizer.add_special_tokens(special_tokens)

    loader = DataLoader(
        PromptGeneratedDataset(),
        collate_fn=DialogueDataCollator(codegen_tokenizer, padding=True, max_length=128),
        batch_size=5,
    )
    for batch in loader:
        print(batch["input_ids"].shape)
|
||||
@@ -7,7 +7,7 @@ class CrossEntropyLoss(nn.CrossEntropyLoss):
|
||||
|
||||
def forward(self, input, target, mask=None):
|
||||
if mask is not None:
|
||||
mask = mask.view(-1)
|
||||
mask = mask.view(-1).bool()
|
||||
input = input.view(-1, input.size(-1))
|
||||
target = target.view(-1)
|
||||
input = input[mask]
|
||||
|
||||
@@ -1,34 +1,19 @@
|
||||
import argparse
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from distutils.util import strtobool
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import (
|
||||
DataCollator,
|
||||
EvalPrediction,
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizerBase,
|
||||
Trainer,
|
||||
TrainerCallback,
|
||||
TrainingArguments,
|
||||
get_cosine_schedule_with_warmup,
|
||||
)
|
||||
from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls
|
||||
|
||||
os.environ["WANDB_PROJECT"] = "supervised-finetuning"
|
||||
|
||||
|
||||
@dataclass
|
||||
class CustomTrainingArguments(TrainingArguments):
|
||||
loss_function: str = "CrossEntropyLoss"
|
||||
quantization: str = None
|
||||
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
pred_ids = eval_pred.predictions
|
||||
labels = eval_pred.label_ids
|
||||
@@ -46,35 +31,15 @@ class SFTTrainer(Trainer):
|
||||
self,
|
||||
model: Union[PreTrainedModel, nn.Module] = None,
|
||||
args: TrainingArguments = None,
|
||||
data_collator: Optional[DataCollator] = None,
|
||||
train_dataset: Optional[Dataset] = None,
|
||||
eval_dataset: Optional[Dataset] = None,
|
||||
tokenizer: Optional[PreTrainedTokenizerBase] = None,
|
||||
model_init: Callable[[], PreTrainedModel] = None,
|
||||
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
|
||||
callbacks: Optional[List[TrainerCallback]] = None,
|
||||
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
|
||||
preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
|
||||
loss_function: str = "CrossEntropyLoss",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
model,
|
||||
args,
|
||||
data_collator,
|
||||
train_dataset,
|
||||
eval_dataset,
|
||||
tokenizer,
|
||||
model_init,
|
||||
compute_metrics,
|
||||
callbacks,
|
||||
optimizers,
|
||||
preprocess_logits_for_metrics,
|
||||
)
|
||||
super().__init__(model, args, **kwargs)
|
||||
|
||||
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
|
||||
self.loss_fct = get_loss(args.loss_function)
|
||||
self.loss_fct = get_loss(loss_function)
|
||||
|
||||
def create_optimizer_and_scheduler(self, num_training_steps: int):
|
||||
print("Optimizer")
|
||||
if self.args.quantization == "8bit":
|
||||
self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
|
||||
else:
|
||||
@@ -82,7 +47,6 @@ class SFTTrainer(Trainer):
|
||||
self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay
|
||||
)
|
||||
|
||||
print("lr sheduler")
|
||||
self.lr_scheduler = get_cosine_schedule_with_warmup(
|
||||
self.optimizer,
|
||||
num_warmup_steps=self.args.warmup_steps,
|
||||
@@ -93,16 +57,17 @@ class SFTTrainer(Trainer):
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels_mask = inputs.pop("label_masks")
|
||||
targets = inputs.pop("targets")
|
||||
|
||||
outputs = model(**inputs)
|
||||
|
||||
loss = self.loss_fct(outputs.get("logits"), torch.roll(inputs["input_ids"], -1, -1), mask=labels_mask)
|
||||
loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)
|
||||
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
|
||||
def _compute_loss(self, model, inputs):
|
||||
|
||||
labels_mask = inputs.pop("label_masks")
|
||||
targets = inputs.pop("targets")
|
||||
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
|
||||
@@ -110,7 +75,6 @@ class SFTTrainer(Trainer):
|
||||
|
||||
logits = outputs.get("logits")
|
||||
|
||||
targets = torch.roll(inputs["input_ids"], -1, -1)
|
||||
loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)
|
||||
|
||||
return loss, logits, targets, labels_mask
|
||||
@@ -142,12 +106,18 @@ def _strtobool(x):
|
||||
def argument_parsing(notebook=False, notebook_args=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--configs", nargs="+", required=True)
|
||||
parser.add_argument("--local_rank", type=int, default=-1)
|
||||
parser.add_argument("--deepspeed", action="store_true")
|
||||
parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false")
|
||||
parser.set_defaults(deepspeed=False)
|
||||
|
||||
if notebook:
|
||||
args, remaining = parser.parse_known_args(notebook_args)
|
||||
else:
|
||||
args, remaining = parser.parse_known_args()
|
||||
|
||||
print(args)
|
||||
|
||||
# Config from YAML
|
||||
conf = {}
|
||||
configs = read_yamls("./configs")
|
||||
@@ -158,6 +128,8 @@ def argument_parsing(notebook=False, notebook_args=None):
|
||||
else:
|
||||
conf.update(configs[name])
|
||||
|
||||
conf["local_rank"] = args.local_rank
|
||||
conf["deepspeed"] = args.deepspeed
|
||||
# Override config from command-line
|
||||
parser = argparse.ArgumentParser()
|
||||
for key, value in conf.items():
|
||||
@@ -175,76 +147,41 @@ if __name__ == "__main__":
|
||||
tokenizer = get_tokenizer(training_conf)
|
||||
model = get_model(training_conf, tokenizer)
|
||||
|
||||
###
|
||||
from datasets import load_dataset
|
||||
from bitsandbytes.optim import Adam8bit
|
||||
from torch.nn import functional as F
|
||||
from tqdm import tqdm
|
||||
train, evals, collate_fn = get_dataset(training_conf, tokenizer)
|
||||
|
||||
gpt = model.to("cuda")
|
||||
args = TrainingArguments(
|
||||
output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
|
||||
num_train_epochs=training_conf.num_train_epochs,
|
||||
warmup_steps=training_conf.warmup_steps,
|
||||
learning_rate=float(training_conf.learning_rate),
|
||||
deepspeed="configs/zero_config.json" if training_conf.deepspeed else None,
|
||||
fp16=True,
|
||||
local_rank=training_conf.local_rank,
|
||||
gradient_checkpointing=training_conf.gradient_checkpointing,
|
||||
gradient_accumulation_steps=training_conf.gradient_accumulation_steps,
|
||||
per_device_train_batch_size=training_conf.per_device_train_batch_size,
|
||||
per_device_eval_batch_size=training_conf.per_device_eval_batch_size,
|
||||
weight_decay=training_conf.weight_decay,
|
||||
max_grad_norm=training_conf.max_grad_norm,
|
||||
logging_steps=training_conf.logging_steps,
|
||||
save_total_limit=training_conf.save_total_limit,
|
||||
evaluation_strategy="steps",
|
||||
eval_steps=training_conf.eval_steps,
|
||||
save_steps=training_conf.save_steps,
|
||||
eval_accumulation_steps=training_conf.eval_accumulation_steps,
|
||||
report_to="wandb",
|
||||
)
|
||||
|
||||
gpt.gradient_checkpointing_enable()
|
||||
|
||||
codeparrot = load_dataset("transformersbook/codeparrot-train", streaming=True, cache_dir=training_conf.cache_dir)
|
||||
optimizer = Adam8bit(gpt.parameters(), lr=1e-5)
|
||||
|
||||
with torch.cuda.amp.autocast():
|
||||
for row in tqdm(codeparrot["train"]):
|
||||
if len(row["content"]) <= 1:
|
||||
continue
|
||||
|
||||
batch = tokenizer(row["content"], truncation=True, max_length=128, return_tensors="pt")
|
||||
batch = {k: v.cuda() for k, v in batch.items()}
|
||||
|
||||
out = gpt.forward(
|
||||
**batch,
|
||||
)
|
||||
|
||||
loss = F.cross_entropy(
|
||||
out.logits[:, :-1, :].flatten(0, -2), batch["input_ids"][:, 1:].flatten(), reduction="mean"
|
||||
)
|
||||
print(loss)
|
||||
loss.backward()
|
||||
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
###
|
||||
|
||||
|
||||
# train, evals, collate_fn = get_dataset(training_conf, tokenizer)
|
||||
# assert len(evals) > 0
|
||||
|
||||
# args = CustomTrainingArguments(
|
||||
# output_dir=f"{training_conf.model_name}-{training_conf.log_dir}-finetuned",
|
||||
# num_train_epochs=training_conf.num_train_epochs,
|
||||
# warmup_steps=training_conf.warmup_steps,
|
||||
# loss_function=training_conf.loss_fn,
|
||||
# learning_rate=float(training_conf.learning_rate),
|
||||
# fp16=True,
|
||||
# gradient_checkpointing=training_conf.gradient_checkpointing,
|
||||
# gradient_accumulation_steps=training_conf.gradient_accumulation_steps,
|
||||
# per_device_train_batch_size=training_conf.per_device_train_batch_size,
|
||||
# per_device_eval_batch_size=training_conf.per_device_eval_batch_size,
|
||||
# weight_decay=training_conf.weight_decay,
|
||||
# max_grad_norm=training_conf.max_grad_norm,
|
||||
# logging_steps=training_conf.logging_steps,
|
||||
# save_total_limit=training_conf.save_total_limit,
|
||||
# evaluation_strategy="steps",
|
||||
# eval_steps=training_conf.eval_steps,
|
||||
# save_steps=training_conf.save_steps,
|
||||
# eval_accumulation_steps=training_conf.eval_accumulation_steps,
|
||||
# report_to="wandb",
|
||||
# quantization=training_conf.quantization,
|
||||
# )
|
||||
|
||||
# trainer = SFTTrainer(
|
||||
# model,
|
||||
# args,
|
||||
# train_dataset=train,
|
||||
# eval_dataset=evals,
|
||||
# data_collator=collate_fn,
|
||||
# tokenizer=tokenizer,
|
||||
# compute_metrics=compute_metrics,
|
||||
# preprocess_logits_for_metrics=preprocess_logits_for_metrics,
|
||||
# )
|
||||
# trainer.train()
|
||||
assert len(evals) > 0
|
||||
trainer = SFTTrainer(
|
||||
model,
|
||||
args,
|
||||
loss_function=training_conf.loss_fn,
|
||||
train_dataset=train,
|
||||
eval_dataset=evals,
|
||||
data_collator=collate_fn,
|
||||
tokenizer=tokenizer,
|
||||
compute_metrics=compute_metrics,
|
||||
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
|
||||
)
|
||||
trainer.train()
|
||||
|
||||
@@ -15,6 +15,10 @@ def get_tokenizer(conf):
|
||||
|
||||
if "galactica" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
|
||||
elif "GPT-JT" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
|
||||
elif "codegen" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
|
||||
|
||||
additional_special_tokens = (
|
||||
[]
|
||||
@@ -29,12 +33,6 @@ def get_tokenizer(conf):
|
||||
|
||||
|
||||
def get_model(conf, tokenizer):
|
||||
if not any([x in conf.model_name.lower() for x in SUPPORTED_MODELS]):
|
||||
raise ValueError(
|
||||
f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. "
|
||||
"To include more make sure the masking is done correctly... (decoder only supported for now)"
|
||||
)
|
||||
|
||||
model = get_specific_model(conf.model_name, conf.cache_dir, conf.quantization)
|
||||
|
||||
if len(tokenizer) != model.get_input_embeddings().num_embeddings:
|
||||
|
||||
Reference in New Issue
Block a user