[feature] working trainer code

This commit is contained in:
theblackcat102
2022-12-31 03:02:10 +00:00
parent ad98a28241
commit bcd5c52b3b
6 changed files with 174 additions and 24 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
{
"python.formatting.provider": "black",
"python.formatting.provider": "autopep8",
"python.analysis.extraPaths": ["${workspaceFolder}/oasst-shared"]
}
@@ -1,5 +1,11 @@
'''
HFSummary
I want to train a multi-output regression model on the axis_evals dataset, so that we can estimate each of these scores:
- {"overall": "6", "accuracy": "6", "coverage": "6", "coherence": "7"}
This should be more informative than a single preference score.
'''
import os
@@ -9,3 +15,5 @@ import torch
import numpy as np
from dataset import load_dataset
from torch.utils.data import Dataset
+34 -15
View File
@@ -8,32 +8,51 @@
[ ]
'''
from typing import Optional, Union
import os
import glob
import json
from dataclasses import dataclass
import numpy as np
from torch.utils.data import Dataset
import torch
from datasets import load_dataset
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
@dataclass
class DataCollatorForPairRank:
    """
    Data collator that tokenizes (question, answer) pairs and pads them
    dynamically into one flat batch.

    Each input feature is a ``(question, pairs)`` tuple where ``pairs`` is an
    iterable of ``(pos, neg)`` answers.  For every pair two sequences are
    emitted, positive first and negative second, so rows ``2*i`` and
    ``2*i + 1`` of the returned batch always belong to the same pair.
    """

    # Tokenizer used both to encode the text pairs and to pad the batch.
    tokenizer: PreTrainedTokenizerBase
    # Number of candidates per comparison (kept for API compatibility; the
    # collator itself always emits two rows per pair).
    num_choices: int = 2
    # Padding strategy forwarded to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Upper bound on sequence length; applied when truncating and forwarded
    # to ``tokenizer.pad``.  ``None`` falls back to the tokenizer default.
    max_length: Optional[int] = None
    # Forwarded to ``tokenizer.pad``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Tokenize every (question, pos) / (question, neg) pair and pad.

        Args:
            features: iterable of ``(question, pairs)`` tuples, ``pairs``
                being an iterable of ``(pos, neg)`` answers.

        Returns:
            The padded batch produced by ``tokenizer.pad`` with
            ``return_tensors="pt"``; rows alternate pos, neg, pos, neg, ...
        """
        flatten_features = []
        for question, pairs in features:
            for pos, neg in pairs:
                # max_length must be given here: with padding=True the pad
                # call below ignores it, so without this the configured
                # limit would never be enforced.
                flatten_features.append(
                    self.tokenizer(question, pos, truncation=True,
                                   max_length=self.max_length))
                flatten_features.append(
                    self.tokenizer(question, neg, truncation=True,
                                   max_length=self.max_length))

        batch = self.tokenizer.pad(
            flatten_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        return batch
class WebGPT(Dataset):
def __init__(self) -> None:
@@ -1,26 +1,26 @@
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
# The collator class defined in rank_datasets is DataCollatorForPairRank;
# importing DataCollatorForMultipleChoice from it fails with ImportError.
from rank_datasets import WebGPT, HFSummary, DataCollatorForPairRank


def test_hfsummary():
    """Smoke test: run HFSummary through the pair-rank collator and print batch shapes."""
    tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
    collate_fn = DataCollatorForPairRank(tokenizer, max_length=200)
    dataset = HFSummary()
    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=8)
    for batch in dataloader:
        print(batch['input_ids'].shape)


def test_webgpt():
    """Smoke test: run WebGPT through the pair-rank collator and print batch shapes."""
    tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
    collate_fn = DataCollatorForPairRank(tokenizer, max_length=200)
    dataset = WebGPT()
    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)
    for batch in dataloader:
        print(batch['input_ids'].shape)


if __name__ == "__main__":
+102 -2
View File
@@ -1,2 +1,102 @@
import wandb
from accelerate import Accelerator
from typing import Callable, List, Optional, Tuple, Union, Dict
import torch
from torch import nn
import numpy as np
import evaluate
from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoModelForMultipleChoice
from transformers import Trainer, PreTrainedModel, TrainingArguments, DataCollator, EvalPrediction, TrainerCallback, PreTrainedTokenizerBase
from rank_datasets import DataCollatorForPairRank, WebGPT
from utils import get_tokenizer, train_val_dataset
accuracy = evaluate.load("accuracy")
@dataclass
class CustomTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the choice of training loss."""

    # 'rank' makes RankTrainer use RankLoss; any other value makes it use
    # nn.CrossEntropyLoss.
    loss_function: str = 'rank'
def compute_metrics(eval_pred):
    """Return pairwise ranking accuracy for an evaluation run.

    Args:
        eval_pred: ``(predictions, labels)`` tuple; labels are ignored
            because the positive answer is always the first of each pair.

    Returns:
        The dict produced by the ``accuracy`` metric, where a pair counts as
        correct when the positive answer scores at least as high as the
        negative one.
    """
    predictions, _ = eval_pred
    # The model emits a single score per sequence, so an argmax over axis 1
    # of the raw array would always be 0.  Regroup consecutive rows into
    # (pos, neg) pairs first, mirroring the view(-1, 2) in compute_loss.
    pair_scores = np.asarray(predictions).reshape(-1, 2)
    predictions = np.argmax(pair_scores, axis=1)
    return accuracy.compute(predictions=predictions, references=[0] * predictions.shape[0])
class RankLoss(nn.Module):
    """Pairwise ranking loss: mean of ``-log(sigmoid(pos - neg + eps))``.

    The loss shrinks as the positive score exceeds the negative score by a
    larger margin.
    """

    def __init__(self, eps=1e-8) -> None:
        super().__init__()
        self.log_sigmoid = nn.LogSigmoid()
        # Small constant added to the margin before the log-sigmoid.
        self.eps = eps

    def forward(self, pos, neg):
        """Return the scalar loss for matching tensors of pos/neg scores."""
        margin = pos - neg + self.eps
        log_likelihood = self.log_sigmoid(margin)
        return -log_likelihood.mean()
class RankTrainer(Trainer):
    """Trainer that optimises a pairwise ranking objective.

    The model is expected to return one logit per input sequence, with the
    batch laid out as consecutive (positive, negative) rows, which is the
    order in which DataCollatorForPairRank appends them.
    """

    def __init__(self, model: Union[PreTrainedModel, nn.Module] = None,
                 args: TrainingArguments = None,
                 data_collator: Optional[DataCollator] = None,
                 train_dataset: Optional[Dataset] = None,
                 eval_dataset: Optional[Dataset] = None,
                 tokenizer: Optional[PreTrainedTokenizerBase] = None,
                 model_init: Callable[[], PreTrainedModel] = None,
                 compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
                 callbacks: Optional[List[TrainerCallback]] = None,
                 optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
                 preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None):
        # Keyword arguments keep the call correct even if the parent
        # signature gains or reorders positional parameters.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        # Read from self.args, which the parent populates even when ``args``
        # is None, and fall back to 'rank' for argument objects that do not
        # define ``loss_function`` (e.g. a plain TrainingArguments).
        self.loss_function = getattr(self.args, 'loss_function', 'rank')
        self.loss_fct = RankLoss() if self.loss_function == 'rank' else nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the pairwise loss for one batch.

        Args:
            model: the model being trained.
            inputs: keyword arguments forwarded to ``model``.
            return_outputs: when True, also return the raw model outputs.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass it; unused here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # forward pass
        outputs = model(**inputs)
        # Regroup the flat logits so column 0 is the positive answer and
        # column 1 the negative answer of each pair.
        logits = outputs.get("logits").view(-1, 2)
        if self.loss_function == 'rank':
            loss = self.loss_fct(logits[:, 0], logits[:, 1])
        else:
            # Cross-entropy over the pair, with the positive answer (index 0)
            # as the target class.
            target = torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long)
            loss = self.loss_fct(logits, target)

        return (loss, outputs) if return_outputs else loss
if __name__ == "__main__":
    # Fine-tune a single-logit reward model on WebGPT comparison pairs.
    # Alternative backbone that was assigned here before being overwritten:
    # 'bigscience/bloomz-560m'
    model_name = 'google/electra-base-discriminator'
    # num_labels=1: the model emits one scalar score per (question, answer).
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type='regression')
    tokenizer = get_tokenizer(model_name)
    args = CustomTrainingArguments(
        output_dir=f"outputs/{model_name}-finetuned",
        fp16=True,
        num_train_epochs=4,
        warmup_steps=500,
        learning_rate=3e-5,
        # half_precision_backend="apex",
        gradient_checkpointing=False,
        gradient_accumulation_steps=6,
        per_device_train_batch_size=12,
        per_device_eval_batch_size=5,
        weight_decay=0.01,
        max_grad_norm=2.0,
        logging_steps=10,
        save_total_limit=4,
        evaluation_strategy='steps',
        loss_function='rank',
        eval_steps=500,
        save_steps=1000,
        report_to="wandb",
        run_name='reward-model'
    )

    dataset = WebGPT()
    # Named train_set / eval_set so the builtin ``eval`` is not shadowed.
    train_set, eval_set = train_val_dataset(dataset)
    collate_fn = DataCollatorForPairRank(tokenizer, max_length=400)
    trainer = RankTrainer(
        model,
        args,
        train_dataset=train_set,
        eval_dataset=eval_set,
        data_collator=collate_fn,
        tokenizer=tokenizer
    )
    trainer.train()
+23
View File
@@ -1,4 +1,7 @@
import re
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
re_reference_remove = re.compile(r'\[([0-9])+\]|\[([0-9])+,([0-9])+\]')
@@ -16,3 +19,23 @@ def webgpt_return_format(row):
'pos': re_reference_remove.sub('', row['answer_1']),
'neg': re_reference_remove.sub('', row['answer_0'])
}
def get_tokenizer(tokenizer_name):
    """Load the tokenizer for ``tokenizer_name``.

    When the name contains 'galactica', pad and eos special tokens are
    registered on the tokenizer before it is returned.
    """
    loaded = AutoTokenizer.from_pretrained(tokenizer_name)
    if 'galactica' not in tokenizer_name:
        return loaded

    special_tokens = {'pad_token': '<pad>', 'eos_token': '</s>'}
    loaded.add_special_tokens(special_tokens)
    return loaded
def train_val_dataset(dataset, val_split=0.2):
    """Split ``dataset`` into train and validation subsets.

    Indices are shuffled with a fixed random state (666), so the split is
    the same on every run.  The first ten indices of each side are printed.

    Args:
        dataset: any object supporting ``len``; wrapped in ``Subset``.
        val_split: ``test_size`` passed to ``train_test_split``.

    Returns:
        ``(train_subset, val_subset)``.
    """
    all_indices = list(range(len(dataset)))
    train_idx, val_idx = train_test_split(
        all_indices,
        test_size=val_split,
        random_state=666,
        shuffle=True,
    )
    # Sample output recorded by the original author:
    # [3879, 11479, 8341, 9177, 10798, 18177, 5735, 15669, 4837, 2760]
    print(val_idx[:10])
    # Sample output recorded by the original author:
    # [13582, 5919, 11875, 7373, 19135, 13706, 8555, 15788, 15005, 15209]
    print(train_idx[:10])
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    return train_subset, val_subset