mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-29 16:30:24 +08:00
[feature] working trainer code
This commit is contained in:
Vendored
+1
-1
@@ -1,4 +1,4 @@
|
||||
{
|
||||
"python.formatting.provider": "black",
|
||||
"python.formatting.provider": "autopep8",
|
||||
"python.analysis.extraPaths": ["${workspaceFolder}/oasst-shared"]
|
||||
}
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
'''
|
||||
|
||||
HFSummary
|
||||
|
||||
I want to train a multi-regression model on the axis_evals dataset, so that we can estimate each of these scores:
|
||||
|
||||
- {"overall": "6", "accuracy": "6", "coverage": "6", "coherence": "7"}
|
||||
|
||||
Should be better than just a preference score
|
||||
|
||||
'''
|
||||
import os
|
||||
@@ -9,3 +15,5 @@ import torch
|
||||
import numpy as np
|
||||
from dataset import load_dataset
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
|
||||
@@ -8,32 +8,51 @@
|
||||
[ ]
|
||||
|
||||
'''
|
||||
from typing import Optional, Union
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from torch.utils.data import Dataset
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
|
||||
class CollateFN():
    """Collate ``(prompt, pairs)`` samples into two padded tokenizer batches.

    Each ``(pos, neg)`` pair of a sample is matched with that sample's prompt.
    The prompt/positive texts and the prompt/negative texts are then tokenized
    as two separate batches.
    """

    def __init__(self, tokenizer, max_length=400) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Return ``[positive_batch, negative_batch]`` for the given samples."""
        prompts = []
        pos_sentences = []
        neg_sentences = []
        for prompt, pairs in batch:
            for (pos, neg) in pairs:
                prompts.append(prompt)
                pos_sentences.append(pos)
                neg_sentences.append(neg)

        batch = [
            self.tokenizer(prompts, pos_sentences, return_tensors='pt',
                           max_length=self.max_length, padding=True, truncation=True),
            self.tokenizer(prompts, neg_sentences, return_tensors='pt',
                           max_length=self.max_length, padding=True, truncation=True),
        ]
        return batch


@dataclass
class DataCollatorForPairRank:
    """Data collator that dynamically pads (question, answer) pairs for ranking.

    Each feature is a ``(question, pairs)`` tuple where ``pairs`` is an iterable
    of ``(pos, neg)`` answers. Every pair contributes two tokenized sequences,
    ``question + pos`` followed by ``question + neg``, so the rows of the
    returned batch are interleaved as pos, neg, pos, neg, ...
    """

    tokenizer: PreTrainedTokenizerBase
    # Kept for interface compatibility; the batch is returned flat (see __call__).
    num_choices: int = 2
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Tokenize every (question, answer) combination and pad to one batch.

        Args:
            features: iterable of ``(question, pairs)`` samples.

        Returns:
            The output of ``tokenizer.pad`` with ``return_tensors="pt"``; it
            holds two rows per (pos, neg) pair, in that order.
        """
        flatten_features = []
        for question, pairs in features:
            for (pos, neg) in pairs:
                # Pass max_length explicitly: with truncation=True alone the
                # configured max_length would not be used for truncation.
                flatten_features.append(
                    self.tokenizer(question, pos, truncation=True, max_length=self.max_length))
                flatten_features.append(
                    self.tokenizer(question, neg, truncation=True, max_length=self.max_length))

        batch = self.tokenizer.pad(
            flatten_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # The batch is intentionally left flat (not reshaped to
        # (batch_size, num_choices, -1)); consumers regroup the interleaved
        # rows into (pos, neg) pairs themselves.
        return batch
|
||||
|
||||
|
||||
class WebGPT(Dataset):
|
||||
|
||||
def __init__(self) -> None:
|
||||
|
||||
@@ -1,26 +1,26 @@
|
||||
from transformers import AutoTokenizer
|
||||
from torch.utils.data import DataLoader
|
||||
from rank_datasets import WebGPT, HFSummary, CollateFN
|
||||
from rank_datasets import WebGPT, HFSummary, DataCollatorForMultipleChoice
|
||||
|
||||
|
||||
def test_hfsummary():
    """Smoke test: run HFSummary through the collator and print batch shapes."""
    tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
    # NOTE(review): the collator visible in rank_datasets is named
    # DataCollatorForPairRank -- confirm DataCollatorForMultipleChoice is
    # still exported under this name.
    collate_fn = DataCollatorForMultipleChoice(tokenizer, max_length=200)
    dataset = HFSummary()
    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=8)
    for batch in dataloader:
        # The collator yields a single padded batch, not a [pos, neg] list,
        # so the tensors are read by key rather than via batch[0].
        print(batch['input_ids'].shape)
|
||||
|
||||
|
||||
def test_webgpt():
    """Smoke test: run WebGPT through the collator and print batch shapes."""
    tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")
    # NOTE(review): the collator visible in rank_datasets is named
    # DataCollatorForPairRank -- confirm DataCollatorForMultipleChoice is
    # still exported under this name.
    collate_fn = DataCollatorForMultipleChoice(tokenizer, max_length=200)
    dataset = WebGPT()
    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)
    for batch in dataloader:
        # The collator yields a single padded batch, not a [pos, neg] list,
        # so the tensors are read by key rather than via batch[0].
        print(batch['input_ids'].shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,2 +1,102 @@
|
||||
import wandb
|
||||
from accelerate import Accelerator
|
||||
from typing import Callable, List, Optional, Tuple, Union, Dict
|
||||
import torch
|
||||
from torch import nn
|
||||
import numpy as np
|
||||
import evaluate
|
||||
from dataclasses import dataclass
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoModelForMultipleChoice
|
||||
from transformers import Trainer, PreTrainedModel, TrainingArguments, DataCollator, EvalPrediction, TrainerCallback, PreTrainedTokenizerBase
|
||||
from rank_datasets import DataCollatorForPairRank, WebGPT
|
||||
from utils import get_tokenizer, train_val_dataset
|
||||
|
||||
accuracy = evaluate.load("accuracy")
|
||||
|
||||
@dataclass
class CustomTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the name of the training loss."""

    # 'rank' makes RankTrainer use RankLoss; any other value makes it use
    # nn.CrossEntropyLoss.
    loss_function: str = 'rank'
|
||||
|
||||
|
||||
def compute_metrics(eval_pred):
    """Compute pairwise ranking accuracy.

    A pair counts as correct when the positive answer (column 0) scores
    higher than the negative answer (column 1), hence all references are 0.

    Args:
        eval_pred: ``(predictions, labels)``; labels are ignored.

    Returns:
        The result of ``accuracy.compute`` for the pairwise predictions.
    """
    predictions, _ = eval_pred
    scores = np.asarray(predictions)
    # A single-logit head yields one score per sequence, with rows interleaved
    # as pos, neg, pos, neg, ... Regroup them into (pos, neg) pairs; otherwise
    # argmax over a single column is always 0 and accuracy is trivially 1.0.
    if scores.ndim == 1 or scores.shape[-1] == 1:
        scores = scores.reshape(-1, 2)
    predicted = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted, references=[0] * predicted.shape[0])
|
||||
|
||||
class RankLoss(nn.Module):
    """Pairwise ranking loss: mean of ``-log(sigmoid(pos - neg + eps))``."""

    def __init__(self, eps=1e-8) -> None:
        super().__init__()
        self.log_sigmoid = nn.LogSigmoid()
        self.eps = eps

    def forward(self, pos, neg):
        """Return the mean negative log-sigmoid of the score margin."""
        margin = pos - neg + self.eps
        log_prob = self.log_sigmoid(margin)
        return -log_prob.mean()
|
||||
|
||||
|
||||
class RankTrainer(Trainer):
    """Trainer that optimises a pairwise objective over (pos, neg) logits.

    The model's logits are regrouped into two columns per pair. With the
    'rank' loss, column 0 is pushed above column 1 through RankLoss; with any
    other loss name, cross-entropy with target class 0 is used.
    """

    def __init__(self, model: Union[PreTrainedModel, nn.Module] = None,
                 args: TrainingArguments = None,
                 data_collator: Optional[DataCollator] = None,
                 train_dataset: Optional[Dataset] = None,
                 eval_dataset: Optional[Dataset] = None,
                 tokenizer: Optional[PreTrainedTokenizerBase] = None,
                 model_init: Callable[[], PreTrainedModel] = None,
                 compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
                 callbacks: Optional[List[TrainerCallback]] = None,
                 optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
                 preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None):
        # Forward by keyword so the call does not depend on the positional
        # order of the base-class signature.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        # `args` may be None or a plain TrainingArguments without a
        # `loss_function` field. Read it from the resolved self.args and fall
        # back to 'rank', the default of CustomTrainingArguments.
        # NOTE(review): relies on Trainer populating self.args when args is None.
        self.loss_function = getattr(self.args, 'loss_function', 'rank')
        self.loss_fct = RankLoss() if self.loss_function == 'rank' else nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run the model and compute the pairwise loss.

        Args:
            model: the model being trained.
            inputs: keyword arguments passed to ``model``.
            return_outputs: when true, also return the raw model outputs.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # forward pass
        outputs = model(**inputs)
        # Regroup the logits into two columns: column 0 = pos, column 1 = neg.
        logits = outputs.get("logits").view(-1, 2)
        if self.loss_function == 'rank':
            loss = self.loss_fct(logits[:, 0], logits[:, 1])
        else:
            # Cross-entropy with the positive answer (index 0) as the target.
            targets = torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long)
            loss = self.loss_fct(logits, targets)

        return (loss, outputs) if return_outputs else loss
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Backbone to fine-tune. 'bigscience/bloomz-560m' was also listed here,
    # but its assignment was immediately overwritten by this one.
    model_name = 'google/electra-base-discriminator'
    # num_labels=1: the model emits a single scalar score per sequence.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type='regression')
    tokenizer = get_tokenizer(model_name)
    args = CustomTrainingArguments(
        output_dir=f"outputs/{model_name}-finetuned",
        fp16=True,
        num_train_epochs=4,
        warmup_steps=500,
        learning_rate=3e-5,
        # half_precision_backend="apex",
        gradient_checkpointing=False,
        gradient_accumulation_steps=6,
        per_device_train_batch_size=12,
        per_device_eval_batch_size=5,
        weight_decay=0.01,
        max_grad_norm=2.0,
        logging_steps=10,
        save_total_limit=4,
        evaluation_strategy='steps',
        loss_function='rank',
        eval_steps=500,
        save_steps=1000,
        report_to="wandb",
        run_name='reward-model'
    )
    dataset = WebGPT()
    # Named *_split rather than `eval` to avoid shadowing the builtin.
    train_split, eval_split = train_val_dataset(dataset)
    collate_fn = DataCollatorForPairRank(tokenizer, max_length=400)
    trainer = RankTrainer(
        model,
        args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        data_collator=collate_fn,
        tokenizer=tokenizer
    )
    trainer.train()
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
import re
|
||||
from torch.utils.data import Subset
|
||||
from sklearn.model_selection import train_test_split
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
re_reference_remove = re.compile(r'\[([0-9])+\]|\[([0-9])+,([0-9])+\]')
|
||||
|
||||
@@ -16,3 +19,23 @@ def webgpt_return_format(row):
|
||||
'pos': re_reference_remove.sub('', row['answer_1']),
|
||||
'neg': re_reference_remove.sub('', row['answer_0'])
|
||||
}
|
||||
|
||||
|
||||
def get_tokenizer(tokenizer_name):
    """Load the pretrained tokenizer named ``tokenizer_name``.

    For names containing 'galactica', the pad and eos special tokens are
    registered on the tokenizer before it is returned.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    needs_special_tokens = 'galactica' in tokenizer_name
    if needs_special_tokens:
        special_tokens = {'pad_token': '<pad>', 'eos_token': '</s>'}
        tok.add_special_tokens(special_tokens)

    return tok
|
||||
|
||||
|
||||
|
||||
def train_val_dataset(dataset, val_split=0.2, random_state=666):
    """Split ``dataset`` into shuffled train and validation subsets.

    Args:
        dataset: a sized dataset (``len(dataset)`` is used to build indices).
        val_split: ``test_size`` passed to ``train_test_split``.
        random_state: seed for the shuffle; the default 666 keeps the
            original split.

    Returns:
        ``(train_subset, val_subset)`` as ``torch.utils.data.Subset`` objects.
    """
    train_idx, val_idx = train_test_split(list(range(len(dataset))),
                                          test_size=val_split, random_state=random_state, shuffle=True)
    # The index lists below are the author's recorded reference output
    # (presumably for the default seed); the prints allow a quick check
    # that the split is reproducible.
    # [3879, 11479, 8341, 9177, 10798, 18177, 5735, 15669, 4837, 2760]
    print(val_idx[:10])
    # [13582, 5919, 11875, 7373, 19135, 13706, 8555, 15788, 15005, 15209]
    print(train_idx[:10])
    return Subset(dataset, train_idx), Subset(dataset, val_idx)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user