diff --git a/.vscode/settings.json b/.vscode/settings.json index 56a51f78..4c58a32f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,4 @@ { - "python.formatting.provider": "black", + "python.formatting.provider": "autopep8", "python.analysis.extraPaths": ["${workspaceFolder}/oasst-shared"] } diff --git a/model/reward/instructor/experimental_dataset.py b/model/reward/instructor/experimental_dataset.py index 145588c4..f705ccf6 100644 --- a/model/reward/instructor/experimental_dataset.py +++ b/model/reward/instructor/experimental_dataset.py @@ -1,5 +1,11 @@ ''' - + HFSummary + + I want to train a multi regression model on axis_evals dataset mainly we can estimate the score of these score + + - {"overall": "6", "accuracy": "6", "coverage": "6", "coherence": "7"} + + Should be better than just a preference score ''' import os @@ -9,3 +15,5 @@ import torch import numpy as np from dataset import load_dataset from torch.utils.data import Dataset + + diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index 7fef5ab7..e407b30f 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -8,32 +8,51 @@ [ ] ''' +from typing import Optional, Union import os import glob import json +from dataclasses import dataclass import numpy as np from torch.utils.data import Dataset +import torch from datasets import load_dataset +from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy -class CollateFN(): - def __init__(self, tokenizer, max_length=400) -> None: - self.tokenizer = tokenizer - self.max_length = max_length +@dataclass +class DataCollatorForPairRank: + """ - def __call__(self, batch): - prompts = [] - pos_sentences = [] - neg_sentences = [] - for prompt, pairs in batch: + Data collator that will dynamically pad the inputs for multiple choice received. + + """ + tokenizer: PreTrainedTokenizerBase + num_choices: int = 2 + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + + def __call__(self, features): + + flatten_features = [] + batch_size = 0 + for question, pairs in features: for (pos, neg) in pairs: - prompts.append(prompt) - pos_sentences.append(pos) - neg_sentences.append(neg) - - batch = [self.tokenizer(prompts, pos_sentences, return_tensors='pt', max_length=self.max_length, padding=True, truncation=True),\ - self.tokenizer(prompts, neg_sentences, return_tensors='pt', max_length=self.max_length, padding=True, truncation=True)] + flatten_features.append(self.tokenizer(question, pos, truncation=True)) + flatten_features.append(self.tokenizer(question, neg, truncation=True)) + batch_size += 1 + + batch = self.tokenizer.pad( + flatten_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="pt", + ) + # batch = {k: v.view(batch_size, self.num_choices, -1) for k, v in batch.items()} return batch + class WebGPT(Dataset): def __init__(self) -> None: diff --git a/model/reward/instructor/tests/test_dataset.py b/model/reward/instructor/tests/test_dataset.py index 4dd59c16..c452786b 100644 --- a/model/reward/instructor/tests/test_dataset.py +++ b/model/reward/instructor/tests/test_dataset.py @@ -1,26 +1,26 @@ from transformers import AutoTokenizer from torch.utils.data import DataLoader -from rank_datasets import WebGPT, HFSummary, CollateFN +from rank_datasets import WebGPT, HFSummary, DataCollatorForMultipleChoice def test_hfsummary(): tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - collate_fn = CollateFN(tokenizer) + collate_fn = DataCollatorForMultipleChoice(tokenizer, max_length=200) dataset = HFSummary() dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=8) for batch in dataloader: - print(batch[0]['input_ids'].shape) + print(batch['input_ids'].shape) def test_webgpt(): tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - collate_fn = CollateFN(tokenizer) + collate_fn = DataCollatorForMultipleChoice(tokenizer, max_length=200) dataset = WebGPT() dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32) for batch in dataloader: - print(batch[0]['input_ids'].shape) + print(batch['input_ids'].shape) if __name__ == "__main__": diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index 9ee5e043..43a5f8ef 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -1,2 +1,102 @@ -import wandb -from accelerate import Accelerator +from typing import Callable, List, Optional, Tuple, Union, Dict +import torch +from torch import nn +import numpy as np +import evaluate +from dataclasses import dataclass +from torch.utils.data import Dataset +from transformers import AutoModelForSequenceClassification, AutoModelForMultipleChoice +from transformers import Trainer, PreTrainedModel, TrainingArguments, DataCollator, EvalPrediction, TrainerCallback, PreTrainedTokenizerBase +from rank_datasets import DataCollatorForPairRank, WebGPT +from utils import get_tokenizer, train_val_dataset + +accuracy = evaluate.load("accuracy") + +@dataclass +class CustomTrainingArguments(TrainingArguments): + loss_function: str='rank' + + +def compute_metrics(eval_pred): + predictions, _ = eval_pred + predictions = np.argmax(predictions, axis=1) + return accuracy.compute(predictions=predictions, references=[0]*predictions.shape[0]) + +class RankLoss(nn.Module): + def __init__(self, eps=1e-8) -> None: + super().__init__() + self.eps = eps + self.log_sigmoid = nn.LogSigmoid() + + def forward(self, pos, neg): + return -self.log_sigmoid(pos - neg + self.eps).mean() + + +class RankTrainer(Trainer): + def __init__(self, model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None): + super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer, + model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics) + self.loss_fct = RankLoss() if args.loss_function == 'rank' else nn.CrossEntropyLoss() + self.loss_function = args.loss_function + + def compute_loss(self, model, inputs, return_outputs=False): + # forward pass + outputs = model(**inputs) + logits = outputs.get("logits").view(-1, 2) + if self.loss_function == 'rank': + loss = self.loss_fct(logits[:, 0], logits[:, 1]) + else: + loss = self.loss_fct(logits, torch.zeros(logits.shape[0], device=logits.device, dtype=torch.long)) + + return (loss, outputs) if return_outputs else loss + + +if __name__ == "__main__": + model_name = 'bigscience/bloomz-560m' + model_name = 'google/electra-base-discriminator' + model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type='regression') + tokenizer = get_tokenizer(model_name) + args = CustomTrainingArguments( + output_dir=f"outputs/{model_name}-finetuned", + fp16=True, + num_train_epochs=4, + warmup_steps=500, + learning_rate=3e-5, + # half_precision_backend="apex", + gradient_checkpointing=False, + gradient_accumulation_steps=6, + per_device_train_batch_size=12, + per_device_eval_batch_size=5, + weight_decay=0.01, + max_grad_norm=2.0, + logging_steps=10, + save_total_limit=4, + evaluation_strategy='steps', + loss_function='rank', + eval_steps=500, + save_steps=1000, + report_to="wandb", + run_name='reward-model' + ) + dataset = WebGPT() + train, eval = train_val_dataset(dataset) + collate_fn = DataCollatorForPairRank(tokenizer, max_length=400) + trainer = RankTrainer( + model, + args, + train_dataset=train, + eval_dataset=eval, + data_collator=collate_fn, + tokenizer=tokenizer + ) + trainer.train() diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index 1487947c..10f84193 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -1,4 +1,7 @@ import re +from torch.utils.data import Subset +from sklearn.model_selection import train_test_split +from transformers import AutoTokenizer re_reference_remove = re.compile(r'\[([0-9])+\]|\[([0-9])+,([0-9])+\]') @@ -16,3 +19,23 @@ def webgpt_return_format(row): 'pos': re_reference_remove.sub('', row['answer_1']), 'neg': re_reference_remove.sub('', row['answer_0']) } + + +def get_tokenizer(tokenizer_name): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + if 'galactica' in tokenizer_name: + tokenizer.add_special_tokens({'pad_token':'', 'eos_token': '' }) + + return tokenizer + + + +def train_val_dataset(dataset, val_split=0.2): + train_idx, val_idx = train_test_split(list(range(len(dataset))), + test_size=val_split, random_state=666, shuffle=True) + # [3879, 11479, 8341, 9177, 10798, 18177, 5735, 15669, 4837, 2760] + print(val_idx[:10]) + # [13582, 5919, 11875, 7373, 19135, 13706, 8555, 15788, 15005, 15209] + print(train_idx[:10]) + return Subset(dataset, train_idx), Subset(dataset, val_idx) +