From f8eba6854456259f1c0d996939ac530ececc77cb Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Wed, 1 Feb 2023 22:14:11 +0000 Subject: [PATCH 01/11] [feature] Add mix conversation augmentation --- .../custom_datasets/qa_datasets.py | 77 ++++++++++++++++++- .../custom_datasets/summarization.py | 2 +- .../custom_datasets/translation.py | 28 +++++-- 3 files changed, 95 insertions(+), 12 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 2c5c7ee2..2acf9106 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -3,6 +3,7 @@ """ import json import os +import random import re from urllib.request import urlopen @@ -115,7 +116,7 @@ class QADataset(Dataset): "reddit_asks": {"name": "eli5", "index_fn": index_eli5, "split_postfix": "_asks"}, } - def __init__(self, dataset, cache_dir, split): + def __init__(self, dataset, cache_dir, split, mix_prob=0.2): self.no_val = False if dataset in self.DATASET_FORMAT_MAPPING: context = self.DATASET_FORMAT_MAPPING[dataset] @@ -137,11 +138,25 @@ class QADataset(Dataset): self.dataset = load_dataset(context["name"], **context["params"]) else: raise ValueError("Unknown dataset : " + dataset) + self.length = len(self.dataset) + self.mix_prob = mix_prob def __len__(self): - return len(self.dataset) + return self.length def __getitem__(self, idx): + if self.mix_prob > 0 and random.random() < self.mix_prob and idx > 5 and idx < (self.length - 5): + + additional = random.randint(0, 10) - 5 + while additional == idx: + additional = random.randint(0, 10) - 5 + + answer_pair = self.index_fn(self.dataset[additional + idx]) + history_text = "".join(format_pair(answer_pair)) + question, answer = self.index_fn(self.dataset[idx]) + question = history_text + question + return format_pair((question, answer)) + data = self.dataset[idx] return 
format_pair(self.index_fn(data)) @@ -297,8 +312,9 @@ class JokeExplaination(Dataset): name = "joke" url = "https://gist.github.com/theblackcat102/42b697e24a13fdb499e20edfbf618361/raw/1834dca207898c15f93b809d1195f6f6e47c9e1e/joke_explained.jsonl" - def __init__(self, cache_dir) -> None: + def __init__(self, cache_dir, mix_prob=0.2) -> None: super().__init__() + self.mix_prob = mix_prob os.makedirs(cache_dir, exist_ok=True) joke_explain_filename = os.path.join(cache_dir, "joke_explaination.jsonl") if not os.path.exists(joke_explain_filename): @@ -319,9 +335,62 @@ class JokeExplaination(Dataset): if len(question) > 0 and len(answer) > 0: self.pairs.append((question, answer)) + self.length = len(self.pairs) def __len__(self): - return len(self.pairs) + return self.length def __getitem__(self, index): + if random.random() < self.mix_prob and index > 5 and index < (self.length - 5): + additional = random.randint(0, 10) - 5 + while additional == index: + additional = random.randint(0, 10) - 5 + + history_text = "".join(format_pair(self.pairs[additional + index])) + question, answer = self.pairs[index] + question = history_text + question + return format_pair((question, answer)) + + return format_pair(self.pairs[index]) + + +class TranslatedQA(Dataset): + + name = "oa_translated" + + def __init__(self, cache_dir, mix_prob=0.2) -> None: + super().__init__() + self.mix_prob = mix_prob + os.makedirs(cache_dir, exist_ok=True) + path = os.path.join(cache_dir, "oa_translated") + os.makedirs(path, exist_ok=True) + import glob + + self.pairs = [] + for translated_jsonl in glob.glob(os.path.join(path, "*.jsonl")): + with open(translated_jsonl, "r") as f: + for line in f: + data = json.loads(line) + if "Python " in data["text"]: + continue + # incorrect, TODO: fix later + for convo_round in data["translate"]: + self.pairs.append((convo_round["human"], convo_round["answer"])) + + self.length = len(self.pairs) + + def __len__(self): + return self.length + + def __getitem__(self, 
index): + if random.random() < self.mix_prob and index > 5 and index < (self.length - 5): + additional = random.randint(0, 10) - 5 + while additional == index: + additional = random.randint(0, 10) - 5 + + history_text = "".join(format_pair(self.pairs[additional + index])) + question, answer = self.pairs[index] + question = history_text + question + return format_pair((question, answer)) + return format_pair(self.pairs[index]) diff --git a/model/supervised_finetuning/custom_datasets/summarization.py b/model/supervised_finetuning/custom_datasets/summarization.py index 834fa16c..aa644691 100644 --- a/model/supervised_finetuning/custom_datasets/summarization.py +++ b/model/supervised_finetuning/custom_datasets/summarization.py @@ -57,7 +57,7 @@ def index_summary_merge(text, summary): class SummarizationDataset(Dataset): def __init__(self, dataset, cache_dir, split, max_words=512): self.name = dataset - if summarization_config_mapping[dataset][0] in ["billsum", "tldr_news"] & split == "validation": + if (dataset in ["billsum", "tldr_news"]) and (split == "validation"): split = "test" self.dataset = load_dataset(*summarization_config_mapping[dataset], cache_dir=cache_dir, split=split) self.text_column, self.summary_column = summarization_name_mapping[dataset] diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index f9a71a8e..18cb9a09 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ b/model/supervised_finetuning/custom_datasets/translation.py @@ -75,20 +75,34 @@ TRANSLATION_PROMPT = { class TranslationPair(Dataset): - def __init__(self) -> None: + def __init__(self, mix_prob=0.2) -> None: super().__init__() self.pairs = [] + self.length = -1 + self.mix_prob = mix_prob def __len__(self): + if self.length < 0: + self.length = len(self.pairs) return len(self.pairs) def __getitem__(self, index): + if random.random() < self.mix_prob and index > 5 and index < 
(self.length - 5): + additional = random.randint(0, 10) - 5 + while additional == index: + additional = random.randint(0, 10) - 5 + + history_text = "".join(format_pair(self.pairs[additional + index])) + question, answer = self.pairs[index] + question = history_text + question + return format_pair((question, answer)) + return format_pair(self.pairs[index]) class WMT2019(TranslationPair): - def __init__(self, pair="zh-en", split="train") -> None: - super().__init__() + def __init__(self, pair="zh-en", split="train", mix_prob=0.2) -> None: + super().__init__(mix_prob=mix_prob) dataset = load_dataset("wmt19", pair)[split] self.pairs = [] src, tgt = pair.split("-") @@ -108,8 +122,8 @@ class DiveMT(TranslationPair): REMAP = {"tur": "tr", "ita": "it", "ukr": "uk", "nld": "nl", "vie": "vi", "ara": "ar"} - def __init__(self, split="train") -> None: - super().__init__() + def __init__(self, split="train", mix_prob=0.2) -> None: + super().__init__(mix_prob=mix_prob) dataset = load_dataset("GroNLP/divemt", "main")[split] tgt, src = "tgt_text", "src_text" for row in dataset: @@ -131,8 +145,8 @@ class DiveMT(TranslationPair): class TEDTalk(TranslationPair): # NOTE: DO NOT use chinese pair, mix with traditional and cantonese, not clean - def __init__(self, pair="de-ja", split="train", year="2016") -> None: - super().__init__() + def __init__(self, pair="de-ja", split="train", year="2016", mix_prob=0.2) -> None: + super().__init__(mix_prob=mix_prob) dataset = load_dataset("ted_talks_iwslt", language_pair=pair.split("-"), year=year)[split] src, tgt = pair.split("-") for row in dataset: From 9be4c921cdf9773f677beac742a995830095ae63 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Wed, 1 Feb 2023 22:33:37 +0000 Subject: [PATCH 02/11] [feature] Add OA translated QA --- model/supervised_finetuning/custom_datasets/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/model/supervised_finetuning/custom_datasets/__init__.py 
b/model/supervised_finetuning/custom_datasets/__init__.py index ee061f04..f664ceaf 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -2,7 +2,7 @@ High level functions for model training """ from custom_datasets.prompt_dialogue import InstructionTuning, PromptGeneratedDataset -from custom_datasets.qa_datasets import SODA, JokeExplaination, QADataset, SODADialogue, WebGPT +from custom_datasets.qa_datasets import SODA, JokeExplaination, QADataset, SODADialogue, TranslatedQA, WebGPT from custom_datasets.summarization import SummarizationDataset from custom_datasets.toxic_conversation import ProsocialDialogue, ProsocialDialogueExplaination from custom_datasets.translation import WMT2019, DiveMT, TEDTalk @@ -92,6 +92,9 @@ def get_one_dataset(conf, dataset_name): elif dataset_name == "instruct_tuning": dataset = InstructionTuning(conf.cache_dir) train, eval = train_val_dataset(dataset, val_split=0.2) + elif dataset_name == "translate_qa": + dataset = TranslatedQA(conf.cache_dir) + train, eval = train_val_dataset(dataset, val_split=0.01) else: raise ValueError(f"Unknown dataset {dataset_name}") From 1041564db7fa7cc4bb3f62710c9e1c10ef7b4218 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Fri, 3 Feb 2023 00:15:29 +0000 Subject: [PATCH 03/11] [feature] mix generation from different tasks --- .../custom_datasets/dialogue_collator.py | 39 ++++++++++++++++++- .../tests/test_datasets.py | 11 ++++-- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/dialogue_collator.py b/model/supervised_finetuning/custom_datasets/dialogue_collator.py index c96ed576..d812bb26 100644 --- a/model/supervised_finetuning/custom_datasets/dialogue_collator.py +++ b/model/supervised_finetuning/custom_datasets/dialogue_collator.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass from typing import Optional, Union @@ -16,12 +17,14 @@ class 
DialogueDataCollator: tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None + mix_length_threshold: Optional[int] = 256 + mix_probability: Optional[int] = 0.6 pad_to_multiple_of: Optional[int] = None def __call__(self, features): flatten_messages = [] label_masks = [] - + total_short_context = 0 for messages in features: messages = list(messages) @@ -58,8 +61,40 @@ class DialogueDataCollator: label_mask[-1] = False label_masks.append(label_mask) - + if len(flatten_message["input_ids"]) < self.mix_length_threshold: + total_short_context += len(flatten_message["input_ids"]) flatten_messages.append({k: v for k, v in flatten_message.items() if k != "offset_mapping"}) + # packing + if total_short_context > 2: + _flatten_messages, _label_masks = [], [] + prev_short_msg, prev_short_mask = None, None + for flatten_msg, label_mask in zip(flatten_messages, label_masks): + if len(flatten_msg["input_ids"]) < self.mix_length_threshold and random.random() > 0.6: + if prev_short_msg is not None: + for key in flatten_msg.keys(): + flatten_msg[key] += prev_short_msg[key] + flatten_msg[key] = flatten_msg[key][: self.max_length] + label_mask = np.concatenate([label_mask, prev_short_mask]) + _label_masks.append(label_mask[: self.max_length]) + _flatten_messages.append(flatten_msg) + # reset + prev_short_msg, prev_short_mask = None, None + else: + # prime + prev_short_msg, prev_short_mask = flatten_msg, label_mask + else: + _label_masks.append(label_mask) + _flatten_messages.append(flatten_msg) + if prev_short_msg is not None: + for key in flatten_msg.keys(): + flatten_msg[key] += prev_short_msg[key] + flatten_msg[key] = flatten_msg[key][: self.max_length] + label_mask = np.concatenate([label_mask, prev_short_mask])[: self.max_length] + _label_masks.append(label_mask) + _flatten_messages.append(flatten_msg) + + label_masks = _label_masks + flatten_messages = _flatten_messages batch = self.tokenizer.pad( flatten_messages, 
diff --git a/model/supervised_finetuning/tests/test_datasets.py b/model/supervised_finetuning/tests/test_datasets.py index 8d5ad08f..2a0d4481 100644 --- a/model/supervised_finetuning/tests/test_datasets.py +++ b/model/supervised_finetuning/tests/test_datasets.py @@ -27,7 +27,7 @@ def test_collate_fn(): config = Namespace(cache_dir=".cache", model_name="Salesforce/codegen-2B-multi") tokenizer = get_tokenizer(config) - collate_fn = DialogueDataCollator(tokenizer, max_length=512) + collate_fn = DialogueDataCollator(tokenizer, max_length=620) qa_base = QA_DATASETS summarize_base = SUMMARIZATION_DATASETS others = ["prompt_dialogue", "webgpt", "soda", "joke", "gsm8k"] @@ -40,11 +40,14 @@ def test_collate_fn(): dataloader = DataLoader(ConcatDataset(trains), collate_fn=collate_fn, batch_size=128) for batch in dataloader: - print(batch.keys()) + print(batch["targets"].shape[0]) print(tokenizer.decode(batch["input_ids"][0])) print("-----") print(tokenizer.decode(batch["targets"][0][batch["label_masks"][0]])) - assert batch["targets"].shape[1] <= 512 + assert batch["targets"].shape[1] <= 620 dataloader = DataLoader(ConcatDataset(evals), collate_fn=collate_fn, batch_size=128) for batch in dataloader: - assert batch["targets"].shape[1] <= 512 + assert batch["targets"].shape[1] <= 620 + + +test_collate_fn() From 8b2080559ce08cecb8ae022583cb7fdf06505b4b Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Fri, 3 Feb 2023 06:08:01 +0000 Subject: [PATCH 04/11] [fix] Custom collate_fn for training --- .../custom_datasets/dialogue_collator.py | 71 +++++++++++++++++++ model/supervised_finetuning/trainer.py | 60 +++++++++++++++- model/supervised_finetuning/utils.py | 6 +- 3 files changed, 131 insertions(+), 6 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/dialogue_collator.py b/model/supervised_finetuning/custom_datasets/dialogue_collator.py index d812bb26..0a0b7a5a 100644 --- a/model/supervised_finetuning/custom_datasets/dialogue_collator.py +++ 
b/model/supervised_finetuning/custom_datasets/dialogue_collator.py @@ -14,6 +14,77 @@ class DialogueDataCollator: Expects a list of texts corresponding to a sequence of [question, answer, question, answer, ...] pairs. """ + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + + def __call__(self, features): + flatten_messages = [] + label_masks = [] + + for messages in features: + messages = list(messages) + + # Add a way for the model to terminate generation + # When we predict the start of a new expected question, we want to be able to stop generation + messages.append(self.tokenizer.eos_token) + + flatten_message = self.tokenizer( + "".join(messages), + truncation=True, + max_length=self.max_length, + return_offsets_mapping=True, + ) + + message_change_indices = np.cumsum([len(x) for x in messages[:-1]]) + # for each token an integer indicating the index of the message it belongs to. Just to create the label mask. + # Label mask is true when predicting a token that is part of the answer, false otherwise. + # TEXT: Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John. 
Question: + # MESSAGE_INDICES: 0 0 0 0 0 0 1 1 1 2 2 2 2 2 2 3 3 3 3 -2 + # LABEL_MASK: 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 + + # If no result in next, we are predicting the last termination token(s) + message_indices = list( + map( + lambda x: next((i for i, val in enumerate(message_change_indices) if val >= x), -2), + list(map(lambda x: x[1], flatten_message["offset_mapping"])), + ) + ) + label_mask = np.roll(list(map(lambda x: x % 2 == 1, message_indices)), -1, -1) + try: + label_mask[[i for i in range(len(message_indices)) if message_indices[i] == -2][0] - 1] = True + except IndexError: + # due to truncation, we might not have the last termination token + label_mask[-1] = False + + label_masks.append(label_mask) + + flatten_messages.append({k: v for k, v in flatten_message.items() if k != "offset_mapping"}) + + batch = self.tokenizer.pad( + flatten_messages, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="pt", + ) + dim = batch["input_ids"].shape[-1] + + batch["label_masks"] = torch.stack( + [F.pad(torch.tensor(x), (0, dim - len(x)), value=False) for x in label_masks] + ) + batch["targets"] = torch.roll(batch["input_ids"], -1, -1) + + return batch + + +@dataclass +class TrainDialogueDataCollator: + """ + Expects a list of texts corresponding to a sequence of [question, answer, question, answer, ...] pairs. 
+ """ + tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index 0acb10dd..043534ea 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -1,13 +1,17 @@ import argparse from distutils.util import strtobool from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import bitsandbytes +import datasets import torch from torch import nn +from torch.utils.data import DataLoader from transformers import PreTrainedModel, Trainer, TrainingArguments +from transformers.trainer_pt_utils import IterableDatasetShard, seed_worker from transformers.training_args import OptimizerNames +from transformers.utils import is_datasets_available from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls @@ -30,12 +34,13 @@ class SFTTrainer(Trainer): self, model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, + train_collate_fn: Callable = None, loss_function: str = "CrossEntropyLoss", poly_eps: float = 1.0, **kwargs, ): super().__init__(model, args, **kwargs) - + self.train_collate_fn = train_collate_fn # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct self.loss_fct = get_loss(loss_function, poly_eps) @@ -88,6 +93,54 @@ class SFTTrainer(Trainer): return (loss, logits, labels) + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed + training if necessary) otherwise. + Subclass and override this method if you want to inject some custom behavior. 
+ """ + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.train_collate_fn + if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="training") + + if isinstance(train_dataset, torch.utils.data.IterableDataset): + if self.args.world_size > 1: + train_dataset = IterableDatasetShard( + train_dataset, + batch_size=self._train_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.world_size, + process_index=self.args.process_index, + ) + + return DataLoader( + train_dataset, + batch_size=self.args.per_device_train_batch_size, + collate_fn=data_collator, + num_workers=self.args.dataloader_num_workers, + pin_memory=self.args.dataloader_pin_memory, + ) + + train_sampler = self._get_train_sampler() + + return DataLoader( + train_dataset, + batch_size=self._train_batch_size, + sampler=train_sampler, + collate_fn=data_collator, + drop_last=self.args.dataloader_drop_last, + num_workers=self.args.dataloader_num_workers, + pin_memory=self.args.dataloader_pin_memory, + worker_init_fn=seed_worker, + ) + def _strtobool(x): return bool(strtobool(x)) @@ -140,7 +193,7 @@ if __name__ == "__main__": tokenizer = get_tokenizer(training_conf) model = get_model(training_conf, tokenizer) - train, evals, collate_fn = get_dataset(training_conf, tokenizer) + train, evals, collate_fn, train_collate_fn = get_dataset(training_conf, tokenizer) metrics, preprocess_fns = get_metrics(training_conf, tokenizer) optimizer = OptimizerNames.ADAMW_BNB if training_conf.quantization else OptimizerNames.ADAMW_HF @@ -190,6 +243,7 @@ if __name__ == "__main__": trainer = SFTTrainer( model, args, + train_collate_fn=train_collate_fn, loss_function=training_conf.loss_fn, 
poly_eps=training_conf.poly_eps, train_dataset=train, diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index f7a0ab15..ba0e5539 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -8,7 +8,7 @@ import evaluate import transformers import yaml from custom_datasets import get_one_dataset -from custom_datasets.dialogue_collator import DialogueDataCollator +from custom_datasets.dialogue_collator import DialogueDataCollator, TrainDialogueDataCollator from custom_datasets.qa_datasets import QA_SPECIAL_TOKENS from losses import CrossEntropyLoss, PolyLoss from models import freeze_top_n_layers, get_specific_model @@ -126,8 +126,8 @@ def get_dataset(conf, tokenizer): train = ConcatDataset(train_datasets) collate_fn = DialogueDataCollator(tokenizer, max_length=conf.max_length) - - return train, evals, collate_fn + train_collate_fn = TrainDialogueDataCollator(tokenizer, max_length=conf.max_length) + return train, evals, collate_fn, train_collate_fn def get_loss(loss, poly_eps): From 0be4d88605a007b5eabc0cf58220599b9211108b Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Fri, 3 Feb 2023 15:07:05 +0000 Subject: [PATCH 05/11] [feature] Add OA private RM dataset --- model/reward/instructor/rank_datasets.py | 42 ++++++++++++++++++++++++ model/reward/instructor/utils.py | 8 ++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index 330c0a9f..bf572f4e 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -18,6 +18,7 @@ """ +import os from dataclasses import dataclass from typing import Dict, List, Optional, Union @@ -298,3 +299,44 @@ class AnthropicRLHF(Dataset): context, pair = self.pairs[index] return context, [pair] + + +class OAPrivate(Dataset): + """ + { + "prompt": , + "history": [("prompt1", "answer2"), ("prompt2", "answer2")], + "pos": , + 
"neg_replies": [list of bad answers] + } + """ + + split_name_mapping = { + "train": "rm_train.jsonl", + "test": "rm_test.jsonl", + "val": "rm_val.jsonl", + } + + def __init__(self, split="train", sep_token="", data_path=".cache") -> None: + super().__init__() + import json + + jsonl_file = os.path.join(data_path, self.split_name_mapping[split]) + self.pairs = [] + with open(jsonl_file, "r", encoding="utf-8") as f: + for line in f: + data = json.loads(line) + prefix = sep_token.join([sep_token.join(p) for p in data["history"][-2:]]) + prefix += sep_token + data["prompt"] + pair = [] + for neg_text in data["neg_replies"]: + pair.append((data["pos"], neg_text)) + self.pairs.append((prefix, pair)) + + def __len__(self): + return len(self.pairs) + + def __getitem__(self, index): + context, pair = self.pairs[index] + + return context, pair diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index 94a256c2..2e235213 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -115,7 +115,7 @@ def argument_parsing(parser): def get_datasets(dataset_list: List[AnyStr], tokenizer): - from rank_datasets import AnthropicRLHF, GPTJSynthetic, HFSummary, WebGPT + from rank_datasets import AnthropicRLHF, GPTJSynthetic, HFSummary, OAPrivate, WebGPT from torch.utils.data import ConcatDataset train_datasets, evals = [], {} @@ -141,5 +141,11 @@ def get_datasets(dataset_list: List[AnyStr], tokenizer): eval = AnthropicRLHF("test", tokenizer.sep_token) train_datasets.append(train) evals["anthropic_rlhf"] = eval + elif "oa_private" == dataset_name: + train = OAPrivate(split="train", sep_token=tokenizer.sep_token) + eval = OAPrivate(split="val", sep_token=tokenizer.sep_token) + train_datasets.append(train) + evals["oa_private"] = eval + train = ConcatDataset(train_datasets) return train, evals From 742161590f6c5adf1466d1ba84bdf7d893f91b40 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Mon, 6 Feb 2023 00:22:30 +0000 Subject: 
[PATCH 06/11] [feature] Add rallio new instruction dataset v3 --- .../custom_datasets/__init__.py | 7 ++- .../custom_datasets/prompt_dialogue.py | 44 ++++++++++++++++++- .../custom_datasets/qa_datasets.py | 41 ++--------------- 3 files changed, 51 insertions(+), 41 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index f664ceaf..48641225 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -1,7 +1,7 @@ """ High level functions for model training """ -from custom_datasets.prompt_dialogue import InstructionTuning, PromptGeneratedDataset +from custom_datasets.prompt_dialogue import InstructionTuning, PrivateInstructionTuning, PromptGeneratedDataset from custom_datasets.qa_datasets import SODA, JokeExplaination, QADataset, SODADialogue, TranslatedQA, WebGPT from custom_datasets.summarization import SummarizationDataset from custom_datasets.toxic_conversation import ProsocialDialogue, ProsocialDialogueExplaination @@ -32,7 +32,7 @@ SUMMARIZATION_DATASETS = [ "debate_sum", "tldr_news", ] -OTHER = ["prosocial_dialogue", "explain_prosocial", "instruct_tuning"] +OTHER = ["prosocial_dialogue", "explain_prosocial", "instruct_tuning", "private_tuning"] def train_val_dataset(dataset, val_split=0.2): @@ -92,6 +92,9 @@ def get_one_dataset(conf, dataset_name): elif dataset_name == "instruct_tuning": dataset = InstructionTuning(conf.cache_dir) train, eval = train_val_dataset(dataset, val_split=0.2) + elif dataset_name == "private_tuning": + dataset = PrivateInstructionTuning(conf.cache_dir) + train, eval = train_val_dataset(dataset, val_split=0.2) elif dataset_name == "translate_qa": dataset = TranslatedQA(conf.cache_dir) train, eval = train_val_dataset(dataset, val_split=0.01) diff --git a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py 
b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py index 1c823934..4aac2655 100644 --- a/model/supervised_finetuning/custom_datasets/prompt_dialogue.py +++ b/model/supervised_finetuning/custom_datasets/prompt_dialogue.py @@ -2,7 +2,7 @@ import json import os from urllib.request import urlopen -from custom_datasets.formatting import format_pair +from custom_datasets.formatting import QA_SPECIAL_TOKENS, format_pair from torch.utils.data import Dataset @@ -102,3 +102,45 @@ class InstructionTuning(Dataset): def __getitem__(self, index): return format_pair(self.pairs[index]) + + +class PrivateInstructionTuning(Dataset): + """ + We have seen some promising capabilities from instruction tuning + with the following mix of datasets that are derived from datasets + available online. + The files for this data are in json format as a list of tuples + where each tuple is (source,instruction_response_pair) + + Not to be confused with unatural instruction + """ + + name = "private_tuning" + filename = "oa_v3_fixed_plus_safety.jsonl" + + def __init__(self, cache_dir) -> None: + super().__init__() + os.makedirs(cache_dir, exist_ok=True) + + self.pairs = [] + for file_link in [self.filename]: + basename = file_link.split("/")[-1] + instruction_tune_file = os.path.join(cache_dir, basename) + + with open(instruction_tune_file, "r", encoding="utf-8") as f: + for line in f: + row = json.loads(line) + prefix = "" + for _, convo in enumerate(row["text"].split("User:")): + if "Assistant" in convo: + prompt, answer = convo.split("Assistant:", maxsplit=1) + answer = answer.replace("<|endoftext|>", "").strip() + self.pairs.append((prefix + QA_SPECIAL_TOKENS["Question"] + prompt, answer)) + prefix += "".join(format_pair((prompt, answer))) + + def __len__(self): + return len(self.pairs) + + def __getitem__(self, index): + prompt, answer = self.pairs[index] + return "{}{}".format(prompt, QA_SPECIAL_TOKENS["Answer"]), answer diff --git 
a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 2acf9106..7876a920 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -3,7 +3,6 @@ """ import json import os -import random import re from urllib.request import urlopen @@ -116,7 +115,7 @@ class QADataset(Dataset): "reddit_asks": {"name": "eli5", "index_fn": index_eli5, "split_postfix": "_asks"}, } - def __init__(self, dataset, cache_dir, split, mix_prob=0.2): + def __init__(self, dataset, cache_dir, split): self.no_val = False if dataset in self.DATASET_FORMAT_MAPPING: context = self.DATASET_FORMAT_MAPPING[dataset] @@ -139,23 +138,11 @@ class QADataset(Dataset): else: raise ValueError("Unknown dataset : " + dataset) self.length = len(self.dataset) - self.mix_prob = mix_prob def __len__(self): return self.length def __getitem__(self, idx): - if self.mix_prob > 0 and random.random() < self.mix_prob and idx > 5 and idx < (self.length - 5): - - additional = random.randint(0, 10) - 5 - while additional == idx: - additional = random.randint(0, 10) - 5 - - answer_pair = self.index_fn(self.dataset[additional + idx]) - history_text = "".join(format_pair(answer_pair)) - question, answer = self.index_fn(self.dataset[idx]) - question = history_text + question - return format_pair((question, answer)) data = self.dataset[idx] return format_pair(self.index_fn(data)) @@ -312,9 +299,8 @@ class JokeExplaination(Dataset): name = "joke" url = "https://gist.github.com/theblackcat102/42b697e24a13fdb499e20edfbf618361/raw/1834dca207898c15f93b809d1195f6f6e47c9e1e/joke_explained.jsonl" - def __init__(self, cache_dir, mix_prob=0.2) -> None: + def __init__(self, cache_dir) -> None: super().__init__() - self.mix_prob = mix_prob os.makedirs(cache_dir, exist_ok=True) joke_explain_filename = os.path.join(cache_dir, "joke_explaination.jsonl") if not 
os.path.exists(joke_explain_filename): @@ -341,16 +327,6 @@ class JokeExplaination(Dataset): return self.length def __getitem__(self, index): - if random.random() < self.mix_prob and index > 5 and index < (self.length - 5): - additional = random.randint(0, 10) - 5 - while additional == index: - additional = random.randint(0, 10) - 5 - - history_text = "".join(format_pair(self.pairs[additional + index])) - question, answer = self.pairs[index] - question = history_text + question - return format_pair((question, answer)) - return format_pair(self.pairs[index]) @@ -358,9 +334,8 @@ class TranslatedQA(Dataset): name = "oa_translated" - def __init__(self, cache_dir, mix_prob=0.2) -> None: + def __init__(self, cache_dir) -> None: super().__init__() - self.mix_prob = mix_prob os.makedirs(cache_dir, exist_ok=True) path = os.path.join(cache_dir, "oa_translated") os.makedirs(path, exist_ok=True) @@ -383,14 +358,4 @@ class TranslatedQA(Dataset): return self.length def __getitem__(self, index): - if random.random() < self.mix_prob and index > 5 and index < (self.length - 5): - additional = random.randint(0, 10) - 5 - while additional == index: - additional = random.randint(0, 10) - 5 - - history_text = "".join(format_pair(self.pairs[additional + index])) - question, answer = self.pairs[index] - question = history_text + question - return format_pair((question, answer)) - return format_pair(self.pairs[index]) From af1c62cd83da66342e20937dedd7ae3839f343bf Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Tue, 7 Feb 2023 01:23:54 +0000 Subject: [PATCH 07/11] [feature] Add missing hindi and spanish prompt for translation --- model/supervised_finetuning/custom_datasets/translation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index 18cb9a09..cece0b43 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ 
b/model/supervised_finetuning/custom_datasets/translation.py @@ -62,7 +62,8 @@ TRANSLATION_PROMPT = { "{} how do we write in Malay", "{} give me the malay translation", "{} , berikan saya terjemahan dalam bahasa melayu", - "{}, Jemahan di bahasa melayu" "{}, jemahkan ayat ini kepada bahasa melayu", + "{}, Jemahan di bahasa melayu", + "{}, jemahkan ayat ini kepada bahasa melayu", ], "en": ["{}. translate to english", "{} write in english", "english translation: '{}'"], "ru": ["помогите мне перевести это на русский : {}", "{} перевести на русский язык", "russian translation: '{}'"], @@ -71,6 +72,8 @@ TRANSLATION_PROMPT = { "nl": ["{}. translate to dutch", "{} write in dutch", "dutch translation: '{}'"], "vi": ["{}. Dịch sang tiếng việt nam", "{} write in vietnamese", "vietnamese translation: '{}'"], "ar": ["{}. translate to arabic", "{} write in arabic", "arabic translation: '{}'"], + "es": ["{}. translate to spanish", "{} write in spanish", "spanish translation: '{}'"], + "hi": ["{}. translate to hindi", "{}. 
translate to bengali", "{} write in hindi", "bengali translation: '{}'"], } @@ -114,8 +117,6 @@ class WMT2019(TranslationPair): else: # translating in reverse direction source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) self.pairs.append((source, row[src])) - if len(self.pairs) > 100000: - break class DiveMT(TranslationPair): From a39cbab524d38a9891153528285881a6923e3eb7 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Tue, 7 Feb 2023 01:26:28 +0000 Subject: [PATCH 08/11] [fix] transformers import error --- model/supervised_finetuning/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index 043534ea..c500f8df 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -9,7 +9,8 @@ import torch from torch import nn from torch.utils.data import DataLoader from transformers import PreTrainedModel, Trainer, TrainingArguments -from transformers.trainer_pt_utils import IterableDatasetShard, seed_worker +from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_utils import seed_worker from transformers.training_args import OptimizerNames from transformers.utils import is_datasets_available from utils import get_dataset, get_loss, get_metrics, get_model, get_tokenizer, read_yamls From 2c35ff6e50c9392433d425d3f2800412ec6d6f85 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Wed, 8 Feb 2023 00:20:11 +0000 Subject: [PATCH 09/11] [fix] patch translated history conversation --- .../custom_datasets/__init__.py | 4 ++-- .../custom_datasets/qa_datasets.py | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index 48641225..0e5b9a91 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ 
b/model/supervised_finetuning/custom_datasets/__init__.py @@ -32,7 +32,7 @@ SUMMARIZATION_DATASETS = [ "debate_sum", "tldr_news", ] -OTHER = ["prosocial_dialogue", "explain_prosocial", "instruct_tuning", "private_tuning"] +OTHER = ["prosocial_dialogue", "explain_prosocial", "instruct_tuning", "private_tuning", "oa_translated"] def train_val_dataset(dataset, val_split=0.2): @@ -95,7 +95,7 @@ def get_one_dataset(conf, dataset_name): elif dataset_name == "private_tuning": dataset = PrivateInstructionTuning(conf.cache_dir) train, eval = train_val_dataset(dataset, val_split=0.2) - elif dataset_name == "translate_qa": + elif dataset_name == "oa_translated": dataset = TranslatedQA(conf.cache_dir) train, eval = train_val_dataset(dataset, val_split=0.01) else: diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 7876a920..145aee72 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -349,8 +349,21 @@ class TranslatedQA(Dataset): if "Python " in data["text"]: continue # incorrect, TODO: fix later + prefix = "" for convo_round in data["translate"]: - self.pairs.append((convo_round["human"], convo_round["answer"])) + human, answer = format_pair((convo_round["human"], convo_round["answer"])) + if convo_round["round"] > 2: + # TODO: remove this later + self.pairs.append(("{}{}{}".format(prefix, "", human), answer)) + else: + self.pairs.append((human, answer)) + + prefix += "{}{}{}{}".format( + QA_SPECIAL_TOKENS["Question"], + convo_round["human"], + QA_SPECIAL_TOKENS["Answer"], + convo_round["answer"], + ) self.length = len(self.pairs) @@ -358,4 +371,4 @@ class TranslatedQA(Dataset): return self.length def __getitem__(self, index): - return format_pair(self.pairs[index]) + return self.pairs[index] From 34347607d43499990198eb2db6b51f1bf43efe6e Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Sat, 11 Feb 
2023 00:08:34 +0000 Subject: [PATCH 10/11] [fix] add comments for translation data --- .../custom_datasets/qa_datasets.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index 6a82c13c..d88b741d 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -1,6 +1,7 @@ """ Open / close book QA datasets """ +import glob import json import os import re @@ -331,29 +332,43 @@ class JokeExplaination(Dataset): class TranslatedQA(Dataset): + """ + Translation OA v3 results + a list of non-English translations of OA v3 instruction generated text in jsonl + format for each line: + { + "text": "User: ... Assistant: ....", + "meta": {"source": ... }, + "translate": [ + { "round": 1, "human":"...", "answer": "..."}, + ... + { "round": K, "human":"...", "answer": "..."}, + ] + } + Since OA contains some code, we need to reference the original text to skip those entries + """ name = "oa_translated" def __init__(self, cache_dir) -> None: super().__init__() os.makedirs(cache_dir, exist_ok=True) - path = os.path.join(cache_dir, "oa_translated") + path = os.path.join(cache_dir, self.name) os.makedirs(path, exist_ok=True) - import glob - self.pairs = [] for translated_jsonl in glob.glob(os.path.join(path, "*.jsonl")): - with open(translated_jsonl, "r") as f: - for line in f: + with open(translated_jsonl, "r") as fin: + for line in fin: data = json.loads(line) if "Python " in data["text"]: + # the translation step currently doesn't skip code, + # so we check the original text here + # and drop the whole record when it mentions Python continue - # incorrect, TODO: fix later prefix = "" for convo_round in data["translate"]: human, answer = format_pair((convo_round["human"], convo_round["answer"])) if convo_round["round"] > 2: - # TODO: remove this later
self.pairs.append(("{}{}{}".format(prefix, "", human), answer)) else: self.pairs.append((human, answer)) From 9e69117ead4c1724e2915fa2e99607a8b5213f35 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Sat, 11 Feb 2023 01:53:37 +0000 Subject: [PATCH 11/11] [fix] Fix other PR merge bug --- .../supervised_finetuning/custom_datasets/qa_datasets.py | 4 +++- model/supervised_finetuning/trainer.py | 3 ++- model/supervised_finetuning/utils.py | 9 ++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/qa_datasets.py b/model/supervised_finetuning/custom_datasets/qa_datasets.py index cd4d742a..f126a270 100644 --- a/model/supervised_finetuning/custom_datasets/qa_datasets.py +++ b/model/supervised_finetuning/custom_datasets/qa_datasets.py @@ -314,7 +314,9 @@ class JokeExplaination(Dataset): for line in f: data = json.loads(line) joke = data["joke"] - explanation = data["explanation"] + # DO NOT change this key: + # the data itself uses this misspelling + explanation = data["explaination"] self.pairs.append((joke, explanation)) if len(question) > 0 and len(answer) > 0: diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index 02367814..ce80830b 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -1,7 +1,7 @@ import argparse from distutils.util import strtobool from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import bitsandbytes import datasets @@ -39,6 +39,7 @@ class SFTTrainer(Trainer): sampler: torch.utils.data.sampler.Sampler = None, loss_function: str = "CrossEntropyLoss", poly_eps: float = 1.0, + train_collate_fn: Callable = None, **kwargs, ): super().__init__(model, args, **kwargs) diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 86e48a52..43377bc9 100644 ---
a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -126,7 +126,14 @@ def get_tokenizer(conf) -> transformers.AutoTokenizer: if tokenizer_config.special_tokens: if "GPT-JT" in conf.model_name: tokenizer_config.special_tokens.pad_token = tokenizer.eos_token - tokenizer.add_special_tokens(tokenizer_config.special_tokens) + # SpecialTokens : latest in 4.25, 4.26 + tokenizer.add_special_tokens( + { + "pad_token": tokenizer_config.special_tokens.pad_token, + "eos_token": tokenizer_config.special_tokens.eos_token, + "sep_token": tokenizer_config.special_tokens.sep_token, + } + ) additional_special_tokens = ( []