From 3a10e9412d220e4a17f99151a6cbc67152b4faa5 Mon Sep 17 00:00:00 2001 From: Sotirios Anagnostidis Date: Tue, 3 Jan 2023 22:02:32 +0100 Subject: [PATCH 1/3] Question-Answer special tokens --- .../custom_datasets/__init__.py | 6 ++++++ .../custom_datasets/dialogue_collator.py | 21 +++++++++++-------- model/supervised_finetuning/trainer.py | 6 ++++-- model/supervised_finetuning/utils.py | 17 ++++++++++++++- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index fcab8a56..907e1a9b 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -2,6 +2,12 @@ from datasets import load_dataset from sklearn.model_selection import train_test_split from torch.utils.data import Dataset, Subset +QA_SPECIAL_TOKENS = { + 'Question': '', + 'Answer': '' +} + + class SquadV2Dataset(Dataset): def __init__(self, cache_dir, split): diff --git a/model/supervised_finetuning/custom_datasets/dialogue_collator.py b/model/supervised_finetuning/custom_datasets/dialogue_collator.py index 17fe1082..f9e1bb5e 100644 --- a/model/supervised_finetuning/custom_datasets/dialogue_collator.py +++ b/model/supervised_finetuning/custom_datasets/dialogue_collator.py @@ -6,6 +6,8 @@ import torch from torch.nn import functional as F from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase +from . 
import QA_SPECIAL_TOKENS + @dataclass class DialogueDataCollator: @@ -19,22 +21,21 @@ class DialogueDataCollator: pad_to_multiple_of: Optional[int] = None def __call__(self, features): - # TODO add special tokens for question and answer here - # additional_special_tokens = ['', ''] - prompt_tokens = ["Question: ", "Answer: "] - flatten_messages = [] label_masks = [] for messages in features: assert len(messages) % 2 == 0, "Number of messages must be even" messages = [ - (prompt_tokens[0] if i % 2 == 0 else "") + x + ((" " + prompt_tokens[1]) if i % 2 == 0 else "") + (QA_SPECIAL_TOKENS["Question"] if i % 2 == 0 else "") + + x + + (QA_SPECIAL_TOKENS["Answer"] if i % 2 == 0 else "") for i, x in enumerate(messages) ] - # Add a way for the model to terminate generation, reinitialize prompter - messages.append(prompt_tokens[0]) + # Add a way for the model to terminate generation + # When we predict the start of a new expected question, we want to be able to stop generation + messages.append(QA_SPECIAL_TOKENS["Question"]) flatten_messages.append( self.tokenizer( @@ -47,8 +48,10 @@ class DialogueDataCollator: message_change_indices = np.cumsum([len(x) for x in messages[:-1]]) # for each token an integer indicating the index of the message it belongs to. Just to create the label mask. - # TEXT: Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John. - # MESSAGE_INDICES: 0 0 0 0 0 0 1 1 1 2 2 2 2 2 2 3 3 3 3 + # Label mask is true when predicting a token that is part of the answer, false otherwise. + # TEXT: Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John. 
Question: + # MESSAGE_INDICES: 0 0 0 0 0 0 1 1 1 2 2 2 2 2 2 3 3 3 3 -2 + # LABEL_MASK: 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 # If no result in next, we are predicting the last termination token(s) message_indices = list( diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index b44890df..dc7b5934 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -67,6 +67,8 @@ class SFTTrainer(Trainer): optimizers, preprocess_logits_for_metrics, ) + + # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct self.loss_fct = get_loss(args.loss_function) def fetch_scheduler(self): @@ -112,7 +114,7 @@ class SFTTrainer(Trainer): with torch.no_grad(): loss, logits, labels, labels_mask = self._compute_loss(model, inputs) - labels[~labels_mask] = -1 + labels[~labels_mask] = -100 # padding_index loss = loss.mean().detach() @@ -159,8 +161,8 @@ def argument_parsing(notebook=False, notebook_args=None): if __name__ == "__main__": training_conf = argument_parsing() - model = get_model(training_conf) tokenizer = get_tokenizer(training_conf) + model = get_model(training_conf, tokenizer) train, evals, collate_fn = get_dataset(training_conf, tokenizer) diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 4a451bed..6aa5d365 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -7,6 +7,7 @@ from losses import CrossEntropyLoss from sklearn.model_selection import train_test_split from torch.utils.data import ConcatDataset, Subset from transformers import AutoModelForCausalLM, AutoTokenizer +from custom_datasets import QA_SPECIAL_TOKENS SUPPORTED_MODELS = ["galactica"] @@ -17,10 +18,19 @@ def get_tokenizer(conf): if "galactica" in conf.model_name: tokenizer.add_special_tokens({"pad_token": "", "eos_token": ""}) + additional_special_tokens = ( + [] + if not "additional_special_tokens" in 
tokenizer.special_tokens_map + else tokenizer.special_tokens_map["additional_special_tokens"] + ) + additional_special_tokens = list(set(additional_special_tokens + list(QA_SPECIAL_TOKENS.values()))) + + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + return tokenizer -def get_model(conf): +def get_model(conf, tokenizer): if not any([x in conf.model_name for x in SUPPORTED_MODELS]): raise ValueError( f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. " @@ -29,6 +39,11 @@ def get_model(conf): model = AutoModelForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir) + if len(tokenizer) != model.get_input_embeddings().num_embeddings: + assert not conf.freeze_layer, "Cannot change the number of embeddings if the model is frozen." + + model.resize_token_embeddings(len(tokenizer)) + if conf.freeze_layer: model = freeze_top_n_layers(model, conf.freeze_layer) From 525e6964e843dab51a370824626033155adc2da4 Mon Sep 17 00:00:00 2001 From: Sotirios Anagnostidis Date: Tue, 3 Jan 2023 22:06:59 +0100 Subject: [PATCH 2/3] requirements --- model/supervised_finetuning/requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 model/supervised_finetuning/requirements.txt diff --git a/model/supervised_finetuning/requirements.txt b/model/supervised_finetuning/requirements.txt new file mode 100644 index 00000000..d579468f --- /dev/null +++ b/model/supervised_finetuning/requirements.txt @@ -0,0 +1,6 @@ +datasets==2.8.0 +numpy==1.23.0 +PyYAML==6.0 +scikit_learn==1.2.0 +torch==1.13.1 +transformers==4.25.1 From c20dfaad5b48e4e176557378983c84f443b6dd2a Mon Sep 17 00:00:00 2001 From: Sotirios Anagnostidis Date: Tue, 3 Jan 2023 22:45:34 +0100 Subject: [PATCH 3/3] pre-commits --- model/supervised_finetuning/README.md | 4 ++-- model/supervised_finetuning/configs/config.yaml | 11 +++++++++++ .../supervised_finetuning/custom_datasets/__init__.py | 6 +----- model/supervised_finetuning/utils.py | 7 
+++---- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/model/supervised_finetuning/README.md b/model/supervised_finetuning/README.md index e223e1cd..014afa95 100644 --- a/model/supervised_finetuning/README.md +++ b/model/supervised_finetuning/README.md @@ -33,6 +33,6 @@ Experimental results in wandb ## TODOS - decide on a model -- add special token to declare prompt and reply. Do nto freeze the weights for - these - Merge utils etc with reward model +- Causal Modelling for GPT-JT does not leverage the bidirectional mask for the + prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1) diff --git a/model/supervised_finetuning/configs/config.yaml b/model/supervised_finetuning/configs/config.yaml index f7164002..29086395 100644 --- a/model/supervised_finetuning/configs/config.yaml +++ b/model/supervised_finetuning/configs/config.yaml @@ -32,6 +32,17 @@ galactica-125: per_device_train_batch_size: 4 per_device_eval_batch_size: 4 +gpt-jt: + learning_rate: 2e-6 + model_name: togethercomputer/GPT-JT-6B-v1 + weight_decay: 0.01 + max_length: 1024 + warmup_steps: 600 + gradient_checkpointing: false + gradient_accumulation_steps: 2 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 + debug: eval_steps: 20 eval_size: 100 diff --git a/model/supervised_finetuning/custom_datasets/__init__.py b/model/supervised_finetuning/custom_datasets/__init__.py index 907e1a9b..7e3bdc79 100644 --- a/model/supervised_finetuning/custom_datasets/__init__.py +++ b/model/supervised_finetuning/custom_datasets/__init__.py @@ -2,11 +2,7 @@ from datasets import load_dataset from sklearn.model_selection import train_test_split from torch.utils.data import Dataset, Subset -QA_SPECIAL_TOKENS = { - 'Question': '', - 'Answer': '' -} - +QA_SPECIAL_TOKENS = {"Question": "", "Answer": ""} class SquadV2Dataset(Dataset): diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 6aa5d365..a31f74d3 100644 --- 
a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -1,15 +1,14 @@ from pathlib import Path import yaml -from custom_datasets import get_one_dataset +from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset from custom_datasets.dialogue_collator import DialogueDataCollator from losses import CrossEntropyLoss from sklearn.model_selection import train_test_split from torch.utils.data import ConcatDataset, Subset from transformers import AutoModelForCausalLM, AutoTokenizer -from custom_datasets import QA_SPECIAL_TOKENS -SUPPORTED_MODELS = ["galactica"] +SUPPORTED_MODELS = ["galactica", "GPT-JT"] # deprecated .. def get_tokenizer(conf): @@ -20,7 +19,7 @@ def get_tokenizer(conf): additional_special_tokens = ( [] - if not "additional_special_tokens" in tokenizer.special_tokens_map + if "additional_special_tokens" not in tokenizer.special_tokens_map else tokenizer.special_tokens_map["additional_special_tokens"] ) additional_special_tokens = list(set(additional_special_tokens + list(QA_SPECIAL_TOKENS.values())))