Question-Answer special tokens

2026-07-03 17:10:10 +08:00 · 2023-01-03 22:02:32 +01:00
parent e14c670cb0
commit 3a10e9412d
4 changed files with 38 additions and 12 deletions
@@ -2,6 +2,12 @@ from datasets import load_dataset
 from sklearn.model_selection import train_test_split
 from torch.utils.data import Dataset, Subset

+QA_SPECIAL_TOKENS = {
+    'Question': '<question>', 
+    'Answer': '<answer>'
+}
+
+

 class SquadV2Dataset(Dataset):
    def __init__(self, cache_dir, split):
@@ -6,6 +6,8 @@ import torch
 from torch.nn import functional as F
 from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

+from . import QA_SPECIAL_TOKENS
+

@dataclass
 class DialogueDataCollator:
@@ -19,22 +21,21 @@ class DialogueDataCollator:
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
-        # TODO add special tokens for question and answer here
-        # additional_special_tokens = ['<question>', '<answer>']
-        prompt_tokens = ["Question: ", "Answer: "]
-
        flatten_messages = []
        label_masks = []

        for messages in features:
            assert len(messages) % 2 == 0, "Number of messages must be even"
            messages = [
-                (prompt_tokens[0] if i % 2 == 0 else "") + x + ((" " + prompt_tokens[1]) if i % 2 == 0 else "")
+                (QA_SPECIAL_TOKENS["Question"] if i % 2 == 0 else "")
+                + x
+                + (QA_SPECIAL_TOKENS["Answer"] if i % 2 == 0 else "")
                for i, x in enumerate(messages)
            ]

-            # Add a way for the model to terminate generation, reinitialize prompter
-            messages.append(prompt_tokens[0])
+            # Add a way for the model to terminate generation
+            # When we predict the start of a new expected question, we want to be able to stop generation
+            messages.append(QA_SPECIAL_TOKENS["Question"])

            flatten_messages.append(
                self.tokenizer(
@@ -47,8 +48,10 @@ class DialogueDataCollator:

            message_change_indices = np.cumsum([len(x) for x in messages[:-1]])
            # For each token, an integer indicating the index of the message it belongs to; used only to create the label mask.
-            # TEXT:             Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John.
-            # MESSAGE_INDICES:  0         0      0   0   0    0       1 1  1     2         2    2  2    2     2       3  3    3  3
+            # Label mask is true when predicting a token that is part of the answer, false otherwise.
+            # TEXT:             Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John. Question:
+            # MESSAGE_INDICES:  0         0      0   0   0    0       1 1  1     2         2    2  2    2     2       3  3    3  3     -2
+            # LABEL_MASK:       0         0      0   0   0    1       1 1  1     0         0    0  0    0     1       1  1    1  1     0

            # If no result in next, we are predicting the last termination token(s)
            message_indices = list(
@@ -67,6 +67,8 @@ class SFTTrainer(Trainer):
            optimizers,
            preprocess_logits_for_metrics,
        )
+
+        # By default CrossEntropyLoss ignores targets equal to ignore_index (-100), but just in case we use our own loss_fct
        self.loss_fct = get_loss(args.loss_function)

    def fetch_scheduler(self):
@@ -112,7 +114,7 @@ class SFTTrainer(Trainer):

        with torch.no_grad():
            loss, logits, labels, labels_mask = self._compute_loss(model, inputs)
-            labels[~labels_mask] = -1
+            labels[~labels_mask] = -100  # padding_index

        loss = loss.mean().detach()

@@ -159,8 +161,8 @@ def argument_parsing(notebook=False, notebook_args=None):
 if __name__ == "__main__":
    training_conf = argument_parsing()

-    model = get_model(training_conf)
    tokenizer = get_tokenizer(training_conf)
+    model = get_model(training_conf, tokenizer)

    train, evals, collate_fn = get_dataset(training_conf, tokenizer)

@@ -7,6 +7,7 @@ from losses import CrossEntropyLoss
 from sklearn.model_selection import train_test_split
 from torch.utils.data import ConcatDataset, Subset
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from custom_datasets import QA_SPECIAL_TOKENS

 SUPPORTED_MODELS = ["galactica"]

@@ -17,10 +18,19 @@ def get_tokenizer(conf):
    if "galactica" in conf.model_name:
        tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

+    additional_special_tokens = (
+        []
+        if not "additional_special_tokens" in tokenizer.special_tokens_map
+        else tokenizer.special_tokens_map["additional_special_tokens"]
+    )
+    additional_special_tokens = list(set(additional_special_tokens + list(QA_SPECIAL_TOKENS.values())))
+
+    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+
    return tokenizer


-def get_model(conf):
+def get_model(conf, tokenizer):
    if not any([x in conf.model_name for x in SUPPORTED_MODELS]):
        raise ValueError(
            f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. "
@@ -29,6 +39,11 @@ def get_model(conf):

    model = AutoModelForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)

+    if len(tokenizer) != model.get_input_embeddings().num_embeddings:
+        assert not conf.freeze_layer, "Cannot change the number of embeddings if the model is frozen."
+
+    model.resize_token_embeddings(len(tokenizer))
+
    if conf.freeze_layer:
        model = freeze_top_n_layers(model, conf.freeze_layer)