mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-29 16:30:24 +08:00
Merge pull request #347 from LAION-AI/sft-gptjt-qa-labels
Sft gptjt qa labels
This commit is contained in:
@@ -33,6 +33,6 @@ Experimental results in wandb
|
||||
## TODOS
|
||||
|
||||
- decide on a model
|
||||
- add special token to declare prompt and reply. Do not freeze the weights for
|
||||
these
|
||||
- Merge utils etc with reward model
|
||||
- Causal Modelling for GPT-JT does not leverage the bidirectional mask for the
|
||||
prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1)
|
||||
|
||||
@@ -32,6 +32,17 @@ galactica-125:
|
||||
per_device_train_batch_size: 4
|
||||
per_device_eval_batch_size: 4
|
||||
|
||||
gpt-jt:
|
||||
learning_rate: 2e-6
|
||||
model_name: togethercomputer/GPT-JT-6B-v1
|
||||
weight_decay: 0.01
|
||||
max_length: 1024
|
||||
warmup_steps: 600
|
||||
gradient_checkpointing: false
|
||||
gradient_accumulation_steps: 2
|
||||
per_device_train_batch_size: 4
|
||||
per_device_eval_batch_size: 4
|
||||
|
||||
debug:
|
||||
eval_steps: 20
|
||||
eval_size: 100
|
||||
|
||||
@@ -2,6 +2,8 @@ from datasets import load_dataset
|
||||
from sklearn.model_selection import train_test_split
|
||||
from torch.utils.data import Dataset, Subset
|
||||
|
||||
QA_SPECIAL_TOKENS = {"Question": "<question>", "Answer": "<answer>"}
|
||||
|
||||
|
||||
class SquadV2Dataset(Dataset):
|
||||
def __init__(self, cache_dir, split):
|
||||
|
||||
@@ -6,6 +6,8 @@ import torch
|
||||
from torch.nn import functional as F
|
||||
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
|
||||
|
||||
from . import QA_SPECIAL_TOKENS
|
||||
|
||||
|
||||
@dataclass
|
||||
class DialogueDataCollator:
|
||||
@@ -19,22 +21,21 @@ class DialogueDataCollator:
|
||||
pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
def __call__(self, features):
|
||||
# TODO add special tokens for question and answer here
|
||||
# additional_special_tokens = ['<question>', '<answer>']
|
||||
prompt_tokens = ["Question: ", "Answer: "]
|
||||
|
||||
flatten_messages = []
|
||||
label_masks = []
|
||||
|
||||
for messages in features:
|
||||
assert len(messages) % 2 == 0, "Number of messages must be even"
|
||||
messages = [
|
||||
(prompt_tokens[0] if i % 2 == 0 else "") + x + ((" " + prompt_tokens[1]) if i % 2 == 0 else "")
|
||||
(QA_SPECIAL_TOKENS["Question"] if i % 2 == 0 else "")
|
||||
+ x
|
||||
+ (QA_SPECIAL_TOKENS["Answer"] if i % 2 == 0 else "")
|
||||
for i, x in enumerate(messages)
|
||||
]
|
||||
|
||||
# Add a way for the model to terminate generation, reinitialize prompter
|
||||
messages.append(prompt_tokens[0])
|
||||
# Add a way for the model to terminate generation
|
||||
# When we predict the start of a new expected question, we want to be able to stop generation
|
||||
messages.append(QA_SPECIAL_TOKENS["Question"])
|
||||
|
||||
flatten_messages.append(
|
||||
self.tokenizer(
|
||||
@@ -47,8 +48,10 @@ class DialogueDataCollator:
|
||||
|
||||
message_change_indices = np.cumsum([len(x) for x in messages[:-1]])
|
||||
# for each token an integer indicating the index of the message it belongs to. Just to create the label mask.
|
||||
# TEXT: Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John.
|
||||
# MESSAGE_INDICES: 0 0 0 0 0 0 1 1 1 2 2 2 2 2 2 3 3 3 3
|
||||
# Label mask is true when predicting a token that is part of the answer, false otherwise.
|
||||
# TEXT: Question: Hello, how are you? Answer: I am fine. Question: What is your name? Answer: My name is John. Question:
|
||||
# MESSAGE_INDICES: 0 0 0 0 0 0 1 1 1 2 2 2 2 2 2 3 3 3 3 -2
|
||||
# LABEL_MASK: 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0
|
||||
|
||||
# If no result in next, we are predicting the last termination token(s)
|
||||
message_indices = list(
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
datasets==2.8.0
|
||||
numpy==1.23.0
|
||||
PyYAML==6.0
|
||||
scikit_learn==1.2.0
|
||||
torch==1.13.1
|
||||
transformers==4.25.1
|
||||
@@ -67,6 +67,8 @@ class SFTTrainer(Trainer):
|
||||
optimizers,
|
||||
preprocess_logits_for_metrics,
|
||||
)
|
||||
|
||||
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
|
||||
self.loss_fct = get_loss(args.loss_function)
|
||||
|
||||
def fetch_scheduler(self):
|
||||
@@ -112,7 +114,7 @@ class SFTTrainer(Trainer):
|
||||
|
||||
with torch.no_grad():
|
||||
loss, logits, labels, labels_mask = self._compute_loss(model, inputs)
|
||||
labels[~labels_mask] = -1
|
||||
labels[~labels_mask] = -100 # padding_index
|
||||
|
||||
loss = loss.mean().detach()
|
||||
|
||||
@@ -159,8 +161,8 @@ def argument_parsing(notebook=False, notebook_args=None):
|
||||
if __name__ == "__main__":
|
||||
training_conf = argument_parsing()
|
||||
|
||||
model = get_model(training_conf)
|
||||
tokenizer = get_tokenizer(training_conf)
|
||||
model = get_model(training_conf, tokenizer)
|
||||
|
||||
train, evals, collate_fn = get_dataset(training_conf, tokenizer)
|
||||
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from custom_datasets import get_one_dataset
|
||||
from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset
|
||||
from custom_datasets.dialogue_collator import DialogueDataCollator
|
||||
from losses import CrossEntropyLoss
|
||||
from sklearn.model_selection import train_test_split
|
||||
from torch.utils.data import ConcatDataset, Subset
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
SUPPORTED_MODELS = ["galactica"]
|
||||
SUPPORTED_MODELS = ["galactica", "GPT-JT"] # deprecated ..
|
||||
|
||||
|
||||
def get_tokenizer(conf):
|
||||
@@ -17,10 +17,19 @@ def get_tokenizer(conf):
|
||||
if "galactica" in conf.model_name:
|
||||
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})
|
||||
|
||||
additional_special_tokens = (
|
||||
[]
|
||||
if "additional_special_tokens" not in tokenizer.special_tokens_map
|
||||
else tokenizer.special_tokens_map["additional_special_tokens"]
|
||||
)
|
||||
additional_special_tokens = list(set(additional_special_tokens + list(QA_SPECIAL_TOKENS.values())))
|
||||
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
def get_model(conf):
|
||||
def get_model(conf, tokenizer):
|
||||
if not any([x in conf.model_name for x in SUPPORTED_MODELS]):
|
||||
raise ValueError(
|
||||
f"Model {conf.model_name} not supported. Supported models: {SUPPORTED_MODELS}. "
|
||||
@@ -29,6 +38,11 @@ def get_model(conf):
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir)
|
||||
|
||||
if len(tokenizer) != model.get_input_embeddings().num_embeddings:
|
||||
assert not conf.freeze_layer, "Cannot change the number of embeddings if the model is frozen."
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
if conf.freeze_layer:
|
||||
model = freeze_top_n_layers(model, conf.freeze_layer)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user