From f5b2a348577d93ea45f2f5ff186ef8f20543083e Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Sun, 22 Jan 2023 00:56:17 +0000 Subject: [PATCH] [feature] add pythia and limit translation pair --- model/supervised_finetuning/custom_datasets/translation.py | 2 ++ model/supervised_finetuning/utils.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/model/supervised_finetuning/custom_datasets/translation.py b/model/supervised_finetuning/custom_datasets/translation.py index 008de751..f9a71a8e 100644 --- a/model/supervised_finetuning/custom_datasets/translation.py +++ b/model/supervised_finetuning/custom_datasets/translation.py @@ -100,6 +100,8 @@ class WMT2019(TranslationPair): else: # translating in reverse direction source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt]) self.pairs.append((source, row[src])) + if len(self.pairs) > 100000: + break class DiveMT(TranslationPair): diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 7b6e03b6..f7a0ab15 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -25,6 +25,10 @@ def get_tokenizer(conf): tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"}) elif "codegen" in conf.model_name: tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"}) + elif "pythia" in conf.model_name: + tokenizer.add_special_tokens( + {"pad_token": "<|padding|>", "sep_token": "<|endoftext|>", "eos_token": "<|endoftext|>"} + ) additional_special_tokens = ( []