[feature] Add Pythia tokenizer special tokens and cap the number of WMT2019 translation pairs (~100k)

This commit is contained in:
theblackcat102
2023-01-22 00:56:17 +00:00
parent 62a203fd8c
commit f5b2a34857
2 changed files with 6 additions and 0 deletions
@@ -100,6 +100,8 @@ class WMT2019(TranslationPair):
else: # translating in reverse direction
source = random.choice(TRANSLATION_PROMPT[src]).format(row[tgt])
self.pairs.append((source, row[src]))
if len(self.pairs) > 100000:
break
class DiveMT(TranslationPair):
+4
View File
@@ -25,6 +25,10 @@ def get_tokenizer(conf):
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token, "sep_token": "<|extratoken_100|>"})
elif "codegen" in conf.model_name:
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>", "sep_token": "<|endoftext|>"})
elif "pythia" in conf.model_name:
tokenizer.add_special_tokens(
{"pad_token": "<|padding|>", "sep_token": "<|endoftext|>", "eos_token": "<|endoftext|>"}
)
additional_special_tokens = (
[]