From a83b1f617fab750bef75a924981ee09e3ebfd5df Mon Sep 17 00:00:00 2001 From: kykim0 Date: Tue, 9 Apr 2024 08:02:21 -0700 Subject: [PATCH] Fix the logic that causes an issue with philschmid/gemma-tokenizer-chatml tokenizer (#146) The `setup_chat_format()` logic should not be applied to the philschmid/gemma-tokenizer-chatml tokenizer; otherwise, Gemma models are trained without proper BOS and EOS tokens. --- scripts/run_sft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_sft.py b/scripts/run_sft.py index a38accd..522ce86 100644 --- a/scripts/run_sft.py +++ b/scripts/run_sft.py @@ -122,7 +122,7 @@ def main(): model = model_args.model_name_or_path # For ChatML we need to add special tokens and resize the embedding layer - if "<|im_start|>" in tokenizer.chat_template: + if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) model, tokenizer = setup_chat_format(model, tokenizer) model_kwargs = None