diff --git a/model/supervised_finetuning/tests/test_utils.py b/model/supervised_finetuning/tests/test_utils.py
index ad40e534..c4982024 100644
--- a/model/supervised_finetuning/tests/test_utils.py
+++ b/model/supervised_finetuning/tests/test_utils.py
@@ -1,9 +1,28 @@
 from argparse import Namespace
 
-from utils import get_tokenizer
+import pytest
+from utils import TOKENIZER_CONFIGS, get_tokenizer, match_tokenizer_name
 
 
 def test_tokenizer():
     get_tokenizer(Namespace(model_name="Salesforce/codegen-2B-multi", cache_dir=".cache"))
     get_tokenizer(Namespace(model_name="facebook/galactica-1.3b", cache_dir=".cache"))
     get_tokenizer(Namespace(model_name="", cache_dir=".cache"))
+
+
+def test_tokenizer_successful_match():
+    for config_name, config in TOKENIZER_CONFIGS.items():
+        found_config = match_tokenizer_name(config_name)
+        assert found_config == config
+
+
+def test_tokenizer_partial_match():
+    for config_name in ["facebook/galactica-1.3b", "togethercomputer/GPT-JT-6B-v1", "Salesforce/codegen-2B-multi"]:
+        found_config = match_tokenizer_name(config_name)
+        assert found_config
+
+
+def test_tokenizer_failed_match():
+    for fake_config_name in ["not-a-model", "fake"]:
+        with pytest.raises(ValueError):
+            match_tokenizer_name(fake_config_name)
diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py
index 380bca8e..1f77708f 100644
--- a/model/supervised_finetuning/utils.py
+++ b/model/supervised_finetuning/utils.py
@@ -106,7 +106,10 @@ TOKENIZER_CONFIGS = {
 
 
 def match_tokenizer_name(model_name: str) -> TokenizerConfig:
-    """Match a partial model name to a tokenizer configuration"""
+    """
+    Match a partial model name to a tokenizer configuration
+    i.e. model_name `Salesforce/codegen-2B-multi` has config name `codegen`
+    """
     tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if name in model_name]
     if not tokenizer_config_matches:
         raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")