add docstring info about tokenizer matching

This commit is contained in:
jack.butler
2023-02-10 09:46:58 +00:00
parent 090c5cbcc2
commit 2fbf2fa457
+5 -2
View File
@@ -36,8 +36,11 @@ TOKENIZER_CONFIGS = {
def match_tokenizer_name(model_name: str) -> TokenizerConfig:
"""Match a partial model name to a tokenizer configuration"""
tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if model_name in name]
"""
Match a partial model name to a tokenizer configuration
e.g. model_name `Salesforce/codegen-2B-multi` matches the config named `codegen`
"""
tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if name in model_name]
if not tokenizer_config_matches:
raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}")
elif 1 < len(tokenizer_config_matches):