From 2fbf2fa4575d1545c1513adf0a369bdc65ea63ef Mon Sep 17 00:00:00 2001 From: "jack.butler" Date: Fri, 10 Feb 2023 09:46:58 +0000 Subject: [PATCH] add docstring info about tokenizer matching --- model/supervised_finetuning/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 43ff6c8c..7be7bd27 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -36,8 +36,11 @@ TOKENIZER_CONFIGS = { def match_tokenizer_name(model_name: str) -> TokenizerConfig: - """Match a partial model name to a tokenizer configuration""" - tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if model_name in name] + """ + Match a partial model name to a tokenizer configuration + i.e. model_name `Salesforce/codegen-2B-multi` has config name `codegen` + """ + tokenizer_config_matches = [config for name, config in TOKENIZER_CONFIGS.items() if name in model_name] if not tokenizer_config_matches: raise ValueError(f"Cannot find any tokeniser configuration to match {model_name=}") elif 1 < len(tokenizer_config_matches):