diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index e379c9834..6113e86a9 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -43,10 +43,11 @@ def convert_json_vocab(hf_tokenizer):
                 f"{hf_tokenizer.__name__}: vocab_files_names is not found")

         tokenizer_file = filenames["tokenizer_file"]
-        if (hf_tokenizer.vocab_file is None) or (not os.path.exists(hf_tokenizer.vocab_file)):
+        vocab_file = getattr(hf_tokenizer, "vocab_file", None)
+        if (vocab_file is None) or (not os.path.exists(vocab_file)):
             model_dir = hf_tokenizer.name_or_path
         else:
-            model_dir = os.path.dirname(hf_tokenizer.vocab_file)
+            model_dir = os.path.dirname(vocab_file)
         tokenizer_json = json.load(
             open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
         # get vocab object from json file
@@ -181,6 +182,8 @@ def spm_decoder(self, **kwargs):
                                      'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CodeGenTokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
                                      'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+    "GPTNeoXTokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
+                                     'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CLIPTokenizer": TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
                                   'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "RobertaTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
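
Reviewer note: a minimal sketch of why the `getattr` guard in the first hunk matters. Some fast tokenizers (GPT-NeoX's among them) do not define a `vocab_file` attribute at all, so the old code raised `AttributeError` before the `None` check could short-circuit. The `FakeTokenizer` stand-in below is hypothetical; only `getattr`/`os.path` behavior is being demonstrated.

```python
import os

# Hypothetical stand-in for a HF fast tokenizer that, like
# GPTNeoXTokenizerFast, does not define a `vocab_file` attribute.
class FakeTokenizer:
    name_or_path = "EleutherAI/gpt-neox-20b"

tok = FakeTokenizer()

# Old code path: reading the attribute directly raises AttributeError,
# so the (tok.vocab_file is None) check never gets a chance to run.
try:
    missing = (tok.vocab_file is None) or (not os.path.exists(tok.vocab_file))
except AttributeError as e:
    print("old path fails:", e)

# Patched code path: getattr() returns None for a missing attribute,
# and the converter falls back to name_or_path as the model directory.
vocab_file = getattr(tok, "vocab_file", None)
if (vocab_file is None) or (not os.path.exists(vocab_file)):
    model_dir = tok.name_or_path
else:
    model_dir = os.path.dirname(vocab_file)
print("model_dir:", model_dir)
```

And a hedged usage sketch for the new table entry: with this patch, a GPT-NeoX tokenizer should convert through the same GPT2/BPE operator pair as `CodeGenTokenizer`. `gen_processing_models` is the library's public conversion entry point; the model id is only an example and assumes `transformers` and `onnxruntime_extensions` are installed.

```python
from transformers import AutoTokenizer
from onnxruntime_extensions import gen_processing_models

tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# The mapping added above routes the GPT-NeoX tokenizer to the
# GPT2Tokenizer (BPE) pre-processing op and the BpeDecoder post-op.
pre_model, post_model = gen_processing_models(tok, pre_kwargs={}, post_kwargs={})
```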