From 1b4bb8486a3abde6b7c8517d0ba1dd5eda493b78 Mon Sep 17 00:00:00 2001 From: Noam Gat Date: Sun, 7 Jan 2024 14:02:14 +0200 Subject: [PATCH] Making the json freetext cacher not crash if the tokenizer has empty tokens --- lmformatenforcer/tokenizerprefixtree.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lmformatenforcer/tokenizerprefixtree.py b/lmformatenforcer/tokenizerprefixtree.py index 23d2d34..1ed82d7 100644 --- a/lmformatenforcer/tokenizerprefixtree.py +++ b/lmformatenforcer/tokenizerprefixtree.py @@ -34,6 +34,11 @@ def add_token(self, token_str: str, token_int: int): except json.decoder.JSONDecodeError: return # Illegal inside JSON string, skip this token + if len(token_str) == 0: + # Tokens that don't decode to anything should be ignored, will not be allowed in json freetext fields. + # TODO: Should we instead ALWAYS allow them? + return + self.token_str_to_num[token_str] = token_int self.max_token_len = max(self.max_token_len, len(token_str))