Skip to content

Commit

Permalink
refactor: change tokenize method parameter from std::string& to const…
Browse files Browse the repository at this point in the history
… std::string& for consistency
  • Loading branch information
lx200916 committed Nov 9, 2024
1 parent 5be4be3 commit 6deca8c
Show file tree
Hide file tree
Showing 12 changed files with 23 additions and 20 deletions.
1 change: 1 addition & 0 deletions src/Backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "OpDefined.hpp"
#include "Types.hpp"
#include <memory>
#include <unordered_map>
using std::shared_ptr;

namespace mllm {
Expand Down
6 changes: 3 additions & 3 deletions src/models/dclm/tokenization_dclm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ class DCLMTokenizer final : public BPETokenizer {
BPETokenizer::setMergeRank(merge_rank);
BPETokenizer::setSpecialToken("<|endoftext|>", "<|endoftext|>", "<|endoftext|>");
}
Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
text = Tokenizer::replaceString(text, ' ', "Ġ");
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto new_text = Tokenizer::replaceString(text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, false);
BPETokenizer::tokenize(new_text, tokens_id, false);
return BPETokenizer::tokens2Input(tokens_id);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma/tokenization_gemma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class GemmaTokenizer final : public BPETokenizer {
Module::initBackend(MLLM_CPU);
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
        // replace all blanks with '_'
std::string new_text = BPETokenizer::replaceString(text, ' ', "");

Expand Down
2 changes: 1 addition & 1 deletion src/models/llama/tokenization_llama.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class LLaMATokenizer final : public BPETokenizer {
chat_template_pre = "<s>[INST] ";
chat_template_end = " [/INST]";
}
Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto tokens_id = vector<token_id_t>();
BPETokenizer::tokenize(text, tokens_id, bos_);
return tokens2Input(tokens_id);
Expand Down
2 changes: 1 addition & 1 deletion src/models/minicpm/tokenization_minicpm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class MiniCPMTokenizer final : public BPETokenizer {
chat_template_end = "<AI>";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto new_text = " " + text;
new_text = std::regex_replace(new_text, std::regex(" "), "");

Expand Down
4 changes: 2 additions & 2 deletions src/models/mistral/tokenization_mistral.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ class MistralTokenizer final : public BPETokenizer {
chat_template_end = " [/INST]";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto newText = token_start + token_user_o + " " + text + token_user_c + token_end;
auto tokens_id = vector<token_id_t>();
BPETokenizer::tokenize(text, tokens_id, false);
BPETokenizer::tokenize(newText, tokens_id, false);
return BPETokenizer::tokens2Input(tokens_id);
}

Expand Down
9 changes: 5 additions & 4 deletions src/models/opt/tokenization_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ class OPTTokenizer final : public BPETokenizer {
BPETokenizer::setSpecialToken("</s>", "");
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
string new_text;
if (text[0] != ' ') {
text = ' ' + text;
new_text = ' ' + text;
}
text = Tokenizer::replaceString(text, ' ', "Ġ");
new_text = Tokenizer::replaceString(new_text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, true);
BPETokenizer::tokenize(new_text, tokens_id, true);

tokens_id.pop_back();

Expand Down
2 changes: 1 addition & 1 deletion src/models/phi3/tokenization_phi3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Phi3Tokenizer final : public BPETokenizer {
chat_template_end = " <|end|>\n<|assistant|>";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
        // replace all blanks with '_'
std::string new_text = BPETokenizer::replaceString(text, ' ', "");

Expand Down
2 changes: 1 addition & 1 deletion src/models/qwen/tokenization_qwen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class QWenTokenizer final : public BPETokenizer {
return result;
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/smollm/tokenization_smollm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ class SmolLMTokenizer final : public BPETokenizer {
return result;
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
Expand Down
9 changes: 5 additions & 4 deletions src/models/stablelm/tokenization_stablelm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ class StableLMTokenizer final : public BPETokenizer {
chat_template_end = "<|im_end|>\n<|im_start|>assistant\n";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
string new_text;
if (text[0] != ' ') {
text = ' ' + text;
new_text = ' ' + text;
}
text = Tokenizer::replaceString(text, ' ', "Ġ");
new_text = Tokenizer::replaceString(new_text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, true);
BPETokenizer::tokenize(new_text, tokens_id, true);
tokens_id.erase(tokens_id.begin());
tokens_id.pop_back();
return BPETokenizer::tokens2Input(tokens_id);
Expand Down
2 changes: 1 addition & 1 deletion src/tokenizers/Tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class Tokenizer {
return std::max_element(scores.begin(), scores.end()) - scores.begin();
}

virtual Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) {
virtual Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) {
bool bos_flag = true;
auto tokens_id = std::vector<token_id_t>();
this->tokenize(text, tokens_id, bos_flag);
Expand Down

0 comments on commit 6deca8c

Please sign in to comment.