Skip to content

Commit

Permalink
refactor: change tokenize method parameter from std::string& to const…
Browse files Browse the repository at this point in the history
… std::string& for consistency
  • Loading branch information
lx200916 committed Nov 9, 2024
1 parent 5be4be3 commit 6deca8c
Show file tree
Hide file tree
Showing 12 changed files with 23 additions and 20 deletions.
1 change: 1 addition & 0 deletions src/Backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "OpDefined.hpp"
#include "Types.hpp"
#include <memory>
#include <unordered_map>
using std::shared_ptr;

namespace mllm {
Expand Down
6 changes: 3 additions & 3 deletions src/models/dclm/tokenization_dclm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ class DCLMTokenizer final : public BPETokenizer {
BPETokenizer::setMergeRank(merge_rank);
BPETokenizer::setSpecialToken("<|endoftext|>", "<|endoftext|>", "<|endoftext|>");
}
Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
text = Tokenizer::replaceString(text, ' ', "Ġ");
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto new_text = Tokenizer::replaceString(text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, false);
BPETokenizer::tokenize(new_text, tokens_id, false);
return BPETokenizer::tokens2Input(tokens_id);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma/tokenization_gemma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class GemmaTokenizer final : public BPETokenizer {
Module::initBackend(MLLM_CPU);
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
        // replace all blanks with '_'
std::string new_text = BPETokenizer::replaceString(text, ' ', "");

Expand Down
2 changes: 1 addition & 1 deletion src/models/llama/tokenization_llama.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class LLaMATokenizer final : public BPETokenizer {
chat_template_pre = "<s>[INST] ";
chat_template_end = " [/INST]";
}
Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto tokens_id = vector<token_id_t>();
BPETokenizer::tokenize(text, tokens_id, bos_);
return tokens2Input(tokens_id);
Expand Down
2 changes: 1 addition & 1 deletion src/models/minicpm/tokenization_minicpm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class MiniCPMTokenizer final : public BPETokenizer {
chat_template_end = "<AI>";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto new_text = " " + text;
new_text = std::regex_replace(new_text, std::regex(" "), "");

Expand Down
4 changes: 2 additions & 2 deletions src/models/mistral/tokenization_mistral.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ class MistralTokenizer final : public BPETokenizer {
chat_template_end = " [/INST]";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
auto newText = token_start + token_user_o + " " + text + token_user_c + token_end;
auto tokens_id = vector<token_id_t>();
BPETokenizer::tokenize(text, tokens_id, false);
BPETokenizer::tokenize(newText, tokens_id, false);
return BPETokenizer::tokens2Input(tokens_id);
}

Expand Down
9 changes: 5 additions & 4 deletions src/models/opt/tokenization_opt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ class OPTTokenizer final : public BPETokenizer {
BPETokenizer::setSpecialToken("</s>", "");
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
string new_text;
if (text[0] != ' ') {
text = ' ' + text;
new_text = ' ' + text;
}
text = Tokenizer::replaceString(text, ' ', "Ġ");
new_text = Tokenizer::replaceString(new_text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, true);
BPETokenizer::tokenize(new_text, tokens_id, true);

tokens_id.pop_back();

Expand Down
2 changes: 1 addition & 1 deletion src/models/phi3/tokenization_phi3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Phi3Tokenizer final : public BPETokenizer {
chat_template_end = " <|end|>\n<|assistant|>";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
        // replace all blanks with '_'
std::string new_text = BPETokenizer::replaceString(text, ' ', "");

Expand Down
2 changes: 1 addition & 1 deletion src/models/qwen/tokenization_qwen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class QWenTokenizer final : public BPETokenizer {
return result;
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/smollm/tokenization_smollm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ class SmolLMTokenizer final : public BPETokenizer {
return result;
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
Expand Down
9 changes: 5 additions & 4 deletions src/models/stablelm/tokenization_stablelm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ class StableLMTokenizer final : public BPETokenizer {
chat_template_end = "<|im_end|>\n<|im_start|>assistant\n";
}

Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override {
string new_text;
if (text[0] != ' ') {
text = ' ' + text;
new_text = ' ' + text;
}
text = Tokenizer::replaceString(text, ' ', "Ġ");
new_text = Tokenizer::replaceString(new_text, ' ', "Ġ");
std::vector<token_id_t> tokens_id;
BPETokenizer::tokenize(text, tokens_id, true);
BPETokenizer::tokenize(new_text, tokens_id, true);
tokens_id.erase(tokens_id.begin());
tokens_id.pop_back();
return BPETokenizer::tokens2Input(tokens_id);
Expand Down
2 changes: 1 addition & 1 deletion src/tokenizers/Tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class Tokenizer {
return std::max_element(scores.begin(), scores.end()) - scores.begin();
}

virtual Tensor tokenize(std::string &text, string name = "input", BackendType type = MLLM_CPU) {
virtual Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) {
bool bos_flag = true;
auto tokens_id = std::vector<token_id_t>();
this->tokenize(text, tokens_id, bos_flag);
Expand Down

0 comments on commit 6deca8c

Please sign in to comment.