Skip to content

Commit

Permalink
Merge pull request #180 from chenghuaWang/main
Browse files Browse the repository at this point in the history
fix: smollm tokenizer regex pattern
  • Loading branch information
yirongjie authored Nov 8, 2024
2 parents c2c3c68 + e65738d commit 5be4be3
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 7 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ if(QNN) # QNN lib
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
endif()

option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" ON)
option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" OFF)
if(MLLM_BUILD_XNNPACK_BACKEND)
if(NOT WIN32)
add_compile_options(-fPIC)
Expand Down
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ endif()

# Example targets that are only added when MLLM_BUILD_XNNPACK_BACKEND is enabled.
if(MLLM_BUILD_XNNPACK_BACKEND)
# NOTE(review): func_llm_add_executable is defined outside this view; presumably it
# registers an example executable target with the given name -- confirm.
func_llm_add_executable(demo_qwen_xp)
# The four demos below are commented out, so they are not added to the build.
# func_llm_add_executable(demo_qwenqkvmm)
# func_llm_add_executable(demo_qwenqkvmm_cpu)
# func_llm_add_executable(demo_qdint8_gemm)
# func_llm_add_executable(demo_fp32gemmperf)
endif()


Expand Down
5 changes: 3 additions & 2 deletions src/backends/cpu/CPUKVCacheXp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ ErrorCode CPUKVCacheXp::execute(vector<shared_ptr<Tensor>> inputs, vector<shared
}

// copy cache to output
memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));
// memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));

return MLLM_NO_ERROR;
}
Expand All @@ -68,6 +68,7 @@ ErrorCode CPUKVCacheXp::free(vector<shared_ptr<Tensor>> inputs, vector<shared_pt
}

// Set-up hook of the CPUKVCacheXp op.
// NOTE(review): this text comes from a diff view that carries no +/- markers, so the
// body below appears to interleave a removed line with its replacement. The page
// reports "3 additions & 2 deletions" for this file, which is consistent with the
// `return Op::setUp(...)` line being the deleted one -- confirm against the repository.
ErrorCode CPUKVCacheXp::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
// As written, control returns here, so the two statements that follow are unreachable.
return Op::setUp(inputs, outputs);
// Passes cache_'s raw host pointer to outputs[0]->forceResetHostPointer().
// NOTE(review): presumably this makes outputs[0] use the cache buffer directly instead
// of a copy (the memcpy in execute() is commented out in this same change) -- confirm.
outputs[0]->forceResetHostPointer(cache_.rawHostPtr());
return MLLM_NO_ERROR;
}
} // namespace mllm
12 changes: 8 additions & 4 deletions src/models/smollm/tokenization_smollm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class SmolLMTokenizer final : public BPETokenizer {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
const auto word_collection = unicode_regex_split(text, regex_exprs);
for (auto &piece : word_collection) {
// look up table
// std::string token;
Expand All @@ -160,7 +160,7 @@ class SmolLMTokenizer final : public BPETokenizer {
BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
} else {
const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
const auto word_collection = unicode_regex_split(p, regex_exprs);
for (auto &piece : word_collection) {
// look up table
// std::string token;
Expand All @@ -183,7 +183,7 @@ class SmolLMTokenizer final : public BPETokenizer {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
const auto word_collection = unicode_regex_split(text, regex_exprs);
for (auto &piece : word_collection) {
// look up table
// std::string token;
Expand All @@ -208,7 +208,7 @@ class SmolLMTokenizer final : public BPETokenizer {
BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
} else {
const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
const auto word_collection = unicode_regex_split(p, regex_exprs);
for (auto &piece : word_collection) {
// look up table
// std::string token;
Expand Down Expand Up @@ -259,6 +259,10 @@ class SmolLMTokenizer final : public BPETokenizer {
}

public:
// Regex patterns passed to unicode_regex_split() elsewhere in this class to split
// input text into pieces before the vocabulary lookup.
// NOTE(review): how unicode_regex_split combines several patterns is not visible
// here (presumably they are applied in order) -- confirm.
std::vector<std::string> regex_exprs = {
// A single character from the Unicode number category.
"\\p{N}",
// The literal suffixes 's, 't, 're, 've, 'm, 'll and 'd; runs of letters, of
// numbers, or of characters that are neither whitespace, letters nor numbers,
// each optionally preceded by one space; and a whitespace run that is not
// followed by a non-whitespace character.
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
bool split_special_tokens_ = false;
std::unordered_map<int, std::string> byte_encoder_;
std::unordered_map<std::string, int> byte_decoder_;
Expand Down

0 comments on commit 5be4be3

Please sign in to comment.