diff --git a/README.md b/README.md
index 7c784e19..05a1b529 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
| Model | CPU <br> FP32 | CPU <br> INT4 | Hexagon NPU <br> INT8 |
|-----------------------------------------------------------------------------|------|-----|----------------------------|
-| [LLaMA-1/2 7B](https://github.com/facebookresearch/llama) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | |
+| [LLaMA 2 7B](https://github.com/facebookresearch/llama) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | |
| [Alpaca 7B](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | [✔️](https://huggingface.co/mllmTeam/chinese-alpaca-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/chinese-alpaca-7b-mllm/tree/main) | |
| [TinyLLaMA 1.1B](https://github.com/jzhang38/TinyLlama) | [✔️](https://huggingface.co/mllmTeam/tinyllama-1.1b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/tinyllama-1.1b-mllm/tree/main) | |
| [Fuyu 8B](https://www.adept.ai/blog/fuyu-8b) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) | |
@@ -92,16 +92,18 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
| [ImageBind](https://github.com/facebookresearch/ImageBind) (3 modalities) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) | |
| [LLaVA 7B](https://github.com/haotian-liu/LLaVA) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | |
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | |
-| [Qwen 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | |
-| [Qwen 1.8B Chat](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) |
+| [Qwen 1.5 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | |
+| [Qwen 1.5 1.8B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) |
+| [Qwen 2.5 1.5B](https://github.com/QwenLM/Qwen2.5) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | |
-| [StableLM 1.6B](https://github.com/Stability-AI/StableLM) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | |
+| [StableLM 2 1.6B](https://github.com/Stability-AI/StableLM) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | |
| [OPT 1.3B](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | |
-| [Phi-3-mini 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | |
+| [Phi 3 mini 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | |
| [MiniCPM 2B](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | |
| [SmolLM 1.7B](https://huggingface.co/HuggingFaceTB/SmolLM-1.7B-Instruct) | [✔️](https://huggingface.co/mllmTeam/smollm-1.7b-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/smollm-1.7b-instruct-mllm/tree/main) | |
-
+| [DCLM 1B](https://huggingface.co/TRI-ML/DCLM-1B) | [✔️](https://huggingface.co/mllmTeam/dclm-1b-mllm/tree/main)| [✔️](https://huggingface.co/mllmTeam/dclm-1b-mllm/tree/main)| |
+| [OpenELM 1.1B](https://github.com/apple/corenet/tree/main/projects/openelm) | [✔️](https://huggingface.co/mllmTeam/openelm-1.1b-mllm/tree/main)| [✔️](https://huggingface.co/mllmTeam/openelm-1.1b-mllm/tree/main)| |
## Quick Start
### Get the Code
@@ -295,7 +297,7 @@ cd ./bin
```bash
cd ./bin
-./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama_vocab.mllm
+./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama2_vocab.mllm
```
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b5912444..d732bd60 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -528,8 +528,8 @@ else ()
target_link_libraries(demo_smollm PUBLIC MLLM_CPU -fopenmp)
endif ()
-# add_executable(demo_openelm
-# ${PROJECT_SOURCE_DIR}/examples/demo_openelm.cpp
+# add_executable(demo_phonellm
+# ${PROJECT_SOURCE_DIR}/examples/demo_phonellm.cpp
# ${DIR_SRC_CPU}
# ${DIR_SRC_MEM_MANAGER}
# ${DIR_SRC_EXP}
@@ -541,32 +541,52 @@ endif ()
# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
# )
# if (MLLM_OPENMP_STATIC)
-# target_compile_options(demo_openelm PRIVATE -fopenmp)
-# target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+# target_compile_options(demo_phonellm PRIVATE -fopenmp)
+# target_link_libraries(demo_phonellm PUBLIC MLLM_CPU -fopenmp -static-openmp)
# else ()
-# target_compile_options(demo_openelm PRIVATE -fopenmp)
-# target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp)
+# target_compile_options(demo_phonellm PRIVATE -fopenmp)
+# target_link_libraries(demo_phonellm PUBLIC MLLM_CPU -fopenmp)
# endif ()
-# add_executable(demo_dclm
-# ${PROJECT_SOURCE_DIR}/examples/demo_dclm.cpp
-# ${DIR_SRC_CPU}
-# ${DIR_SRC_MEM_MANAGER}
-# ${DIR_SRC_EXP}
-# ${DIR_SRC}
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
-# )
-# if (MLLM_OPENMP_STATIC)
-# target_compile_options(demo_dclm PRIVATE -fopenmp)
-# target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp -static-openmp)
-# else ()
-# target_compile_options(demo_dclm PRIVATE -fopenmp)
-# target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp)
-# endif ()
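+# demo_openelm: OpenELM demo executable (CPU backend)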
+add_executable(demo_openelm
+ ${PROJECT_SOURCE_DIR}/examples/demo_openelm.cpp
+ ${DIR_SRC_CPU}
+ ${DIR_SRC_MEM_MANAGER}
+ ${DIR_SRC_EXP}
+ ${DIR_SRC}
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
+ ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
+)
+if (MLLM_OPENMP_STATIC)
+ target_compile_options(demo_openelm PRIVATE -fopenmp)
+ target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+else ()
+ target_compile_options(demo_openelm PRIVATE -fopenmp)
+ target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp)
+endif ()
+
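+# demo_dclm: DCLM demo executable (CPU backend)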
+add_executable(demo_dclm
+ ${PROJECT_SOURCE_DIR}/examples/demo_dclm.cpp
+ ${DIR_SRC_CPU}
+ ${DIR_SRC_MEM_MANAGER}
+ ${DIR_SRC_EXP}
+ ${DIR_SRC}
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
+ ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
+ ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
+)
+if (MLLM_OPENMP_STATIC)
+ target_compile_options(demo_dclm PRIVATE -fopenmp)
+ target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+else ()
+ target_compile_options(demo_dclm PRIVATE -fopenmp)
+ target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp)
+endif ()
add_executable(benchmark_llm
${PROJECT_SOURCE_DIR}/examples/benchmark.cpp
diff --git a/examples/demo_clip.cpp b/examples/demo_clip.cpp
index 3c66cb17..eee98e7d 100644
--- a/examples/demo_clip.cpp
+++ b/examples/demo_clip.cpp
@@ -1,5 +1,4 @@
#include
-#include
#include "cmdline.h"
#include "models/clip/modeling_clip.hpp"
#include "models/clip/processing_clip.hpp"
diff --git a/examples/demo_dclm.cpp b/examples/demo_dclm.cpp
new file mode 100644
index 00000000..7e5f4963
--- /dev/null
+++ b/examples/demo_dclm.cpp
@@ -0,0 +1,59 @@
+/**
+ * @file demo_dclm.cpp
+ * @author chenghua Wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-09-26
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#include "cmdline.h"
+#include "models/dclm/configuration_dclm.hpp"
+#include "models/dclm/modeling_dclm.hpp"
+#include "models/dclm/tokenization_dclm.hpp"
+#include "processor/PostProcess.hpp"
+
+using namespace mllm;
+
+int main(int argc, char **argv) {
+ std::iostream::sync_with_stdio(false);
+
+ cmdline::parser cmdParser;
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/dclm_vocab.mllm");
+    cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/dclm_merges.txt");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/dclm-1b-fp32.mllm");
+    cmdParser.add<string>("billion", 'b', "[1B]", false, "1B");
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
+ cmdParser.parse_check(argc, argv);
+
+    string vocab_path = cmdParser.get<string>("vocab");
+    string model_path = cmdParser.get<string>("model");
+    string merge_path = cmdParser.get<string>("merge");
+    string model_billion = cmdParser.get<string>("billion");
+    int tokens_limit = cmdParser.get<int>("limits");
+    CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+
+ auto tokenizer = DCLMTokenizer(vocab_path, merge_path);
+ DCLMConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
+ auto model = DCLM(config);
+ model.load(model_path);
+
+    vector<string> in_strs = {
+ "Machine learning is",
+ };
+
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto in_str = in_strs[i];
+ std::cout << in_str << std::flush;
+ auto input_tensor = tokenizer.tokenize(in_str);
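+        // autoregressive decoding: detokenize the newest token, print it, and feed it back via chatPostProcessing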
+ for (int step = 0; step < 100; step++) {
+ auto result = model({input_tensor});
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ std::cout << output_string << std::flush;
+ chatPostProcessing(out_token, input_tensor, {});
+ }
+ printf("\n");
+ }
+}
\ No newline at end of file
diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index efd63c0c..6380d0c0 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -12,7 +12,7 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
@@ -76,13 +76,10 @@ int main(int argc, char **argv) {
{(int)(32 * ratio), (int)(11008 * ratio)} // 31
};
auto result = model({input_tensor}, activate_dims);
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 2) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_fuyu.cpp b/examples/demo_fuyu.cpp
index 781d717a..31c555d3 100644
--- a/examples/demo_fuyu.cpp
+++ b/examples/demo_fuyu.cpp
@@ -51,10 +51,9 @@ int main(int argc, char **argv) {
auto outputs = processor.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
- if (out_token == 71013) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [end, string] = processor.postprocess(out_string);
+ if (!end) { break; }
+ std::cout << string << std::flush;
chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1], &input_tensors[2]});
}
printf("\n");
diff --git a/examples/demo_gemma.cpp b/examples/demo_gemma.cpp
index d3edb01e..18e0a885 100644
--- a/examples/demo_gemma.cpp
+++ b/examples/demo_gemma.cpp
@@ -42,16 +42,15 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == tokenizer.eos_id && step != 0) break;
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_llama.cpp b/examples/demo_llama.cpp
index f4d6f0e0..2de68a35 100644
--- a/examples/demo_llama.cpp
+++ b/examples/demo_llama.cpp
@@ -12,7 +12,7 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -30,24 +30,21 @@ int main(int argc, char **argv) {
model.load(model_path);
vector in_strs = {
- " Hello, who are you?",
- " What can you do?",
+ "Hello, who are you?",
+ "What can you do?",
"Please introduce Beijing University of Posts and Telecommunications."};
for (int i = 0; i < in_strs.size(); ++i) {
- auto in_str = in_strs[i];
- auto input_tensor = tokenizer.tokenize(in_str, i);
- std::cout << "[Q] " << in_str << std::endl;
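+        // wrap the raw prompt in the tokenizer's chat template before tokenizing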
+ auto in_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(in_str);
+ std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 2) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_llava.cpp b/examples/demo_llava.cpp
index e3d0c7f1..9376e759 100644
--- a/examples/demo_llava.cpp
+++ b/examples/demo_llava.cpp
@@ -48,10 +48,12 @@ int main(int argc, char **argv) {
auto outputs = processor.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
- if (out_token == 2) {
+ auto [isOk, print_string] = processor.postprocess(out_string);
+ if (isOk) {
+ std::cout << print_string << std::flush;
+ } else {
break;
}
- std::cout << out_string << std::flush;
chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1]});
}
printf("\n");
diff --git a/examples/demo_minicpm.cpp b/examples/demo_minicpm.cpp
index 06367120..849b4af1 100644
--- a/examples/demo_minicpm.cpp
+++ b/examples/demo_minicpm.cpp
@@ -30,34 +30,17 @@ int main(int argc, char **argv) {
"Please introduce Beijing University of Posts and Telecommunications.",
};
- string system_prompt_start = tokenizer.token_user_o;
- string system_prompt_end = tokenizer.token_user_c;
-
- auto processOutput = [&](unsigned int id, std::string &text) -> std::pair {
- text = std::regex_replace(text, std::regex("▁"), " ");
- if (text == "<0x0A>") return {true, "\n"};
- if (text == "") return {false, ""};
- if (id == 2) return {false, ""};
- return {true, text};
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
- auto in_str_origin = in_strs[i];
- auto in_str = system_prompt_start + in_str_origin + system_prompt_end;
+ auto in_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(in_str);
- std::cout << "[Q] " << in_str << std::endl;
+ std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- auto [isOk, print_string] = processOutput(out_token, out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- break;
- }
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_mistral.cpp b/examples/demo_mistral.cpp
index 0981688a..345ddb23 100644
--- a/examples/demo_mistral.cpp
+++ b/examples/demo_mistral.cpp
@@ -35,34 +35,23 @@ int main(int argc, char **argv) {
model.load(model_path);
vector in_strs = {
- " Hello, who are you?",
- " What can you do?",
+ "Hello, who are you?",
+ "What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};
- auto processOutput = [&](std::string &text) -> std::pair {
- text = std::regex_replace(text, std::regex("▁"), " ");
- if (text == "<0x0A>") return {true, "\n"};
- if (text == "") return {false, ""};
- return {true, text};
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ in_str = tokenizer.apply_chat_template(in_str);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- auto [isOk, print_string] = processOutput(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- break;
- }
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_openelm.cpp b/examples/demo_openelm.cpp
new file mode 100644
index 00000000..2f23a2f0
--- /dev/null
+++ b/examples/demo_openelm.cpp
@@ -0,0 +1,65 @@
+/**
+ * @file demo_openelm.cpp
+ * @author chenghua.wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-09-25
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#include "cmdline.h"
+#include "models/openelm/configuration_openelm.hpp"
+#include "models/openelm/modeling_openelm.hpp"
+#include "models/llama/tokenization_llama.hpp"
+
+using namespace mllm;
+
+int main(int argc, char **argv) {
+ std::iostream::sync_with_stdio(false);
+
+ cmdline::parser cmdParser;
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_hf_vocab.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/openelm-1.1b-Instruct-q4_k.mllm");
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
+ cmdParser.parse_check(argc, argv);
+
+    string vocab_path = cmdParser.get<string>("vocab");
+    string model_path = cmdParser.get<string>("model");
+    int tokens_limit = cmdParser.get<int>("limits");
+    CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+
+ auto tokenizer = LLaMATokenizer(vocab_path);
+ OpenELMConfig config(tokens_limit, "1.1B", RoPEType::HFHUBROPE);
+ auto model = OpenElMModel(config);
+ model.load(model_path);
+
+    vector<string> in_strs = {
+ "Hello, who are you?",
+ "What can you do?",
+ "Please introduce Beijing University of Posts and Telecommunications.",
+ };
+
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
+ std::cout << "[Q] " << in_strs[i] << std::endl;
+ std::cout << "[A] " << std::flush;
+
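+        // sampling options for Module::generate(): up to 100 new tokens, top-k sampling (k=50) at temperature 0.3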
+ LlmTextGeneratorOpts opt{
+ .max_new_tokens = 100,
+ .do_sample = true,
+ .temperature = 0.3F,
+ .top_k = 50,
+ .top_p = 0.F,
+ };
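+        // the callback receives each generated token id; returning false stops generation early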
+ model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
+ auto out_string = tokenizer.detokenize({out_token});
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
+ return true;
+ });
+ std::cout << "\n";
+ }
+}
\ No newline at end of file
diff --git a/examples/demo_opt.cpp b/examples/demo_opt.cpp
index e13935dd..cc3efe01 100644
--- a/examples/demo_opt.cpp
+++ b/examples/demo_opt.cpp
@@ -37,26 +37,15 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
std::cout << "[Q] " << in_str << std::endl;
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 50; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 2) {
- break;
- }
- size_t pos = 0;
- while ((pos = out_string.find("Ċ", pos)) != std::string::npos) {
- out_string.replace(pos, 2, " ");
- }
- pos = 0;
- while ((pos = out_string.find("Ġ", pos)) != std::string::npos) {
- out_string.replace(pos, 2, " ");
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_phi3.cpp b/examples/demo_phi3.cpp
index b11554a3..c0821bac 100644
--- a/examples/demo_phi3.cpp
+++ b/examples/demo_phi3.cpp
@@ -25,9 +25,6 @@ int main(int argc, char **argv) {
auto model = Phi3Model(config);
model.load(model_path);
- string system_prompt_start = "<|user|>\n";
- string system_prompt_end = " <|end|>\n<|assistant|>";
-
vector in_strs = {
"who are you?",
"What can you do?",
@@ -35,19 +32,16 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str_origin = in_strs[i];
- auto in_str = system_prompt_start + in_str_origin + system_prompt_end;
- auto input_tensor = tokenizer.tokenize(in_str, i);
- std::cout << "[Q] " << in_str << std::endl;
+ auto in_str = tokenizer.apply_chat_template(in_str_origin);
+ auto input_tensor = tokenizer.tokenize(in_str);
+ std::cout << "[Q] " << in_str_origin << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == tokenizer.end_id && step != 0) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_qnn.cpp b/examples/demo_qnn.cpp
index 92ad1e24..c42a202f 100644
--- a/examples/demo_qnn.cpp
+++ b/examples/demo_qnn.cpp
@@ -31,24 +31,9 @@ int main(int argc, char **argv) {
" Give me a short introduction to large language model.",
};
- auto processOutput = [&](std::string &text) -> std::pair {
- if (text == "<|im_start|>" || text == "<|im_end|>" || text == "") return {true, ""};
- if (text == "<|endoftext|>") return {false, ""};
- return {true, text};
- };
-
- auto addSystemPrompt = [](const std::string &text) -> std::string {
- std::string ret;
- std::string pre = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
- ret = pre + text;
- std::string end = "<|im_end|>\n<|im_start|>assistant\n";
- ret = ret + end;
- return ret;
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
- auto input_str = addSystemPrompt(in_strs[i]);
- auto input_tensor = tokenizer.tokenize(input_str, i);
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
@@ -61,12 +46,9 @@ int main(int argc, char **argv) {
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
- auto [isOk, print_string] = processOutput(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- return false;
- }
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
return true;
});
std::cout << "FINISH\n";
diff --git a/examples/demo_qwen.cpp b/examples/demo_qwen.cpp
index 909b8b85..49acdcf0 100644
--- a/examples/demo_qwen.cpp
+++ b/examples/demo_qwen.cpp
@@ -21,7 +21,7 @@ int main(int argc, char **argv) {
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
- cmdParser.add("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
+ cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B |]", false, "1.8B");
cmdParser.add("limits", 'l', "max KV cache size", false, 400);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -39,30 +39,13 @@ int main(int argc, char **argv) {
model.load(model_path);
vector in_strs = {
- " Hello, who are you?",
- " What can you do?",
+ "Hello, who are you?",
+ "What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};
-
- auto processOutput = [&](std::string &text) -> std::pair {
- if (text == "<|im_start|>" || text == "<|im_end|>" || text == "") return {true, ""};
- if (text == "<|endoftext|>") return {false, ""};
- return {true, text};
- };
-
- auto addSystemPrompt = [](const std::string &text) -> std::string {
- std::string ret;
- std::string pre =
- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
- ret = pre + text;
- std::string end = "<|im_end|>\n<|im_start|>assistant\n";
- ret = ret + end;
- return ret;
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
- auto input_str = addSystemPrompt(in_strs[i]);
- auto input_tensor = tokenizer.tokenize(input_str, i);
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
@@ -75,12 +58,9 @@ int main(int argc, char **argv) {
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
- auto [isOk, print_string] = processOutput(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- return false;
- }
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
return true;
});
std::cout << "\n";
diff --git a/examples/demo_smollm.cpp b/examples/demo_smollm.cpp
index a2ca94f3..5233f14d 100644
--- a/examples/demo_smollm.cpp
+++ b/examples/demo_smollm.cpp
@@ -38,30 +38,14 @@ int main(int argc, char **argv) {
model.load(model_path);
vector in_strs = {
- " Hello, who are you?",
- " What can you do?",
+ "Hello, who are you?",
+ "What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};
- auto processOutput = [&](std::string &text) -> std::pair {
- if (text == "<|im_start|>" || text == "<|im_end|>" || text == "") return {true, ""};
- if (text == "<|endoftext|>") return {false, ""};
- return {true, text};
- };
-
- auto addSystemPrompt = [](const std::string &text) -> std::string {
- std::string ret;
- std::string pre =
- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
- ret = pre + text;
- std::string end = "<|im_end|>\n<|im_start|>assistant\n";
- ret = ret + end;
- return ret;
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
- auto input_str = addSystemPrompt(in_strs[i]);
- auto input_tensor = tokenizer.tokenize(input_str, i);
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
@@ -74,12 +58,9 @@ int main(int argc, char **argv) {
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
- auto [isOk, print_string] = processOutput(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- return false;
- }
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
return true;
});
std::cout << "\n";
diff --git a/examples/demo_sparse_llama.cpp b/examples/demo_sparse_llama.cpp
index 939a23f0..74b6a404 100644
--- a/examples/demo_sparse_llama.cpp
+++ b/examples/demo_sparse_llama.cpp
@@ -12,7 +12,7 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/relu_llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/ReLULlama_sparse_q4_k.mllm");
    // cmdParser.add<string>("predictor", 'p', "specify mllm model predictor path", false, "../models/ReLULlama_predictor.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 600);
@@ -41,18 +41,16 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ // in_str = tokenizer.apply_chat_template(in_str);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 2) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_stablelm.cpp b/examples/demo_stablelm.cpp
index b1b281d3..11efcb9e 100644
--- a/examples/demo_stablelm.cpp
+++ b/examples/demo_stablelm.cpp
@@ -23,42 +23,27 @@ int main(int argc, char **argv) {
auto tokenizer = StableLMTokenizer(vocab_path, merge_path);
- string system_prompt_start = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
- string system_prompt_end = "<|im_end|>\n<|im_start|>assistant\n";
-
StableLMConfig config(tokens_limit, "1.6B", HFHUBROPE);
auto model = StableLMModel(config);
model.load(model_path);
vector in_strs = {
- " Hello, who are you?",
- " What can you do?",
+ "Hello, who are you?",
+ "What can you do?",
"Please introduce Beijing University of Posts and Telecommunications."};
for (int i = 0; i < in_strs.size(); ++i) {
const auto &in_str_origin = in_strs[i];
- auto in_str = system_prompt_start + in_str_origin + system_prompt_end;
+ auto in_str = tokenizer.apply_chat_template(in_str_origin);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str_origin << std::endl;
- auto input_tensor = tokenizer.tokenize(in_str, i);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 100278) {
- break;
- }
- size_t pos = 0;
- while ((pos = out_string.find("Ċ", pos)) != std::string::npos) {
- out_string.replace(pos, 2, " ");
- }
- pos = 0;
- while ((pos = out_string.find("Ġ", pos)) != std::string::npos) {
- out_string.replace(pos, 2, " ");
- }
-
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_tinyllama.cpp b/examples/demo_tinyllama.cpp
index 66d25880..6c5b2e05 100644
--- a/examples/demo_tinyllama.cpp
+++ b/examples/demo_tinyllama.cpp
@@ -24,33 +24,28 @@ int main(int argc, char **argv) {
CPUBackend::cpu_threads = cmdParser.get("thread");
auto tokenizer = LLaMATokenizer(vocab_path);
+ string system_prompt_start = " You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.<|USER|>";
+ string system_prompt_end = "<|ASSISTANT|>";
+ tokenizer.set_chat_template(system_prompt_start, system_prompt_end);
TinyLLaMAConfig config(tokens_limit, "1.5B", HFHUBROPE);
auto model = TinyLLaMAModel(config);
model.load(model_path);
- string system_prompt_start = " You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.<|USER|>";
- string system_prompt_end = "<|ASSISTANT|>";
-
vector in_strs = {
"Hello, who are you?",
"Please introduce Beijing University of Posts and Telecommunications."};
for (int i = 0; i < in_strs.size(); ++i) {
- auto in_str_origin = in_strs[i];
- auto in_str = system_prompt_start + in_str_origin + system_prompt_end;
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ auto in_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- if (out_token == 2) {
- break;
- }
- std::cout << out_string << std::flush;
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+            if (!not_end) { break; }
+            std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/demo_yi.cpp b/examples/demo_yi.cpp
index 4aa3e945..ab6457a7 100644
--- a/examples/demo_yi.cpp
+++ b/examples/demo_yi.cpp
@@ -38,28 +38,17 @@ int main(int argc, char **argv) {
"请介绍北京邮电大学,推荐同学们报考。",
};
- auto processOutput = [&](std::string &text) -> std::pair {
- text = std::regex_replace(text, std::regex("▁"), " ");
- if (text == "<|endoftext|>" || text == "<|im_end|>") return {false, ""};
- return {true, text};
- };
-
for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
std::cout << "[Q] " << in_str << std::endl;
- auto input_tensor = tokenizer.tokenize(in_str, i);
+ auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1000; step++) {
auto result = model({input_tensor});
- auto outputs = tokenizer.detokenize(result[0]);
- auto out_string = outputs.first;
- auto out_token = outputs.second;
- auto [isOk, print_string] = processOutput(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- break;
- }
+ auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
diff --git a/examples/main_llama.cpp b/examples/main_llama.cpp
index 2d847fb8..ede99ecb 100644
--- a/examples/main_llama.cpp
+++ b/examples/main_llama.cpp
@@ -82,7 +82,7 @@ void llama(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hi
}
int main(int argc, char **argv) {
cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add("thread", 't', "num of threads", false, 4);
diff --git a/include/Types.hpp b/include/Types.hpp
index 8099bc6d..c0a4cebb 100644
--- a/include/Types.hpp
+++ b/include/Types.hpp
@@ -106,6 +106,7 @@ enum Chl {
HD = 113, // only use for split attn.in_proj
D_HD = 313, // only use for split attn.in_proj
+ D_DH = 331, // only use for split attn.in_proj
CHANNLE = 1,
TIME = 2,
@@ -116,6 +117,12 @@ enum Chl {
};
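+// how the fused attention in_proj output is split into Q/K/V; values alias the Chl split axes above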
+enum AttnQKVSplitType {
+ SPLIT_NONE = 0,
+ SPLIT_HD = Chl::HD,
+ SPLIT_D_HD = Chl::D_HD,
+};
+
#define ANYDIM -198098
enum PaddingType {
diff --git a/scripts/run_llama.sh b/scripts/run_llama.sh
index 299b4028..8db4a54a 100755
--- a/scripts/run_llama.sh
+++ b/scripts/run_llama.sh
@@ -4,7 +4,7 @@ adb shell mkdir /data/local/tmp/mllm
adb shell mkdir /data/local/tmp/mllm/bin
adb shell mkdir /data/local/tmp/mllm/models
adb shell mkdir /data/local/tmp/mllm/vocab
-adb push ../vocab/llama_vocab.mllm /data/local/tmp/mllm/vocab/
+adb push ../vocab/llama2_vocab.mllm /data/local/tmp/mllm/vocab/
#adb push ../bin-arm/main_llama /data/local/tmp/mllm/bin/
adb push ../bin-arm/demo_llama /data/local/tmp/mllm/bin/
adb push ../models/llama-2-7b-chat-q4_k.mllm /data/local/tmp/mllm/models/
diff --git a/src/Layer.cpp b/src/Layer.cpp
index 1b1a5d80..a057ab98 100644
--- a/src/Layer.cpp
+++ b/src/Layer.cpp
@@ -5,4 +5,5 @@
#include "Layer.hpp"
namespace mllm {
map<string, string> Layer::layername_2_tensorname;
+bool Layer::use_layername_2_tensorname = true;
}; // namespace mllm
\ No newline at end of file
diff --git a/src/Layer.hpp b/src/Layer.hpp
index 6d2a3d5b..ab9fe403 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -41,6 +41,7 @@ class Layer {
}
bool inited_loaded = false;
    static map<string, string> layername_2_tensorname;
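+    // when false, skip the layername_2_tensorname remapping and use each layer's own output-tensor name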
+ static bool use_layername_2_tensorname;
Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
@@ -83,6 +84,23 @@ class Layer {
}
*/
    vector<string> new_names;
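+        // repeatedly strip known suffixes from the first input's name, collecting every base tensor name not already listed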
+ bool can_break = true;
+ auto in_x_name = renameX_names[0];
+ while (can_break) {
+ can_break = false;
+ for (const auto &suffix : suffixs) {
+ if (in_x_name.rfind(suffix) == (in_x_name.size() - suffix.size())) {
+ const auto r_name = in_x_name.substr(0, in_x_name.size() - suffix.size());
+ if (std::find(renameX_names.begin(), renameX_names.end(), r_name) == renameX_names.end() && std::find(new_names.begin(), new_names.end(), r_name) == new_names.end()) {
+ new_names.push_back(r_name);
+ in_x_name = r_name;
+ can_break = true;
+ }
+ break;
+ }
+ }
+ }
+ /*
for (const auto &in_x_name : renameX_names) {
for (const auto &suffix : suffixs) {
if (in_x_name.rfind(suffix) == (in_x_name.size() - suffix.size())) {
@@ -94,6 +112,7 @@ class Layer {
}
}
}
+ */
renameX_names.insert(renameX_names.end(), new_names.begin(), new_names.end());
for (const auto x_name : renameX_names) {
auto name = name_X_to_num(x_name, saved_list_idx);
@@ -125,7 +144,6 @@ class Layer {
Module::runlistIdx = saved_list_idx;
bool do_init = false;
// set backend to current module device and try to create op
- // TODO: backend fallback
backend_ = Backend::global_backends[Module::tmp_device];
if (Module::doLoad || !inited_loaded) {
do_init = !inited_loaded;
@@ -147,15 +165,20 @@ class Layer {
}
}
for (const auto &layer_next_name : layer_next_names) {
- if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
- if (param_["type"] == KVCACHE) {
- layername_2_tensorname[layer_next_name] = layer_next_name;
- init_reset_KVCache(inputs[0].name());
- } else {
- layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+ string next_name;
+ if (use_layername_2_tensorname) {
+ if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+ if (param_["type"] == KVCACHE) {
+ layername_2_tensorname[layer_next_name] = layer_next_name;
+ init_reset_KVCache(inputs[0].name());
+ } else {
+ layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+ }
}
+ next_name = layername_2_tensorname[layer_next_name];
+ } else {
+ next_name = layer_next_name;
}
- auto next_name = layername_2_tensorname[layer_next_name];
if (Tensor::graphs.find(next_name) == Tensor::graphs.end()) {
    Tensor::graphs[next_name] = std::make_shared<Tensor>(backend_);
Tensor::graphs[next_name]->setName(next_name);
@@ -164,7 +187,12 @@ class Layer {
if (Module::doLoad) {
vector> output_result = {};
for (const auto &layer_next_name : layer_next_names) {
- auto next_name = layername_2_tensorname[layer_next_name];
+ string next_name;
+ if (use_layername_2_tensorname) {
+ next_name = layername_2_tensorname[layer_next_name];
+ } else {
+ next_name = layer_next_name;
+ }
output_result.push_back(*Tensor::graphs[next_name]);
}
return output_result;
@@ -195,7 +223,12 @@ class Layer {
vector> output_tensors = {};
vector next_names = {};
for (const auto &layer_next_name : layer_next_names) {
- auto next_name = layername_2_tensorname[layer_next_name];
+ string next_name;
+ if (use_layername_2_tensorname) {
+ next_name = layername_2_tensorname[layer_next_name];
+ } else {
+ next_name = layer_next_name;
+ }
next_names.push_back(next_name);
output_tensors.push_back(Tensor::graphs[next_name]);
}
@@ -224,7 +257,12 @@ class Layer {
#endif
vector> output_result = {};
for (const auto &layer_next_name : layer_next_names) {
- auto next_name = layername_2_tensorname[layer_next_name];
+ string next_name;
+ if (use_layername_2_tensorname) {
+ next_name = layername_2_tensorname[layer_next_name];
+ } else {
+ next_name = layer_next_name;
+ }
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData(layer_next_name);
#endif
diff --git a/src/Module.hpp b/src/Module.hpp
index f768c656..bfaf86a8 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -242,23 +242,30 @@ class Module {
Op::noLoadWeightsDtype() = dtype;
}
- void profiling(string name = "") {
+    vector<double> profiling(string name = "") {
+        vector<double> output;
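+        // collects [load time (s), prefilling speed, decoding speed] or [load time (s), mean latency (s)] for the caller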
// printf("\n");
std::cout << "===========================================" << std::endl;
if (!name.empty()) {
std::cout << " " << name << std::endl;
std::cout << "-------------------------------------------" << std::endl;
}
+ double load_time_s = load_time_ / 1000.0F;
std::cout << " Load time: " << load_time_ / 1000.0F << " s" << std::endl;
if (inference_times_.size() > 1 && decoding_token_size_ != prefilling_token_size_) {
- std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
+ double prefile_speed = 1000 * prefilling_token_size_ / inference_times_[0];
+ std::cout << " Prefilling speed: " << prefile_speed << " tokens/s" << std::endl;
double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
- std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
+ double decoding_speed = 1000 / mean_decoding_time;
+ std::cout << " Decoding speed: " << decoding_speed << " tokens/s" << std::endl;
+ output = {load_time_s, prefile_speed, decoding_speed};
} else {
double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
double mean_time = sum_time / (inference_times_.size());
+ double inference_time_s = mean_time / 1000.0F;
std::cout << " Inference latency: " << mean_time / 1000.0F << " s" << std::endl;
+ output = {load_time_s, inference_time_s};
}
// double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
// std::cout< generate(Tensor &input_ids, const LlmTextGeneratorOpts &opt, int end_token = -1) {
+        auto chatPostProcessing = [](unsigned token_idx, Tensor &tokens_tensor, const vector<Tensor *> &clean_tensors) {
+ tokens_tensor.reshape(1, 1, 1, 1);
+ tokens_tensor.alloc();
+ tokens_tensor.setDataAt(0, 0, 0, 0, token_idx);
+
+ for (auto tensor : clean_tensors) {
+ tensor->reshape(0, 0, 0, 0);
+ tensor->alloc();
+ }
+ };
+
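+        // pick (or reuse) the text generator that matches the requested decoding strategy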
+ if (!opt.do_sample) {
+            // no sampling requested: use greedy search
+ if (!text_generator_ || text_generator_->type() != LLmTextGeneratorType::kGreedySearch)
+ text_generator_ = std::make_shared(LLmTextGeneratorType::kGreedySearch, opt);
+ } else if (opt.do_sample && !opt.top_k && opt.top_p != 0.F) {
+            // top-p (nucleus) sampling
+ if (!text_generator_ || text_generator_->type() != LLmTextGeneratorType::kToppSampling)
+ text_generator_ = std::make_shared(LLmTextGeneratorType::kToppSampling, opt);
+ } else if (opt.do_sample && opt.top_k) {
+            // top-k sampling
+ if (!text_generator_ || text_generator_->type() != LLmTextGeneratorType::kTopkSampling)
+ text_generator_ = std::make_shared(LLmTextGeneratorType::kTopkSampling, opt);
+ }
+        vector<unsigned> result;
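+        // decode token by token up to max_new_tokens, stopping early when the optional end_token appears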
+ for (int step = 0; step < opt.max_new_tokens; ++step) {
+ auto _out = (*this)({input_ids});
+ auto out_token = text_generator_->generate(_out[0]);
+ result.push_back(out_token);
+ if (end_token != -1 && out_token == end_token) break;
+ chatPostProcessing(out_token, input_ids, {});
+ }
+ return result;
+ }
};
} // namespace mllm
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index f29ceeea..7e35c083 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -317,7 +317,7 @@ Tensor &Tensor::range(int start, int end) {
}
vector> Tensor::split(Tensor &input, std::vector each_dims,
- Chl split_dim, int head_size) {
+ Chl split_dim, int same_dim_size) {
vector next_names;
std::vector args;
for (int i = 0; i < each_dims.size(); ++i) {
@@ -327,7 +327,7 @@ vector> Tensor::split(Tensor &input, std::vector<
next_names.push_back(input.name() + ".split-" + std::to_string(i));
}
args.push_back(split_dim);
- args.push_back(head_size);
+ args.push_back(same_dim_size);
return getStaticFunc(next_names, FUNC_SPLIT, args, {Tensor::graphs[input.name()].get()});
}
diff --git a/src/Tensor.hpp b/src/Tensor.hpp
index 6f541d02..6ed703b7 100644
--- a/src/Tensor.hpp
+++ b/src/Tensor.hpp
@@ -825,7 +825,10 @@ class Tensor {
Tensor &norm(int L_n);
Tensor &where(float value, Chl axis);
static Tensor &range(int start, int end);
- static vector> split(Tensor &input, std::vector each_dims, Chl split_dim, int head_size = -1);
+ static vector> split(Tensor &input, std::vector each_dims, Chl split_dim, int same_dim_size = -1);
+ vector> split(std::vector each_dims, Chl split_dim, int same_dim_size = -1) {
+ return split(*this, each_dims, split_dim, same_dim_size);
+ }
/* Functions used for ChildTensor:
* - deepCopyFrom
@@ -1048,6 +1051,14 @@ class Tensor {
}
break;
}
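+        // D_DH: aggregated tensors are laid out head-major, so track the running head count of each child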
+ case D_DH: {
+ auto sum = 0;
+ for (auto &t : ts) {
+ sum += t->head();
+ aggregated_dims_.push_back(sum);
+ }
+ break;
+ }
default:
break;
}
@@ -1080,7 +1091,7 @@ class Tensor {
std::cout << "tensor should be transfered across backend" << std::endl;
return inputs;
};
- static vector toCPU(vector inputs){
+ static vector toCPU(vector inputs) {
return toDevice(inputs, MLLM_CPU);
}
static vector toQNN(vector inputs) {
@@ -1659,6 +1670,23 @@ class Tensor {
}
break;
}
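+        // D_DH: convert a flat dimension index into (child tensor, local head, local dim) coordinates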
+ case D_DH: {
+ auto orin_d = d;
+ int dim_size = aggregated_tensors_[0]->dimension();
+ int total_head_idx = d / dim_size;
+ d = d % dim_size;
+ int old_head_idx = 0;
+ for (int a = 0; a < aggregated_dims_.size(); ++a) {
+ old_head_idx += aggregated_dims_[a];
+ if (total_head_idx < old_head_idx) {
+ tensor_id = a;
+ old_head_idx -= aggregated_dims_[a];
+ break;
+ }
+ }
+ h = total_head_idx - old_head_idx;
+ break;
+ }
default:
break;
}
diff --git a/src/backends/cpu/CPUReLU.cpp b/src/backends/cpu/CPUReLU.cpp
index ee70cba3..d798210f 100644
--- a/src/backends/cpu/CPUReLU.cpp
+++ b/src/backends/cpu/CPUReLU.cpp
@@ -7,7 +7,8 @@
#include
namespace mllm {
-CPUReLU::CPUReLU(Backend *bn, string opName, int threadCount):thread_count(threadCount), Op(bn, std::move(opName)) {
+CPUReLU::CPUReLU(Backend *bn, string opName, int threadCount) :
+ thread_count(threadCount), Op(bn, std::move(opName)) {
}
ErrorCode CPUReLU::reshape(vector> inputs, vector> outputs) {
assert(inputs.size() == 1);
@@ -23,8 +24,8 @@ ErrorCode CPUReLU::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
    int head = input->head();
int seq = input->sequence();
int dim = input->dimension();
-#pragma omp parallel for collapse(4)
-    for (int b = 0; b < batch; b++) {
+#pragma omp parallel for collapse(4) num_threads(thread_count)
+    for (int b = 0; b < batch; b++) {
diff --git a/src/models/clip/processing_clip.hpp b/src/models/clip/processing_clip.hpp
--- a/src/models/clip/processing_clip.hpp
+++ b/src/models/clip/processing_clip.hpp
-#include
-#include "processor/ClipPreProcess.hpp"
+#include "processor/PreProcess.hpp"
#include "tokenizers/BPE/Bpe.hpp"
+#include
+#ifndef STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
+#define STB_IMAGE_IMPLEMENTATION
+#endif
+#include "stb/stb_image.h"
#include
using namespace mllm;
-class ClipProcessor final {
+class ClipProcessor : public PreProcessor {
+public:
+ BPETokenizer *tokenizer;
+    vector<vector<token_id_t>> input_ids_;
+    vector<vector<vector<vector<float>>>> pixel_values_;
+
    Tensor img2Tensor(vector<vector<vector<float>>> img, string name = "input", BackendType type = MLLM_CPU) {
int channel = img.size();
int height = img[0].size();
@@ -32,24 +42,23 @@ class ClipProcessor final {
}
return tensor1;
}
-    vector<float> softmax(const vector<float>& scores) {
+    vector<float> softmax(const vector<float> &scores) {
vector exps;
float max_val = *max_element(scores.begin(), scores.end());
for (float score : scores) {
exps.push_back(exp(score - max_val));
}
float sum_exps = accumulate(exps.begin(), exps.end(), 0.0f);
- for (float& exp : exps) {
+ for (float &exp : exps) {
exp /= sum_exps;
}
return exps;
}
- BPETokenizer *tokenizer;
- ClipPreProcessor *clip_processor;
-
public:
- explicit ClipProcessor(const string &vocab_path, const string &merges_path) {
+ explicit ClipProcessor(const string &vocab_path, const string &merges_path, int height = 224, int width = 224, bool add_special_tokens = true) :
+ PreProcessor(height, width, false, true, true,
+ true, {0.48145466, 0.4578275, 0.40821073}, {0.26862954, 0.26130258, 0.27577711}) {
Module::initBackend(MLLM_CPU);
tokenizer = new BPETokenizer(vocab_path);
std::unordered_map merge_rank;
@@ -67,22 +76,104 @@ class ClipProcessor final {
rank++;
}
tokenizer->setMergeRank(merge_rank);
- tokenizer->setSpecialToken("<|startoftext|>", "<|endoftext|>");
- clip_processor = new ClipPreProcessor(tokenizer);
+ if (add_special_tokens) {
+ tokenizer->setSpecialToken("<|startoftext|>", "<|endoftext|>");
+ }
+ }
+
+ void Process(const std::string &text) override {
+        auto token_id = vector<token_id_t>();
+ tokenizer_->tokenize(text, token_id, false);
+ input_ids_.push_back(token_id);
+ }
+
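+
+    // decode raw image bytes, rescale to [0,1] when enabled, resize, and store each image as CHW float planes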
+    void PreProcessImages(const std::vector<uint8_t *> &images, const std::vector<size_t> &image_length) override {
+        auto imageinfos = vector<ImageInfo>();
+ for (int i = 0; i < images.size(); i++) {
+ int width, height, channels;
+ auto data = stbi_load_from_memory(images[i], image_length[i], &width, &height, &channels, 3);
+ if (data == nullptr) {
+ std::cerr << "Error: Failed to load image from memory." << std::endl;
+ exit(-1);
+ }
+ float *f32_data = nullptr;
+ if (do_rescale_) {
+ f32_data = PreProcessor::RescaleImage(data, scale_, width * height * channels);
+ stbi_image_free(data);
+ } else {
+ f32_data = PreProcessor::RescaleImage(data, 1.0F, width * height * channels);
+ stbi_image_free(data);
+ }
+ imageinfos.emplace_back(ImageInfo(f32_data, width, height, channels));
+ }
+ if (do_resize_) {
+ imageinfos = PreProcessor::ResizeImages(imageinfos, height_, width_, false, true, shortest);
+ }
+        // std::cout << imageinfos[0].width << " " << imageinfos[0].height << std::endl;
+        for (auto &imageinfo : imageinfos) {
+            auto pixel_values = vector<vector<vector<float>>>();
+ for (int k = 0; k < imageinfo.channels; k++) {
+ auto channel = vector>();
+ for (int i = 0; i < imageinfo.height; i++) {
+ auto row = vector();
+ for (int j = 0; j < imageinfo.width; j++) {
+ row.push_back(imageinfo.get_whc_pixel(i * imageinfo.width + j + k * imageinfo.width * imageinfo.height));
+ }
+ channel.push_back(row);
+ }
+ pixel_values.push_back(channel);
+ }
+
+ pixel_values_.push_back(pixel_values);
+ }
}
- std::array process(vector in_strs , string img_path, int hw = 224,
- string img_name = "input_vision", string text_name = "input_text", BackendType type = MLLM_CPU) {
+    void PreProcessImages(const std::vector<std::string> &images_path) override {
+ assert(height_ > 0 && width_ > 0);
+        auto image_data = std::vector<uint8_t *>();
+        auto image_length = std::vector<size_t>();
+ for (const auto &i : images_path) {
+ // read all file contents
+ std::ifstream file(i, std::ios::binary | std::ios::ate);
+ if (!file.is_open()) {
+ std::cerr << "Cannot open file: " << i << std::endl;
+ exit(-1);
+ }
+ auto size = file.tellg();
+ auto data = new uint8_t[size];
+ file.seekg(0, std::ios::beg);
+        file.read(reinterpret_cast<char *>(data), size);
+ file.close();
+ image_data.emplace_back(data);
+ image_length.emplace_back(size);
+ }
+ PreProcessImages(image_data, image_length);
+ }
+    void PreProcessImages(const std::vector<std::string> &images_path, int height, int width) {
+ height_ = height;
+ width_ = width;
+ PreProcessImages(images_path);
+ }
- // vector in_strs = {"a photo of a cat", "a photo of a dog"};
+    vector<Tensor> process(vector<string> in_strs, string img_path, int hw = 224,
+                           string img_name = "input_vision", string text_name = "input_text", BackendType type = MLLM_CPU) {
+ input_ids_.clear();
+ pixel_values_.clear();
auto tokens_ids = vector>();
for (auto in_str : in_strs) {
            vector<token_id_t> tokens_id = {};
tokenizer->tokenize(in_str, tokens_id, true, true, "");
tokens_ids.push_back(tokens_id);
}
- clip_processor->PreProcessImages({std::move(img_path)}, hw, hw);
- auto images = clip_processor->pixel_values_[0];
+ PreProcessImages({std::move(img_path)}, hw, hw);
+ auto images = pixel_values_[0];
return {Tokenizer::tokens2Input(tokens_ids), img2Tensor(images, std::move(img_name))};
}
@@ -92,10 +183,7 @@ class ClipProcessor final {
auto value = result.dataAt(i, 0, 0, 0);
scores.push_back(value);
}
- auto token_idx = softmax(scores);
- // for (auto prob : token_idx) {
- // std::cout << prob << " ";
- // }
+ auto token_idx = softmax(scores);
return token_idx;
}
};
diff --git a/src/models/dclm/configuration_dclm.hpp b/src/models/dclm/configuration_dclm.hpp
new file mode 100644
index 00000000..8839d46f
--- /dev/null
+++ b/src/models/dclm/configuration_dclm.hpp
@@ -0,0 +1,56 @@
+/**
+ * @file configuration_dclm.hpp
+ * @author chenghua Wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-09-26
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#pragma once
+
+#include "Types.hpp"
+#include "models/transformer/configuration_transformer.hpp"
+
+using namespace mllm;
+
+class DCLMNameConfig : public TransformerNameConfig {
+public:
+ /**
+ * @brief DCLM following the hugging face naming method
+ *
+ * @param type RoPEType
+ */
+ void init(RoPEType type = RoPEType::HFHUBROPE) {
+ // DCLM's parameter names differ from the other models';
+ // set the layer names manually.
+ }
+};
+
+struct DCLMConfig {
+ explicit DCLMConfig(int token_limit, const string billions = "1B", RoPEType type = RoPEType::HFHUBROPE) :
+ cache_limit(token_limit) {
+ names_config.init(type);
+ if (!(billions == "1B" || billions == "1b")) {
+ throw std::runtime_error("Unsupported model size");
+ }
+ RoPE_type = type;
+ };
+
+ int dim = 2048;
+ float moe_capacity_factor = 1.25;
+ bool moe_expert_model_parallelism = false;
+ float moe_freq = 0.f;
+ float moe_loss_weight = 0.1f;
+ int moe_top_k = 2;
+ bool moe_weight_parallelism = false;
+ int n_heads = 16;
+ int n_layers = 24;
+ float norm_eps = 1e-06;
+ int seq_len = 2048;
+ bool weight_tying = false;
+ int vocab_size = 50432;
+ int cache_limit;
+ RoPEType RoPE_type = RoPEType::HFHUBROPE;
+ DCLMNameConfig names_config;
+};
\ No newline at end of file
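
For orientation (not part of the patch above): a minimal sketch of how the `DCLMConfig` defined in this file might be constructed. The token limit and the use of the "1B" preset are illustrative assumptions, not values taken from the patch.

```cpp
// Sketch only: assumes configuration_dclm.hpp is on the include path.
#include "models/dclm/configuration_dclm.hpp"

int main() {
    // cache_limit bounds the KV-cache length; 400 is an arbitrary example value.
    DCLMConfig cfg(/*token_limit=*/400, "1B", RoPEType::HFHUBROPE);
    // The 1B preset fixes dim = 2048, n_heads = 16, n_layers = 24, vocab_size = 50432.
    return cfg.dim == 2048 ? 0 : 1;
}
```
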
diff --git a/src/models/dclm/modeling_dclm.hpp b/src/models/dclm/modeling_dclm.hpp
new file mode 100644
index 00000000..82508199
--- /dev/null
+++ b/src/models/dclm/modeling_dclm.hpp
@@ -0,0 +1,186 @@
+/**
+ * @file modeling_dclm.hpp
+ * @author chenghua Wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-09-26
+ * @ref https://github.com/mlfoundations/open_lm/blob/main/open_lm/model.py
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#pragma once
+
+#include "Layer.hpp"
+#include "Module.hpp"
+#include "Types.hpp"
+#include "configuration_dclm.hpp"
+
+using namespace mllm;
+
+class DCLMFFN final : public Module {
+ Layer w12;
+ Layer w3;
+ Layer silu;
+ int hidden_dim_;
+
+public:
+ DCLMFFN() = default;
+ DCLMFFN(int in_dim, int hidden_dim, int out_dim, const std::string &base_name) {
+ hidden_dim_ = hidden_dim;
+ w12 = Linear(in_dim, 2 * hidden_dim, false, base_name + "w12");
+ w3 = Linear(hidden_dim, out_dim, false, base_name + "w3");
+ silu = SiLU(base_name + "silu");
+ }
+
+ std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
+ auto x = inputs[0];
+ x = w12(x);
+
+ auto x_sp = Tensor::split(x, {hidden_dim_, hidden_dim_}, DIMENSION);
+ Tensor gate;
+
+ gate = x_sp[0];
+ x = x_sp[1];
+
+ x = silu(gate) * x;
+
+ return {w3(x)};
+ }
+};
+
+class DCLMAttention final : public Module {
+ Layer in_proj;
+ Layer out_proj;
+ Layer q_norm;
+ Layer k_norm;
+ Layer q_rope;
+ Layer k_rope;
+ KVCache k_cache;
+ KVCache v_cache;
+ Layer softmax;
+
+ int attn_hidden_dim_;
+ int head_dim_;
+ int n_heads_;
+
+public:
+ DCLMAttention() = default;
+ DCLMAttention(const DCLMConfig &cfg, const std::string &base_name) {
+ int head_dim = cfg.dim / cfg.n_heads;
+ attn_hidden_dim_ = cfg.n_heads * head_dim;
+ head_dim_ = head_dim;
+ n_heads_ = cfg.n_heads;
+ in_proj = Linear(cfg.dim, 3 * cfg.n_heads * head_dim, false, base_name + "in_proj");
+ out_proj = Linear(cfg.n_heads * head_dim, cfg.dim, false, base_name + "out_proj");
+ q_norm = LayerNorm(cfg.n_heads * head_dim, false, cfg.norm_eps, base_name + "q_norm");
+ k_norm = LayerNorm(cfg.n_heads * head_dim, false, cfg.norm_eps, base_name + "k_norm");
+ q_rope = RoPE(cfg.RoPE_type, 10000, cfg.seq_len, base_name + "q_rope");
+ k_rope = RoPE(cfg.RoPE_type, 10000, cfg.seq_len, base_name + "k_rope");
+ k_cache = KVCache(1, cfg.cache_limit, base_name + "k_cache");
+ v_cache = KVCache(1, cfg.cache_limit, base_name + "v_cache");
+ softmax = Softmax(DIMENSION, true, base_name + "softmax");
+ }
+
+ std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
+ auto qkv = in_proj(inputs[0]);
+ auto qkv_sp = qkv.split({attn_hidden_dim_, attn_hidden_dim_, attn_hidden_dim_}, DIMENSION);
+
+ Tensor q, k, v;
+ q = qkv_sp[0];
+ k = qkv_sp[1];
+ v = qkv_sp[2];
+
+ q = q_norm(q);
+ k = k_norm(k);
+ q = q.view(-1, n_heads_, -1, head_dim_);
+ k = k.view(-1, n_heads_, -1, head_dim_);
+ v = v.view(-1, n_heads_, -1, head_dim_);
+
+ q = q_rope(q);
+ k = k_rope(k);
+
+ k = k_cache(k);
+ v = v_cache(v);
+
+ k = k.transpose(SEQUENCE, DIMENSION);
+ auto qk = Tensor::mm(q, k);
+ qk = qk / std::sqrt(head_dim_);
+
+ qk = softmax(qk);
+
+ auto o = Tensor::mm(qk, v);
+ o = o.view(-1, 1, -1, n_heads_ * head_dim_);
+ o = out_proj(o);
+ return {o};
+ }
+};
+
+class DCLMDecoder final : public Module {
+ int n_heads_;
+ int dim_;
+ int head_dim_;
+ int hidden_dim_;
+ DCLMFFN feed_forward;
+ Layer attention_norm;
+ Layer ffn_norm;
+
+ DCLMAttention attention;
+
+public:
+ DCLMDecoder() = default;
+ DCLMDecoder(const DCLMConfig &cfg, const std::string &base_name) {
+ n_heads_ = cfg.n_heads;
+ dim_ = cfg.dim;
+ head_dim_ = cfg.dim / cfg.n_heads;
+
+ attention = DCLMAttention(cfg, base_name + "attention.");
+ // swiglu_torch
+ hidden_dim_ = 256 * ((int(2 * 4 * cfg.dim / 3) + 256 - 1) / 256);
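+ // e.g. with cfg.dim = 2048: int(2 * 4 * 2048 / 3) = 5461, rounded up to the next multiple of 256 -> hidden_dim_ = 5632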
+ feed_forward = DCLMFFN(cfg.dim, hidden_dim_, cfg.dim, base_name + "feed_forward.");
+ // lp_layer_norm
+ // we do not use low precision here; it's basically a normal layernorm without bias
+ attention_norm = LayerNorm(cfg.dim, false, cfg.norm_eps, base_name + "attention_norm");
+ ffn_norm = LayerNorm(cfg.dim, false, cfg.norm_eps, base_name + "ffn_norm");
+ }
+
+ std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
+ auto x = inputs[0];
+ auto h = attention({attention_norm(x)})[0];
+ h = h + x;
+ auto ffn_out = feed_forward({ffn_norm(h)})[0];
+ auto out = h + ffn_out;
+ return {out};
+ }
+};
+
+class DCLM final : public Module {
+ std::string base_name_ = "model.";
+
+ Layer tok_embeddings;
+ std::vector<DCLMDecoder> layers;
+ Layer norm;
+ Parameter lm_head;
+
+public:
+ DCLM() = default;
+ DCLM(const DCLMConfig &cfg) {
+ tok_embeddings = Embedding(cfg.vocab_size, cfg.dim, base_name_ + "tok_embeddings");
+ layers = List<DCLMDecoder>(cfg.n_layers, cfg, base_name_ + "layers.");
+ norm = LayerNorm(cfg.dim, false, cfg.norm_eps, base_name_ + "norm");
+ lm_head = Parameter(1, cfg.vocab_size, 1, cfg.dim,
+ base_name_ + "output.weight");
+ }
+
+ std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
+ auto x = inputs[0];
+ x = tok_embeddings(x);
+
+ for (auto &layer : layers) {
+ x = layer({x})[0];
+ }
+
+ x = norm(x);
+ auto out = Tensor::mm(x, lm_head().transpose(Chl::SEQUENCE, Chl::DIMENSION));
+ return {out};
+ }
+};
diff --git a/src/models/dclm/tokenization_dclm.hpp b/src/models/dclm/tokenization_dclm.hpp
new file mode 100644
index 00000000..134d5633
--- /dev/null
+++ b/src/models/dclm/tokenization_dclm.hpp
@@ -0,0 +1,77 @@
+/**
+ * @file tokenization_dclm.hpp
+ * @author chenghua Wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-09-26
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#ifndef TOKENIZATION_DCLM_HPP
+#define TOKENIZATION_DCLM_HPP
+
+#include "tokenizers/BPE/Bpe.hpp"
+#include "tokenizers/Tokenizer.hpp"
+#include <fstream>
+
+using namespace mllm;
+
+using namespace mllm;
+
+class DCLMTokenizer final : public BPETokenizer {
+ std::unordered_map<std::string, unsigned int> merge_rank;
+
+public:
+ explicit DCLMTokenizer(const std::string &vocab_file, const std::string &merge_file) :
+ BPETokenizer(vocab_file) {
+ Module::initBackend(MLLM_CPU);
+ std::ifstream merge(merge_file);
+ std::string line;
+ unsigned rank = 0;
+ while (std::getline(merge, line)) {
+ if (line.empty() || line[0] == '#') {
+ continue;
+ }
+ merge_rank[line] = rank++;
+ }
+ BPETokenizer::setMergeRank(merge_rank);
+ BPETokenizer::setSpecialToken("<|endoftext|>", "<|endoftext|>", "<|endoftext|>");
+ }
+ Tensor tokenize(std::string &text) override {
+ text = Tokenizer::replaceString(text, ' ', "Ġ");
+ std::vector<token_id_t> tokens_id;
+ BPETokenizer::tokenize(text, tokens_id, false);
+ return BPETokenizer::tokens2Input(tokens_id);
+ }
+
+ std::string detokenize(const std::vector<token_id_t> &tokens) override {
+ return BPETokenizer::detokenize(tokens);
+ }
+
+ std::pair<std::string, unsigned> detokenize(Tensor &result) override {
+ assert(result.batch() == 1);
+ assert(result.head() == 1);
+ vector<float> scores;
+ for (int i = 0; i < result.dimension(); ++i) {
+ auto value = result.dataAt<float>(0, 0, result.sequence() - 1, i);
+ scores.push_back(value);
+ }
+ auto token_idx = this->argmax(scores);
+ return {BPETokenizer::detokenize({token_idx}), token_idx};
+ }
+
+ std::pair<bool, std::string> postprocess(std::string &text) override {
+ size_t pos = 0;
+ while ((pos = text.find("Ċ", pos)) != std::string::npos) {
+ text.replace(pos, 2, " ");
+ break;
+ }
+ pos = 0;
+ while ((pos = text.find("Ġ", pos)) != std::string::npos) {
+ text.replace(pos, 2, " ");
+ }
+ return {true, text};
+ }
+};
+
+#endif //! TOKENIZATION_DCLM_HPP
\ No newline at end of file
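
Taken together, the three new DCLM headers follow the usual mllm split of configuration, modeling, and tokenization. Below is a minimal single-step sketch of how they might be wired up; the file paths are placeholders, and the `Module::load` / call-operator usage is assumed to mirror mllm's other demos rather than being taken from this patch.

```cpp
// Sketch only: placeholder paths; API usage assumed to mirror mllm's other demo programs.
#include "models/dclm/configuration_dclm.hpp"
#include "models/dclm/modeling_dclm.hpp"
#include "models/dclm/tokenization_dclm.hpp"
#include <iostream>

int main() {
    auto tokenizer = DCLMTokenizer("dclm_vocab.mllm", "dclm_merges.txt"); // placeholder files
    DCLMConfig cfg(/*token_limit=*/400, "1B");
    auto model = DCLM(cfg);
    model.load("dclm-1b-fp32.mllm");                                      // placeholder file

    std::string prompt = "Machine learning is";
    auto input = tokenizer.tokenize(prompt);              // prompt -> token-id tensor
    auto logits = model({input})[0];                      // one forward pass
    auto [piece, token] = tokenizer.detokenize(logits);   // greedy pick of the next token
    auto [ok, text] = tokenizer.postprocess(piece);
    if (ok) std::cout << text << std::endl;
    return 0;
}
```
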
diff --git a/src/models/fuyu/processing_fuyu.hpp b/src/models/fuyu/processing_fuyu.hpp
index 0b5185dc..113d01f3 100644
--- a/src/models/fuyu/processing_fuyu.hpp
+++ b/src/models/fuyu/processing_fuyu.hpp
@@ -5,32 +5,180 @@
#ifndef TOKENIZATION_FUYU_HPP
#define TOKENIZATION_FUYU_HPP
-#include
-#include "processor/FuyuPreProcess.hpp"
+#include <cassert>
+#include "Tensor.hpp"
+#include <cstdint>
+// #include "processor/FuyuPreProcess.hpp"
+#include "processor/PreProcess.hpp"
+#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unigram/Unigram.hpp"
+#include <fstream>
+#include <iostream>
+#include <vector>
+#ifndef STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
+#define STB_IMAGE_IMPLEMENTATION
+#endif
+#include "stb/stb_image.h"
+#include "stb/stb_image_resize2.h"
+
using namespace mllm;
-class FuyuProcessor final {
+class FuyuProcessor final : public PreProcessor {
UnigramTokenizer *tokenizer;
- FuyuPreProcess *preprocessor;
-
- unsigned int argmax(const std::vector<float> &scores) {
- if (scores.empty()) {
- throw std::invalid_argument("Input vector is empty");
- }
- unsigned int maxIndex = 0;
- float maxValue = scores[0];
- for (size_t i = 1; i < scores.size(); ++i) {
- if (scores[i] > maxValue) {
- maxIndex = i;
- maxValue = scores[i];
+ vector<vector<int>> image_patch_indices_per_batch;
+ vector<vector<int>> image_patch_indices_per_subseq;
+ vector<vector<int>> image_patch_input_indices_;
+ vector<size_t> text_lengths_;
+ vector<vector<token_id_t>> image_input_ids_;
+ vector<vector<int>> attention_mask_;
+ vector<vector<int>> image_patches_indices_;
+ vector<vector<vector<float>>> image_patches_;
+ vector<vector<token_id_t>> text_ids_;
+ std::vector<ImageInfo> images_;
+
+ size_t max_tokens_to_generate; // init
+ token_id_t image_placeholder_id_; // init
+ token_id_t image_newline_id_; // init
+ std::pair<size_t, size_t> patch_size_; // init
+ size_t max_position_embeddings = 16384;
+ token_id_t pad_token_id = 0;
+
+ void get_sample_encoding(const std::string &text) {
+ image_input_ids_.resize(images_.size());
+ image_patches_.resize(images_.size());
+ image_patch_indices_per_batch.resize(images_.size());
+ image_patch_indices_per_subseq.resize(images_.size());
+ auto num_index = 0;
+ for (int i = 0; i < images_.size(); i++) {
+ auto image = images_[i];
+ auto height = image.height;
+ auto width = image.width;
+ auto num_patches_per_dim_h = height / patch_size_.first;
+ auto num_patches_per_dim_w = width / patch_size_.second;
+ auto num_patches = num_patches_per_dim_h * num_patches_per_dim_w;
+ auto tensor_of_image_ids = vector<vector<token_id_t>>(num_patches_per_dim_h, vector<token_id_t>(num_patches_per_dim_w, image_placeholder_id_));
+ auto &image_input_id = image_input_ids_[i];
+ auto &image_patch_indices_per_batch_ = image_patch_indices_per_batch[i];
+ auto &image_patch_indices_per_subseq_ = image_patch_indices_per_subseq[i];
+ for (int h = 0; h < num_patches_per_dim_h; h++) {
+ for (int w = 0; w < num_patches_per_dim_w; w++) {
+ auto patch_index = h * num_patches_per_dim_w + w;
+ image_patch_indices_per_batch_.push_back(patch_index + num_index);
+ image_patch_indices_per_subseq_.push_back(patch_index);
+ }
+ image_patch_indices_per_batch_.emplace_back(-1);
+ image_patch_indices_per_subseq_.emplace_back(-1);
+ }
+ num_index += num_patches;
+
+ for (auto &row : tensor_of_image_ids) {
+ row.push_back(image_newline_id_);
+ image_input_id.insert(image_input_id.end(), row.begin(), row.end());
+ }
+ image_patches_[i] = PatchImages(image, patch_size_.first, patch_size_.second);
+ }
+ tokenizer_->setSpecialToken("");
+
+ auto text_ = Tokenizer::replaceString(text, ' ', "▁");
+ text_ = "▁" + text_;
+ auto text_ids = vector<token_id_t>();
+ tokenizer_->tokenize(text_, text_ids, true);
+ token_id_t answer_start_token = 0;
+ if (tokenizer_->getTokenId("<0x04>", answer_start_token)) {
+ text_ids.push_back(answer_start_token);
+ } else {
+ std::cerr << "ANSWER_START token not found in vocab file." << std::endl;
+ }
+ text_ids_.push_back(text_ids);
+ text_lengths_.push_back(text_ids.size());
+ // construct_full_unpacked_stream
+ auto image_padded_unpacked_tokens = vector<vector<token_id_t>>(images_.size(), vector<token_id_t>());
+ auto unpacked_image_patch_indices_per_batch = vector<vector<int>>(images_.size(), vector<int>());
+ size_t max_prompt_length = 0;
+ for (int i = 0; i < images_.size(); i++) {
+ auto &image_padded_unpacked_token = image_padded_unpacked_tokens[i];
+ auto &unpacked_image_patch_indice_per_batch = unpacked_image_patch_indices_per_batch[i];
+ // TODO:
+ auto text_length = text_lengths_[0];
+ auto image_token_length = image_input_ids_[i].size();
+ if (text_lengths_.size() > 1) {
+ text_length = text_lengths_[i];
+ }
+ auto size_ = image_token_length + text_length;
+ image_padded_unpacked_token.insert(image_padded_unpacked_token.begin(), image_input_ids_[i].begin(), image_input_ids_[i].end());
+ image_padded_unpacked_token.insert(image_padded_unpacked_token.end(), text_ids_[0].begin(), text_ids_[0].end());
+ unpacked_image_patch_indice_per_batch.insert(unpacked_image_patch_indice_per_batch.begin(), image_patch_indices_per_batch[i].begin(), image_patch_indices_per_batch[i].end());
+ unpacked_image_patch_indice_per_batch.insert(unpacked_image_patch_indice_per_batch.end(), text_ids_[0].size(), -1);
+ if (size_ > max_prompt_length) {
+ max_prompt_length = size_;
}
+ //
}
- return maxIndex;
+ size_t max_seq_len_batch = std::min(max_prompt_length + max_tokens_to_generate, max_position_embeddings);
+ auto tokens_to_place = std::min(max_seq_len_batch, max_prompt_length);
+ // full_unpacked_stream_to_tensor
+ image_patch_input_indices_.resize(images_.size());
+ for (int i = 0; i < image_patch_input_indices_.size(); i++) {
+ auto &image_patch_input_indice = image_patch_input_indices_[i];
+ image_patch_input_indice.insert(image_patch_input_indice.begin(), unpacked_image_patch_indices_per_batch[i].begin(), unpacked_image_patch_indices_per_batch[i].begin() + tokens_to_place);
+ image_patch_input_indice.insert(image_patch_input_indice.end(), max_seq_len_batch - tokens_to_place, -1);
+ }
+ image_input_ids_.clear();
+ image_input_ids_.resize(images_.size());
+ //_left_pad_inputs_with_attention_mask
+ attention_mask_.resize(images_.size());
+ image_patches_indices_.resize(images_.size());
+ for (int i = 0; i < images_.size(); i++) {
+ auto &attention_mask = attention_mask_[i];
+ auto &input_id = image_input_ids_[i];
+ auto num_padding_tokens = max_prompt_length - image_padded_unpacked_tokens[i].size();
+ input_id.insert(input_id.end(), num_padding_tokens, pad_token_id);
+ input_id.insert(input_id.end(), image_padded_unpacked_tokens[i].begin(), image_padded_unpacked_tokens[i].end());
+ attention_mask.insert(attention_mask.end(), num_padding_tokens, 0);
+ attention_mask.insert(attention_mask.end(), image_padded_unpacked_tokens[i].size(), 1);
+
+ // For the image patches indices, we need to add the padding tokens as well.
+ auto &image_patch_input_indice = image_patches_indices_[i];
+ auto &image_patch_input_indice_per_batch = image_patch_input_indices_[i];
+ auto num_padding_indices = max_seq_len_batch - image_patch_input_indice_per_batch.size();
+ image_patch_input_indice.insert(image_patch_input_indice.end(), num_padding_indices, -1);
+ image_patch_input_indice.insert(image_patch_input_indice.end(), image_patch_input_indice_per_batch.begin(), image_patch_input_indice_per_batch.end());
+ }
+ }
+
+ std::vector<vector<float>> PatchImages(ImageInfo &images, size_t patch_height, size_t patch_width) {
+ auto image_0 = images;
+ auto height = image_0.height;
+ auto width = image_0.width;
+ auto channels = image_0.channels;
+ auto square = width * height;
+ auto dim2 = square / patch_height / patch_width;
+ auto dim_2_1 = width / patch_width;
+ auto dim2_2 = height / patch_height;
+ auto stride2 = patch_height * width;
+ auto stride1 = patch_width;
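+ // dim2 = total number of patches, dim_2_1 = patches per row, dim2_2 = patches per column;
+ // stride2 jumps one patch-row down in the flattened per-channel image, stride1 jumps one patch to the right.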
+ auto patches = vector<vector<float>>(dim2, vector<float>());
+ for (int i = 0; i < dim2_2; i++) {
+ for (int j = 0; j < dim_2_1; j++) {
+ auto &patch = patches[i * dim_2_1 + j];
+ auto const index_first_element_of_line = i * stride2 + j * stride1;
+ while (patch.size() < patch_height * patch_width * channels) {
+ for (int h = 0; h < patch_height; h++) {
+ for (int w = 0; w < patch_width; w++) {
+ for (int c = 0; c < channels; c++) {
+ patch.push_back(images.get_whc_pixel(index_first_element_of_line + h * width + w + c * square));
+ }
+ }
+ }
+ }
+ }
+ }
+ return patches;
}
- static Tensor patches2Tensor(vector<vector<vector<float>>> image_patches, string name = "input", BackendType type = MLLM_CPU) {
+ Tensor vector3d2Tensor(vector<vector<vector<float>>> image_patches, string name = "input", BackendType type = MLLM_CPU) {
int batch = 0;
int seq = 0;
int dims = 0;
@@ -53,7 +201,7 @@ class FuyuProcessor final {
return tensor1;
}
- static Tensor patchIdx2Tensor(vector<vector<int>> image_patches_indices, string name = "input", BackendType type = MLLM_CPU) {
+ Tensor vector2d2Tensor(vector<vector<int>> image_patches_indices, string name = "input", BackendType type = MLLM_CPU) {
int batch = 0;
int seq = 0;
if (!image_patches_indices.empty()) {
@@ -73,53 +221,128 @@ class FuyuProcessor final {
}
public:
- explicit FuyuProcessor(const std::string &vocab_file) {
+ explicit FuyuProcessor(const std::string &vocab_file) :
+ PreProcessor(1080, 1920, true, true, true, true, {0.5}, {0.5}) {
Module::initBackend(MLLM_CPU);
- tokenizer = new UnigramTokenizer(vocab_file);
+ tokenizer_ = new UnigramTokenizer(vocab_file);
+ auto tmp_token = vector<token_id_t>();
+ tokenizer_->tokenize("|SPEAKER|", tmp_token, false);
+ image_placeholder_id_ = tmp_token[0];
+ tmp_token.clear();
+ tokenizer_->tokenize("|NEWLINE|", tmp_token, false);
+ image_newline_id_ = tmp_token[0];
+ patch_size_ = std::make_pair(30, 30);
+ max_tokens_to_generate = 20;
}
- std::array<Tensor, 3> process(std::string &text, vector<string> image) {
- preprocessor = new FuyuPreProcess(tokenizer);
- preprocessor->images_.clear();
- preprocessor->image_input_ids_.clear();
- preprocessor->image_patches_indices_.clear();
- preprocessor->image_patches_.clear();
- preprocessor->PreProcessImages(image);
- preprocessor->Process(text);
- auto input_ids = preprocessor->image_input_ids_;
- auto image_patches_indices = preprocessor->image_patches_indices_;
- auto image_patches = preprocessor->image_patches_;
- if (input_ids.empty()) {
- input_ids = preprocessor->text_ids_;
+ void PreProcessImages(const std::vector<uint8_t *> &images, const std::vector<size_t> &image_length) override {
+ assert(height_ > 0 && width_ > 0);
+
+ for (int i = 0; i < images.size(); i++) {
+ auto image = images[i];
+ int width_, height_, channels_;
+ // Data layout is [height * width * channels], RGB
+ const unsigned char *data = stbi_load_from_memory(image, image_length[i], &width_, &height_, &channels_, 3);
+ if (data == nullptr) {
+ std::cerr << "load image failed" << std::endl;
+ exit(-1);
+ }
+ auto float_data = RescaleImage(data, 255.0, width_ * height_ * channels_);
+ images_.emplace_back(float_data, width_, height_, channels_);
+ }
+ auto image_patches = std::vector<vector<vector<vector<vector<float>>>>>();
+ if (do_resize_) {
+ images_ = ResizeImages(images_, height_, width_);
+ }
+
+ if (do_pad_) {
+ images_ = PadImages(images_, height_, width_, patch_size_.second, patch_size_.first);
+ }
+ if (do_normalize_) {
+ if (mean_.size() != std_.size() || mean_.size() != 1 && mean_.size() != 3) {
+ std::cerr << "MEAN should be of same size of std and length should be (1 or 3) !" << std::endl;
+ exit(-1);
+ }
+ if (mean_.size() == 1) {
+ mean_.resize(3, mean_[0]);
+ }
+ if (std_.size() == 1) {
+ std_.resize(3, std_[0]);
+ }
+ images_ = NormalizeImages(images_, mean_, std_);
}
- std::array<Tensor, 3> result = {UnigramTokenizer::tokens2Input(input_ids[0], "input_ids"),
- patches2Tensor(image_patches, "image_patches"),
- patchIdx2Tensor(image_patches_indices, "image_patches_indices")};
- return result;
}
- Tensor tokenize(std::string &text) const {
- if (text[0] != ' ') {
- text = ' ' + text;
+ void PreProcessImages(const std::vector &images_path) override {
+ assert(height_ > 0 && width_ > 0);
+ auto image_data = std::vector<uint8_t *>();
+ auto image_length = std::vector<size_t>();
+ for (const auto &i : images_path) {
+ // read all file contents
+ std::ifstream file(i, std::ios::binary | std::ios::ate);
+ if (!file.is_open()) {
+ std::cerr << "Cannot open file: " << i << std::endl;
+ exit(-1);
+ }
+ auto size = file.tellg();
+ auto data = new uint8_t[size];
+ file.seekg(0, std::ios::beg);
+ file.read(reinterpret_cast<char *>(data), size);
+ file.close();
+ image_data.emplace_back(data);
+ image_length.emplace_back(size);
}
- auto tokens_id = vector<token_id_t>();
- tokenizer->tokenize(text, tokens_id, true);
- return UnigramTokenizer::tokens2Input(tokens_id);
+ PreProcessImages(image_data, image_length);
}
- std::string detokenize(const std::vector<token_id_t> &tokens) {
- return tokenizer->detokenize(tokens);
+ void Process(const std::string &text) override {
+ if (text.empty()) {
+ return;
+ }
+ if (images_.empty()) {
+ std::cout << "images is empty" << std::endl;
+ }
+ // auto batch_size = images_.size();
+ get_sample_encoding(text);
+ }
+
+ vector<Tensor> process(std::string &text, vector<string> image) {
+ image_patch_indices_per_batch.clear();
+ image_patch_indices_per_subseq.clear();
+ image_patch_input_indices_.clear();
+ text_lengths_.clear();
+ image_input_ids_.clear();
+ attention_mask_.clear();
+ image_patches_indices_.clear();
+ image_patches_.clear();
+ text_ids_.clear();
+ images_.clear();
+
+ PreProcessImages(image);
+ Process(text);
+ auto input_ids = image_input_ids_;
+ if (input_ids.empty()) {
+ input_ids = text_ids_;
+ }
+ vector<Tensor> result = {mllm::Tokenizer::tokens2Input(input_ids[0], "input_ids"),
+ vector3d2Tensor(image_patches_, "image_patches"),
+ vector2d2Tensor(image_patches_indices_, "image_patches_indices")};
+ return result;
}
std::pair<std::string, unsigned> detokenize(Tensor &result) {
- assert(result.batch() == 1);
- assert(result.head() == 1);
- vector<float> scores;
- for (int i = 0; i < result.dimension(); ++i) {
- auto value = result.dataAt<float>(0, 0, result.sequence() - 1, i);
- scores.push_back(value);
- }
- auto token_idx = this->argmax(scores);
- return {tokenizer->detokenize({token_idx}), token_idx};
+ return tokenizer_->detokenize(result);
+ }
+
+ std::pair<bool, std::string> postprocess(std::string &text) {
+ size_t pos = 0;
+ std::string from = "▁";
+ std::string to = " ";
+ while ((pos = text.find(from, pos)) != std::string::npos) {
+ text.replace(pos, from.length(), to);
+ pos += to.length();
+ }
+ if (text == "|ENDOFTEXT|") return {false, ""};
+ return {true, text};
}
};
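
As a usage note (not part of the patch): the reworked `FuyuProcessor` now produces, directly from a prompt and an image path, the three tensors the Fuyu model expects. A minimal sketch follows; the vocab and image file names are placeholders, and the model invocation is shown only as a comment since the Fuyu `Module` is defined elsewhere in mllm.

```cpp
// Sketch only: placeholder paths; the Fuyu Module itself lives in modeling_fuyu.hpp.
#include "models/fuyu/processing_fuyu.hpp"
#include <string>

int main() {
    auto processor = FuyuProcessor("fuyu_vocab.mllm");     // placeholder vocab file
    std::string prompt = "Generate a coco-style caption.";
    auto inputs = processor.process(prompt, {"bus.png"});  // placeholder image path
    // inputs[0] = input_ids, inputs[1] = image_patches, inputs[2] = image_patches_indices;
    // a Fuyu Module would then be called as model({inputs[0], inputs[1], inputs[2]}).
    return inputs.size() == 3 ? 0 : 1;
}
```
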
diff --git a/src/models/gemma/tokenization_gemma.hpp b/src/models/gemma/tokenization_gemma.hpp
index 23b612b1..4f456978 100644
--- a/src/models/gemma/tokenization_gemma.hpp
+++ b/src/models/gemma/tokenization_gemma.hpp
@@ -15,40 +15,35 @@
#define TOKENIZATION_GEMMA_HPP
#include "tokenizers/BPE/Bpe.hpp"
-#include
#include
using namespace mllm;
-class GemmaTokenizer final {
+class GemmaTokenizer final : public BPETokenizer {
public:
- explicit GemmaTokenizer(const std::string &vocab_file) {
+ explicit GemmaTokenizer(const std::string &vocab_file) :
+ BPETokenizer(vocab_file) {
Module::initBackend(MLLM_CPU);
- tokenizer = new BPETokenizer(vocab_file);
}
- ~GemmaTokenizer() {
- delete tokenizer;
- }
-
- Tensor tokenize(std::string &text, int str_i = 0) const {
+ Tensor tokenize(std::string &text) override {
// replace all blanks with '▁'
std::string new_text = BPETokenizer::replaceString(text, ' ', "▁");
// Returns a tokenized string. The Gemma tokenizer never adds a prefix space
auto tokens_id = vector<token_id_t>();
- tokenizer->tokenize(new_text, tokens_id, false);
+ BPETokenizer::tokenize(new_text, tokens_id, false);
// insert
tokens_id.insert(tokens_id.begin(), bos_id);
return BPETokenizer::tokens2Input(tokens_id);
}
- std::string detokenize(const std::vector<token_id_t> &tokens) {
- return tokenizer->detokenize(tokens);
+ std::string detokenize(const std::vector<token_id_t> &tokens) override {
+ return BPETokenizer::detokenize(tokens);
}
- std::pair<std::string, unsigned> detokenize(Tensor &result) {
+ std::pair<std::string, unsigned> detokenize(Tensor &result) override {
assert(result.batch() == 1 && "Batch size of result is not 1. Which is not supported for now.");
assert(result.head() == 1 && "The 3rd dim of result should be one. e.g.:[1, 1, seq, hidden]");
std::vector<float> scores;
@@ -59,21 +54,15 @@ class GemmaTokenizer final {
scores.push_back(value);
}
auto token_idx = this->argmax(scores);
- auto text = tokenizer->detokenize({token_idx});
+ auto text = BPETokenizer::detokenize({token_idx});
text = std::regex_replace(text, std::regex("▁"), " ");
return make_pair(text, token_idx);
}
-
-private:
- unsigned int argmax(const std::vector<float> &scores) {
- if (scores.empty()) {
- throw std::invalid_argument("Input vector is empty");
- }
- return std::max_element(scores.begin(), scores.end()) - scores.begin();
+ std::pair<bool, std::string> postprocess(std::string &text) override {
+ if (text.empty()) return {false, ""};
+ return {true, text};
}
- BPETokenizer *tokenizer;
-
public:
token_id_t pad_id = 0, eos_id = 1, bos_id = 2, unk_id = 3;
};
diff --git a/src/models/imagebind/processing_imagebind.hpp b/src/models/imagebind/processing_imagebind.hpp
index 4a127c71..d2ba8bcc 100644
--- a/src/models/imagebind/processing_imagebind.hpp
+++ b/src/models/imagebind/processing_imagebind.hpp
@@ -6,14 +6,12 @@
#define PROCESSING_IMAGEBIND_HPP
#include
-#include "processor/ClipPreProcess.hpp"
#include "tokenizers/BPE/Bpe.hpp"
+#include "models/clip/processing_clip.hpp"
using namespace mllm;
-class ImagebindProcessor final {
- BPETokenizer *tokenizer;
- ClipPreProcessor *clip_processor;
+class ImagebindProcessor final : public ClipProcessor {
static Tensor tokens2Input(vector<vector<token_id_t>> tokens, int max_pos, string name = "input", BackendType type = MLLM_CPU) {
const auto bsize = static_cast(tokens.size());
Tensor tensor1(bsize, 1, max_pos, 1, Backend::global_backends[type], true);
@@ -22,9 +20,9 @@ class ImagebindProcessor final {
tensor1.setTtype(INPUT_TENSOR);
for (int b = 0; b < bsize; ++b) {
for (int idx = 0; idx < max_pos; ++idx) {
- if(idx < tokens[b].size()) {
+ if (idx < tokens[b].size()) {
tensor1.setDataAt(b, 0, idx, 0, tokens[b][idx]);
- }else{
+ } else {
tensor1.setDataAt(b, 0, idx, 0, 0);
}
}
@@ -82,45 +80,28 @@ class ImagebindProcessor final {
return tensor1;
}
- std::string toLowercase(const std::string& input) {
+ std::string toLowercase(const std::string &input) {
std::string output = input;
std::transform(output.begin(), output.end(), output.begin(),
- [](unsigned char c){ return std::tolower(c); });
+ [](unsigned char c) { return std::tolower(c); });
return output;
}
public:
- explicit ImagebindProcessor(const string &vocab_path, const string &merges_path) {
+ explicit ImagebindProcessor(const string &vocab_path, const string &merges_path) :
+ ClipProcessor(vocab_path, merges_path) {
Module::initBackend(MLLM_CPU);
- tokenizer = new BPETokenizer(vocab_path);
- std::unordered_map<std::string, unsigned int> merge_rank;
- auto merge_file = std::ifstream(merges_path);
- std::string line;
- unsigned rank = 0;
- while (std::getline(merge_file, line)) {
- if (line.empty()) {
- continue;
- }
- if (line[0] == '#') {
- continue;
- }
- merge_rank[line] = rank;
- rank++;
- }
- tokenizer->setMergeRank(merge_rank);
- tokenizer->setSpecialToken("<|startoftext|>", "<|endoftext|>");
- clip_processor = new ClipPreProcessor(tokenizer);
}
- struct imagebind_out{
+ struct imagebind_out {
Tensor text_tensors;
Tensor img_tensors;
Tensor audio_tensors;
vector<int> in_len;
};
imagebind_out process(vector<string> in_strs, int max_pos, vector<string> img_path, int hw, vector<string> wav_path,
- string text_name = "input_text", string img_name = "input_vision", string wav_name = "input_audio",
- BackendType type = MLLM_CPU) {
+ string text_name = "input_text", string img_name = "input_vision", string wav_name = "input_audio",
+ BackendType type = MLLM_CPU) {
auto tokens_ids = vector<vector<token_id_t>>();
for (auto in_str : in_strs) {
in_str = toLowercase(in_str);
@@ -128,29 +109,29 @@ class ImagebindProcessor final {
tokenizer->tokenize(in_str, tokens_id, true, true, "");
tokens_ids.push_back(tokens_id);
}
- vector<int> input_text_lens ={};
+ vector<int> input_text_lens = {};
for (auto tokens_id : tokens_ids) {
input_text_lens.push_back(tokens_id.size() - 1);
}
- clip_processor->PreProcessImages(img_path, hw, hw);
- auto images = clip_processor->pixel_values_;
+ PreProcessImages(img_path, hw, hw);
+ auto images = pixel_values_;
auto audios = PreProcessor::ProcessAudio(std::move(wav_path));
return {tokens2Input(tokens_ids, max_pos, std::move(text_name)),
- img2Tensor(images, std::move(img_name)),
- audio2Tensor(audios, std::move(wav_name)), input_text_lens};
+ img2Tensor(images, std::move(img_name)),
+ audio2Tensor(audios, std::move(wav_name)), input_text_lens};
}
- void showResult(Tensor& tensor){
+ void showResult(Tensor &tensor) {
// std::cout<<"vision X text :"<<std::endl;
for (int s = 0; s < tensor.sequence(); ++s) {
for (int d = 0; d < tensor.dimension(); ++d) {
- std::cout<<tensor.dataAt<float>(0, 0, s, d)<<" ";
+ std::cout << tensor.dataAt<float>(0, 0, s, d) << " ";
}
- std::cout<<std::endl;
+ std::cout << std::endl;
}
}
void showResult(vector<Tensor> tensors) {
vector<string> shows = {"vision X text :", "vision X audio :"};
for (int i = 0; i < tensors.size(); ++i) {
- std::cout<<shows[i]<<std::endl;
+ std::cout << shows[i] << std::endl;
showResult(tensors[i]);
}
}
};
diff --git a/src/models/llama/tokenization_llama.hpp b/src/models/llama/tokenization_llama.hpp
--- a/src/models/llama/tokenization_llama.hpp
+++ b/src/models/llama/tokenization_llama.hpp
#include "tokenizers/BPE/Bpe.hpp"
using namespace mllm;
using namespace mllm;
+class LLaMATokenizer final : public BPETokenizer {
+ bool bos_ = true;
-class LLaMATokenizer final {
- BPETokenizer* tokenizer;
-
-
- unsigned int argmax(const std::vector<float> &scores) {
- if (scores.empty()) {
- throw std::invalid_argument("Input vector is empty");
- }
- return std::max_element(scores.begin(), scores.end()) - scores.begin();
- }
- bool bos_=true;
public:
- explicit LLaMATokenizer(const std::string &vocab_file, bool bos=true) {
+ explicit LLaMATokenizer(const std::string &vocab_file, bool bos = true) :
+ BPETokenizer(vocab_file) {
Module::initBackend(MLLM_CPU);
- tokenizer = new BPETokenizer(vocab_file);
bos_ = bos;
+ chat_template_pre = "[INST] ";
+ chat_template_end = " [/INST]";
}
- Tensor tokenize(std::string &text, int str_i = 0) const {
- if (text[0] != ' ') {
- text = ' ' + text;
- }
+ Tensor tokenize(std::string &text) override {
auto tokens_id = vector<token_id_t>();
- tokenizer->tokenize(text, tokens_id, bos_);
- if (str_i > 0){
- tokens_id[0] = 13;
- }
- return BPETokenizer::tokens2Input(tokens_id);
+ BPETokenizer::tokenize(text, tokens_id, bos_);
+ return tokens2Input(tokens_id);
}
- std::string detokenize(const std::vector<token_id_t> &tokens) {
- return tokenizer->detokenize(tokens);
+ std::string detokenize(const std::vector<token_id_t> &tokens) override {
+ return BPETokenizer::detokenize(tokens);
}
-
- std::pair<std::string, unsigned> detokenize(Tensor& result) {
+ std::pair<std::string, unsigned> detokenize(Tensor &result) override {
assert(result.batch() == 1);
assert(result.head() == 1);
vector<float> scores;
@@ -51,12 +38,15 @@ class LLaMATokenizer final {
auto value = result.dataAt<float>(0, 0, result.sequence() - 1, i);
scores.push_back(value);
}
- auto token_idx = this->argmax(scores);
- return {tokenizer->detokenize({token_idx}), token_idx};
+ auto token_idx = argmax(scores);
+ return {BPETokenizer::detokenize({token_idx}), token_idx};
+ }
+ std::pair