
feat: Support Qwen2.5-1.5B, OpenELM-1.1B, DCLM-1B #160

Merged · 14 commits · Oct 18, 2024
README.md: 16 changes (9 additions, 7 deletions)
@@ -83,7 +83,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen

| Model | CPU <br> FP32 | CPU <br> INT4 | Hexagon NPU <br> INT8 |
|-----------------------------------------------------------------------------|------|-----|----------------------------|
-| [LLaMA-1/2 7B](https://github.com/facebookresearch/llama) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | |
+| [LLaMA 2 7B](https://github.com/facebookresearch/llama) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main) | |
| [Alpaca 7B](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) | [✔️](https://huggingface.co/mllmTeam/chinese-alpaca-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/chinese-alpaca-7b-mllm/tree/main) | |
| [TinyLLaMA 1.1B](https://github.com/jzhang38/TinyLlama) | [✔️](https://huggingface.co/mllmTeam/tinyllama-1.1b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/tinyllama-1.1b-mllm/tree/main) | |
| [Fuyu 8B](https://www.adept.ai/blog/fuyu-8b) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) | |
@@ -92,16 +92,18 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
| [ImageBind](https://github.com/facebookresearch/ImageBind) (3 modalities) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) | |
| [LLaVA 7B](https://github.com/haotian-liu/LLaVA) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | |
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | |
-| [Qwen 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | |
-| [Qwen 1.8B Chat](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) |
+| [Qwen 1.5 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | |
+| [Qwen 1.5 1.8B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) |
+| [Qwen 2.5 1.5B](https://github.com/QwenLM/Qwen2.5) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | |
-| [StableLM 1.6B](https://github.com/Stability-AI/StableLM) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | |
+| [StableLM 2 1.6B](https://github.com/Stability-AI/StableLM) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | |
| [OPT 1.3B](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | |
-| [Phi-3-mini 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | |
+| [Phi 3 mini 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | |
| [MiniCPM 2B](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | |
| [SmolLM 1.7B](https://huggingface.co/HuggingFaceTB/SmolLM-1.7B-Instruct) | [✔️](https://huggingface.co/mllmTeam/smollm-1.7b-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/smollm-1.7b-instruct-mllm/tree/main) | |
+| [DCLM 1B](https://huggingface.co/TRI-ML/DCLM-1B) | [✔️](https://huggingface.co/mllmTeam/dclm-1b-mllm/tree/main)| [✔️](https://huggingface.co/mllmTeam/dclm-1b-mllm/tree/main)| |
+| [OpenELM 1.1B](https://github.com/apple/corenet/tree/main/projects/openelm) | [✔️](https://huggingface.co/mllmTeam/openelm-1.1b-mllm/tree/main)| [✔️](https://huggingface.co/mllmTeam/openelm-1.1b-mllm/tree/main)| |
## Quick Start

### Get the Code
@@ -295,7 +297,7 @@ cd ./bin

```bash
cd ./bin
-./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama_vocab.mllm
+./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama2_vocab.mllm
```
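
Note the renamed LLaMA-2 vocab file (llama_vocab.mllm becomes llama2_vocab.mllm). The demos added in this PR are run the same way; each binary's default paths are set in its own cmdline parser. A sketch (the demo_dclm defaults come from this PR's demo_dclm.cpp; the demo_openelm line is an assumed analogue):

```bash
cd ./bin
# Defaults from demo_dclm.cpp: ../models/dclm-1b-fp32.mllm, ../vocab/dclm_vocab.mllm, ../vocab/dclm_merges.txt
./demo_dclm
# Assumed analogous; check examples/demo_openelm.cpp for its actual default paths.
./demo_openelm
```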


examples/CMakeLists.txt: 70 changes (45 additions, 25 deletions)
@@ -528,8 +528,8 @@ else ()
    target_link_libraries(demo_smollm PUBLIC MLLM_CPU -fopenmp)
endif ()

-# add_executable(demo_openelm
-#         ${PROJECT_SOURCE_DIR}/examples/demo_openelm.cpp
+# add_executable(demo_phonellm
+#         ${PROJECT_SOURCE_DIR}/examples/demo_phonellm.cpp
#         ${DIR_SRC_CPU}
#         ${DIR_SRC_MEM_MANAGER}
#         ${DIR_SRC_EXP}
@@ -541,32 +541,52 @@ endif ()
#         ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
# )
# if (MLLM_OPENMP_STATIC)
-#     target_compile_options(demo_openelm PRIVATE -fopenmp)
-#     target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+#     target_compile_options(demo_phonellm PRIVATE -fopenmp)
+#     target_link_libraries(demo_phonellm PUBLIC MLLM_CPU -fopenmp -static-openmp)
# else ()
-#     target_compile_options(demo_openelm PRIVATE -fopenmp)
-#     target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp)
+#     target_compile_options(demo_phonellm PRIVATE -fopenmp)
+#     target_link_libraries(demo_phonellm PUBLIC MLLM_CPU -fopenmp)
# endif ()

-# add_executable(demo_dclm
-#         ${PROJECT_SOURCE_DIR}/examples/demo_dclm.cpp
-#         ${DIR_SRC_CPU}
-#         ${DIR_SRC_MEM_MANAGER}
-#         ${DIR_SRC_EXP}
-#         ${DIR_SRC}
-#         ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
-#         ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
-#         ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
-#         ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
-#         ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
-# )
-# if (MLLM_OPENMP_STATIC)
-#     target_compile_options(demo_dclm PRIVATE -fopenmp)
-#     target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp -static-openmp)
-# else ()
-#     target_compile_options(demo_dclm PRIVATE -fopenmp)
-#     target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp)
-# endif ()
+add_executable(demo_openelm
+        ${PROJECT_SOURCE_DIR}/examples/demo_openelm.cpp
+        ${DIR_SRC_CPU}
+        ${DIR_SRC_MEM_MANAGER}
+        ${DIR_SRC_EXP}
+        ${DIR_SRC}
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
+        ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
+)
+if (MLLM_OPENMP_STATIC)
+    target_compile_options(demo_openelm PRIVATE -fopenmp)
+    target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+else ()
+    target_compile_options(demo_openelm PRIVATE -fopenmp)
+    target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp)
+endif ()
+
+add_executable(demo_dclm
+        ${PROJECT_SOURCE_DIR}/examples/demo_dclm.cpp
+        ${DIR_SRC_CPU}
+        ${DIR_SRC_MEM_MANAGER}
+        ${DIR_SRC_EXP}
+        ${DIR_SRC}
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
+        ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
+)
+if (MLLM_OPENMP_STATIC)
+    target_compile_options(demo_dclm PRIVATE -fopenmp)
+    target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp -static-openmp)
+else ()
+    target_compile_options(demo_dclm PRIVATE -fopenmp)
+    target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp)
+endif ()

add_executable(benchmark_llm
        ${PROJECT_SOURCE_DIR}/examples/benchmark.cpp
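With the two targets uncommented, they build like any other demo in this directory. A minimal sketch of the build, assuming the repository's usual out-of-tree CMake flow:

```bash
# From the repository root; standard out-of-tree CMake build (assumed flow).
mkdir -p build && cd build
cmake ..                                # configure as usual for a host build
make demo_openelm demo_dclm -j$(nproc)  # build just the two new demo targets
```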
examples/demo_clip.cpp: 1 change (0 additions, 1 deletion)
@@ -1,5 +1,4 @@
#include <iostream>
-#include <utility>
#include "cmdline.h"
#include "models/clip/modeling_clip.hpp"
#include "models/clip/processing_clip.hpp"
examples/demo_dclm.cpp: 59 changes (59 additions, 0 deletions, new file)
@@ -0,0 +1,59 @@
/**
 * @file demo_dclm.cpp
 * @author chenghua Wang ([email protected])
 * @version 0.1
 * @date 2024-09-26
 *
 * @copyright Copyright (c) 2024
 *
 */
#include "cmdline.h"
#include "models/dclm/configuration_dclm.hpp"
#include "models/dclm/modeling_dclm.hpp"
#include "models/dclm/tokenization_dclm.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
    std::iostream::sync_with_stdio(false);

    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/dclm_vocab.mllm");
    cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/dclm_merges.txt");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/dclm-1b-fp32.mllm");
    cmdParser.add<string>("billion", 'b', "[1B]", false, "1B");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);

    string vocab_path = cmdParser.get<string>("vocab");
    string model_path = cmdParser.get<string>("model");
    string merge_path = cmdParser.get<string>("merge");
    string model_billion = cmdParser.get<string>("billion");
    int tokens_limit = cmdParser.get<int>("limits");
    CPUBackend::cpu_threads = cmdParser.get<int>("thread");

    auto tokenizer = DCLMTokenizer(vocab_path, merge_path);
    DCLMConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
    auto model = DCLM(config);
    model.load(model_path);

    vector<string> in_strs = {
        "Machine learning is",
    };

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
        std::cout << in_str << std::flush;
        auto input_tensor = tokenizer.tokenize(in_str);
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto [out_string, out_token] = tokenizer.detokenize(result[0]);
            auto [not_end, output_string] = tokenizer.postprocess(out_string);
            std::cout << output_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
    }
}
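
Every flag above has a default, so a bare `./demo_dclm` works once the model, vocab, and merges files are in place. For reference, a fully explicit invocation spelling out those same defaults:

```bash
./demo_dclm -v ../vocab/dclm_vocab.mllm \
            -e ../vocab/dclm_merges.txt \
            -m ../models/dclm-1b-fp32.mllm \
            -b 1B -l 400 -t 4
```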
examples/demo_elastic_llama.cpp: 15 changes (6 additions, 9 deletions)
@@ -12,7 +12,7 @@ using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
-        auto input_tensor = tokenizer.tokenize(in_str, i);
+        auto input_tensor = tokenizer.tokenize(in_str);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
@@ -76,13 +76,10 @@ int main(int argc, char **argv) {
                {(int)(32 * ratio), (int)(11008 * ratio)} // 31
            };
            auto result = model({input_tensor}, activate_dims);
-            auto outputs = tokenizer.detokenize(result[0]);
-            auto out_string = outputs.first;
-            auto out_token = outputs.second;
-            if (out_token == 2) {
-                break;
-            }
-            std::cout << out_string << std::flush;
+            auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+            auto [not_end, output_string] = tokenizer.postprocess(out_string);
+            if (!not_end) { break; }
+            std::cout << output_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
examples/demo_fuyu.cpp: 7 changes (3 additions, 4 deletions)
@@ -51,10 +51,9 @@ int main(int argc, char **argv) {
            auto outputs = processor.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
-            if (out_token == 71013) {
-                break;
-            }
-            std::cout << out_string << std::flush;
+            auto [end, string] = processor.postprocess(out_string);
+            if (!end) { break; }
+            std::cout << string << std::flush;
            chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1], &input_tensors[2]});
        }
        printf("\n");
examples/demo_gemma.cpp: 11 changes (5 additions, 6 deletions)
@@ -42,16 +42,15 @@ int main(int argc, char **argv) {

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
-        auto input_tensor = tokenizer.tokenize(in_str, i);
+        auto input_tensor = tokenizer.tokenize(in_str);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
-            auto outputs = tokenizer.detokenize(result[0]);
-            auto out_string = outputs.first;
-            auto out_token = outputs.second;
-            if (out_token == tokenizer.eos_id && step != 0) break;
-            std::cout << out_string << std::flush;
+            auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+            auto [not_end, output_string] = tokenizer.postprocess(out_string);
+            if (!not_end) { break; }
+            std::cout << output_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
examples/demo_llama.cpp: 23 changes (10 additions, 13 deletions)
@@ -12,7 +12,7 @@ using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
-    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama2_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -30,24 +30,21 @@ int main(int argc, char **argv) {
    model.load(model_path);

    vector<string> in_strs = {
-        " Hello, who are you?",
-        " What can you do?",
+        "Hello, who are you?",
+        "What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};

    for (int i = 0; i < in_strs.size(); ++i) {
-        auto in_str = in_strs[i];
-        auto input_tensor = tokenizer.tokenize(in_str, i);
-        std::cout << "[Q] " << in_str << std::endl;
+        auto in_str = tokenizer.apply_chat_template(in_strs[i]);
+        auto input_tensor = tokenizer.tokenize(in_str);
+        std::cout << "[Q] " << in_strs[i] << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
-            auto outputs = tokenizer.detokenize(result[0]);
-            auto out_string = outputs.first;
-            auto out_token = outputs.second;
-            if (out_token == 2) {
-                break;
-            }
-            std::cout << out_string << std::flush;
+            auto [out_string, out_token] = tokenizer.detokenize(result[0]);
+            auto [not_end, output_string] = tokenizer.postprocess(out_string);
+            if (!not_end) { break; }
+            std::cout << output_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
examples/demo_llava.cpp: 6 changes (4 additions, 2 deletions)
@@ -48,10 +48,12 @@ int main(int argc, char **argv) {
            auto outputs = processor.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
-            if (out_token == 2) {
+            auto [isOk, print_string] = processor.postprocess(out_string);
+            if (isOk) {
+                std::cout << print_string << std::flush;
+            } else {
                break;
            }
-            std::cout << out_string << std::flush;
            chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1]});
        }
        printf("\n");
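
Taken together, these demo refactors converge on one generation-loop contract: `detokenize` returns the decoded text plus the sampled token id, and `postprocess` replaces the hard-coded EOS checks (token ids 2, 71013, `tokenizer.eos_id`) with a `(not_end, printable_text)` pair. A minimal self-contained C++ sketch of that contract; the stub types below are stand-ins for the real mllm classes, and only the pair shapes and loop structure mirror the PR's diffs:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Stubs standing in for mllm's Tensor, model, and tokenizer classes.
// The "model" just replays a canned token stream ending in an EOS id.
struct Tensor { int last_token = 0; };

struct StubModel {
    std::vector<int> canned{72, 105, 33, /*EOS*/ 2}; // 'H', 'i', '!', then stop
    size_t step = 0;
    Tensor operator()(const Tensor &) { return Tensor{canned[step++ % canned.size()]}; }
};

struct StubTokenizer {
    // detokenize: (decoded text, sampled token id)
    std::pair<std::string, int> detokenize(const Tensor &t) {
        return {t.last_token == 2 ? "</s>" : std::string(1, char(t.last_token)), t.last_token};
    }
    // postprocess: (not_end flag, printable text) -- EOS handling lives here
    // now, instead of magic token ids scattered across the demos.
    std::pair<bool, std::string> postprocess(const std::string &s) {
        if (s == "</s>") return {false, ""};
        return {true, s};
    }
};

int main() {
    StubModel model;
    StubTokenizer tokenizer;
    Tensor input_tensor;
    for (int step = 0; step < 100; step++) {
        auto result = model(input_tensor);
        auto [out_string, out_token] = tokenizer.detokenize(result);
        auto [not_end, output_string] = tokenizer.postprocess(out_string);
        if (!not_end) break;                 // stop decision comes from postprocess
        std::cout << output_string << std::flush;
        input_tensor.last_token = out_token; // stands in for chatPostProcessing
    }
    std::cout << "\n";                       // prints "Hi!" then stops at EOS
}
```

This is also why demo_llava keeps printing only while `isOk` holds: the postprocess pair folds the stop decision and the printable text into a single call instead of two per-model branches.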