Merge pull request #88 from chenghuaWang/main
feat: Add Yi-1.5-6B support
yirongjie authored Jul 15, 2024
2 parents 40544f9 + ce6c112 commit ca6a898
Showing 9 changed files with 454 additions and 12 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -459,6 +459,17 @@ else ()
target_link_libraries(demo_mistral PUBLIC MLLM_CPU)
endif ()

add_executable(demo_yi ${PROJECT_SOURCE_DIR}/examples/demo_yi.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
src/processor/PreProcess.cpp
)
if (ARM AND NOT APK)
target_compile_options(demo_yi PRIVATE -fopenmp)
target_link_libraries(demo_yi PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_link_libraries(demo_yi PUBLIC MLLM_CPU)
endif ()
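
With the target above, demo_yi builds like the other example demos. A minimal build sketch, assuming the repository's usual out-of-tree CMake flow (the `build` directory name, generator, and `-j` value are assumptions, not part of this diff):

```bash
# Hypothetical host build of only the new demo_yi target.
# Directory layout and -j value are assumptions; adjust to your setup.
mkdir -p build && cd build
cmake ..
make demo_yi -j4
```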

# add_executable(demo_deepseek ${PROJECT_SOURCE_DIR}/examples/demo_deepseek.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
14 changes: 8 additions & 6 deletions README.md
@@ -12,7 +12,8 @@
Wait.. why on-device multimodal LLM? - It's a key building block for [intelligent personal agent](https://arxiv.org/pdf/2401.05459.pdf), text-based image searching/retrieval, screen VQA, and many more exciting mobile apps, without giving away your private data (chat history, screenshots, taken photos, etc).

## Recent update
- [:fire::fire:Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [🔥🔥Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [2024 July 2] Support new model: Yi V1.5 6B https://github.com/UbiquitousLearning/mllm/pull/88
- [2024 May 29] Support new model: Mistral V0.2 7B https://github.com/UbiquitousLearning/mllm/pull/83
- [2024 May 4] Support new model: QWen V1.5 0.5B https://github.com/UbiquitousLearning/mllm/pull/79
- [2024 April 9] Support new model: Gemma 2B https://github.com/UbiquitousLearning/mllm/pull/75
@@ -75,6 +76,7 @@
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) |
| [Qwen 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) |

## Quick Start

@@ -246,16 +248,16 @@ cd tools/convertor
pip install -r ./requirements.txt

# for one file pytorch model
python convert.py --input_model=model.pth --output_model=model.mllm --type=torch
python converter.py --input_model=model.pth --output_model=model.mllm --type=torch

# for multi-file pytorch model
python convert.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch
python converter.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch

# for one file safetensor model
python convert.py --input_model=model.bin --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.bin --output_model=model.mllm --type=safetensor

# for multi-file safetensor model
python convert.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
```
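
For the Yi-1.5-6B-Chat checkpoint added in this PR, the safetensors path is the relevant one. A hedged example, assuming the Hugging Face checkpoint is downloaded locally as sharded safetensors with a `model.safetensors.index.json`; the local path and output file name below are illustrative:

```bash
# Illustrative only: convert a locally downloaded Yi-1.5-6B-Chat checkpoint to mllm format.
# The input path and output name are assumptions, not taken from this diff.
cd tools/convertor
python converter.py \
    --input_model=/path/to/Yi-1.5-6B-Chat/model.safetensors.index.json \
    --output_model=yi-1.5-6b-chat.mllm \
    --type=safetensor
```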

### Convert vocabulary
@@ -274,7 +276,7 @@ mllm only supports two quantization modes: Q4_0 and Q4_K.

```bash
cd bin
./quantize model.mllm model_q4_0.mllm Q4_K
./quantize model.mllm model_q4_k.mllm Q4_K
```
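
Applied to a converted Yi model, the same step would produce the Q4_K file that `demo_yi` expects by default (the input and output names below are assumptions; only the `quantize` tool and the Q4_K mode come from the text above):

```bash
# Illustrative: quantize the converted Yi model to Q4_K.
cd bin
./quantize yi-1.5-6b-chat.mllm yi-1.5-6b-chat-q4_k.mllm Q4_K
```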

## Roadmap
67 changes: 67 additions & 0 deletions examples/demo_yi.cpp
@@ -0,0 +1,67 @@
/**
* @file demo_yi.cpp
* @author Chenghua Wang ([email protected])
* @brief Yi-1.5-6B chat demo.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#include "cmdline.h"
#include "models/yi/configuration_yi.hpp"
#include "models/yi/modeling_yi.hpp"
#include "models/yi/tokenization_yi.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/yi_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/yi-1.5-6b-chat-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = YiTokenizer(vocab_path);
YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
auto model = YiForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"请介绍北京邮电大学,推荐同学们报考。",
};

auto processOutput = [&](std::string &text) -> std::pair<bool, std::string> {
// Strip the SentencePiece word-boundary marker "▁", replacing it with a plain space.
text = std::regex_replace(text, std::regex("▁"), " ");
if (text == "<|endoftext|>" || text == "<|im_end|>") return {false, ""};
return {true, text};
};

for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
std::cout << "[Q] " << in_str << std::endl;
auto input_tensor = tokenizer.tokenize(in_str, i);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1000; step++) {
auto result = model({input_tensor});
auto outputs = tokenizer.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
auto [isOk, print_string] = processOutput(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
break;
}
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
}
}
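
Given the command-line options defined above, a typical invocation might look like the sketch below. The option values mirror the defaults hard-coded in `demo_yi.cpp`; the working directory (a build output directory next to `models/` and `vocab/`) is an assumption:

```bash
# Sketch: run the Yi demo with the defaults from demo_yi.cpp made explicit.
./demo_yi \
    -m ../models/yi-1.5-6b-chat-q4_k.mllm \
    -v ../vocab/yi_vocab.mllm \
    -l 400 \
    -t 4
```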
104 changes: 104 additions & 0 deletions src/models/yi/configuration_yi.hpp
@@ -0,0 +1,104 @@
/**
* @file configuration_yi.hpp
* @author Chenghua Wang ([email protected])
* @brief Model and tensor-name configuration for Yi-1.5-6B.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#ifndef CONFIG_YI_HPP
#define CONFIG_YI_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class YiNameConfig : public TransformerNameConfig {
public:
std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;

void init(RoPEType type = LLAMAROPE) {
switch (type) {
case LLAMAROPE: {
blk_name = "layers.";
_attn_base_name = "attention.";
_ffn_base_name = "feed_forward.";
_q_proj_name = "wq";
_k_proj_name = "wk";
_v_proj_name = "wv";
_o_proj_name = "wo";
_gate_proj_name = "w1";
_up_proj_name = "w3";
_down_proj_name = "w2";
_attn_norm_name = "attention_norm";
_ffn_norm_name = "ffn_norm";
token_embd_name = "tok_embeddings";
post_norm_name = "norm";
lm_head_name = "output";
break;
}
case HFHUBROPE: {
blk_name = "model.layers.";
_attn_base_name = "self_attn.";
_ffn_base_name = "mlp.";
_q_proj_name = "q_proj";
_k_proj_name = "k_proj";
_v_proj_name = "v_proj";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";
break;
}
default: {
throw std::runtime_error("Unsupported RoPE type");
}
}
}
};

class YiConfig {
public:
explicit YiConfig(int token_limit, string billions = "6B", RoPEType type = LLAMAROPE, int vocab = 64000) {
names_config.init(type);
vocab_size = vocab;
if (!(billions == "6B" || billions == "6b")) {
throw std::runtime_error("Unsupported model size");
}
RoPE_type = type;
cache_limit = token_limit;
}

public:
bool attention_bias = false;
float attention_drop = 0.0;
int pad_token_id = 0;
int bos_token_id = 1;
int eos_token_id = 2;
int hidden_size = 4096;
float initializer_range = 0.02;
int intermediate_size = 11008;
int max_position_embeddings = 4096;
int num_attention_heads = 32;
int num_hidden_layers = 32;
int num_key_value_heads = 4;
int pretraining_tp = 1;
float rms_norm_eps = 1e-6;
float rope_theta = 5000000.0;
int vocab_size = 64000;
int cache_limit;
RoPEType RoPE_type;
YiNameConfig names_config;
};

#endif //! CONFIG_YI_HPP