Merge pull request #88 from chenghuaWang/main
feat: Add Yi-1.5-6B support
yirongjie authored Jul 15, 2024
2 parents 40544f9 + ce6c112 commit ca6a898
Showing 9 changed files with 454 additions and 12 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -459,6 +459,17 @@ else ()
target_link_libraries(demo_mistral PUBLIC MLLM_CPU)
endif ()

add_executable(demo_yi ${PROJECT_SOURCE_DIR}/examples/demo_yi.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
src/processor/PreProcess.cpp
)
if (ARM AND NOT APK)
target_compile_options(demo_yi PRIVATE -fopenmp)
target_link_libraries(demo_yi PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_link_libraries(demo_yi PUBLIC MLLM_CPU)
endif ()
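
With the target above, demo_yi builds like the other example demos. A minimal build sketch, assuming the repository's usual out-of-tree CMake flow (the `build` directory name, generator, and `-j` value are assumptions, not part of this diff):

```bash
# Hypothetical host build of only the new demo_yi target.
# Directory layout and -j value are assumptions; adjust to your setup.
mkdir -p build && cd build
cmake ..
make demo_yi -j4
```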

# add_executable(demo_deepseek ${PROJECT_SOURCE_DIR}/examples/demo_deepseek.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
14 changes: 8 additions & 6 deletions README.md
@@ -12,7 +12,8 @@
Wait.. why on-device multimodal LLM? - It's a key building block for [intelligent personal agent](https://arxiv.org/pdf/2401.05459.pdf), text-based image searching/retrieval, screen VQA, and many more exciting mobile apps, without giving away your private data (chat history, screenshots, taken photos, etc).

## Recent update
- [:fire::fire:Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [🔥🔥Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [2024 July 2] Support new model: Yi V1.5 6B https://github.com/UbiquitousLearning/mllm/pull/88
- [2024 May 29] Support new model: Mistral V0.2 7B https://github.com/UbiquitousLearning/mllm/pull/83
- [2024 May 4] Support new model: QWen V1.5 0.5B https://github.com/UbiquitousLearning/mllm/pull/79
- [2024 April 9] Support new model: Gemma 2B https://github.com/UbiquitousLearning/mllm/pull/75
@@ -75,6 +76,7 @@
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) |
| [Qwen 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) |

## Quick Start

@@ -246,16 +248,16 @@ cd tools/convertor
pip install -r ./requirements.txt

# for one file pytorch model
python convert.py --input_model=model.pth --output_model=model.mllm --type=torch
python converter.py --input_model=model.pth --output_model=model.mllm --type=torch

# for multi-file pytorch model
python convert.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch
python converter.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch

# for one file safetensor model
python convert.py --input_model=model.bin --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.bin --output_model=model.mllm --type=safetensor

# for multi-file safetensor model
python convert.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
```
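
For the Yi-1.5-6B-Chat checkpoint added in this PR, the safetensors path is the relevant one. A hedged example, assuming the Hugging Face checkpoint is downloaded locally as sharded safetensors with a `model.safetensors.index.json`; the local path and output file name below are illustrative:

```bash
# Illustrative only: convert a locally downloaded Yi-1.5-6B-Chat checkpoint to mllm format.
# The input path and output name are assumptions, not taken from this diff.
cd tools/convertor
python converter.py \
    --input_model=/path/to/Yi-1.5-6B-Chat/model.safetensors.index.json \
    --output_model=yi-1.5-6b-chat.mllm \
    --type=safetensor
```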

### Convert vocabulary
@@ -274,7 +276,7 @@ mllm only supports two quantization modes: Q4_0 and Q4_K.

```bash
cd bin
./quantize model.mllm model_q4_0.mllm Q4_K
./quantize model.mllm model_q4_k.mllm Q4_K
```
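
Applied to a converted Yi model, the same step would produce the Q4_K file that `demo_yi` expects by default (the input and output names below are assumptions; only the `quantize` tool and the Q4_K mode come from the text above):

```bash
# Illustrative: quantize the converted Yi model to Q4_K.
cd bin
./quantize yi-1.5-6b-chat.mllm yi-1.5-6b-chat-q4_k.mllm Q4_K
```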

## Roadmap
67 changes: 67 additions & 0 deletions examples/demo_yi.cpp
@@ -0,0 +1,67 @@
/**
* @file demo_yi.cpp
* @author Chenghua Wang ([email protected])
* @brief Yi-1.5-6B chat demo.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#include "cmdline.h"
#include "models/yi/configuration_yi.hpp"
#include "models/yi/modeling_yi.hpp"
#include "models/yi/tokenization_yi.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/yi_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/yi-1.5-6b-chat-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = YiTokenizer(vocab_path);
YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
auto model = YiForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"请介绍北京邮电大学,推荐同学们报考。",
};

auto processOutput = [&](std::string &text) -> std::pair<bool, std::string> {
// Strip the SentencePiece word-boundary marker "▁", replacing it with a plain space.
text = std::regex_replace(text, std::regex("▁"), " ");
if (text == "<|endoftext|>" || text == "<|im_end|>") return {false, ""};
return {true, text};
};

for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
std::cout << "[Q] " << in_str << std::endl;
auto input_tensor = tokenizer.tokenize(in_str, i);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1000; step++) {
auto result = model({input_tensor});
auto outputs = tokenizer.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
auto [isOk, print_string] = processOutput(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
break;
}
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
}
}
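
Given the command-line options defined above, a typical invocation might look like the sketch below. The option values mirror the defaults hard-coded in `demo_yi.cpp`; the working directory (a build output directory next to `models/` and `vocab/`) is an assumption:

```bash
# Sketch: run the Yi demo with the defaults from demo_yi.cpp made explicit.
./demo_yi \
    -m ../models/yi-1.5-6b-chat-q4_k.mllm \
    -v ../vocab/yi_vocab.mllm \
    -l 400 \
    -t 4
```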
104 changes: 104 additions & 0 deletions src/models/yi/configuration_yi.hpp
@@ -0,0 +1,104 @@
/**
* @file configuration_yi.hpp
* @author Chenghua Wang ([email protected])
* @brief Model and tensor-name configuration for Yi-1.5-6B.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#ifndef CONFIG_YI_HPP
#define CONFIG_YI_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class YiNameConfig : public TransformerNameConfig {
public:
std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;

void init(RoPEType type = LLAMAROPE) {
switch (type) {
case LLAMAROPE: {
blk_name = "layers.";
_attn_base_name = "attention.";
_ffn_base_name = "feed_forward.";
_q_proj_name = "wq";
_k_proj_name = "wk";
_v_proj_name = "wv";
_o_proj_name = "wo";
_gate_proj_name = "w1";
_up_proj_name = "w3";
_down_proj_name = "w2";
_attn_norm_name = "attention_norm";
_ffn_norm_name = "ffn_norm";
token_embd_name = "tok_embeddings";
post_norm_name = "norm";
lm_head_name = "output";
break;
}
case HFHUBROPE: {
blk_name = "model.layers.";
_attn_base_name = "self_attn.";
_ffn_base_name = "mlp.";
_q_proj_name = "q_proj";
_k_proj_name = "k_proj";
_v_proj_name = "v_proj";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";
break;
}
default: {
throw std::runtime_error("Unsupported RoPE type");
}
}
}
};

class YiConfig {
public:
explicit YiConfig(int token_limit, string billions = "6B", RoPEType type = LLAMAROPE, int vocab = 64000) {
names_config.init(type);
vocab_size = vocab;
if (!(billions == "6B" || billions == "6b")) {
throw std::runtime_error("Unsupported model size");
}
RoPE_type = type;
cache_limit = token_limit;
}

public:
bool attention_bias = false;
float attention_drop = 0.0;
int pad_token_id = 0;
int bos_token_id = 1;
int eos_token_id = 2;
int hidden_size = 4096;
float initializer_range = 0.02;
int intermediate_size = 11008;
int max_position_embeddings = 4096;
int num_attention_heads = 32;
int num_hidden_layers = 32;
int num_key_value_heads = 4;
int pretraining_tp = 1;
float rms_norm_eps = 1e-6;
float rope_theta = 5000000.0;
int vocab_size = 64000;
int cache_limit;
RoPEType RoPE_type;
YiNameConfig names_config;
};

#endif //! CONFIG_YI_HPP