feat: Yi-1.5-6B support #88

Merged
merged 7 commits on Jul 15, 2024
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -459,6 +459,17 @@ else ()
target_link_libraries(demo_mistral PUBLIC MLLM_CPU)
endif ()

add_executable(demo_yi ${PROJECT_SOURCE_DIR}/examples/demo_yi.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
src/processor/PreProcess.cpp
)
if (ARM AND NOT APK)
target_compile_options(demo_yi PRIVATE -fopenmp)
target_link_libraries(demo_yi PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_link_libraries(demo_yi PUBLIC MLLM_CPU)
endif ()

# add_executable(demo_deepseek ${PROJECT_SOURCE_DIR}/examples/demo_deepseek.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
14 changes: 8 additions & 6 deletions README.md
@@ -12,7 +12,8 @@
Wait.. why on-device multimodal LLM? - It's a key building block for [intelligent personal agent](https://arxiv.org/pdf/2401.05459.pdf), text-based image searching/retrieval, screen VQA, and many more exciting mobile apps, without giving away your private data (chat history, screenshots, taken photos, etc).

## Recent update
- [:fire::fire:Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [🔥🔥Coming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
- [2024 July 2] Support new model: Yi V1.5 6B https://github.com/UbiquitousLearning/mllm/pull/88
- [2024 May 29] Support new model: Mistral V0.2 7B https://github.com/UbiquitousLearning/mllm/pull/83
- [2024 May 4] Support new model: QWen V1.5 0.5B https://github.com/UbiquitousLearning/mllm/pull/79
- [2024 April 9] Support new model: Gemma 2B https://github.com/UbiquitousLearning/mllm/pull/75
@@ -75,6 +76,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) |
| [Qwen 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) |

## Quick Start

@@ -246,16 +248,16 @@ cd tools/convertor
pip install -r ./requirements.txt

# for one file pytorch model
python convert.py --input_model=model.pth --output_model=model.mllm --type=torch
python converter.py --input_model=model.pth --output_model=model.mllm --type=torch

# for multi-file pytorch model
python convert.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch
python converter.py --input_model=pytorch_model.bin.index.json --output_model=model.mllm --type=torch

# for one file safetensor model
python convert.py --input_model=model.bin --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.bin --output_model=model.mllm --type=safetensor

# for multi-file safetensor model
python convert.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
python converter.py --input_model=model.safetensors.index.json --output_model=model.mllm --type=safetensor
```

### Convert vocabulary
@@ -274,7 +276,7 @@ mllm only supports two quantization modes: Q4_0 and Q4_K.

```bash
cd bin
./quantize model.mllm model_q4_0.mllm Q4_K
./quantize model.mllm model_q4_k.mllm Q4_K
```

## Roadmap
67 changes: 67 additions & 0 deletions examples/demo_yi.cpp
@@ -0,0 +1,67 @@
/**
* @file demo_yi.cpp
* @author Chenghua Wang ([email protected])
* @brief A command-line chat demo for the Yi-1.5-6B model.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#include "cmdline.h"
#include "models/yi/configuration_yi.hpp"
#include "models/yi/modeling_yi.hpp"
#include "models/yi/tokenization_yi.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/yi_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/yi-1.5-6b-chat-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = YiTokenizer(vocab_path);
YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
auto model = YiForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"请介绍北京邮电大学,推荐同学们报考。",
};

auto processOutput = [&](std::string &text) -> std::pair<bool, std::string> {
text = std::regex_replace(text, std::regex("▁"), " ");
if (text == "<|endoftext|>" || text == "<|im_end|>") return {false, ""};
return {true, text};
};

for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
std::cout << "[Q] " << in_str << std::endl;
auto input_tensor = tokenizer.tokenize(in_str, i);
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1000; step++) {
auto result = model({input_tensor});
auto outputs = tokenizer.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
auto [isOk, print_string] = processOutput(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
break;
}
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
}
}
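
As a side note for readers adapting this demo: the sketch below is not part of the PR, but shows one way to time the same decode loop and report tokens/second. It assumes the `model`, `tokenizer`, and `chatPostProcessing` interfaces behave exactly as used in `demo_yi.cpp` above; the helper name `timed_generate` is hypothetical.

```cpp
// Hypothetical helper (not part of this PR): times the decode loop from
// demo_yi.cpp and reports tokens/second. Assumes the mllm APIs shown above.
#include <chrono>
#include <iostream>
#include <string>

template <typename Model, typename Tokenizer>
void timed_generate(Model &model, Tokenizer &tokenizer,
                    const std::string &prompt, int prompt_id,
                    int max_steps = 1000) {
    auto input_tensor = tokenizer.tokenize(prompt, prompt_id);
    int generated = 0;
    auto start = std::chrono::steady_clock::now();
    for (int step = 0; step < max_steps; ++step) {
        auto result = model({input_tensor});
        auto outputs = tokenizer.detokenize(result[0]);
        auto &out_string = outputs.first;
        auto &out_token = outputs.second;
        // Same stop tokens as the demo's processOutput lambda.
        if (out_string == "<|endoftext|>" || out_string == "<|im_end|>") break;
        ++generated;
        chatPostProcessing(out_token, input_tensor, {});
    }
    auto elapsed = std::chrono::duration<double>(
                       std::chrono::steady_clock::now() - start).count();
    std::cout << generated << " tokens in " << elapsed << " s ("
              << (elapsed > 0 ? generated / elapsed : 0.0) << " tok/s)\n";
}
```

After `model.load(model_path)`, this could be called as `timed_generate(model, tokenizer, in_strs[0], 0);` to get a rough decode-throughput number alongside the streamed output.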
104 changes: 104 additions & 0 deletions src/models/yi/configuration_yi.hpp
@@ -0,0 +1,104 @@
/**
* @file configuration_yi.hpp
* @author Chenghua Wang ([email protected])
* @brief Configuration (tensor names and hyperparameters) for the Yi-1.5 model.
* @version 0.1
* @date 2024-07-02
*
* @copyright Copyright (c) 2024
*
*/
#ifndef CONFIG_YI_HPP
#define CONFIG_YI_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class YiNameConfig : public TransformerNameConfig {
public:
std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;

void init(RoPEType type = LLAMAROPE) {
switch (type) {
case LLAMAROPE: {
blk_name = "layers.";
_attn_base_name = "attention.";
_ffn_base_name = "feed_forward.";
_q_proj_name = "wq";
_k_proj_name = "wk";
_v_proj_name = "wv";
_o_proj_name = "wo";
_gate_proj_name = "w1";
_up_proj_name = "w3";
_down_proj_name = "w2";
_attn_norm_name = "attention_norm";
_ffn_norm_name = "ffn_norm";
token_embd_name = "tok_embeddings";
post_norm_name = "norm";
lm_head_name = "output";
break;
}
case HFHUBROPE: {
blk_name = "model.layers.";
_attn_base_name = "self_attn.";
_ffn_base_name = "mlp.";
_q_proj_name = "q_proj";
_k_proj_name = "k_proj";
_v_proj_name = "v_proj";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";
break;
}
default: {
throw std::runtime_error("Unsupported llama type");
}
}
}
};

class YiConfig {
public:
explicit YiConfig(int token_limit, string billions = "6B", RoPEType type = LLAMAROPE, int vocab = 64000) {
names_config.init(type);
vocab_size = vocab;
if (!(billions == "6B" || billions == "6b")) {
throw std::runtime_error("Unsupported model size");
}
RoPE_type = type;
cache_limit = token_limit;
}

public:
bool attention_bias = false;
float attention_drop = 0.0;
int pad_token_id = 0;
int bos_token_id = 1;
int eos_token_id = 2;
int hidden_size = 4096;
float initializer_range = 0.02;
int intermediate_size = 11008;
int max_position_embeddings = 4096;
int num_attention_heads = 32;
int num_hidden_layers = 32;
int num_key_value_heads = 4;
int pretraining_tp = 1;
float rms_norm_eps = 1e-6;
float rope_theta = 5000000.0;
int vocab_size = 64000;
int cache_limit;
RoPEType RoPE_type;
YiNameConfig names_config;
};

#endif //! CONFIG_YI_HPP
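
To make the naming and shape choices above concrete, here is a standalone sketch that is not part of the PR. It composes the HF-style tensor name implied by the HFHUBROPE prefixes for layer 0's query projection, and derives the grouped-query-attention layout from the default Yi-1.5-6B dimensions; the dot-joining convention and `.weight` suffix are assumptions for illustration, not taken from this PR.

```cpp
// Standalone sketch (not part of this PR): what the HFHUBROPE naming scheme
// and the default Yi-1.5-6B dimensions in YiConfig imply.
#include <cassert>
#include <iostream>
#include <string>

int main() {
    // Name composition for layer 0's query projection under HFHUBROPE naming.
    // The dot-joining and ".weight" suffix are assumptions for illustration.
    const std::string blk_name = "model.layers.";
    const std::string attn_base_name = "self_attn.";
    const std::string q_proj_name = "q_proj";
    const std::string q_weight =
        blk_name + "0." + attn_base_name + q_proj_name + ".weight";
    std::cout << q_weight << "\n"; // model.layers.0.self_attn.q_proj.weight

    // Grouped-query attention layout implied by the YiConfig defaults.
    const int hidden_size = 4096;
    const int num_attention_heads = 32;
    const int num_key_value_heads = 4;
    assert(hidden_size % num_attention_heads == 0);
    const int head_dim = hidden_size / num_attention_heads;                   // 128
    const int q_heads_per_kv = num_attention_heads / num_key_value_heads;     // 8
    std::cout << "head_dim=" << head_dim
              << ", query heads per KV head=" << q_heads_per_kv << "\n";
}
```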