Commit

CI(github): add self hosted
tpoisonooo committed Jan 22, 2024
1 parent 68f8168 commit eb82035
Showing 4 changed files with 149 additions and 39 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/config-ci.ini
@@ -0,0 +1,49 @@
[feature_store]
reject_throttle = 689.0
model_path = "GanymedeNil/text2vec-large-chinese"
work_dir = "workdir"

[web_search]
x_api_key = "${YOUR-API-KEY}"
domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
save_dir = "logs/web_search_result"

[llm]
enable_local = 1
enable_remote = 0
client_url = "http://127.0.0.1:8888/inference"

[llm.server]
# local_llm_path = "internlm/internlm2-chat-7b"
local_llm_path = "/data2/khj/internlm2-chat-7b"
local_llm_max_text_length = 16000
local_llm_bind_port = 8888
remote_type = "kimi"
remote_api_key = "${YOUR-API-KEY}"
remote_llm_max_text_length = 128000
remote_llm_model = "moonshot-v1-128k"

[worker]
enable_sg_search = 0
save_path = "logs/work.txt"

[worker.time]
start = "00:00:00"
end = "23:59:59"
has_weekday = 0

[sg_search]
binary_src_path = "/usr/local/bin/src"
src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"

[sg_search.opencompass]
github_repo_id = "open-compass/opencompass"
introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"

[sg_search.lmdeploy]
github_repo_id = "internlm/lmdeploy"
introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"

[frontend]
type = "none"
webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -1,4 +1,4 @@
-name: lint
+name: Check markdown local file link available

on:
  push:
16 changes: 16 additions & 0 deletions .github/workflows/smock.yml
@@ -0,0 +1,16 @@
name: Check `huixiangdou.service.main` works

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  lint:
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v2
      - name: Run LLM test
        run: |
          python -m huixiangdou.service.llm_server_hybrid --config_path .github/workflows/config-ci.ini --unittest
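
To reproduce this smoke test outside the self-hosted runner, the same command can be driven from a short script. A sketch only, assuming the repository root is the working directory and the local model path referenced in config-ci.ini exists on the machine:

    import subprocess
    import sys

    # Run the same smoke test the workflow executes on the self-hosted runner.
    cmd = [
        sys.executable, '-m', 'huixiangdou.service.llm_server_hybrid',
        '--config_path', '.github/workflows/config-ci.ini',
        '--unittest',
    ]
    result = subprocess.run(cmd, check=False)
    print(f'smoke test exit code: {result.returncode}')
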
121 changes: 83 additions & 38 deletions huixiangdou/service/llm_server_hybrid.py
@@ -12,6 +12,63 @@
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer

class InferenceWrapper:
    """A class that wraps different inference frameworks."""
    def __init__(self, model_path: str, local_max_length: int = 8000):
        """Init model handler."""
        self.inference = 'huggingface'

        # try:
        #     import lmdeploy
        #     from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
        #     self.inference = 'lmdeploy'
        # except ImportError:
        #     logger.warning(
        #         "Warning: auto enable lmdeploy for higher efficiency"  # noqa E501
        #         "https://github.com/internlm/lmdeploy"
        #     )

        # if self.inference == 'huggingface':
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map='auto',
            torch_dtype='auto'
        ).eval()

        # else:
        #     backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=local_max_length)
        #     self.pipe = pipeline(model_path, backend_config=backend_config)
        #     self.gen_config = GenerationConfig(top_p=0.8,
        #                                        top_k=1,
        #                                        temperature=0.8,
        #                                        max_new_tokens=1024)

    def chat(self, prompt: str, history=[]):
        """Generate a response from the local LLM.

        Args:
            prompt (str): The prompt for inference.
            history (list): List of previous interactions.

        Returns:
            str: Generated response.
        """
        output_text = ''
        # if self.inference == 'huggingface':
        output_text, _ = self.model.chat(self.tokenizer,
                                         prompt,
                                         history,
                                         top_k=1,
                                         do_sample=False)
        # elif self.inference == 'lmdeploy':
        #     output_text = pipe(prompt, gen_config=self.gen_config)
        # else:
        #     raise Exception(f'unknown inference framework {self.inference}')
        return output_text


class HybridLLMServer:
"""A class to handle server-side interactions with a hybrid language
@@ -42,19 +99,8 @@ def __init__(self,

        model_path = self.server_config['local_llm_path']

-        self.tokenizer = None
-        self.model = None
-
        if self.enable_local:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_path, trust_remote_code=True)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_path,
-                trust_remote_code=True,
-                device_map='auto',
-                torch_dtype='auto',
-                # fp16=True,
-            ).eval()
+            self.inference = InferenceWrapper(model_path)
        else:
            logger.warning('local LLM disabled.')

@@ -155,12 +201,10 @@ def generate_response(self, prompt, history=[], remote=False):
"""# Caution: For the results of this software to be reliable and verifiable, # noqa E501
it's essential to ensure reproducibility. Thus `GenerationMode.GREEDY_SEARCH` # noqa E501
must enabled."""
output_text, _ = self.model.chat(self.tokenizer,
prompt,
history,
top_k=1,
do_sample=False)
print((prompt, output_text))

output_text = self.inference.chat(prompt, history)

logger.info((prompt, output_text))
time_finish = time.time()

logger.debug('Q:{} A:{} \t\t remote {} timecost {} '.format(
@@ -178,6 +222,10 @@ def parse_args():
        help= # noqa E251
        'Hybrid LLM Server configuration path. Default value is config.ini' # noqa E501
    )
+    parser.add_argument('--unittest',
+                        action='store_true',
+                        default=False,
+                        help='Test with samples.')
    args = parser.parse_args()
    return args

@@ -223,31 +271,28 @@ async def inference(request):


def main():
-    """Main function to start the server process and run a sample client
-    request."""
+    """Function to start the server without running a separate process."""
    args = parse_args()
    server_ready = Value('i', 0)

-    server_process = Process(target=llm_serve,
-                             args=(args.config_path, server_ready))
-    server_process.daemon = True
-    server_process.start()
-
-    from llm_client import ChatClient
-    client = ChatClient(config_path=args.config_path)
-    while server_ready.value == 0:
-        logger.info('waiting for server to be ready..')
-        time.sleep(3)
-    print(client.generate_response(prompt='今天天气如何?', history=[], remote=False))
+    if not args.unittest:
+        llm_serve(args.config_path, server_ready)
+    else:
+        server_process = Process(target=llm_serve,
+                                 args=(args.config_path, server_ready))
+        server_process.daemon = True
+        server_process.start()
+
+        from .llm_client import ChatClient
+        client = ChatClient(config_path=args.config_path)
+        while server_ready.value == 0:
+            logger.info('waiting for server to be ready..')
+            time.sleep(3)
+
-def simple_bind():
-    """Function to start the server without running a separate process."""
-    args = parse_args()
-    server_ready = Value('i', 0)
-
-    llm_serve(args.config_path, server_ready)
+        queries = ['今天天气如何?']
+        for query in queries:
+            print(client.generate_response(prompt=query, history=[], remote=False))


if __name__ == '__main__':
-    simple_bind()
+    main()
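
For context (not part of this commit): a minimal usage sketch of the new InferenceWrapper abstraction. The model path is a placeholder, and the snippet assumes a locally available HuggingFace-format chat model that exposes a `model.chat(tokenizer, prompt, history, ...)` method, as internlm2-chat does:

    from huixiangdou.service.llm_server_hybrid import InferenceWrapper

    # Placeholder path: substitute a local chat model directory.
    wrapper = InferenceWrapper(model_path='/path/to/internlm2-chat-7b')
    reply = wrapper.chat(prompt='What is HuixiangDou?', history=[])
    print(reply)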
