Commit

CI(github): add self hosted
tpoisonooo committed Jan 22, 2024
1 parent 68f8168 commit eb82035
Showing 4 changed files with 149 additions and 39 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/config-ci.ini
@@ -0,0 +1,49 @@
[feature_store]
reject_throttle = 689.0
model_path = "GanymedeNil/text2vec-large-chinese"
work_dir = "workdir"

[web_search]
x_api_key = "${YOUR-API-KEY}"
domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
save_dir = "logs/web_search_result"

[llm]
enable_local = 1
enable_remote = 0
client_url = "http://127.0.0.1:8888/inference"

[llm.server]
# local_llm_path = "internlm/internlm2-chat-7b"
local_llm_path = "/data2/khj/internlm2-chat-7b"
local_llm_max_text_length = 16000
local_llm_bind_port = 8888
remote_type = "kimi"
remote_api_key = "${YOUR-API-KEY}"
remote_llm_max_text_length = 128000
remote_llm_model = "moonshot-v1-128k"

[worker]
enable_sg_search = 0
save_path = "logs/work.txt"

[worker.time]
start = "00:00:00"
end = "23:59:59"
has_weekday = 0

[sg_search]
binary_src_path = "/usr/local/bin/src"
src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"

[sg_search.opencompass]
github_repo_id = "open-compass/opencompass"
introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"

[sg_search.lmdeploy]
github_repo_id = "internlm/lmdeploy"
introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"

[frontend]
type = "none"
webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -1,4 +1,4 @@
-name: lint
+name: Check markdown local file link available

on:
  push:
16 changes: 16 additions & 0 deletions .github/workflows/smock.yml
@@ -0,0 +1,16 @@
name: Check `huixiangdou.service.main` works

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  lint:
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v2
      - name: Run LLM test
        run: |
          python -m huixiangdou.service.llm_server_hybrid --config_path .github/workflows/config-ci.ini --unittest
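
To reproduce this smoke test outside the self-hosted runner, the same command can be driven from a short script. A sketch only, assuming the repository root is the working directory and the local model path referenced in config-ci.ini exists on the machine:

    import subprocess
    import sys

    # Run the same smoke test the workflow executes on the self-hosted runner.
    cmd = [
        sys.executable, '-m', 'huixiangdou.service.llm_server_hybrid',
        '--config_path', '.github/workflows/config-ci.ini',
        '--unittest',
    ]
    result = subprocess.run(cmd, check=False)
    print(f'smoke test exit code: {result.returncode}')
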
121 changes: 83 additions & 38 deletions huixiangdou/service/llm_server_hybrid.py
@@ -12,6 +12,63 @@
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer

class InferenceWrapper:
    """A class that wraps different inference frameworks."""
    def __init__(self, model_path: str, local_max_length: int = 8000):
        """Init model handler."""
        self.inference = 'huggingface'

        # try:
        #     import lmdeploy
        #     from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
        #     self.inference = 'lmdeploy'
        # except ImportError:
        #     logger.warning(
        #         "Warning: auto enable lmdeploy for higher efficiency"  # noqa E501
        #         "https://github.com/internlm/lmdeploy"
        #     )

        # if self.inference == 'huggingface':
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map='auto',
            torch_dtype='auto'
        ).eval()

        # else:
        #     backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=local_max_length)
        #     self.pipe = pipeline(model_path, backend_config=backend_config)
        #     self.gen_config = GenerationConfig(top_p=0.8,
        #                                        top_k=1,
        #                                        temperature=0.8,
        #                                        max_new_tokens=1024)

    def chat(self, prompt: str, history=[]):
        """Generate a response from the local LLM.

        Args:
            prompt (str): The prompt for inference.
            history (list): List of previous interactions.

        Returns:
            str: Generated response.
        """
        output_text = ''
        # if self.inference == 'huggingface':
        output_text, _ = self.model.chat(self.tokenizer,
                                         prompt,
                                         history,
                                         top_k=1,
                                         do_sample=False)
        # elif self.inference == 'lmdeploy':
        #     output_text = pipe(prompt, gen_config=self.gen_config)
        # else:
        #     raise Exception(f'unknown inference framework {self.inference}')
        return output_text


class HybridLLMServer:
"""A class to handle server-side interactions with a hybrid language
@@ -42,19 +99,8 @@ def __init__(self,

        model_path = self.server_config['local_llm_path']

-        self.tokenizer = None
-        self.model = None
-
        if self.enable_local:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_path, trust_remote_code=True)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_path,
-                trust_remote_code=True,
-                device_map='auto',
-                torch_dtype='auto',
-                # fp16=True,
-            ).eval()
+            self.inference = InferenceWrapper(model_path)
        else:
            logger.warning('local LLM disabled.')

@@ -155,12 +201,10 @@ def generate_response(self, prompt, history=[], remote=False):
"""# Caution: For the results of this software to be reliable and verifiable, # noqa E501
it's essential to ensure reproducibility. Thus `GenerationMode.GREEDY_SEARCH` # noqa E501
must enabled."""
output_text, _ = self.model.chat(self.tokenizer,
prompt,
history,
top_k=1,
do_sample=False)
print((prompt, output_text))

output_text = self.inference.chat(prompt, history)

logger.info((prompt, output_text))
time_finish = time.time()

logger.debug('Q:{} A:{} \t\t remote {} timecost {} '.format(
@@ -178,6 +222,10 @@ def parse_args():
        help= # noqa E251
        'Hybrid LLM Server configuration path. Default value is config.ini' # noqa E501
    )
+    parser.add_argument('--unittest',
+                        action='store_true',
+                        default=False,
+                        help='Test with samples.')
    args = parser.parse_args()
    return args

@@ -223,31 +271,28 @@ async def inference(request):


def main():
-    """Main function to start the server process and run a sample client
-    request."""
+    """Function to start the server without running a separate process."""
    args = parse_args()
    server_ready = Value('i', 0)

-    server_process = Process(target=llm_serve,
-                             args=(args.config_path, server_ready))
-    server_process.daemon = True
-    server_process.start()
-
-    from llm_client import ChatClient
-    client = ChatClient(config_path=args.config_path)
-    while server_ready.value == 0:
-        logger.info('waiting for server to be ready..')
-        time.sleep(3)
-    print(client.generate_response(prompt='今天天气如何?', history=[], remote=False))
+    if not args.unittest:
+        llm_serve(args.config_path, server_ready)
+    else:
+        server_process = Process(target=llm_serve,
+                                 args=(args.config_path, server_ready))
+        server_process.daemon = True
+        server_process.start()
+
+        from .llm_client import ChatClient
+        client = ChatClient(config_path=args.config_path)
+        while server_ready.value == 0:
+            logger.info('waiting for server to be ready..')
+            time.sleep(3)
+
-def simple_bind():
-    """Function to start the server without running a separate process."""
-    args = parse_args()
-    server_ready = Value('i', 0)
-
-    llm_serve(args.config_path, server_ready)
+        queries = ['今天天气如何?']
+        for query in queries:
+            print(client.generate_response(prompt=query, history=[], remote=False))


if __name__ == '__main__':
-    simple_bind()
+    main()
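
For context (not part of this commit): a minimal usage sketch of the new InferenceWrapper abstraction. The model path is a placeholder, and the snippet assumes a locally available HuggingFace-format chat model that exposes a `model.chat(tokenizer, prompt, history, ...)` method, as internlm2-chat does:

    from huixiangdou.service.llm_server_hybrid import InferenceWrapper

    # Placeholder path: substitute a local chat model directory.
    wrapper = InferenceWrapper(model_path='/path/to/internlm2-chat-7b')
    reply = wrapper.chat(prompt='What is HuixiangDou?', history=[])
    print(reply)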
