From 7309f7dc58cdc81e1fca1bda0efe16a7fe9fdada Mon Sep 17 00:00:00 2001
From: YuChunlei <147286318@qq.com>
Date: Sat, 4 May 2024 05:41:42 +0000
Subject: [PATCH] update server.cfg

---
 __main__.py |  5 +++--
 model.py    | 26 ++++++++++++--------------
 server.cfg  | 50 ++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/__main__.py b/__main__.py
index 2fbd575..6a6b407 100644
--- a/__main__.py
+++ b/__main__.py
@@ -42,13 +42,14 @@ def main():
     description = "🦙 Llama.cpp python server. Host your own LLMs!🚀"
 
     parser = argparse.ArgumentParser(description=description)
-
+    current_file_path = __file__
+    current_directory = os.path.dirname(current_file_path)
     add_args_from_model(parser, Settings)
     parser.add_argument(
         "--config_file",
         type=str,
         help="Path to a config file to load.",
-        default="/home/test/api_server.cfg",
+        default= current_directory + "/server.cfg",
     )
     server_settings: ServerSettings | None = None
     model_settings: list[ModelSettings] = []
diff --git a/model.py b/model.py
index 191b615..028ae96 100644
--- a/model.py
+++ b/model.py
@@ -129,24 +129,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
 
     kwargs = {}
 
-    if settings.hf_model_repo_id is not None:
-        create_fn = functools.partial(
-            llama_cpp.Llama.from_pretrained,
-            repo_id=settings.hf_model_repo_id,
-            filename=settings.model,
-        )
-    elif settings.chat_format == "chatglm":
-        create_fn = chatglm_cpp.Pipeline
-        kwargs["model_path"] = settings.model
-    else:
-        create_fn = llama_cpp.Llama
-        kwargs["model_path"] = settings.model
-    if settings.chat_format == "chatglm3":
+    if settings.chat_format == "chatglm3" or settings.chat_format == "chatglm":
         _model = chatglm_cpp.Pipeline(settings.model)
         _model.create_chat_completion = chatglm.create_chat_completion
 
 
-    if settings.chat_format == "bge-onnx":
+    elif settings.chat_format == "bge-onnx":
         _model =extends.BgeOnnxModel(settings.model,settings.model_alias)
 
     elif settings.chat_format == "firefunction" :
@@ -189,6 +177,16 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             n_threads_batch=settings.n_threads_batch,
         )
     else:
+        if settings.hf_model_repo_id is not None:
+            create_fn = functools.partial(
+                llama_cpp.Llama.from_pretrained,
+                repo_id=settings.hf_model_repo_id,
+                filename=settings.model,
+            )
+        else:
+            create_fn = llama_cpp.Llama
+            kwargs["model_path"] = settings.model
+
         _model = create_fn(
             **kwargs,
             # Model Params
diff --git a/server.cfg b/server.cfg
index 5cebfa5..db399ef 100644
--- a/server.cfg
+++ b/server.cfg
@@ -2,12 +2,46 @@
     "host": "0.0.0.0",
     "port": 8000,
     "models": [
+        {
+            "model": "/home/test/llm-models/chatglm3-ggml.bin",
+            "model_alias": "chatglm3",
+            "chat_format": "chatglm3",
+            "n_gpu_layers": 0,
+            "offload_kqv": true,
+            "embedding": false,
+            "n_threads": 12,
+            "n_batch": 512
+        },
+        {
+            "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf",
+            "model_alias": "bge-large-zh-v1.5",
+            "chat_format": "bert",
+            "n_gpu_layers": 0,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_ctx": 8192,
+            "embedding": true,
+            "n_batch": 512,
+            "verbose": false
+        },
+        {
+            "model": "/home/test/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/babcf60cae0a1f438d7ade582983d4ba462303c2/onnx/",
+            "model_alias": "bge-m3",
+            "chat_format": "bge-onnx",
+            "embedding": true,
+            "n_gpu_layers": 0,
+            "n_ctx": 8192,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512
+        },
         {
             "model": "/home/test/llm-models/chatglm3-ggml-q8.bin",
-            "model_alias": "chatglm3",
+            "model_alias": "chatglm3-q8",
             "chat_format": "chatglm3",
             "n_gpu_layers": 0,
             "offload_kqv": true,
+            "embedding": false,
             "n_threads": 12,
             "n_batch": 512
         },
@@ -18,6 +52,7 @@
             "chat_format": "openfunctions",
             "n_gpu_layers": 0,
             "n_ctx":4096,
+            "embedding": false,
             "offload_kqv": true,
             "n_threads": 12,
             "n_batch": 512
@@ -31,20 +66,13 @@
             "offload_kqv": true,
             "n_threads": 12,
             "n_batch": 512,
+            "embedding": false,
             "n_ctx": 8192,
             "use_mmap":true
         },
-        {
-            "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf",
-            "model_alias": "bge-large-zh-v1.5",
-            "chat_format": "bert",
-            "n_gpu_layers": 0,
-            "offload_kqv": true,
-            "n_threads": 12,
-            "n_batch": 512
-        },
         {
             "model": "/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
+            "hf_pretrained_model_name_or_path":"meta-llama/Meta-Llama-3-8B-Instruct",
             "model_alias": "llama-3-8b",
             "chat_format": "llama-3",
             "n_gpu_layers": 0,
@@ -60,6 +88,7 @@
             "chat_format": "gemma",
             "n_gpu_layers": 0,
             "offload_kqv": true,
+            "embedding": false,
             "n_threads": 12,
             "n_ctx": 8192,
             "n_batch": 512
@@ -71,6 +100,7 @@
             "clip_model_path": "/home/test/llm-models/mmproj-model-f16.gguf",
             "n_gpu_layers": 0,
             "offload_kqv": true,
+            "embedding": false,
             "n_threads": 12,
             "n_ctx": 4096,
             "n_batch": 512