feat(model): Support llama.cpp server deploy (#2263)
fangyinc authored Jan 2, 2025
1 parent 576da34 commit 0b2af2e
Showing 14 changed files with 823 additions and 44 deletions.
171 changes: 163 additions & 8 deletions dbgpt/core/schema/api.py
@@ -65,6 +65,14 @@ class APIChatCompletionRequest(BaseModel):
presence_penalty: Optional[float] = Field(0.0, description="Presence penalty")


class UsageInfo(BaseModel):
"""Usage info entity."""

prompt_tokens: int = Field(0, description="Prompt tokens")
total_tokens: int = Field(0, description="Total tokens")
completion_tokens: Optional[int] = Field(0, description="Completion tokens")


class DeltaMessage(BaseModel):
"""Delta message entity for chat completion response."""

@@ -95,6 +103,7 @@ class ChatCompletionStreamResponse(BaseModel):
choices: List[ChatCompletionResponseStreamChoice] = Field(
..., description="Chat completion response choices"
)
usage: UsageInfo = Field(..., description="Usage info")


class ChatMessage(BaseModel):
@@ -104,14 +113,6 @@ class ChatMessage(BaseModel):
content: str = Field(..., description="Content of the message")


class UsageInfo(BaseModel):
"""Usage info entity."""

prompt_tokens: int = Field(0, description="Prompt tokens")
total_tokens: int = Field(0, description="Total tokens")
completion_tokens: Optional[int] = Field(0, description="Completion tokens")


class ChatCompletionResponseChoice(BaseModel):
"""Chat completion response choice entity."""

Expand Down Expand Up @@ -256,3 +257,157 @@ class ErrorCode(IntEnum):
GRADIO_STREAM_UNKNOWN_ERROR = 50004
CONTROLLER_NO_WORKER = 50005
CONTROLLER_WORKER_TIMEOUT = 50006


class CompletionRequest(BaseModel):
"""Completion request entity."""

model: str = Field(..., description="Model name")
prompt: Union[str, List[Any]] = Field(
...,
description="Provide the prompt for this completion as a string or as an "
"array of strings or numbers representing tokens",
)
suffix: Optional[str] = Field(
None,
description="Suffix to append to the completion. If provided, the model will "
"stop generating upon reaching this suffix",
)
temperature: Optional[float] = Field(
0.8,
description="Adjust the randomness of the generated text. Default: `0.8`",
)
n: Optional[int] = Field(
1,
description="Number of completions to generate. Default: `1`",
)
max_tokens: Optional[int] = Field(
16,
description="The maximum number of tokens that can be generated in the "
"completion. Default: `16`",
)
stop: Optional[Union[str, List[str]]] = Field(
None,
description="Up to 4 sequences where the API will stop generating further "
"tokens. The returned text will not contain the stop sequence.",
)
stream: Optional[bool] = Field(
False,
description="Whether to stream back partial completions. Default: `False`",
)
top_p: Optional[float] = Field(
1.0,
description="Limit the next token selection to a subset of tokens with a "
"cumulative probability above a threshold P. Default: `1.0`",
)
top_k: Optional[int] = Field(
-1,
description="Limit the next token selection to the K most probable tokens. "
"Default: `-1`",
)
logprobs: Optional[int] = Field(
None,
description="Modify the likelihood of specified tokens appearing in the "
"completion.",
)
echo: Optional[bool] = Field(
False, description="Echo back the prompt in addition to the completion"
)
presence_penalty: Optional[float] = Field(
0.0,
description="Number between -2.0 and 2.0. Positive values penalize new tokens "
"based on whether they appear in the text so far, increasing the model's "
"likelihood to talk about new topics.",
)
frequency_penalty: Optional[float] = Field(
0.0,
description="Number between -2.0 and 2.0. Positive values penalize new tokens "
"based on their existing frequency in the text so far, decreasing the model's "
"likelihood to repeat the same line verbatim.",
)
user: Optional[str] = Field(
None,
description="A unique identifier representing your end-user, which can help "
"OpenAI to monitor and detect abuse.",
)
use_beam_search: Optional[bool] = False
best_of: Optional[int] = Field(
1,
description='Generates best_of completions server-side and returns the "best" '
"(the one with the highest log probability per token). Results cannot be "
"streamed. When used with n, best_of controls the number of candidate "
"completions and n specifies how many to return – best_of must be greater than "
"n.",
)


class LogProbs(BaseModel):
"""Logprobs entity."""

text_offset: List[int] = Field(default_factory=list, description="Text offset")
token_logprobs: List[Optional[float]] = Field(
default_factory=list, description="Token logprobs"
)
tokens: List[str] = Field(default_factory=list, description="Tokens")
top_logprobs: List[Optional[Dict[str, float]]] = Field(
default_factory=list, description="Top logprobs"
)


class CompletionResponseChoice(BaseModel):
"""Completion response choice entity."""

index: int = Field(..., description="Choice index")
text: str = Field(..., description="Text")
logprobs: Optional[LogProbs] = Field(None, description="Logprobs")
finish_reason: Optional[Literal["stop", "length"]] = Field(
None, description="The reason the model stopped generating tokens."
)


class CompletionResponse(BaseModel):
"""Completion response entity."""

id: str = Field(default_factory=lambda: f"cmpl-{str(uuid.uuid1())}")
object: str = Field(
"text_completion",
description="The object type, which is always 'text_completion'",
)
created: int = Field(
default_factory=lambda: int(time.time()), description="Created time"
)
model: str = Field(..., description="Model name")
choices: List[CompletionResponseChoice] = Field(
...,
description="The list of completion choices the model generated for the input "
"prompt.",
)
usage: UsageInfo = Field(..., description="Usage info")


class CompletionResponseStreamChoice(BaseModel):
"""Completion response choice entity."""

index: int = Field(..., description="Choice index")
text: str = Field(..., description="Text")
logprobs: Optional[LogProbs] = Field(None, description="Logprobs")
finish_reason: Optional[Literal["stop", "length"]] = Field(
None, description="The reason the model stopped generating tokens."
)


class CompletionStreamResponse(BaseModel):
"""Completion stream response entity."""

id: str = Field(
default_factory=lambda: f"cmpl-{str(uuid.uuid1())}", description="Stream ID"
)
object: str = Field("text_completion", description="Object type")
created: int = Field(
default_factory=lambda: int(time.time()), description="Created time"
)
model: str = Field(..., description="Model name")
choices: List[CompletionResponseStreamChoice] = Field(
..., description="Completion response choices"
)
usage: UsageInfo = Field(..., description="Usage info")
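
For reference, a minimal usage sketch (not part of this commit) of the new completion schemas, assuming they are importable from dbgpt.core.schema.api as defined above; the model name, prompt, and token counts are placeholders for illustration only:

from dbgpt.core.schema.api import (
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    UsageInfo,
)

# Build a non-streaming completion request; "my-llama-model" is a placeholder name.
request = CompletionRequest(
    model="my-llama-model",
    prompt="The quick brown fox",
    max_tokens=32,
    stream=False,
)

# A response a worker could return; `id` and `created` come from the default factories.
response = CompletionResponse(
    model=request.model,
    choices=[
        CompletionResponseChoice(
            index=0,
            text=" jumps over the lazy dog.",
            finish_reason="stop",
        )
    ],
    usage=UsageInfo(prompt_tokens=4, completion_tokens=6, total_tokens=10),
)
print(response.choices[0].text, response.usage.total_tokens)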
16 changes: 16 additions & 0 deletions dbgpt/model/adapter/base.py
@@ -145,6 +145,14 @@ def support_async(self) -> bool:
"""Whether the loaded model supports asynchronous calls"""
return False

def support_generate_function(self) -> bool:
"""Whether support generate function, if it is False, we will use
generate_stream function.
Sometimes, we need to use generate function to get the result of the model.
"""
return False

def get_generate_stream_function(self, model, model_path: str):
"""Get the generate stream function of the model"""
raise NotImplementedError
@@ -153,6 +161,14 @@ def get_async_generate_stream_function(self, model, model_path: str):
"""Get the asynchronous generate stream function of the model"""
raise NotImplementedError

def get_generate_function(self, model, model_path: str):
"""Get the generate function of the model"""
raise NotImplementedError

def get_async_generate_function(self, model, model_path: str):
"""Get the asynchronous generate function of the model"""
raise NotImplementedError

def get_default_conv_template(
self, model_name: str, model_path: str
) -> Optional[ConversationAdapter]:
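
A minimal sketch (not part of this commit) of how an adapter subclass might opt in to the new non-streaming hooks; the base class name LLMModelAdapter and the create_completion* methods on the loaded model object are assumptions for illustration, not APIs confirmed by this diff:

from dbgpt.model.adapter.base import LLMModelAdapter  # assumed base class name


class ExampleServerAdapter(LLMModelAdapter):
    """Hypothetical adapter supporting both streaming and non-streaming generation."""

    def support_generate_function(self) -> bool:
        # Opt in: callers may use get_generate_function for full, one-shot results.
        return True

    def get_generate_function(self, model, model_path: str):
        def generate(params):
            # Hypothetical call: run one full completion and return the final output.
            return model.create_completion(params)

        return generate

    def get_generate_stream_function(self, model, model_path: str):
        def generate_stream(params):
            # Hypothetical call: yield partial outputs as they are produced.
            yield from model.create_completion_stream(params)

        return generate_stream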