Move parameter ignore_eos to benchmark script (#184)
* move ignore_eos to benchmark script

* fix
KepingYan authored Apr 12, 2024
1 parent 36e8e2a commit dc2a1b4
Showing 5 changed files with 23 additions and 3 deletions.
19 changes: 19 additions & 0 deletions benchmarks/benchmark_serving.py
@@ -239,6 +239,7 @@ async def send_request(
track_input_output: bool = False,
progress_bar: tqdm = None,
simple: bool = False,
vllm_engine: bool = False,
) -> None:
"""
Sends a request to the specified API URL with the given prompt and configuration.
@@ -278,6 +279,8 @@ async def send_request(
"temperature": temp_config["temperature"] if "temperature" in temp_config else None,
"top_p": temp_config["top_p"] if "top_p" in temp_config else None,
}
if vllm_engine:
pload.update({"ignore_eos": True})

token_latencies_per_request: List[float] = []

@@ -322,6 +325,11 @@ async def send_request(
generate_len = json.loads(response_text)["usage"]["completion_tokens"]
except Exception:
generate_len = None
expected_output_len = temp_config["max_new_tokens"]
if vllm_engine and generate_len != expected_output_len:
print(
f"Warning: the actual generated length is {generate_len}, which is different from the expected output length({expected_output_len})."
)
if progress_bar:
progress_bar.update()
break
@@ -356,6 +364,7 @@ async def benchmark(
track_input_output: bool = False,
progress: bool = False,
simple: bool = False,
vllm_engine: bool = False,
) -> None:
"""
Benchmark the API by sending multiple requests asynchronously.
@@ -386,6 +395,7 @@ async def benchmark(
track_input_output,
progress_bar,
simple,
vllm_engine,
)
)
)
@@ -460,6 +470,9 @@ def main(args: argparse.Namespace):
config["top_p"] = float(args.top_p)
if args.top_k:
config["top_k"] = float(args.top_k)
# In order to align with vllm test parameters
if args.vllm_engine:
config["ignore_eos"] = True

benchmark_start_time = time.perf_counter()
asyncio.run(
@@ -474,6 +487,7 @@ def main(args: argparse.Namespace):
args.track_input_output,
args.progress,
args.simple,
args.vllm_engine,
)
)

@@ -713,6 +727,11 @@ def main(args: argparse.Namespace):
help="The number of highest probability vocabulary tokens to keep \
for top-k-filtering.",
)
parser.add_argument(
"--vllm-engine",
action="store_true",
help="If set, parameter ignore_eos will be True to generate the completion.",
)
parser.add_argument(
"--progress", action="store_true", help="Whether to display a progress bar."
)
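For orientation, the benchmark-side change boils down to two things: when `--vllm-engine` is set, the request payload carries `"ignore_eos": True`, and the script warns if the completion length does not match `max_new_tokens`. The sketch below condenses that logic; the helper names and any payload fields not visible in the diff are assumptions, not the script's exact code.

```python
# Condensed sketch of the new benchmark behavior -- helper names and payload
# fields beyond "temperature", "top_p", and "ignore_eos" are assumptions.
from typing import Optional


def build_payload(temp_config: dict, vllm_engine: bool) -> dict:
    pload = {
        "temperature": temp_config.get("temperature"),
        "top_p": temp_config.get("top_p"),
    }
    if vllm_engine:
        # Align with vLLM's own benchmarks: keep generating past EOS so every
        # request produces a fixed-length completion.
        pload.update({"ignore_eos": True})
    return pload


def check_output_length(
    generate_len: Optional[int], temp_config: dict, vllm_engine: bool
) -> None:
    expected_output_len = temp_config["max_new_tokens"]
    if vllm_engine and generate_len != expected_output_len:
        print(
            f"Warning: the actual generated length is {generate_len}, which is "
            f"different from the expected output length ({expected_output_len})."
        )
```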
2 changes: 1 addition & 1 deletion docs/benchmark.md
@@ -117,7 +117,7 @@ OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --confi
```
4. Send requests
```cmd
numactl -N 1 -m 1 python benchmarks/benchmark_serving.py --model-endpoint-base http://127.0.0.1:8000 --model-name llama-2-7b-chat-hf --dataset ./dataset/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --dataset-format ShareGPT --simple
numactl -N 1 -m 1 python benchmarks/benchmark_serving.py --model-endpoint-base http://127.0.0.1:8000 --model-name llama-2-7b-chat-hf --dataset ./dataset/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --dataset-format ShareGPT --vllm-engine --simple
```
5. Results
```cmd
1 change: 1 addition & 0 deletions llm_on_ray/inference/api_openai_backend/openai_protocol.py
@@ -413,6 +413,7 @@ class ChatCompletionRequest(BaseModel):
user: Optional[str] = None
tools: Optional[List[Tool]] = None
tool_choice: Union[Literal["auto", "none"], ToolChoice] = "auto"
ignore_eos: bool = False # used in vllm engine benchmark


class FinishReason(str, Enum):
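Because `ChatCompletionRequest` now carries `ignore_eos` with a default of `False`, a benchmark client can opt in per request without affecting normal chat traffic. A hypothetical request body is sketched below; every field except `ignore_eos` follows the usual OpenAI-compatible shape and is not taken from this diff.

```python
# Hypothetical chat-completions request body; only "ignore_eos" is the field
# this commit adds (it defaults to False when omitted).
request_body = {
    "model": "llama-2-7b-chat-hf",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 128,
    "ignore_eos": True,  # benchmark-only knob; regular clients can omit it
}
```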
2 changes: 2 additions & 0 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -56,6 +56,8 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
if temperature != 1.0 or top_p != 1.0:
gen_config.update({"do_sample": True})
if request_config.get("ignore_eos", False):
gen_config.update({"ignore_eos": True})

async for x in handle_request(
model=model,
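The query client only adds the flag to `gen_config` when the incoming request explicitly set it, so ordinary requests keep terminating at EOS. A condensed, self-contained sketch of that conditional follows; the `request_config` lookups and default values are illustrative assumptions.

```python
# Condensed sketch of the generation-config assembly around the new flag.
def build_gen_config(request_config: dict) -> dict:
    max_new_tokens = request_config.get("max_tokens", 256)  # assumed key/default
    temperature = request_config.get("temperature", 1.0)
    top_p = request_config.get("top_p", 1.0)

    gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
    if temperature != 1.0 or top_p != 1.0:
        gen_config.update({"do_sample": True})
    if request_config.get("ignore_eos", False):
        # Forward the benchmark-only flag; absent or False leaves behavior unchanged.
        gen_config.update({"ignore_eos": True})
    return gen_config
```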
2 changes: 0 additions & 2 deletions llm_on_ray/inference/vllm_predictor.py
@@ -66,8 +66,6 @@ async def _get_generator_output(self, results_generator):

async def generate_async(self, prompts: Union[str, List[str]], **config) -> GenerateResult:
config = self.update_vllm_config(**config)
# In order to align with vllm test parameters
config["ignore_eos"] = True
sampling_params = SamplingParams(**config)
if isinstance(prompts, str):
request_id = random_uuid()
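With the hard-coded override removed, the predictor no longer forces `ignore_eos=True` for every request; `SamplingParams` receives whatever the per-request config supplies. A minimal sketch of the resulting call (vLLM's `SamplingParams` does accept `ignore_eos`; the other config values here are illustrative):

```python
from vllm import SamplingParams

# Previously the predictor forced ignore_eos=True for all requests; now the
# flag comes from the per-request config and defaults to stopping at EOS.
config = {"max_tokens": 128, "temperature": 1.0, "top_p": 1.0}
config.setdefault("ignore_eos", False)  # benchmark requests pass True explicitly
sampling_params = SamplingParams(**config)
```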
