Move parameter ignore_eos to benchmark script (#184)
* move ignore_eos to benchmark script

* fix
KepingYan authored Apr 12, 2024
1 parent 36e8e2a commit dc2a1b4
Showing 5 changed files with 23 additions and 3 deletions.
19 changes: 19 additions & 0 deletions benchmarks/benchmark_serving.py
@@ -239,6 +239,7 @@ async def send_request(
track_input_output: bool = False,
progress_bar: tqdm = None,
simple: bool = False,
vllm_engine: bool = False,
) -> None:
"""
Sends a request to the specified API URL with the given prompt and configuration.
@@ -278,6 +279,8 @@ async def send_request(
"temperature": temp_config["temperature"] if "temperature" in temp_config else None,
"top_p": temp_config["top_p"] if "top_p" in temp_config else None,
}
if vllm_engine:
pload.update({"ignore_eos": True})

token_latencies_per_request: List[float] = []

@@ -322,6 +325,11 @@ async def send_request(
generate_len = json.loads(response_text)["usage"]["completion_tokens"]
except Exception:
generate_len = None
expected_output_len = temp_config["max_new_tokens"]
if vllm_engine and generate_len != expected_output_len:
print(
f"Warning: the actual generated length is {generate_len}, which is different from the expected output length({expected_output_len})."
)
if progress_bar:
progress_bar.update()
break
@@ -356,6 +364,7 @@ async def benchmark(
track_input_output: bool = False,
progress: bool = False,
simple: bool = False,
vllm_engine: bool = False,
) -> None:
"""
Benchmark the API by sending multiple requests asynchronously.
@@ -386,6 +395,7 @@ async def benchmark(
track_input_output,
progress_bar,
simple,
vllm_engine,
)
)
)
@@ -460,6 +470,9 @@ def main(args: argparse.Namespace):
config["top_p"] = float(args.top_p)
if args.top_k:
config["top_k"] = float(args.top_k)
# In order to align with vllm test parameters
if args.vllm_engine:
config["ignore_eos"] = True

benchmark_start_time = time.perf_counter()
asyncio.run(
@@ -474,6 +487,7 @@ def main(args: argparse.Namespace):
args.track_input_output,
args.progress,
args.simple,
args.vllm_engine,
)
)

@@ -713,6 +727,11 @@ def main(args: argparse.Namespace):
help="The number of highest probability vocabulary tokens to keep \
for top-k-filtering.",
)
parser.add_argument(
"--vllm-engine",
action="store_true",
help="If set, parameter ignore_eos will be True to generate the completion.",
)
parser.add_argument(
"--progress", action="store_true", help="Whether to display a progress bar."
)
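For orientation, the benchmark-side change boils down to two things: when `--vllm-engine` is set, the request payload carries `"ignore_eos": True`, and the script warns if the completion length does not match `max_new_tokens`. The sketch below condenses that logic; the helper names and any payload fields not visible in the diff are assumptions, not the script's exact code.

```python
# Condensed sketch of the new benchmark behavior -- helper names and payload
# fields beyond "temperature", "top_p", and "ignore_eos" are assumptions.
from typing import Optional


def build_payload(temp_config: dict, vllm_engine: bool) -> dict:
    pload = {
        "temperature": temp_config.get("temperature"),
        "top_p": temp_config.get("top_p"),
    }
    if vllm_engine:
        # Align with vLLM's own benchmarks: keep generating past EOS so every
        # request produces a fixed-length completion.
        pload.update({"ignore_eos": True})
    return pload


def check_output_length(
    generate_len: Optional[int], temp_config: dict, vllm_engine: bool
) -> None:
    expected_output_len = temp_config["max_new_tokens"]
    if vllm_engine and generate_len != expected_output_len:
        print(
            f"Warning: the actual generated length is {generate_len}, which is "
            f"different from the expected output length ({expected_output_len})."
        )
```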
2 changes: 1 addition & 1 deletion docs/benchmark.md
@@ -117,7 +117,7 @@ OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --confi
```
4. Send requests
```cmd
numactl -N 1 -m 1 python benchmarks/benchmark_serving.py --model-endpoint-base http://127.0.0.1:8000 --model-name llama-2-7b-chat-hf --dataset ./dataset/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --dataset-format ShareGPT --simple
numactl -N 1 -m 1 python benchmarks/benchmark_serving.py --model-endpoint-base http://127.0.0.1:8000 --model-name llama-2-7b-chat-hf --dataset ./dataset/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --dataset-format ShareGPT --vllm-engine --simple
```
5. Results
```cmd
1 change: 1 addition & 0 deletions llm_on_ray/inference/api_openai_backend/openai_protocol.py
@@ -413,6 +413,7 @@ class ChatCompletionRequest(BaseModel):
user: Optional[str] = None
tools: Optional[List[Tool]] = None
tool_choice: Union[Literal["auto", "none"], ToolChoice] = "auto"
ignore_eos: bool = False # used in vllm engine benchmark


class FinishReason(str, Enum):
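Because `ChatCompletionRequest` now carries `ignore_eos` with a default of `False`, a benchmark client can opt in per request without affecting normal chat traffic. A hypothetical request body is sketched below; every field except `ignore_eos` follows the usual OpenAI-compatible shape and is not taken from this diff.

```python
# Hypothetical chat-completions request body; only "ignore_eos" is the field
# this commit adds (it defaults to False when omitted).
request_body = {
    "model": "llama-2-7b-chat-hf",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 128,
    "ignore_eos": True,  # benchmark-only knob; regular clients can omit it
}
```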
2 changes: 2 additions & 0 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -56,6 +56,8 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
if temperature != 1.0 or top_p != 1.0:
gen_config.update({"do_sample": True})
if request_config.get("ignore_eos", False):
gen_config.update({"ignore_eos": True})

async for x in handle_request(
model=model,
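The query client only adds the flag to `gen_config` when the incoming request explicitly set it, so ordinary requests keep terminating at EOS. A condensed, self-contained sketch of that conditional follows; the `request_config` lookups and default values are illustrative assumptions.

```python
# Condensed sketch of the generation-config assembly around the new flag.
def build_gen_config(request_config: dict) -> dict:
    max_new_tokens = request_config.get("max_tokens", 256)  # assumed key/default
    temperature = request_config.get("temperature", 1.0)
    top_p = request_config.get("top_p", 1.0)

    gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
    if temperature != 1.0 or top_p != 1.0:
        gen_config.update({"do_sample": True})
    if request_config.get("ignore_eos", False):
        # Forward the benchmark-only flag; absent or False leaves behavior unchanged.
        gen_config.update({"ignore_eos": True})
    return gen_config
```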
2 changes: 0 additions & 2 deletions llm_on_ray/inference/vllm_predictor.py
@@ -66,8 +66,6 @@ async def _get_generator_output(self, results_generator):

async def generate_async(self, prompts: Union[str, List[str]], **config) -> GenerateResult:
config = self.update_vllm_config(**config)
# In order to align with vllm test parameters
config["ignore_eos"] = True
sampling_params = SamplingParams(**config)
if isinstance(prompts, str):
request_id = random_uuid()
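With the hard-coded override removed, the predictor no longer forces `ignore_eos=True` for every request; `SamplingParams` receives whatever the per-request config supplies. A minimal sketch of the resulting call (vLLM's `SamplingParams` does accept `ignore_eos`; the other config values here are illustrative):

```python
from vllm import SamplingParams

# Previously the predictor forced ignore_eos=True for all requests; now the
# flag comes from the per-request config and defaults to stopping at EOS.
config = {"max_tokens": 128, "temperature": 1.0, "top_p": 1.0}
config.setdefault("ignore_eos", False)  # benchmark requests pass True explicitly
sampling_params = SamplingParams(**config)
```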
