TensorRT-LLM in-process benchmarking support (#35)
* Add tensorrtllm_engine option to service-kind and update testing (#700) (#762)

* Add tensorrtllm_engine option to service-kind and update testing

* Add output format check for tensorrtllm_engine

Co-authored-by: Elias Bermudez <[email protected]>

* Support input payload generation for tensorrtllm engine (#767)

* Add functionality for async requests and output retrieval with Triton C API (#25)

* Support 1-d array data in profile exporter (#28)

* support array of data in profile exporter

* add some tests

* run formatting

* fix pre-commit

* remove duplicate argparser arguments

* Fix Triton C API mode missing infer requested output datatype bug

---------

Co-authored-by: Matthew Kotila <[email protected]>

* Support profile data parsing for tensorrtllm engine service kind (#33)

* support parsing tensorrtllm engine profile response

* add test

* refactor the test

* update types and names

* fix pre-commit

* run PA with triton c api

* more clean up on the tests

* fix codeql

* address feedback

* Add functionality to continue benchmarking in Triton C API mode if server logging support is disabled (#34)

---------

Co-authored-by: Hyunjae Woo <[email protected]>
Co-authored-by: Elias Bermudez <[email protected]>
3 people authored Aug 9, 2024
1 parent eda567e commit e1455e0
Showing 29 changed files with 1,335 additions and 163 deletions.
97 changes: 97 additions & 0 deletions genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -53,6 +53,7 @@ class OutputFormat(Enum):
RANKINGS = auto()
TENSORRTLLM = auto()
VLLM = auto()
TENSORRTLLM_ENGINE = auto()

def to_lowercase(self):
return self.name.lower()
@@ -216,6 +217,7 @@ def create_llm_inputs(

json_in_pa_format = cls._convert_generic_json_to_output_format(
output_format,
tokenizer,
generic_dataset_json,
add_model_name,
add_stream,
@@ -688,6 +690,7 @@ def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
def _convert_generic_json_to_output_format(
cls,
output_format: OutputFormat,
tokenizer: Tokenizer,
generic_dataset: Dict,
add_model_name: bool,
add_stream: bool,
@@ -763,6 +766,16 @@ def _convert_generic_json_to_output_format(
model_name,
model_selection_strategy,
)
elif output_format == OutputFormat.TENSORRTLLM_ENGINE:
output_json = cls._convert_generic_json_to_trtllm_engine_format(
generic_dataset,
tokenizer,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
)
else:
raise GenAIPerfException(
f"Output format {output_format} is not currently supported"
@@ -1010,6 +1023,28 @@ def _convert_generic_json_to_trtllm_format(

return pa_json

@classmethod
def _convert_generic_json_to_trtllm_engine_format(
cls,
dataset_json: Dict,
tokenizer: Tokenizer,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
) -> Dict:
pa_json = cls._populate_trtllm_engine_output_json(
dataset_json,
tokenizer,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
)
return pa_json

@classmethod
def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None:
filename = output_dir / DEFAULT_INPUT_DATA_JSON
@@ -1261,6 +1296,43 @@ def _populate_trtllm_output_json(

return pa_json

@classmethod
def _populate_trtllm_engine_output_json(
cls,
dataset_json: Dict,
tokenizer: Tokenizer,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
) -> Dict:
pa_json = cls._create_empty_trtllm_pa_json()

for index, entry in enumerate(dataset_json["rows"]):
token_ids = tokenizer.encode(entry["text_input"])
pa_json["data"].append(
{
"input_ids": {
"content": token_ids,
"shape": [len(token_ids)],
},
"input_lengths": [len(token_ids)],
"request_output_len": [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS],
}
)

pa_json = cls._add_optional_tags_to_trtllm_engine_json(
pa_json,
index,
add_stream,
extra_inputs,
output_tokens_mean,
output_tokens_stddev,
output_tokens_deterministic,
)
return pa_json

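For illustration, a minimal sketch of the payload row this helper builds for one prompt. The token ids and the 256-token default are assumptions made for the example (DEFAULT_TENSORRTLLM_MAX_TOKENS is defined elsewhere in this class), not values taken from this diff:

token_ids = [1, 15043, 3186]  # e.g. tokenizer.encode("Hello world"); made-up ids
row = {
    "input_ids": {"content": token_ids, "shape": [len(token_ids)]},
    "input_lengths": [len(token_ids)],
    "request_output_len": [256],  # assumed value of DEFAULT_TENSORRTLLM_MAX_TOKENS
}
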
@classmethod
def _create_empty_openai_pa_json(cls) -> Dict:
empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT)
@@ -1477,6 +1549,31 @@ def _add_optional_tags_to_trtllm_json(

return pa_json

@classmethod
def _add_optional_tags_to_trtllm_engine_json(
cls,
pa_json: Dict,
index: int,
add_stream: bool,
extra_inputs: Dict,
output_tokens_mean: int,
output_tokens_stddev: int,
output_tokens_deterministic: bool,
) -> Dict:
row = pa_json["data"][index]
if add_stream:
row["streaming"] = [True]
if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN:
num_tokens = int(random.gauss(output_tokens_mean, output_tokens_stddev))
row["request_output_len"] = [num_tokens]
if output_tokens_deterministic:
row["min_length"] = [num_tokens]

for key, value in extra_inputs.items():
row[key] = [value]

return pa_json

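Continuing the hypothetical row from the sketch above: with --streaming, --output-tokens-mean 100, and --output-tokens-mean-deterministic, the optional tags would rewrite it roughly as follows (the sampled length is made up, since it comes from random.gauss):

row = {
    "input_ids": {"content": [1, 15043, 3186], "shape": [3]},
    "input_lengths": [3],
    "request_output_len": [256],
}
row["streaming"] = [True]          # from --streaming
num_tokens = 97                    # int(random.gauss(100, stddev)); varies per request
row["request_output_len"] = [num_tokens]
row["min_length"] = [num_tokens]   # pins the engine to the sampled length when deterministic output is requested
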
@classmethod
def _add_required_tags_to_trtllm_json(
cls,
2 changes: 1 addition & 1 deletion genai-perf/genai_perf/metrics/llm_metrics.py
@@ -54,7 +54,7 @@ def __init__(
time_to_first_tokens: List[int] = [],
inter_token_latencies: List[int] = [],
output_token_throughputs: List[float] = [],
- output_token_throughputs_per_request: List[int] = [],
+ output_token_throughputs_per_request: List[float] = [],
output_sequence_lengths: List[int] = [],
input_sequence_lengths: List[int] = [],
chunked_inter_token_latencies: List[List[int]] = [[]],
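Per-request output token throughput is a ratio (output tokens divided by request latency), which is why the annotation changes from int to float; a small, purely hypothetical example:

output_tokens = 128        # hypothetical output sequence length
request_latency_s = 0.75   # hypothetical end-to-end request latency in seconds
output_token_throughputs_per_request = [output_tokens / request_latency_s]  # ~170.7, a float
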
17 changes: 11 additions & 6 deletions genai-perf/genai_perf/parser.py
@@ -176,6 +176,9 @@ def _check_conditional_args(
args = _convert_str_to_enum_entry(args, "backend", OutputFormat)
args.output_format = args.backend

if args.service_kind == "tensorrtllm_engine":
args.output_format = OutputFormat.TENSORRTLLM_ENGINE

# Output token distribution checks
if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN:
if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV:
@@ -187,10 +190,11 @@
"The --output-tokens-mean option is required when using --output-tokens-mean-deterministic."
)

- if args.service_kind != "triton":
+ if args.service_kind not in ["triton", "tensorrtllm_engine"]:
if args.output_tokens_mean_deterministic:
parser.error(
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind."
"The --output-tokens-mean-deterministic option is only supported "
"with the Triton and TensorRT-LLM Engine service-kind."
)

_check_conditional_args_embeddings_rankings(parser, args)
@@ -267,6 +271,8 @@ def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
name += [f"{args.service_kind}-{args.endpoint_type}"]
elif args.service_kind == "triton":
name += [f"{args.service_kind}-{args.backend.to_lowercase()}"]
elif args.service_kind == "tensorrtllm_engine":
name += [f"{args.service_kind}"]
else:
raise ValueError(f"Unknown service kind '{args.service_kind}'.")

@@ -578,7 +584,7 @@ def _add_endpoint_args(parser):
endpoint_group.add_argument(
"--service-kind",
type=str,
choices=["triton", "openai"],
choices=["triton", "openai", "tensorrtllm_engine"],
default="triton",
required=False,
help="The kind of service perf_analyzer will "
@@ -625,9 +631,8 @@ def _add_output_args(parser):
default=Path("profile_export.json"),
help="The path where the perf_analyzer profile export will be "
"generated. By default, the profile export will be to profile_export.json. "
"The genai-perf files will be exported to <profile_export_file>_genai_perf.json and "
"<profile_export_file>_genai_perf.csv. "
"For example, if the profile export file is profile_export.json, the genai-perf CSV file will be "
"The genai-perf file will be exported to <profile_export_file>_genai_perf.csv. "
"For example, if the profile export file is profile_export.json, the genai-perf file will be "
"exported to profile_export_genai_perf.csv.",
)

@@ -224,6 +224,8 @@ def _get_input_token_count(self, req_inputs: dict) -> int:
"""Deserialize the request input and return tokenized inputs."""
if self._service_kind == "triton":
input_text = req_inputs["text_input"]
elif self._service_kind == "triton_c_api":
return len(req_inputs["input_ids"]) # no tokenizer required
elif self._service_kind == "openai":
input_text = self._get_openai_input_text(req_inputs)
else:
@@ -252,6 +254,9 @@ def _get_output_token_counts(
"""Return response-level token counts and total token count."""
if self._service_kind == "triton":
output_texts = self._get_triton_output_tokens(res_outputs)
elif self._service_kind == "triton_c_api":
# No tokenizer is needed to get the token counts.
return self._get_tensorrtllm_engine_token_counts(res_outputs)
elif self._service_kind == "openai":
output_texts = self._get_openai_output_tokens(res_outputs)
else:
@@ -263,6 +268,17 @@
output_token_counts = list(map(len, output_tokens))
return output_token_counts, full_text_token_count

def _get_tensorrtllm_engine_token_counts(
self, res_outputs: List[Dict]
) -> Tuple[List[int], int]:
token_ids = []
for r in res_outputs:
if isinstance(r["output_ids"], list):
token_ids += r["output_ids"]
else:
token_ids.append(r["output_ids"])
return token_ids, len(token_ids)

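A hedged illustration of what the helper above does with streamed engine responses; the response dicts are made up, but output_ids can arrive either as a single id or as a list of ids:

res_outputs = [{"output_ids": [5, 9, 12]}, {"output_ids": 42}, {"output_ids": [7]}]
token_ids = []
for r in res_outputs:
    ids = r["output_ids"]
    token_ids += ids if isinstance(ids, list) else [ids]
print(token_ids, len(token_ids))  # [5, 9, 12, 42, 7] 5 -- no detokenization step needed
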
def _get_triton_output_tokens(self, res_outputs: List[Dict]) -> List[str]:
"""Return a list of Triton response texts."""
return [r["text_output"] for r in res_outputs]
@@ -98,6 +98,8 @@ def _get_profile_metadata(self, data: dict) -> None:

elif self._service_kind == "triton":
self._response_format = ResponseFormat.TRITON
elif self._service_kind == "triton_c_api":
pass # ignore
else:
raise ValueError(f"Unknown service kind: {self._service_kind}")

10 changes: 7 additions & 3 deletions genai-perf/genai_perf/wrapper.py
@@ -110,6 +110,9 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
f"--input-data",
f"{args.artifact_dir / DEFAULT_INPUT_DATA_JSON}",
]
+ cmd += Profiler.add_protocol_args(args)
+ cmd += Profiler.add_inference_load_args(args)

for arg, value in vars(args).items():
if arg in skip_args:
pass
@@ -122,16 +125,17 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
cmd += [f"-{arg}"]
else:
cmd += [f"--{arg}"]
+ # GAP needs to call PA using triton_c_api service kind when running
+ # against tensorrtllm engine.
+ elif arg == "service_kind" and value == "tensorrtllm_engine":
+     cmd += ["--service-kind", "triton_c_api", "--streaming"]
else:
if len(arg) == 1:
cmd += [f"-{arg}", f"{value}"]
else:
arg = utils.convert_option_name(arg)
cmd += [f"--{arg}", f"{value}"]

- cmd += Profiler.add_protocol_args(args)
- cmd += Profiler.add_inference_load_args(args)

if extra_args is not None:
for arg in extra_args:
cmd += [f"{arg}"]
7 changes: 6 additions & 1 deletion genai-perf/tests/test_cli.py
@@ -203,6 +203,10 @@ def test_help_version_arguments_output_and_exit(
(["--request-rate", "9.0"], {"request_rate": 9.0}),
(["-s", "99.5"], {"stability_percentage": 99.5}),
(["--service-kind", "triton"], {"service_kind": "triton"}),
(
["--service-kind", "tensorrtllm_engine"],
{"service_kind": "tensorrtllm_engine"},
),
(
["--service-kind", "openai", "--endpoint-type", "chat"],
{"service_kind": "openai", "endpoint": "v1/chat/completions"},
@@ -530,7 +534,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys):
"100",
"--output-tokens-mean-deterministic",
],
"The --output-tokens-mean-deterministic option is only supported with the Triton service-kind",
"The --output-tokens-mean-deterministic option is only supported with the Triton and TensorRT-LLM Engine service-kind",
),
(
[
@@ -642,6 +646,7 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys):
OutputFormat.TENSORRTLLM,
),
(["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM),
(["--service-kind", "tensorrtllm_engine"], OutputFormat.TENSORRTLLM_ENGINE),
],
)
def test_inferred_output_format(self, monkeypatch, args, expected_format):