diff --git a/genai-perf/genai_perf/inputs/input_constants.py b/genai-perf/genai_perf/inputs/input_constants.py index 02b1b99a..6fc771cb 100644 --- a/genai-perf/genai_perf/inputs/input_constants.py +++ b/genai-perf/genai_perf/inputs/input_constants.py @@ -49,6 +49,7 @@ class OutputFormat(Enum): TENSORRTLLM = auto() VLLM = auto() TENSORRTLLM_ENGINE = auto() + TENSORRTLLM_BACKEND = auto() def to_lowercase(self): return self.name.lower() diff --git a/genai-perf/genai_perf/inputs/output_format_converter_factory.py b/genai-perf/genai_perf/inputs/output_format_converter_factory.py index 694fd49b..b5745332 100644 --- a/genai-perf/genai_perf/inputs/output_format_converter_factory.py +++ b/genai-perf/genai_perf/inputs/output_format_converter_factory.py @@ -46,6 +46,7 @@ def create(output_format: OutputFormat): OutputFormat.RANKINGS: RankingsConverter, OutputFormat.VLLM: VLLMConverter, OutputFormat.TENSORRTLLM: TensorRTLLMConverter, + OutputFormat.TENSORRTLLM_BACKEND: TensorRTLLMEngineConverter, OutputFormat.TENSORRTLLM_ENGINE: TensorRTLLMEngineConverter, } if output_format not in converters: diff --git a/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py index 39d05372..2703a219 100755 --- a/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py +++ b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py @@ -229,10 +229,13 @@ def _preprocess_response( def _get_input_token_count(self, req_inputs: dict) -> int: """Deserialize the request input and return tokenized inputs.""" - if self._service_kind == "triton": - input_text = req_inputs["text_input"] - elif self._service_kind == "triton_c_api": + if ( + self._service_kind == "triton_c_api" + or self._response_format == ResponseFormat.TENSORRTLLM_BACKEND + ): return len(req_inputs["input_ids"]) # no tokenizer required + elif self._service_kind == "triton": + input_text = req_inputs["text_input"] elif 
self._service_kind == "openai": input_text = self._get_openai_input_text(req_inputs) else: @@ -259,11 +262,14 @@ def _get_output_token_counts( self, res_outputs: List[Dict] ) -> Tuple[List[int], int]: """Return response-level token counts and total token count.""" - if self._service_kind == "triton": - output_texts = self._get_triton_output_tokens(res_outputs) - elif self._service_kind == "triton_c_api": + if ( + self._service_kind == "triton_c_api" + or self._response_format == ResponseFormat.TENSORRTLLM_BACKEND + ): # No tokenizer is needed to get the token counts. return self._get_tensorrtllm_engine_token_counts(res_outputs) + elif self._service_kind == "triton": + output_texts = self._get_triton_output_tokens(res_outputs) elif self._service_kind == "openai": output_texts = self._get_openai_output_tokens(res_outputs) else: diff --git a/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py index e798452b..32c8d66f 100755 --- a/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py +++ b/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py @@ -44,6 +44,7 @@ class ResponseFormat(Enum): RANKINGS = auto() IMAGE_RETRIEVAL = auto() TRITON = auto() + TENSORRTLLM_BACKEND = auto() class ProfileDataParser: @@ -109,6 +110,8 @@ def _get_profile_metadata(self, data: dict) -> None: elif self._service_kind == "triton": self._response_format = ResponseFormat.TRITON + if "input_ids" in data["experiments"][0]["requests"][0]["request_inputs"]: + self._response_format = ResponseFormat.TENSORRTLLM_BACKEND elif self._service_kind == "triton_c_api": pass # ignore else: