triton-inference-server · IzzyPutterman · Sep 4, 2024 · matthewkotila · Sep 23, 2024 · IzzyPutterman
diff --git a/genai-perf/genai_perf/inputs/input_constants.py b/genai-perf/genai_perf/inputs/input_constants.py
@@ -49,6 +49,7 @@ class OutputFormat(Enum):
     TENSORRTLLM = auto()
     VLLM = auto()
     TENSORRTLLM_ENGINE = auto()
+    TENSORRTLLM_BACKEND = auto()
 
     def to_lowercase(self):
         return self.name.lower()

diff --git a/genai-perf/genai_perf/inputs/output_format_converter_factory.py b/genai-perf/genai_perf/inputs/output_format_converter_factory.py
@@ -46,6 +46,7 @@ def create(output_format: OutputFormat):
             OutputFormat.RANKINGS: RankingsConverter,
             OutputFormat.VLLM: VLLMConverter,
             OutputFormat.TENSORRTLLM: TensorRTLLMConverter,
+            OutputFormat.TENSORRTLLM_BACKEND: TensorRTLLMEngineConverter,
             OutputFormat.TENSORRTLLM_ENGINE: TensorRTLLMEngineConverter,
         }
         if output_format not in converters:

diff --git a/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
@@ -229,10 +229,13 @@ def _preprocess_response(
 
     def _get_input_token_count(self, req_inputs: dict) -> int:
         """Deserialize the request input and return tokenized inputs."""
-        if self._service_kind == "triton":
-            input_text = req_inputs["text_input"]
-        elif self._service_kind == "triton_c_api":
+        if (
+            self._service_kind == "triton_c_api"
+            or self._response_format == ResponseFormat.TENSORRTLLM_BACKEND
+        ):
             return len(req_inputs["input_ids"])  # no tokenizer required
+        elif self._service_kind == "triton":
+            input_text = req_inputs["text_input"]
         elif self._service_kind == "openai":
             input_text = self._get_openai_input_text(req_inputs)
         else:
@@ -259,11 +262,14 @@ def _get_output_token_counts(
         self, res_outputs: List[Dict]
     ) -> Tuple[List[int], int]:
         """Return response-level token counts and total token count."""
-        if self._service_kind == "triton":
-            output_texts = self._get_triton_output_tokens(res_outputs)
-        elif self._service_kind == "triton_c_api":
+        if (
+            self._service_kind == "triton_c_api"
+            or self._response_format == ResponseFormat.TENSORRTLLM_BACKEND
+        ):
             # No tokenizer is need to get the token counts.
             return self._get_tensorrtllm_engine_token_counts(res_outputs)
+        elif self._service_kind == "triton":
+            output_texts = self._get_triton_output_tokens(res_outputs)
         elif self._service_kind == "openai":
             output_texts = self._get_openai_output_tokens(res_outputs)
         else:

diff --git a/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py
@@ -44,6 +44,7 @@ class ResponseFormat(Enum):
     RANKINGS = auto()
     IMAGE_RETRIEVAL = auto()
     TRITON = auto()
+    TENSORRTLLM_BACKEND = auto()
 
 
 class ProfileDataParser:
@@ -109,6 +110,8 @@ def _get_profile_metadata(self, data: dict) -> None:
 
         elif self._service_kind == "triton":
             self._response_format = ResponseFormat.TRITON
+            if "input_ids" in data["experiments"][0]["requests"][0]["request_inputs"]:
+                self._response_format = ResponseFormat.TENSORRTLLM_BACKEND
         elif self._service_kind == "triton_c_api":
             pass  # ignore
         else: