diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c6470449..2eba3aee 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -50,7 +50,7 @@ import copy
 
 
 # (prompt str, output str, prompt len, output len, request latency, latencies list)
-latency_tracking: List[Tuple[Optional[str], Optional[str], int, int, float, List[float]]] = []
+latency_tracking: List[Tuple[Optional[List[str]], Optional[str], int, int, float, List[float]]] = []
 
 
 def sample_requests_ShareGPT(
@@ -97,8 +97,8 @@ def sample_requests_ShareGPT(
         tokenized_dataset.append(([prompts[i]], prompt_token_ids[i], output_len))
 
     # Filter out too long sequences.
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    filtered_dataset: List[Tuple[List[str], int, int]] = []
+    for prompts, prompt_token_ids, output_len in tokenized_dataset:
         prompt_len = len(prompt_token_ids)
         # Prune too short sequences.
         if (min_input_tokens_len is not None and prompt_len < min_input_tokens_len) or (
@@ -112,7 +112,7 @@ def sample_requests_ShareGPT(
             continue
         if max_length is not None and prompt_len + output_len > max_length:
             continue
-        filtered_dataset.append(([prompt], prompt_len, output_len))
+        filtered_dataset.append((prompts, prompt_len, output_len))
 
     # Sample the requests.
     sampled_requests = random.sample(filtered_dataset, num_requests)
@@ -163,7 +163,7 @@ def sample_requests_IDC(
     max_new_tokens: int,
     num_requests: int,
     tokenizer: PreTrainedTokenizer,
-    config: Dict[str, Union[int, float]]
+    config: Dict[str, Union[int, float]],
 ) -> List[Tuple[List[str], int, int]]:
     """
     Sample requests from a dataset of IPEX format.
@@ -247,9 +247,9 @@ def gen_prompt_ids(prompt_len):
 
 
 async def get_request(
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: List[Tuple[List[str], int, int]],
     request_rate: float,
-) -> AsyncGenerator[Tuple[str, int, int], None]:
+) -> AsyncGenerator[Tuple[List[str], int, int], None]:
     """
     Asynchronously generates requests based on the input_requests and request_rate.
 
@@ -343,7 +343,7 @@ async def send_request(
 
     token_latencies_per_request: List[float] = []
 
-    timeout = aiohttp.ClientTimeout(total=5 * 3600)
+    timeout = aiohttp.ClientTimeout(total=5 * 3600)
     async with aiohttp.ClientSession(timeout=timeout) as session:
         while True:
             async with session.post(api_url, headers=headers, json=pload) as response:
@@ -388,13 +388,16 @@ async def send_request(
                     response_content = chunks[-2].decode("utf-8")
                     response_content = json.loads(response_content.split("data: ")[1])
                     generate_len = response_content["usage"]["completion_tokens"]
-                    response_text = []
+                    response_texts = []
                     for decoded_chunk in decoded_chunks:
                         text = decoded_chunk.split("data: ")[1]
                         if text.startswith("{"):
                             json_text = json.loads(text)
-                            if "choices" in json_text and "content" in json_text["choices"][0]["delta"]:
-                                response_text.append(json_text["choices"][0]["delta"]["content"])
+                            if (
+                                "choices" in json_text
+                                and "content" in json_text["choices"][0]["delta"]
+                            ):
+                                response_texts.append(json_text["choices"][0]["delta"]["content"])
                 else:
                     response_text = b"".join(chunks).decode("utf-8")
                     try:
@@ -411,11 +414,11 @@ async def send_request(
                 break
 
     if args.track_token_latency:
         print("response: ", "".join(response_texts))
 
     request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
-    prompt_str = prompt if track_input_output else None
+    prompt_str = prompts if track_input_output else None
     output_str = response_text if track_input_output else None
 
     if generate_len is not None:
@@ -449,7 +452,7 @@ async def benchmark(
 
     Args:
         api_url (str): The URL of the API.
-        input_requests (List[Tuple[str, int, int]]): A list of input requests, where each request is a tuple
+        input_requests (List[Tuple[List[str], int, int]]): A list of input requests, where each request is a tuple
             containing the prompt, prompt length, and output length.
         request_rate (float): The rate at which requests should be sent, in requests per second.
         config (dict): Configuration parameters for sending requests.
@@ -615,7 +618,11 @@ def main(args: argparse.Namespace):
     next_token_index = 1 if args.simple else 2
     if args.track_token_latency and latency_tracking:
         avg_first_token_latency = np.mean(
-            [latencies[first_token_index] for _, _, _, _, _, latencies in latency_tracking if latencies != []]
+            [
+                latencies[first_token_index]
+                for _, _, _, _, _, latencies in latency_tracking
+                if latencies != []
+            ]
         )
         avg_next_token_latency = np.mean(
             [
@@ -848,4 +855,3 @@ def main(args: argparse.Namespace):
     )
     args = parser.parse_args()
     main(args)
-
diff --git a/benchmarks/calc_stats.py b/benchmarks/calc_stats.py
index faa0811b..fe3b684c 100644
--- a/benchmarks/calc_stats.py
+++ b/benchmarks/calc_stats.py
@@ -3,7 +3,9 @@
 from typing import Dict, List
 
 if len(sys.argv) < 4:
-    raise ValueError("need arguments, file path, number of expected iterations and expected generated token length")
+    raise ValueError(
+        "need arguments, file path, number of expected iterations and expected generated token length"
+    )
 
 file_path = sys.argv[1]
 nbr_iter = int(sys.argv[2])
@@ -12,22 +14,24 @@
 with open(file_path) as f:
     lines = f.readlines()
 
-PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
-PAT_ITER = re.compile(r"Run iter (\d+)")
-PAT_ACTUAL_LEN = re.compile(r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\.")
-PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
-PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
-PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
-PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
-PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
-PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
-PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
-PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
-PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")
+PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
+PAT_ITER = re.compile(r"Run iter (\d+)")
+PAT_ACTUAL_LEN = re.compile(
+    r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\."
+)
+PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
+PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
+PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
+PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
+PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
+PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
+PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
+PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
+PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")
 
 
 nbr_users_perf: Dict[int, List[Dict[str, float]]] = {}
-token_lengths: List[int] = []
+token_lengths: List[int] = []
 
 state = 0
 current_nbr_user = -1
@@ -54,13 +58,15 @@
             print(">>>", line, m.group(1))
             token_lengths.append(int(m.group(1)))
             if expected_gen_token_len != int(m.group(2)):
-                raise ValueError("expected token lengths are not equal", expected_gen_token_len, m.group(2))
+                raise ValueError(
+                    "expected token lengths are not equal", expected_gen_token_len, m.group(2)
+                )
         else:
             m = PAT_TOTAL_TIME.match(line)
             if m:
                 metrics = nbr_users_perf[current_nbr_user][current_iter]
                 full_gen_lens = token_lengths + [512] * (current_nbr_user - len(token_lengths))
-                metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens))/current_nbr_user
+                metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens)) / current_nbr_user
                 metrics["TOTAL_TIME"] = float(m.group(1))
                 token_lengths = []
                 state = 4
@@ -138,30 +144,32 @@
         print("number of users: ", k)
         size = len(values)
         if size != nbr_iter:
-            raise ValueError("size should be equal to number of interations, " + str(size) + " != " + str(nbr_iter))
+            raise ValueError(
+                "size should be equal to number of iterations, "
+                + str(size)
+                + " != "
+                + str(nbr_iter)
+            )
         metrics = {
-            "ACT_GEN_TOKENS": 0.0,
-            "PROMPT_LEN": 0.0,
-            "TOTAL_TIME": 0.0,
-            "REQ_TPT": 0.0,
-            "INPUT_TPT": 0.0,
-            "OUTPUT_TPT": 0.0,
-            "REQ_LAT": 0.0,
-            "TOK_LAT": 0.0,
-            "FTOK_LAT": 0.0,
-            "NTOK_LAT": 0.0,
-        }
+            "ACT_GEN_TOKENS": 0.0,
+            "PROMPT_LEN": 0.0,
+            "TOTAL_TIME": 0.0,
+            "REQ_TPT": 0.0,
+            "INPUT_TPT": 0.0,
+            "OUTPUT_TPT": 0.0,
+            "REQ_LAT": 0.0,
+            "TOK_LAT": 0.0,
+            "FTOK_LAT": 0.0,
+            "NTOK_LAT": 0.0,
+        }
 
         for v in values:
-            for k in metrics:
-                metrics[k] += v[k]
-        for k, v in metrics.items():
-            metrics[k] = v/size
+            for kk in metrics:
+                metrics[kk] += v[kk]
+        for kk, vv in metrics.items():
+            metrics[kk] = vv / size
 
         print(metrics)
         print("=========================================")
-
 else:
     raise ValueError("Failed to collect metrics")
-
-
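Note on the request-tuple shape these hunks converge on: a request is now `(prompts, prompt_len, output_len)` with `prompts` typed as `List[str]` instead of a bare `str`, and `get_request` yields the tuples unchanged. The sketch below is not the benchmark code itself; it only illustrates an async generator with the same signature. The sample tuples are made up, and the exponential (Poisson-style) pacing is an assumption standing in for whatever `request_rate` handling the real `get_request` uses.

```python
import asyncio
import random
from typing import AsyncGenerator, List, Tuple

# Hypothetical sample tuples; in the benchmark they come from
# sample_requests_ShareGPT / sample_requests_IDC.
REQUESTS: List[Tuple[List[str], int, int]] = [
    (["What is the capital of France?"], 7, 32),
    (["Summarize the plot of Hamlet."], 8, 128),
]


async def get_request_sketch(
    input_requests: List[Tuple[List[str], int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[List[str], int, int], None]:
    """Yield (prompts, prompt_len, output_len) tuples, pacing arrivals."""
    for request in input_requests:
        yield request
        if request_rate == float("inf"):
            continue  # no pacing: issue all requests back to back
        # Exponentially distributed gap, i.e. Poisson arrivals at
        # `request_rate` requests per second (an assumption here).
        await asyncio.sleep(random.expovariate(request_rate))


async def main() -> None:
    async for prompts, prompt_len, output_len in get_request_sketch(REQUESTS, 2.0):
        print(prompt_len, output_len, prompts[0])


if __name__ == "__main__":
    asyncio.run(main())
```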