diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c6470449..2eba3aee 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -50,7 +50,7 @@ import copy
 
 
 # (prompt str, output str, prompt len, output len, request latency, latencies list)
-latency_tracking: List[Tuple[Optional[str], Optional[str], int, int, float, List[float]]] = []
+latency_tracking: List[Tuple[Optional[List[str]], Optional[str], int, int, float, List[float]]] = []
 
 
 def sample_requests_ShareGPT(
@@ -97,8 +97,8 @@ def sample_requests_ShareGPT(
         tokenized_dataset.append(([prompts[i]], prompt_token_ids[i], output_len))
 
     # Filter out too long sequences.
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    filtered_dataset: List[Tuple[List[str], int, int]] = []
+    for prompts, prompt_token_ids, output_len in tokenized_dataset:
         prompt_len = len(prompt_token_ids)
         # Prune too short sequences.
         if (min_input_tokens_len is not None and prompt_len < min_input_tokens_len) or (
@@ -112,7 +112,7 @@ def sample_requests_ShareGPT(
             continue
         if max_length is not None and prompt_len + output_len > max_length:
             continue
-        filtered_dataset.append(([prompt], prompt_len, output_len))
+        filtered_dataset.append((prompts, prompt_len, output_len))
 
     # Sample the requests.
     sampled_requests = random.sample(filtered_dataset, num_requests)
@@ -163,7 +163,7 @@ def sample_requests_IDC(
     max_new_tokens: int,
     num_requests: int,
     tokenizer: PreTrainedTokenizer,
-    config: Dict[str, Union[int, float]]
+    config: Dict[str, Union[int, float]],
 ) -> List[Tuple[List[str], int, int]]:
     """
     Sample requests from a dataset of IPEX format.
@@ -247,9 +247,9 @@ def gen_prompt_ids(prompt_len):
 
 
 async def get_request(
-    input_requests: List[Tuple[str, int, int]],
+    input_requests: List[Tuple[List[str], int, int]],
     request_rate: float,
-) -> AsyncGenerator[Tuple[str, int, int], None]:
+) -> AsyncGenerator[Tuple[List[str], int, int], None]:
     """
     Asynchronously generates requests based on the input_requests and request_rate.
 
@@ -343,7 +343,7 @@ async def send_request(
 
     token_latencies_per_request: List[float] = []
 
-    timeout = aiohttp.ClientTimeout(total=5 * 3600)
+    timeout = aiohttp.ClientTimeout(total=5 * 3600)
     async with aiohttp.ClientSession(timeout=timeout) as session:
         while True:
             async with session.post(api_url, headers=headers, json=pload) as response:
@@ -388,13 +388,16 @@ async def send_request(
                     response_content = chunks[-2].decode("utf-8")
                     response_content = json.loads(response_content.split("data: ")[1])
                     generate_len = response_content["usage"]["completion_tokens"]
-                    response_text = []
+                    response_texts = []
                     for decoded_chunk in decoded_chunks:
                         text = decoded_chunk.split("data: ")[1]
                         if text.startswith("{"):
                             json_text = json.loads(text)
-                            if "choices" in json_text and "content" in json_text["choices"][0]["delta"]:
-                                response_text.append(json_text["choices"][0]["delta"]["content"])
+                            if (
+                                "choices" in json_text
+                                and "content" in json_text["choices"][0]["delta"]
+                            ):
+                                response_texts.append(json_text["choices"][0]["delta"]["content"])
                 else:
                     response_text = b"".join(chunks).decode("utf-8")
                     try:
@@ -411,11 +414,11 @@ async def send_request(
                 break
 
     if args.track_token_latency:
         print("response: ", "".join(response_texts))
 
     request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
-    prompt_str = prompt if track_input_output else None
+    prompt_str = prompts if track_input_output else None
     output_str = response_text if track_input_output else None
 
     if generate_len is not None:
@@ -449,7 +452,7 @@ async def benchmark(
 
     Args:
         api_url (str): The URL of the API.
-        input_requests (List[Tuple[str, int, int]]): A list of input requests, where each request is a tuple
+        input_requests (List[Tuple[List[str], int, int]]): A list of input requests, where each request is a tuple
             containing the prompt, prompt length, and output length.
         request_rate (float): The rate at which requests should be sent, in requests per second.
         config (dict): Configuration parameters for sending requests.
@@ -615,7 +618,11 @@ def main(args: argparse.Namespace):
     next_token_index = 1 if args.simple else 2
     if args.track_token_latency and latency_tracking:
         avg_first_token_latency = np.mean(
-            [latencies[first_token_index] for _, _, _, _, _, latencies in latency_tracking if latencies != []]
+            [
+                latencies[first_token_index]
+                for _, _, _, _, _, latencies in latency_tracking
+                if latencies != []
+            ]
         )
         avg_next_token_latency = np.mean(
             [
@@ -848,4 +855,3 @@ def main(args: argparse.Namespace):
     )
     args = parser.parse_args()
     main(args)
-
diff --git a/benchmarks/calc_stats.py b/benchmarks/calc_stats.py
index faa0811b..fe3b684c 100644
--- a/benchmarks/calc_stats.py
+++ b/benchmarks/calc_stats.py
@@ -3,7 +3,9 @@
 from typing import Dict, List
 
 if len(sys.argv) < 4:
-    raise ValueError("need arguments, file path, number of expected iterations and expected generated token length")
+    raise ValueError(
+        "need arguments, file path, number of expected iterations and expected generated token length"
+    )
 
 file_path = sys.argv[1]
 nbr_iter = int(sys.argv[2])
@@ -12,22 +14,24 @@
 with open(file_path) as f:
     lines = f.readlines()
 
-PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
-PAT_ITER = re.compile(r"Run iter (\d+)")
-PAT_ACTUAL_LEN = re.compile(r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\.")
-PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
-PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
-PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
-PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
-PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
-PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
-PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
-PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
-PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")
+PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
+PAT_ITER = re.compile(r"Run iter (\d+)")
+PAT_ACTUAL_LEN = re.compile(
+    r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\."
+)
+PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
+PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
+PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
+PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
+PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
+PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
+PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
+PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
+PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")
 
 
 nbr_users_perf: Dict[int, List[Dict[str, float]]] = {}
-token_lengths: List[int] = []
+token_lengths: List[int] = []
 
 state = 0
 current_nbr_user = -1
@@ -54,13 +58,15 @@
             print(">>>", line, m.group(1))
             token_lengths.append(int(m.group(1)))
             if expected_gen_token_len != int(m.group(2)):
-                raise ValueError("expected token lengths are not equal", expected_gen_token_len, m.group(2))
+                raise ValueError(
+                    "expected token lengths are not equal", expected_gen_token_len, m.group(2)
+                )
         else:
             m = PAT_TOTAL_TIME.match(line)
             if m:
                 metrics = nbr_users_perf[current_nbr_user][current_iter]
                 full_gen_lens = token_lengths + [512] * (current_nbr_user - len(token_lengths))
-                metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens))/current_nbr_user
+                metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens)) / current_nbr_user
                 metrics["TOTAL_TIME"] = float(m.group(1))
                 token_lengths = []
                 state = 4
@@ -138,30 +144,32 @@
         print("number of users: ", k)
         size = len(values)
         if size != nbr_iter:
-            raise ValueError("size should be equal to number of interations, " + str(size) + " != " + str(nbr_iter))
+            raise ValueError(
+                "size should be equal to number of iterations, "
+                + str(size)
+                + " != "
+                + str(nbr_iter)
+            )
         metrics = {
-            "ACT_GEN_TOKENS": 0.0,
-            "PROMPT_LEN": 0.0,
-            "TOTAL_TIME": 0.0,
-            "REQ_TPT": 0.0,
-            "INPUT_TPT": 0.0,
-            "OUTPUT_TPT": 0.0,
-            "REQ_LAT": 0.0,
-            "TOK_LAT": 0.0,
-            "FTOK_LAT": 0.0,
-            "NTOK_LAT": 0.0,
-        }
+            "ACT_GEN_TOKENS": 0.0,
+            "PROMPT_LEN": 0.0,
+            "TOTAL_TIME": 0.0,
+            "REQ_TPT": 0.0,
+            "INPUT_TPT": 0.0,
+            "OUTPUT_TPT": 0.0,
+            "REQ_LAT": 0.0,
+            "TOK_LAT": 0.0,
+            "FTOK_LAT": 0.0,
+            "NTOK_LAT": 0.0,
+        }
 
         for v in values:
-            for k in metrics:
-                metrics[k] += v[k]
-        for k, v in metrics.items():
-            metrics[k] = v/size
+            for kk in metrics:
+                metrics[kk] += v[kk]
+        for kk, vv in metrics.items():
+            metrics[kk] = vv / size
 
         print(metrics)
         print("=========================================")
-
 else:
     raise ValueError("Failed to collect metrics")
-
-
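Note on the request-tuple shape these hunks converge on: a request is now `(prompts, prompt_len, output_len)` with `prompts` typed as `List[str]` instead of a bare `str`, and `get_request` yields the tuples unchanged. The sketch below is not the benchmark code itself; it only illustrates an async generator with the same signature. The sample tuples are made up, and the exponential (Poisson-style) pacing is an assumption standing in for whatever `request_rate` handling the real `get_request` uses.

```python
import asyncio
import random
from typing import AsyncGenerator, List, Tuple

# Hypothetical sample tuples; in the benchmark they come from
# sample_requests_ShareGPT / sample_requests_IDC.
REQUESTS: List[Tuple[List[str], int, int]] = [
    (["What is the capital of France?"], 7, 32),
    (["Summarize the plot of Hamlet."], 8, 128),
]


async def get_request_sketch(
    input_requests: List[Tuple[List[str], int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[List[str], int, int], None]:
    """Yield (prompts, prompt_len, output_len) tuples, pacing arrivals."""
    for request in input_requests:
        yield request
        if request_rate == float("inf"):
            continue  # no pacing: issue all requests back to back
        # Exponentially distributed gap, i.e. Poisson arrivals at
        # `request_rate` requests per second (an assumption here).
        await asyncio.sleep(random.expovariate(request_rate))


async def main() -> None:
    async for prompts, prompt_len, output_len in get_request_sketch(REQUESTS, 2.0):
        print(prompt_len, output_len, prompts[0])


if __name__ == "__main__":
    asyncio.run(main())
```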