
Commit

fix lint issue
Signed-off-by: Jiafu Zhang <[email protected]>
jiafuzha committed Jul 16, 2024
1 parent d5694c2 commit 10f0f7c
Showing 2 changed files with 65 additions and 51 deletions.
38 changes: 22 additions & 16 deletions benchmarks/benchmark_serving.py
@@ -50,7 +50,7 @@
import copy

# (prompt str, output str, prompt len, output len, request latency, latencies list)
latency_tracking: List[Tuple[Optional[str], Optional[str], int, int, float, List[float]]] = []
latency_tracking: List[Tuple[Optional[List[str]], Optional[str], int, int, float, List[float]]] = []


def sample_requests_ShareGPT(
@@ -97,8 +97,8 @@ def sample_requests_ShareGPT(
tokenized_dataset.append(([prompts[i]], prompt_token_ids[i], output_len))

# Filter out too long sequences.
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
filtered_dataset: List[Tuple[List[str], int, int]] = []
for prompts, prompt_token_ids, output_len in tokenized_dataset:
prompt_len = len(prompt_token_ids)
# Prune too short sequences.
if (min_input_tokens_len is not None and prompt_len < min_input_tokens_len) or (
@@ -112,7 +112,7 @@ def sample_requests_ShareGPT(
continue
if max_length is not None and prompt_len + output_len > max_length:
continue
filtered_dataset.append(([prompt], prompt_len, output_len))
filtered_dataset.append((prompts, prompt_len, output_len))

# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
@@ -163,7 +163,7 @@ def sample_requests_IDC(
max_new_tokens: int,
num_requests: int,
tokenizer: PreTrainedTokenizer,
config: Dict[str, Union[int, float]]
config: Dict[str, Union[int, float]],
) -> List[Tuple[List[str], int, int]]:
"""
Sample requests from a dataset of IPEX format.
@@ -247,9 +247,9 @@ def gen_prompt_ids(prompt_len):


async def get_request(
input_requests: List[Tuple[str, int, int]],
input_requests: List[Tuple[List[str], int, int]],
request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
) -> AsyncGenerator[Tuple[List[str], int, int], None]:
"""
Asynchronously generates requests based on the input_requests and request_rate.
@@ -343,7 +343,7 @@ async def send_request(

token_latencies_per_request: List[float] = []

timeout = aiohttp.ClientTimeout(total=5 * 3600)
timeout = aiohttp.ClientTimeout(total=5 * 3600)
async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
async with session.post(api_url, headers=headers, json=pload) as response:
@@ -388,13 +388,16 @@ async def send_request(
response_content = chunks[-2].decode("utf-8")
response_content = json.loads(response_content.split("data: ")[1])
generate_len = response_content["usage"]["completion_tokens"]
response_text = []
response_texts = []
for decoded_chunk in decoded_chunks:
text = decoded_chunk.split("data: ")[1]
if text.startswith("{"):
json_text = json.loads(text)
if "choices" in json_text and "content" in json_text["choices"][0]["delta"]:
response_text.append(json_text["choices"][0]["delta"]["content"])
if (
"choices" in json_text
and "content" in json_text["choices"][0]["delta"]
):
response_texts.append(json_text["choices"][0]["delta"]["content"])
else:
response_text = b"".join(chunks).decode("utf-8")
try:
@@ -411,11 +414,11 @@ async def send_request(
break

if args.track_token_latency:
print("response: ", "".join(response_text))
print("response: ", "".join(response_texts))
request_end_time = time.perf_counter()
request_latency = request_end_time - request_start_time

prompt_str = prompt if track_input_output else None
prompt_str = prompts if track_input_output else None
output_str = response_text if track_input_output else None

if generate_len is not None:
@@ -449,7 +452,7 @@ async def benchmark(
Args:
api_url (str): The URL of the API.
input_requests (List[Tuple[str, int, int]]): A list of input requests, where each request is a tuple
input_requests (List[Tuple[List[str], int, int]]): A list of input requests, where each request is a tuple
containing the prompt, prompt length, and output length.
request_rate (float): The rate at which requests should be sent, in requests per second.
config (dict): Configuration parameters for sending requests.
@@ -615,7 +618,11 @@ def main(args: argparse.Namespace):
next_token_index = 1 if args.simple else 2
if args.track_token_latency and latency_tracking:
avg_first_token_latency = np.mean(
[latencies[first_token_index] for _, _, _, _, _, latencies in latency_tracking if latencies != []]
[
latencies[first_token_index]
for _, _, _, _, _, latencies in latency_tracking
if latencies != []
]
)
avg_next_token_latency = np.mean(
[
@@ -848,4 +855,3 @@ def main(args: argparse.Namespace):
)
args = parser.parse_args()
main(args)

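Note on the type change above: each latency_tracking entry now stores the prompt as a list of strings rather than a single string. A minimal sketch of the record shape and the first-token-latency aggregation, assuming the first recorded per-token latency is the first-token latency; the sample values and the index 0 are illustrative, not taken from the benchmark:

import numpy as np
from typing import List, Optional, Tuple

# (prompts, output str, prompt len, output len, request latency, per-token latencies)
latency_tracking: List[
    Tuple[Optional[List[str]], Optional[str], int, int, float, List[float]]
] = []

# Illustrative entry: one request with a single prompt segment and three streamed tokens.
latency_tracking.append((["Hello world"], "Hi there", 3, 3, 0.40, [0.20, 0.11, 0.09]))

# Mirrors the aggregation pattern in main(): mean first-token latency over requests
# that actually recorded per-token latencies (index 0 assumed here).
avg_first_token_latency = np.mean(
    [latencies[0] for _, _, _, _, _, latencies in latency_tracking if latencies != []]
)
print(avg_first_token_latency)  # 0.2 for the illustrative entry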
78 changes: 43 additions & 35 deletions benchmarks/calc_stats.py
@@ -3,7 +3,9 @@
from typing import Dict, List

if len(sys.argv) < 4:
raise ValueError("need arguments, file path, number of expected iterations and expected generated token length")
raise ValueError(
"need arguments, file path, number of expected iterations and expected generated token length"
)

file_path = sys.argv[1]
nbr_iter = int(sys.argv[2])
@@ -12,22 +14,24 @@
with open(file_path) as f:
lines = f.readlines()

PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
PAT_ITER = re.compile(r"Run iter (\d+)")
PAT_ACTUAL_LEN = re.compile(r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\.")
PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")
PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)")
PAT_ITER = re.compile(r"Run iter (\d+)")
PAT_ACTUAL_LEN = re.compile(
r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\."
)
PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s")
PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+")
PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s")
PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s")
PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s")
PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s")
PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s")
PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")
PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s")

nbr_users_perf: Dict[int, List[Dict[str, float]]] = {}

token_lengths: List[int] = []
token_lengths: List[int] = []

state = 0
current_nbr_user = -1
@@ -54,13 +58,15 @@
print(">>>", line, m.group(1))
token_lengths.append(int(m.group(1)))
if expected_gen_token_len != int(m.group(2)):
raise ValueError("expected token lengths are not equal", expected_gen_token_len, m.group(2))
raise ValueError(
"expected token lengths are not equal", expected_gen_token_len, m.group(2)
)
else:
m = PAT_TOTAL_TIME.match(line)
if m:
metrics = nbr_users_perf[current_nbr_user][current_iter]
full_gen_lens = token_lengths + [512] * (current_nbr_user - len(token_lengths))
metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens))/current_nbr_user
metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens)) / current_nbr_user
metrics["TOTAL_TIME"] = float(m.group(1))
token_lengths = []
state = 4
@@ -138,30 +144,32 @@
print("number of users: ", k)
size = len(values)
if size != nbr_iter:
raise ValueError("size should be equal to number of interations, " + str(size) + " != " + str(nbr_iter))
raise ValueError(
"size should be equal to number of interations, "
+ str(size)
+ " != "
+ str(nbr_iter)
)
metrics = {
"ACT_GEN_TOKENS": 0.0,
"PROMPT_LEN": 0.0,
"TOTAL_TIME": 0.0,
"REQ_TPT": 0.0,
"INPUT_TPT": 0.0,
"OUTPUT_TPT": 0.0,
"REQ_LAT": 0.0,
"TOK_LAT": 0.0,
"FTOK_LAT": 0.0,
"NTOK_LAT": 0.0,
}
"ACT_GEN_TOKENS": 0.0,
"PROMPT_LEN": 0.0,
"TOTAL_TIME": 0.0,
"REQ_TPT": 0.0,
"INPUT_TPT": 0.0,
"OUTPUT_TPT": 0.0,
"REQ_LAT": 0.0,
"TOK_LAT": 0.0,
"FTOK_LAT": 0.0,
"NTOK_LAT": 0.0,
}
for v in values:
for k in metrics:
metrics[k] += v[k]
for k, v in metrics.items():
metrics[k] = v/size
for kk in metrics:
metrics[kk] += v[kk]
for kk, vv in metrics.items():
metrics[kk] = vv / size
print(metrics)
print("=========================================")



else:
raise ValueError("Failed to collect metrics")

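For reference, the argument check at the top of calc_stats.py expects three positional arguments: the benchmark log file path, the number of iterations per user count, and the expected generated token length (e.g., python benchmarks/calc_stats.py serving_benchmark.log 3 512; the file name and values are illustrative only). The regex patterns reformatted above are anchored to fixed phrases in that log; a minimal sanity-check sketch with a hypothetical log line modeled on the pattern, not copied from a real run:

import re

PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s")

line = "Average latency for First Tokens: 0.123 s"  # hypothetical log line
m = PAT_FTOK_LAT.match(line)
if m:
    print(float(m.group(1)))  # 0.123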
