Feat/trust remote code #73

Open
wants to merge 31 commits into main

Commits (31)
2aa9902
feat: add sample time to result_metrics
jonzarecki Jul 16, 2024
cea6baa
feat: request_metrics instead of request_config
jonzarecki Jul 16, 2024
8520c3c
fix: add check to prevent bug in llmperf
jonzarecki Jul 31, 2024
e49e32b
chore: add gh issue of vllm
jonzarecki Jul 31, 2024
fa28bd2
fix: add handling for when certain errors occur in vllm
jonzarecki Aug 1, 2024
26db07c
add dolly dataset
jonzarecki Sep 1, 2024
5ff3299
Use appropriate tokenizer instead of defaulting to Llama tokenizer
markVaykhansky Sep 10, 2024
fdb8e21
CR Fixes
markVaykhansky Sep 10, 2024
fbf6782
Merge pull request #1 from Jounce-IO/use_appropriate_tokenizer
markVaykhansky Sep 10, 2024
437f351
Trim line optimally
markVaykhansky Sep 11, 2024
f0a0bc6
Add break + fix token count return value
markVaykhansky Sep 11, 2024
aa30928
Log prompts
markVaykhansky Sep 11, 2024
f1e0001
Bugfix - returned non-trimmed line
markVaykhansky Sep 11, 2024
5760f15
Fix typo
markVaykhansky Sep 11, 2024
e6ff81b
Merge pull request #2 from Jounce-IO/fix_generating_more_tokens_then_…
markVaykhansky Sep 11, 2024
5a275ad
CR Fixes
markVaykhansky Sep 11, 2024
d7622ba
Merge pull request #3 from Jounce-IO/log_generated_propmpts
markVaykhansky Sep 11, 2024
94cc3f3
Better CLI argument
markVaykhansky Sep 11, 2024
004d3af
Merge pull request #4 from Jounce-IO/log_generated_propmpts
markVaykhansky Sep 12, 2024
f5ec0b4
Fail llm-perf if error rate is bigger than 50
markVaykhansky Sep 19, 2024
080c6f4
Fail llm-perf if error rate is bigger than 50
markVaykhansky Sep 19, 2024
afafa0b
Merge pull request #5 from Jounce-IO/raise-exception-when-errors
markVaykhansky Sep 19, 2024
7cd1727
Add max num errors allowed parameter
markVaykhansky Sep 23, 2024
10dfbbd
Ignore dotenv
markVaykhansky Sep 23, 2024
2465465
Fix param typo
markVaykhansky Sep 23, 2024
f816533
Merge pull request #6 from Jounce-IO/raise-exception-when-errors
markVaykhansky Sep 23, 2024
b825817
Change fixed error count to error ratio
markVaykhansky Sep 24, 2024
a3f3c21
Remote error rate check after finished llmperf run
markVaykhansky Sep 24, 2024
1edae85
Fix constant value
markVaykhansky Sep 24, 2024
f32d776
Fail on timeout
markVaykhansky Sep 25, 2024
5e7e05c
add
jouDance Oct 9, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.env
# The build output should clearly not be checked in
*test-output.xml
/bazel-*
1 change: 1 addition & 0 deletions src/llmperf/common_metrics.py
@@ -15,3 +15,4 @@
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
ERROR_RATE = "error_rate"
NUM_REQ_STARTED = "num_requests_started"
REQ_START_TIME = "request_time"
16,006 changes: 16,006 additions & 0 deletions src/llmperf/dolly.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/llmperf/models.py
@@ -16,6 +16,7 @@ class RequestConfig(BaseModel):

model: str
prompt: Tuple[str, int]
sample_time: float
sampling_params: Optional[Dict[str, Any]] = None
llm_api: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
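
The new required sample_time field is what later surfaces as REQ_START_TIME in the per-request metrics (see the token_benchmark_ray.py hunks below). A minimal sketch of that flow, assuming the RequestConfig and common_metrics modules shown in this diff; the model name, prompt, and sampling params are placeholders:

    # Illustrative sketch, not part of the diff.
    import time

    from llmperf import common_metrics
    from llmperf.models import RequestConfig

    start_time = time.monotonic()
    # ... prompts are sampled inside the benchmark loop ...
    request_config = RequestConfig(
        model="my-org/my-model",                    # placeholder
        prompt=("some prompt text", 42),            # (prompt, input token count)
        sample_time=time.monotonic() - start_time,  # seconds since the run started
        sampling_params={"max_tokens": 128},
    )

    # After the request completes, the benchmark copies the value into its metrics:
    request_metrics = {common_metrics.REQ_START_TIME: request_config.sample_time}
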
37 changes: 23 additions & 14 deletions src/llmperf/utils.py
@@ -57,12 +57,10 @@ def upload_to_s3(results_path: str, s3_path: str) -> None:


def randomly_sample_sonnet_lines_prompt(
tokenizer,
prompt_tokens_mean: int = 550,
prompt_tokens_stddev: int = 250,
expect_output_tokens: int = 150,
tokenizer = LlamaTokenizerFast.from_pretrained(
"hf-internal-testing/llama-tokenizer")
) -> Tuple[str, int]:
expect_output_tokens: int = 150) -> Tuple[str, int]:
"""Generate a prompt that randomly samples lines from a the shakespeare sonnet at sonnet.txt.

Args:
@@ -93,29 +91,40 @@ def randomly_sample_sonnet_lines_prompt(
num_prompt_tokens = sample_random_positive_int(
prompt_tokens_mean, prompt_tokens_stddev
)

while num_prompt_tokens < get_token_length(prompt):
num_prompt_tokens = sample_random_positive_int(
prompt_tokens_mean, prompt_tokens_stddev
)

remaining_prompt_tokens = num_prompt_tokens - get_token_length(prompt)
sonnet_path = pathlib.Path(__file__).parent.resolve() / "sonnet.txt"

sonnet_path = pathlib.Path(__file__).parent.resolve() / "dolly.txt"
with open(sonnet_path, "r") as f:
sonnet_lines = f.readlines()
random.shuffle(sonnet_lines)

sampling_lines = True
while sampling_lines:
for line in sonnet_lines:
line_to_add = line
if remaining_prompt_tokens - get_token_length(line_to_add) < 0:
# This will cut off a line in the middle of a word, but that's ok since an
# llm should be able to handle that.
line_to_add = line_to_add[: int(math.ceil(remaining_prompt_tokens))]
trimmed_line = trim_line_optimally_if_exceeds_remaining_tokens(line,
remaining_prompt_tokens,
get_token_length)
prompt += trimmed_line
remaining_prompt_tokens -= get_token_length(trimmed_line)

if len(line) != len(trimmed_line) or remaining_prompt_tokens == 0:
sampling_lines = False
prompt += line_to_add
break
prompt += line_to_add
remaining_prompt_tokens -= get_token_length(line_to_add)
return (prompt, num_prompt_tokens)

return prompt, (num_prompt_tokens - remaining_prompt_tokens)


def trim_line_optimally_if_exceeds_remaining_tokens(line: str, max_tokens: int, get_token_length) -> str:
for line_index in reversed(range(len(line) + 1)):
trimmed_line = line[:line_index]
if get_token_length(trimmed_line) <= max_tokens:
return trimmed_line


def sample_random_positive_int(mean: int, stddev: int) -> int:
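
The sampling loop above now delegates truncation to trim_line_optimally_if_exceeds_remaining_tokens, which returns the longest prefix of a line that still fits the remaining token budget (the old code truncated at a character index equal to the remaining token count). A small usage sketch with a toy whitespace "tokenizer" standing in for the real one:

    # Illustrative sketch, not part of the diff.
    from llmperf.utils import trim_line_optimally_if_exceeds_remaining_tokens

    # Toy token counter: one "token" per whitespace-separated word.
    get_token_length = lambda text: len(text.split())

    line = "What is the capital of France and when was it founded?\n"
    trimmed = trim_line_optimally_if_exceeds_remaining_tokens(line, 5, get_token_length)

    # The helper scans prefixes from longest to shortest and returns the first
    # one whose token count fits the budget.
    assert get_token_length(trimmed) <= 5
    assert line.startswith(trimmed)
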
94 changes: 76 additions & 18 deletions token_benchmark_ray.py
@@ -23,7 +23,13 @@
)
from tqdm import tqdm

from transformers import LlamaTokenizerFast
from transformers import LlamaTokenizerFast, AutoTokenizer


def get_tokenizer(model: str) -> LlamaTokenizerFast | AutoTokenizer:
model = model.replace("huggingface/", "")
return AutoTokenizer.from_pretrained(model, trust_remote_code=True)
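
# Illustrative usage, not part of the diff: get_tokenizer strips the optional
# "huggingface/" prefix from the benchmark's model name and loads the tokenizer
# with trust_remote_code=True, so models that ship custom tokenizer code are no
# longer forced onto the default Llama tokenizer. "my-org/my-model" is a placeholder.
example_tokenizer = get_tokenizer("huggingface/my-org/my-model")
example_token_count = len(example_tokenizer.encode("The quick brown fox"))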


def get_token_throughput_latencies(
model: str,
@@ -34,8 +40,10 @@ def get_token_throughput_latencies(
additional_sampling_params: Optional[Dict[str, Any]] = None,
num_concurrent_requests: int = 1,
max_num_completed_requests: int = 500,
test_timeout_s=90,
llm_api="openai",
test_timeout_s: int =90,
llm_api: str = "openai",
log_prompts: bool = False,
max_errors_ratio_allowed: float=0.1,
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
"""Get the token throughput and latencies for the given model.

@@ -59,11 +67,9 @@
"""
random.seed(11111)

tokenizer = LlamaTokenizerFast.from_pretrained(
"hf-internal-testing/llama-tokenizer"
)
tokenizer = get_tokenizer(model=model)
get_token_length = lambda text: len(tokenizer.encode(text))

if not additional_sampling_params:
additional_sampling_params = {}

@@ -81,26 +87,47 @@
num_output_tokens_list.append(num_output_tokens)

prompts.append(randomly_sample_sonnet_lines_prompt(
tokenizer=tokenizer,
prompt_tokens_mean=mean_input_tokens,
prompt_tokens_stddev=stddev_input_tokens,
expect_output_tokens=num_output_tokens,
tokenizer=tokenizer
))

if log_prompts:
print("Sending the following prompts:")
print(prompts)
else:
# 'prompts' is an array of tuples where each item is (prompt, token_length)
print("Sending the following prompt sizes:")
print(list(map(lambda prompt_with_token_count: prompt_with_token_count[1], prompts)))

start_time = time.monotonic()
iter = 0
pbar = tqdm(total=max_num_completed_requests)
while (
time.monotonic() - start_time < test_timeout_s
and len(completed_requests) < max_num_completed_requests
# https://github.com/vllm-project/vllm/issues/2484
and len(num_output_tokens_list) > 0 # happens when requests are aborted
):
iter += 1

total_requests_with_errors: int = len([metric for metric in completed_requests
if metric[common_metrics.ERROR_CODE] is not None])
completed_requests_error_ratio: float = total_requests_with_errors / max_num_completed_requests

if completed_requests_error_ratio > max_errors_ratio_allowed:
raise Exception(f"Max errors ratio allowed is {max_errors_ratio_allowed} but "
f"{total_requests_with_errors} / {max_num_completed_requests} "
f"requests contained an error")

default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
default_sampling_params.update(additional_sampling_params)
request_config = RequestConfig(
model=model,
prompt=prompts.pop(),
sampling_params=default_sampling_params,
sample_time=time.monotonic() - start_time,
llm_api=llm_api,
)
req_launcher.launch_requests(request_config)
@@ -111,39 +138,46 @@ def get_token_throughput_latencies(
outs = req_launcher.get_next_ready()
all_metrics = []
for out in outs:
request_metrics, gen_text, _ = out
request_metrics, gen_text, req_config = out
num_output_tokens = get_token_length(gen_text)
if num_output_tokens:
request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens
else:
request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
if request_metrics[common_metrics.E2E_LAT] > 0:
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
else:
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = 0
request_metrics[common_metrics.REQ_START_TIME] = req_config.sample_time
all_metrics.append(request_metrics)

completed_requests.extend(all_metrics)
pbar.update(len(completed_requests) - num_completed_requests)
num_completed_requests = len(completed_requests)

pbar.close()
end_time = time.monotonic()
if end_time - start_time >= test_timeout_s:
print("Test timed out before all requests could be completed.")
raise Exception(f"Test timed out after {test_timeout_s} seconds before all requests could be completed.")

# check one last time that there are no remaining results to collect.
outs = req_launcher.get_next_ready()
all_metrics = []
for out in outs:
request_metrics, gen_text, _ = out
num_output_tokens = get_token_length(gen_text)
if num_output_tokens:
request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens
else:
request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]

if request_metrics[common_metrics.E2E_LAT] > 0:
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
else:
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = 0
all_metrics.append(request_metrics)
completed_requests.extend(all_metrics)

Expand All @@ -161,7 +195,7 @@ def get_token_throughput_latencies(
}

metadata["results"] = ret

return metadata, completed_requests


@@ -200,7 +234,7 @@ def flatten(item):

df = pd.DataFrame(metrics)
df_without_errored_req = df[df[common_metrics.ERROR_CODE].isna()]

for key in [
common_metrics.INTER_TOKEN_LAT,
common_metrics.TTFT,
@@ -259,7 +293,7 @@ def flatten(item):

ret[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
ret[common_metrics.COMPLETED_REQUESTS_PER_MIN] = num_completed_requests_per_min

return ret


@@ -276,6 +310,8 @@ def run_token_benchmark(
additional_sampling_params: str,
results_dir: str,
user_metadata: Dict[str, Any],
log_prompts: bool,
max_errors_ratio_allowed: float,
):
"""
Args:
@@ -311,6 +347,8 @@ def run_token_benchmark(
stddev_output_tokens=stddev_output_tokens,
num_concurrent_requests=num_concurrent_requests,
additional_sampling_params=json.loads(additional_sampling_params),
log_prompts=log_prompts,
max_errors_ratio_allowed=max_errors_ratio_allowed,
)

if results_dir:
Expand Down Expand Up @@ -446,6 +484,24 @@ def run_token_benchmark(
"name=foo,bar=1. These will be added to the metadata field of the results. "
),
)
args.add_argument(
"--log-prompts",
type=bool,
action=argparse.BooleanOptionalAction,
default=False,
help=(
"If True will log all prompts sent to the model"
),
)
args.add_argument(
"--max-errors-ratio-allowed",
type=float,
default=0.1,
help=(
"Max errors ratio allowed (i.e completed_requests_with_error / max_num_completed_requests) "
"tolerated in am LLMPerf run"
),
)

if __name__ == "__main__":
env_vars = dict(os.environ)
@@ -472,4 +528,6 @@ def run_token_benchmark(
additional_sampling_params=args.additional_sampling_params,
results_dir=args.results_dir,
user_metadata=user_metadata,
log_prompts=args.log_prompts,
max_errors_ratio_allowed=args.max_errors_ratio_allowed,
)
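
For completeness, a hedged sketch of driving the updated entry point programmatically with the two new parameters; it assumes the upstream parameter names of run_token_benchmark are unchanged, and every value below is a placeholder:

    # Illustrative sketch, not part of the diff; all values are placeholders.
    # ray.init(...) may be required first, as in the script's __main__ block.
    from token_benchmark_ray import run_token_benchmark

    run_token_benchmark(
        llm_api="openai",
        model="huggingface/my-org/my-model",
        test_timeout_s=600,               # a timeout now raises instead of only printing
        max_num_completed_requests=500,
        num_concurrent_requests=10,
        mean_input_tokens=550,
        stddev_input_tokens=150,
        mean_output_tokens=150,
        stddev_output_tokens=10,
        additional_sampling_params="{}",
        results_dir="result_outputs",
        user_metadata={},
        log_prompts=False,                # new flag: when True, log full prompts instead of just their token counts
        max_errors_ratio_allowed=0.1,     # new flag: abort once more than 10% of requests error out
    )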