diff --git a/README.md b/README.md
index 5abc91d..1b6f421 100644
--- a/README.md
+++ b/README.md
@@ -11,23 +11,9 @@ pip install -e .

 # Basic Usage

-We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness.
+We implement a load test for evaluating LLM performance. The test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests.

-## Load test
-
-The load test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests. The prompt that is sent with each request is of the format:
-
-```
-Randomly stream lines from the following text. Don't generate eos tokens:
-LINE 1,
-LINE 2,
-LINE 3,
-...
-```
-
-Where the lines are randomly sampled from a collection of lines from Shakespeare sonnets. Tokens are counted using the `LlamaTokenizer` regardless of which LLM API is being tested. This is to ensure that the prompts are consistent across different LLM APIs.
-
-To run the most basic load test you can the token_benchmark_ray script.
+We support various LLM clients, datasets, and scenarios. To run the most basic load test, you can run the `benchmark.py` script.

 ### Caveats and Disclaimers

@@ -37,32 +23,13 @@ To run the most basic load test you can the token_benchmark_ray script.
 - The results may vary with the load.
 - The results may not correlate with users’ workloads.

-### OpenAI Compatible APIs
-```bash
-export OPENAI_API_KEY=secret_abcdefg
-export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1"
-
-python token_benchmark_ray.py \
---model "meta-llama/Llama-2-7b-chat-hf" \
---mean-input-tokens 550 \
---stddev-input-tokens 150 \
---mean-output-tokens 150 \
---stddev-output-tokens 10 \
---max-num-completed-requests 2 \
---timeout 600 \
---num-concurrent-requests 1 \
---results-dir "result_outputs" \
---llm-api openai \
---additional-sampling-params '{}'
-
-```
-
-### Anthropic
+### FuriosaAI APIs

 ```bash
-export ANTHROPIC_API_KEY=secret_abcdefg
+export FURIOSA_API_BASE="YOUR_FURIOSA_API_ENDPOINT"

-python token_benchmark_ray.py \
---model "claude-2" \
+python benchmark.py \
+--model "meta-llama/Llama-3.2-1B-Instruct" \
+--dataset translation \
 --mean-input-tokens 550 \
 --stddev-input-tokens 150 \
 --mean-output-tokens 150 \
@@ -70,95 +37,21 @@ python token_benchmark_ray.py \
 --max-num-completed-requests 2 \
 --timeout 600 \
 --num-concurrent-requests 1 \
+--wait-for any \
 --results-dir "result_outputs" \
---llm-api anthropic \
+--llm-api furiosa \
 --additional-sampling-params '{}'

 ```

-### TogetherAI
-
-```bash
-export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY"
-
-python token_benchmark_ray.py \
---model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \
---mean-input-tokens 550 \
---stddev-input-tokens 150 \
---mean-output-tokens 150 \
---stddev-output-tokens 10 \
---max-num-completed-requests 2 \
---timeout 600 \
---num-concurrent-requests 1 \
---results-dir "result_outputs" \
---llm-api "litellm" \
---additional-sampling-params '{}'
-
-```
-
-### Hugging Face
-
-```bash
-export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY"
-export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT"
-
-python token_benchmark_ray.py \
---model "huggingface/meta-llama/Llama-2-7b-chat-hf" \
---mean-input-tokens 550 \
---stddev-input-tokens 150 \
---mean-output-tokens 150 \
---stddev-output-tokens 10 \
---max-num-completed-requests 2 \
---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api "litellm" \ ---additional-sampling-params '{}' - -``` - -### LiteLLM - -LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. - -see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). - -```bash -python token_benchmark_ray.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---mean-input-tokens 550 \ ---stddev-input-tokens 150 \ ---mean-output-tokens 150 \ ---stddev-output-tokens 10 \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api "litellm" \ ---additional-sampling-params '{}' - -``` - -### Vertex AI - -Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID. - -The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. - -Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. - +### OpenAI Compatible APIs ```bash +export OPENAI_API_KEY=secret_abcdefg +export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1" -gcloud auth application-default login -gcloud config set project YOUR_PROJECT_ID - -export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) -export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID -export GCLOUD_REGION=YOUR_REGION -export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID - -python token_benchmark_ray.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ +python benchmark.py \ +--model "meta-llama/Llama-3.2-1B-Instruct" \ +--dataset sonnet \ --mean-input-tokens 550 \ --stddev-input-tokens 150 \ --mean-output-tokens 150 \ @@ -166,186 +59,18 @@ python token_benchmark_ray.py \ --max-num-completed-requests 2 \ --timeout 600 \ --num-concurrent-requests 1 \ +--wait-for any \ --results-dir "result_outputs" \ ---llm-api "vertexai" \ +--llm-api openai \ --additional-sampling-params '{}' ``` -### SageMaker - -SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. - -```bash - -export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" -export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s -export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" -export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" - -python llm_correctness.py \ ---model "llama-2-7b" \ ---llm-api "sagemaker" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` +see `python benchmark.py --help` for more details on the arguments. -see `python token_benchmark_ray.py --help` for more details on the arguments. - -## Correctness Test - -The correctness test spawns a number of concurrent requests to the LLM API with the following format: - -``` -Convert the following sequence of words into a number: {random_number_in_word_format}. Output just your final answer. -``` - -where random_number_in_word_format could be for example "one hundred and twenty three". The test then checks that the response contains that number in digit format which in this case would be 123. - -The test does this for a number of randomly generated numbers and reports the number of responses that contain a mismatch. 
- -To run the most basic correctness test you can run the the llm_correctness.py script. - -### OpenAI Compatible APIs - -```bash -export OPENAI_API_KEY=secret_abcdefg -export OPENAI_API_BASE=https://console.endpoints.anyscale.com/m/v1 - -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---max-num-completed-requests 150 \ ---timeout 600 \ ---num-concurrent-requests 10 \ ---results-dir "result_outputs" -``` - -### Anthropic - -```bash -export ANTHROPIC_API_KEY=secret_abcdefg - -python llm_correctness.py \ ---model "claude-2" \ ---llm-api "anthropic" \ ---max-num-completed-requests 5 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" -``` - -### TogetherAI - -```bash -export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY" - -python llm_correctness.py \ ---model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \ ---llm-api "litellm" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -### Hugging Face - -```bash -export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" -export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT" - -python llm_correctness.py \ ---model "huggingface/meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "litellm" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -### LiteLLM - -LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. - -see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). - -```bash -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "litellm" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -see `python llm_correctness.py --help` for more details on the arguments. - - -### Vertex AI - -Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID. - -The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. - -Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. - - -```bash - -gcloud auth application-default login -gcloud config set project YOUR_PROJECT_ID - -export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) -export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID -export GCLOUD_REGION=YOUR_REGION -export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID - -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "vertexai" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -### SageMaker - -SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. 
- -```bash - -export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" -export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s -export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" -export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" - -python llm_correctness.py \ ---model "llama-2-7b" \ ---llm-api "sagemaker" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -## Saving Results - -The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned. # Advanced Usage -The correctness tests were implemented with the following workflow in mind: - ```python import ray from transformers import LlamaTokenizerFast @@ -353,8 +78,7 @@ from transformers import LlamaTokenizerFast from llmperf.ray_clients.openai_chat_completions_client import ( OpenAIChatCompletionsClient, ) -from llmperf.models import RequestConfig -from llmperf.requests_launcher import RequestsLauncher +from llmperf.launcher.wait_for_all import WaitForAllLauncher # Copying the environment variables and passing them to ray.init() is necessary @@ -362,36 +86,34 @@ from llmperf.requests_launcher import RequestsLauncher ray.init(runtime_env={"env_vars": {"OPENAI_API_BASE" : "https://api.endpoints.anyscale.com/v1", "OPENAI_API_KEY" : "YOUR_API_KEY"}}) +MODEL="meta-llama/Llama-2-7b-chat-hf" + base_prompt = "hello_world" tokenizer = LlamaTokenizerFast.from_pretrained( "hf-internal-testing/llama-tokenizer" ) -base_prompt_len = len(tokenizer.encode(base_prompt)) +get_token_len = lambda text: len(tokenizer.encode(text)) + +base_prompt_len = get_token_len(base_prompt) prompt = (base_prompt, base_prompt_len) # Create a client for spawning requests -clients = [OpenAIChatCompletionsClient.remote()] - -req_launcher = RequestsLauncher(clients) +clients = [OpenAIChatCompletionsClient.remote(get_token_len)] -req_config = RequestConfig( - model="meta-llama/Llama-2-7b-chat-hf", - prompt=prompt - ) +req_launcher = WaitForAllLauncher(MODEL, clients, {}) -req_launcher.launch_requests(req_config) -result = req_launcher.get_next_ready(block=True) +result = req_launcher.launch(0, [prompt], [128]) print(result) ``` # Implementing New LLM Clients -To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor. +To implement a new LLM client, you need to implement the base class `llmperf.ray_clients.LLMClient` and decorate it as a ray actor. ```python -from llmperf.ray_llm_client import LLMClient +from llmperf.ray_clients import LLMClient import ray diff --git a/analyze-token-benchmark-results.ipynb b/analyze-token-benchmark-results.ipynb index d6c5a45..272c56b 100644 --- a/analyze-token-benchmark-results.ipynb +++ b/analyze-token-benchmark-results.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "source": [ "# Token Benchmark Example Analysis\n", - "The following is an example of the analysis that can be done on individual responses that are saved when running `token_benchmark_ray.py` with the flag `--results-dir` which enables the saving of all responses." + "The following is an example of the analysis that can be done on individual responses that are saved when running `benchmark.py` with the flag `--results-dir` which enables the saving of all responses." 
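For reference, a rough sketch of the kind of per-request analysis this notebook describes. It assumes the individual responses are saved as a JSON list of records under `--results-dir` and that pandas is available; the file-name pattern below is illustrative, not part of this change.

```python
# Minimal sketch: load the saved per-request records and inspect distributions.
# The results directory and file-name glob are assumptions for illustration.
import json
from pathlib import Path

import pandas as pd

results_dir = Path("result_outputs")
# Pick whichever *_individual_responses.json file the run produced.
responses_file = next(results_dir.glob("*individual_responses*.json"))

with open(responses_file) as f:
    raw = json.load(f)

# Flatten the per-request records (metrics, generated text, request config)
# into a DataFrame and look at latency/throughput distributions.
df = pd.json_normalize(raw)
print(df.columns.tolist())
print(df.describe())
```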
] }, { diff --git a/token_benchmark_ray.py b/benchmark.py similarity index 96% rename from token_benchmark_ray.py rename to benchmark.py index aa95b18..6ec80cd 100644 --- a/token_benchmark_ray.py +++ b/benchmark.py @@ -11,12 +11,10 @@ import ray from llmperf import common_metrics -from llmperf.common import SUPPORTED_APIS, construct_clients +from llmperf.common import SUPPORTED_APIS, construct_clients, construct_launcher from llmperf.datasets import randomly_sample_prompt -from llmperf.launcher.wait_for_all import WaitForAllLauncher -from llmperf.launcher.wait_for_any import WaitForAnyLauncher from llmperf.utils import ( LLMPerfResults, sample_random_positive_int, @@ -25,15 +23,6 @@ from transformers import AutoTokenizer -def construct_launcher(wait_for, model, clients, additional_sampling_params): - if wait_for == "all": - return WaitForAllLauncher(model, clients, additional_sampling_params) - elif wait_for == "any": - return WaitForAnyLauncher(model, clients, additional_sampling_params) - else: - raise ValueError(f"Wrong type for 'wait_for' option: {wait_for}") - - def get_token_throughput_latencies( model: str, dataset: str, @@ -43,7 +32,7 @@ def get_token_throughput_latencies( stddev_output_tokens: int, additional_sampling_params: Optional[Dict[str, Any]] = None, num_concurrent_requests: int = 1, - wait_for: str = all, + wait_for: str = "all", max_num_completed_requests: int = 500, test_timeout_s=90, llm_api="openai", @@ -236,7 +225,7 @@ def flatten(item): return ret -def run_token_benchmark( +def run_benchmark( llm_api: str, model: str, dataset: str, @@ -415,7 +404,7 @@ def run_token_benchmark( args.add_argument( "--llm-api", type=str, - default="openai", + default="furiosa", help=( f"The name of the llm api to use. Can select from {SUPPORTED_APIS}" " (default: %(default)s)" @@ -452,7 +441,7 @@ def run_token_benchmark( key, value = item.split("=") user_metadata[key] = value - run_token_benchmark( + run_benchmark( llm_api=args.llm_api, model=args.model, dataset=args.dataset, diff --git a/llm_correctness.py b/llm_correctness.py deleted file mode 100644 index c9d102d..0000000 --- a/llm_correctness.py +++ /dev/null @@ -1,309 +0,0 @@ -import argparse -import json -import os -from pathlib import Path -import random -import re -import time -from typing import Any, Dict, List, Optional, Tuple - -import num2words -import ray -from tqdm import tqdm - -from llmperf import common_metrics -from llmperf.common import SUPPORTED_APIS, construct_clients -from llmperf.models import RequestConfig -from llmperf.requests_launcher import RequestsLauncher -from llmperf.utils import ( - LLMPerfResults, -) - -MAX_RANDOM_NUMBER = 10000 - - -def llm_correctness( - model: str, - additional_sampling_params: Optional[Dict[str, Any]] = None, - num_concurrent_requests: int = 1, - max_num_completed_requests: int = 500, - test_timeout_s=90, - llm_api="chat", -) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: - """Get the token throughput and latencies for the given model. - - Args: - model: The name of the model to query. - additional_sampling_params: Additional sampling parameters to send with the request. - For more information see the LLM APIs documentation for the completions - num_concurrent_requests: The number of concurrent requests to make. Increase - this to increase the amount of load and vice versa. - test_timeout_s: The amount of time to run the test for before reporting results. - llm_api: The type of request to make. Either "chat" or "litellm". 
- - Returns: - A tuple containing summary metrics and raw results from the test. - - """ - - if not additional_sampling_params: - additional_sampling_params = {} - - clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests) - req_launcher = RequestsLauncher(clients) - start_time = time.monotonic() - - num_errored_requests = 0 - num_mismatched_requests = 0 - num_completed_requests = 0 - - sampling_params = {"temperature": 0.0} - sampling_params.update(additional_sampling_params) - completed_requests = [] - iter = 0 - pbar = tqdm(total=max_num_completed_requests) - while ( - time.monotonic() - start_time < test_timeout_s - and num_completed_requests < max_num_completed_requests - ): - iter += 1 - rnd_number = random.randint(0, MAX_RANDOM_NUMBER) - rnd_num_words = num2words.num2words(rnd_number) - - prompt = f"Convert the following sequence of words into a number: {rnd_num_words}.\nPrint the number first." - - request_config = RequestConfig( - model=model, - prompt=(prompt, 0), - sampling_params=sampling_params, - metadata={"rnd_number": rnd_number}, - llm_api=llm_api, - ) - req_launcher.launch_requests(request_config) - - if not (iter % num_concurrent_requests): - completed_requests.extend(req_launcher.get_next_ready()) - pbar.update(len(completed_requests) - num_completed_requests) - num_completed_requests = len(completed_requests) - - pbar.close() - end_time = time.monotonic() - if end_time - start_time >= test_timeout_s: - print("Test timed out before all requests could be completed.") - - raw_results = [] - - print("Mismatched and errored requests.") - for out in completed_requests: - metrics, generated_text, completed_request_config = out - - raw_results.append( - { - "metrics": metrics, - "generated_text": generated_text, - "request_config": dict(completed_request_config), - } - ) - - # if there were no errors when making request. 
- if not metrics[common_metrics.ERROR_CODE]: - try: - commas_between_numbers_re = r"(\d+),(?=\d)" - gen_text_commas_removed = re.sub( - commas_between_numbers_re, r"\1", generated_text - ) - nums = re.findall(r"\d+", gen_text_commas_removed) - generated_text = gen_text_commas_removed.replace("\n", " ") - - assert str(completed_request_config.metadata["rnd_number"]) in nums - except: - num_mismatched_requests += 1 - print( - f" mismatched request: {generated_text}, expected: {completed_request_config.metadata['rnd_number']}" - ) - else: - num_errored_requests += 1 - print( - f" The request errored: {metrics[common_metrics.ERROR_CODE]}, " - f"{metrics[common_metrics.ERROR_MSG]} " - ) - print() - - error_rate = num_errored_requests / num_completed_requests - mismatch_rate = num_mismatched_requests / num_completed_requests - num_non_errored_requests = num_completed_requests - num_errored_requests - summary_metrics = {} - summary_metrics[common_metrics.NUM_ERRORS] = num_errored_requests - summary_metrics["num_mismatched_requests"] = num_mismatched_requests - summary_metrics["error_rate"] = error_rate - summary_metrics["mismatch_rate"] = mismatch_rate - summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests - summary_metrics["num_non_errored_requests"] = num_non_errored_requests - - # Metadata - summary_metrics["model"] = model - summary_metrics["num_concurrent_requests"] = num_concurrent_requests - summary_metrics["additional_sampling_params"] = additional_sampling_params - summary_metrics["llm_api"] = llm_api - - return summary_metrics, raw_results - - -def run( - llm_api: str, - model: str, - test_timeout_s: int, - max_num_completed_requests: int, - num_concurrent_requests: int, - additional_sampling_params: str, - results_dir: str, - user_metadata: Dict[str, str], -): - """ - Args: - llm_api: The type of request to make. Either "chat" or "litellm". - model: The name of the model to query. - max_num_completed_requests: The number of requests to complete before finishing the test. - test_timeout_s: The amount of time to run the test for before reporting results. - num_concurrent_requests: The number of concurrent requests to make. Increase - this to increase the amount of load and vice versa. - mean_input_tokens: The mean number of tokens to send in the prompt for the request. - stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request. - mean_output_tokens: The mean number of tokens to generate per request. - stddev_output_tokens: The standard deviation of the number of tokens to generate per request. - additional_sampling_params: Additional sampling parameters to send with the request. - For more information see the LLM APIs documentation for the completions. - results_dir: The directory to save the results to. - - """ - - summary_metrics, raw_results = llm_correctness( - model=model, - llm_api=llm_api, - test_timeout_s=test_timeout_s, - max_num_completed_requests=max_num_completed_requests, - num_concurrent_requests=num_concurrent_requests, - additional_sampling_params=json.loads(additional_sampling_params), - ) - - time.sleep(2) - - print( - f"Results for llm correctness test for {model} queried with the {llm_api} api." 
- ) - print( - f"Errors: {summary_metrics[common_metrics.NUM_ERRORS]}, " - f"Error rate: {summary_metrics['error_rate']}" - ) - - print( - f"Mismatched: {summary_metrics['num_mismatched_requests']}, " - f"Mismatch rate: {summary_metrics['mismatch_rate']}" - ) - print(f"Completed: {summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS]}") - print(f"Completed without errors: {summary_metrics['num_non_errored_requests']}") - - if results_dir: - file_name = f"{model}_correctness" - file_name = re.sub(r"[^\w\d-]+", "-", file_name) - file_name = re.sub(r"-{2,}", "-", file_name) - summary_file_name = f"{file_name}_summary" - individual_responses_filename = f"{file_name}_individual_responses" - summary_metrics.update(user_metadata) - results = LLMPerfResults(name=summary_file_name, metadata=summary_metrics) - results_dir = Path(results_dir) - if not results_dir.exists(): - results_dir.mkdir(parents=True) - elif not results_dir.is_dir(): - raise ValueError(f"{results_dir} is not a directory") - with open(results_dir / f"{summary_file_name}.json", "w") as f: - json.dump(results.to_dict(), f, indent=4) - with open(results_dir / f"{individual_responses_filename}.json", "w") as f: - json.dump(raw_results, f, indent=4) - - -args = argparse.ArgumentParser(description="Run a correctness test for a given model.") - -args.add_argument( - "--model", type=str, required=True, help="The model to use for this load test." -) -args.add_argument( - "--num-concurrent-requests", - type=int, - default=10, - help=("The number of concurrent requests to send. (default: %(default)s)"), -) -args.add_argument( - "--timeout", - type=int, - default=90, - help="The amount of time to run the load test for. (default: %(default)s)", -) -args.add_argument( - "--max-num-completed-requests", - type=int, - default=50, - help=( - "The number of requests to complete before finishing the test. Note " - "that its possible for the test to timeout first. (default: %(default)s)" - ), -) -args.add_argument( - "--additional-sampling-params", - type=str, - default="{}", - help=( - "Additional sampling params to send with the each request to the LLM API. " - "(default: %(default)s) No additional sampling params are sent." - ), -) -args.add_argument( - "--results-dir", - type=str, - default="", - help=( - "The directory to save the results to. " - "(`default: %(default)s`) No results are saved)" - ), -) -args.add_argument( - "--llm-api", - type=str, - default="openai", - help=( - f"The type of request to make. The supported llm apis are {SUPPORTED_APIS} " - " (`default: %(default)s`)" - ), -) -args.add_argument( - "--metadata", - type=str, - default="", - help=( - "A comma separated list of metadata to include in the results, e.g. " - "name=foo,bar=1. These will be added to the metadata field of the results. " - ), -) - -if __name__ == "__main__": - args = args.parse_args() - - env_vars = dict(os.environ) - ray.init(runtime_env={"env_vars": env_vars}) - # Parse user metadata. 
- user_metadata = {} - if args.metadata: - for item in args.metadata.split(","): - key, value = item.split("=") - user_metadata[key] = value - - run( - llm_api=args.llm_api, - model=args.model, - test_timeout_s=args.timeout, - max_num_completed_requests=args.max_num_completed_requests, - num_concurrent_requests=args.num_concurrent_requests, - additional_sampling_params=args.additional_sampling_params, - results_dir=args.results_dir, - user_metadata=user_metadata, - ) diff --git a/src/llmperf/common.py b/src/llmperf/common.py index cbc864a..27ab3ca 100644 --- a/src/llmperf/common.py +++ b/src/llmperf/common.py @@ -1,15 +1,15 @@ -from typing import List, Callable +from typing import Any, Dict, List, Callable +from llmperf.launcher import RequestsLauncher +from llmperf.launcher.wait_for_all import WaitForAllLauncher +from llmperf.launcher.wait_for_any import WaitForAnyLauncher from llmperf.ray_clients.furiosa_client import FuriosaLLMClient -from llmperf.ray_clients.litellm_client import LiteLLMClient from llmperf.ray_clients.openai_chat_completions_client import ( OpenAIChatCompletionsClient, ) -from llmperf.ray_clients.sagemaker_client import SageMakerClient -from llmperf.ray_clients.vertexai_client import VertexAIClient -from llmperf.ray_llm_client import LLMClient +from llmperf.ray_clients import LLMClient -SUPPORTED_APIS = ["openai", "anthropic", "litellm"] +SUPPORTED_APIS = ["openai", "furiosa"] def construct_clients( @@ -30,17 +30,38 @@ def construct_clients( OpenAIChatCompletionsClient.remote(get_token_len) for _ in range(num_clients) ] - elif llm_api == "sagemaker": - clients = [SageMakerClient.remote() for _ in range(num_clients)] - elif llm_api == "vertexai": - clients = [VertexAIClient.remote() for _ in range(num_clients)] elif llm_api == "furiosa": clients = [FuriosaLLMClient.remote(get_token_len) for _ in range(num_clients)] - elif llm_api in SUPPORTED_APIS: - clients = [LiteLLMClient.remote(get_token_len) for _ in range(num_clients)] else: raise ValueError( f"llm_api must be one of the supported LLM APIs: {SUPPORTED_APIS}" ) return clients + + +def construct_launcher( + wait_for: str, + model: str, + clients: List[LLMClient], + additional_sampling_params: Dict[str, Any], +) -> RequestsLauncher: + """Construct RequestsLauncher that will send requests with a specific pattern. + + Args: + wait_for: The name of pattern. WaitForAll launcher + model: The name of the model to query. + clients: The list of LLMClients. + additional_sampling_params: Additional sampling parameters to send with the request. 
+ For more information see the LLM APIs documentation for the completions + + Returns: + The constructed RequesstLauncher + + """ + if wait_for == "all": + return WaitForAllLauncher(model, clients, additional_sampling_params) + elif wait_for == "any": + return WaitForAnyLauncher(model, clients, additional_sampling_params) + else: + raise ValueError(f"Wrong type for 'wait_for' option: {wait_for}") diff --git a/src/llmperf/datasets/__init__.py b/src/llmperf/datasets/__init__.py index 575fc6e..1091de1 100644 --- a/src/llmperf/datasets/__init__.py +++ b/src/llmperf/datasets/__init__.py @@ -1,8 +1,5 @@ # TODO: Generalize dataset loading method using abstraction class like LLMClient from typing import Tuple -from llmperf.datasets.gpqa import randomly_sample_gpqa_prompt -from llmperf.datasets.sonnet import randomly_sample_sonnet_lines_prompt -from llmperf.datasets.translation import randomly_sample_translation_prompt def randomly_sample_prompt( @@ -13,14 +10,20 @@ def randomly_sample_prompt( get_token_len, ) -> Tuple[str, int]: if dataset == "sonnet": + from llmperf.datasets.sonnet import randomly_sample_sonnet_lines_prompt + f = randomly_sample_sonnet_lines_prompt elif dataset == "human-eval": from llmperf.datasets.human_eval import randomly_sample_human_eval_prompt f = randomly_sample_human_eval_prompt elif dataset == "gpqa": + from llmperf.datasets.gpqa import randomly_sample_gpqa_prompt + f = randomly_sample_gpqa_prompt elif dataset == "translation": + from llmperf.datasets.translation import randomly_sample_translation_prompt + f = randomly_sample_translation_prompt else: raise ValueError(f"Not supported dataset {dataset}") diff --git a/src/llmperf/datasets/gpqa.py b/src/llmperf/datasets/gpqa.py index 2008861..7f2d8d3 100644 --- a/src/llmperf/datasets/gpqa.py +++ b/src/llmperf/datasets/gpqa.py @@ -71,12 +71,4 @@ def randomly_sample_gpqa_prompt( while num_prompt_tokens < get_token_len(prompt): prompt = random.choice(prompts) - # padding - # pad_token_num = 0 - # remaining_prompt_tokens = num_prompt_tokens - get_token_len(prompt) - # while remaining_prompt_tokens > 0: - # pad_token_num += 1 - # remaining_prompt_tokens -= get_token_len(tokenizer.pad_token * pad_token_num) - # prompt += tokenizer.pad_token * (pad_token_num - 1) - return [prompt, get_token_len(prompt)] diff --git a/src/llmperf/datasets/human_eval.py b/src/llmperf/datasets/human_eval.py index e9b935e..80b290b 100644 --- a/src/llmperf/datasets/human_eval.py +++ b/src/llmperf/datasets/human_eval.py @@ -30,12 +30,4 @@ def randomly_sample_human_eval_prompt( task_id = random.choice(task_ids) prompt += problems[task_id]["prompt"] - # padding - # remaining_prompt_tokens -= get_token_length(prompt) - # pad_token_num = 0 - # while remaining_prompt_tokens > 0: - # pad_token_num += 1 - # remaining_prompt_tokens -= get_token_length(tokenizer.pad_token * pad_token_num) - # prompt += tokenizer.pad_token * (pad_token_num - 1) - return [prompt, get_token_len(prompt)] diff --git a/src/llmperf/launcher/__init__.py b/src/llmperf/launcher/__init__.py index 1e49ec6..dceb0ce 100644 --- a/src/llmperf/launcher/__init__.py +++ b/src/llmperf/launcher/__init__.py @@ -2,7 +2,7 @@ from typing import Any, List, Dict, Tuple from llmperf.models import RequestConfig -from llmperf.ray_llm_client import LLMClient +from llmperf.ray_clients import LLMClient from ray.util import ActorPool diff --git a/src/llmperf/launcher/wait_for_all.py b/src/llmperf/launcher/wait_for_all.py index d628c9d..5950af3 100644 --- a/src/llmperf/launcher/wait_for_all.py +++ 
b/src/llmperf/launcher/wait_for_all.py @@ -6,6 +6,10 @@ class WaitForAllLauncher(RequestsLauncher): + """RequestsLauncher that waits for all sended request before send next requests. + + The WaitForAll launcher sends the next n requests only when all n requests sent simultaneously are completed. + """ def launch( self, diff --git a/src/llmperf/launcher/wait_for_any.py b/src/llmperf/launcher/wait_for_any.py index de363d5..964d8ae 100644 --- a/src/llmperf/launcher/wait_for_any.py +++ b/src/llmperf/launcher/wait_for_any.py @@ -6,6 +6,11 @@ class WaitForAnyLauncher(RequestsLauncher): + """RequestsLauncher that waits for any sended request before send a next request. + + The WaitForAny launcher sends a next request when any of requests sent are completed. + It keeps the number of concurrently processing requests at n. + """ def launch( self, @@ -17,16 +22,13 @@ def launch( pbar = tqdm(total=len(prompts)) start_time = time.monotonic() - for i, (prompt, num_output_tokens) in enumerate( - zip(prompts, num_output_tokens_list) - ): + for (prompt, num_output_tokens) in zip(prompts, num_output_tokens_list): default_sampling_params = {"max_tokens": num_output_tokens} default_sampling_params.update(self._additional_sampling_params) request_config = RequestConfig( model=self._model, prompt=prompt, sampling_params=default_sampling_params, - sleep=i, ) # Do not care about is there any idle actor. # If there is no idle actor, it will be added at pending request diff --git a/src/llmperf/models.py b/src/llmperf/models.py index be0d7ea..505e690 100644 --- a/src/llmperf/models.py +++ b/src/llmperf/models.py @@ -16,6 +16,7 @@ class RequestConfig(BaseModel): model: str prompt: Tuple[str, int] + system_prompt: Optional[str] = None sampling_params: Optional[Dict[str, Any]] = None llm_api: Optional[str] = None metadata: Optional[Dict[str, Any]] = None diff --git a/src/llmperf/ray_clients/__init__.py b/src/llmperf/ray_clients/__init__.py index e69de29..fecec66 100644 --- a/src/llmperf/ray_clients/__init__.py +++ b/src/llmperf/ray_clients/__init__.py @@ -0,0 +1,24 @@ +import abc +from typing import Any, Dict, Tuple + +from llmperf.models import RequestConfig + + +class LLMClient: + """A client for making requests to a LLM API e.g Anyscale Endpoints.""" + + get_token_len = None + + @abc.abstractmethod + def llm_request( + self, request_config: RequestConfig + ) -> Tuple[Dict[str, Any], str, RequestConfig]: + """Make a single completion request to a LLM API + + Returns: + Metrics about the performance charateristics of the request. + The text generated by the request to the LLM API. + The request_config used to make the request. This is mainly for logging purposes. + + """ + ... diff --git a/src/llmperf/ray_clients/furiosa_client.py b/src/llmperf/ray_clients/furiosa_client.py index 936e057..2724211 100644 --- a/src/llmperf/ray_clients/furiosa_client.py +++ b/src/llmperf/ray_clients/furiosa_client.py @@ -5,11 +5,13 @@ from llmperf.ray_clients.openai_chat_completions_client import ( OpenAIChatCompletionsClient, ) -from llmperf.ray_llm_client import LLMClient +from llmperf.ray_clients import LLMClient import ray from ray.runtime_env import RuntimeEnv +FURIOSA_AI_DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant." 
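For context, a minimal sketch of how the new `system_prompt` field on `RequestConfig` flows into the chat payload; the values below are placeholders, and the message construction mirrors what `OpenAIChatCompletionsClient` does with the field.

```python
from llmperf.models import RequestConfig

# Placeholder values, for illustration only.
request_config = RequestConfig(
    model="meta-llama/Llama-3.2-1B-Instruct",
    prompt=("Translate 'hello' into French.", 9),  # (prompt text, prompt token count)
    system_prompt="You are a helpful assistant.",  # new optional field, defaults to None
    sampling_params={"max_tokens": 128, "temperature": 0.0},
)

# OpenAIChatCompletionsClient builds its messages from the config:
messages = [
    {"role": "system", "content": request_config.system_prompt},
    {"role": "user", "content": request_config.prompt[0]},
]
```

Note that `FuriosaLLMClient` overwrites this field with `FURIOSA_AI_DEFAULT_SYSTEM_PROMPT` before delegating to the OpenAI client, so that default applies to every Furiosa request.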
+ @ray.remote class FuriosaLLMClient(LLMClient): """Client for FuriosaAI LLM Completion API.""" @@ -33,9 +35,10 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: os.environ["OPENAI_API_BASE"] = address os.environ["OPENAI_API_KEY"] = key - # Use greedy search + # Use greedy search as default if "temperature" not in request_config.sampling_params: request_config.sampling_params["temperature"] = 0.0 + request_config.system_prompt = FURIOSA_AI_DEFAULT_SYSTEM_PROMPT actor = OpenAIChatCompletionsClient.options( runtime_env=RuntimeEnv(env_vars=dict(os.environ)) diff --git a/src/llmperf/ray_clients/litellm_client.py b/src/llmperf/ray_clients/litellm_client.py deleted file mode 100644 index a933965..0000000 --- a/src/llmperf/ray_clients/litellm_client.py +++ /dev/null @@ -1,103 +0,0 @@ -import time -from typing import Any, Dict -import ray - -from llmperf.ray_llm_client import LLMClient -from llmperf.models import RequestConfig -from llmperf import common_metrics - - -@ray.remote -class LiteLLMClient(LLMClient): - """Client for LiteLLM Completions API.""" - - def __init__(self, get_token_len): - self.get_token_len = get_token_len - - def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: - # litellm package isn't serializable, so we import it within the function - # to maintain compatibility with ray. - from litellm import completion, validate_environment - - prompt = request_config.prompt - prompt, prompt_len = prompt - - message = [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ] - assert ( - request_config.llm_api is not None - ), "the request config's llm_api must be set." - if request_config.llm_api == "litellm": - model = request_config.model - else: - model = request_config.llm_api + "/" + request_config.model - validation_result = validate_environment(model) - if validation_result["missing_keys"]: - raise ValueError( - f"The following environment vars weren't found but were necessary for " - f"the model {request_config.model}: {validation_result['missing_keys']}" - ) - body = { - "model": model, - "messages": message, - "stream": True, - } - sampling_params = request_config.sampling_params - body.update(sampling_params or {}) - - time_to_next_token = [] - tokens_received = 0 - ttft = 0 - error_response_code = -1 - generated_text = "" - error_msg = "" - output_throughput = 0 - total_request_time = 0 - - metrics = {} - - metrics[common_metrics.ERROR_CODE] = None - metrics[common_metrics.ERROR_MSG] = "" - - try: - start_time = time.monotonic() - most_recent_received_token_time = time.monotonic() - - response = completion(**body) - ttft = 0 - for tok in response: - if tok.choices[0].delta: - delta = tok.choices[0].delta - if delta.get("content", None): - if ttft == 0: - ttft = time.monotonic() - start_time - time_to_next_token.append(ttft) - else: - time_to_next_token.append( - time.monotonic() - most_recent_received_token_time - ) - generated_text += delta["content"] - most_recent_received_token_time = time.monotonic() - tokens_received += 1 - - total_request_time = time.monotonic() - start_time - - output_throughput = tokens_received / total_request_time - - except Exception as e: - metrics[common_metrics.ERROR_MSG] = error_msg - metrics[common_metrics.ERROR_CODE] = error_response_code - - print(f"Warning Or Error: {e}") - print(error_response_code) - - metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) - metrics[common_metrics.TTFT] = ttft - metrics[common_metrics.E2E_LAT] = total_request_time - 
metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput - metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len - metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received - metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len - return metrics, generated_text, request_config diff --git a/src/llmperf/ray_clients/openai_chat_completions_client.py b/src/llmperf/ray_clients/openai_chat_completions_client.py index 8e513e8..ab74113 100644 --- a/src/llmperf/ray_clients/openai_chat_completions_client.py +++ b/src/llmperf/ray_clients/openai_chat_completions_client.py @@ -6,15 +6,11 @@ import ray import requests -from llmperf.ray_llm_client import LLMClient +from llmperf.ray_clients import LLMClient from llmperf.models import RequestConfig from llmperf import common_metrics -# Copy from AA's sample code -OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant." - - @ray.remote class OpenAIChatCompletionsClient(LLMClient): """Client for OpenAI Chat Completions API.""" @@ -25,9 +21,10 @@ def __init__(self, get_token_len): def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: prompt = request_config.prompt prompt, prompt_len = prompt + system_prompt = request_config.system_prompt message = [ - {"role": "system", "content": OPENAI_SYSTEM_MESSAGE_API}, + {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}, ] model = request_config.model diff --git a/src/llmperf/ray_clients/sagemaker_client.py b/src/llmperf/ray_clients/sagemaker_client.py deleted file mode 100644 index 95a18bc..0000000 --- a/src/llmperf/ray_clients/sagemaker_client.py +++ /dev/null @@ -1,159 +0,0 @@ -import io -import json -import os -import time -from typing import Any, Dict - -import boto3 -import ray -from transformers import LlamaTokenizerFast - -from llmperf.ray_llm_client import LLMClient -from llmperf.models import RequestConfig -from llmperf import common_metrics - - -@ray.remote -class SageMakerClient(LLMClient): - """Client for OpenAI Chat Completions API.""" - - def __init__(self): - # Sagemaker doesn't return the number of tokens that are generated so we approximate it by - # using the llama tokenizer. 
- self.tokenizer = LlamaTokenizerFast.from_pretrained( - "hf-internal-testing/llama-tokenizer" - ) - self.get_token_len = lambda text: len(self.tokenizer.encode(text)) - - def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: - if not os.environ.get("AWS_ACCESS_KEY_ID"): - raise ValueError("AWS_ACCESS_KEY_ID must be set.") - if not os.environ.get("AWS_SECRET_ACCESS_KEY"): - raise ValueError("AWS_SECRET_ACCESS_KEY must be set.") - if not os.environ.get("AWS_REGION_NAME"): - raise ValueError("AWS_REGION_NAME must be set.") - - prompt = request_config.prompt - prompt, prompt_len = prompt - - message = [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ] - model = request_config.model - sm_runtime = boto3.client( - "sagemaker-runtime", region_name=os.environ.get("AWS_REGION_NAME") - ) - - sampling_params = request_config.sampling_params - - if "max_tokens" in sampling_params: - sampling_params["max_new_tokens"] = sampling_params["max_tokens"] - del sampling_params["max_tokens"] - - message = { - "inputs": [ - [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ] - ], - "parameters": { - **request_config.sampling_params, - }, - } - - time_to_next_token = [] - tokens_received = 0 - ttft = 0 - error_response_code = None - generated_text = "" - error_msg = "" - output_throughput = 0 - total_request_time = 0 - metrics = {} - - start_time = time.monotonic() - most_recent_received_token_time = time.monotonic() - - try: - response = sm_runtime.invoke_endpoint_with_response_stream( - EndpointName=model, - ContentType="application/json", - Body=json.dumps(message), - CustomAttributes="accept_eula=true", - ) - - event_stream = response["Body"] - json_byte = b"" - for line, ttft, _ in LineIterator(event_stream): - json_byte += line - time_to_next_token.append( - time.monotonic() - most_recent_received_token_time - ) - most_recent_received_token_time = time.monotonic() - ttft = ttft - start_time - resp = json.loads(json_byte) - total_request_time = time.monotonic() - start_time - generated_text = resp[0]["generation"]["content"] - tokens_received = self.get_token_len(generated_text) - output_throughput = tokens_received / total_request_time - - except Exception as e: - print(f"Warning Or Error: {e}") - print(error_response_code) - error_msg = str(e) - error_response_code = 500 - - metrics[common_metrics.ERROR_MSG] = error_msg - metrics[common_metrics.ERROR_CODE] = error_response_code - metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token - metrics[common_metrics.TTFT] = ttft - metrics[common_metrics.E2E_LAT] = total_request_time - metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput - metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len - metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received - metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len - - return metrics, generated_text, request_config - - -class LineIterator: - """ - A helper class for parsing the byte stream input. 
- Reference: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/ - """ - - def __init__(self, stream): - self.byte_iterator = iter(stream) - self.buffer = io.BytesIO() - self.read_pos = 0 - self.ttft = 0 - - def __iter__(self): - return self - - def __next__(self): - while True: - self.buffer.seek(self.read_pos) - line = self.buffer.readline() - if line and line[-1] == ord("\n"): - if self.ttft == 0: - self.ttft = time.monotonic() - self.read_pos += len(line) - return line[:-1], self.ttft, time.monotonic() - # kyle: dealing with last ']' for chat output - if line and self.read_pos == self.buffer.getbuffer().nbytes - 1: - self.read_pos += 1 - return line, self.ttft, time.monotonic() - try: - chunk = next(self.byte_iterator) - except StopIteration: - if self.read_pos < self.buffer.getbuffer().nbytes: - continue - raise - if "PayloadPart" not in chunk: - print("Unknown event type:" + chunk) - continue - self.buffer.seek(0, io.SEEK_END) - self.buffer.write(chunk["PayloadPart"]["Bytes"]) diff --git a/src/llmperf/ray_clients/vertexai_client.py b/src/llmperf/ray_clients/vertexai_client.py deleted file mode 100644 index 146a51d..0000000 --- a/src/llmperf/ray_clients/vertexai_client.py +++ /dev/null @@ -1,136 +0,0 @@ -import json -import os -import time -from typing import Any, Dict - -import ray -import requests -from transformers import LlamaTokenizerFast - -from llmperf.ray_llm_client import LLMClient -from llmperf.models import RequestConfig -from llmperf import common_metrics - - -@ray.remote -class VertexAIClient(LLMClient): - """Client for VertexAI API.""" - - def __init__(self): - # VertexAI doesn't return the number of tokens that are generated so we approximate it by - # using the llama tokenizer. - self.tokenizer = LlamaTokenizerFast.from_pretrained( - "hf-internal-testing/llama-tokenizer" - ) - self.get_token_len = lambda text: len(self.tokenizer.encode(text)) - - def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: - project_id = os.environ.get("GCLOUD_PROJECT_ID") - region = os.environ.get("GCLOUD_REGION") - endpoint_id = os.environ.get("VERTEXAI_ENDPOINT_ID") - access_token = os.environ.get("GCLOUD_ACCESS_TOKEN").strip() - if not project_id: - raise ValueError("the environment variable GCLOUD_PROJECT_ID must be set.") - if not region: - raise ValueError("the environment variable GCLOUD_REGION must be set.") - if not endpoint_id: - raise ValueError( - "the environment variable VERTEXAI_ENDPOINT_ID must be set." - ) - if not access_token: - raise ValueError( - "the environment variable GCLOUD_ACCESS_TOKEN must be set." 
- ) - prompt = request_config.prompt - prompt, prompt_len = prompt - - time_to_next_token = [] - tokens_received = 0 - ttft = 0 - generated_text = "" - output_throughput = 0 - total_request_time = 0 - - metrics = {} - - metrics[common_metrics.ERROR_CODE] = None - metrics[common_metrics.ERROR_MSG] = "" - - try: - # Define the URL for the request - url = ( - f"https://{region}-aiplatform.googleapis.com/v1/projects/" - f"{project_id}/locations/{region}/endpoints/{endpoint_id}:predict" - ) - - # Define the headers - headers = { - "Authorization": f"Bearer {access_token}", - "Content-Type": "application/json", - } - - sampling_params = request_config.sampling_params - if "max_new_tokens" in sampling_params: - sampling_params["maxOutputTokens"] = sampling_params.pop( - "max_new_tokens" - ) - - # Define the data payload - data = {"instances": [{"prompt": prompt}], "parameters": sampling_params} - - # Make the POST request - start_time = time.monotonic() - response = requests.post(url, headers=headers, data=json.dumps(data)) - total_request_time = time.monotonic() - start_time - response_code = response.status_code - response.raise_for_status() - # output from the endpoint is in the form: - # {"predictions": ["Input: ... \nOutput:\n ..."]} - generated_text = response.json()["predictions"][0].split("\nOutput:\n")[1] - tokens_received = self.get_token_len(generated_text) - ttft = -1 - output_throughput = tokens_received / total_request_time - time_to_next_token = [ - total_request_time / tokens_received for _ in range(tokens_received) - ] - - except Exception as e: - metrics[common_metrics.ERROR_MSG] = str(e) - metrics[common_metrics.ERROR_CODE] = response_code - print(f"Warning Or Error: {e}") - print(response_code) - print(response_code) - - metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token - metrics[common_metrics.TTFT] = ttft - metrics[common_metrics.E2E_LAT] = total_request_time - metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput - metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len - metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received - metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len - - return metrics, generated_text, request_config - - -if __name__ == "__main__": - # Run these before hand: - - # gcloud auth application-default login - # gcloud config set project YOUR_PROJECT_ID - # export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) - # export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID - # export GCLOUD_REGION=YOUR_REGION - # export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID - - client = VertexAIClient.remote() - request_config = RequestConfig( - prompt=("Give me ten interview questions for the role of program manager.", 10), - model="gpt3", - sampling_params={ - "temperature": 0.2, - "max_new_tokens": 256, - "top_k": 40, - "top_p": 0.95, - }, - ) - ray.get(client.llm_request.remote(request_config)) diff --git a/src/llmperf/ray_llm_client.py b/src/llmperf/ray_llm_client.py deleted file mode 100644 index fecec66..0000000 --- a/src/llmperf/ray_llm_client.py +++ /dev/null @@ -1,24 +0,0 @@ -import abc -from typing import Any, Dict, Tuple - -from llmperf.models import RequestConfig - - -class LLMClient: - """A client for making requests to a LLM API e.g Anyscale Endpoints.""" - - get_token_len = None - - @abc.abstractmethod - def llm_request( - self, request_config: RequestConfig - ) -> Tuple[Dict[str, Any], str, RequestConfig]: - """Make a single completion request to a LLM API - - Returns: - Metrics about the performance charateristics of 
the request. - The text generated by the request to the LLM API. - The request_config used to make the request. This is mainly for logging purposes. - - """ - ...
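To make the "Implementing New LLM Clients" section of the README concrete, here is a minimal, hypothetical client that satisfies the `LLMClient` interface shown above; the echo behaviour and the particular metrics filled in are illustrative only, not part of this change.

```python
# A minimal sketch of a custom client. The metric names mirror
# llmperf.common_metrics; the "echo" behaviour is purely illustrative.
import time
from typing import Any, Dict, Tuple

import ray

from llmperf import common_metrics
from llmperf.models import RequestConfig
from llmperf.ray_clients import LLMClient


@ray.remote
class EchoClient(LLMClient):
    """Toy client that 'generates' the prompt back, useful for wiring tests."""

    def __init__(self, get_token_len):
        self.get_token_len = get_token_len

    def llm_request(
        self, request_config: RequestConfig
    ) -> Tuple[Dict[str, Any], str, RequestConfig]:
        prompt, prompt_len = request_config.prompt

        start = time.monotonic()
        generated_text = prompt  # a real client would call its API here
        total_request_time = time.monotonic() - start
        tokens_received = self.get_token_len(generated_text)

        metrics = {
            common_metrics.ERROR_CODE: None,
            common_metrics.ERROR_MSG: "",
            common_metrics.TTFT: total_request_time,
            common_metrics.E2E_LAT: total_request_time,
            common_metrics.NUM_INPUT_TOKENS: prompt_len,
            common_metrics.NUM_OUTPUT_TOKENS: tokens_received,
            common_metrics.NUM_TOTAL_TOKENS: prompt_len + tokens_received,
        }
        return metrics, generated_text, request_config
```

A client like this can then be handed to the launchers in the same way as the built-in clients, e.g. `clients = [EchoClient.remote(get_token_len)]`.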