Skip to content

Commit

Permalink
Merge branch 'master' into olruwase/deepnvme
Browse files Browse the repository at this point in the history
  • Loading branch information
tjruwase authored Aug 21, 2024
2 parents 0f4db6f + 9563904 commit cfd800c
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
from .dummy_client import DummyClientConfig, DummyClient
from .fastgen_client import FastGenClientConfig, FastGenClient
from .vllm_client import vLLMClientConfig, vLLMClient
from .openai_client import openaiClientConfig, openaiClient

client_config_classes = {
"dummy": DummyClientConfig,
"azure_ml": AzureMLClientConfig,
"fastgen": FastGenClientConfig,
"vllm": vLLMClientConfig,
"openai": openaiClientConfig
}
client_classes = {
"dummy": DummyClient,
"azure_ml": AzureMLClient,
"fastgen": FastGenClient,
"vllm": vLLMClient,
"openai": openaiClient,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import json
import requests
import subprocess
import time
from typing import Any, Dict

from loguru import logger
from pydantic import Field

from .base import BaseClient
from ..config import BaseConfigModel
from ..prompt import Prompt


# client to test any openai API
class openaiClientConfig(BaseConfigModel):
model: str = Field(..., description="HuggingFace.co model name")
url: str = "http://127.0.0.1:26500/v1/completions"


class openaiClient(BaseClient):
def __init__(self, config: openaiClientConfig):
super().__init__(config)

def start_service(self) -> None:
pass

def stop_service(self) -> None:
pass

def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
api_url = self.config.url
headers = {
"User-Agent": "Benchmark Client",
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
pload = {
"prompt": prompt.text,
"model": self.config.model,
"n": 1,
"use_beam_search": False,
"temperature": 1.0,
"top_p": 0.9,
"max_tokens": prompt.max_new_tokens,
"ignore_eos": False,
}
return {"url": api_url, "headers": headers, "json": pload, "timeout": 180}

def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
response = requests.post(**request_kwargs)
output = json.loads(response.content)
return output

def process_response(self, raw_response: Any) -> str:
return raw_response["choices"][0]["text"]
76 changes: 75 additions & 1 deletion benchmarks/inference/mii/src/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,80 @@ def get_response(response: requests.Response) -> List[str]:
)


# client talks with openai api
def call_openai(
input_tokens: str, max_new_tokens: int, args: argparse.Namespace
) -> ResponseDetails:

api_url = args.openai_api_url
headers = {
"User-Agent": "Benchmark Client",
"Content-Type": "application/json",
"Authorization": f"Bearer {args.openai_api_key}"
}

pload = {
"prompt": input_tokens,
"model": args.model,
"n": 1,
"use_beam_search": False,
"temperature": 1.0,
"top_p": 0.9,
"max_tokens": max_new_tokens,
"ignore_eos": False,
"stream": args.stream,
}

def clear_line(n: int = 1) -> None:
LINE_UP = "\033[1A"
LINE_CLEAR = "\x1b[2K"
for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True)

def get_streaming_response(
response: requests.Response, time_last_token
) -> Iterable[List[str]]:
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"data:"
):
if chunk:
plain=chunk.decode("utf-8")
if plain.strip() == "[DONE]":
continue
data = json.loads(plain)
output = data["choices"][0]["text"]
time_now = time.time()
yield output, time_now - time_last_token
time_last_token = time_now

# For non-streaming, but currently non-streaming is not fully implemented
def get_response(response: requests.Response) -> List[str]:
data = json.loads(response.content)
output = data["choices"][0]["text"]
return output

token_gen_time = []
start_time = time.time()
#response = requests.post(api_url, headers=headers, json=pload, stream=False)
response = requests.post(api_url, headers=headers, json=pload, stream=args.stream)
if args.stream:
output = ""
for h, t in get_streaming_response(response, start_time):
output += h
token_gen_time.append(t)
else:
output = get_response(response)

return ResponseDetails(
generated_tokens=output,
prompt=input_tokens,
start_time=start_time,
end_time=time.time(),
model_time=0,
token_gen_time=token_gen_time,
)


def call_aml(
input_tokens: str,
max_new_tokens: int,
Expand Down Expand Up @@ -205,7 +279,7 @@ def _run_parallel(
event_loop = asyncio.new_event_loop()
asyncio.set_event_loop(event_loop)

backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml}
backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml, "openai": call_openai}
call_fn = backend_call_fns[args.backend]

barrier.wait()
Expand Down
9 changes: 6 additions & 3 deletions benchmarks/inference/mii/src/plot_effective_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \
parser.add_argument("--backend", type=str, choices=["fastgen", "vllm", "openai"], default=["fastgen", "vllm"], \
nargs="+", help="Specify the backends to generate plots for")
parser.add_argument("--log_dir", type=Path, default="./results")
parser.add_argument("--model", type=str)
parser.add_argument("--out_dir", type=Path, default="./plots/goodtput")
parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second")
parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets")
Expand Down Expand Up @@ -76,7 +77,7 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span):


def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec ):
tokenizer = get_tokenizer()
tokenizer = get_tokenizer(args.model)
prompt_length = len(tokenizer.tokenize(response_detail.prompt))
prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec
if prompt_latency_SLA < response_detail.token_gen_time[0]:
Expand Down Expand Up @@ -137,7 +138,9 @@ def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen
]

plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\
'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}}
'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}, \
'openai': {'label': 'openai-API', 'marker': '+', 'color': 'red'}
}

for f in validate_funcs:
plt.figure()
Expand Down
11 changes: 7 additions & 4 deletions benchmarks/inference/mii/src/postprocess_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,13 @@ def parse_args():
return args


def get_tokenizer():
def get_tokenizer(model=None):
global tokenizer
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if model==None:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
else:
tokenizer = AutoTokenizer.from_pretrained(model)
return tokenizer


Expand All @@ -78,8 +81,8 @@ def get_summary(args, response_details):

tokens_per_sec = mean(
[
(len(get_tokenizer().tokenize(r.prompt)) +
len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
(len(get_tokenizer(args["model"]).tokenize(r.prompt)) +
len(get_tokenizer(args["model"]).tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
else len(r.generated_tokens))
/ (r.end_time - r.start_time)
for r in response_details
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/inference/mii/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def start_server(args: argparse.Namespace) -> None:
"fastgen": start_fastgen_server,
"vllm": start_vllm_server,
"aml": start_aml_server,
"openai": start_openai_server,
}
start_fn = start_server_fns[args.backend]
start_fn(args)
Expand Down Expand Up @@ -90,12 +91,16 @@ def start_aml_server(args: argparse.Namespace) -> None:
"AML server start not implemented. Please use Azure Portal to start the server."
)

def start_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

def stop_server(args: argparse.Namespace) -> None:
stop_server_fns = {
"fastgen": stop_fastgen_server,
"vllm": stop_vllm_server,
"aml": stop_aml_server,
"openai": stop_openai_server,
}
stop_fn = stop_server_fns[args.backend]
stop_fn(args)
Expand All @@ -118,6 +123,9 @@ def stop_aml_server(args: argparse.Namespace) -> None:
"AML server stop not implemented. Please use Azure Portal to stop the server."
)

def stop_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

if __name__ == "__main__":
args = parse_args(server_args=True)
Expand Down
14 changes: 13 additions & 1 deletion benchmarks/inference/mii/src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,18 @@ def parse_args(
default="./results/",
help="Directory to save result JSON files",
)
client_parser.add_argument(
"--openai_api_url",
type=str,
default=None,
help="When using the openai API backend, this is the API URL that points to an openai api server",
)
client_parser.add_argument(
"--openai_api_key",
type=str,
default=None,
help="When using the openai API backend, this is the API key for a given openai_api_url",
)
client_parser.add_argument(
"--aml_api_url",
type=str,
Expand Down Expand Up @@ -156,7 +168,7 @@ def parse_args(
parser.add_argument(
"--backend",
type=str,
choices=["aml", "fastgen", "vllm"],
choices=["aml", "fastgen", "vllm", "openai"],
default="fastgen",
help="Which backend to benchmark",
)
Expand Down

0 comments on commit cfd800c

Please sign in to comment.