From d60ddb09191d8672b41f7593423667d807c1b5e2 Mon Sep 17 00:00:00 2001
From: Ankith Gunapal
Date: Sat, 16 Mar 2024 11:31:49 -0700
Subject: [PATCH] TorchServe quickstart chatbot example (#3003)

* TorchServe quickstart chatbot example

* Added more details in Readme

* lint failure

* code cleanup

* review comments

---------

Co-authored-by: Mark Saroufim
---
 examples/LLM/llama2/chat_app/Readme.md        |  27 +++
 .../LLM/llama2/chat_app/docker/Dockerfile     |  26 +++
 .../LLM/llama2/chat_app/docker/build_image.sh |  28 +++
 .../LLM/llama2/chat_app/docker/client_app.py  | 118 ++++++++++
 .../llama2/chat_app/docker/config.properties  |   9 +
 .../chat_app/docker/dockerd-entrypoint.sh     |  81 +++++++
 .../chat_app/docker/llama_cpp_handler.py      |  67 ++++++
 .../llama2/chat_app/docker/requirements.txt   |   4 +
 .../chat_app/docker/torchserve_server_app.py  | 205 ++++++++++++++++++
 9 files changed, 565 insertions(+)
 create mode 100644 examples/LLM/llama2/chat_app/docker/Dockerfile
 create mode 100755 examples/LLM/llama2/chat_app/docker/build_image.sh
 create mode 100644 examples/LLM/llama2/chat_app/docker/client_app.py
 create mode 100644 examples/LLM/llama2/chat_app/docker/config.properties
 create mode 100755 examples/LLM/llama2/chat_app/docker/dockerd-entrypoint.sh
 create mode 100644 examples/LLM/llama2/chat_app/docker/llama_cpp_handler.py
 create mode 100644 examples/LLM/llama2/chat_app/docker/requirements.txt
 create mode 100644 examples/LLM/llama2/chat_app/docker/torchserve_server_app.py

diff --git a/examples/LLM/llama2/chat_app/Readme.md b/examples/LLM/llama2/chat_app/Readme.md
index 4684bd3132..b70adc21b6 100644
--- a/examples/LLM/llama2/chat_app/Readme.md
+++ b/examples/LLM/llama2/chat_app/Readme.md
@@ -9,6 +9,33 @@ We are using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) in this example
 
 You can run this example on your laptop to understand how to use TorchServe
 
+## Quick Start Guide
+
+To get started with TorchServe, run the following:
+
+```
+# 1: Set HF Token as Env variable
+export HUGGINGFACE_TOKEN=<your-HuggingFace-token> # get this from your HuggingFace account
+
+# 2: Build TorchServe Image for serving the llama2-7b model with 4-bit quantization
+./examples/LLM/llama2/chat_app/docker/build_image.sh meta-llama/Llama-2-7b-chat-hf
+
+# 3: Launch the streamlit apps for server & client
+docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v <model-store-path>:/home/model-server/model-store pytorch/torchserve:meta-llama---Llama-2-7b-chat-hf
+```
+In step 3, `<model-store-path>` is the location where you want the model to be downloaded.
+
+### What to expect
+This launches two streamlit apps:
+1. TorchServe Server app to start/stop TorchServe, load the model, scale workers up/down, and configure the dynamic batch_size (currently llama-cpp-python does not support batch_size > 1).
+   - Since this app is targeted at Apple M1/M2 laptops, we load a 4-bit quantized version of llama2 using llama-cpp-python.
+2. Client chat app where you can chat with the model. A slider lets you send concurrent requests to the model. The current app doesn't have a good mechanism to show multiple responses in parallel; you will see a streaming response for the first request followed by a complete response for the next request.
+
+Currently, this launches the llama2-7b model with 4-bit quantization, running on CPU.
+
+To make use of the M1/M2 GPU, you can follow the guide below to do a standalone TorchServe installation.
+
+
 ## Architecture
 
 ![Chatbot Architecture](./screenshots/architecture.png)
diff --git a/examples/LLM/llama2/chat_app/docker/Dockerfile b/examples/LLM/llama2/chat_app/docker/Dockerfile
new file mode 100644
index 0000000000..fd3435b903
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/Dockerfile
@@ -0,0 +1,26 @@
+ARG BASE_IMAGE=pytorch/torchserve:latest-gpu
+
+FROM $BASE_IMAGE as server
+ARG BASE_IMAGE
+ARG EXAMPLE_DIR
+ARG MODEL_NAME
+ARG HUGGINGFACE_TOKEN
+
+USER root
+
+ENV MODEL_NAME=$MODEL_NAME
+
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    apt-get update && \
+    apt-get install libopenmpi-dev git -y
+
+COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
+RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+COPY $EXAMPLE_DIR /home/model-server/chat_bot
+COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
+COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties
+
+WORKDIR /home/model-server/chat_bot
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \
+    && chown -R model-server /home/model-server
diff --git a/examples/LLM/llama2/chat_app/docker/build_image.sh b/examples/LLM/llama2/chat_app/docker/build_image.sh
new file mode 100755
index 0000000000..7fefc63aa7
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/build_image.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Check that exactly one argument (the HF model name) was passed
+if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
+    echo "Usage: $0 <HF model name>"
+    exit 1
+fi
+
+MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
+echo "Model: $MODEL_NAME"
+
+BASE_IMAGE="pytorch/torchserve:latest-cpu"
+
+DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"
+
+# Get relative path of example dir
+EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
+ROOT_DIR=${EXAMPLE_DIR}/../../../../..
+ROOT_DIR=$(realpath "$ROOT_DIR")
+EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
+
+# Build docker image for the application
+DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .
+
+echo "Run the following command to start the chat bot"
+echo ""
+echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG
+echo ""
diff --git a/examples/LLM/llama2/chat_app/docker/client_app.py b/examples/LLM/llama2/chat_app/docker/client_app.py
new file mode 100644
index 0000000000..fdd8a6d444
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/client_app.py
@@ -0,0 +1,118 @@
+import json
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import requests
+import streamlit as st
+
+MODEL_NAME = os.environ["MODEL_NAME"]
+
+# App title
+st.set_page_config(page_title="TorchServe Chatbot")
+
+with st.sidebar:
+    st.title("TorchServe Chatbot")
+
+    st.session_state.model_loaded = False
+    try:
+        res = requests.get(url="http://localhost:8080/ping")
+        res = requests.get(url=f"http://localhost:8081/models/{MODEL_NAME}")
+        status = "NOT READY"
+        if res.status_code == 200:
+            status = json.loads(res.text)[0]["workers"][0]["status"]
+
+        if status == "READY":
+            st.session_state.model_loaded = True
+            st.success("Proceed to entering your prompt message!", icon="👉")
+        else:
+            st.warning("Model not loaded in TorchServe", icon="⚠️")
+
+    except requests.ConnectionError:
+        st.warning("TorchServe is not up. Try again", icon="⚠️")
+
+    if st.session_state.model_loaded:
+        st.success(f"Model loaded: {MODEL_NAME}!", icon="👉")
+
+    st.subheader("Model parameters")
+    temperature = st.sidebar.slider(
+        "temperature", min_value=0.1, max_value=1.0, value=0.5, step=0.1
+    )
+    top_p = st.sidebar.slider(
+        "top_p", min_value=0.1, max_value=1.0, value=0.5, step=0.1
+    )
+    max_new_tokens = st.sidebar.slider(
+        "max_new_tokens", min_value=48, max_value=512, value=50, step=4
+    )
+    concurrent_requests = st.sidebar.select_slider(
+        "concurrent_requests", options=[2**j for j in range(0, 8)]
+    )
+
+# Store LLM generated responses
+if "messages" not in st.session_state.keys():
+    st.session_state.messages = [
+        {"role": "assistant", "content": "How may I assist you today?"}
+    ]
+
+# Display or clear chat messages
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.write(message["content"])
+
+
+def clear_chat_history():
+    st.session_state.messages = [
+        {"role": "assistant", "content": "How may I assist you today?"}
+    ]
+
+
+st.sidebar.button("Clear Chat History", on_click=clear_chat_history)
+
+
+def generate_model_response(prompt_input, executor):
+    string_dialogue = (
+        "Question: What are the names of the planets in the solar system? Answer: "
+    )
+    headers = {"Content-type": "application/json", "Accept": "text/plain"}
+    url = f"http://127.0.0.1:8080/predictions/{MODEL_NAME}"
+    data = json.dumps(
+        {
+            "prompt": prompt_input,
+            "params": {
+                "max_new_tokens": max_new_tokens,
+                "top_p": top_p,
+                "temperature": temperature,
+            },
+        }
+    )
+    res = [
+        executor.submit(requests.post, url=url, data=data, headers=headers, stream=True)
+        for i in range(concurrent_requests)
+    ]
+
+    return res, max_new_tokens
+
+
+# User-provided prompt
+if prompt := st.chat_input():
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.write(prompt)
+
+# Generate a new response if last message is not from assistant
+if st.session_state.messages[-1]["role"] != "assistant":
+    with st.chat_message("assistant"):
+        with st.spinner("Thinking..."):
+            with ThreadPoolExecutor() as executor:
+                futures, max_tokens = generate_model_response(prompt, executor)
+                placeholder = st.empty()
+                full_response = ""
+                count = 0
+                for future in futures:
+                    response = future.result()
+                    for chunk in response.iter_content(chunk_size=None):
+                        if chunk:
+                            data = chunk.decode("utf-8")
+                            full_response += data
+                            placeholder.markdown(full_response)
+                message = {"role": "assistant", "content": full_response}
+                st.session_state.messages.append(message)
diff --git a/examples/LLM/llama2/chat_app/docker/config.properties b/examples/LLM/llama2/chat_app/docker/config.properties
new file mode 100644
index 0000000000..17dd570bae
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/config.properties
@@ -0,0 +1,9 @@
+metrics_mode=prometheus
+model_metrics_auto_detect=true
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+number_of_netty_threads=32
+job_queue_size=1000
+model_store=/home/model-server/model-store
+workflow_store=/home/model-server/wf-store
diff --git a/examples/LLM/llama2/chat_app/docker/dockerd-entrypoint.sh b/examples/LLM/llama2/chat_app/docker/dockerd-entrypoint.sh
new file mode 100755
index 0000000000..11396d3be7
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/dockerd-entrypoint.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+set -e
+
+export LLAMA2_Q4_MODEL=/home/model-server/model-store/$MODEL_NAME/model/ggml-model-q4_0.gguf
+
+
+create_model_cfg_yaml() {
+    # Define the YAML content with a placeholder for the model name
+    yaml_content="# TorchServe frontend parameters\nminWorkers: 1\nmaxWorkers: 1\nresponseTimeout: 1200\n#deviceType: \"gpu\"\n#deviceIds: [0,1]\n#torchrun:\n#    nproc-per-node: 1\n\nhandler:\n    model_name: \"${2}\"\n    manual_seed: 40"
+
+    # Create the YAML file with the specified model name
+    echo -e "$yaml_content" > "model-config-${1}.yaml"
+}
+
+create_model_archive() {
+    MODEL_NAME=$1
+    MODEL_CFG=$2
+    echo "Create model archive for ${MODEL_NAME} if it doesn't already exist"
+    if [ -d "/home/model-server/model-store/$MODEL_NAME" ]; then
+        echo "Model archive for $MODEL_NAME exists."
+    fi
+    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
+        echo "Model already downloaded"
+        mv /home/model-server/model-store/$MODEL_NAME/model /home/model-server/model-store/
+    else
+        echo "Model needs to be downloaded"
+    fi
+    torch-model-archiver --model-name "$MODEL_NAME" --version 1.0 --handler llama_cpp_handler.py --config-file $MODEL_CFG -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
+    if [ -d "/home/model-server/model-store/model" ]; then
+        mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_NAME/
+    fi
+}
+
+download_model() {
+    MODEL_NAME=$1
+    HF_MODEL_NAME=$2
+    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
+        echo "Model $HF_MODEL_NAME already downloaded"
+    else
+        echo "Downloading model $HF_MODEL_NAME"
+        python Download_model.py --model_path /home/model-server/model-store/$MODEL_NAME/model --model_name $HF_MODEL_NAME
+    fi
+}
+
+quantize_model() {
+    if [ ! -f "$LLAMA2_Q4_MODEL" ]; then
+        tmp_model_name=$(echo "$MODEL_NAME" | sed 's/---/--/g')
+        directory_path=/home/model-server/model-store/$MODEL_NAME/model/models--$tmp_model_name/snapshots/
+        HF_MODEL_SNAPSHOT=$(find $directory_path -type d -mindepth 1)
+        echo "Building llama.cpp to quantize the model"
+        git clone https://github.com/ggerganov/llama.cpp.git build
+        cd build
+        make
+        python -m pip install -r requirements.txt
+
+        echo "Convert the 7B model to ggml FP16 format"
+        python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+
+        echo "Quantize the model to 4-bits (using q4_0 method)"
+        ./quantize ggml-model-f16.gguf $LLAMA2_Q4_MODEL q4_0
+
+        cd ..
+        echo "Saved quantized model weights to $LLAMA2_Q4_MODEL"
+    fi
+}
+
+HF_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/---/\//g')
+if [[ "$1" = "serve" ]]; then
+    shift 1
+    create_model_cfg_yaml $MODEL_NAME $HF_MODEL_NAME
+    create_model_archive $MODEL_NAME "model-config-$MODEL_NAME.yaml"
+    download_model $MODEL_NAME $HF_MODEL_NAME
+    quantize_model
+    streamlit run torchserve_server_app.py --server.port 8084 &
+    streamlit run client_app.py --server.port 8085
+else
+    eval "$@"
+fi
+
+# prevent docker exit
+tail -f /dev/null
diff --git a/examples/LLM/llama2/chat_app/docker/llama_cpp_handler.py b/examples/LLM/llama2/chat_app/docker/llama_cpp_handler.py
new file mode 100644
index 0000000000..c607d8e81a
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/llama_cpp_handler.py
@@ -0,0 +1,67 @@
+import logging
+import os
+from abc import ABC
+
+import torch
+from llama_cpp import Llama
+
+from ts.protocol.otf_message_handler import send_intermediate_predict_response
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+
+
+class LlamaCppHandler(BaseHandler, ABC):
+    def __init__(self):
+        super(LlamaCppHandler, self).__init__()
+        self.initialized = False
+
+    def initialize(self, ctx):
+        """In this initialize function, the 4-bit quantized llama2 model
+        (GGUF format) is loaded using llama-cpp-python.
+        Args:
+            ctx (context): It is a JSON Object containing information
+            pertaining to the model artifacts parameters.
+        """
+        model_path = os.environ["LLAMA2_Q4_MODEL"]
+        model_name = ctx.model_yaml_config["handler"]["model_name"]
+        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
+        torch.manual_seed(seed)
+
+        self.model = Llama(model_path=model_path)
+        logger.info(f"Loaded {model_name} model successfully")
+
+    def preprocess(self, data):
+        assert (
+            len(data) == 1
+        ), "llama-cpp-python is currently only supported with batch_size=1"
+        for row in data:
+            item = row.get("body")
+        return item
+
+    def inference(self, data):
+        params = data["params"]
+        tokens = self.model.tokenize(bytes(data["prompt"], "utf-8"))
+        generation_kwargs = dict(
+            tokens=tokens,
+            temp=params["temperature"],
+            top_p=params["top_p"],
+        )
+        count = 0
+        for token in self.model.generate(**generation_kwargs):
+            if count >= params["max_new_tokens"]:
+                break
+
+            count += 1
+            new_text = self.model.detokenize([token])
+            send_intermediate_predict_response(
+                [new_text],
+                self.context.request_ids,
+                "Intermediate Prediction success",
+                200,
+                self.context,
+            )
+        return [""]
+
+    def postprocess(self, output):
+        return output
diff --git a/examples/LLM/llama2/chat_app/docker/requirements.txt b/examples/LLM/llama2/chat_app/docker/requirements.txt
new file mode 100644
index 0000000000..d79fb8e3f9
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/requirements.txt
@@ -0,0 +1,4 @@
+transformers
+llama-cpp-python
+streamlit>=1.26.0
+requests_futures
diff --git a/examples/LLM/llama2/chat_app/docker/torchserve_server_app.py b/examples/LLM/llama2/chat_app/docker/torchserve_server_app.py
new file mode 100644
index 0000000000..c485bf871c
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/docker/torchserve_server_app.py
@@ -0,0 +1,205 @@
+import json
+import os
+import subprocess
+import time
+
+import requests
+import streamlit as st
+
+MODEL_NAME = os.environ["MODEL_NAME"]
+MODEL = MODEL_NAME.split("---")[1]
+
+# App title
+st.set_page_config(page_title="TorchServe Server")
+
+
+def start_server():
+    subprocess.run(
+        ["torchserve --start --ts-config /home/model-server/config.properties"],
+        shell=True,
+        check=True,
+    )
+    while True:
+        try:
+            res = requests.get(url="http://localhost:8080/ping")
+            if res.status_code == 200:
+                break
+            else:
+                server_state_container.error("Not able to start TorchServe", icon="🚫")
+        except requests.exceptions.ConnectionError:
+            time.sleep(0.1)
+
+    st.session_state.started = True
+    st.session_state.stopped = False
+    st.session_state.registered = {
+        MODEL_NAME: False,
+    }
+
+
+def stop_server():
+    os.system("torchserve --stop")
+    st.session_state.stopped = True
+    st.session_state.started = False
+    st.session_state.registered = {
+        MODEL_NAME: False,
+    }
+
+
+def _register_model(url, MODEL_NAME):
+    res = requests.post(url)
+    if res.status_code != 200:
+        server_state_container.error("Error registering model", icon="🚫")
+        st.session_state.started = True
+        return
+    print(f"registering {MODEL_NAME}")
+    st.session_state.registered[MODEL_NAME] = True
+    st.session_state.stopped = False
+    server_state_container.caption(res.text)
+
+
+def register_model(MODEL_NAME):
+    if not st.session_state.started:
+        server_state_container.caption("TorchServe is not running. Start it")
+        return
+    url = f"http://localhost:8081/models?model_name={MODEL_NAME}&url={MODEL_NAME}&batch_size=1&max_batch_delay=3000&initial_workers=1&synchronous=true"
+    _register_model(url, MODEL_NAME)
+
+
+def get_status():
+    print(
+        f"registered state for {MODEL_NAME} is {st.session_state.registered[MODEL_NAME]}"
+    )
+    if st.session_state.registered[MODEL_NAME]:
+        url = f"http://localhost:8081/models/{MODEL_NAME}"
+        res = requests.get(url)
+        if res.status_code != 200:
+            model_state_container.error(
+                f"Error getting model status for {MODEL_NAME}", icon="🚫"
+            )
+            return
+        print(res.text)
+        status = json.loads(res.text)[0]
+        model_state_container.write(status)
+
+
+def scale_workers(workers):
+    if st.session_state.registered[MODEL_NAME]:
+        num_workers = st.session_state[workers]
+        # num_workers = workers
+        url = (
+            f"http://localhost:8081/models/{MODEL_NAME}?min_worker="
+            f"{str(num_workers)}&synchronous=true"
+        )
+        res = requests.put(url)
+        server_state_container.caption(res.text)
+
+
+def set_batch_size(batch_size):
+    if st.session_state.registered[MODEL_NAME]:
+        url = f"http://localhost:8081/models/{MODEL_NAME}/1.0"
+        res = requests.delete(url)
+        server_state_container.caption(res.text)
+        print(f"Unregistering {MODEL_NAME}")
+        st.session_state.registered[MODEL_NAME] = False
+    print(f"batch size is {batch_size}")
+
+    batch_size = st.session_state[batch_size]
+    url = (
+        f"http://localhost:8081/models?model_name={MODEL_NAME}&url={MODEL_NAME}"
+        f"&batch_size={str(batch_size)}&initial_workers={str(workers)}"
+        f"&synchronous=true&max_batch_delay={str(max_batch_delay)}"
+    )
+    _register_model(url, MODEL_NAME)
+
+
+def set_max_batch_delay(max_batch_delay):
+    if st.session_state.registered[MODEL_NAME]:
+        url = f"http://localhost:8081/models/{MODEL_NAME}/1.0"
+        res = requests.delete(url)
+        server_state_container.caption(res.text)
+        print(f"Unregistering {MODEL_NAME}")
+        st.session_state.registered[MODEL_NAME] = False
+
+    max_batch_delay = st.session_state[max_batch_delay]
+    url = (
+        f"http://localhost:8081/models?model_name={MODEL_NAME}&url="
+        f"{MODEL_NAME}&batch_size={str(batch_size)}&initial_workers="
+        f"{str(workers)}&synchronous=true&max_batch_delay={str(max_batch_delay)}"
+    )
+    _register_model(url, MODEL_NAME)
+
+
+if "started" not in st.session_state:
+    st.session_state.started = False
+if "stopped" not in st.session_state:
+    st.session_state.stopped = False
+if "registered" not in st.session_state:
+    st.session_state.registered = {
+        MODEL_NAME: False,
+    }
+
+with st.sidebar:
+    st.title("TorchServe Server")
+
+    st.button("Start Server", on_click=start_server)
+    st.button("Stop Server", on_click=stop_server)
+    st.button(f"Register {MODEL}", on_click=register_model, args=(MODEL_NAME,))
+
+    workers = st.sidebar.slider(
+        "Num Workers",
+        key="Num Workers",
+        min_value=1,
+        max_value=4,
+        value=1,
+        step=1,
+        on_change=scale_workers,
+        args=("Num Workers",),
+    )
+    batch_size = st.sidebar.select_slider(
+        "Batch Size",
+        key="Batch Size",
+        options=[2**j for j in range(0, 8)],
+        on_change=set_batch_size,
+        args=("Batch Size",),
+    )
+    max_batch_delay = st.sidebar.slider(
+        "Max Batch Delay",
+        key="Max Batch Delay",
+        min_value=3000,
+        max_value=10000,
+        value=3000,
+        step=100,
+        on_change=set_max_batch_delay,
+        args=("Max Batch Delay",),
+    )
+
+    if st.session_state.started:
+        st.success("Started TorchServe", icon="✅")
+
+    if st.session_state.stopped:
+        st.success("Stopped TorchServe", icon="✅")
+
+    if st.session_state.registered[MODEL_NAME]:
+        st.success(f"Registered model {MODEL_NAME}", icon="✅")
+
+
+st.title("TorchServe Status")
+server_state_container = st.container()
+server_state_container.subheader("Server status:")
+
+if st.session_state.started:
+    server_state_container.success("Started TorchServe", icon="✅")
+
+if st.session_state.stopped:
+    server_state_container.success("Stopped TorchServe", icon="✅")
+
+if st.session_state.registered[MODEL_NAME]:
+    server_state_container.success(f"Registered model {MODEL_NAME}", icon="✅")
+
+
+model_state_container = st.container()
+with model_state_container:
+    st.subheader("Model Status")
+
+with model_state_container:
+    st.button("Model Status", on_click=get_status)
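
The quick-start workflow in the Readme above can also be exercised without the Streamlit apps. Below is a minimal sketch of a direct client, assuming the quick-start container is running and the model has been registered under the name produced by `build_image.sh` (e.g. `meta-llama---Llama-2-7b-chat-hf`); the endpoints and payload shape mirror `client_app.py` and `llama_cpp_handler.py`.

```python
# Minimal sketch: talk to the TorchServe endpoints started by the quick-start container.
# Assumes the model is registered as "meta-llama---Llama-2-7b-chat-hf"
# (the tag printed by build_image.sh); adjust MODEL_NAME for a different model.
import json

import requests

MODEL_NAME = "meta-llama---Llama-2-7b-chat-hf"

# Health check on the inference API (port 8080)
print(requests.get("http://localhost:8080/ping").json())

# Model/worker status from the management API (port 8081)
print(requests.get(f"http://localhost:8081/models/{MODEL_NAME}").json())

# Streaming prediction: the handler emits intermediate responses per token,
# so read the chunks as they arrive (same pattern as client_app.py).
payload = {
    "prompt": "What are the names of the planets in the solar system?",
    "params": {"max_new_tokens": 50, "top_p": 0.5, "temperature": 0.5},
}
with requests.post(
    f"http://localhost:8080/predictions/{MODEL_NAME}",
    data=json.dumps(payload),
    headers={"Content-type": "application/json", "Accept": "text/plain"},
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            print(chunk.decode("utf-8"), end="", flush=True)
```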