Docker compose tensorrt #177
@@ -12,56 +12,19 @@ git clone https://github.com/collabora/WhisperLive.git
cd WhisperLive
```

- Pull the TensorRT-LLM docker image which we prebuilt for the WhisperLive TensorRT backend.
```bash
docker pull ghcr.io/collabora/whisperbot-base:latest
- Build the docker image for your GPU architecture. By default the image is built for the 4090, i.e. `CUDA_ARCH=89-real;90-real`
```
mkdir docker/scratch-space
cp docker/scripts/build-whisper-tensorrt.sh docker/scratch-space
cp docker/scripts/run-whisperlive.sh docker/scratch-space

- Next, we run the docker image and mount the WhisperLive repo to the container's `/home` directory.
```bash
docker run -it --gpus all --shm-size=8g \
  --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -p 9090:9090 -v /path/to/WhisperLive:/home/WhisperLive \
  ghcr.io/collabora/whisperbot-base:latest
```

- Make sure to test the installation.
```bash
# export ENV=${ENV:-/etc/shinit_v2}
# source $ENV
python -c "import torch; import tensorrt; import tensorrt_llm"
```
**NOTE**: Uncomment and update the library paths if the imports fail.

## Whisper TensorRT Engine
- We build the `small.en` and `small` multilingual TensorRT engines. The script logs the path of the directory containing the Whisper TensorRT engine; we need that model path to run the server.
```bash
# convert small.en
bash scripts/build_whisper_tensorrt.sh /root/TensorRT-LLM-examples small.en

# convert small multilingual model
bash scripts/build_whisper_tensorrt.sh /root/TensorRT-LLM-examples small
# For e.g. a 3090 RTX the CUDA architecture is 86-real
CUDA_ARCH=86-real docker compose build
```

## Run WhisperLive Server with TensorRT Backend
We run the container with docker compose, which builds the TensorRT engine for the specified model
if it doesn't already exist in the mounted volume `docker/scratch-space`. Optionally, if you want to run the `faster_whisper` backend, use `BACKEND=faster_whisper`.
```bash
cd /home/WhisperLive

# Install requirements
apt update && bash scripts/setup.sh
pip install -r requirements/server.txt

# Required to create mel spectrogram
wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz

# Run English only model
python3 run_server.py --port 9090 \
                      --backend tensorrt \
                      --trt_model_path "path/to/whisper_trt/from/build/step"

# Run Multilingual model
python3 run_server.py --port 9090 \
                      --backend tensorrt \
                      --trt_model_path "path/to/whisper_trt/from/build/step" \
                      --trt_multilingual
MODEL_SIZE=small.en BACKEND=tensorrt docker compose up
```

**Review comment:** Unfortunately, I couldn't get any of the model sizes running like that. I always got

**Reply:** Ah!
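As a side note to the usage shown in the diff above, a couple of additional invocations follow from the `MODEL_SIZE` and `BACKEND` variables the compose file exposes; these are sketches based on the documented defaults rather than commands quoted from the PR.

```bash
# Multilingual small model instead of the English-only default
MODEL_SIZE=small BACKEND=tensorrt docker compose up

# Serve with the faster_whisper backend instead of TensorRT
BACKEND=faster_whisper docker compose up
```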
**docker-compose.yml**
@@ -0,0 +1,26 @@
version: '3.8'

services:
  whisperlive-tensorrt:
    build:
      context: docker
      dockerfile: Dockerfile.tensorrt
      args:
        CUDA_ARCH: ${CUDA_ARCH:-89-real;90-real}
    image: whisperlive-tensorrt:latest
    volumes:
      - type: bind
        source: ./docker/scratch-space
        target: /root/scratch-space
    environment:
      VERBOSE: ${VERBOSE:-false}
      MODEL_SIZE: ${MODEL_SIZE:-small.en}
      BACKEND: ${BACKEND:-tensorrt}
    ports:
      - "8000:9090"
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: ["gpu"]
    entrypoint: ["/bin/bash", "-c", "/root/scratch-space/run-whisperlive.sh $$MODEL_SIZE $$BACKEND"]
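The compose file above builds from the `docker` context with `Dockerfile.tensorrt`, binds `docker/scratch-space` into the container, and publishes container port 9090 on host port 8000. A plausible end-to-end run with standard Compose commands (service name taken from the file; not prescribed by the PR) could look like:

```bash
# Build for a specific GPU architecture, then start the service in the background
CUDA_ARCH=86-real docker compose build
MODEL_SIZE=small.en BACKEND=tensorrt docker compose up -d

# Follow the engine-build and server logs; clients connect to host port 8000
docker compose logs -f whisperlive-tensorrt
```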
**docker/Dockerfile.tensorrt**
@@ -0,0 +1,20 @@
ARG BASE_IMAGE=nvcr.io/nvidia/cuda
ARG BASE_TAG=12.2.2-devel-ubuntu22.04

FROM ${BASE_IMAGE}:${BASE_TAG} as base
ARG CUDA_ARCH
ENV CUDA_ARCH=${CUDA_ARCH}

FROM base as devel
WORKDIR /root
COPY scripts/install-deps.sh /root
RUN bash install-deps.sh && rm install-deps.sh
COPY scripts/build-trt-llm.sh /root
RUN bash build-trt-llm.sh && rm build-trt-llm.sh

FROM devel as release
WORKDIR /root/
COPY scripts/install-trt-llm.sh /root
RUN bash install-trt-llm.sh && rm install-trt-llm.sh
COPY scripts/setup-whisperlive.sh /root/
RUN ./setup-whisperlive.sh
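If someone wanted to build the image without Compose, a roughly equivalent direct invocation (mirroring the build context and `CUDA_ARCH` arg that the compose file passes; the tag name is arbitrary) would be:

```bash
# Assumes the repo root as working directory; "docker" is the build context
docker build -f docker/Dockerfile.tensorrt \
  --build-arg CUDA_ARCH="86-real" \
  -t whisperlive-tensorrt:latest \
  docker
```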
**docker/scripts/build-trt-llm.sh**
@@ -0,0 +1,9 @@
#!/bin/bash -e

export ENV=${ENV:-/etc/shinit_v2}
source $ENV

CUDA_ARCH="${CUDA_ARCH:-89-real;90-real}"

cd /root/TensorRT-LLM
python3 scripts/build_wheel.py --clean --cuda_architectures "$CUDA_ARCH" --trt_root /usr/local/tensorrt
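`CUDA_ARCH` is forwarded to TensorRT-LLM's `build_wheel.py` as a semicolon-separated list, so covering several GPU generations in one image should just be a matter of listing them. A sketch (not tested as part of this PR):

```bash
# Target Ampere (86) as well as the default Ada/Hopper (89/90) architectures
CUDA_ARCH="86-real;89-real;90-real" docker compose build
```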
**build-whisper-tensorrt.sh**
@@ -48,19 +48,21 @@ download_and_build_model() {
    # wget --directory-prefix=assets "$model_url"
    # echo "Download completed: ${model_name}.pt"
    if [ ! -f "assets/${model_name}.pt" ]; then
        wget --directory-prefix=assets "$model_url"
        wget --directory-prefix=assets "$model_url" > /dev/null 2>&1
        echo "Download completed: ${model_name}.pt"
    else
        echo "${model_name}.pt already exists in assets directory."
    fi

    local output_dir="whisper_${model_name//./_}"
    echo "$output_dir"
    echo "Running build script for $model_name with output directory $output_dir"
    python3 build.py --output_dir "$output_dir" --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --model_name "$model_name"
    echo "Running TensorRT-LLM build script for $model_name with output directory $output_dir"
    python3 build.py --output_dir "$output_dir" --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --model_name "$model_name" > /dev/null 2>&1
**Review comment:** I'd propose not to dump the stdout/err to devnull. This is actually hiding errors that might be useful to know when one is starting the service. This is especially important when the script continues even if one of its commands errored. I'd thus also recommend to add

An example I just ran into: some CUDA error during the trt-build of the engine. After the hidden error, the next echo told me that the engine was built, but it wasn't.
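One possible way to act on this suggestion, sketched here rather than taken from the PR, is to tee the build output into a log file under the mounted scratch space and let the shell abort the script when the build fails:

```bash
# Sketch only: keep the output instead of discarding it, and stop on errors.
set -euo pipefail
python3 build.py --output_dir "$output_dir" \
    --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin \
    --model_name "$model_name" 2>&1 | tee "/root/scratch-space/${model_name}_build.log"
```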
echo "Whisper $model_name TensorRT engine built." | ||
echo "=========================================" | ||
echo "Model is located at: $(pwd)/$output_dir" | ||
mkdir -p /root/scratch-space/models | ||
cp -r $output_dir /root/scratch-space/models | ||
|
||
} | ||
|
||
if [ "$#" -lt 1 ]; then | ||
|
@@ -70,8 +72,15 @@ fi | |
|
||
tensorrt_examples_dir="$1" | ||
model_name="${2:-small.en}" | ||
output_dir="whisper_${model_name//./_}" | ||
|
||
cd $1/whisper | ||
pip install --no-deps -r requirements.txt | ||
if [ ! -d "/root/scratch-space/models/$output_dir" ] || [ -z "$(ls -A /root/scratch-space/models/$output_dir)" ]; then | ||
echo "$output_dir directory does not exist or is empty. Building whisper" | ||
cd $1/whisper | ||
echo "Installing requirements for Whisper TensorRT-LLM ..." | ||
pip install --no-deps -r requirements.txt > /dev/null 2>&1 | ||
download_and_build_model "$output_dir" | ||
else | ||
echo "$output_dir directory exists and is not empty. Skipping build-whisper..." | ||
fi | ||
|
||
download_and_build_model "$model_name" |
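For a manual run inside the container, the invocation mirrors how `run-whisperlive.sh` calls this script later in the PR (paths as set up by the compose volume mount):

```bash
# Build (or reuse) the small.en engine; the result ends up under /root/scratch-space/models/whisper_small_en
cd /root/scratch-space
./build-whisper-tensorrt.sh /root/TensorRT-LLM-examples/ small.en
```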
**docker/scripts/install-deps.sh**
@@ -0,0 +1,54 @@
#!/bin/bash -e

apt-get update && apt-get -y install git git-lfs
git clone --depth=1 -b cuda12.2 https://github.com/makaveli10/TensorRT-LLM.git
**Review comment:** If I see this right, the difference between your fork+branch and the original TensorRT-LLM repo is only the adding of smaller model parameters to

If using an official TensorRT-LLM wheel could be an option (see comment above), then the smaller model options could be added as a simple diff on the example files without the need to use a fork.
cd TensorRT-LLM
git checkout main
git submodule update --init --recursive
git lfs install
git lfs pull

# do not reinstall CUDA (our base image provides the same exact versions)
patch -p1 <<EOF
diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh
index 2dcb0a6..3a27e03 100644
--- a/docker/common/install_tensorrt.sh
+++ b/docker/common/install_tensorrt.sh
@@ -35,19 +35,7 @@ install_ubuntu_requirements() {
 dpkg -i cuda-keyring_1.0-1_all.deb

 apt-get update
-    if [[ $(apt list --installed | grep libcudnn8) ]]; then
-        apt-get remove --purge -y libcudnn8*
-    fi
-    if [[ $(apt list --installed | grep libnccl) ]]; then
-        apt-get remove --purge -y --allow-change-held-packages libnccl*
-    fi
-    if [[ $(apt list --installed | grep libcublas) ]]; then
-        apt-get remove --purge -y --allow-change-held-packages libcublas*
-    fi
-    CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
 apt-get install -y --no-install-recommends libcudnn8=${CUDNN_VER} libcudnn8-dev=${CUDNN_VER}
-    apt-get install -y --no-install-recommends libnccl2=${NCCL_VER} libnccl-dev=${NCCL_VER}
-    apt-get install -y --no-install-recommends libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER}
 apt-get clean
 rm -rf /var/lib/apt/lists/*
}
EOF

cd docker/common/
export BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
export ENV=${ENV:-/etc/shinit_v2}
bash install_base.sh
bash install_cmake.sh
source $ENV
bash install_ccache.sh
# later on TensorRT-LLM will force reinstall this version anyways
pip3 install --extra-index-url https://download.pytorch.org/whl/cu121 torch==2.1.0
bash install_tensorrt.sh
bash install_polygraphy.sh
source $ENV

cd /root/TensorRT-LLM/docker/common/
bash install_mpi4py.sh
source $ENV
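Since the patch above skips reinstalling cuDNN, NCCL, and cuBLAS on the assumption that the base image already ships matching versions, an optional sanity check inside the image can confirm that before the TensorRT-LLM build starts (package names taken from the patched install script):

```bash
# Optional: verify the CUDA libraries the patch relies on are present in the base image
dpkg -l | grep -E 'libcudnn8|libnccl2|libcublas' || echo "expected CUDA libraries not found"
nvcc --version   # CUDA toolkit provided by nvcr.io/nvidia/cuda:12.2.2-devel-ubuntu22.04
```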
**docker/scripts/install-trt-llm.sh**
@@ -0,0 +1,10 @@
#!/bin/bash -e

cd /root/TensorRT-LLM
pip install build/tensorrt_llm-0.7.1-cp310-cp310-linux_x86_64.whl
mv examples ../TensorRT-LLM-examples
cd ..

rm -rf TensorRT-LLM
# we don't need static libraries and they take a lot of space
(cd /usr && find . -name '*static.a' | grep -v cudart_static | xargs rm -f)
**docker/scripts/run-whisperlive.sh**
@@ -0,0 +1,28 @@
#!/bin/bash -e
echo "MODEL_SIZE is set to: $MODEL_SIZE"
echo "BACKEND is set to: $BACKEND"

test -f /etc/shinit_v2 && source /etc/shinit_v2

echo "Running build-models.sh..."
cd /root/scratch-space/
./build-whisper-tensorrt.sh /root/TensorRT-LLM-examples/ $MODEL_SIZE

whisper_model_trt="whisper_${MODEL_SIZE//./_}"

echo "$whisper_model_trt"

cd /root/WhisperLive

if [ "$BACKEND" == "tensorrt" ]; then
    if [[ $MODEL_SIZE == *".en" ]]; then
        exec python3 run_server.py -p 9090 -b $BACKEND \
            -trt /root/scratch-space/models/"$whisper_model_trt"
    else
        exec python3 run_server.py -p 9090 -b $BACKEND \
            -trt /root/scratch-space/models/"$whisper_model_trt" \
            -m
    fi
else
    exec python3 run_server.py
fi
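The entrypoint derives the engine directory from `MODEL_SIZE` with a bash parameter expansion and uses a glob match to decide whether the multilingual flag is needed; a small illustration of those two constructs with a sample value:

```bash
MODEL_SIZE="small.en"
echo "whisper_${MODEL_SIZE//./_}"   # -> whisper_small_en (dots replaced with underscores)

# The *.en suffix check that skips the multilingual flag for English-only models
if [[ $MODEL_SIZE == *".en" ]]; then echo "English-only engine"; fi
```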
**docker/scripts/setup-whisperlive.sh**
@@ -0,0 +1,17 @@
#!/bin/bash -e

## Clone this repo and install requirements
[ -d "WhisperLive" ] || git clone https://github.com/collabora/WhisperLive.git

cd WhisperLive
apt update
apt-get install portaudio19-dev ffmpeg wget -y

## Install all the other dependencies normally
pip install -r requirements/server.txt

mkdir -p /root/.cache/whisper-live/
curl -L -o /root/.cache/whisper-live/silero_vad.onnx https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx

# the sound filter definitions
wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
**Review comment:** I could successfully build an image with these instructions. It took 50 min on a 12-core workstation. Most of the time, 38 min, was spent in `build-trt-llm.sh`, compiling. I am wondering why it is necessary to compile TensorRT-LLM from scratch. What is the advantage over simply installing the pre-compiled wheel? Are the 0.7.1 wheels not supporting newer archs?

Btw: I appreciate your efforts to clean up the image. In the end, the image is still 38.4 GB, though, but I guess it would have been even bigger without those efforts. :)
**Reply:** The pre-compiled wheel doesn't work on all archs, but it's been a while since we tested. Thanks for pointing that out, looking into that next.
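For reference, the wheel-based alternative the reviewer describes would presumably look something like the sketch below; the index URL and version pin are assumptions to be checked against the TensorRT-LLM documentation, and whether the published 0.7.1 wheels cover the required GPU architectures is exactly the open question above.

```bash
# Hypothetical alternative to compiling the wheel in-image:
# install a pre-built TensorRT-LLM wheel from NVIDIA's package index.
pip3 install --extra-index-url https://pypi.nvidia.com tensorrt_llm==0.7.1
```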