Add UDS bundles and README Updates (#322)
* add uds bundles for local package deployment
* update README and Makefile targets
* add uds latest targets and move dir
* add local dev instructions to README.md
* update e2e test workflow
jalling97 authored Apr 8, 2024
1 parent a5d98b2 commit 2bf1974
Showing 15 changed files with 478 additions and 26 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/e2e.yaml
@@ -62,8 +62,8 @@ jobs:
run: |
make build-api LOCAL_VERSION=e2e-test
docker image prune -af
-uds zarf package deploy zarf-package-leapfrogai-api-amd64-e2e-test.tar.zst --confirm
-rm zarf-package-leapfrogai-api-amd64-e2e-test.tar.zst
+uds zarf package deploy packages/api/zarf-package-leapfrogai-api-amd64-e2e-test.tar.zst --confirm
+rm packages/api/zarf-package-leapfrogai-api-amd64-e2e-test.tar.zst
##########
# llama
@@ -72,8 +72,8 @@
run: |
make build-llama-cpp-python LOCAL_VERSION=e2e-test
docker image prune -af
-uds zarf package deploy zarf-package-llama-cpp-python-amd64-e2e-test.tar.zst -l=trace --confirm
-rm zarf-package-llama-cpp-python-amd64-e2e-test.tar.zst
+uds zarf package deploy packages/llama-cpp-python/zarf-package-llama-cpp-python-amd64-e2e-test.tar.zst -l=trace --confirm
+rm packages/llama-cpp-python/zarf-package-llama-cpp-python-amd64-e2e-test.tar.zst
- name: Test llama-cpp-python
run: |
@@ -90,8 +90,8 @@
run: |
make build-text-embeddings LOCAL_VERSION=e2e-test
docker image prune -af
-uds zarf package deploy zarf-package-text-embeddings-amd64-e2e-test.tar.zst -l=trace --confirm
-rm zarf-package-text-embeddings-amd64-e2e-test.tar.zst
+uds zarf package deploy packages/text-embeddings/zarf-package-text-embeddings-amd64-e2e-test.tar.zst -l=trace --confirm
+rm packages/text-embeddings/zarf-package-text-embeddings-amd64-e2e-test.tar.zst
- name: Test text-embeddings
run: |
@@ -108,8 +108,8 @@
run: |
make build-whisper LOCAL_VERSION=e2e-test
docker image prune -af
-uds zarf package deploy zarf-package-whisper-amd64-e2e-test.tar.zst -l=trace --confirm
-rm zarf-package-whisper-amd64-e2e-test.tar.zst
+uds zarf package deploy packages/whisper/zarf-package-whisper-amd64-e2e-test.tar.zst -l=trace --confirm
+rm packages/whisper/zarf-package-whisper-amd64-e2e-test.tar.zst
- name: Test whisper
run: |
1 change: 1 addition & 0 deletions .gitignore
@@ -28,6 +28,7 @@ tokenizer.json
config.json
generation_config.json
vocabulary.json
+.model/

# go binaries
api/main
25 changes: 15 additions & 10 deletions Makefile
@@ -3,7 +3,6 @@ KEY ?= ""

VERSION ?= $(shell git describe --abbrev=0 --tags)
LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)

######################################################################################

.PHONY: help
@@ -49,14 +48,14 @@ build-api: local-registry setup-api-deps ## Build the leapfrogai_api container and Zarf package
docker push localhost:5000/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION}

## Build the Zarf package
-uds zarf package create packages/api --registry-override=ghcr.io=localhost:5000 --insecure --set LEAPFROGAI_IMAGE_VERSION=${LOCAL_VERSION} --confirm
+uds zarf package create packages/api -o packages/api --registry-override=ghcr.io=localhost:5000 --insecure --set LEAPFROGAI_IMAGE_VERSION=${LOCAL_VERSION} --confirm


-setup-llama-deps: ## Download the wheels for the optional 'llama' dependencies
-	-rm packages/llama/build/*.whl
+setup-llama-cpp-python-deps: ## Download the wheels for the optional 'llama-cpp-python' dependencies
+	-rm packages/llama-cpp-python/build/*.whl
python -m pip wheel ".[llama-cpp-python]" -w packages/llama-cpp-python/build

-build-llama-cpp-python: local-registry setup-llama-deps ## Build the llama-cpp-python (cpu) container and Zarf package
+build-llama-cpp-python: local-registry setup-llama-cpp-python-deps ## Build the llama-cpp-python (cpu) container and Zarf package
## Build the image (and tag it for the local registry)
docker build -t ghcr.io/defenseunicorns/leapfrogai/llama-cpp-python:${LOCAL_VERSION} packages/llama-cpp-python
docker tag ghcr.io/defenseunicorns/leapfrogai/llama-cpp-python:${LOCAL_VERSION} localhost:5000/defenseunicorns/leapfrogai/llama-cpp-python:${LOCAL_VERSION}
@@ -65,14 +64,14 @@ build-llama-cpp-python: local-registry setup-llama-deps ## Build the llama-cpp-python (cpu) container and Zarf package
docker push localhost:5000/defenseunicorns/leapfrogai/llama-cpp-python:${LOCAL_VERSION}

## Build the Zarf package
-uds zarf package create packages/llama-cpp-python --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm
+uds zarf package create packages/llama-cpp-python -o packages/llama-cpp-python --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm


setup-vllm-deps: ## Download the wheels for the optional 'vllm' dependencies
-rm packages/vllm/build/*.whl
python -m pip wheel ".[vllm]" -w packages/vllm/build

-build-vllm: local-registry setup-vllm-deps
+build-vllm: local-registry setup-vllm-deps ## Build the vllm container and Zarf package
## Build the image (and tag it for the local registry)
docker build -t ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} packages/vllm
docker tag ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} localhost:5000/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION}
@@ -81,7 +80,7 @@ build-vllm: local-registry setup-vllm-deps
docker push localhost:5000/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION}

## Build the Zarf package
-uds zarf package create packages/vllm --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm
+uds zarf package create packages/vllm -o packages/vllm --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm


setup-text-embeddings-deps: ## Download the wheels for the optional 'text-embeddings' dependencies
@@ -97,7 +96,7 @@ build-text-embeddings: local-registry setup-text-embeddings-deps ## Build the text-embeddings container and Zarf package
docker push localhost:5000/defenseunicorns/leapfrogai/text-embeddings:${LOCAL_VERSION}

## Build the Zarf package
-uds zarf package create packages/text-embeddings --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm
+uds zarf package create packages/text-embeddings -o packages/text-embeddings --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm


setup-whisper-deps: ## Download the wheels for the optional 'whisper' dependencies
@@ -113,4 +112,10 @@ build-whisper: local-registry setup-whisper-deps ## Build the whisper container and Zarf package
docker push localhost:5000/defenseunicorns/leapfrogai/whisper:${LOCAL_VERSION}

## Build the Zarf package
-uds zarf package create packages/whisper --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm
+uds zarf package create packages/whisper -o packages/whisper --registry-override=ghcr.io=localhost:5000 --insecure --set IMAGE_VERSION=${LOCAL_VERSION} --confirm

+build-cpu: build-api build-llama-cpp-python build-text-embeddings build-whisper ## Build all zarf packages for a cpu-enabled deployment of LFAI

+build-gpu: build-api build-vllm build-text-embeddings build-whisper ## Build all zarf packages for a gpu-enabled deployment of LFAI

+build-all: build-api build-llama-cpp-python build-vllm build-text-embeddings build-whisper ## Build all of the LFAI packages
150 changes: 149 additions & 1 deletion README.md
@@ -6,13 +6,18 @@
- [Table of Contents](#table-of-contents)
- [Overview](#overview)
- [Why Host Your Own LLM?](#why-host-your-own-llm)
- [Structure](#structure)
- [Getting Started](#getting-started)
- [Components](#components)
- [API](#api)
- [Backends](#backends)
- [Image Hardening](#image-hardening)
- [SDK](#sdk)
- [User Interface](#user-interface)
- [Usage](#usage)
- [UDS (Latest)](#uds-latest)
- [UDS (Dev)](#uds-dev)
- [Local Dev](#local-dev)
- [Community](#community)

## Overview
@@ -31,9 +36,34 @@ Large Language Models (LLMs) are a powerful resource for AI-driven decision making

- **Mission Integration**: By hosting your own LLM, you have the ability to customize the model's parameters, training data, and more, tailoring the AI to your specific needs.

## Structure

The LeapfrogAI repository follows a monorepo structure based around an [API](#api), with each of the [components](#components) in a dedicated `packages` directory. Each package directory contains the source code for its component as well as its deployment infrastructure. The UDS bundles that drive the dev and latest deployments of LeapfrogAI live in the `uds-bundles` directory. The structure looks as follows:

```
leapfrogai/
├── src/
│   └── leapfrogai_api/
│       ├── main.py
│       └── ...
├── packages/
│   ├── api/
│   ├── llama-cpp-python/
│   ├── text-embeddings/
│   ├── vllm/
│   └── whisper/
├── uds-bundles/
│   ├── dev/
│   └── latest/
├── Makefile
├── pyproject.toml
├── README.md
└── ...
```

## Getting Started

-The preferred method for running LeapfrogAI is a local [Kubernetes](https://kubernetes.io/) deployment using [UDS](https://github.com/defenseunicorns/uds-core). Simple instructions for this type of deployment can be found on the [LeapfrogAI Documentation Site](https://docs.leapfrog.ai/docs/).
+The preferred method for running LeapfrogAI is a local [Kubernetes](https://kubernetes.io/) deployment using [UDS](https://github.com/defenseunicorns/uds-core). Refer to the [Quick Start](https://docs.leapfrog.ai/docs/local-deploy-guide/quick_start/) section of the LeapfrogAI documentation site for instructions on this type of deployment.

## Components

Expand Down Expand Up @@ -74,6 +104,124 @@ The LeapfrogAI SDK provides a standard set of protobuff and python utilities for
LeapfrogAI provides several UI options to get started with common use cases such as chat, summarization, and transcription.

## Usage

### UDS (Latest)

LeapfrogAI can be deployed and run locally via UDS and Kubernetes, built out using [Zarf](https://zarf.dev) packages. This pulls the most recent package images and is the most stable way of running a local LeapfrogAI deployment. These instructions can be found on the [LeapfrogAI Docs](https://docs.leapfrog.ai/docs/) site.
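
As a concrete sketch (assuming the `latest` bundles under `uds-bundles/latest/` follow the same layout as the `dev` bundles shown below), a CPU deployment of the latest release would look like:

```
cd uds-bundles/latest/cpu
uds create .
uds deploy k3d-core-slim-dev:0.18.0
uds deploy uds-bundle-leapfrogai*.tar.zst --confirm
```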

### UDS (Dev)

If you want to make some changes to LeapfrogAI before deploying via UDS (for example in a dev environment), you can follow these instructions:

Make sure your system has the [required dependencies](https://docs.leapfrog.ai/docs/local-deploy-guide/quick_start/#prerequisites).

To keep dependencies isolated, it's best to create a virtual environment:
```
python -m venv .venv
source .venv/bin/activate
```

Each component is built into its own Zarf package. This can be done easily using the provided Makefile targets:
```
make build-api
make build-vllm # if you have GPUs
make build-llama-cpp-python # if you have CPU only
make build-text-embeddings
make build-whisper
```
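
Each build target tags its image and package with `LOCAL_VERSION`, which defaults to the short git SHA (`git rev-parse --short HEAD` in the Makefile); you can also pin it explicitly, as the e2e workflow does:

```
make build-api LOCAL_VERSION=e2e-test
```
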
**OR**

You can build all of the packages you need at once with the following make targets:

```
make build-cpu # api, llama-cpp-python, text-embeddings, whisper
make build-gpu # api, vllm, text-embeddings, whisper
make build-all # all of the backends
```
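
Since each `uds zarf package create` call now passes `-o packages/<component>`, the built archives land next to their source. A quick way to confirm what was built:

```
ls packages/*/zarf-package-*.tar.zst
```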

Once the packages are created, you can deploy either a CPU or GPU-enabled deployment via one of the UDS bundles:

#### CPU
```
cd uds-bundles/dev/cpu
uds create .
uds deploy k3d-core-slim-dev:0.18.0
uds deploy uds-bundle-leapfrogai*.tar.zst
```

#### GPU
```
cd uds-bundles/dev/gpu
uds create .
uds deploy k3d-core-slim-dev:0.18.0 --set K3D_EXTRA_ARGS="--gpus=all --image=ghcr.io/justinthelaw/k3d-gpu-support:v1.27.4-k3s1-cuda" # be sure to check if a newer version exists
uds deploy uds-bundle-leapfrogai-*.tar.zst --confirm
```
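
To sanity-check either deployment, you can watch the pods come up using the `kubectl` vendored by the UDS CLI (the `leapfrogai` namespace here is an assumption; adjust it to your cluster):

```
uds zarf tools kubectl get pods -n leapfrogai
```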

### Local Dev

The following instructions are for running each of the LFAI components locally during development. This is useful for testing changes to a specific component, but it will not stand up a full LeapfrogAI deployment; refer to the sections above for deployment instructions.

It is highly recommended to create a virtual environment to keep the development environment clean:

```
python -m venv .venv
source .venv/bin/activate
```

#### API

To run the LeapfrogAI API locally (starting from the root directory of the repository):

```
python -m pip install ".[api,dev]"
cd src
uvicorn leapfrogai_api.main:app --port 3000 --reload
```
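
Once `uvicorn` is running, a quick smoke test is to hit the interactive docs that FastAPI serves by default (assuming the default docs route is enabled):

```
curl http://localhost:3000/docs
```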

#### Backend: llama-cpp-python

To run the llama-cpp-python backend locally (starting from the root directory of the repository):

```
python -m pip install ".[llama-cpp-python,dev]"
cd packages/llama-cpp-python
python scripts/model_download.py
mv .model/*.gguf .model/model.gguf
python -m leapfrogai_api.types.cli --app-dir=. main:Model
```

#### Backend: text-embeddings

To run the text-embeddings backend locally (starting from the root directory of the repository):

```
python -m pip install ".[text-embeddings,dev]"
cd packages/text-embeddings
python scripts/model_download.py
python -u main.py
```

#### Backend: vllm

To run the vllm backend locally (starting from the root directory of the repository):

```
python -m pip install ".[vllm,dev]"
cd packages/vllm
python scripts/model_download.py
export QUANTIZATION=awq
python -m leapfrogai_api.types.cli --app-dir=. main:Model
```

#### Backend: whisper

To run the whisper backend locally (starting from the root directory of the repository):

```
python -m pip install ".[whisper,dev]"
cd packages/whisper
ct2-transformers-converter --model openai/whisper-base --output_dir .model --copy_files tokenizer.json --quantization float32
python -u main.py
```
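
The `ct2-transformers-converter` CLI used above ships with `ctranslate2`, which this commit adds to the `whisper` extra in `pyproject.toml`, so the `pip install` step pulls it in. The conversion writes the CTranslate2 model into `.model/`; a quick check:

```
ls .model   # expect model.bin and the copied tokenizer.json, among other files
```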

## Community

LeapfrogAI is supported by a community of users and contributors, including:
1 change: 0 additions & 1 deletion packages/vllm/Dockerfile
@@ -38,7 +38,6 @@ COPY build/*.whl build/
COPY build/leapfrogai_api*.whl leapfrogai_api-100.100.100-py3-none-any.whl
RUN pip install "leapfrogai_api-100.100.100-py3-none-any.whl[vllm]" --no-index --find-links=build/


# download model
ARG REPO_ID=TheBloke/Synthia-7B-v2.0-AWQ
ARG REVISION=main
1 change: 1 addition & 0 deletions pyproject.toml
@@ -71,6 +71,7 @@ text-embeddings = [

whisper = [
"faster-whisper == 0.10.0",
"ctranslate2 >= 4.1.0"
]

e2e-test = [
39 changes: 39 additions & 0 deletions uds-bundles/dev/cpu/uds-bundle.yaml
@@ -0,0 +1,39 @@
kind: UDSBundle
metadata:
  name: leapfrogai
  description: A UDS bundle for deploying LeapfrogAI with CPU-only support
  version: dev

packages:
  # API
  - name: leapfrogai-api
    path: ../../../packages/api/
    ref: dev

  # Legacy UI - not currently in this repo so it builds from ghcr
  - name: leapfrogai-ui
    repository: ghcr.io/defenseunicorns/packages/leapfrogai/leapfrogai-ui
    ref: 0.3.5

  # Chat Model
  - name: llama-cpp-python
    path: ../../../packages/llama-cpp-python/
    ref: dev

  # Text Embeddings Model
  - name: text-embeddings
    path: ../../../packages/text-embeddings/
    ref: dev

  # RAG Backend - not currently in this repo so it builds from ghcr
  - name: rag
    repository: ghcr.io/defenseunicorns/packages/leapfrogai/rag
    ref: 0.3.1

  # Transcription Model
  - name: whisper
    path: ../../../packages/whisper/
    ref: dev



26 changes: 26 additions & 0 deletions uds-bundles/dev/cpu/uds-config.yaml
@@ -0,0 +1,26 @@
variables:
  leapfrogai-ui:
    domain: https://ai.uds.dev
    model: llama-cpp-python
    concurrent_requests: "false"
    ai4ns_branding: "false"
    leapfrogai_rag_url: http://rag.leapfrogai.svc.cluster.local:8000
    max_tokens: 8192

  text-embeddings:
    gpu_limit: 0

  whisper:
    gpu_limit: 0

  rag:
    model: llama-cpp-python
    ssl_verification: "false" # if certs exist in-cluster, make true
    response_mode: "raw" # default mode for query endpoint
    temperature: 0 # refine method temperature for vllm
    max_output: 2048
    context_window: 4096
    chunk_size: 512
    overlap_size: 64
    embedding_model_name: text-embeddings
    top_k: 20