-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathconfig.yaml
30 lines (30 loc) · 1.36 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Deployment config that runs the text-embeddings-router server directly from
# a prebuilt image (custom docker_server, no model code in this file).

base_image:
  # Pick the image tag that matches the target hardware. This config targets
  # an L4 (see resources.accelerator), so the Ada Lovelace tag is selected.
  #   CPU                               baseten/text-embeddings-inference-mirror:cpu-1.6
  #   Turing (T4, ...)                  baseten/text-embeddings-inference-mirror:turing-1.6
  #   Ampere 80 (A100, A30)             baseten/text-embeddings-inference-mirror:1.6
  #   Ampere 86 (A10, A10G, A40, ...)   baseten/text-embeddings-inference-mirror:86-1.6
  #   Ada Lovelace (L4, ...)            baseten/text-embeddings-inference-mirror:89-1.6
  #   Hopper (H100/H100 40GB)           baseten/text-embeddings-inference-mirror:hopper-1.6
  image: baseten/text-embeddings-inference-mirror:89-1.6

model_metadata:
  # Same Hugging Face repo that build_commands clones below; keep both in sync.
  repo_id: BAAI/bge-base-en-v1.5

docker_server:
  # --port is kept equal to server_port, --model-id points at the directory
  # populated by build_commands, and --max-concurrent-requests is kept equal
  # to runtime.predict_concurrency.
  start_command: sh -c "text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size 32 --max-concurrent-requests 40 --max-batch-tokens 32768"
  readiness_endpoint: /health
  liveness_endpoint: /health
  # change to /rerank or /predict if you want to use the rerank or predict endpoint
  # https://huggingface.github.io/text-embeddings-inference/
  predict_endpoint: /v1/embeddings
  server_port: 7997

resources:
  # If the accelerator changes, change base_image.image to the matching tag.
  accelerator: L4
  use_gpu: true

model_name: text-embeddings-inference trussless

# optional step to download the weights of the model into the image
build_commands:
  - git clone https://huggingface.co/BAAI/bge-base-en-v1.5 /data/local-model

runtime:
  # Kept equal to --max-concurrent-requests in docker_server.start_command.
  predict_concurrency: 40

environment_variables:
  # NOTE(review): VLLM_LOGGING_LEVEL looks like a leftover from a vLLM-based
  # config; the start command here launches text-embeddings-router, which
  # presumably does not read it - confirm before removing.
  VLLM_LOGGING_LEVEL: WARNING
  # NOTE(review): nesting of hf_access_token was reconstructed from a
  # whitespace-stripped copy of this file - confirm it belongs under
  # environment_variables (and not, e.g., under a secrets mapping).
  hf_access_token: null