Commit
Avoid downloading engine more than once (#137)
When a TRT-LLM truss runs with more than one worker, each worker downloads the engine files when it starts. I've added a safeguard so the files are downloaded only once.
Timur Abishev authored Dec 22, 2023
1 parent 04c8301 commit d9e8104
Showing 8 changed files with 72 additions and 48 deletions.
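Each changed model.py wraps the engine download in "if not server_loaded():". The helper is imported from utils, but its implementation is not part of this diff. The sketch below shows one way such a guard could work, assuming a marker-file approach; the SERVER_LOADED_MARKER path and the mark_server_loaded helper are hypothetical and not taken from this commit.

# Hypothetical sketch only: utils.server_loaded is not shown in this commit,
# and the real implementation may differ.
from pathlib import Path

# Assumed marker file visible to every worker process in the same container.
SERVER_LOADED_MARKER = Path("/tmp/.triton_server_loaded")


def server_loaded() -> bool:
    """Return True if an earlier worker already downloaded the engine and started Triton."""
    return SERVER_LOADED_MARKER.exists()


def mark_server_loaded() -> None:
    """Record that the download and server start have happened, so later workers skip them."""
    SERVER_LOADED_MARKER.touch()

Under this assumption, the first worker to reach load() downloads the engine and would call mark_server_loaded() after starting the server; subsequent workers see the marker and skip the download.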
15 changes: 9 additions & 6 deletions llama/llama-2-7b-trt-llm/model/model.py
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions (file path not shown)
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions (file path not shown)
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions mistral/mistral-7b-instruct-chat-trt-llm/model/model.py
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions mistral/mistral-7b-trt-llm-build-engine/model/model.py
@@ -7,7 +7,7 @@
 from build_engine_utils import BuildConfig, build_engine
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -46,11 +46,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
         if "engine_build" in self._config["model_metadata"]:
             if not is_external_engine_repo:
15 changes: 9 additions & 6 deletions (file path not shown)
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions mistral/mixtral-8x7b-instruct-trt-llm/model/model.py
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
         if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
15 changes: 9 additions & 6 deletions templates/trt-llm/model/model.py
@@ -6,7 +6,7 @@
 import numpy as np
 from client import TritonClient, UserData
 from transformers import AutoTokenizer
-from utils import download_engine, prepare_grpc_tensor
+from utils import download_engine, prepare_grpc_tensor, server_loaded
 
 TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/")
 
@@ -45,11 +45,14 @@ def load(self):
 
         # Download model from Hugging Face Hub if specified
        if is_external_engine_repo:
-            download_engine(
-                engine_repository=self._config["model_metadata"]["engine_repository"],
-                fp=self._data_dir,
-                auth_token=hf_access_token,
-            )
+            if not server_loaded():
+                download_engine(
+                    engine_repository=self._config["model_metadata"][
+                        "engine_repository"
+                    ],
+                    fp=self._data_dir,
+                    auth_token=hf_access_token,
+                )
 
         # Load Triton Server and model
         tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"]
