diff --git a/examples/torch_tensorrt/torchcompile/README.md b/examples/torch_tensorrt/torchcompile/README.md
new file mode 100644
index 0000000000..754dc43a63
--- /dev/null
+++ b/examples/torch_tensorrt/torchcompile/README.md
@@ -0,0 +1,123 @@
+# TorchServe inference with torch.compile using the TensorRT backend
+
+This example shows how to run TorchServe inference with a [Torch-TensorRT](https://github.com/pytorch/TensorRT) model compiled with `torch.compile`.
+
+### Pre-requisites
+
+- Verified to be working with `torch-tensorrt==2.3.0`
+
+Change to the example directory: `cd examples/torch_tensorrt/torchcompile`
+
+### torch.compile config
+
+To use the `tensorrt` backend with `torch.compile`, specify the following config in `model-config.yaml`:
+
+```
+pt2:
+  compile:
+    enable: true
+    backend: tensorrt
+```
+
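+In effect, this config makes TorchServe wrap the loaded model with `torch.compile(model, backend="tensorrt")`. Below is a minimal standalone sketch of the same call (assuming a CUDA GPU with `torch-tensorrt` installed; it mirrors what the config requests, not TorchServe's exact internals):
+
+```
+import torch
+import torch_tensorrt  # ensures the "tensorrt" compile backend is registered
+from torchvision.models import resnet50
+
+model = resnet50().eval().cuda()
+model = torch.compile(model, backend="tensorrt")
+
+x = torch.randn(1, 3, 224, 224, device="cuda")
+with torch.no_grad():
+    out = model(x)  # the first call triggers TensorRT engine compilation
+```
+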
+### Download the weights
+
+```
+wget https://download.pytorch.org/models/resnet50-11ad3fa6.pth
+```
+
+### Create model archive
+
+```
+mkdir model_store
+
+torch-model-archiver --model-name res50-trt --handler image_classifier --version 1.0 --model-file model.py --serialized-file resnet50-11ad3fa6.pth --config-file model-config.yaml --extra-files ../../image_classifier/index_to_name.json --export-path model_store -f
+```
+
+### Start TorchServe
+
+```
+torchserve --start --model-store model_store --models res50-trt=res50-trt.mar --disable-token --ncs
+```
+
+### Run inference
+
+```
+curl http://127.0.0.1:8080/predictions/res50-trt -T ../../image_classifier/kitten.jpg
+```
+
+This produces the following output:
+
+```
+{
+  "tabby": 0.27221813797950745,
+  "tiger_cat": 0.13754481077194214,
+  "Egyptian_cat": 0.04620043560862541,
+  "lynx": 0.003195191267877817,
+  "lens_cap": 0.00225762533955276
+}
+```
+
+## Measuring speedup
+
+The `profile: true` handler option in `model-config.yaml` makes TorchServe emit handler-level timing metrics. With the `tensorrt` backend enabled, the logs show:
+
+```
+2024-06-22T18:40:52,651 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_preprocess.Milliseconds:6.462495803833008|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081652,edac5623-7904-47a9-b6f6-bdcc5f8590ed, pattern=[METRICS]
+2024-06-22T18:40:52,653 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_inference.Milliseconds:1.600767970085144|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081652,edac5623-7904-47a9-b6f6-bdcc5f8590ed, pattern=[METRICS]
+2024-06-22T18:40:52,653 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_postprocess.Milliseconds:0.21452799439430237|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081652,edac5623-7904-47a9-b6f6-bdcc5f8590ed, pattern=[METRICS]
+```
+
+To switch to PyTorch eager, remove the `pt2` config from `model-config.yaml` or set:
+
+```
+pt2:
+  compile:
+    enable: false
+```
+
+If we disable `torch.compile` and use PyTorch eager, we see the following:
+
+```
+2024-06-22T18:42:32,540 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_preprocess.Milliseconds:6.869855880737305|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081752,1eb885cf-c857-4d9e-b2f8-27ec70311e32, pattern=[METRICS]
+2024-06-22T18:42:32,545 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_inference.Milliseconds:5.565248012542725|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081752,1eb885cf-c857-4d9e-b2f8-27ec70311e32, pattern=[METRICS]
+2024-06-22T18:42:32,546 [INFO ] W-9000-res50-trt_1.0-stdout org.pytorch.serve.wlm.WorkerLifeCycle - result=[METRICS]ts_handler_postprocess.Milliseconds:0.16128000617027283|#ModelName:res50-trt,Level:Model|#type:GAUGE|#hostname:ip-172-31-4-205,1719081752,1eb885cf-c857-4d9e-b2f8-27ec70311e32, pattern=[METRICS]
+```
+
+We see that `torch.compile` with the `tensorrt` backend reduces model inference latency from about `5.57 ms` to `1.60 ms`, a roughly 3.5x speedup.
+Note that `torch.compile` is a JIT compiler: it takes a few iterations (1-3) to warm up before you see the full speedup.
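+
+A minimal sketch of how you might observe this warmup behavior outside TorchServe (the iteration count and timing loop are illustrative, not part of this example):
+
+```
+import time
+
+import torch
+import torch_tensorrt  # ensures the "tensorrt" compile backend is registered
+from torchvision.models import resnet50
+
+model = torch.compile(resnet50().eval().cuda(), backend="tensorrt")
+x = torch.randn(1, 3, 224, 224, device="cuda")
+
+with torch.no_grad():
+    for i in range(5):
+        start = time.perf_counter()
+        model(x)
+        torch.cuda.synchronize()  # wait for GPU work before reading the clock
+        print(f"iteration {i}: {(time.perf_counter() - start) * 1000:.2f} ms")
+```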
+
diff --git a/examples/torch_tensorrt/torchcompile/model-config.yaml b/examples/torch_tensorrt/torchcompile/model-config.yaml
new file mode 100644
index 0000000000..7696590f83
--- /dev/null
+++ b/examples/torch_tensorrt/torchcompile/model-config.yaml
@@ -0,0 +1,8 @@
+minWorkers: 1
+maxWorkers: 1
+pt2:
+  compile:
+    enable: true
+    backend: tensorrt
+handler:
+  profile: true
diff --git a/examples/torch_tensorrt/torchcompile/model.py b/examples/torch_tensorrt/torchcompile/model.py
new file mode 100644
index 0000000000..ac61782d3a
--- /dev/null
+++ b/examples/torch_tensorrt/torchcompile/model.py
@@ -0,0 +1,7 @@
+from torchvision.models.resnet import Bottleneck, ResNet
+
+
+class ImageClassifier(ResNet):
+    def __init__(self):
+        # Bottleneck blocks in a [3, 4, 6, 3] layout make this a ResNet-50
+        super(ImageClassifier, self).__init__(Bottleneck, [3, 4, 6, 3])
diff --git a/examples/torch_tensorrt/torchcompile/requirements.txt b/examples/torch_tensorrt/torchcompile/requirements.txt
new file mode 100644
index 0000000000..17bfdf15bc
--- /dev/null
+++ b/examples/torch_tensorrt/torchcompile/requirements.txt
@@ -0,0 +1 @@
+torch_tensorrt>=2.3.0
diff --git a/examples/torch_tensorrt/README.md b/examples/torch_tensorrt/torchscript/README.md
similarity index 80%
rename from examples/torch_tensorrt/README.md
rename to examples/torch_tensorrt/torchscript/README.md
index d621f33526..0117103c54
--- a/examples/torch_tensorrt/README.md
+++ b/examples/torch_tensorrt/torchscript/README.md
@@ -1,6 +1,6 @@
-# TorchServe inference with torch tensorrt model
+# TorchServe inference with a Torch-TensorRT (TorchScript) model
 
-This example shows how to run TorchServe inference with [Torch-TensorRT](https://github.com/pytorch/TensorRT) model
+This example shows how to run TorchServe inference with a [Torch-TensorRT](https://github.com/pytorch/TensorRT) model using TorchScript. This is the legacy way of using TensorRT with PyTorch; TorchScript is in maintenance mode, and we recommend `torch.compile` for new deployments (see [../torchcompile/README.md](../torchcompile/README.md)).
 
 ### Pre-requisites
diff --git a/examples/torch_tensorrt/requirements.txt b/examples/torch_tensorrt/torchscript/requirements.txt
similarity index 100%
rename from examples/torch_tensorrt/requirements.txt
rename to examples/torch_tensorrt/torchscript/requirements.txt
diff --git a/examples/torch_tensorrt/resnet_tensorrt.py b/examples/torch_tensorrt/torchscript/resnet_tensorrt.py
similarity index 100%
rename from examples/torch_tensorrt/resnet_tensorrt.py
rename to examples/torch_tensorrt/torchscript/resnet_tensorrt.py
diff --git a/ts/utils/util.py b/ts/utils/util.py
index 39cdfd66a0..62d6a2c6bd
--- a/ts/utils/util.py
+++ b/ts/utils/util.py
@@ -27,6 +27,7 @@ class PT2Backend(str, enum.Enum):
     IPEX = "ipex"
     TORCHXLA_TRACE_ONCE = "torchxla_trace_once"
     OPENVINO = "openvino"
+    TENSORRT = "tensorrt"
     HPU_BACKEND = "hpu_backend"
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 1f077cf66a..43446f168f
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1254,3 +1254,4 @@ parallelLevel
 parallelType
 parallelization
 pptp
+torchcompile