NVIDIA · buddhapuneeth · Oct 15, 2023 · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/.clang-format b/.clang-format
@@ -59,6 +59,7 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 60
 PointerAlignment: Left
+QualifierAlignment: Right
 ReflowComments:  true
 SeparateDefinitionBlocks: Always
 SortIncludes:    CaseSensitive

diff --git a/.dockerignore b/.dockerignore
@@ -1,5 +1,7 @@
 build
 cpp/*build*
+cpp/cmake-*
+cpp/.ccache
 cpp/tests/resources/models
 tensorrt_llm/libs
 **/__pycache__

diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,4 @@
 *.a filter=lfs diff=lfs merge=lfs -text
+*.lib filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.dll filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,116 @@
+name: "Bug Report"
+description: Submit a bug report to help us improve TensorRT-LLM
+labels: [ "bug" ]
+body:
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us.
+      placeholder: |
+        - CPU architecture (e.g., x86_64, aarch64)
+        - CPU/Host memory size (if known)
+        - GPU properties
+          - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S)
+          - GPU memory size (if known)
+          - Clock frequencies used (if applicable)
+        - Libraries
+          - TensorRT-LLM branch or tag (e.g., main, v0.7.1)
+          - TensorRT-LLM commit (if known)
+          - Versions of TensorRT, Modelopt, CUDA, cuBLAS, etc. used
+          - Container used (if running TensorRT-LLM in a container)
+        - NVIDIA driver version
+        - OS (Ubuntu 22.04, CentOS 7, Windows 10)
+        - Any other information that may be useful in reproducing the bug
+    validations:
+      required: true
+
+  - type: textarea
+    id: who-can-help
+    attributes:
+      label: Who can help?
+      description: |
+        To expedite the response to your issue, it would be helpful if you could identify the appropriate person
+        to tag using the **@** symbol. Here is a general guideline on **whom to tag**.
+
+        Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag,
+        you can leave it blank, and a core maintainer will make sure to involve the appropriate person.
+
+        Please tag fewer than 3 people.
+
+        Quantization: @Tracin
+
+        Documentation: @juney-nvidia
+
+        Feature request: @ncomly-nvidia
+
+        Performance: @kaiyux
+
+        Others: @byshiue
+
+      placeholder: "@Username ..."
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: 'The problem arises when using:'
+      options:
+        - label: "The official example scripts"
+        - label: "My own modified scripts"
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Tasks
+      description: "The tasks I am working on are:"
+      options:
+        - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
+        - label: "My own task or dataset (give details below)"
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Kindly share a code example that demonstrates the issue you encountered. It is recommended to provide a code snippet directly.
+        Additionally, if you have any error messages, or stack traces related to the problem, please include them here.
+
+        Remember to use code tags to properly format your code. You can refer to the
+        link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting.
+
+        Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code.
+        It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+
+          1.
+          2.
+          3.
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible."
+
+  - type: textarea
+    id: actual-behavior
+    validations:
+      required: true
+    attributes:
+      label: Actual behavior
+      description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible."
+
+  - type: textarea
+    id: additional-notes
+    validations:
+      required: true
+    attributes:
+      label: Additional notes
+      description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)."
diff --git a/.github/workflows/auto_close_inactive_issues.yml b/.github/workflows/auto_close_inactive_issues.yml
@@ -0,0 +1,25 @@
+# Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "30 1 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v9
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 15
+          stale-issue-label: "stale"
+          exempt-issue-labels: ""
+          stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days."
+          close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          debug-only: false
diff --git a/.gitignore b/.gitignore
@@ -5,20 +5,55 @@ __pycache__/
 *.cache
 *.nsys-rep
 .VSCodeCounter
-build*/
-*.so
+cpp/build*
+build
+!tensorrt_llm/bench/build
+!builders/
 *.egg-info/
 .coverage
-*.csv
 *.onnx
 tmp/
 venv/
 .venv/
 .local/
 .hypothesis/
 .idea/
+dump*/
+.trt-internal
+*.dot
+*.prof
+*.log
+*.pkl
+*.hdf5
+*.lock
+config.json
+/*.svg
 cpp/cmake-build-*
+cpp/.ccache
+tensorrt_llm/bin
+tensorrt_llm/libs
+tensorrt_llm/bindings.*.so
+tensorrt_llm/bindings.pyi
+tensorrt_llm/bindings/**/*.pyi
+*docs/cpp_docs*
+*docs/source/_cpp_gen*
+docs/source/llm-api/*.rst
+docs/source/llm-api-examples/llm_*.rst
+*.swp
 
 # Testing
 .coverage.*
 results_trt/
+
+# build/debug
+*.safetensors
+*/tllm_debug/**
+*.patch
+
+# Generated files
+cpp/include/tensorrt_llm/executor/version.h
+
+# User config files
+CMakeUserPresets.json
+compile_commands.json
+*.bin
diff --git a/.gitmodules b/.gitmodules
@@ -1,7 +1,6 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
-	branch = v2.10.0
 [submodule "3rdparty/json"]
 	path = 3rdparty/json
 	url = https://github.com/nlohmann/json.git
@@ -12,3 +11,9 @@
 [submodule "3rdparty/NVTX"]
 	path = 3rdparty/NVTX
 	url = https://github.com/NVIDIA/NVTX.git
+[submodule "3rdparty/ucxx"]
+	path = 3rdparty/ucxx
+	url = https://github.com/rapidsai/ucxx.git
+[submodule "3rdparty/pybind11"]
+	path = 3rdparty/pybind11
+	url = https://github.com/pybind/pybind11.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,7 +15,8 @@ repos:
     rev: v4.1.0
     hooks:
     -   id: check-added-large-files
-        exclude: 'cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin'
+        exclude: |
+            (?x)^(.*cubin.cpp)$
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
@@ -33,10 +34,17 @@ repos:
     -   id: clang-format
         types_or: [c++, c, cuda]
         exclude: |
-            (?x)^(
-                cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/.*
-            )$
+            (?x)^(.*cubin.cpp$ | .*fmha_cubin.h)$
 -   repo: https://github.com/cheshirekow/cmake-format-precommit
     rev: v0.6.10
     hooks:
     -   id: cmake-format
+-   repo: https://github.com/codespell-project/codespell
+    rev: v2.2.4
+    hooks:
+    -   id: codespell
+        args:
+        - --skip=".git,3rdparty"
+        - --exclude-file=examples/whisper/tokenizer.py
+        - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe
+        exclude: 'tests/llm-test-defs/turtle/test_input_files'
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
diff --git a/3rdparty/json b/3rdparty/json
diff --git a/3rdparty/pybind11 b/3rdparty/pybind11
diff --git a/3rdparty/ucxx b/3rdparty/ucxx