From fe2c83a5232eb711db4732fb6b537020f252bc31 Mon Sep 17 00:00:00 2001
From: HolyWu <holywu@gmail.com>
Date: Wed, 4 Dec 2024 23:50:32 +0800
Subject: [PATCH 01/11] Get decompositions only for CIA ops (#3297)

---
 py/torch_tensorrt/dynamo/lowering/_decompositions.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py
index 0c4e19d902..945d1d0d2a 100644
--- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py
+++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py
@@ -4,8 +4,11 @@
 
 import torch
 from torch._decomp import register_decomposition
+from torch._export.utils import (
+    _collect_all_valid_cia_ops_for_aten_namespace,
+    _get_decomp_for_cia,
+)
 from torch._ops import OpOverload
-from torch.export import default_decompositions
 from torch_tensorrt.dynamo._defaults import default_device
 from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim
 from torch_tensorrt.dynamo.utils import to_torch_device
@@ -432,7 +435,10 @@ def get_decompositions(
         return {**CORE_ATEN_DECOMPOSITIONS_FILTERED, **TORCH_TRT_DECOMPOSITIONS}
     else:
         # changes made here due to torch2.6 changes https://github.com/pytorch/pytorch/pull/135080
-        decomp_table = default_decompositions()
+        decomp_table = {}
+        for op in _collect_all_valid_cia_ops_for_aten_namespace():
+            decomp_table[op] = _get_decomp_for_cia(op)
+
         DECOMP_TABLE_FILTERED: Dict[OpOverload, Callable[[Any], Any]] = {
             decomp: decomp_table[decomp]
             for decomp in decomp_table

From f37151cadbe0539da1faf661db6c0075fc1998b5 Mon Sep 17 00:00:00 2001
From: Naren Dasan <1790613+narendasan@users.noreply.github.com>
Date: Wed, 4 Dec 2024 09:05:06 -0700
Subject: [PATCH 02/11] Update docgen.yml base container

Updated to pytorch/manylinux2_28-builder:cuda12.6
---
 .github/workflows/docgen.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index 593c4e90f0..29f8e83679 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -14,7 +14,7 @@ jobs:
     if: ${{ ! contains(github.actor, 'pytorchbot') }}
     environment: pytorchbot-env
     container:
-      image: docker.io/pytorch/manylinux-builder:cuda12.4
+      image: docker.io/pytorch/manylinux2_28-builder:cuda12.6
       options: --gpus all
     env:
       CUDA_HOME: /usr/local/cuda-12.4

From b1f01b20612c74ec0b5dfb82ee57608d4564921d Mon Sep 17 00:00:00 2001
From: Naren Dasan <1790613+narendasan@users.noreply.github.com>
Date: Wed, 4 Dec 2024 09:51:22 -0700
Subject: [PATCH 03/11] Update docgen.yml

---
 .github/workflows/docgen.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index 29f8e83679..df8422e260 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -17,9 +17,9 @@ jobs:
       image: docker.io/pytorch/manylinux2_28-builder:cuda12.6
       options: --gpus all
     env:
-      CUDA_HOME: /usr/local/cuda-12.4
-      VERSION_SUFFIX: cu124
-      CU_VERSION: cu124
+      CUDA_HOME: /usr/local/cuda-12.6
+      VERSION_SUFFIX: cu126
+      CU_VERSION: cu126
       CHANNEL: nightly
       CI_BUILD: 1
     steps:

From 13a2f64720b6ed4028d472d5d466318f9eb80ea3 Mon Sep 17 00:00:00 2001
From: Naren Dasan <1790613+narendasan@users.noreply.github.com>
Date: Wed, 4 Dec 2024 10:45:35 -0700
Subject: [PATCH 04/11] Update docgen.yml

---
 .github/workflows/docgen.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index df8422e260..ce7ffef5d6 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -46,7 +46,9 @@ jobs:
       - name: Generate New Docs
         run: |
           cd docsrc
-          yum install -y doxygen pandoc
+          dnf clean all
+          dnf update 
+          dnf install -y doxygen pandoc
           python3 -m pip install -r requirements.txt
           python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
           make html

From 7f22d7bf413ae3aaa38002e3cbac98a13121de42 Mon Sep 17 00:00:00 2001
From: Naren Dasan <1790613+narendasan@users.noreply.github.com>
Date: Wed, 4 Dec 2024 11:53:40 -0700
Subject: [PATCH 05/11] Update docgen.yml

---
 .github/workflows/docgen.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index ce7ffef5d6..72da08949b 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -48,7 +48,7 @@ jobs:
           cd docsrc
           dnf clean all
           dnf update 
-          dnf install -y doxygen pandoc
+          dnf install -y doxygen pandoc --allowerasing
           python3 -m pip install -r requirements.txt
           python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
           make html

From 5cca6522cedd9ff56101f4f9c8f7f6db108f23f3 Mon Sep 17 00:00:00 2001
From: Hoonkyung Cho <chohk88@gmail.com>
Date: Thu, 5 Dec 2024 21:16:40 +0900
Subject: [PATCH 06/11] fix: cumsum add_constant bug fix (add dtype for np
 zeros) (#3258)

Co-authored-by: Hoonkyung Cho <hoonkyungc@nvidia.com>
---
 .../dynamo/conversion/impl/slice/ops.py       |  2 +-
 py/torch_tensorrt/dynamo/utils.py             | 26 +++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
index b58435b489..3274d78c2b 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
@@ -370,7 +370,7 @@ def cumsum(
         )
     else:
         new_dims = tuple(data.shape)
-        zeros = np.zeros(new_dims)
+        zeros = np.zeros(new_dims, dtype=np.float32)
         zero_trttensor = get_trt_tensor(ctx, zeros, f"{name}_initial_value")
 
     running_sum = loop.add_recurrence(zero_trttensor)
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
index 85aa663809..95e5f30e4d 100644
--- a/py/torch_tensorrt/dynamo/utils.py
+++ b/py/torch_tensorrt/dynamo/utils.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+import warnings
 from dataclasses import fields, replace
 from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
@@ -10,6 +11,8 @@
 import tensorrt as trt
 import torch
 from torch._subclasses.fake_tensor import FakeTensor
+
+from packaging import version
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import dtype
 from torch_tensorrt._features import ENABLED_FEATURES
@@ -19,8 +22,6 @@
 from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
 from torch_tensorrt.dynamo._settings import CompilationSettings
 
-from packaging import version
-
 from .types import TRTDataType
 
 logger = logging.getLogger(__name__)
@@ -494,6 +495,27 @@ def parse_dynamo_kwargs(
         if "options" in kwargs and len(kwargs) == 1:
             kwargs = kwargs["options"]
 
+        if "truncate_long_and_double" in kwargs:
+            if (
+                "truncate_double" in kwargs
+                and kwargs["truncate_double"] is not _defaults.TRUNCATE_DOUBLE
+            ):
+                raise ValueError(
+                    'Provided configuration for "truncate_double" and deprecated API "truncate_long_and_double". '
+                    'Please only use "truncate_double".'
+                )
+            else:
+                kwargs["truncate_double"] = kwargs["truncate_long_and_double"]
+                warnings.warn(
+                    'Compiler option "truncate_long_and_double" is deprecated in favor of "truncate_double" as int64 is now natively supported. '
+                    "This option will be removed in the next version.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                del kwargs[
+                    "truncate_long_and_double"
+                ]  # Remove deprecated key after handling
+
         valid_attrs = {attr.name for attr in fields(settings)}
         valid_kwargs = {k: v for k, v in kwargs.items() if k in valid_attrs}
         settings = replace(settings, **valid_kwargs)

From 6f0b5be8a55ce0bbedb40e6e86a5741c67f31eac Mon Sep 17 00:00:00 2001
From: Naren Dasan <1790613+narendasan@users.noreply.github.com>
Date: Thu, 5 Dec 2024 12:04:12 -0700
Subject: [PATCH 07/11] Update docgen.yml

---
 .github/workflows/docgen.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index 72da08949b..fe2220fbbb 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -48,7 +48,7 @@ jobs:
           cd docsrc
           dnf clean all
           dnf update 
-          dnf install -y doxygen pandoc --allowerasing
+          dnf install -y doxygen pandoc --allowerasing --nobest
           python3 -m pip install -r requirements.txt
           python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
           make html

From 1af20d33cf03deb716fba4013844678d09c05653 Mon Sep 17 00:00:00 2001
From: Naren Dasan <naren@narendasan.com>
Date: Thu, 5 Dec 2024 13:34:17 -0800
Subject: [PATCH 08/11] chore: update install doxygen in the docs container

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
---
 .github/workflows/docgen.yml | 115 ++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index fe2220fbbb..8002e0bcac 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -5,64 +5,67 @@ name: Generate Docs
 # Controls when the action will run. Triggers the workflow on push or pull request
 # events but only for the master branch
 on:
-  push:
-    branches: [ main ]
+    push:
+        branches: [main]
 
 jobs:
-  build-docs:
-    runs-on: linux.g5.4xlarge.nvidia.gpu
-    if: ${{ ! contains(github.actor, 'pytorchbot') }}
-    environment: pytorchbot-env
-    container:
-      image: docker.io/pytorch/manylinux2_28-builder:cuda12.6
-      options: --gpus all
-    env:
-      CUDA_HOME: /usr/local/cuda-12.6
-      VERSION_SUFFIX: cu126
-      CU_VERSION: cu126
-      CHANNEL: nightly
-      CI_BUILD: 1
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          ref: ${{github.head_ref}}
-          token: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
-      - name: Select Python / CUDA
-        run: |
-          git config --global --add safe.directory /__w/TensorRT/TensorRT
-          echo "/opt/python/cp311-cp311/bin/" >> $GITHUB_PATH
+    build-docs:
+        runs-on: linux.g5.4xlarge.nvidia.gpu
+        if: ${{ ! contains(github.actor, 'pytorchbot') }}
+        environment: pytorchbot-env
+        container:
+            image: docker.io/pytorch/manylinux2_28-builder:cuda12.6
+            options: --gpus all
+        env:
+            CUDA_HOME: /usr/local/cuda-12.6
+            VERSION_SUFFIX: cu126
+            CU_VERSION: cu126
+            CHANNEL: nightly
+            CI_BUILD: 1
+        steps:
+            - uses: actions/checkout@v3
+              with:
+                  ref: ${{github.head_ref}}
+                  token: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
+            - name: Select Python / CUDA
+              run: |
+                  git config --global --add safe.directory /__w/TensorRT/TensorRT
+                  echo "/opt/python/cp311-cp311/bin/" >> $GITHUB_PATH
 
-      - name: Install base deps
-        run: |
-          python3 -m pip install pip --upgrade
-          python3 -m pip install pyyaml numpy torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu124
-          ./packaging/pre_build_script.sh
-      - name: Get HEAD SHA
-        id: vars
-        run: echo "sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
-      - name: Build Python Package
-        run: |
-          python3 -m pip install --pre . --extra-index-url https://download.pytorch.org/whl/nightly/cu124
-      - name: Generate New Docs
-        run: |
-          cd docsrc
-          dnf clean all
-          dnf update 
-          dnf install -y doxygen pandoc --allowerasing --nobest
-          python3 -m pip install -r requirements.txt
-          python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
-          make html
-          cd ..
-      - uses: stefanzweifel/git-auto-commit-action@v4
-        with:
-        # Required
-          commit_message: "docs: [Automated] Regenerating documenation for ${{ steps.vars.outputs.sha }}"
-          commit_options: "--no-verify --signoff"
-          file_pattern: docs/
-          commit_user_name: Torch-TensorRT Github Bot
-          commit_user_email: torch-tensorrt.github.bot@nvidia.com
-          commit_author: Torch-TensorRT Github Bot <torch-tensorrt.github.bot@nvidia.com>
+            - name: Install base deps
+              run: |
+                  python3 -m pip install pip --upgrade
+                  python3 -m pip install pyyaml numpy torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu124
+                  ./packaging/pre_build_script.sh
+            - name: Get HEAD SHA
+              id: vars
+              run: echo "sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+            - name: Build Python Package
+              run: |
+                  python3 -m pip install --pre . --extra-index-url https://download.pytorch.org/whl/nightly/cu124
+            - name: Generate New Docs
+              run: |
+                  cd docsrc
+                  dnf clean all
+                  dnf makecache --refresh
+                  dnf install yum-utils -y
+                  dnf config-manager --set-enabled powertools
+                  dnf update --skip-broken --nobest
+                  dnf install -y doxygen pandoc
+                  python3 -m pip install -r requirements.txt
+                  python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
+                  make html
+                  cd ..
+            - uses: stefanzweifel/git-auto-commit-action@v4
+              with:
+                  # Required
+                  commit_message: "docs: [Automated] Regenerating documenation for ${{ steps.vars.outputs.sha }}"
+                  commit_options: "--no-verify --signoff"
+                  file_pattern: docs/
+                  commit_user_name: Torch-TensorRT Github Bot
+                  commit_user_email: torch-tensorrt.github.bot@nvidia.com
+                  commit_author: Torch-TensorRT Github Bot <torch-tensorrt.github.bot@nvidia.com>
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref_name }}
-  cancel-in-progress: true
+    group: ${{ github.workflow }}-${{ github.ref_name }}
+    cancel-in-progress: true

From 62d4fcb041040e5d12cccd2abf2474a1398804c1 Mon Sep 17 00:00:00 2001
From: Naren Dasan <naren@narendasan.com>
Date: Thu, 5 Dec 2024 13:54:01 -0800
Subject: [PATCH 09/11] chore: dropped -y in update

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
---
 .github/workflows/docgen.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index 8002e0bcac..bb6adc1034 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -50,7 +50,7 @@ jobs:
                   dnf makecache --refresh
                   dnf install yum-utils -y
                   dnf config-manager --set-enabled powertools
-                  dnf update --skip-broken --nobest
+                  dnf update --skip-broken --nobest -y
                   dnf install -y doxygen pandoc
                   python3 -m pip install -r requirements.txt
                   python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"

From 5dbb84835512129ee55ab5deebfff7df7254a0f3 Mon Sep 17 00:00:00 2001
From: "Zewen (Evan) Li" <zewenl@nvidia.com>
Date: Fri, 6 Dec 2024 09:47:58 -0800
Subject: [PATCH 10/11] fix: change docker img from manylinux to manylinux2_28
 for all CUDA versions (#3312)

---
 .../scripts/generate_binary_build_matrix.py   | 701 ++++++++++++++++++
 .github/workflows/build-test-linux.yml        |   2 +-
 .../workflows/build-test-tensorrt-linux.yml   |   2 +-
 .../workflows/build-test-tensorrt-windows.yml |   2 +-
 .github/workflows/build-test-windows.yml      |   2 +-
 .../generate_binary_build_matrix.yml          | 112 +++
 .github/workflows/release-linux.yml           |   2 +-
 .github/workflows/release-windows.yml         |   2 +-
 8 files changed, 819 insertions(+), 6 deletions(-)
 create mode 100644 .github/scripts/generate_binary_build_matrix.py
 create mode 100644 .github/workflows/generate_binary_build_matrix.yml

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
new file mode 100644
index 0000000000..4ba7e0faeb
--- /dev/null
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -0,0 +1,701 @@
+#!/usr/bin/env python3
+
+"""Generates a matrix to be utilized through github actions
+
+Will output a condensed version of the matrix if on a pull request that only
+includes the latest version of python we support built on four different
+architectures:
+    * CPU
+    * Latest CUDA
+    * Latest ROCM
+    * Latest XPU
+"""
+
+
+import argparse
+import json
+import os
+import sys
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+PYTHON_ARCHES_DICT = {
+    "nightly": ["3.9", "3.10", "3.11", "3.12"],
+    "test": ["3.9", "3.10", "3.11", "3.12"],
+    "release": ["3.9", "3.10", "3.11", "3.12"],
+}
+CUDA_ARCHES_DICT = {
+    "nightly": ["11.8", "12.4", "12.6"],
+    "test": ["11.8", "12.1", "12.4"],
+    "release": ["11.8", "12.1", "12.4"],
+}
+ROCM_ARCHES_DICT = {
+    "nightly": ["6.1", "6.2"],
+    "test": ["6.1", "6.2"],
+    "release": ["6.1", "6.2"],
+}
+
+CUDA_CUDDN_VERSIONS = {
+    "11.8": {"cuda": "11.8.0", "cudnn": "9"},
+    "12.1": {"cuda": "12.1.1", "cudnn": "9"},
+    "12.4": {"cuda": "12.4.1", "cudnn": "9"},
+    "12.6": {"cuda": "12.6.2", "cudnn": "9"},
+}
+
+PACKAGE_TYPES = ["wheel", "conda", "libtorch"]
+PRE_CXX11_ABI = "pre-cxx11"
+CXX11_ABI = "cxx11-abi"
+RELEASE = "release"
+DEBUG = "debug"
+NIGHTLY = "nightly"
+TEST = "test"
+
+# OS constants
+LINUX = "linux"
+LINUX_AARCH64 = "linux-aarch64"
+MACOS_ARM64 = "macos-arm64"
+WINDOWS = "windows"
+
+# Accelerator architectures
+CPU = "cpu"
+CPU_AARCH64 = "cpu-aarch64"
+CUDA_AARCH64 = "cuda-aarch64"
+CUDA = "cuda"
+ROCM = "rocm"
+XPU = "xpu"
+
+
+CURRENT_NIGHTLY_VERSION = "2.6.0"
+CURRENT_CANDIDATE_VERSION = "2.5.1"
+CURRENT_STABLE_VERSION = "2.5.1"
+CURRENT_VERSION = CURRENT_STABLE_VERSION
+
+# By default use Nightly for the CUDA, ROCM and Python arches
+CUDA_ARCHES = CUDA_ARCHES_DICT[NIGHTLY]
+ROCM_ARCHES = ROCM_ARCHES_DICT[NIGHTLY]
+PYTHON_ARCHES = PYTHON_ARCHES_DICT[NIGHTLY]
+
+# Container images
+LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str]
+WHEEL_CONTAINER_IMAGES: Dict[str, str]
+
+LINUX_GPU_RUNNER = "linux.g5.4xlarge.nvidia.gpu"
+LINUX_CPU_RUNNER = "linux.2xlarge"
+LINUX_AARCH64_RUNNER = "linux.arm64.2xlarge"
+LINUX_AARCH64_GPU_RUNNER = "linux.arm64.m7g.4xlarge"
+WIN_GPU_RUNNER = "windows.g4dn.xlarge"
+WIN_CPU_RUNNER = "windows.4xlarge"
+MACOS_M1_RUNNER = "macos-m1-stable"
+
+PACKAGES_TO_INSTALL_WHL = "torch torchvision torchaudio"
+WHL_INSTALL_BASE = "pip3 install"
+DOWNLOAD_URL_BASE = "https://download.pytorch.org"
+
+ENABLE = "enable"
+DISABLE = "disable"
+
+
+def arch_type(arch_version: str) -> str:
+    if arch_version in CUDA_ARCHES:
+        return CUDA
+    elif arch_version in ROCM_ARCHES:
+        return ROCM
+    elif arch_version == CPU_AARCH64:
+        return CPU_AARCH64
+    elif arch_version == CUDA_AARCH64:
+        return CUDA_AARCH64
+    elif arch_version == XPU:
+        return XPU
+    else:  # arch_version should always be CPU in this case
+        return CPU
+
+
+def validation_runner(arch_type: str, os: str) -> str:
+    if os == LINUX:
+        if arch_type == CUDA:
+            return LINUX_GPU_RUNNER
+        else:
+            return LINUX_CPU_RUNNER
+    elif os == LINUX_AARCH64:
+        if arch_type == CUDA_AARCH64:
+            return LINUX_AARCH64_GPU_RUNNER
+        else:
+            return LINUX_AARCH64_RUNNER
+    elif os == WINDOWS:
+        if arch_type == CUDA:
+            return WIN_GPU_RUNNER
+        else:
+            return WIN_CPU_RUNNER
+    elif os == MACOS_ARM64:
+        return MACOS_M1_RUNNER
+    else:  # default to linux cpu runner
+        return LINUX_CPU_RUNNER
+
+
+def initialize_globals(channel: str, build_python_only: bool) -> None:
+    global CURRENT_VERSION, CUDA_ARCHES, ROCM_ARCHES, PYTHON_ARCHES
+    global WHEEL_CONTAINER_IMAGES, LIBTORCH_CONTAINER_IMAGES
+    if channel == TEST:
+        CURRENT_VERSION = CURRENT_CANDIDATE_VERSION
+    else:
+        CURRENT_VERSION = CURRENT_STABLE_VERSION
+
+    CUDA_ARCHES = CUDA_ARCHES_DICT[channel]
+    ROCM_ARCHES = ROCM_ARCHES_DICT[channel]
+    if build_python_only:
+        # Only select the oldest version of python if building a python only package
+        PYTHON_ARCHES = [PYTHON_ARCHES_DICT[channel][0]]
+    else:
+        PYTHON_ARCHES = PYTHON_ARCHES_DICT[channel]
+    WHEEL_CONTAINER_IMAGES = {
+        "11.8": "pytorch/manylinux2_28-builder:cuda11.8",
+        "12.1": "pytorch/manylinux2_28-builder:cuda12.1",
+        "12.4": "pytorch/manylinux2_28-builder:cuda12.4",
+        "12.6": "pytorch/manylinux2_28-builder:cuda12.6",
+        **{
+            gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}"
+            for gpu_arch in ROCM_ARCHES
+        },
+        CPU: "pytorch/manylinux-builder:cpu",
+        XPU: "pytorch/manylinux2_28-builder:xpu",
+        # TODO: Migrate CUDA_AARCH64 image to manylinux2_28_aarch64-builder:cuda12.4
+        CPU_AARCH64: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64",
+        CUDA_AARCH64: "pytorch/manylinuxaarch64-builder:cuda12.4",
+    }
+    LIBTORCH_CONTAINER_IMAGES = {
+        **{
+            (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:cuda{gpu_arch}"
+            for gpu_arch in CUDA_ARCHES
+        },
+        **{
+            (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}"
+            for gpu_arch in CUDA_ARCHES
+        },
+        **{
+            (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:rocm{gpu_arch}"
+            for gpu_arch in ROCM_ARCHES
+        },
+        **{
+            (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}"
+            for gpu_arch in ROCM_ARCHES
+        },
+        (CPU, PRE_CXX11_ABI): "pytorch/manylinux-builder:cpu",
+        (CPU, CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu",
+    }
+
+
+def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
+    return {
+        CPU: "cpu",
+        CPU_AARCH64: CPU,
+        CUDA_AARCH64: "cu124",
+        CUDA: f"cu{gpu_arch_version.replace('.', '')}",
+        ROCM: f"rocm{gpu_arch_version}",
+        XPU: "xpu",
+    }.get(gpu_arch_type, gpu_arch_version)
+
+
+def list_without(in_list: List[str], without: List[str]) -> List[str]:
+    return [item for item in in_list if item not in without]
+
+
+def get_base_download_url_for_repo(
+    repo: str, channel: str, gpu_arch_type: str, desired_cuda: str
+) -> str:
+    base_url_for_type = f"{DOWNLOAD_URL_BASE}/{repo}"
+    base_url_for_type = (
+        base_url_for_type if channel == RELEASE else f"{base_url_for_type}/{channel}"
+    )
+
+    if gpu_arch_type != CPU:
+        base_url_for_type = f"{base_url_for_type}/{desired_cuda}"
+    else:
+        base_url_for_type = f"{base_url_for_type}/{gpu_arch_type}"
+
+    return base_url_for_type
+
+
+def get_libtorch_install_command(
+    os: str,
+    channel: str,
+    gpu_arch_type: str,
+    libtorch_variant: str,
+    devtoolset: str,
+    desired_cuda: str,
+    libtorch_config: str,
+) -> str:
+    prefix = "libtorch" if os != WINDOWS else "libtorch-win"
+    _libtorch_variant = (
+        f"{libtorch_variant}-{libtorch_config}"
+        if libtorch_config == "debug"
+        else libtorch_variant
+    )
+    build_name = (
+        f"{prefix}-{devtoolset}-{_libtorch_variant}-latest.zip"
+        if devtoolset == "cxx11-abi"
+        else f"{prefix}-{_libtorch_variant}-latest.zip"
+    )
+
+    if os == MACOS_ARM64:
+        arch = "arm64"
+        build_name = f"libtorch-macos-{arch}-latest.zip"
+        if channel in [RELEASE, TEST]:
+            build_name = f"libtorch-macos-{arch}-{CURRENT_VERSION}.zip"
+
+    elif os == LINUX and (channel in (RELEASE, TEST)):
+        build_name = (
+            f"{prefix}-{devtoolset}-{_libtorch_variant}-{CURRENT_VERSION}%2B{desired_cuda}.zip"
+            if devtoolset == "cxx11-abi"
+            else f"{prefix}-{_libtorch_variant}-{CURRENT_VERSION}%2B{desired_cuda}.zip"
+        )
+    elif os == WINDOWS and (channel in (RELEASE, TEST)):
+        build_name = (
+            f"{prefix}-shared-with-deps-debug-{CURRENT_VERSION}%2B{desired_cuda}.zip"
+            if libtorch_config == "debug"
+            else f"{prefix}-shared-with-deps-{CURRENT_VERSION}%2B{desired_cuda}.zip"
+        )
+    elif os == WINDOWS and channel == NIGHTLY:
+        build_name = (
+            f"{prefix}-shared-with-deps-debug-latest.zip"
+            if libtorch_config == "debug"
+            else f"{prefix}-shared-with-deps-latest.zip"
+        )
+
+    return f"{get_base_download_url_for_repo('libtorch', channel, gpu_arch_type, desired_cuda)}/{build_name}"
+
+
+def get_wheel_install_command(
+    os: str,
+    channel: str,
+    gpu_arch_type: str,
+    gpu_arch_version: str,
+    desired_cuda: str,
+    python_version: str,
+    use_only_dl_pytorch_org: bool,
+    use_split_build: bool = False,
+) -> str:
+    if use_split_build:
+        if (gpu_arch_version in CUDA_ARCHES) and (os == LINUX) and (channel == NIGHTLY):
+            return f"{WHL_INSTALL_BASE} {PACKAGES_TO_INSTALL_WHL} --index-url {get_base_download_url_for_repo('whl', channel, gpu_arch_type, desired_cuda)}_pypi_pkg"  # noqa: E501
+        else:
+            raise ValueError(
+                "Split build is not supported for this configuration. It is only supported for CUDA 11.8, 12.4, 12.6 on Linux nightly builds."  # noqa: E501
+            )
+    if (
+        channel == RELEASE
+        and (not use_only_dl_pytorch_org)
+        and (
+            (gpu_arch_version == "12.4" and os == LINUX)
+            or (gpu_arch_type == CPU and os in [WINDOWS, MACOS_ARM64])
+            or (os == LINUX_AARCH64)
+        )
+    ):
+        return f"{WHL_INSTALL_BASE} {PACKAGES_TO_INSTALL_WHL}"
+    else:
+        whl_install_command = (
+            f"{WHL_INSTALL_BASE} --pre {PACKAGES_TO_INSTALL_WHL}"
+            if channel == "nightly"
+            else f"{WHL_INSTALL_BASE} {PACKAGES_TO_INSTALL_WHL}"
+        )
+        return f"{whl_install_command} --index-url {get_base_download_url_for_repo('whl', channel, gpu_arch_type, desired_cuda)}"  # noqa: E501
+
+
+def generate_conda_matrix(
+    os: str,
+    channel: str,
+    with_cuda: str,
+    with_rocm: str,
+    with_cpu: str,
+    with_xpu: str,
+    limit_pr_builds: bool,
+    use_only_dl_pytorch_org: bool,
+    use_split_build: bool = False,
+    python_versions: Optional[List[str]] = None,
+) -> List[Dict[str, str]]:
+    ret: List[Dict[str, str]] = []
+    # return empty list. Conda builds are deprecated, see https://github.com/pytorch/pytorch/issues/138506
+    return ret
+
+
+def generate_libtorch_matrix(
+    os: str,
+    channel: str,
+    with_cuda: str,
+    with_rocm: str,
+    with_cpu: str,
+    with_xpu: str,
+    limit_pr_builds: bool,
+    use_only_dl_pytorch_org: bool,
+    use_split_build: bool = False,
+    python_versions: Optional[List[str]] = None,
+    abi_versions: Optional[List[str]] = None,
+    arches: Optional[List[str]] = None,
+    libtorch_variants: Optional[List[str]] = None,
+) -> List[Dict[str, str]]:
+    ret: List[Dict[str, str]] = []
+
+    if arches is None:
+        arches = []
+
+        if with_cpu == ENABLE:
+            arches += [CPU]
+
+        if with_cuda == ENABLE and os in (LINUX, WINDOWS):
+            arches += CUDA_ARCHES
+
+        if with_rocm == ENABLE and os == LINUX:
+            arches += ROCM_ARCHES
+
+    if abi_versions is None:
+        if os == WINDOWS:
+            abi_versions = [RELEASE, DEBUG]
+        elif os == LINUX:
+            abi_versions = [PRE_CXX11_ABI, CXX11_ABI]
+        elif os in [MACOS_ARM64]:
+            abi_versions = [CXX11_ABI]
+        else:
+            abi_versions = []
+
+    if libtorch_variants is None:
+        libtorch_variants = [
+            "shared-with-deps",
+        ]
+
+    global LIBTORCH_CONTAINER_IMAGES
+
+    for abi_version in abi_versions:
+        for arch_version in arches:
+            for libtorch_variant in libtorch_variants:
+                # one of the values in the following list must be exactly
+                # CXX11_ABI, but the precise value of the other one doesn't
+                # matter
+                gpu_arch_type = arch_type(arch_version)
+                gpu_arch_version = "" if arch_version == CPU else arch_version
+
+                desired_cuda = translate_desired_cuda(gpu_arch_type, gpu_arch_version)
+                devtoolset = abi_version if os != WINDOWS else ""
+                libtorch_config = abi_version if os == WINDOWS else ""
+                ret.append(
+                    {
+                        "gpu_arch_type": gpu_arch_type,
+                        "gpu_arch_version": gpu_arch_version,
+                        "desired_cuda": desired_cuda,
+                        "libtorch_variant": libtorch_variant,
+                        "libtorch_config": libtorch_config,
+                        "devtoolset": devtoolset,
+                        "container_image": (
+                            LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
+                            if os != WINDOWS
+                            else ""
+                        ),
+                        "package_type": "libtorch",
+                        "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(  # noqa: E501
+                            ".", "_"
+                        ),
+                        # Please note: since libtorch validations are minimal, we use CPU runners
+                        "validation_runner": validation_runner(CPU, os),
+                        "installation": get_libtorch_install_command(
+                            os,
+                            channel,
+                            gpu_arch_type,
+                            libtorch_variant,
+                            devtoolset,
+                            desired_cuda,
+                            libtorch_config,
+                        ),
+                        "channel": channel,
+                        "stable_version": CURRENT_VERSION,
+                    }
+                )
+    return ret
+
+
+def generate_wheels_matrix(
+    os: str,
+    channel: str,
+    with_cuda: str,
+    with_rocm: str,
+    with_cpu: str,
+    with_xpu: str,
+    limit_pr_builds: bool,
+    use_only_dl_pytorch_org: bool,
+    use_split_build: bool = False,
+    python_versions: Optional[List[str]] = None,
+    arches: Optional[List[str]] = None,
+) -> List[Dict[str, str]]:
+    package_type = "wheel"
+
+    if not python_versions:
+        # Define default python version
+        python_versions = list(PYTHON_ARCHES)
+
+        # If the list of python versions is set explicitly by the caller, stick with it instead
+        # of trying to add more versions behind the scenes
+        if channel == NIGHTLY and (os in (LINUX, MACOS_ARM64, LINUX_AARCH64)):
+            python_versions += ["3.13"]
+
+    if os == LINUX:
+        # NOTE: We only build manywheel packages for linux
+        package_type = "manywheel"
+
+    upload_to_base_bucket = "yes"
+    if arches is None:
+        # Define default compute architectures
+        arches = []
+
+        if with_cpu == ENABLE:
+            arches += [CPU]
+
+        if os == LINUX_AARCH64:
+            # Only want the one arch as the CPU type is different and
+            # uses different build/test scripts
+            arches = [CPU_AARCH64, CUDA_AARCH64]
+
+        if with_cuda == ENABLE:
+            upload_to_base_bucket = "no"
+            if os in (LINUX, WINDOWS):
+                arches += CUDA_ARCHES
+
+        if with_rocm == ENABLE and os == LINUX:
+            arches += ROCM_ARCHES
+
+        if with_xpu == ENABLE and os in (LINUX, WINDOWS):
+            arches += [XPU]
+
+    if limit_pr_builds:
+        python_versions = [python_versions[0]]
+
+    global WHEEL_CONTAINER_IMAGES
+
+    ret: List[Dict[str, Any]] = []
+    for python_version in python_versions:
+        for arch_version in arches:
+
+            # TODO: Enable Python 3.13 support for ROCM
+            if arch_version in ROCM_ARCHES and python_version == "3.13":
+                continue
+
+            gpu_arch_type = arch_type(arch_version)
+            gpu_arch_version = (
+                "" if arch_version in [CPU, CPU_AARCH64, XPU] else arch_version
+            )
+
+            desired_cuda = translate_desired_cuda(gpu_arch_type, gpu_arch_version)
+            entry = {
+                "python_version": python_version,
+                "gpu_arch_type": gpu_arch_type,
+                "gpu_arch_version": gpu_arch_version,
+                "desired_cuda": desired_cuda,
+                "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
+                "package_type": package_type,
+                "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
+                    ".", "_"
+                ),
+                "validation_runner": validation_runner(gpu_arch_type, os),
+                "installation": get_wheel_install_command(
+                    os,
+                    channel,
+                    gpu_arch_type,
+                    gpu_arch_version,
+                    desired_cuda,
+                    python_version,
+                    use_only_dl_pytorch_org,
+                ),
+                "channel": channel,
+                "upload_to_base_bucket": upload_to_base_bucket,
+                "stable_version": CURRENT_VERSION,
+                "use_split_build": False,
+            }
+            ret.append(entry)
+            if (
+                use_split_build
+                and (gpu_arch_version in CUDA_ARCHES)
+                and (os == LINUX)
+                and (channel == NIGHTLY)
+            ):
+                entry = entry.copy()
+                entry["build_name"] = (
+                    f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace(
+                        ".", "_"
+                    )
+                )
+                entry["use_split_build"] = True
+                ret.append(entry)
+
+    return ret
+
+
+GENERATING_FUNCTIONS_BY_PACKAGE_TYPE: Dict[str, Callable[..., List[Dict[str, str]]]] = {
+    "wheel": generate_wheels_matrix,
+    "conda": generate_conda_matrix,
+    "libtorch": generate_libtorch_matrix,
+}  # dispatch table indexed by package type in generate_build_matrix
+
+
+def generate_build_matrix(
+    package_type: str,  # one type, a comma separated list, or "all"
+    operating_system: str,
+    channel: str,  # a single channel, or "all"
+    with_cuda: str,
+    with_rocm: str,
+    with_cpu: str,
+    with_xpu: str,
+    limit_pr_builds: str,  # "true"/"false"; converted to bool below
+    use_only_dl_pytorch_org: str,  # "true"/"false"; converted to bool below
+    build_python_only: str,  # compared against ENABLE below
+    use_split_build: str = "false",  # "true"/"false"; converted to bool below
+    python_versions: Optional[List[str]] = None,
+) -> Dict[str, List[Dict[str, str]]]:
+    """Return ``{"include": [...]}`` with the entries of every requested package/channel."""
+    includes = []
+    package_types = package_type.split(",")
+    if len(package_types) == 1:
+        package_types = PACKAGE_TYPES if package_type == "all" else [package_type]
+    # "all" expands to every key of CUDA_ARCHES_DICT
+    channels = CUDA_ARCHES_DICT.keys() if channel == "all" else [channel]
+
+    for channel in channels:  # rebinds the ``channel`` parameter
+        for package in package_types:
+            initialize_globals(channel, build_python_only == ENABLE)
+            includes.extend(
+                GENERATING_FUNCTIONS_BY_PACKAGE_TYPE[package](
+                    operating_system,
+                    channel,
+                    with_cuda,
+                    with_rocm,
+                    with_cpu,
+                    with_xpu,
+                    limit_pr_builds == "true",
+                    use_only_dl_pytorch_org == "true",
+                    use_split_build == "true",
+                    python_versions,
+                )
+            )
+
+    return {"include": includes}
+
+
+def main(args: List[str]) -> None:
+    """Parse options (flags, defaulting to env vars) and print the matrix as JSON."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--package-type",
+        help="Package type to lookup for, also supports comma separated values",
+        type=str,
+        default=os.getenv("PACKAGE_TYPE", "wheel"),
+    )
+    parser.add_argument(
+        "--operating-system",
+        help="Operating system to generate for",
+        type=str,
+        default=os.getenv("OS", LINUX),
+    )
+    parser.add_argument(
+        "--channel",
+        help="Channel to use, default nightly",
+        type=str,
+        choices=["nightly", "test", "release", "all"],
+        default=os.getenv("CHANNEL", "nightly"),
+    )
+    parser.add_argument(
+        "--with-cuda",
+        help="Build with Cuda?",
+        type=str,
+        choices=[ENABLE, DISABLE],
+        default=os.getenv("WITH_CUDA", ENABLE),
+    )
+    parser.add_argument(
+        "--with-rocm",
+        help="Build with Rocm?",
+        type=str,
+        choices=[ENABLE, DISABLE],
+        default=os.getenv("WITH_ROCM", ENABLE),
+    )
+    parser.add_argument(
+        "--with-cpu",
+        help="Build with CPU?",
+        type=str,
+        choices=[ENABLE, DISABLE],
+        default=os.getenv("WITH_CPU", ENABLE),
+    )
+    parser.add_argument(
+        "--with-xpu",
+        help="Build with XPU?",
+        type=str,
+        choices=[ENABLE, DISABLE],
+        default=os.getenv("WITH_XPU", ENABLE),
+    )
+    # By default this is false for this script but expectation is that the caller
+    # workflow will default this to be true most of the time, where a pull
+    # request is synchronized and does not contain the label "ciflow/binaries/all"
+    parser.add_argument(
+        "--limit-pr-builds",
+        help="Limit PR builds to single python/cuda config",
+        type=str,
+        choices=["true", "false"],
+        default=os.getenv("LIMIT_PR_BUILDS", "false"),
+    )
+    # This is used when testing release builds to test release binaries
+    # only from download.pytorch.org. When PyPI binaries are not released yet.
+    parser.add_argument(
+        "--use-only-dl-pytorch-org",
+        help="Use only download.pytorch.org when gen wheel install command?",
+        type=str,
+        choices=["true", "false"],
+        default=os.getenv("USE_ONLY_DL_PYTORCH_ORG", "false"),
+    )
+    # Generates a matrix with a single python version, the oldest one that we
+    # support, for building python only packages,
+    # i.e. packages that look similar to torchtune-0.0.1-py3-none-any.whl
+    parser.add_argument(
+        "--build-python-only",
+        help="Build python only",
+        type=str,
+        choices=[ENABLE, DISABLE],
+        default=os.getenv("BUILD_PYTHON_ONLY", ENABLE),
+    )
+    parser.add_argument(
+        "--use-split-build",
+        help="Use split build for wheel",
+        type=str,
+        choices=["true", "false"],
+        default=os.getenv("USE_SPLIT_BUILD", "false"),
+    )
+    parser.add_argument(
+        "--python-versions",
+        help="Only build the select JSON-encoded list of python versions",
+        type=str,
+        default=os.getenv("PYTHON_VERSIONS", "[]"),
+    )
+
+    options = parser.parse_args(args)
+    try:
+        python_versions = json.loads(options.python_versions)
+    except json.JSONDecodeError:
+        python_versions = None  # malformed JSON: treat as "not specified"
+
+    # The flags are non-empty strings (always truthy): compare against ENABLE instead
+    if ENABLE not in (
+        options.with_cuda, options.with_rocm, options.with_xpu, options.with_cpu
+    ):
+        parser.error("Must build with either CUDA, ROCM, XPU, or CPU support.")
+
+    build_matrix = generate_build_matrix(
+        options.package_type,
+        options.operating_system,
+        options.channel,
+        options.with_cuda,
+        options.with_rocm,
+        options.with_cpu,
+        options.with_xpu,
+        options.limit_pr_builds,
+        options.use_only_dl_pytorch_org,
+        options.build_python_only,
+        options.use_split_build,
+        python_versions,
+    )
+
+    print(json.dumps(build_matrix))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])  # forward the CLI arguments without the program name
diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml
index a660fc4ef2..72d7e21b5c 100644
--- a/.github/workflows/build-test-linux.yml
+++ b/.github/workflows/build-test-linux.yml
@@ -15,7 +15,7 @@ on:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     with:
       package-type: wheel
       os: linux
diff --git a/.github/workflows/build-test-tensorrt-linux.yml b/.github/workflows/build-test-tensorrt-linux.yml
index 3f4abb9add..cfad7274dc 100644
--- a/.github/workflows/build-test-tensorrt-linux.yml
+++ b/.github/workflows/build-test-tensorrt-linux.yml
@@ -10,7 +10,7 @@ permissions:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     with:
       package-type: wheel
       os: linux
diff --git a/.github/workflows/build-test-tensorrt-windows.yml b/.github/workflows/build-test-tensorrt-windows.yml
index b6eb1d765c..d2be9febd7 100644
--- a/.github/workflows/build-test-tensorrt-windows.yml
+++ b/.github/workflows/build-test-tensorrt-windows.yml
@@ -10,7 +10,7 @@ permissions:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     with:
       package-type: wheel
       os: windows
diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml
index 0201ab5ff2..c2b05d8994 100644
--- a/.github/workflows/build-test-windows.yml
+++ b/.github/workflows/build-test-windows.yml
@@ -15,7 +15,7 @@ on:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     with:
       package-type: wheel
       os: windows
diff --git a/.github/workflows/generate_binary_build_matrix.yml b/.github/workflows/generate_binary_build_matrix.yml
new file mode 100644
index 0000000000..11281ece94
--- /dev/null
+++ b/.github/workflows/generate_binary_build_matrix.yml
@@ -0,0 +1,112 @@
+name: Generates the binary build matrix
+
+on:
+  workflow_call:
+    inputs:
+      package-type:
+        description: "Package type to build from (wheel, conda, libtorch)"
+        default: "wheel"
+        type: string
+      os:
+        description: "Operating system to generate for (linux, windows, macos, macos-arm64)"
+        default: "linux"
+        type: string
+      channel:
+        description: "Channel to use (nightly, test, release, all)"
+        default: ""
+        type: string
+      test-infra-repository:
+        description: "Test infra repository to use"
+        default: "pytorch/test-infra"
+        type: string
+      test-infra-ref:
+        description: "Test infra reference to use"
+        default: "main"
+        type: string
+      with-cuda:
+        description: "Build with Cuda?"
+        default: "enable"
+        type: string
+      with-rocm:
+        description: "Build with Rocm?"
+        default: "enable"
+        type: string
+      with-cpu:
+        description: "Build with CPU?"
+        default: "enable"
+        type: string
+      with-xpu:
+        description: "Build with XPU?"
+        default: "disable"
+        type: string
+      use-only-dl-pytorch-org:
+        description: "Use only download.pytorch.org when generating wheel install command?"
+        default: "false"
+        type: string
+      build-python-only:
+        description: "Generate binary build matrix for a python only package (i.e. only one python version)"
+        default: "disable"
+        type: string
+      python-versions:
+        description: "A JSON-encoded list of python versions to build. An empty list means building all supported versions"
+        default: "[]"
+        type: string
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch only wheel and build pytorch such that
+          its wheels are built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
+
+    outputs:
+      matrix:
+        description: "Generated build matrix"
+        value: ${{ jobs.generate.outputs.matrix }}
+
+jobs:
+  generate:
+    outputs:
+      matrix: ${{ steps.generate.outputs.matrix }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Checkout test-infra repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ inputs.test-infra-repository }}
+          ref: ${{ inputs.test-infra-ref }}
+      - uses: ./.github/actions/set-channel
+      - uses: actions/checkout@v4
+        with:
+          repository: pytorch/tensorrt
+      - name: Generate test matrix
+        id: generate
+        env:
+          PACKAGE_TYPE: ${{ inputs.package-type }}
+          OS: ${{ inputs.os }}
+          CHANNEL: ${{ inputs.channel != '' && inputs.channel || env.CHANNEL }}
+          WITH_CUDA: ${{ inputs.with-cuda }}
+          WITH_ROCM: ${{ inputs.with-rocm }}
+          WITH_CPU: ${{ inputs.with-cpu }}
+          WITH_XPU: ${{ inputs.with-xpu }}
+          # limit pull request builds to one version of python unless ciflow/binaries/all is applied to the workflow
+          # should not affect builds that are from events that are not the pull_request event
+          LIMIT_PR_BUILDS: ${{ github.event_name == 'pull_request' && !contains( github.event.pull_request.labels.*.name, 'ciflow/binaries/all') }}
+          # This is used when testing release binaries only from download.pytorch.org.
+          # In cases when PyPI binaries are not published yet.
+          USE_ONLY_DL_PYTORCH_ORG: ${{ inputs.use-only-dl-pytorch-org }}
+          BUILD_PYTHON_ONLY: ${{ inputs.build-python-only }}
+          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
+          PYTHON_VERSIONS: ${{ inputs.python-versions }}
+        run: |
+          set -eou pipefail
+          MATRIX_BLOB="$(python3 .github/scripts/generate_binary_build_matrix.py)"
+          echo "${MATRIX_BLOB}"
+          echo "matrix=${MATRIX_BLOB}" >> "${GITHUB_OUTPUT}"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.package-type }}-${{ inputs.os }}-${{ inputs.test-infra-repository }}-${{ inputs.test-infra-ref }}
+  cancel-in-progress: true
\ No newline at end of file
diff --git a/.github/workflows/release-linux.yml b/.github/workflows/release-linux.yml
index 53ed569725..ca13b37443 100644
--- a/.github/workflows/release-linux.yml
+++ b/.github/workflows/release-linux.yml
@@ -15,7 +15,7 @@ permissions:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     if: ${{ contains(github.event.pull_request.labels.*.name, 'build-release-artifacts') || startsWith(github.event.ref, 'refs/tags/v') }}
     with:
       package-type: wheel
diff --git a/.github/workflows/release-windows.yml b/.github/workflows/release-windows.yml
index e9d393f544..271547cec3 100644
--- a/.github/workflows/release-windows.yml
+++ b/.github/workflows/release-windows.yml
@@ -15,7 +15,7 @@ permissions:
 
 jobs:
   generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    uses: ./.github/workflows/generate_binary_build_matrix.yml
     if: ${{ contains(github.event.pull_request.labels.*.name, 'build-release-artifacts') || startsWith(github.event.ref, 'refs/tags/v') }}
     with:
       package-type: wheel

From 5f1d12f9d47b253bc23e9122dfa7c4eed3456bd4 Mon Sep 17 00:00:00 2001
From: Naren Dasan <naren@narendasan.com>
Date: Fri, 6 Dec 2024 10:06:59 -0800
Subject: [PATCH 11/11] chore: updating conf to ignore rendering triton example

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
---
 docsrc/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docsrc/conf.py b/docsrc/conf.py
index daa1a30100..ffe341c722 100644
--- a/docsrc/conf.py
+++ b/docsrc/conf.py
@@ -93,7 +93,7 @@
 sphinx_gallery_conf = {
     "examples_dirs": "../examples",
     "gallery_dirs": "tutorials/_rendered_examples/",
-    "ignore_pattern": "utils.py",
+    "ignore_pattern": r"(triton/\w*.py)|(utils.py)",
 }
 
 # Setup the breathe extension