From 726322665e1e7c138d2beb35dab22794fcea1a80 Mon Sep 17 00:00:00 2001 From: Harry Kim Date: Fri, 15 Nov 2024 10:14:39 -0800 Subject: [PATCH] Delete Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models directory Suggest removing this guide and use EKS multinode guide as a default. This article is causing customer confusion. --- .../.gitignore | 5 - .../README.md | 746 ------------------ .../chart/.gitignore | 1 - .../chart/Chart.yaml | 20 - .../chart/gpt2_values.yaml | 18 - .../chart/llama-2-70b_values.yaml | 26 - .../chart/llama-2-7b-chat_values.yaml | 26 - .../chart/llama-2-7b_values.yaml | 26 - .../chart/llama-3-70b-instruct_values.yaml | 26 - .../chart/llama-3-8b-instruct_values.yaml | 26 - .../chart/llama-3-8b_values.yaml | 26 - .../chart/opt125m_values.yaml | 20 - .../chart/templates/NOTES.txt | 48 -- .../chart/templates/deployment.yaml | 358 --------- .../chart/templates/job.yaml | 227 ------ .../chart/templates/pod-monitor.yaml | 35 - .../chart/templates/rbac.yaml | 84 -- .../chart/templates/service.yaml | 52 -- .../chart/values.schema.json | 324 -------- .../chart/values.yaml | 126 --- .../containers/README.md | 26 - .../containers/kubessh | 19 - .../containers/server.py | 611 -------------- .../containers/triton_trt-llm.containerfile | 86 -- .../nvidia_dcgm-exporter_values.yaml | 107 --- ...vidia_gpu-feature-discovery_daemonset.yaml | 87 -- .../pvc.yaml | 33 - 27 files changed, 3189 deletions(-) delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json delete mode 100644 
Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md delete mode 100755 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml delete mode 100644 Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore deleted file mode 100644 index 462fe9f8..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.vscode/ -**/.vscode/ - -dev_* -**/dev_* diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md deleted file mode 100644 index 7846a7b8..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md +++ /dev/null @@ -1,746 +0,0 @@ - - -# Multi-Node Generative AI w/ Triton Server and TensorRT-LLM - -It almost goes without saying that large language models (LLM) are large. -LLMs often are too large to fit in the memory of a single GPU. -Therefore we need a solution which enables multiple GPUs to cooperate to enable inference serving for this very large models. - -This guide aims to explain how to perform multi-GPU, multi-node deployment of large language models using Triton Server and -TRT-LLM in a Kubernetes cluster. -Setting up multi-node LLM support using Triton Inference Server, TensorRT-LLM, and Kubernetes is not difficult, but it does -require preparation. - -We'll cover the following topics: - -* [Cluster Setup](#cluster-setup) - * [Persistent Volume Setup](#persistent-volume-setup) - * [Core Cluster Services](#core-cluster-services) - * [Kubernetes Node Feature Discovery service](#kubernetes-node-feature-discovery-service) - * [NVIDIA Device Plugin for Kubernetes](#nvidia-device-plugin-for-kubernetes) - * [NVIDIA GPU Feature Discovery service](#nvidia-gpu-feature-discovery-service) - * [Hugging Face Authorization](#hugging-face-authorization) -* [Triton Preparation](#triton-preparation) - * [Model Preparation Script](#model-preparation-script) - * [Custom Container Image](#custom-container-image) - * [Kubernetes Pull Secrets](#kubernetes-pull-secrets) -* [Triton Deployment](#triton-deployment) - * [How It Works](#how-it-works) - * [Potential Improvements](#potential-improvements) - * [Autoscaling and Gang Scheduling](#autoscaling-and-gang-scheduling) - * [Network Topology Aware Scheduling](#network-topology-aware-scheduling) -* [Developing this Guide](#developing-this-guide) - -Prior to beginning this guide/tutorial you will need a couple of things. 
- -* Kubernetes Control CLI (`kubectl`) - [ [documentation](https://kubernetes.io/docs/reference/kubectl/introduction/) - | [download](https://kubernetes.io/releases/download/) ] -* Helm CLI (`helm`) - [ [documentation](https://helm.sh/) - | [download](https://helm.sh/docs/intro/install) ] -* Docker CLI (`docker`) - [ [documentation](https://docs.docker.com/) - | [download](https://docs.docker.com/get-docker/) ] -* Decent text editing software for editing YAML files. -* Kubernetes cluster. -* Fully configured `kubectl` with administrator permissions to the cluster. - - - -## Cluster Setup - -The following instructions are for setting up a Kubernetes cluster for the deployment of LLMs using Triton Server and TRT-LLM. - - -### Prerequisites - -This guide assumes that all nodes with NVIDIA GPUs have the following: -- A node label of `nvidia.com/gpu=present` to more easily identify nodes with NVIDIA GPUs. -- A node taint of `nvidia.com/gpu=present:NoSchedule` to prevent non-GPU pods from being deployed to GPU nodes. - -> [!Tip] -> When using a Kubernetes provider like AKS, EKS, or GKE, it is usually best to use their interface when configuring nodes -> instead of using `kubectl` to do it directly. - - -### Persistent Volume Setup - -To enable multiple pods deployed to multiple nodes to load shards of the same model so that they can be used in coordination to -serve inference requests too large to be loaded by a single GPU, we'll need a common, shared storage location. -In Kubernetes, these common, shared storage locations are referred to as persistent volumes. -Persistent volumes can be volume mapped into any number of pods and then accessed by processes running inside of said pods -as if they were part of the pod's file system. - -Additionally, we will need to create a persistent-volume claim which can be used to assign the persistent volume to a pod. - -Unfortunately, the creation of a persistent volume will depend on how your cluster is set up, and is outside the scope of this -tutorial. -That said, we will provide a basic overview of the process. - -#### Create a Persistent Volume - -If your cluster is hosted by a cloud service provider (CSP) like Amazon (EKS), Azure (AKS), or gCloud (GKE), -step-by-step instructions are available online for how to set up a persistent volume for your cluster. -Otherwise, you will need to work with your cluster administrator or find a separate guide online on how to set up a -persistent volume for your cluster. - -The following resources can assist with setting up persistent volumes for your cluster. - -* [Kubernetes Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) -* [AKS Persistent Volumes](https://learn.microsoft.com/en-us/azure/aks/azure-csi-disk-storage-provision) -* [EKS Persistent Volumes](https://aws.amazon.com/blogs/storage/persistent-storage-for-kubernetes/) -* [GKE Persistent Volumes](https://cloud.google.com/kubernetes-engine/docs/concepts/persistent-volumes) -* [OKE Persistent Volumes](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingpersistentvolumeclaim.htm) - -> [!Important] -> It is important to consider the storage requirements of the models you expect your cluster to host, and be sure to -> sufficiently size the persistent volume for the combined storage size of all models. - -Below are some example values gathered from internal testing of this tutorial. 
- -| Model | Parallelism | Raw Size | Converted Size | Total Size | -| :-------------- | ----------: | -------: | -------------: | ---------: | -| **Llama-3-8B** | 2 | 15Gi | 32Gi | 47Gi | -| **Llama-3-8B** | 4 | 15Gi | 36Gi | 51Gi | -| **Llama-3-70B** | 8 | 90Gi | 300Gi | 390Gi | - -#### Create a Persistent-Volume Claim - -In order to connect the Triton Server pods to the persistent volume created above, we need to create a persistent-volume -claim (PVC). You can use the [pvc.yaml](./pvc.yaml) file provided as part of this tutorial to create one. - -> [!Important] -> The `volumeName` property must match the `metadata.name` property of the persistent volume created above. - - -### Core Cluster Services - -Once all nodes are correctly labeled and tainted, use the following steps to prepare the cluster for deploying large language -models across multiple nodes with Triton Server. - -The following series of steps is intended to prepare a fresh cluster. -For clusters in varying states, it is best to coordinate with your cluster administrator before installing new services and -capabilities. - -#### Kubernetes Node Feature Discovery service - -1. Add the Kubernetes Node Feature Discovery chart repository to the local cache. - - ```bash - helm repo add kube-nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts \ - && helm repo update - ``` - -2. Run the command below to install the service. - - ```bash - helm install -n kube-system node-feature-discovery kube-nfd/node-feature-discovery \ - --set nameOverride=node-feature-discovery \ - --set worker.tolerations[0].key=nvidia.com/gpu \ - --set worker.tolerations[0].operator=Exists \ - --set worker.tolerations[0].effect=NoSchedule - ``` - - > [!Note] - > The above command sets toleration values which allow for the deployment of a pod onto a node with - > a matching taint. - > See this document's [prerequisites](#prerequisites) for the taints this document expects to have been applied to GPU - > nodes in the cluster. - -#### NVIDIA Device Plugin for Kubernetes - -1. This step is unnecessary if the Device Plugin has already been installed in your cluster. - Cloud provider turnkey Kubernetes clusters, such as those from AKS, EKS, and GKE, often have the Device Plugin installed - automatically once a GPU node has been added to the cluster. - - To check if your cluster requires the NVIDIA Device Plugin for Kubernetes, run the following command and inspect - the output for `nvidia-device-plugin-daemonset`. - - ```bash - kubectl get daemonsets --all-namespaces - ``` - - Example output: - ```text - NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE - kube-system kube-proxy 6 6 6 6 6 - ``` - -2. If `nvidia-device-plugin-daemonset` is not listed, run the command below to install the plugin. - Once installed, it will provide containers access to the GPUs in your cluster. - - For additional information, see - [Github/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin/blob/main/README.md). - - ```bash - kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml - ``` - -#### NVIDIA GPU Feature Discovery Service - -1. This step is unnecessary if the service has already been installed in your cluster. - - To check if your cluster requires the NVIDIA GPU Feature Discovery service, run the following command and inspect - the output for `gpu-feature-discovery`. 
- - ```bash - kubectl get daemonsets --all-namespaces - ``` - - Example output: - ```text - NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE - kube-system kube-proxy 6 6 6 6 6 - kube-system nvidia-device-plugin-daemonset 6 6 6 6 6 - ``` - -2. If `gpu-feature-discovery` is listed, skip this step and the next. - - Otherwise, use the YAML file below to install the GPU Feature Discovery service. - - > [nvidia_gpu-feature-discovery_daemonset.yaml](nvidia_gpu-feature-discovery_daemonset.yaml) - - The file above was created by downloading its contents from - [GitHub/NVIDIA](https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml) - and modified specifically for this tutorial. - - ```bash - curl https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml \ - > nvidia_gpu-feature-discovery_daemonset.yaml - ``` - -3. Then run the command below to install the GPU Feature Discovery service. - - ```bash - kubectl apply -f ./nvidia_gpu-feature-discovery_daemonset.yaml - ``` - - -### Hugging Face Authorization - -In order to download models from Hugging Face, your pods will require an access token with the appropriate permission to -download models from their servers. - -1. If you do not already have a Hugging Face access token, you will need to create one. - To create a Hugging Face access token, - [follow their guide](https://huggingface.co/docs/hub/en/security-tokens). - -2. Once you have a token, use the command below to persist the token as a secret named `hf-model-pull` in your cluster. - - ```bash - kubectl create secret generic hf-model-pull '--from-literal=password=<your access token>' - ``` - -3. To verify that your secret has been created, use the following command and inspect the output for your secret. - - ```bash - kubectl get secrets - ``` - - - -## Triton Preparation - - -### Model Preparation Script - -The intention of this script is to handle the acquisition of the model file from Hugging Face, the generation of the TensorRT -engine and plan files, and the caching of said generated files. -The script depends on the fact that the Kubernetes deployment scripts we'll be using rely on the persistent volume backing the -persistent-volume claim provided as part of the Helm chart. - -Specifically, the model and engine directories will be mapped to folders in the persistent volume and remapped to all subsequent -pods deployed as part of the Helm chart. -This enables the generation script to detect that the plan and engine generation steps have been completed and not repeat work. - -> [!Tip] -> This script will be executed as a job every time the Helm chart is installed unless the `.model.skipConversion` property is -> set to `true`. - -When Triton Server is started, the same persistent volume folders will be mounted to its container and Triton will use the -pre-generated model plan and engine files. -Not only does this enable pods on separate nodes to share the same model engine and plan files, it drastically reduces the time -required for subsequent pod starts on the same node. - -> [!Note] -> You can look at the code used to acquire and convert the models in [containers/server.py](containers/server.py). -> This file is copied into the server container image (see below) during its creation and then executed when the conversion -> job pod is deployed. - -#### Custom Container Image - -1. Using the file below, we'll create a custom container image in the next step. 
- - > [triton_trt-llm.containerfile](containers/triton_trt-llm.containerfile) - -2. Run the following command to create a custom Triton Inference Server w/ all necessary tools to generate TensorRT-LLM - plan and engine files. In this example we'll use the tag `24.04` to match the date portion of `24.04-trtllm-python-py3` - from the base image. - - ```bash - docker build \ - --file ./triton_trt-llm.containerfile \ - --rm \ - --tag triton_trt-llm:24.04 \ - . - ``` - - ##### Custom Version of Triton CLI - - This custom Triton Server container image makes use of a custom version of the Triton CLI. - The relevant changes have been made available as a - [topic branch](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn) in the Triton CLI repository on - GitHub. - The changes in the branch can be - [inspected](https://github.com/triton-inference-server/triton_cli/compare/main...jwyman/aslb-mn) using the GitHub - interface, and primarily contain the addition of the ability to specify tensor parallelism when optimizing models for - TensorRT-LLM and enable support for additional models. - -3. Upload the Container Image to a Cluster Visible Repository. - - In order for your Kubernetes cluster to be able to download out new container image, it will need to be pushed to a - container image repository that nodes in your cluster can reach. - In this example, we'll use the fictional `nvcr.io/example` repository for demonstration purposes. - You will need to determine which repositories you have write access to that your cluster can also access. - - 1. First, re-tag the container image with the repository's name like below. - - ```bash - docker tag \ - triton_trt-llm:24.04 \ - nvcr.io/example/triton_trt-llm:24.04 - ``` - - 2. Next, upload the container image to your repository. - - ```bash - docker push nvcr.io/example/triton_trt-llm:24.04 - ``` - -#### Kubernetes Pull Secrets - -If your container image repository requires credentials to download images from, then you will need to create a Kubernetes -docker-registry secret. -We'll be using the `nvcr.io` container image repository example above for demonstration purposes. -Be sure to properly escape any special characters such as `$` in the password or username values. - -1. Use the command below to create the necessary secret. Secrets for your repository should be similar, but not be identical -to the example below. - - ```bash - kubectl create secret docker-registry ngc-container-pull \ - --docker-password='dGhpcyBpcyBub3QgYSByZWFsIHNlY3JldC4gaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4=' \ - --docker-server='nvcr.io' \ - --docker-username='\$oauthtoken' - ``` - -2. The above command will create a secret in your cluster named `ngc-container-pull`. - You can verify that the secret was created correctly using the following command and inspecting its output for the secret - you're looking for. - - ```bash - kubectl get secrets - ``` - -3. Ensure the contents of the secret are correct, you can run the following command. - - ```bash - kubectl get secret/ngc-container-pull -o yaml - ``` - - You should see an output similar to the following. 
- - ```yaml - apiVersion: v1 - data: - .dockerconfigjson: eyJhdXRocyI6eyJudmNyLmlvIjp7InVzZXJuYW1lIjoiJG9hdXRodG9rZW4iLCJwYXNzd29yZCI6IlZHaHBjeUJwY3lCdWIzUWdZU0J5WldGc0lITmxZM0psZEN3Z2FYUWdhWE1nYjI1c2VTQm1iM0lnWkdWdGIyNXpkSEpoZEdsdmJpQndkWEp3YjNObGN5ND0iLCJhdXRoIjoiSkc5aGRYUm9kRzlyWlc0NlZrZG9jR041UW5CamVVSjFZak5SWjFsVFFubGFWMFp6U1VoT2JGa3pTbXhrUTNkbllWaFJaMkZZVFdkaU1qVnpaVk5DYldJelNXZGFSMVowWWpJMWVtUklTbWhrUjJ4MlltbENkMlJZU25kaU0wNXNZM2swWjFWSGVHeFpXRTVzU1VjMWJHUnRWbmxKU0ZaNldsTkNRMWxZVG14T2FsRm5aRWM0WjJGSGJHdGFVMEo1V2xkR2MwbElUbXhaTTBwc1pFaE5hQT09In19fQ== - kind: Secret - metadata: - name: ngc-container-pull - namespace: default - type: kubernetes.io/dockerconfigjson - ``` - - The value of `.dockerconfigjson` is a base-64 encoded string which can be decoded into the following. - - ```json - { - "auths": { - "nvcr.io": { - "username":"$oauthtoken", - "password":"VGhpcyBpcyBub3QgYSByZWFsIHNlY3JldCwgaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4gUGxlYXNlIG5ldmVyIHVzZSBCYXNlNjQgdG8gaGlkZSByZWFsIHNlY3JldHMh", - "auth":"JG9hdXRodG9rZW46VkdocGN5QnBjeUJ1YjNRZ1lTQnlaV0ZzSUhObFkzSmxkQ3dnYVhRZ2FYTWdiMjVzZVNCbWIzSWdaR1Z0YjI1emRISmhkR2x2YmlCd2RYSndiM05sY3k0Z1VHeGxZWE5sSUc1bGRtVnlJSFZ6WlNCQ1lYTmxOalFnZEc4Z2FHbGtaU0J5WldGc0lITmxZM0psZEhNaA==" - } - } - } - ``` - - You can use this compact command line to get the above output with a single command. - - ```bash - kubectl get secret/ngc-container-pull -o json | jq -r '.data[".dockerconfigjson"]' | base64 -d | jq - ``` - - > [!Note] - > The values of `password` and `auth` are also base-64 encoded string. - > We recommend inspecting the values of the following values: - > - > * Value of `.auths['nvcr.io'].username`. - > * Base64 decoded value of `.auths['nvcr.io'].password`. - > * Base64 decoded value of `.auths['nvcr.io'].auths`. - - - -## Triton Deployment - -> [!Note] -> Deploying Triton Server with a model that fits on a single GPU is straightforward but not explained by this guide. -> For instructions and examples of deploying a model using a single GPU or multiple GPUs on a single node, use the -> [Autoscaling and Load Balancing Generative AI w/ Triton Server and TensorRT-LLM Guide](../Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md) instead. - -Given the memory requirements of some AI models it is not possible to host them using a single device. -Triton and TensorRT-LLM provide a mechanism to enable a large model to be hosted by multiple GPU devices working in concert. -The provided sample Helm [chart](./chart/) provides a mechanism for taking advantage of this capability. - -To enable this feature, adjust the `model.tensorrtLlm.parallelism.tensor` value to an integer greater than 1. -Configuring a model to use tensor parallelism enables the TensorRT-LLM runtime to effectively combine the memory of multiple -GPUs to host a model too large to fit on a single GPU. - -Similarly, changing the value of `model.tensorrtLlm.parallelism.pipeline` will enable pipeline parallelism. -Pipeline parallelism is used to combine the compute capacity of multiple GPUs to process inference requests in parallel. - -> [!Important] -> The product of the values of `.tensor` and `.pipeline` should be a power of 2 greater than `0` and less than or equal to -> `32`. - -The number of GPUs required to host the model is equal to product of the values of `.tensor` and `.pipeline`. -When the model is deployed, one pod per GPU required will be created. 
-The Helm chart will create a leader pod and one or more worker pods, depending on the number of additional pods required to -host the model. -Additionally, a model conversion job will be created to download the model from Hugging Face and then convert the downloaded -model into TRT-LLM engine and plan files. -To disable the creation of a conversion job by the Helm chart, set the values file's `model.skipConversion` property to -`true`. - -> [!Warning] -> If your cluster has insufficient resources to create the conversion job, the leader pod, and the required worker pods, -> and the job pod is not scheduled to execute first, it is possible for the example Helm chart to become "hung" due to the -> leader pod waiting on the job pod's completion and there being insufficient resources to schedule the job pod. -> -> If this occurs, it is best to delete the Helm installation and retry until the job pod is successfully scheduled. -> Once the job pod completes, it will release its resources and make them available for the other pods to start. - -### Deploying the Model - -Deploying Triton Server with the provided Helm chart is straightforward using the steps below. - -1. Create a custom values file with required values: - - * Container image name. - * Model name. - * Supported / available GPU. - * Image pull secrets (if necessary). - * Hugging Face secret name. - - The provided sample Helm [chart](./chart/) includes several example values files such as - [llama-3-8b-instruct_values.yaml](chart/llama-3-8b-instruct_values.yaml). - -2. Deploy LLM on Triton + TRT-LLM. - - Apply the custom values file to override the exported base values file using the command below, and create the Triton - Server Kubernetes deployment. - - > [!Tip] - > The order in which the values files are specified on the command line is important: values are applied and - > override existing values in the order they are specified. - - ```bash - helm install <installation name> \ - --values ./chart/values.yaml \ - --values ./chart/<custom values file>.yaml \ - --set 'triton.image.name=<custom image name>' \ - ./chart/. - ``` - - > [!Important] - > Be sure to substitute the correct values for `<installation name>`, `<custom values file>`, and `<custom image name>` in the example above. - -3. Verify the Chart Installation. - - Use the following commands to inspect the installed chart and to determine if everything is working as intended. - - ```bash - kubectl get deployments,pods,services,jobs --selector='app=<installation name>' - ``` - - > [!Important] - > Be sure to substitute the correct value for `<installation name>` in the example above. - - You should see output similar to below (assuming the installation name of "llama-3"): - - ```text - NAME READY UP-TO-DATE AVAILABLE - deployment.apps/llama-3 0/1 1 0 - - NAME READY STATUS RESTARTS - pod/llama-3-7989ffd8d-ck62t 0/1 Pending 0 - - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) - service/llama-3 ClusterIP 10.100.23.237 8000/TCP,8001/TCP,8002/TCP - ``` - -4. Uninstalling the Chart - - Uninstalling a Helm chart is as straightforward as running the command below. - This is useful when experimenting with various options and configurations. - - ```bash - helm uninstall <installation name> - ``` - -### How It Works - -The Helm chart creates a model-conversion job and multiple Kubernetes deployments to support the distributed model's tensor parallelism needs. -When a distributed model is deployed, a "leader" pod and the number of "worker" pods needed to meet the model's tensor parallelism requirements are -created. -The leader pod then waits for the conversion job to complete and for all worker pods to be successfully deployed. 
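For illustration, assuming the installation name `llama-3` used in the example output above, the leader and worker pods created by the chart can be observed with a command like the one below. The `app` and `pod-rank` labels are applied by the chart's deployment template, and the pod with rank `0` is the leader.

```bash
# List the release's pods and show which rank each one holds (rank 0 is the leader).
kubectl get pods --selector='app=llama-3' --label-columns=pod-rank
```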
- -The model-conversion job is responsible for downloading the configured model from Hugging Face and converting that model into a TensorRT-LLM -ready set of engine and plan files. -The model-conversion job will place all downloaded and converted files on the provided persistent volume. - -> [!Note] -> Model downloads from Hugging Face are reused when possible. -> Converted TRT-LLM models are GPU and tensor-parallelism specific. -> Therefore a converted model will exist for every GPU the model is deployed on to as well as for every configuration of tensor parallelism. - -Once these conditions are met, the leader pod creates an [`mpirun`](https://docs.open-mpi.org/en/v5.0.x/man-openmpi/man1/mpirun.1.html) process which creates a Triton Server process in each pod of the distributed model. - -The leader pod's process is responsible for handling inference request and response functionality, as well as inference request tokenization and -result de-tokenization. -Worker pods' processes provide expanded GPU compute and memory capacity. -All of the processes are coordinated by the original `mpirun` process. -Communications between the processes is accelerated by [NVIDIA Collective Communications Library](https://developer.nvidia.com/nccl) (NCCL). -NCCL enables GPU-to-GPU direct communication and avoids the wasteful data copying from GPU-to-CPU-to-GPU that occur otherwise. - - -### Potential Improvements - -#### Autoscaling and Gang Scheduling - -This guide does not provide any solution for autoscaling or load balancing Triton deployments because Kubernetes horizontal pod -autoscaling (HPA) is not capable of managing deployments composed of multiple pods. -Additionally, because the solution provided in this tutorial makes use of multiple deployments, any automation has a high risk of concurrent, -partial deployments exhausting available resources preventing any of the deployments from succeeding. - -For an example of concurrent, partial deployments preventing each other from successfully deploying, imagine a cluster with 4 nodes, each with 8 GPUs for a total of 32 available GPUs. -Now consider a model which requires 8 GPUs to be deployed and we attempt to deploy 5 copies of it. -When individually deploying the models, each deployment is assigned 8 GPUs until there are zero available GPUs remaining resulting in the model -being successfully deployed 4 times. -At this point, the system understands that there are no more available resources and the 5 model copy fails to deploy. - -However, when attempting to deploy all 5 copies of the model simultaneously, it is highly likely that each copy will get at least 1 GPU resource -assigned to it. -This results in their insufficient resources for at least two of the copies; leaving both deployments stuck in a non-functional, partially -deployed state. - -One solution to this problem would be to leverage a gang scheduler for Kubernetes. -Gang scheduling would enable the Kubernetes scheduler to only create a pod if its entire cohort of pods can be created. -This provides a solution to the partial deployment of model pods blocking each other from being fully deployed. - -> [!Note] -> Read about [gang scheduling on Wikipedia](https://en.wikipedia.org/wiki/Gang_scheduling) for additional information. - -The above solutions, however, does not provide any kind of autoscaling solution. -To achieve this, a custom, gang-schedular-aware autoscaler would be required. 
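As a rough sketch of what gang scheduling could look like for this chart, a scheduler such as Volcano groups the leader and worker pods into a single `PodGroup` so that none of the pods are bound to nodes unless all of them can be. The group name and member count below are illustrative and assume a model that requires 8 GPUs (1 leader plus 7 workers); the provided chart does not currently create such a resource.

```yaml
# Hypothetical Volcano PodGroup: all 8 pods must be schedulable before any is scheduled.
apiVersion: scheduling.volcano.sh/v1beta1
kind: PodGroup
metadata:
  name: llama-3-70b-instruct
spec:
  minMember: 8
```

Each pod template would then need to opt in by setting `schedulerName: volcano` and the `scheduling.k8s.io/group-name` annotation, which would require extending the chart's deployment templates.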
- -#### Network Topology Aware Scheduling - -Triton Server w/ TensorRT-LLM leverage a highly-optimized networking stacked known as the -[NVIDIA Collective Communications Library](https://developer.nvidia.com/nccl) (NCCL) to enable tensor parallelization. -NCCL takes advantage of he ability for modern GPUs to leverage -[remote direct memory access](https://en.wikipedia.org/wiki/Remote_direct_memory_access) (RDMA) based network acceleration to optimize operations -between GPUs regardless if they're on the same or nearby machines. -This means that quality of the network between GPUs on separate machines directly affects the performance of distributed models. - -Providing a network topology aware scheduler for Kubernetes, could help ensure that the GPUs assigned to the pods of a model deployment are -relatively local to each other. -Ideally, on the same machine or at least the same networking switch to minimize network latency and the impact of bandwidth limitations. - - -## Developing this Guide - -During the development of this guide, I ran into several problems that needed to be solved before we could provide a useful -guide. -This section will outline and describe the issues I ran into and how we resolved them. - -> _This document was developed using a Kubernetes cluster provided by Amazon EKS._ -> _Clusters provisioned on-premises or provided by other cloud service providers such as Azure AKS or GCloud GKE might require_ -> _modifications to this guide._ - - -### Why This Set of Software Components? - -The set of software packages described in this document is close the minimum viable set of packages without handcrafting -custom Helm charts and YAML files for every package and dependency. -Is this the only set of packages and components that can be used to make this solution work? -Definitely not, there are several alternatives which could meet our requirements. -This set of packages and components is just the set I happen to choose for this guide. - -Below is a high-level description of why each package is listed in this guide. - -#### NVIDIA Device Plugin for Kubernetes - -Required to enable GPUs to be treated as resources by the Kubernetes scheduler. -Without this component, GPUs would not be assigned to containers correctly. - -#### NVIDIA GPU Discovery Service for Kubernetes - -Provides automatic labelling of Kubernetes nodes based on the NVIDIA devices and software available on the node. -Without the provided labels, it would not be possible to specify specific GPU SKUs when deploying models because the -Kubernetes scheduler treats all GPUs as identical (referring to them all with the generic resources name `nvidia.com/gpu`). - -#### Kubernetes Node Discovery Service - -This is a requirement for the [NVIDIA GPU Discovery Service for Kubernetes](#nvidia-gpu-discovery-service-for-kubernetes). - -#### NVIDIA DCGM Exporter - -Provides hardware monitoring and metrics for NVIDIA GPUs and other devices present in the cluster. -Without the metrics this provides, monitoring GPU utilization, temperature and other metrics would not be possible. - -While Triton Server has the capability to collect and serve NVIDIA hardware metrics, relying on Triton Server to provide this -service is non-optimal for several reasons. 
- -Firstly, having many processes on the same machine querying the NVIDIA device driver for current state, filtering the results for -only values that pertain to the individual process, and serving them via Triton's open-metrics server is wasteful in proportion to -the number of Triton Server processes beyond the first on the node. - -Secondly, due to the need to interface with the kernel-mode driver to retrieve hardware metrics, queries get serialized, adding -additional overhead and latency to the system. - -Finally, the rate at which metrics are collected from Triton Server is not the same as the rate at which metrics are collected -from the DCGM Exporter. -Separating the metrics collection from Triton Server allows for customized metric collection rates, which enables us to -further minimize the process overhead placed on the node. - -##### Why is the DCGM Exporter Values File Custom? - -I decided to use a custom values file when installing the DCGM Exporter Helm chart for several reasons. - -Firstly, it is my professional opinion that every container in a cluster should specify resource limits and requests. -Not doing so opens the node up to a number of difficult to diagnose failure conditions related to resource exhaustion. -Out of memory errors are the most obvious and easiest to root cause. -Additionally, difficult to reproduce, transient timeout and timing errors caused by CPU over-subscription can easily happen when -any container is unconstrained and quickly waste an entire engineering team's time as they attempt to triage, debug, and -resolve them. - -Secondly, the DCGM Exporter process itself spams error logs when it cannot find NVIDIA devices in the system. -This is primarily because the service was originally created for non-Kubernetes environments. -Therefore I wanted to restrict which node the exporter would get deployed to. -Fortunately, the DCGM Helm chart makes this easy by supporting node selector options. - -Thirdly, because nodes with NVIDIA GPUs have been tainted with the `nvidia.com/gpu=present:NoSchedule` taint that prevents any -pod which does not explicitly tolerate the taint from being assigned to the node, I needed to add the tolerations to the DCGM -Exporter pod. - -Finally, the default Helm chart for DCGM Exporter is missing the required `--kubernetes=true` option being passed in via -command line options when the process is started. -Without this option, DCGM Exporter does not correctly associate hardware metrics with the pods actually using it, and -there would be no mechanism for understanding how each pod uses the GPU resources assigned to it. - - -### Why Use the Triton CLI and Not Other Tools Provided by NVIDIA? - -I chose to use the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool to optimize models for -TensorRT-LLM instead of other available tools for a couple of reasons. - -Firstly, using the Triton CLI simplifies the conversion and optimization of models into a single command. - -Secondly, relying on the Triton CLI simplifies the creation of the container because all requirements were met with a single -`pip install` command. - -#### Why Use a Custom Branch of Triton CLI Instead of an Official Release? - -I decided to use a custom [branch of Triton CLI](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn) -because there are features this guide needed that were not present in any of the official releases available. 
-The branch is not a Merge Request because the method used to add the needed features does not aligned with changes the -maintainers have planned. -Once we can achieve alignment, this guide will be updated to use an official release. - - -### Why Does the Chart Run a Python Script Instead of Triton Server Directly? - -There are two reasons: - -1. In order to retrieve a model from Hugging Face, convert and optimize it for TensorRT-LLM, and cache it on the host, I - decided that [pod initialization container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) was the - most straightforward solution. - - In order to make the best use of the initialization container I chose to use a custom [server.py](./containers/server.py) - script that made of the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool. - -2. Multi-GPU deployments require a rather specialized command line to run, and generating it using Helm chart scripting was - not something I wanted to deal with. - Leveraging the custom Python script was the logical, and easiest, solution. - -#### Why is the Python Written Like That? - -Because I'm not a Python developer, but I am learning! -My background is in C/C++ with plenty of experience with shell scripting languages. - - -### Why Use a Custom Triton Image? - -I decided to use a custom image for a few reasons. - -1. Given the answer above and the use of Triton CLI and a custom Python script, the initialization container needed both - components pre-installed in it to avoid unnecessary use of ephemeral storage. - - > [!Warning] - > Use of ephemeral storage can lead to pod eviction, and therefore should be avoided whenever possible. - -2. Since the Triton + TRT-LLM image is already incredibly large, I wanted to avoid consuming additional host storage space - with yet another container image. - - Additionally, the experience of a pod appearing to be stuck in the `Pending` state while it download a container prior to - the initialization container is easier to understand compared to a short `Pending` state before the initialization - container, followed by a much longer `Pending` state before the Triton Server can start. - -3. I wanted a custom, constant environment variable set for `ENGINE_DEST_PATH` that could be used by both the initialization - and Triton Server containers. - ---- - -Software versions featured in this document: - -* Triton Inference Server v2.45.0 (24.04-trtllm-python-py3) -* TensorRT-LLM v0.9.0 -* Triton CLI v0.0.7 -* NVIDIA Device Plugin for Kubernetes v0.15.0 -* NVIDIA GPU Discovery Service for Kubernetes v0.8.2 -* NVIDIA DCGM Exporter v3.3.5 -* Kubernetes Node Discovery Service v0.15.4 -* Prometheus Stack for Kubernetes v58.7.2 -* Prometheus Adapter for Kubernetes v4.10.0 - ---- - -Author: J Wyman, System Software Architect, AI & Distributed Systems - -Copyright © 2024, NVIDIA CORPORATION. All rights reserved. 
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore deleted file mode 100644 index 10c40355..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dev_values.yaml diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml deleted file mode 100644 index 03e6d381..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v2 -appVersion: 0.1.0 -description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial -icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png -name: triton_trt-llm_multi-node_example -version: 0.1.0 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml deleted file mode 100644 index 4afa2eaa..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -gpu: Tesla-V100-SXM2-16GB - -model: - name: gpt2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml deleted file mode 100644 index 803124f1..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: NVIDIA-A10G - -model: - name: llama-2-70b - tensorrtLlm: - conversion: - gpu: 8 - memory: 256Gi - parallelism: - tensor: 8 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml deleted file mode 100644 index 0a701e24..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: Tesla-V100-SXM2-16GB - -model: - name: llama-2-7b-chat - tensorrtLlm: - conversion: - gpu: 2 - memory: 64Gi - parallelism: - tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml deleted file mode 100644 index 0b0b4666..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: Tesla-V100-SXM2-16GB - -model: - name: llama-2-7b - tensorrtLlm: - conversion: - gpu: 2 - memory: 64Gi - parallelism: - tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml deleted file mode 100644 index 67b93d5b..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: NVIDIA-A10G - -model: - name: llama-3-70b-instruct - tensorrtLlm: - conversion: - gpu: 8 - memory: 256Gi - parallelism: - tensor: 8 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml deleted file mode 100644 index d849fecd..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: Tesla-V100-SXM2-16GB - -model: - name: llama-3-8b-instruct - tensorrtLlm: - conversion: - gpu: 4 - memory: 128Gi - parallelism: - tensor: 4 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml deleted file mode 100644 index 9f7b594e..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. 
- -gpu: Tesla-V100-SXM2-16GB - -model: - name: llama-3-8b - tensorrtLlm: - conversion: - gpu: 2 - memory: 64Gi - parallelism: - tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml deleted file mode 100644 index 12a4be4e..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See values.yaml for reference values. - -gpu: Tesla-V100-SXM2-16GB - -model: - name: opt125m diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt deleted file mode 100644 index 6591ffbe..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt +++ /dev/null @@ -1,48 +0,0 @@ -{{- $create_account := true }} -{{- $create_job := true }} -{{- $create_service := true }} -{{- with $.Values.model }} -{{- if .skipConversion }} -{{- $create_job = false }} -{{- end }} -{{- end }} -{{- with $.Values.kubernetes }} -{{- if .noService }} -{{- $create_service = false }} -{{- end }} -{{- if .serviceAccount}} -{{- $create_account = false }} -{{- end }} -{{- end }} - -{{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. - -Release Name: {{ $.Release.Name }} -Namespace: {{ $.Release.Namespace }} -Deployment Name: {{ $.Release.Name }} -{{- if $create_job }} -Conversion Job: {{ $.Release.Name }} -{{- end }} -{{- if $create_service }} -Service Name: {{ $.Release.Name }} -{{- end }} -{{- if $create_account }} -ServiceAccount Name: {{ $.Release.Name }} -{{- end }} - -Helpful commands: - - $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} - $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} - $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments -{{- if $create_job -}} -,jobs -{{- end -}} -,pods -{{- if $create_service -}} -,services -{{- end -}} -,podmonitors -{{- if $create_account -}} -,serviceAccounts -{{- end -}} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml deleted file mode 100644 index 705e7e10..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -{{- $hostRootPath := "/triton" }} -{{- $image_name := "" }} -{{- with $.Values.triton }} -{{- with .image }} -{{- $image_name = required "Property '.triton.image.name' is required." .name }} -{{- else }} -{{- fail "Property '.triton.image' is required." }} -{{- end }} -{{- else }} -{{- fail "Property '.triton' is required" }} -{{- end }} -{{- $model_name := "" }} -{{- $model_dt := "float16" }} -{{- $model_pp := 1 }} -{{- $model_tp := 1 }} -{{- with $.Values.kubernetes }} -{{- with .hostRootPath }} -{{- $hostRootPath = . }} -{{- end }} -{{- end }} -{{- with $.Values.model }} -{{- $model_name = required "Property '.model.name' is required." .name }} -{{- with .tensorrtLlm }} -{{- with .dataType }} -{{- $model_dt = . }} -{{- end }} -{{- with .parallelism }} -{{- with .pipeline }} -{{- $model_pp = (int .) }} -{{- end }} -{{- with .tensor }} -{{- $model_tp = (int .) }} -{{- end }} -{{- end }} -{{- end }} -{{- else }} -{{- fail "Property '.model' is required." }} -{{- end }} -{{- $model_lower := lower $model_name }} -{{- $model_upper := upper $model_name }} -{{- $pod_count := mul $model_pp $model_tp }} -{{- $triton_cpu := 4 }} -{{- $triton_memory := "32Gi" }} -{{- with $.Values.triton }} -{{- with .image }} -{{- with .name }} -{{- $image_name = . }} -{{- end }} -{{- end }} -{{- with .resources }} -{{- with .cpu }} -{{- $triton_cpu = (int .) }} -{{- end }} -{{- with .memory }} -{{- $triton_memory = . }} -{{- end }} -{{- end }} -{{- end }} -{{- $engine_path := printf "/var/run/models/%s/%dx%d/engine" $model_lower (int $model_pp) (int $model_tp) }} -{{- $model_path := printf "/var/run/models/%s/%dx%d/model" $model_lower (int $model_pp) (int $model_tp) }} -{{- $skip_conversion := false }} -{{- with $.Values.model }} -{{- with .skipConversion }} -{{- $skip_conversion = . }} -{{- end }} -{{- end }} -{{- $hf_verbosity := "error" }} -{{- with $.Values.logging }} -{{- with .initialization }} -{{- if .verbose }} -{{- $hf_verbosity = "info" }} -{{- end }} -{{- end }} -{{- end }} -{{- $service_account := $.Release.Name }} -{{- with $.Values.kubernetes }} -{{- with .serviceAccount }} -{{- $service_account = . }} -{{- end }} -{{- end }} -{{- range $i := until (int $pod_count) }} -{{- if eq $i 0 }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ $.Release.Name }}-leader - labels: - app: {{ $.Release.Name }} -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} -spec: - replicas: 1 - selector: - matchLabels: - app: {{ $.Release.Name }} - pod-rank: {{ $i | quote }} - template: - metadata: - labels: - app: {{ $.Release.Name }} - app.kubernetes.io/component: server - pod-rank: {{ $i | quote }} -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 8 }} -{{- end }} -{{- end }} - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu - operator: Exists - - key: nvidia.com/gpu.product - operator: In - values: - - {{ required "Property '.gpu' is required." 
$.Values.gpu }} - containers: - - name: triton - command: - - python3 - - ./server.py - - leader - - --deployment={{ $.Release.Name }} - - --namespace={{ $.Release.Namespace }} - - --dt={{ $model_dt }} - - --pp={{ $model_pp }} - - --tp={{ $model_tp }} - - --multinode -{{- if $skip_conversion }} - - --noconvert -{{- end }} -{{- with $.Values.logging }} -{{- with .tritonServer }} -{{- if .useIso8601 }} - - --iso8601 -{{- end }} -{{- if .verbose }} - - --verbose -{{- end }} -{{- end }} -{{- end }} - env: - - name: ENGINE_DEST_PATH - value: {{ $engine_path }} - - name: MODEL_DEST_PATH - value: {{ $model_path }} -{{- with $.Values.logging }} -{{- with .tritonServer }} -{{- if .verbose }} - - name: NCCL_DEBUG - value: INFO -{{- end }} -{{- end }} -{{- end }} - image: {{ $image_name }} - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 15 - httpGet: - path: /v2/health/live - port: 8000 - initialDelaySeconds: 10 - periodSeconds: 2 - successThreshold: 1 - ports: - - containerPort: 8000 - name: http - - containerPort: 8001 - name: grpc - - containerPort: 8002 - name: metrics - readinessProbe: - failureThreshold: 15 - httpGet: - path: /v2/health/ready - port: 8000 - initialDelaySeconds: 15 - periodSeconds: 2 - successThreshold: 1 - resources: - limits: - cpu: {{ $triton_cpu }} - ephemeral-storage: 1Gi - memory: {{ $triton_memory }} - nvidia.com/gpu: 1 - requests: - cpu: {{ $triton_cpu }} - ephemeral-storage: 1Gi - memory: {{ $triton_memory }} - nvidia.com/gpu: 1 - startupProbe: - failureThreshold: 60 - httpGet: - path: /v2/health/ready - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 15 - successThreshold: 1 - volumeMounts: - - mountPath: /var/run/models - name: model-repository - readOnly: true -{{- with $.Values.triton }} -{{- with .image }} -{{- with .pullSecrets }} - imagePullSecrets: -{{ toYaml . | indent 6 }} -{{- end }} -{{- end }} -{{- end }} - restartPolicy: Always - serviceAccountName: {{ $service_account }} - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists -{{- with $.Values.kubernetes }} -{{- with .tolerations }} -{{ toYaml . | indent 6 }} -{{- end }} -{{- end }} - volumes: -{{- with $.Values.model }} -{{- with .pullSecret }} - - name: hf-secret - secret: - secretName: {{ . }} -{{- end }} -{{- end }} - - name: model-repository - persistentVolumeClaim: - claimName: {{ $.Values.model.persistentVolumeClaim }} - readOnly: false -{{- else }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ $.Release.Name }}-worker{{ $i }} - labels: - app: {{ $.Release.Name }} -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} -spec: - replicas: 1 - selector: - matchLabels: - app: {{ $.Release.Name }} - pod-rank: {{ $i | quote }} - template: - metadata: - labels: - app: {{ $.Release.Name }} - app.kubernetes.io/component: worker - pod-rank: {{ $i | quote }} -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 8 }} -{{- end }} -{{- end }} - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu - operator: Exists - - key: nvidia.com/gpu.product - operator: In - values: - - {{ required "Property '.gpu' is required." 
$.Values.gpu }} - containers: - - name: worker-{{ $i }} - command: - - python3 - - ./server.py - - worker - env: - - name: ENGINE_DEST_PATH - value: {{ $engine_path }} - - name: MODEL_DEST_PATH - value: {{ $model_path }} -{{- with $.Values.logging }} -{{- with .tritonServer }} -{{- if .verbose }} - - name: NCCL_DEBUG - value: INFO -{{- end }} -{{- end }} -{{- end }} - image: {{ $image_name }} - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: {{ $triton_cpu }} - ephemeral-storage: 1Gi - memory: {{ $triton_memory }} - nvidia.com/gpu: 1 - requests: - cpu: {{ $triton_cpu }} - ephemeral-storage: 1Gi - memory: {{ $triton_memory }} - nvidia.com/gpu: 1 - volumeMounts: - - mountPath: /var/run/models - name: model-repository - readOnly: true -{{- with $.Values.triton }} -{{- with .image }} -{{- with .pullSecrets }} - imagePullSecrets: -{{ toYaml . | indent 6 }} -{{- end }} -{{- end }} -{{- end }} - restartPolicy: Always - serviceAccountName: {{ $service_account }} - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists -{{- with $.Values.kubernetes }} -{{- with .tolerations }} -{{ toYaml . | indent 6 }} -{{- end }} -{{- end }} - volumes: - - name: model-repository - persistentVolumeClaim: - claimName: {{ $.Values.model.persistentVolumeClaim }} - readOnly: true -{{- end }} -{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml deleted file mode 100644 index 55a64568..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. - -{{- $skip_conversion := false }} -{{- with $.Values.model }} -{{- with .skipConversion }} -{{- $skip_conversion = . }} -{{- end }} -{{- end }} -{{- if not $skip_conversion }} -{{- $hostRootPath := "/triton" }} -{{- $image_name := "" }} -{{- with $.Values.triton }} -{{- with .image }} -{{- $image_name = required "Property '.triton.image.name' is required." .name }} -{{- else }} -{{- fail "Property '.triton.image' is required." }} -{{- end }} -{{- else }} -{{- fail "Property '.triton' is required" }} -{{- end }} -{{- $model_name := "" }} -{{- $model_dt := "float16" }} -{{- $model_pp := 1 }} -{{- $model_tp := 1 }} -{{- with $.Values.kubernetes }} -{{- with .hostRootPath }} -{{- $hostRootPath = . }} -{{- end }} -{{- end }} -{{- with $.Values.model }} -{{- $model_name = required "Property '.model.name' is required." .name }} -{{- with .tensorrtLlm }} -{{- with .dataType }} -{{- $model_dt = . }} -{{- end }} -{{- with .parallelism }} -{{- with .pipeline }} -{{- $model_pp = (int .) }} -{{- end }} -{{- with .tensor }} -{{- $model_tp = (int .) }} -{{- end }} -{{- end }} -{{- end }} -{{- else }} -{{- fail "Property '.model' is required." 
}} -{{- end }} -{{- $model_lower := lower $model_name }} -{{- $model_upper := upper $model_name }} -{{- $pod_count := mul $model_pp $model_tp }} -{{- $model_cpu := 4 }} -{{- $model_gpu := 1 }} -{{- $model_memory := "32Gi" }} -{{- with $.Values.triton }} -{{- with .image }} -{{- with .name }} -{{- $image_name = . }} -{{- end }} -{{- end }} -{{- end }} -{{- with $.Values.model }} -{{- with .tensorrtLlm }} -{{- with .conversion }} -{{- with .cpu }} -{{- $model_cpu = . }} -{{- end }} -{{- with .gpu }} -{{- $model_gpu = (int .) }} -{{- end}} -{{- with .memory }} -{{- $model_memory = . }} -{{- end }} -{{- end }} -{{- end }} -{{- end }} -{{- $engine_path := printf "/var/run/models/%s/%dx%d/engine" $model_lower (int $model_pp) (int $model_tp) }} -{{- $model_path := printf "/var/run/models/%s/%dx%d/model" $model_lower (int $model_pp) (int $model_tp) }} -{{- $hf_verbosity := "error" }} -{{- with $.Values.logging }} -{{- with .initialization }} -{{- if .verbose }} -{{- $hf_verbosity = "info" }} -{{- end }} -{{- end }} -{{- end }} -{{- $service_account := $.Release.Name }} -{{- with $.Values.kubernetes }} -{{- with .serviceAccount }} -{{- $service_account = . }} -{{- end }} -{{- end }} -apiVersion: batch/v1 -kind: Job -metadata: - labels: - app: {{ $.Release.Name }} -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} - name: {{ $.Release.Name }} -spec: - backoffLimit: 4 - template: - metadata: - labels: - app: {{ $.Release.Name }}-converter -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 8 }} -{{- end }} -{{- end }} - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu - operator: Exists - - key: nvidia.com/gpu.product - operator: In - values: - - {{ required "Property '.gpu' is required." $.Values.gpu }} - containers: - - name: converter - command: - - python3 - - ./server.py - - convert - - --model={{ $model_lower }} - - --dt={{ $model_dt }} - - --pp={{ $model_pp }} - - --tp={{ $model_tp }} - - --multinode -{{- with $.Values.logging }} -{{- with .initialization }} -{{- if .verbose }} - - --verbose -{{- end }} -{{- end }} -{{- end }} - env: - - name: ENGINE_DEST_PATH - value: {{ $engine_path }} - - name: HF_HOME - value: /var/run/models/hugging_face - - name: HF_HUB_VERBOSITY - value: {{ $hf_verbosity }} - - name: MODEL_DEST_PATH - value: {{ $model_path }} -{{- with $.Values.logging }} -{{- with .initialization }} -{{- if .verbose }} - - name: NCCL_DEBUG - value: INFO -{{- end }} -{{- end }} -{{- end }} - image: {{ $image_name }} - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: {{ $model_cpu }} - ephemeral-storage: 32Gi - memory: {{ $model_memory }} - nvidia.com/gpu: {{ $model_gpu }} - requests: - cpu: {{ $model_cpu }} - ephemeral-storage: 32Gi - memory: {{ $model_memory }} - nvidia.com/gpu: {{ $model_gpu }} - securityContext: - readOnlyRootFilesystem: false - runAsGroup: 0 - runAsUser: 0 - volumeMounts: -{{- with $.Values.model }} -{{- if .pullSecret }} - - mountPath: /var/run/secrets/hugging_face - name: hf-secret - readOnly: true -{{- end }} -{{- end }} - - mountPath: /var/run/models - name: model-repository - readOnly: false -{{- with $.Values.triton }} -{{- with .image }} -{{- with .pullSecrets }} - imagePullSecrets: -{{ toYaml . 
| indent 6 }} -{{- end }} -{{- end }} -{{- end }} - restartPolicy: Never - serviceAccountName: {{ $service_account }} - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists -{{- with $.Values.kubernetes }} -{{- with .tolerations }} -{{ toYaml . | indent 6 }} -{{- end }} -{{- end }} - volumes: -{{- with $.Values.model }} -{{- with .pullSecret }} - - name: hf-secret - secret: - secretName: {{ . }} -{{- end }} -{{- end }} - - name: model-repository - persistentVolumeClaim: - claimName: {{ $.Values.model.persistentVolumeClaim }} - readOnly: false -{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml deleted file mode 100644 index 4b91286d..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: {{ $.Release.Name }} - labels: - app: {{ $.Release.Name }} - app.kubernetes.io/component: monitor - release: prometheus -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} -spec: - selector: - matchLabels: - app: {{ $.Release.Name }} - app.kubernetes.io/component: server - podMetricsEndpoints: - - port: metrics - path: /metrics diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml deleted file mode 100644 index 59903ae3..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -{{- $service_account := 0 }} -{{- with $.Values.kubernetes }} -{{- with .serviceAccount }} -{{- $service_account = . }} -{{- end }} -{{- end }} -{{- if not $service_account }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . 
| indent 4 }} -{{- end }} -{{- end }} - name: {{ $.Release.Name }} -rules: -- apiGroups: - - '' - - apps - - batch - resources: - - deployments - - jobs - - pods - - pods/status - - services - verbs: - - get - - list -- apiGroups: [''] - resources: - - pods/exec - verbs: - - create - ---- - -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} - name: {{ $.Release.Name }} - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} - name: {{ $.Release.Name }} -subjects: -- kind: ServiceAccount - name: {{ $.Release.Name }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ $.Release.Name }} -{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml deleted file mode 100644 index 3bf3b3d5..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -{{- $noService := false }} -{{- with $.Values.kubernetes }} -{{- with .noService }} -{{- $noService = . }} -{{- end }} -{{- end }} -{{- if $noService }} -# Chart values optioned to not create a service. Service not created. -{{- else }} -apiVersion: v1 -kind: Service -metadata: - name: {{ $.Release.Name }} - labels: - app: {{ $.Release.Name }} - app.kubernetes.io/component: service -{{- with $.Values.kubernetes }} -{{- with .labels }} -{{ toYaml . | indent 4 }} -{{- end }} -{{- end }} -spec: - ports: - - name: http - port: 8000 - targetPort: http - - name: grpc - port: 8001 - targetPort: grpc - - name: metrics - port: 8002 - targetPort: metrics - selector: - app: {{ $.Release.Name }} - app.kubernetes.io/component: server - pod-rank: {{ 0 | quote}} - type: ClusterIP -{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json deleted file mode 100644 index 99917ea2..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json +++ /dev/null @@ -1,324 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft-07/schema#", - "copyright": [ - "# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.", - "# NVIDIA CORPORATION and its licensors retain all intellectual property", - "# and proprietary rights in and to this software, related documentation", - "# and any modifications thereto. 
Any use, reproduction, disclosure or", - "# distribution of this software and related documentation without an express", - "# license agreement from NVIDIA CORPORATION is strictly prohibited." - ], - "properties": { - "gpu": { - "description": "Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label.", - "type": "string" - }, - "model": { - "description": "Configuration options related to the AI model to be deployed.", - "properties": { - "name": { - "description": "Name of the model to be served Triton Server instances.", - "pattern": "(gpt2|opt125m|llama-(2-(7b|70b)(-chat)?|3-(8b|70b)(-instruct)?))", - "type": "string" - }, - "persistentVolumeClaim": { - "description": "Persistent volume claim where model content will be persisted.", - "type": "string" - }, - "pullSecret": { - "description": "Name of the secret used to download the model from Hugging Face.", - "oneOf": [ - { "type": "string" }, - { "type": "null" } - ] - }, - "skipConversion": { - "description": "When `false` a model conversion job is created and the leader pod will wait for the job to complete before starting Triton; otherwise this doesn't happen.", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - }, - "tensorrtLlm": { - "description": "Configuration options related to the conversion of a non-optimized model into TensorRT format.", - "oneOf": [ - { - "properties": { - "conversion": { - "description": "Configuration opens related to conversion of non-TensorRT models to TensorRT engine and plan files.", - "oneOf": [ - { - "properties": { - "cpu": { - "description": "Number of logical CPU cores reserved for, and assigned to the model conversion job.", - "oneOf": [ - { - "minimum": 1, - "type": "integer" - }, - { - "pattern": "^\\d+m$", - "type": "string" - }, - { "type": "null" } - ] - }, - "gpu": { - "description": "Number of GPUs reserved for, and assigned to the model conversion job.", - "oneOf": [ - { - "minimum": 0, - "type": "integer" - }, - { "type": "null" } - ] - }, - "memory": { - "description": "Amount of CPU-visible system memory allocated to, and reserved for the model conversion job.", - "oneOf": [ - { - "pattern": "^\\d+[GKMgkm]i$", - "type": "string" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - }, - "dataType": { - "description": "Data type used when compiling and optimizing the model for TensorRT.", - "oneOf": [ - { - "pattern": "(bfloat16|float16|float32)", - "type": "string" - }, - { "type": "null" } - ] - }, - "enable": { - "description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - }, - "parallelism": { - "description": "Parallelism configuration options which affect how the model is converted to TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs.", - "oneOf": [ - { - "properties": { - "pipeline": { - "oneOf": [ - { - "minimum": 1, - "type": "integer" - }, - { "type": "null" } - ] - }, - "tensor": { - "oneOf": [ - { - "minimum": 1, - "type": "integer" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - } - }, - "required": [ - "name", - "persistentVolumeClaim" - ], - "type": "object" - }, - "triton": { - "description": "Configuration options for Triton Server.", - "properties": { - "image": { - "description": "Configuration options related to the container 
image for Triton Server.", - "properties": { - "pullSecrets": { - "description": "Optional list of pull secrets to be used when downloading the Triton Server container image.", - "oneOf": [ - { - "items": [ - { "type": "object" } - ], - "type": "array" - }, - { "type": "null" } - ] - }, - "name": { - "description": "Name of the container image containing the version of Triton Server to be used.", - "type": "string" - } - }, - "required": [ "name" ], - "type": "object" - }, - "resources": { - "description": "Configuration options managing the resources assigned to individual Triton Server instances. ", - "oneOf": [ - { - "properties": { - "cpu": { - "description": "Number of logical CPU cores reserved for, and assigned to each instance of Triton Server.", - "oneOf": [ - { - "minimum": 1, - "type": "integer" - }, - { - "pattern": "^\\d+m$", - "type": "string" - }, - { "type": "null" } - ] - }, - "memory": { - "description": "Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server.", - "oneOf": [ - { - "pattern": "^\\d+[GKMgkm]i$", - "type": "string" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - } - }, - "required": [ "image" ], - "type": "object" - }, - "logging": { - "description": "Configuration options related to how various components generate logs.", - "oneOf": [ - { - "properties": { - "initialization": { - "description": "Logging configuration options specific to the initialization container.", - "oneOf": [ - { - "properties": { - "verbose": { - "description": "When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used.", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - }, - "tritonServer": { - "description": "Logging configuration options specific to Triton Server.", - "oneOf": [ - { - "properties": { - "useIso8601": { - "description": "When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. 
", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - }, - "verbose": { - "description": "When `true` Triton Server uses verbose logging; otherwise standard logging is used.", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - }, - "kubernetes": { - "description": "Configurations option related to the Kubernetes objects created by the chart.", - "oneOf": [ - { - "properties": { - "hostRootPath": { - "description": "Root file-system path used when mounting content to the underlying host.", - "oneOf": [ - { "type": "string" }, - { "type": "null" } - ] - }, - "labels": { - "description": "Optional set of labels to be applied to created Kubernetes objects.", - "oneOf": [ - { "type": "object" }, - { "type": "null" } - ] - }, - "noService": { - "description": "When `false`, a service will not be created when the chart is installed; otherwise a service will be created.", - "oneOf": [ - { "type": "boolean" }, - { "type": "null" } - ] - }, - "tolerations": { - "description": "Tolerations applied to every pod deployed as part of this deployment.", - "oneOf": [ - { - "items": [ - { - "description": "Toleration applied to every pod deployed as part of this deployment.", - "type": "object" - }, - { "type": "null" } - ], - "type": "array" - }, - { "type": "null" } - ] - } - }, - "type": "object" - }, - { "type": "null" } - ] - } - }, - "required": [ - "gpu", - "model", - "triton" - ] -} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml deleted file mode 100644 index 4d7e7328..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The GPU SKU that supports `.model` and to which Triton Server instances can be deployed. -# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label. -# Run 'kubectl get nodes' to find node names. -# Run 'kubectl describe node ' to inspect a node's labels. -gpu: # (required) -# Example values: NVIDIA-A100-SXM4-40GB, NVIDIA-A10G, Tesla-V100-SXM2-16GB, Tesla-T4 - -# Configuration options related to the AI model to be deployed. -model: # (required) - # Name of the model to be served Triton Server instances. - # Supported values are: - # - gpt2 - # - llama-2-7b - # - llama-2-70b - # - llama-2-7b-chat - # - llama-2-70b-chat - # - llama-3-8b - # - llama-3-70b - # - llama-3-8b-instruct - # - llama-3-70b-instruct - # - opt125m - name: # (required) - # Persistent volume claim where model content will be persisted. - # Expected to support read/write many access. - persistentVolumeClaim: # (required) - # Name of the secret used to download the model from Hugging Face. 
- # GPT2 does not require an access token to download. - # Other models may require per repository permissions to be granted. - pullSecret: # (optional) - # When `false` a model conversion job is created and the leader pod will wait for the job to complete before starting Triton; otherwise this doesn't happen. - # When not relying on the model conversion job, the following must exist on the persistent volume: - # - models: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/model" - # - engine: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/engine" - skipConversion: # (default: false) - # Configuration options related to the conversion of a non-optimized model into TensorRT format. - tensorrtLlm: # (optional) - # Configuration opens related to conversion of non-TensorRT models to TensorRT engine and plan files. - # Ignored when `model.skipConversion` is `true`. - conversion: # (optional) - # Number of logical CPU cores reserved for, and assigned to the model conversion job. - cpu: # (default: 4) - # Number of GPUs reserved for, and assigned to the model conversion job. - gpu: # (default: 1) - # Amount of CPU-visible system memory allocated to, and reserved for the model conversion job. - memory: # (default: 32Gi) - # Data type used when compiling and optimizing the model for TensorRT. - # Supported options are float16, bfloat16, float32 - dataType: # (default: float16) - # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server. - # When 'false', the init container will fall back to vLLM and parallelism options are ignored. - enable: true # (default: true) - # Parallelism configuration options which affect how the model is converted to - # TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs. - parallelism: # (optional) - # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a - # subset of layers that is executed on a separate device. - # The main limitation of this method is that, due to the sequential nature of the processing, some devices or - # layers may remain idle while waiting for the output. - pipeline: # (default: 1) - # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, - # independent blocks of computation that can be executed on different devices. - # Attention blocks and multi-layer perceptron (MLP) layers are major components of transformers that can take advantage of - # tensor parallelism. - # In multi-head attention blocks, each head or group of heads can be assigned to a different device so they can be computed - # independently and in parallel. - tensor: # (default: 1) - -# Configuration options for Triton Server. -triton: # (required) - # Configuration options related to the container image for Triton Server. - image: # (required) - # Optional list of pull secrets to be used when downloading the Triton Server container image. - pullSecrets: # (optional) - # - name: ngc-container-pull - # Name of the container image containing the version of Triton Server to be used. - name: # (required) - # Configuration options managing the resources assigned to individual Triton Server instances. - resources: # (optional) - # Number of logical CPU cores reserved for, and assigned to each instance of Triton Server. - cpu: # (default: 4) - # Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server. 
- memory: # (default: 32Gi) - -# Configuration options related to how various components generate logs. -logging: # (optional) - # Logging configuration options specific to the initialization container. - initialization: - # When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used. - verbose: # (default: false) - # Logging configuration options specific to Triton Server. - tritonServer: - # When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. - useIso8601: # (default: false) - # When `true` Triton Server uses verbose logging; otherwise standard logging is used. - verbose: # (default: false) - -# Configurations option related to the Kubernetes objects created by the chart. -kubernetes: # (optional) - # Root file-system path used when mounting content to the underlying host. - hostRootPath: # (default: /triton) - # Optional set of labels to be applied to created Kubernetes objects. - # These labels can be used for association with a preexisting service object. - labels: # (optional) - # customLabel: exampleValue - # When `false`, a service will not be created when the chart is installed; otherwise a service will be created. - noService: # (default: false) - # Name of the service account to use when deploying components. - # When not provided, a service account will be created. - serviceAccount: # (optional) - # Tolerations applied to every pod deployed as part of this deployment. - # Template already includes `nvidia.com/gpu=present:NoSchedule`. - tolerations: # (optional) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md deleted file mode 100644 index 98a9f49f..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md +++ /dev/null @@ -1,26 +0,0 @@ - - - -# Container Generation - -The files in this folder are intended to be used to create the Triton Server container image. - -Run the following command to create a Triton Server container image. - -```bash -docker build --file ./triton_trt-llm.containerfile --tag . -``` diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh deleted file mode 100755 index 4eb88dab..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -pod=$1 -shift -kubectl exec $pod -- /bin/sh -c "$*" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py deleted file mode 100644 index 2b59895d..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import shutil -import signal -import subprocess -import sys -import time - -# These values are expected to match the mount points in the Helm Chart. -# Any changes here must also be made there, and vice versa. -HUGGING_FACE_TOKEN_PATH = "/var/run/secrets/hugging_face/password" - -ERROR_EXIT_DELAY = 15 -ERROR_CODE_FATAL = 255 -ERROR_CODE_USAGE = 253 -EXIT_SUCCESS = 0 - -# Environment variable keys. -CLI_VERBOSE_KEY = "TRITON_CLI_VERBOSE" -ENGINE_PATH_KEY = "ENGINE_DEST_PATH" -HUGGING_FACE_KEY = "HF_HOME" -MODEL_PATH_KEY = "MODEL_DEST_PATH" - -HUGGING_FACE_CLI = "huggingface-cli" -DELAY_BETWEEN_QUERIES = 2 - - -# --- - - -def create_directory(directory_path: str): - if directory_path is None or len(directory_path) == 0: - return - - segments = directory_path.split("/") - path = "" - - for segment in segments: - if segment is None or len(segment) == 0: - continue - - path = f"{path}/{segment}" - - if is_verbose: - write_output(f"> mkdir {path}") - - if not os.path.exists(path): - os.mkdir(path) - - -# --- - - -def die(exit_code: int): - if exit_code is None: - exit_code = ERROR_CODE_FATAL - - write_error(f" Waiting {ERROR_EXIT_DELAY} second before exiting.") - # Delay the process' termination to provide a small window for administrators to capture the logs before it exits and restarts. - time.sleep(ERROR_EXIT_DELAY) - - exit(exit_code) - - -# --- - - -def hugging_face_authenticate(args): - # Validate that `HF_HOME` environment variable was set correctly. - if HUGGING_FACE_HOME is None or len(HUGGING_FACE_HOME) == 0: - raise Exception(f"Required environment variable '{HUGGING_FACE_KEY}' not set.") - - # When a Hugging Face secret has been mounted, we'll use that to authenticate with Hugging Face. - if os.path.exists(HUGGING_FACE_TOKEN_PATH): - with open(HUGGING_FACE_TOKEN_PATH) as token_file: - write_output( - f"Hugging Face token file '{HUGGING_FACE_TOKEN_PATH}' detected, attempting to authenticate w/ Hugging Face." - ) - write_output(" ") - - hugging_face_token = token_file.read() - - # Use Hugging Face's CLI to complete the authentication. - result = run_command( - [HUGGING_FACE_CLI, "login", "--token", hugging_face_token], [3] - ) - - if result != 0: - raise Exception(f"Hugging Face authentication failed. 
({result})") - - write_output("Hugging Face authentication successful.") - write_output(" ") - - -# --- - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("mode", type=str, choices=["convert", "leader", "worker"]) - parser.add_argument("--model", type=str, default=None) - parser.add_argument( - "--dt", - type=str, - default="float16", - choices=["bfloat16", "float16", "float32"], - help="Tensor type.", - ) - parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism.") - parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism.") - parser.add_argument("--iso8601", action="count", default=0) - parser.add_argument("--verbose", action="count", default=0) - parser.add_argument( - "--deployment", type=str, help="Name of the Kubernetes deployment." - ) - parser.add_argument( - "--namespace", - type=str, - default="default", - help="Namespace of the Kubernetes deployment.", - ) - parser.add_argument("--multinode", action="count", default=0) - parser.add_argument( - "--noconvert", - action="count", - default=0, - help="Prevents leader waiting for model conversion before inference serving begins.", - ) - - return parser.parse_args() - - -# --- - - -def remove_path(path: str): - if os.path.exists(path): - if os.path.isfile(path): - if is_verbose: - write_output(f"> rm {path}") - - os.remove(path) - else: - if is_verbose: - write_output(f"> rm -rf {path}") - - shutil.rmtree(path) - - -# --- - - -def run_command(cmd_args: [str], omit_args: [int] = None): - command = "" - - for i, arg in enumerate(cmd_args): - command += " " - if omit_args is not None and i in omit_args: - command += "*****" - else: - command += arg - - write_output(f">{command}") - write_output(" ") - - # Run triton_cli to build the TRT-LLM engine + plan. 
- return subprocess.call(cmd_args, stderr=sys.stderr, stdout=sys.stdout) - - -# --- - - -def signal_handler(sig, frame): - write_output(f"Signal {sig} detected, quitting.") - exit(EXIT_SUCCESS) - - -# --- - - -def wait_for_convert(args): - if args.noconvert != 0: - write_output("Leader skip waiting for model-conversion job.") - return - - write_output("Begin waiting for model-conversion job.") - - cmd_args = [ - "kubectl", - "get", - f"job/{args.deployment}", - "-n", - f"{args.namespace}", - "-o", - 'jsonpath={.status.active}{"|"}{.status.failed}{"|"}{.status.succeeded}', - ] - command = " ".join(cmd_args) - - active = 1 - failed = 0 - succeeded = 0 - - while active > 0 and succeeded == 0: - time.sleep(DELAY_BETWEEN_QUERIES) - - if is_verbose: - write_output(f"> {command}") - - output = subprocess.check_output(cmd_args).decode("utf-8") - if output is None or len(output) == 0: - continue - - if is_verbose: - write_output(output) - - output = output.strip(" ") - if len(output) > 0: - parts = output.split("|") - - if len(parts) > 2 and len(parts[2]) > 0: - succeeded = int(parts[2]) - else: - succeeded = 0 - - if len(parts) > 1 and len(parts[1]) > 0: - failed = int(parts[1]) - else: - failed = 0 - - if len(parts) > 0 and len(parts[0]) > 0: - active = int(parts[0]) - else: - active = 0 - - if active > 0: - write_output("Waiting for model-conversion job.") - elif succeeded > 0: - write_output("Model-conversion job succeeded.") - elif failed > 0: - write_error("Model-conversion job failed.") - raise RuntimeError("Model-conversion job failed.") - - write_output(" ") - - -# --- - - -def wait_for_workers(world_size: int): - if world_size is None or world_size <= 0: - raise RuntimeError("Argument `world_size` must be greater than zero.") - - write_output("Begin waiting for worker pods.") - - cmd_args = [ - "kubectl", - "get", - "pods", - "-n", - f"{args.namespace}", - "-l", - f"app={args.deployment}", - "-o", - "jsonpath='{.items[*].metadata.name}'", - ] - command = " ".join(cmd_args) - - workers = [] - - while len(workers) < world_size: - time.sleep(DELAY_BETWEEN_QUERIES) - - if is_verbose: - write_output(f"> {command}") - - output = subprocess.check_output(cmd_args).decode("utf-8") - - if is_verbose: - write_output(output) - - output = output.strip("'") - - workers = output.split(" ") - - if len(workers) < world_size: - write_output( - f"Waiting for worker pods, {len(workers)} of {world_size} ready." - ) - else: - write_output(f"{len(workers)} of {world_size} workers ready.") - - write_output(" ") - - if workers is not None and len(workers) > 1: - workers.sort() - - return workers - - -# --- - - -def write_output(message: str): - print(message, file=sys.stdout, flush=True) - - -# --- - - -def write_error(message: str): - print(message, file=sys.stderr, flush=True) - - -# --- -# Below this line are the primary functions. -# --- - - -def do_convert(args): - write_output("Initializing Model") - - if args.model is None or len(args.model) == 0: - write_error("fatal: Model name must be provided.") - die(ERROR_CODE_FATAL) - - create_directory(ENGINE_DIRECTORY) - create_directory(MODEL_DIRECTORY) - - hugging_face_authenticate(args) - - engine_path = ENGINE_DIRECTORY - engine_lock_file = os.path.join(engine_path, "lock") - engine_ready_file = os.path.join(engine_path, "ready") - model_path = MODEL_DIRECTORY - model_lock_file = os.path.join(model_path, "lock") - model_ready_file = os.path.join(model_path, "ready") - - # When the model and plan already exist, we can exit early, happily. 
- if os.path.exists(engine_ready_file) and os.path.exists(model_ready_file): - everything_exists = True - - if os.path.exists(engine_lock_file): - write_output("Incomplete engine directory detected, removing.") - everything_exists = False - remove_path(engine_path) - - if os.path.exists(model_lock_file): - write_output("Incomplete model directory detected, removing.") - everything_exists = False - remove_path(model_path) - - if everything_exists: - write_output( - f"TensorRT engine and plan detected for {args.model}. No work to do, exiting." - ) - exit(EXIT_SUCCESS) - - write_output(f"Begin generation of TensorRT engine and plan for {args.model}.") - write_output(" ") - - create_directory(engine_path) - - # Create a lock file for the engine directory. - if is_verbose: - write_output(f"> echo '{args.model}' > {engine_lock_file}") - - with open(engine_lock_file, "w") as f: - f.write(args.model) - - create_directory(model_path) - - # Create a lock file for the model directory. - if is_verbose: - write_output(f"> echo '{args.model}' > {model_lock_file}") - - with open(model_lock_file, "w") as f: - f.write(args.model) - - try: - # Build up a set of args for the subprocess call. - cmd_args = [ - "triton", - "import", - "--model", - args.model, - "--model-repository", - MODEL_DIRECTORY, - ] - - cmd_args += ["--backend", "tensorrtllm"] - - if args.dt is not None and args.dt in ["bfloat16", "float16", "float32"]: - cmd_args += ["--data-type", args.dt] - - if args.pp > 1: - cmd_args += ["--pipeline-parallelism", f"{args.pp}"] - - if args.tp > 1: - cmd_args += ["--tensor-parallelism", f"{args.tp}"] - - if args.tp * args.pp > 1 and args.multinode > 0: - cmd_args += ["--disable-custom-all-reduce"] - - # When verbose, insert the verbose flag. - # It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position. - # This limitation will likely be removed in a future release of triton_cli. - if is_verbose: - cmd_args.insert(1, "--verbose") - - result = run_command(cmd_args) - - if result == 0: - # Create the engine ready file. - if is_verbose: - write_output(f"> echo '{args.model}' > {engine_ready_file}") - - with open(engine_ready_file, "w") as f: - f.write(args.model) - - # Create the model ready file. - if is_verbose: - write_output(f"> echo '{args.model}' > {model_ready_file}") - - with open(model_ready_file, "w") as f: - f.write(args.model) - - # Remove the lock files. - if is_verbose: - write_output(f"> rm {engine_lock_file}") - - os.remove(engine_lock_file) - - if is_verbose: - write_output(f"> rm {model_lock_file}") - - os.remove(model_lock_file) - else: - # Clean the model and engine directories when the command fails. - remove_path(engine_path) - remove_path(model_path) - - exit(result) - - except Exception as exception: - remove_path(engine_path) - remove_path(model_path) - raise exception - - -# --- - - -def do_leader(args): - world_size = args.tp * args.pp - - if world_size <= 0: - raise Exception( - "usage: Options --tp and --pp must both be equal to or greater than 1."
- ) - - write_output(f"Executing Leader (world size: {world_size})") - - wait_for_convert(args) - - workers = wait_for_workers(world_size) - - if len(workers) != world_size: - write_error(f"fatal: {len(workers)} found, expected {world_size}.") - die(ERROR_EXIT_DELAY) - - cmd_args = [ - "mpirun", - "--allow-run-as-root", - ] - - if is_verbose > 0: - cmd_args += ["--debug-devel"] - - cmd_args += [ - "--report-bindings", - "-mca", - "plm_rsh_agent", - "kubessh", - "-np", - f"{world_size}", - "--host", - ",".join(workers), - ] - - # Add per node command lines separated by ':'. - for i in range(world_size): - if i != 0: - cmd_args += [":"] - - cmd_args += [ - "-n", - "1", - "tritonserver", - "--allow-cpu-metrics=false", - "--allow-gpu-metrics=false", - "--disable-auto-complete-config", - f"--id=rank{i}", - "--model-load-thread-count=2", - f"--model-repository={MODEL_DIRECTORY}", - ] - - # Rank0 node needs to support metrics collection and web services. - if i == 0: - cmd_args += [ - "--allow-metrics=true", - "--metrics-interval-ms=1000", - ] - - if is_verbose > 0: - cmd_args += ["--log-verbose=1"] - - if args.iso8601 > 0: - cmd_args += ["--log-format=ISO8601"] - - # Rank(N) nodes can disable metrics, web services, and logging. - else: - cmd_args += [ - "--allow-http=false", - "--allow-grpc=false", - "--allow-metrics=false", - "--model-control-mode=explicit", - "--load-model=tensorrt_llm", - "--log-info=false", - "--log-warning=false", - ] - - result = run_command(cmd_args) - - if result != 0: - die(result) - - exit(result) - - -# --- - - -def do_worker(args): - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - write_output("Worker paused awaiting SIGINT or SIGTERM.") - signal.pause() - - -# --- - - -write_output("Reporting system information.") -run_command(["whoami"]) -run_command(["cgget", "-n", "--values-only", "--variable memory.limit_in_bytes", "/"]) -run_command(["nvidia-smi"]) - -ENGINE_DIRECTORY = os.getenv(ENGINE_PATH_KEY) -HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY) -MODEL_DIRECTORY = os.getenv(MODEL_PATH_KEY) - -is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None - -# Validate that `ENGINE_PATH_KEY` isn't empty. -if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0: - raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.") - -# Validate that `MODEL_PATH_KEY` isn't empty. -if MODEL_DIRECTORY is None or len(MODEL_DIRECTORY) == 0: - raise Exception(f"Required environment variable '{MODEL_PATH_KEY}' not set.") - -# Parse options provided. -args = parse_arguments() - -# Update the is_verbose flag with values passed in by options. 
-is_verbose = is_verbose or args.verbose > 0 - -if is_verbose: - write_output(f"{ENGINE_PATH_KEY}='{ENGINE_DIRECTORY}'") - write_output(f"{HUGGING_FACE_KEY}='{HUGGING_FACE_HOME}'") - write_output(f"{MODEL_PATH_KEY}='{MODEL_DIRECTORY}'") - -if args.mode == "convert": - do_convert(args) - -elif args.mode == "leader": - do_leader(args) - -elif args.mode == "worker": - do_worker(args) - -else: - write_error(f"usage: server.py <mode> [<options>].") - write_error(f' Invalid mode ("{args.mode}") provided.') - write_error(f' Supported values are "convert", "leader", or "worker".') - die(ERROR_CODE_USAGE) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile deleted file mode 100644 index e4fc9850..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3 -ARG ENGINE_DEST_PATH=/var/run/models/engine -ARG HF_HOME=/var/run/hugging_face -ARG MODEL_DEST_PATH=/var/run/models/model - -FROM ${BASE_CONTAINER_IMAGE} - -# Set a few useful labels. -LABEL "base"="${BASE_CONTAINER_IMAGE}" -LABEL "role"="server" - -# Stop APT (Debian package manager) from complaining about interactivity. -ENV DEBIAN_FRONTEND=noninteractive -# Set additional environment values that make usage more pleasant. -ENV TERM=xterm-256color - -RUN apt update \ - && apt install --yes \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg \ - cgroup-tools \ - && rm -rf /var/lib/apt/lists/* - -# Install kubectl because the server.py script depends on it. -# Step 1: Acquire the Kubernetes APT GPG key. -RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key \ - | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \ - && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg - -# Step 2: Acquire the APT sources list for Kubernetes. -RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /' \ - | tee /etc/apt/sources.list.d/kubernetes.list \ - && chmod 644 /etc/apt/sources.list.d/kubernetes.list - -# Step 3: Install kubectl. -RUN apt update \ - && apt install --yes \ - kubectl \ - && apt autoremove --yes \ - && apt purge --yes \ - && rm -rf /var/lib/apt/lists/* - -# Set Triton CLI environment variables which control where -# TRTLLM engine and model files are downloaded to, and where -# the Hugging Face cache is located. -ENV ENGINE_DEST_PATH ${ENGINE_DEST_PATH} -ENV HF_HOME ${HF_HOME} -ENV MODEL_DEST_PATH ${MODEL_DEST_PATH} - -# Set the active working directory. -WORKDIR /workspace - -# Install a custom version of Triton CLI that supports tensor parallelism and -# the 70B version of Llama models.
-RUN pip --verbose install \ - --no-cache-dir \ - --no-color \ - --no-input \ - git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn - -# Copy kubessh script w/ executable permissions for everyone. -# This enables the script to be executed no matter the user the container is run as. -# This works around the issue of the file being non-executable when the container is build on a Windows host. -COPY --chmod=555 kubessh . -COPY server.py . - -RUN apt list --installed \ - && pip list --version - -ENTRYPOINT [ "/bin/bash" ] diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml deleted file mode 100644 index 30111dad..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# All values are defaults unless specified otherwise. - -image: - repository: nvcr.io/nvidia/k8s/dcgm-exporter - pullPolicy: IfNotPresent - tag: 3.3.5-3.4.1-ubuntu22.04 - -arguments: - # Reduces the delay between GPU metrics collection passed to 1 second. -- --collect-interval=1000 -- --collectors=/etc/dcgm-exporter/dcp-metrics-included.csv - # Required. Enables Kubernetes specific metric collection features. -- --kubernetes=true - -serviceAccount: - create: true - annotations: { } - name: - -rollingUpdate: - maxUnavailable: 1 - maxSurge: 0 - -podLabels: { } - -podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "9400" - # Required by Prometheus Operator for proper metrics collection. - release: prometheus -podSecurityContext: { } - -securityContext: - # Enables advanced GPU metrics features. Optional. - privileged: true - runAsNonRoot: false - runAsUser: 0 - capabilities: - add: [ "SYS_ADMIN" ] - -service: - enable: true - type: ClusterIP - port: 9400 - address: ":9400" - annotations: - prometheus.io/port: "9400" - prometheus.io/scrape: "true" - release: prometheus - -resources: - # Sets proper resource utilization limits, and enables Kubernetes to manage the pod's resource consumption. - # All contains should have these. - limits: - cpu: 2 - memory: 1Gi - # Sets proper resource requirements, and enables Kubernetes to account for the pod's resource consumption. - # All contains should have these. - requests: - cpu: 1 - memory: 1Gi - -serviceMonitor: - enabled: true - # Reduces the delay between metric collection passes. - interval: 1s - honorLabels: false - additionalLabels: - # Useful for helping Prometheus identify metrics collectors. - monitoring: prometheus - # Required by Prometheus to identify metrics collectors. - release: prometheus - -nodeSelector: - # Ensures that DCGM Exporter process is only deployed to nodes with GPUs. 
- nvidia.com/gpu: present - -tolerations: -# Enables the DCGM Exporter pods to be deployed to nodes with GPUs. -- key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - -affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - # Ensures that DCGM Exporter process is only deployed to nodes with GPUs. - - key: nvidia.com/gpu - operator: Exists - -kubeletPath: "/var/lib/kubelet/pod-resources" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml deleted file mode 100644 index 02ac2cd8..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# In the document below, the version `0.8.2` of the gpu-feature-discovery container is used. -# It is always wise to check if a new version has been released and to use the latest available release when possible. -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gpu-feature-discovery - namespace: kube-system - labels: - app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.8.2 - app.kubernetes.io/part-of: nvidia-gpu -spec: - selector: - matchLabels: - app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/part-of: nvidia-gpu - template: - metadata: - labels: - app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.8.2 - app.kubernetes.io/part-of: nvidia-gpu - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - # The following set of node selector match expressions restrict the nodes the service's pods - # can be deployed to, to node which meet one or more of the following criteria: - # * Nodes with NVIDIA PCIE devices attached (10DE is NVIDIA's PCIE device number). - # * Nodes with NVIDIA CPUs. - # * Nodes with NVIDIA GPUs. - nodeSelectorTerms: - - matchExpressions: - - key: feature.node.kubernetes.io/pci-10de.present - operator: In - values: - - "true" - - matchExpressions: - - key: feature.node.kubernetes.io/cpu-model.vendor_id - operator: In - values: - - "NVIDIA" - - matchExpressions: - - key: "nvidia.com/gpu" - operator: In - values: - - "true" - - present - containers: - - image: nvcr.io/nvidia/gpu-feature-discovery:v0.8.2 - name: gpu-feature-discovery - volumeMounts: - - name: output-dir - mountPath: "/etc/kubernetes/node-feature-discovery/features.d" - - name: host-sys - mountPath: "/sys" - env: - - name: MIG_STRATEGY - value: none - securityContext: - privileged: true - # Enables the service's pods to be deployed on nodes with GPUs. 
- tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - volumes: - - name: output-dir - hostPath: - path: "/etc/kubernetes/node-feature-discovery/features.d" - - name: host-sys - hostPath: - path: "/sys" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml deleted file mode 100644 index 8bf110f9..00000000 --- a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-volume -spec: - accessModes: - # The PVC must support multiple, concurrent readers and writers. - # This is because multiple pods will be mapped to the PVC as each worker pod needs access to the model's data. - # Additionally, multiple models could be converted in parallel by concurrent conversion jobs. - - ReadWriteMany - resources: - requests: - # This size does not need to match the PV's `spec.capacity.storage` value, but not doing so will prevent utilization of the entire PV. - storage: 512Gi - # Depending on your storage class provider, this value should be empty or the value specified by the provider. - # Please read your provider's documentation when determining this value. - storageClassName: "" - # This value must be an exact match for the PV's `metadata.name` property. - volumeName: model-volume
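
The `pvc.yaml` manifest above requests a statically provisioned volume: it pins `volumeName: model-volume` and leaves `storageClassName` empty, so a matching `PersistentVolume` must already exist before the claim can bind. The sketch below is a hypothetical example of such a volume backed by an NFS export; the server address and export path are placeholders and are not part of this repository.

```yaml
# Hypothetical PersistentVolume that the PVC above could bind to by name.
# Replace the NFS server and export path with values from your own environment.
apiVersion: v1
kind: PersistentVolume
metadata:
  # Must match the PVC's `spec.volumeName`.
  name: model-volume
spec:
  accessModes:
    # Must include the access mode requested by the PVC.
    - ReadWriteMany
  capacity:
    # Should be at least as large as the PVC's storage request.
    storage: 512Gi
  persistentVolumeReclaimPolicy: Retain
  # An empty storage class matches the PVC's empty `storageClassName` for static binding.
  storageClassName: ""
  nfs:
    # Placeholder values.
    path: /exports/models
    server: nfs.example.internal
```

Any other storage backend that supports `ReadWriteMany` access can be substituted, provided `metadata.name`, the access mode, and the empty storage class continue to match the claim.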