
Commit

Update gh-pages for pic hyperlinks. (#1973)
Co-authored-by: Guoming Zhang <[email protected]>
nv-guomingz and Guoming Zhang authored Jul 17, 2024
1 parent df5423f commit 10588d0
Showing 52 changed files with 5,905 additions and 5,905 deletions.
2 changes: 1 addition & 1 deletion _cpp_gen/executor.html
@@ -4724,7 +4724,7 @@ <h2>types.h<a class="headerlink" href="#types-h" title="Link to this heading">
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf1b5ad70>
<jinja2.runtime.BlockReference object at 0x7f8a046e6800>

<div class="footer">
<p>
11,586 changes: 5,793 additions & 5,793 deletions _cpp_gen/runtime.html

Large diffs are not rendered by default.

120 changes: 60 additions & 60 deletions _sources/_cpp_gen/runtime.rst.txt
@@ -28,66 +28,6 @@ ____________
.. doxygenfile:: cudaStream.h
:project: TensorRT-LLM

generationInput.h
_________________

.. doxygenfile:: generationInput.h
:project: TensorRT-LLM

generationOutput.h
__________________

.. doxygenfile:: generationOutput.h
:project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
:project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
:project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
:project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
:project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM

decodingInput.h
_______________

@@ -106,6 +46,18 @@ ____________________________
.. doxygenfile:: explicitDraftTokensBuffers.h
:project: TensorRT-LLM

generationInput.h
_________________

.. doxygenfile:: generationInput.h
:project: TensorRT-LLM

generationOutput.h
__________________

.. doxygenfile:: generationOutput.h
:project: TensorRT-LLM

gptDecoder.h
____________

@@ -154,24 +106,60 @@ _________
.. doxygenfile:: iTensor.h
:project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM

lookaheadModule.h
_________________

.. doxygenfile:: lookaheadModule.h
:project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
:project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
:project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
:project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
:project: TensorRT-LLM

modelConfig.h
_____________

.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM

rawEngine.h
___________

@@ -202,3 +190,15 @@ ___________________________
.. doxygenfile:: speculativeDecodingModule.h
:project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
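The hunks in this file show each header's section moved into plain alphabetical order — note `iTensor.h` sorting before `ipcUtils.h`, i.e. case-sensitive ASCII order, as `sorted()` gives by default. A listing in that order can be generated mechanically; this is a hedged sketch reconstructed from the diff, not the project's actual doc tooling, and the header list is abbreviated:

```python
# Emit doxygenfile sections in the same case-sensitive ASCII order as the
# reordered runtime.rst (abbreviated header list for illustration).
headers = [
    "worldConfig.h", "ipcUtils.h", "iTensor.h",
    "loraCache.h", "generationInput.h", "tllmLogger.h",
]

sections = []
for name in sorted(headers):  # plain sorted(): 'T' < 'p', so iTensor.h precedes ipcUtils.h
    underline = "_" * len(name)  # RST underline must cover the full title
    sections.append(
        f"{name}\n{underline}\n\n.. doxygenfile:: {name}\n   :project: TensorRT-LLM\n"
    )

print("\n".join(sections))
```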

2 changes: 1 addition & 1 deletion _sources/blogs/XQA-kernel.md.txt
@@ -8,7 +8,7 @@ Support matrix and usage flags are described in [docs/source/advanced/gpt_attent
Looking at the Throughput-Latency curves below, we see that the enabling of XQA optimization increases throughput. Higher throughput equates to serving more users, and we can see that TPOT on the Y-axis flattens out when XQA gets enabled.


<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/media/XQA_ThroughputvsLatency.png" alt="XQA increased throughput within same latency budget" width="950" height="auto">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/XQA_ThroughputvsLatency.png?raw=true" alt="XQA increased throughput within same latency budget" width="950" height="auto">

<sub>Preliminary measured Performance, subject to change. TPOT lower is better. FP8, 8xH100 GPUs, Single Engine, ISL/OSL: 512/2048, BS: 1 - 256, TensorRT-LLM v0.8a</sub>

2 changes: 1 addition & 1 deletion _sources/performance/perf-best-practices.md.txt
@@ -26,7 +26,7 @@ runtime and, for some of them, decrease the engine build time.
### `max_batch_size`, `max_seq_len` and `max_num_tokens`

<p align="center">
<img src="../media/max_bs_toks_len.svg" alt="Explain `max_batch_size`, `max_seq_len` and `max_num_tokens`" width="30%" height="auto">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/media/max_bs_toks_len.svg?raw=true" alt="Explain `max_batch_size`, `max_seq_len` and `max_num_tokens`" width="30%" height="auto">
</p>

Regarding the impacts of those three arguments to the GPU memory usage, please refer to [memory.md](../reference/memory.md)
2 changes: 1 addition & 1 deletion _sources/speculative_decoding.md.txt
@@ -253,7 +253,7 @@ Consider the following diagram, which illustrates how the hidden states from the
are passed to the base model's language model (LM) head and to four Medusa heads (MHs).

<p align="center">
<img src="./media/medusa_tree.svg" alt="Example Medusa Tree" width="auto" height="auto">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/media/medusa_tree.svg?raw=true" alt="Example Medusa Tree" width="auto" height="auto">
</p>

In this example:
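Every `.md.txt` hunk above makes the same mechanical change: a relative `<img src>` (or a `main`-branch link) is rewritten to an absolute `blob/rel/...?raw=true` URL on the repository. A rewrite of this kind can be sketched as below; the `rewrite_img_src` helper is illustrative and not part of this commit, and the URL prefix is copied from the diff:

```python
import re

# URL prefix observed in this commit's hunks; repo and branch are an
# assumption if this sketch is reused elsewhere.
RAW_PREFIX = "https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source"

IMG_SRC = re.compile(r'(<img\s+[^>]*src=")([^"]+)(")')

def rewrite_img_src(html: str) -> str:
    """Point relative <img> sources at the GitHub blob URL with ?raw=true."""
    def repl(m: re.Match) -> str:
        src = m.group(2)
        if src.startswith(("http://", "https://")):
            return m.group(0)  # already absolute: leave untouched
        # Drop leading "./" / "../" segments; media paths live under docs/source.
        while src.startswith(("./", "../")):
            src = src.split("/", 1)[1]
        return f"{m.group(1)}{RAW_PREFIX}/{src}?raw=true{m.group(3)}"
    return IMG_SRC.sub(repl, html)
```

Applied to the tag from perf-best-practices.md, this reproduces the new line in the hunk above.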
2 changes: 1 addition & 1 deletion advanced/batch-manager.html
@@ -411,7 +411,7 @@ <h2>In-flight Batching with the Triton Inference Server<a class="headerlink" hre
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf305c430>
<jinja2.runtime.BlockReference object at 0x7f8a09129930>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/expert-parallelism.html
@@ -169,7 +169,7 @@ <h2>How to Enable<a class="headerlink" href="#how-to-enable" title="Link to this
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3045b40>
<jinja2.runtime.BlockReference object at 0x7f8a0912bf10>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/gpt-attention.html
@@ -486,7 +486,7 @@ <h3>Relative Attention Bias (RAB)<a class="headerlink" href="#relative-attention
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf2fca320>
<jinja2.runtime.BlockReference object at 0x7f8a091076d0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/gpt-runtime.html
@@ -378,7 +378,7 @@ <h2>Know Issues and Future Changes<a class="headerlink" href="#know-issues-and-f
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3045180>
<jinja2.runtime.BlockReference object at 0x7f8a090f4f10>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/graph-rewriting.html
@@ -349,7 +349,7 @@ <h2>Classical Workflow<a class="headerlink" href="#classical-workflow" title="Li
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3d06110>
<jinja2.runtime.BlockReference object at 0x7f8a092e8430>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/inference-request.html
@@ -365,7 +365,7 @@ <h1>Responses<a class="headerlink" href="#responses" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3be3220>
<jinja2.runtime.BlockReference object at 0x7f8a09121810>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/lora.html
@@ -323,7 +323,7 @@ <h3>LoRA with tensor parallel<a class="headerlink" href="#lora-with-tensor-paral
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3045de0>
<jinja2.runtime.BlockReference object at 0x7f8a092eb700>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/weight-streaming.html
@@ -206,7 +206,7 @@ <h2>API Changes<a class="headerlink" href="#api-changes" title="Link to this hea
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3cc2cb0>
<jinja2.runtime.BlockReference object at 0x7f8a091051e0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/add-model.html
@@ -240,7 +240,7 @@ <h2>Reference<a class="headerlink" href="#reference" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3180c40>
<jinja2.runtime.BlockReference object at 0x7f8a08927550>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/checkpoint.html
@@ -506,7 +506,7 @@ <h2>Make Evaluation<a class="headerlink" href="#make-evaluation" title="Link to
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3be1150>
<jinja2.runtime.BlockReference object at 0x7f8a09121720>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/core-concepts.html
@@ -377,7 +377,7 @@ <h1>Runtime<a class="headerlink" href="#runtime" title="Link to this heading">
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3044bb0>
<jinja2.runtime.BlockReference object at 0x7f8a09397220>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/overview.html
@@ -158,7 +158,7 @@ <h2>Model Weights<a class="headerlink" href="#model-weights" title="Link to this
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3250520>
<jinja2.runtime.BlockReference object at 0x7f8a08941990>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/workflow.html
@@ -336,7 +336,7 @@ <h2>CLI Tools<a class="headerlink" href="#cli-tools" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3c0a6b0>
<jinja2.runtime.BlockReference object at 0x7f8a089629e0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion blogs/Falcon180B-H200.html
@@ -295,7 +295,7 @@ <h3>Closing<a class="headerlink" href="#closing" title="Link to this heading">
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3c09720>
<jinja2.runtime.BlockReference object at 0x7f8a093abd00>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion blogs/H100vsA100.html
@@ -247,7 +247,7 @@ <h2>What is H100 FP8?<a class="headerlink" href="#what-is-h100-fp8" title="Link
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3d17310>
<jinja2.runtime.BlockReference object at 0x7f8a08ce38e0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion blogs/H200launch.html
@@ -239,7 +239,7 @@ <h2>Latest HBM Memory<a class="headerlink" href="#latest-hbm-memory" title="Link
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf2d1e740>
<jinja2.runtime.BlockReference object at 0x7f8a08d0ca90>

<div class="footer">
<p>
4 changes: 2 additions & 2 deletions blogs/XQA-kernel.html
@@ -141,7 +141,7 @@ <h1>New XQA-kernel provides 2.4x more Llama-70B throughput within the same laten
<p>Support matrix and usage flags are described in <a class="reference internal" href="#/docs/source/advanced/gpt-attention.md#xqa-optimization"><span class="xref myst">docs/source/advanced/gpt_attention</span></a>.</p>
<p><strong>Increased Throughput:</strong>
Looking at the Throughput-Latency curves below, we see that the enabling of XQA optimization increases throughput. Higher throughput equates to serving more users, and we can see that TPOT on the Y-axis flattens out when XQA gets enabled.</p>
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/media/XQA_ThroughputvsLatency.png" alt="XQA increased throughput within same latency budget" width="950" height="auto">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/XQA_ThroughputvsLatency.png?raw=true" alt="XQA increased throughput within same latency budget" width="950" height="auto">
<p><sub>Preliminary measured Performance, subject to change. TPOT lower is better. FP8, 8xH100 GPUs, Single Engine, ISL/OSL: 512/2048, BS: 1 - 256, TensorRT-LLM v0.8a</sub></p>
<section id="llama-70b-on-h200-up-to-2-4x-increased-throughput-with-xqa-within-same-latency-budget">
<h2>Llama-70B on H200 up to 2.4x increased throughput with XQA within same latency budget<a class="headerlink" href="#llama-70b-on-h200-up-to-2-4x-increased-throughput-with-xqa-within-same-latency-budget" title="Link to this heading"></a></h2>
@@ -204,7 +204,7 @@ <h3>Closing<a class="headerlink" href="#closing" title="Link to this heading">
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3d169e0>
<jinja2.runtime.BlockReference object at 0x7f8a08e203a0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion blogs/quantization-in-TRT-LLM.html
@@ -359,7 +359,7 @@ <h2>What’s coming next<a class="headerlink" href="#whats-coming-next" title="L
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3c09b70>
<jinja2.runtime.BlockReference object at 0x7f8a08d688b0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion executor.html
@@ -190,7 +190,7 @@ <h2>Python Bindings for the Executor API<a class="headerlink" href="#python-bind
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3d174f0>
<jinja2.runtime.BlockReference object at 0x7f8a04c7b640>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion genindex.html
@@ -3773,7 +3773,7 @@ <h2 id="T">T</h2>
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf87c1ba0>
<jinja2.runtime.BlockReference object at 0x7f8a08c552d0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion index.html
@@ -364,7 +364,7 @@ <h1>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Li
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fedf3c08ee0>
<jinja2.runtime.BlockReference object at 0x7f8a08e59ae0>

<div class="footer">
<p>

0 comments on commit 10588d0
