diff --git a/docs/how-to/fine-tuning-llms/index.rst b/docs/how-to/fine-tuning-llms/index.rst
index c197158f28..a966a9b95f 100644
--- a/docs/how-to/fine-tuning-llms/index.rst
+++ b/docs/how-to/fine-tuning-llms/index.rst
@@ -2,9 +2,9 @@
    :description: How to fine-tune LLMs with ROCm
    :keywords: ROCm, LLM, fine-tuning, usage, tutorial
 
-**************************
-Fine-tuning LLMs with ROCm
-**************************
+*******************************************
+Fine-tuning LLMs and inference optimization
+*******************************************
 
 ROCm empowers the fine-tuning and optimization of large language models, making them accessible and efficient for
 specialized tasks. ROCm supports the broader AI ecosystem to ensure seamless integration with open frameworks,
diff --git a/docs/how-to/fine-tuning-llms/llm-inference-frameworks.rst b/docs/how-to/fine-tuning-llms/llm-inference-frameworks.rst
index 9da634d5cc..646b99d4ec 100644
--- a/docs/how-to/fine-tuning-llms/llm-inference-frameworks.rst
+++ b/docs/how-to/fine-tuning-llms/llm-inference-frameworks.rst
@@ -32,7 +32,7 @@ Installing vLLM
 
    .. code-block:: shell
 
-      # Install from the source
+      # Install from source
       git clone https://github.com/ROCm/vllm.git    
       cd vllm
       PYTORCH_ROCM_ARCH=gfx942 python setup.py install #MI300 series
diff --git a/docs/how-to/fine-tuning-llms/model-acceleration-libraries.rst b/docs/how-to/fine-tuning-llms/model-acceleration-libraries.rst
index f1bc7c7046..5a57a6b83d 100644
--- a/docs/how-to/fine-tuning-llms/model-acceleration-libraries.rst
+++ b/docs/how-to/fine-tuning-llms/model-acceleration-libraries.rst
@@ -40,7 +40,7 @@ ROCm provides two different implementations of Flash Attention 2 modules. They c
 
       .. code-block:: shell
 
-         # Install from the source
+         # Install from source
          git clone https://github.com/ROCm/flash-attention.git
          cd flash-attention/
          GPU_ARCHS=gfx942 python setup.py install #MI300 series
@@ -156,7 +156,7 @@ of the PyTorch compilation.
 
 .. code-block:: python
 
-   # Sample script to run LLM with the static key-value cache and pytorch compilation
+   # Sample script to run LLM with the static key-value cache and PyTorch compilation
    from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
    import torch
    from typing import Optional
@@ -180,7 +180,8 @@ of the PyTorch compilation.
        return new_token
    
    batch_size, seq_length = inputs["input_ids"].shape
-   # static key-value cache
+
+   # Static key-value cache
    max_cache_length = 1024
    max_new_tokens = 10
    model._setup_cache(StaticCache, batch_size, max_cache_len=max_cache_length)
@@ -190,6 +191,7 @@ of the PyTorch compilation.
    
    logits = model(**inputs, cache_position=cache_position, return_dict=False, use_cache=True)[0]
    next_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
+
    # torch compilation
    decode_one_tokens = torch.compile(decode_one_tokens, mode="max-autotune-no-cudagraphs",fullgraph=True)
    
@@ -221,10 +223,10 @@ page describes the options.
 
 .. code-block:: python
 
-   # To turn on TunableOps, simply set this environmental variable
+   # To turn on TunableOp, simply set this environment variable
    export PYTORCH_TUNABLEOP_ENABLED=1
    
-   # python
+   # Python
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
diff --git a/docs/how-to/fine-tuning-llms/model-quantization.rst b/docs/how-to/fine-tuning-llms/model-quantization.rst
index 18d604b3d4..c79b4d46a5 100644
--- a/docs/how-to/fine-tuning-llms/model-quantization.rst
+++ b/docs/how-to/fine-tuning-llms/model-quantization.rst
@@ -32,7 +32,7 @@ The AutoGPTQ library implements the GPTQ algorithm.
 
    .. code-block:: shell
 
-      # This will install pre-built wheel for a specific ROCm version  
+      # This will install a pre-built wheel for a specific ROCm version.
       
       pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/
 
@@ -40,11 +40,11 @@ The AutoGPTQ library implements the GPTQ algorithm.
 
    .. code-block:: shell
 
-      # Clone the source code
+      # Clone the source code.
       git clone https://github.com/AutoGPTQ/AutoGPTQ.git
       cd AutoGPTQ
       
-      # Speed up the compilation by specifying PYTORCH_ROCM_ARCH to target device 
+      # Speed up the compilation by specifying PYTORCH_ROCM_ARCH to target device.
       PYTORCH_ROCM_ARCH=gfx942 ROCM_VERSION=6.1 pip install .
       
       # Show the package after the installation 
@@ -93,12 +93,14 @@ Using GPTQ with AutoGPTQ
 
    .. code-block:: python
 
-      # import auto_gptq class
+      # Import auto_gptq class.
       from auto_gptq import AutoGPTQForCausalLM
-      # load non-quantized model
+
+      # Load non-quantized model.
       base_model = AutoGPTQForCausalLM.from_pretrained(base_model_name, quantize_config, device_map = "auto")
       base_model.quantize(examples)
-      # save quantized model
+
+      # Save quantized model.
       base_model.save_quantized(quantized_model_name)
 
 Using GPTQ with Hugging Face Transformers
@@ -201,7 +203,7 @@ Installing bitsandbytes
 Using bitsandbytes primitives
 -----------------------------
 
-To get started with bitsandbytes primitives, use the following code a reference.
+To get started with bitsandbytes primitives, use the following code as a reference.
 
 .. code-block:: python
 
@@ -230,7 +232,7 @@ To load a Transformers model in 4-bit, set ``load_int_4bt=true`` in ``BitsAndByt
            device_map="auto", 
            quantization_config=quantization_config)
    
-   # check the memory footprint with get_memory_footprint method
+   # Check the memory footprint with the get_memory_footprint method.
    print(bnb_model_4bit.get_memory_footprint())
 
 To load a model in 8-bit for inference, use the ``load_in_8bit`` option.
diff --git a/docs/how-to/fine-tuning-llms/multi-gpu-fine-tuning-and-inference.rst b/docs/how-to/fine-tuning-llms/multi-gpu-fine-tuning-and-inference.rst
index b567c60cb1..cb7dc01c13 100644
--- a/docs/how-to/fine-tuning-llms/multi-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/fine-tuning-llms/multi-gpu-fine-tuning-and-inference.rst
@@ -130,8 +130,8 @@ After loading the model in this way, the model is fully ready to use the resourc
 torchtune for fine-tuning and inference
 =============================================
 
-torchtune is a PyTorch-native library for easy single and multi-accelerator or GPU model fine-tuning and inference with
-LLMs.
+`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-accelerator or
+GPU model fine-tuning and inference with LLMs.
 
 #. Install torchtune using pip.
 
@@ -157,80 +157,80 @@ LLMs.
       subcommands:
         {download,ls,cp,run,validate}
 
-torchtune recipes are designed around easily composable components and workable training loops, with minimal abstraction
-getting in the way of fine-tuning. Run ``tune ls`` to show built-in torchtune configuration recipes.
-
-.. code-block:: shell
-
-   RECIPE                                   CONFIG
-   full_finetune_single_device              llama2/7B_full_low_memory
-                                            llama3/8B_full_single_device
-                                            mistral/7B_full_low_memory
-   full_finetune_distributed                llama2/7B_full
-                                            llama2/13B_full
-                                            llama3/8B_full
-                                            mistral/7B_full
-                                            gemma/2B_full
-   lora_finetune_single_device              llama2/7B_lora_single_device
-                                            llama2/7B_qlora_single_device
-                                            llama3/8B_lora_single_device
-                                            llama3/8B_qlora_single_device
-                                            llama2/13B_qlora_single_device
-                                            mistral/7B_lora_single_device
-
-The ``RECIPE`` column shows the easy-to-use and workable fine-tuning and inference recipes for popular fine-tuning
-techniques (such as LoRA). The ``CONFIG`` column lists the YAML configurations for easily configuring training,
-evaluation, quantization, or inference recipes.
-
-The snippet shows the architecture of a model's YAML configuration file:
-
-.. code-block:: yaml
-
-   # Model Arguments
-   model:
-     _component_: torchtune.models.llama2.lora_llama2_7b
-     lora_attn_modules: ['q_proj', 'v_proj']
-     apply_lora_to_mlp: False
-     apply_lora_to_output: False
-     lora_rank: 8
-     lora_alpha: 16
-   
-   tokenizer:
-     _component_: torchtune.models.llama2.llama2_tokenizer
-     path: /tmp/Llama-2-7b-hf/tokenizer.model
-   
-   # Dataset and Sampler
-   dataset:
-     _component_: torchtune.datasets.alpaca_cleaned_dataset
-     train_on_input: True
+#. torchtune recipes are designed around easily composable components and workable training loops, with minimal abstraction
+   getting in the way of fine-tuning. Run ``tune ls`` to show built-in torchtune configuration recipes.
+
+   .. code-block:: shell
+
+      RECIPE                                   CONFIG
+      full_finetune_single_device              llama2/7B_full_low_memory
+                                               llama3/8B_full_single_device
+                                               mistral/7B_full_low_memory
+      full_finetune_distributed                llama2/7B_full
+                                               llama2/13B_full
+                                               llama3/8B_full
+                                               mistral/7B_full
+                                               gemma/2B_full
+      lora_finetune_single_device              llama2/7B_lora_single_device
+                                               llama2/7B_qlora_single_device
+                                               llama3/8B_lora_single_device
+                                               llama3/8B_qlora_single_device
+                                               llama2/13B_qlora_single_device
+                                               mistral/7B_lora_single_device
+
+   The ``RECIPE`` column shows the easy-to-use and workable fine-tuning and inference recipes for popular fine-tuning
+   techniques (such as LoRA). The ``CONFIG`` column lists the YAML configurations for easily configuring training,
+   evaluation, quantization, or inference recipes.
+
+   The snippet shows the architecture of a model's YAML configuration file:
+
+   .. code-block:: yaml
+
+      # Model arguments
+      model:
+        _component_: torchtune.models.llama2.lora_llama2_7b
+        lora_attn_modules: ['q_proj', 'v_proj']
+        apply_lora_to_mlp: False
+        apply_lora_to_output: False
+        lora_rank: 8
+        lora_alpha: 16
+      
+      tokenizer:
+        _component_: torchtune.models.llama2.llama2_tokenizer
+        path: /tmp/Llama-2-7b-hf/tokenizer.model
+      
+      # Dataset and sampler
+      dataset:
+        _component_: torchtune.datasets.alpaca_cleaned_dataset
+        train_on_input: True
 
-This configuration file defines the fine-tuning base model path, data set, hyper-parameters for optimizer and scheduler,
-and training data type. To download the base model for fine-tuning, run the following command:
+#. This configuration file defines the fine-tuning base model path, data set, hyper-parameters for optimizer and scheduler,
+   and training data type. To download the base model for fine-tuning, run the following command:
 
-.. code-block:: shell
+   .. code-block:: shell
 
-   tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token
+      tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token
 
-The output directory argument for ``--output-dir`` should map the model path specified in YAML config file.
+   The output directory argument for ``--output-dir`` should match the model path specified in the YAML config file.
 
-To launch ``lora_finetune_distributed`` on four devices, run the following
-command:
+#. To launch ``lora_finetune_distributed`` on four devices, run the following
+   command:
 
-.. code-block:: shell
+   .. code-block:: shell
 
-   tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config llama2/7B_lora
+      tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config llama2/7B_lora
 
-If successful, you should something like the following output:
+   If successful, you should see something like the following output:
 
-.. code-block:: shell
+   .. code-block:: shell
 
-   INFO:torchtune.utils.logging:FSDP is enabled. Instantiating Model on CPU for Rank 0 ...
-   INFO:torchtune.utils.logging:Model instantiation took 7.32 secs
-   INFO:torchtune.utils.logging:Memory Stats after model init:
-   {'peak_memory_active': 9.478172672, 'peak_memory_alloc': 8.953868288, 'peak_memory_reserved': 11.112808448}
-   INFO:torchtune.utils.logging:Optimizer and loss are initialized.
-   INFO:torchtune.utils.logging:Dataset and Sampler are initialized.
-   INFO:torchtune.utils.logging:Learning rate scheduler is initialized.
-   1|111|Loss: 1.5790324211120605:   7%|█                                          | 114/1618
+      INFO:torchtune.utils.logging:FSDP is enabled. Instantiating Model on CPU for Rank 0 ...
+      INFO:torchtune.utils.logging:Model instantiation took 7.32 secs
+      INFO:torchtune.utils.logging:Memory Stats after model init:
+      {'peak_memory_active': 9.478172672, 'peak_memory_alloc': 8.953868288, 'peak_memory_reserved': 11.112808448}
+      INFO:torchtune.utils.logging:Optimizer and loss are initialized.
+      INFO:torchtune.utils.logging:Dataset and Sampler are initialized.
+      INFO:torchtune.utils.logging:Learning rate scheduler is initialized.
+      1|111|Loss: 1.5790324211120605:   7%|█                                          | 114/1618
 
 Read more about inference frameworks in :doc:`LLM inference frameworks <llm-inference-frameworks>`.
diff --git a/docs/how-to/fine-tuning-llms/overview.rst b/docs/how-to/fine-tuning-llms/overview.rst
index 90330cdbd2..b4493f0718 100644
--- a/docs/how-to/fine-tuning-llms/overview.rst
+++ b/docs/how-to/fine-tuning-llms/overview.rst
@@ -7,7 +7,7 @@ Conceptual overview of fine-tuning LLMs
 ***************************************
 
 Large language models (LLMs) are trained on massive amounts of text data to generate coherent and fluent text. The
-underlying *transformer* architecture is the fundamental building block of all LLMs. Transformers serve as the
+underlying *transformer* architecture is the fundamental building block of all LLMs. Transformers
 enable LLMs to understand and generate text by capturing contextual relationships and long-range dependencies. To better
 understand the philosophy of the transformer architecture, review the foundational
 `Attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ paper.
@@ -60,7 +60,7 @@ overcome this issue of high memory consumption.
 LoRA accelerates the adjustment process and reduces related memory costs. To be precise, LoRA decomposes the portion of
 weight changes :math:`ΔW` into high-precision low-rank representations, which do not require the calculations of all
 :math:`ΔW`. It learns the decomposition representation of :math:`ΔW` during training, as shown in
-:ref:`the weight update diagram <fine-tuning-llms-concept-challenge>`. This is how LoRA saves on
+the :ref:`weight update diagram <fine-tuning-llms-concept-challenge>`. This is how LoRA saves on
 computing resources.
 
 LoRA is integrated into the `Hugging Face Parameter-Efficient Fine-Tuning (PEFT)
diff --git a/docs/how-to/fine-tuning-llms/single-gpu-fine-tuning-and-inference.rst b/docs/how-to/fine-tuning-llms/single-gpu-fine-tuning-and-inference.rst
index 48da84446f..507bc18e79 100644
--- a/docs/how-to/fine-tuning-llms/single-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/fine-tuning-llms/single-gpu-fine-tuning-and-inference.rst
@@ -87,7 +87,8 @@ Setting up the base implementation environment
 
    .. code-block:: shell
 
-      # Install `bitsandbytes` for ROCm 6.0+, use -DBNB_ROCM_ARCH to target specific GPU arch
+      # Install `bitsandbytes` for ROCm 6.0+.
+      # Use -DBNB_ROCM_ARCH to target a specific GPU architecture.
       git clone --recurse https://github.com/ROCm/bitsandbytes.git
       cd bitsandbytes
       git checkout rocm_enabled
@@ -95,13 +96,13 @@ Setting up the base implementation environment
       cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
       python setup.py install
       
-      # To leverage the SFTTrainer in TRL for model fine-tuning
+      # To leverage the SFTTrainer in TRL for model fine-tuning.
       pip install trl
       
-      # To leverage PEFT for efficiently adapting pre-trained language models 
+      # To leverage PEFT for efficiently adapting pre-trained language models.
       pip install peft
       
-      # Install the other dependencies:
+      # Install the other dependencies.
       pip install transformers, datasets, huggingface-hub, scipy
 
 #. Check that the required packages can be imported.
@@ -139,14 +140,14 @@ Download the base model and fine-tuning dataset
 
    .. code-block:: python
 
-      # Base model and tokenizer names
+      # Base model and tokenizer names.
       base_model_name = "meta-llama/Llama-2-7b-chat-hf"
       
-      # Load base model to GPU memory
+      # Load base model to GPU memory.
       device = "cuda:0"
       base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)
       
-      # Load tokenizer
+      # Load tokenizer.
       tokenizer = AutoTokenizer.from_pretrained(
               base_model_name, 
               trust_remote_code = True)
@@ -159,14 +160,14 @@ Download the base model and fine-tuning dataset
 
    .. code-block::
 
-      # Dataset for fine-tuning
+      # Dataset for fine-tuning.
       training_dataset_name = "mlabonne/guanaco-llama2-1k"
       training_dataset = load_dataset(training_dataset_name, split = "train")
       
-      # Check the data 
+      # Check the data.
       print(training_dataset)
       
-      # #11 is a QA sample in English
+      # The sample at index 11 is a QA example in English.
       print(training_dataset[11])
 
 #. With the base model and the dataset, let's start fine-tuning!
@@ -180,7 +181,7 @@ To set up ``SFTTrainer`` parameters, you can use the following code as reference
 
 .. code-block:: python
 
-   # Training Params for SFTTrainer
+   # Training parameters for SFTTrainer.
    training_arguments = TrainingArguments(
        output_dir = "./results",
             num_train_epochs = 1,
@@ -228,7 +229,7 @@ Compare the number of trainable parameters and training time under the two diffe
                     bias = "none",
                     task_type = "CAUSAL_LM"
             )
-            # View the number of Trainable Params
+            # View the number of trainable parameters.
             from peft import get_peft_model
             peft_model = get_peft_model(base_model, peft_config)
             peft_model.print_trainable_parameters()
@@ -244,7 +245,7 @@ Compare the number of trainable parameters and training time under the two diffe
 
          .. code-block:: python
 
-            # Initialize a sft trainer
+            # Initialize an SFT trainer.
             sft_trainer = SFTTrainer(
                     model = base_model,
                     train_dataset = training_dataset,
@@ -254,7 +255,7 @@ Compare the number of trainable parameters and training time under the two diffe
                     args = training_arguments
             ) 
             
-            # Run the trainer
+            # Run the trainer.
             sft_trainer.train()
 
          The output should look like this:
@@ -302,7 +303,7 @@ Compare the number of trainable parameters and training time under the two diffe
 
          .. code-block:: python
 
-            # Trainer without LoRA config
+            # Trainer without LoRA config.
             trainer_full = SFTTrainer(
                     model = base_model,
                     train_dataset = training_dataset,
@@ -311,7 +312,7 @@ Compare the number of trainable parameters and training time under the two diffe
                     args = training_arguments
             ) 
             
-            # Training 
+            # Training.
             trainer_full.train()
 
          The output should look like this:
@@ -347,20 +348,20 @@ store, and load.
 
       .. code-block:: python
 
-         # PEFT adapter name
+         # PEFT adapter name.
          adapter_name = "llama-2-7b-enhanced-adapter"
          
-         # Save PEFT adapter
+         # Save PEFT adapter.
          sft_trainer.model.save_pretrained(adapter_name)
 
       The saved PEFT adapter should look like this on your system:
 
       .. code-block:: shell
 
-         # Access adapter directory
+         # Access adapter directory.
          cd llama-2-7b-enhanced-adapter
          
-         # List all adapter files
+         # List all adapter files.
          README.md  adapter_config.json  adapter_model.safetensors
 
    .. tab-item:: Saving a fully fine-tuned model
@@ -371,20 +372,20 @@ store, and load.
 
       .. code-block:: python
 
-         # fully fine-tuned model name
+         # Fully fine-tuned model name.
          new_model_name = "llama-2-7b-enhanced"
          
-         # Save the fully fine-tuned model
+         # Save the fully fine-tuned model.
          full_trainer.model.save_pretrained(new_model_name)
 
       The saved new full model should look like this on your system:
 
       .. code-block:: shell
 
-         # Access new model directory
+         # Access new model directory.
          cd llama-2-7b-enhanced
          
-         # List all model files
+         # List all model files.
          config.json                       model-00002-of-00006.safetensors  model-00005-of-00006.safetensors
          generation_config.json            model-00003-of-00006.safetensors  model-00006-of-00006.safetensors
          model-00001-of-00006.safetensors  model-00004-of-00006.safetensors  model.safetensors.index.json
diff --git a/docs/index.md b/docs/index.md
index 57965eb4d5..24b01d93bc 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -92,7 +92,7 @@ Our documentation is organized into the following categories:
 :padding: 2
 
 * [Using ROCm for AI](./how-to/rocm-for-ai/index.rst)
-* [Fine-tuning LLMs with ROCm](./how-to/fine-tuning-llms/index.rst)
+* [Fine-tuning LLMs and inference optimization](./how-to/fine-tuning-llms/index.rst)
 * [System tuning for various architectures](./how-to/tuning-guides.md)
   * [MI100](./how-to/tuning-guides/mi100.md)
   * [MI200](./how-to/tuning-guides/mi200.md)
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 3e2a0ac05b..6fca673f1d 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -59,7 +59,7 @@ subtrees:
       - file: how-to/rocm-for-ai/hugging-face-models.rst
       - file: how-to/rocm-for-ai/deploy-your-model.rst
   - file: how-to/fine-tuning-llms/index.rst
-    title: Fine-tuning LLMs with ROCm
+    title: Fine-tuning LLMs and inference optimization
     subtrees:
     - entries:
       - file: how-to/fine-tuning-llms/overview.rst