diff --git a/README.md b/README.md index a795931..eade0b4 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,10 @@ print(ranked_answers) # ] ``` -The API documentation can be found [here](src/llm_blender/README.md). +The detailed documentation can be found in the [LLM-Blender API Reference](src/llm_blender/README.md). + +As the [`llm-blender` library](https://github.com/yuchenlin/LLM-Blender) lacks a stable release, the necessary code has been incorporated into this project under `src/llm_blender/llm_blender_utils`. + ## Results diff --git a/paper/llm_blender.pdf b/paper/llm_blender.pdf index 235a6a1..5e4ea79 100644 Binary files a/paper/llm_blender.pdf and b/paper/llm_blender.pdf differ diff --git a/src/llm_blender/billsum/llama.py b/src/llm_blender/billsum/llama.py index 50284c4..1e62259 100644 --- a/src/llm_blender/billsum/llama.py +++ b/src/llm_blender/billsum/llama.py @@ -1,3 +1,16 @@ +"""Evaluation of Llama-3-8b on the BillSum dataset + +This script implements a pipeline to evaluate the Llama-3-8b model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. 
""" """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,38 +41,58 @@ def generate_result( ) # Format prompt to be compatible with meta-llama-3-8b-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> """ f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""" ) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "meta-llama-3-8b-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_llama.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_llama.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/llm_blender_ranker_all_llms.py b/src/llm_blender/billsum/llm_blender_ranker_all_llms.py index cd65e79..a6fac93 100644 --- a/src/llm_blender/billsum/llm_blender_ranker_all_llms.py +++ b/src/llm_blender/billsum/llm_blender_ranker_all_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of LLMs on the BillSum dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral). +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs. 
+""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the BillSum dataset dataset = load_dataset("billsum", split="test") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -51,6 +69,7 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template) @@ -59,8 +78,10 @@ qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params) @@ -69,11 +90,13 @@ qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") - +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -97,6 +120,8 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("openchat_prompt_builder", "openchat_model") @@ -105,6 +130,7 @@ blender_pipeline.connect("qwen_prompt_builder", "qwen_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("openchat_model", "llm_blender_ranker") @@ -113,10 +139,13 @@ blender_pipeline.connect("qwen_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { {"llama_prompt_builder": {"prompt": prompt}}, @@ -130,6 +159,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ 
-137,9 +167,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py b/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py index 72d2d3a..dce0115 100644 --- a/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py +++ b/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of best performing LLMs on the BillSum dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads 3 top performing LLMs: LLaMA, Phi and Mistral. +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the BillSum dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs. +""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the BillSum dataset dataset = load_dataset("billsum", split="test") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. 
The summary should cover all the key points and main ideas presented in the original text, while """ @@ -26,20 +44,26 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -51,18 +75,24 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { { @@ -74,6 +104,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -81,9 +112,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/mistral.py b/src/llm_blender/billsum/mistral.py index fd7b53c..bed26d9 100644 --- a/src/llm_blender/billsum/mistral.py +++ b/src/llm_blender/billsum/mistral.py @@ -1,3 +1,16 @@ +"""Evaluation of Mistral-7b on the BillSum dataset + +This script implements a pipeline to evaluate the Mistral-7b model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Mistral-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. 
Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,35 +41,55 @@ def generate_result( ) # Format prompt to be compatible with mistral-7b-instruct-v0.2 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""[INST] {instruction} {prompt} [/INST] """ - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "mistral-7b-instruct-v0.2.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_mistral.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_mistral.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/openchat.py b/src/llm_blender/billsum/openchat.py index cca895a..40b1f28 100644 --- a/src/llm_blender/billsum/openchat.py +++ b/src/llm_blender/billsum/openchat.py @@ -1,10 +1,33 @@ +"""Evaluation of OpenChat on the BillSum dataset + +This script implements a pipeline to evaluate the OpenChat model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenChat model using the LlamaCppGenerator from Haystack. +2. 
Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed prompt for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -18,37 +41,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openchat-3.5-0106 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/openchat-3.5-0106.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/openhermes.py b/src/llm_blender/billsum/openhermes.py index af28273..6649ece 100644 --- a/src/llm_blender/billsum/openhermes.py +++ b/src/llm_blender/billsum/openhermes.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenHermes on the BillSum 
dataset + +This script implements a pipeline to evaluate the OpenHermes model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenHermes model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,6 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -15,39 +41,59 @@ def generate_result( ) # Format prompt to be compatible with openhermes-2.5-mistral-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system {instruction}<|im_end|> <|im_start|>user {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openhermes.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/phi.py b/src/llm_blender/billsum/phi.py index 841499a..4da16d4 100644 --- a/src/llm_blender/billsum/phi.py +++ 
b/src/llm_blender/billsum/phi.py @@ -1,3 +1,16 @@ +"""Evaluation of Phi-3 on the BillSum dataset + +This script implements a pipeline to evaluate the Phi-3 model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Phi-3 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,35 +41,55 @@ def generate_result( ) # Format prompt to be compatible with phi-3-mini-4k-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|user|>\n{instruction} {prompt} <|end|>\n<|assistant|>""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "phi-3-mini-4k-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_phi.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_phi.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/qwen.py b/src/llm_blender/billsum/qwen.py index 6474e0b..3c5878b 100644 --- a/src/llm_blender/billsum/qwen.py +++ 
b/src/llm_blender/billsum/qwen.py @@ -1,10 +1,33 @@ +"""Evaluation of Qwen on the BillSum dataset + +This script implements a pipeline to evaluate the Qwen model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Qwen model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed prompt for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -24,37 +47,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with qwen1.5-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/qwen1_5-7b-chat-q4_k_m.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_qwen.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", 
metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/solar.py b/src/llm_blender/billsum/solar.py index 943950b..433fe1c 100644 --- a/src/llm_blender/billsum/solar.py +++ b/src/llm_blender/billsum/solar.py @@ -1,10 +1,33 @@ +"""Evaluation of Solar on the BillSum dataset + +This script implements a pipeline to evaluate the Solar model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Solar model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -21,37 +44,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + Returns: + str: The generated response from the model. 
+ """ # Format prompt to be compatible with solar-10.7b-instruct-v1.0 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/solar-10.7b-instruct-v1.0.Q4_K_M" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Load the dataset from the HuggingFace +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/starling.py b/src/llm_blender/billsum/starling.py index 703fde5..287b562 100644 --- a/src/llm_blender/billsum/starling.py +++ b/src/llm_blender/billsum/starling.py @@ -1,10 +1,33 @@ +"""Evaluation of Starling on the BillSum dataset + +This script implements a pipeline to evaluate the Starling model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Starling model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -18,37 +41,69 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. 
+ + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with starling-lm-7b-alpha + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/starling-lm-7b-alpha.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_starling.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_starling.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llama.py b/src/llm_blender/mix_instruct/llama.py index 2ae3222..9060c51 100644 --- a/src/llm_blender/mix_instruct/llama.py +++ b/src/llm_blender/mix_instruct/llama.py @@ -1,3 +1,16 @@ +"""Evaluation of Llama-3-8b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Llama-3-8b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,40 +22,72 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. - # Format prompt to be compatible with meta-llama-3-8b-instruct + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. 
+ prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ + # Format prompt to be compatible with meta-llama-3-8b-instruct model + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> """ f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""" ) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "meta-llama-3-8b-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_llama.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_llama.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py b/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py index be67605..b6bea1f 100644 --- a/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py +++ b/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of LLMs on the Mix-Instruct dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral). +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs. 
+""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,9 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the Mix-Instruct dataset dataset = load_dataset("llm-blender/mix-instruct", split="validation") - +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -41,6 +58,7 @@ mistral_prompt_template = """[INST] {{ instruction }} {{ prompt }} [/INST] """ +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template) @@ -49,8 +67,10 @@ qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 128, "temperature": 0.2}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params) @@ -59,34 +79,31 @@ qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") - +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") - blender_pipeline.add_component(instance=phi_prompt_builder, name="phi_prompt_builder") blender_pipeline.add_component(instance=phi_model, name="phi_model") - blender_pipeline.add_component(instance=openchat_prompt_builder, name="openchat_prompt_builder") blender_pipeline.add_component(instance=openchat_model, name="openchat_model") - blender_pipeline.add_component(instance=openhermes_prompt_builder, name="openhermes_prompt_builder") blender_pipeline.add_component(instance=openhermes_model, name="openhermes_model") - blender_pipeline.add_component(instance=solar_prompt_builder, name="solar_prompt_builder") blender_pipeline.add_component(instance=solar_model, name="solar_model") - blender_pipeline.add_component(instance=qwen_prompt_builder, name="qwen_prompt_builder") blender_pipeline.add_component(instance=qwen_model, name="qwen_model") - blender_pipeline.add_component(instance=mistral_prompt_builder, name="mistral_prompt_builder") blender_pipeline.add_component(instance=mistral_model, name="mistral_model") - blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") 
blender_pipeline.connect("openchat_prompt_builder", "openchat_model") @@ -95,6 +112,7 @@ blender_pipeline.connect("qwen_prompt_builder", "qwen_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("openchat_model", "llm_blender_ranker") @@ -103,24 +121,28 @@ blender_pipeline.connect("qwen_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: instruction = row["instruction"] prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { - {"llama_prompt_builder": {"prompt": prompt}}, - {"phi_prompt_builder": {"prompt": prompt}}, - {"openchat_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"openhermes_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"solar_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"qwen_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"mistral_prompt_builder": {"instruction": instruction, "prompt": prompt}}, + "llama_prompt_builder": {"prompt": prompt}, + "phi_prompt_builder": {"prompt": prompt}, + "openchat_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "openhermes_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "solar_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "qwen_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "mistral_prompt_builder": {"instruction": instruction, "prompt": prompt}, } ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -128,9 +150,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py b/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py index 2bf4df8..e354c7b 100644 --- a/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py +++ b/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of best performing LLMs on the Mix-Instruct dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is + evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads 3 top performing LLMs: LLaMA, Phi and Mistral. +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. 
+ +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs. +""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the Mix-Instruct dataset dataset = load_dataset("llm-blender/mix-instruct", split="validation") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -26,21 +44,26 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 128, "temperature": 0.2}} - +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -52,18 +75,25 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") + +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: instruction = row["instruction"] prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { {"llama_prompt_builder": {"instruction": instruction, "prompt": prompt}}, @@ -73,6 +103,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -80,9 +111,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", 
metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/mistral.py b/src/llm_blender/mix_instruct/mistral.py index 99718ef..c047526 100644 --- a/src/llm_blender/mix_instruct/mistral.py +++ b/src/llm_blender/mix_instruct/mistral.py @@ -1,3 +1,16 @@ +"""Evaluation of Mistral-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Mistral-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Mistral-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,69 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with mistral-7b-instruct-v0.2 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""[INST] {instruction} {prompt} [/INST] """ - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "mistral-7b-instruct-v0.2.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_mistral.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_mistral.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/openchat.py b/src/llm_blender/mix_instruct/openchat.py index 6d529a5..f8119bf 100644 --- a/src/llm_blender/mix_instruct/openchat.py +++ b/src/llm_blender/mix_instruct/openchat.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenChat-3.5 on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the OpenChat-3.5 model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenChat-3.5 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openchat-3.5-0106 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""GPT4 Correct User: {instruction}\n{prompt}<|end_of_turn|>GPT4 Correct Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/openchat-3.5-0106.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/openhermes.py b/src/llm_blender/mix_instruct/openhermes.py index 6d1fd13..651fa97 100644 --- a/src/llm_blender/mix_instruct/openhermes.py +++ b/src/llm_blender/mix_instruct/openhermes.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenHermes-2.5 on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the OpenHermes-2.5 model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenHermes-2.5 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,41 +22,74 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openhermes-2.5-mistral-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system {instruction}<|im_end|> <|im_start|>user {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/phi.py b/src/llm_blender/mix_instruct/phi.py index 027fdd4..b787888 100644 --- a/src/llm_blender/mix_instruct/phi.py +++ b/src/llm_blender/mix_instruct/phi.py @@ -1,3 +1,16 @@ +"""Evaluation of Phi-3-mini on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Phi-3-mini model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Phi-3-mini model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with phi-3-mini-4k-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|user|>\n{instruction} {prompt} <|end|>\n<|assistant|>""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "phi-3-mini-4k-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_phi.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_phi.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/qwen.py b/src/llm_blender/mix_instruct/qwen.py index 321faef..cefae0c 
100644 --- a/src/llm_blender/mix_instruct/qwen.py +++ b/src/llm_blender/mix_instruct/qwen.py @@ -1,3 +1,16 @@ +"""Evaluation of Qwen-1.5-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Qwen-1.5-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Qwen-1.5-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,41 +22,74 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with qwen1.5-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user {instruction}: {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/qwen1_5-7b-chat-q4_k_m.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/solar.py b/src/llm_blender/mix_instruct/solar.py index 5f4ba8a..157d102 100644 --- 
a/src/llm_blender/mix_instruct/solar.py +++ b/src/llm_blender/mix_instruct/solar.py @@ -1,3 +1,16 @@ +"""Evaluation of Solar-10.7b-Instruct on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Solar-10.7b-Instruct model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Solar-10.7b-Instruct model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,38 +22,71 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with solar-10.7b-instruct-v1.0 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""### User: {instruction} {prompt} ### Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/solar-10.7b-instruct-v1.0.Q4_K_M" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/starling.py b/src/llm_blender/mix_instruct/starling.py index 0773539..3f51455 100644 --- a/src/llm_blender/mix_instruct/starling.py +++ 
b/src/llm_blender/mix_instruct/starling.py @@ -1,3 +1,16 @@ +"""Evaluation of Starling-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Starling-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Starling-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with starling-lm-7b-alpha + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""GPT4 Correct User: {instruction}\n{prompt}<|end_of_turn|>GPT4 Correct Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/starling-lm-7b-alpha.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_starling.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_starling.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"])
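
For reference, the single-model Mix-Instruct scripts above (Mistral, OpenChat, OpenHermes, Phi, Qwen, Solar, Starling) all follow the same generate-then-evaluate pattern: load a GGUF model with `LlamaCppGenerator`, apply a model-specific chat template to each row of the validation split, write the generations to a CSV, and score them with `LLMBlenderEvaluator`. The sketch below factors that pattern into one helper; the model path, prompt template, and per-model `output_csv` name in the usage example are illustrative assumptions, not values taken from the diff.

```python
"""Minimal sketch of the shared generate-and-evaluate loop used by the
single-model Mix-Instruct scripts. Model paths, the prompt template, and
output filenames here are assumptions for illustration."""

from datasets import load_dataset
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator


def evaluate_model(model_path: str, output_csv: str, format_prompt) -> dict:
    """Generate responses on the Mix-Instruct validation split and score them."""
    # Load the model; context size and generation settings mirror the scripts above
    generator = LlamaCppGenerator(model=model_path, n_ctx=256)
    generator.warm_up()

    # Load the validation split and convert to pandas for row-wise apply
    dataset = load_dataset("llm-blender/mix-instruct", split="validation").to_pandas()

    def generate(row) -> str:
        # format_prompt wraps the instruction and input in the model's chat template
        result = generator.run(
            format_prompt(row["instruction"], row["input"]),
            generation_kwargs={"max_tokens": 128, "temperature": 0.2},
        )
        return result["replies"][0]

    # Store one generation per row, then save the results to a CSV file
    dataset.loc[:, "result"] = dataset.apply(lambda row: str(generate(row)), axis=1)
    dataset.to_csv(output_csv, index=False)

    # Score the generations against the reference outputs
    evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"])
    return evaluator.compute_metrics()


if __name__ == "__main__":
    # Example usage with an assumed model path, an assumed per-model output
    # filename, and an OpenHermes-style ChatML template
    metrics = evaluate_model(
        model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
        output_csv="output_openhermes.csv",  # assumed name; one file per model
        format_prompt=lambda instruction, prompt: (
            f"<|im_start|>system\n{instruction}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant"
        ),
    )
    print("BLEURT Score", metrics["bleurt"])
    print("BARTSCORE Score", metrics["bartscore"])
    print("BERTSCORE Score", metrics["bertscore"])
```

Keeping the per-model differences confined to the prompt template and output filename makes the seven baseline runs easier to compare, since the generation settings (`n_ctx=256`, `max_tokens=128`, `temperature=0.2`) and the metric computation stay identical across models.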