diff --git a/README.md b/README.md index a795931..eade0b4 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,10 @@ print(ranked_answers) # ] ``` -The API documentation can be found [here](src/llm_blender/README.md). +The detailed documentation can be found in the [LLM-Blender API Reference](src/llm_blender/README.md). + +As the [`llm-blender` library](https://github.com/yuchenlin/LLM-Blender) lacks a stable release, the necessary code has been incorporated into this project under `src/llm_blender/llm_blender_utils`. + ## Results diff --git a/paper/llm_blender.pdf b/paper/llm_blender.pdf index 235a6a1..5e4ea79 100644 Binary files a/paper/llm_blender.pdf and b/paper/llm_blender.pdf differ diff --git a/src/llm_blender/billsum/llama.py b/src/llm_blender/billsum/llama.py index 50284c4..1e62259 100644 --- a/src/llm_blender/billsum/llama.py +++ b/src/llm_blender/billsum/llama.py @@ -1,3 +1,16 @@ +"""Evaluation of Llama-3-8b on the BillSum dataset + +This script implements a pipeline to evaluate the Llama-3-8b model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. 
""" """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,38 +41,58 @@ def generate_result( ) # Format prompt to be compatible with meta-llama-3-8b-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> """ f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""" ) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "meta-llama-3-8b-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_llama.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_llama.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/llm_blender_ranker_all_llms.py b/src/llm_blender/billsum/llm_blender_ranker_all_llms.py index cd65e79..a6fac93 100644 --- a/src/llm_blender/billsum/llm_blender_ranker_all_llms.py +++ b/src/llm_blender/billsum/llm_blender_ranker_all_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of LLMs on the BillSum dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral). +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs. 
+""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the BillSum dataset dataset = load_dataset("billsum", split="test") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -51,6 +69,7 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template) @@ -59,8 +78,10 @@ qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params) @@ -69,11 +90,13 @@ qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") - +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -97,6 +120,8 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("openchat_prompt_builder", "openchat_model") @@ -105,6 +130,7 @@ blender_pipeline.connect("qwen_prompt_builder", "qwen_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("openchat_model", "llm_blender_ranker") @@ -113,10 +139,13 @@ blender_pipeline.connect("qwen_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { {"llama_prompt_builder": {"prompt": prompt}}, @@ -130,6 +159,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ 
-137,9 +167,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py b/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py index 72d2d3a..dce0115 100644 --- a/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py +++ b/src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of best performing LLMs on the BillSum dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads 3 top performing LLMs: LLaMA, Phi and Mistral. +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the BillSum dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs. +""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the BillSum dataset dataset = load_dataset("billsum", split="test") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. 
The summary should cover all the key points and main ideas presented in the original text, while """ @@ -26,20 +44,26 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -51,18 +75,24 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { { @@ -74,6 +104,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -81,9 +112,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/mistral.py b/src/llm_blender/billsum/mistral.py index fd7b53c..bed26d9 100644 --- a/src/llm_blender/billsum/mistral.py +++ b/src/llm_blender/billsum/mistral.py @@ -1,3 +1,16 @@ +"""Evaluation of Mistral-7b on the BillSum dataset + +This script implements a pipeline to evaluate the Mistral-7b model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Mistral-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. 
Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,35 +41,55 @@ def generate_result( ) # Format prompt to be compatible with mistral-7b-instruct-v0.2 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""[INST] {instruction} {prompt} [/INST] """ - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "mistral-7b-instruct-v0.2.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_mistral.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_mistral.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/openchat.py b/src/llm_blender/billsum/openchat.py index cca895a..40b1f28 100644 --- a/src/llm_blender/billsum/openchat.py +++ b/src/llm_blender/billsum/openchat.py @@ -1,10 +1,33 @@ +"""Evaluation of OpenChat on the BillSum dataset + +This script implements a pipeline to evaluate the OpenChat model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenChat model using the LlamaCppGenerator from Haystack. +2. 
Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed prompt for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -18,37 +41,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openchat-3.5-0106 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/openchat-3.5-0106.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/openhermes.py b/src/llm_blender/billsum/openhermes.py index af28273..6649ece 100644 --- a/src/llm_blender/billsum/openhermes.py +++ b/src/llm_blender/billsum/openhermes.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenHermes on the BillSum 
dataset + +This script implements a pipeline to evaluate the OpenHermes model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenHermes model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,6 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -15,39 +41,59 @@ def generate_result( ) # Format prompt to be compatible with openhermes-2.5-mistral-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system {instruction}<|im_end|> <|im_start|>user {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openhermes.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/phi.py b/src/llm_blender/billsum/phi.py index 841499a..4da16d4 100644 --- a/src/llm_blender/billsum/phi.py +++ 
b/src/llm_blender/billsum/phi.py @@ -1,3 +1,16 @@ +"""Evaluation of Phi-3 on the BillSum dataset + +This script implements a pipeline to evaluate the Phi-3 model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Phi-3 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -8,7 +21,19 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ + # Additional instructions for the model for Summarization instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -16,35 +41,55 @@ def generate_result( ) # Format prompt to be compatible with phi-3-mini-4k-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|user|>\n{instruction} {prompt} <|end|>\n<|assistant|>""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "phi-3-mini-4k-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_phi.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_phi.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/qwen.py b/src/llm_blender/billsum/qwen.py index 6474e0b..3c5878b 100644 --- a/src/llm_blender/billsum/qwen.py +++ 
b/src/llm_blender/billsum/qwen.py @@ -1,10 +1,33 @@ +"""Evaluation of Qwen on the BillSum dataset + +This script implements a pipeline to evaluate the Qwen model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Qwen model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed prompt for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -24,37 +47,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with qwen1.5-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/qwen1_5-7b-chat-q4_k_m.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace Hub dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_qwen.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", 
metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/solar.py b/src/llm_blender/billsum/solar.py index 943950b..433fe1c 100644 --- a/src/llm_blender/billsum/solar.py +++ b/src/llm_blender/billsum/solar.py @@ -1,10 +1,33 @@ +"""Evaluation of Solar on the BillSum dataset + +This script implements a pipeline to evaluate the Solar model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Solar model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -21,37 +44,68 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + Returns: + str: The generated response from the model. 
+ """ # Format prompt to be compatible with solar-10.7b-instruct-v1.0 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/solar-10.7b-instruct-v1.0.Q4_K_M" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Load the dataset from the HuggingFace +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/billsum/starling.py b/src/llm_blender/billsum/starling.py index 703fde5..287b562 100644 --- a/src/llm_blender/billsum/starling.py +++ b/src/llm_blender/billsum/starling.py @@ -1,10 +1,33 @@ +"""Evaluation of Starling on the BillSum dataset + +This script implements a pipeline to evaluate the Starling model on the BillSum dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Starling model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the BillSum dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the BillSum dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator from llm_blender import LLMBlenderEvaluator -def construct_prompt(prompt=""): +def construct_prompt(prompt: str = ""): + """ + Construct a prompt with instructions for summarization. + + Args: + prompt (str): The main text input for the model. + + Returns: + str: The constructed for the model. + """ + # Additional instructions for the model for Summarization prompt_with_instruction = ( """ Provide a comprehensive summary of the given text. """ """The summary should cover all the key points and main ideas presented in the original text, """ @@ -18,37 +41,69 @@ def generate_result( generator: LlamaCppGenerator, prompt: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. 
+ + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with starling-lm-7b-alpha + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = construct_prompt(prompt) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 500, "temperature": 0.1}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/starling-lm-7b-alpha.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("billsum", split="test") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'text' column +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1 ) -dataset.to_csv("output_starling.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_starling.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llama.py b/src/llm_blender/mix_instruct/llama.py index 2ae3222..9060c51 100644 --- a/src/llm_blender/mix_instruct/llama.py +++ b/src/llm_blender/mix_instruct/llama.py @@ -1,3 +1,16 @@ +"""Evaluation of Llama-3-8b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Llama-3-8b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,40 +22,72 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. - # Format prompt to be compatible with meta-llama-3-8b-instruct + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. 
+ prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ + # Format prompt to be compatible with meta-llama-3-8b-instruct model + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> """ f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""" ) - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "meta-llama-3-8b-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_llama.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_llama.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py b/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py index be67605..b6bea1f 100644 --- a/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py +++ b/src/llm_blender/mix_instruct/llm_blender_ranker_all_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of LLMs on the Mix-Instruct dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is +evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral). +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. + +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs. 
+""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,9 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the Mix-Instruct dataset dataset = load_dataset("llm-blender/mix-instruct", split="validation") - +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -41,6 +58,7 @@ mistral_prompt_template = """[INST] {{ instruction }} {{ prompt }} [/INST] """ +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template) @@ -49,8 +67,10 @@ qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 128, "temperature": 0.2}} +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params) @@ -59,34 +79,31 @@ qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") - +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") - blender_pipeline.add_component(instance=phi_prompt_builder, name="phi_prompt_builder") blender_pipeline.add_component(instance=phi_model, name="phi_model") - blender_pipeline.add_component(instance=openchat_prompt_builder, name="openchat_prompt_builder") blender_pipeline.add_component(instance=openchat_model, name="openchat_model") - blender_pipeline.add_component(instance=openhermes_prompt_builder, name="openhermes_prompt_builder") blender_pipeline.add_component(instance=openhermes_model, name="openhermes_model") - blender_pipeline.add_component(instance=solar_prompt_builder, name="solar_prompt_builder") blender_pipeline.add_component(instance=solar_model, name="solar_model") - blender_pipeline.add_component(instance=qwen_prompt_builder, name="qwen_prompt_builder") blender_pipeline.add_component(instance=qwen_model, name="qwen_model") - blender_pipeline.add_component(instance=mistral_prompt_builder, name="mistral_prompt_builder") blender_pipeline.add_component(instance=mistral_model, name="mistral_model") - blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") 
blender_pipeline.connect("openchat_prompt_builder", "openchat_model") @@ -95,6 +112,7 @@ blender_pipeline.connect("qwen_prompt_builder", "qwen_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("openchat_model", "llm_blender_ranker") @@ -103,24 +121,28 @@ blender_pipeline.connect("qwen_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: instruction = row["instruction"] prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { - {"llama_prompt_builder": {"prompt": prompt}}, - {"phi_prompt_builder": {"prompt": prompt}}, - {"openchat_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"openhermes_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"solar_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"qwen_prompt_builder": {"instruction": instruction, "prompt": prompt}}, - {"mistral_prompt_builder": {"instruction": instruction, "prompt": prompt}}, + "llama_prompt_builder": {"prompt": prompt}, + "phi_prompt_builder": {"prompt": prompt}, + "openchat_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "openhermes_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "solar_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "qwen_prompt_builder": {"instruction": instruction, "prompt": prompt}, + "mistral_prompt_builder": {"instruction": instruction, "prompt": prompt}, } ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -128,9 +150,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py b/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py index 2bf4df8..e354c7b 100644 --- a/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py +++ b/src/llm_blender/mix_instruct/llm_blender_ranker_top_3_llms.py @@ -1,3 +1,19 @@ +""" +Evaluation of ensemble of best performing LLMs on the Mix-Instruct dataset using LLM Blender + +This script implements a pipeline to ensemble multiple language models on the Mix-Instruct dataset. The pipeline is + evaluated on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads 3 top performing LLMs: LLaMA, Phi and Mistral. +2. Builds prompts for each model using specific templates. +3. Generates responses for prompts from the Mix-Instruct dataset using each model. +4. Ranks the generated responses from all the models using the LLM Blender Ranker. +5. Evaluates the top-ranked response against reference outputs using multiple metrics. 
+ +The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs. +""" + from datasets import load_dataset from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -5,8 +21,10 @@ from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker +# Load the Mix-Instruct dataset dataset = load_dataset("llm-blender/mix-instruct", split="validation") +# Define prompt templates for each model llama_prompt_template = ( """<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """ """text. The summary should cover all the key points and main ideas presented in the original text, while """ @@ -26,21 +44,26 @@ """a concise and easy-to-understand format.: {{ prompt }} [/INST] """ ) +# Initialize PromptBuilder for each model llama_prompt_builder = PromptBuilder(template=llama_prompt_template) phi_prompt_builder = PromptBuilder(template=phi_prompt_template) mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template) +# Define model and generation parameters for all models model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 128, "temperature": 0.2}} - +# Initialize LlamaCppGenerator for each model llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params) phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params) mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params) +# Initialize LLMBlenderRanker to ensemble multiple models llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu") +# Create the main pipeline blender_pipeline = Pipeline() +# Add components to the pipeline blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder") blender_pipeline.add_component(instance=llama_model, name="llama_model") @@ -52,18 +75,25 @@ blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker") +# Connect components in the pipeline +# Connect the prompt builders to the respective model blender_pipeline.connect("llama_prompt_builder", "llama_model") blender_pipeline.connect("phi_prompt_builder", "phi_model") blender_pipeline.connect("mistral_prompt_builder", "mistral_model") +# Connect all the models to the LLMBlenderRanker for ensembling blender_pipeline.connect("llama_model", "llm_blender_ranker") blender_pipeline.connect("phi_model", "llm_blender_ranker") blender_pipeline.connect("mistral_model", "llm_blender_ranker") + +# Process the dataset and generate answers generated_answers_labels = [] for row in dataset: instruction = row["instruction"] prompt = row["input"] label = row["output"] + + # Run the pipeline for each input output = blender_pipeline.run( { {"llama_prompt_builder": {"instruction": instruction, "prompt": prompt}}, @@ -73,6 +103,7 @@ ) generated_answers_labels.append((output["answers"], label)) +# Prepare data for evaluation preds = [] labels = [] for ranked_answers, label in generated_answers_labels: @@ -80,9 +111,13 @@ preds.append(ranked_answers[0].data) labels.append(label) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=preds, labels=labels) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the evaluation metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", 
metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/mistral.py b/src/llm_blender/mix_instruct/mistral.py index 99718ef..c047526 100644 --- a/src/llm_blender/mix_instruct/mistral.py +++ b/src/llm_blender/mix_instruct/mistral.py @@ -1,3 +1,16 @@ +"""Evaluation of Mistral-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Mistral-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Mistral-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,69 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with mistral-7b-instruct-v0.2 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""[INST] {instruction} {prompt} [/INST] """ - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "mistral-7b-instruct-v0.2.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_mistral.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_mistral.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/openchat.py b/src/llm_blender/mix_instruct/openchat.py index 6d529a5..f8119bf 100644 --- a/src/llm_blender/mix_instruct/openchat.py +++ b/src/llm_blender/mix_instruct/openchat.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenChat-3.5 on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the OpenChat-3.5 model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenChat-3.5 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openchat-3.5-0106 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""GPT4 Correct User: {instruction}\n{prompt}<|end_of_turn|>GPT4 Correct Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/openchat-3.5-0106.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/openhermes.py b/src/llm_blender/mix_instruct/openhermes.py index 6d1fd13..651fa97 100644 --- a/src/llm_blender/mix_instruct/openhermes.py +++ b/src/llm_blender/mix_instruct/openhermes.py @@ -1,3 +1,16 @@ +"""Evaluation of OpenHermes-2.5 on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the OpenHermes-2.5 model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the OpenHermes-2.5 model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,41 +22,74 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with openhermes-2.5-mistral-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system {instruction}<|im_end|> <|im_start|>user {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git 
a/src/llm_blender/mix_instruct/phi.py b/src/llm_blender/mix_instruct/phi.py index 027fdd4..b787888 100644 --- a/src/llm_blender/mix_instruct/phi.py +++ b/src/llm_blender/mix_instruct/phi.py @@ -1,3 +1,16 @@ +"""Evaluation of Phi-3-mini on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Phi-3-mini model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Phi-3-mini model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with phi-3-mini-4k-instruct + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|user|>\n{instruction} {prompt} <|end|>\n<|assistant|>""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "phi-3-mini-4k-instruct.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_phi.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_phi.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/qwen.py b/src/llm_blender/mix_instruct/qwen.py index 321faef..cefae0c 
100644 --- a/src/llm_blender/mix_instruct/qwen.py +++ b/src/llm_blender/mix_instruct/qwen.py @@ -1,3 +1,16 @@ +"""Evaluation of Qwen-1.5-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Qwen-1.5-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Qwen-1.5-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,41 +22,74 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with qwen1.5-7b + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""<|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user {instruction}: {prompt}<|im_end|> <|im_start|>assistant""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/qwen1_5-7b-chat-q4_k_m.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/solar.py b/src/llm_blender/mix_instruct/solar.py index 5f4ba8a..157d102 100644 --- 
a/src/llm_blender/mix_instruct/solar.py +++ b/src/llm_blender/mix_instruct/solar.py @@ -1,3 +1,16 @@ +"""Evaluation of Solar-10.7b-Instruct on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Solar-10.7b-Instruct model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Solar-10.7b-Instruct model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,38 +22,71 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with solar-10.7b-instruct-v1.0 + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""### User: {instruction} {prompt} ### Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/solar-10.7b-instruct-v1.0.Q4_K_M" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_openchat.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_openchat.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"]) diff --git a/src/llm_blender/mix_instruct/starling.py b/src/llm_blender/mix_instruct/starling.py index 0773539..3f51455 100644 --- a/src/llm_blender/mix_instruct/starling.py +++ 
b/src/llm_blender/mix_instruct/starling.py @@ -1,3 +1,16 @@ +"""Evaluation of Starling-7b on the Mix-Instruct dataset + +This script implements a pipeline to evaluate the Starling-7b model on the Mix-Instruct dataset +on the BLEURT, BARTScore, and BERTScore metrics. + +The pipeline performs the following steps: +1. Loads the Starling-7b model using the LlamaCppGenerator from Haystack. +2. Generates responses for prompts and instructions from the Mix-Instruct dataset. +3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics. + +This evaluation provides a baseline for the model's performance on the Mix-Instruct dataset. +""" + from datasets import load_dataset from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator @@ -9,37 +22,70 @@ def generate_result( prompt: str = "", instruction: str = "", ) -> str: + """ + Generate a response using the LlamaCppGenerator. + + The prompt and instruction are formatted to be compatible with the model. + + Args: + generator (LlamaCppGenerator): The initialized LlamaCppGenerator object. + prompt (str): The main text input for the model. + instruction (str): Additional instructions for the model. + + Returns: + str: The generated response from the model. + """ # Format prompt to be compatible with starling-lm-7b-alpha + # This specific format is required for the model to distinguish between user input and expected output formatted_prompt = f"""GPT4 Correct User: {instruction}\n{prompt}<|end_of_turn|>GPT4 Correct Assistant:""" - # Generate text + # Generate text using the LlamaCppGenerator result = generator.run( formatted_prompt, generation_kwargs={"max_tokens": 128, "temperature": 0.2}, ) + + # Extract the generated text from the result generated_answer = result["replies"][0] return generated_answer +# Define the path to the model weights model = "models/starling-lm-7b-alpha.Q4_K_M.gguf" + +# Initialize the LlamaCppGenerator with the specified model and context window size generator = LlamaCppGenerator( model=model, n_ctx=256, ) + +# Warm up the generator (loading the model into memory) generator.warm_up() +# Load the dataset from the HuggingFace dataset = load_dataset("llm-blender/mix-instruct", split="validation") + +# Convert the dataset to a pandas DataFrame for easier manipulation dataset = dataset.to_pandas() + +# Generate results for each row in the dataset +# Apply the generate_result function to each row, using the 'input' and 'instruction' columns +# Store the results in the 'result' column dataset.loc[:, "result"] = dataset.apply( lambda row: str(generate_result(generator=generator, prompt=row["input"], instruction=row["instruction"])), axis=1 ) -dataset.to_csv("output_starling.csv", index=False) +# Save the generated texts to a CSV file +dataset.to_csv("output_starling.csv", index=False) +# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"]) + +# Compute various metrics to evaluate the generated results against the reference outputs metrics = evaluator.compute_metrics() +# Print the computed metrics print("BLEURT Score", metrics["bleurt"]) print("BARTSCORE Score", metrics["bartscore"]) print("BERTSCORE Score", metrics["bertscore"])
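
For reference, the single-model Mix-Instruct scripts above (Mistral, OpenChat, OpenHermes, Phi, Qwen, Solar, Starling) all follow the same generate-then-evaluate pattern: load a GGUF model with `LlamaCppGenerator`, apply a model-specific chat template to each row of the validation split, write the generations to a CSV, and score them with `LLMBlenderEvaluator`. The sketch below factors that pattern into one helper; the model path, prompt template, and per-model `output_csv` name in the usage example are illustrative assumptions, not values taken from the diff.

```python
"""Minimal sketch of the shared generate-and-evaluate loop used by the
single-model Mix-Instruct scripts. Model paths, the prompt template, and
output filenames here are assumptions for illustration."""

from datasets import load_dataset
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator


def evaluate_model(model_path: str, output_csv: str, format_prompt) -> dict:
    """Generate responses on the Mix-Instruct validation split and score them."""
    # Load the model; context size and generation settings mirror the scripts above
    generator = LlamaCppGenerator(model=model_path, n_ctx=256)
    generator.warm_up()

    # Load the validation split and convert to pandas for row-wise apply
    dataset = load_dataset("llm-blender/mix-instruct", split="validation").to_pandas()

    def generate(row) -> str:
        # format_prompt wraps the instruction and input in the model's chat template
        result = generator.run(
            format_prompt(row["instruction"], row["input"]),
            generation_kwargs={"max_tokens": 128, "temperature": 0.2},
        )
        return result["replies"][0]

    # Store one generation per row, then save the results to a CSV file
    dataset.loc[:, "result"] = dataset.apply(lambda row: str(generate(row)), axis=1)
    dataset.to_csv(output_csv, index=False)

    # Score the generations against the reference outputs
    evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"])
    return evaluator.compute_metrics()


if __name__ == "__main__":
    # Example usage with an assumed model path, an assumed per-model output
    # filename, and an OpenHermes-style ChatML template
    metrics = evaluate_model(
        model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
        output_csv="output_openhermes.csv",  # assumed name; one file per model
        format_prompt=lambda instruction, prompt: (
            f"<|im_start|>system\n{instruction}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant"
        ),
    )
    print("BLEURT Score", metrics["bleurt"])
    print("BARTSCORE Score", metrics["bartscore"])
    print("BERTSCORE Score", metrics["bertscore"])
```

Keeping the per-model differences confined to the prompt template and output filename makes the seven baseline runs easier to compare, since the generation settings (`n_ctx=256`, `max_tokens=128`, `temperature=0.2`) and the metric computation stay identical across models.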