From 89880237789f2db59b36fc1ab99ad78462136d20 Mon Sep 17 00:00:00 2001
From: Kevin BEAUGRAND <9513635+kbeaugrand@users.noreply.github.com>
Date: Sun, 15 Sep 2024 22:51:58 +0200
Subject: [PATCH] Fix evaluations and test set generation to use json_object
 output format from LLMs (#787)

## Motivation and Context (Why the change? What's the scenario?)

Use the json_object output format from the LLM in test set generation and evaluation.

## High level description (Approach, Design)

Add POCO classes for the JSON objects generated by the LLM and update the deserialization accordingly.

---
 .../AnswerCorrectnessEvaluator.cs             |  6 +-
 .../AnswerCorrectness/StatementExtraction.cs  | 16 ++++
 .../ContextRecall/ContextRecallEvaluator.cs   | 12 +--
 .../GroundTruthClassifications.cs             | 16 ++++
 .../ContextRelevancyEvaluator.cs              |  2 +
 .../Faithfulness/FaithfulnessEvaluations.cs   | 16 ++++
 .../Faithfulness/FaithfulnessEvaluator.cs     | 17 +++--
 .../Relevance/RelevanceEvaluator.cs           |  2 +
 .../Prompts/Evaluation/ContextPrecision.txt   |  4 +-
 .../Prompts/Evaluation/ContextRecall.txt      | 75 ++++++++++---------
 .../Prompts/Evaluation/Correctness.txt        |  4 +-
 .../Prompts/Evaluation/Faithfulness.txt       | 69 +++++++++--------
 .../Prompts/Extraction/Keyphrase.txt          | 13 ++--
 .../Prompts/Extraction/Question.txt           |  2 +-
 .../Prompts/Extraction/Statements.txt         |  6 +-
 .../SyntheticData/ConditionalQuestion.txt     |  3 +-
 .../SyntheticData/MultiContextQuestion.txt    |  3 +-
 .../Prompts/SyntheticData/QuestionAnswer.txt  |  4 +-
 .../SyntheticData/ReasoningQuestion.txt       |  3 +-
 applications/evaluation/TestSetGenerator.cs   | 14 +++-
 .../Evaluation.FunctionalTests.csproj         |  6 ++
 21 files changed, 194 insertions(+), 99 deletions(-)
 create mode 100644 applications/evaluation/Evaluators/AnswerCorrectness/StatementExtraction.cs
 create mode 100644 applications/evaluation/Evaluators/ContextRecall/GroundTruthClassifications.cs
 create mode 100644 applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluations.cs

diff --git a/applications/evaluation/Evaluators/AnswerCorrectness/AnswerCorrectnessEvaluator.cs b/applications/evaluation/Evaluators/AnswerCorrectness/AnswerCorrectnessEvaluator.cs
index c07059b35..28204182f 100644
--- a/applications/evaluation/Evaluators/AnswerCorrectness/AnswerCorrectnessEvaluator.cs
+++ b/applications/evaluation/Evaluators/AnswerCorrectness/AnswerCorrectnessEvaluator.cs
@@ -20,11 +20,15 @@ internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
     private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.ExtractStatements));
 
     private KernelFunction EvaluateCorrectness => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Correctness"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.EvaluateCorrectness));
 
     public AnswerCorrectnessEvaluator(Kernel kernel)
@@ -42,7 +46,7 @@ internal async Task Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
                 { "answer", answer.Result }
             }).ConfigureAwait(false);
 
-            return JsonSerializer.Deserialize<List<string>>(extraction.GetValue<string>()!);
+            return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
         }).ConfigureAwait(false);
 
         if (statements is null)
diff --git a/applications/evaluation/Evaluators/AnswerCorrectness/StatementExtraction.cs b/applications/evaluation/Evaluators/AnswerCorrectness/StatementExtraction.cs
new file mode 100644
index 000000000..53bc5f247
--- /dev/null
+++ b/applications/evaluation/Evaluators/AnswerCorrectness/StatementExtraction.cs
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
+
+#pragma warning disable CA1812 // 'StatementExtraction' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class StatementExtraction
+#pragma warning restore CA1812 // 'StatementExtraction' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+    [JsonPropertyName("statements")]
+    public List<string> Statements { get; set; } = new List<string>();
+}
diff --git a/applications/evaluation/Evaluators/ContextRecall/ContextRecallEvaluator.cs b/applications/evaluation/Evaluators/ContextRecall/ContextRecallEvaluator.cs
index 2397944ef..a2a8c33c8 100644
--- a/applications/evaluation/Evaluators/ContextRecall/ContextRecallEvaluator.cs
+++ b/applications/evaluation/Evaluators/ContextRecall/ContextRecallEvaluator.cs
@@ -20,6 +20,8 @@ internal sealed class ContextRecallEvaluator : EvaluationEngine
     private KernelFunction EvaluateContextRecall => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "ContextRecall"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.EvaluateContextRecall));
 
     public ContextRecallEvaluator(Kernel kernel)
@@ -29,7 +31,7 @@ public ContextRecallEvaluator(Kernel kernel)
 
     internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
     {
-        var evaluations = await this.Try(3, async (remainingTry) =>
+        var classification = await this.Try(3, async (remainingTry) =>
         {
             var extraction = await this.EvaluateContextRecall.InvokeAsync(this._kernel, new KernelArguments
             {
@@ -38,16 +40,16 @@ internal async Task Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
                 { "ground_truth", testSet.GroundTruth }
             }).ConfigureAwait(false);
 
-            return JsonSerializer.Deserialize<List<GroundTruthClassification>>(extraction.GetValue<string>()!);
+            return JsonSerializer.Deserialize<GroundTruthClassifications>(extraction.GetValue<string>()!);
         }).ConfigureAwait(false);
 
-        if (evaluations is null)
+        if (classification is null)
         {
             return 0;
         }
 
-        metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", evaluations);
+        metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", classification);
 
-        return (float)evaluations.Count(c => c.Attributed > 0) / (float)evaluations.Count();
+        return (float)classification.Evaluations.Count(c => c.Attributed > 0) / (float)classification.Evaluations.Count;
     }
 }
diff --git a/applications/evaluation/Evaluators/ContextRecall/GroundTruthClassifications.cs b/applications/evaluation/Evaluators/ContextRecall/GroundTruthClassifications.cs
new file mode 100644
index 000000000..ee1c0235b
--- /dev/null
+++ b/applications/evaluation/Evaluators/ContextRecall/GroundTruthClassifications.cs
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.ContextRecall;
+
+#pragma warning disable CA1812 // 'GroundTruthClassifications' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class GroundTruthClassifications
+#pragma warning restore CA1812 // 'GroundTruthClassifications' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+    [JsonPropertyName("evaluations")]
+    public List<GroundTruthClassification> Evaluations { get; set; } = new();
+}
diff --git a/applications/evaluation/Evaluators/ContextRelevancy/ContextRelevancyEvaluator.cs b/applications/evaluation/Evaluators/ContextRelevancy/ContextRelevancyEvaluator.cs
index 6c321f549..df984ecab 100644
--- a/applications/evaluation/Evaluators/ContextRelevancy/ContextRelevancyEvaluator.cs
+++ b/applications/evaluation/Evaluators/ContextRelevancy/ContextRelevancyEvaluator.cs
@@ -19,6 +19,8 @@ internal sealed class ContextRelevancyEvaluator : EvaluationEngine
     private KernelFunction EvaluateContext => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "ContextPrecision"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     });
 
     public ContextRelevancyEvaluator(Kernel kernel)
diff --git a/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluations.cs b/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluations.cs
new file mode 100644
index 000000000..d6a5e5d03
--- /dev/null
+++ b/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluations.cs
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.Faithfulness;
+
+#pragma warning disable CA1812 // 'FaithfulnessEvaluations' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class FaithfulnessEvaluations
+#pragma warning restore CA1812 // 'FaithfulnessEvaluations' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+    [JsonPropertyName("evaluations")]
+    public List<FaithfulnessEvaluation> Evaluations { get; set; } = new();
+}
diff --git a/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluator.cs b/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluator.cs
index 371dec492..6cf2e643b 100644
--- a/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluator.cs
+++ b/applications/evaluation/Evaluators/Faithfulness/FaithfulnessEvaluator.cs
@@ -6,6 +6,7 @@
 using System.Text.Json;
 using System.Threading.Tasks;
 using Microsoft.KernelMemory.Evaluation;
+using Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 
@@ -20,11 +21,15 @@ internal sealed class FaithfulnessEvaluator : EvaluationEngine
     private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.ExtractStatements));
 
     private KernelFunction FaithfulnessEvaluation => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Faithfulness"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.FaithfulnessEvaluation));
 
     public FaithfulnessEvaluator(Kernel kernel)
@@ -34,7 +39,7 @@ public FaithfulnessEvaluator(Kernel kernel)
 
     internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, object?> metadata)
     {
-        var statements = await this.Try(3, async (remainingTry) =>
+        var extraction = await this.Try(3, async (remainingTry) =>
         {
             var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
             {
@@ -42,10 +47,10 @@ internal async Task Evaluate(MemoryAnswer answer, Dictionary
                 { "answer", answer.Result }
             }).ConfigureAwait(false);
 
-            return JsonSerializer.Deserialize<List<string>>(extraction.GetValue<string>()!);
+            return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
         }).ConfigureAwait(false);
 
-        if (statements is null)
+        if (extraction is null)
        {
            return 0;
        }
@@ -56,10 +61,10 @@ internal async Task Evaluate(MemoryAnswer answer, Dictionary
                 { "context", string.Join(Environment.NewLine, answer.RelevantSources.SelectMany(c => c.Partitions.Select(p => p.Text))) },
                 { "answer", answer.Result },
-                { "statements", JsonSerializer.Serialize(statements) }
+                { "statements", JsonSerializer.Serialize(extraction) }
             }).ConfigureAwait(false);
 
-            var faithfulness = JsonSerializer.Deserialize<List<FaithfulnessEvaluation>>(evaluation.GetValue<string>()!);
+            var faithfulness = JsonSerializer.Deserialize<FaithfulnessEvaluations>(evaluation.GetValue<string>()!);
 
             return faithfulness;
         }).ConfigureAwait(false);
 
@@ -71,6 +76,6 @@ internal async Task Evaluate(MemoryAnswer answer, Dictionary
             return 0;
         }
 
-        return (float)faithfulness.Count(c => c.Verdict > 0) / (float)statements.Count();
+        return faithfulness.Evaluations.Count(c => c.Verdict > 0) / (float)extraction.Statements.Count;
     }
 }
diff --git a/applications/evaluation/Evaluators/Relevance/RelevanceEvaluator.cs b/applications/evaluation/Evaluators/Relevance/RelevanceEvaluator.cs
index bed94ee19..00d9e0f22 100644
--- a/applications/evaluation/Evaluators/Relevance/RelevanceEvaluator.cs
+++ b/applications/evaluation/Evaluators/Relevance/RelevanceEvaluator.cs
@@ -25,6 +25,8 @@ internal sealed class RelevanceEvaluator : EvaluationEngine
     private KernelFunction ExtractQuestion => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Question"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     }, functionName: nameof(this.ExtractQuestion));
 
     public RelevanceEvaluator(Kernel kernel)
diff --git a/applications/evaluation/Prompts/Evaluation/ContextPrecision.txt b/applications/evaluation/Prompts/Evaluation/ContextPrecision.txt
index c2ebc0489..117a83739 100644
--- a/applications/evaluation/Prompts/Evaluation/ContextPrecision.txt
+++ b/applications/evaluation/Prompts/Evaluation/ContextPrecision.txt
@@ -26,10 +26,10 @@ verification: {
 
 ## Instructions
 
-Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with output.
+Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with output. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
 question: {{$question}}
 context: {{$context}}
 answer: {{$answer}}
-verification: 
\ No newline at end of file
+verification: 
diff --git a/applications/evaluation/Prompts/Evaluation/ContextRecall.txt b/applications/evaluation/Prompts/Evaluation/ContextRecall.txt
index be5e8cb0a..caa8e28c9 100644
--- a/applications/evaluation/Prompts/Evaluation/ContextRecall.txt
+++ b/applications/evaluation/Prompts/Evaluation/ContextRecall.txt
@@ -3,47 +3,51 @@ question: What can you tell me about albert Albert Einstein?
 context: Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
 answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
-classification:
-  [
-    {
-      "Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
-      "Reason": "The date of birth of Einstein is mentioned clearly in the context.",
-      "Attributed": 1,
-    },
-    {
-      "Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
-      "Reason": "The exact sentence is present in the given context.",
-      "Attributed": 1,
-    },
-    {
-      "Statement": "He published 4 papers in 1905.",
-      "Reason": "There is no mention about papers he wrote in the given context.",
-      "Attributed": 0,
-    },
-    {
-      "Statement": "Einstein moved to Switzerland in 1895.",
-      "Reason": "There is no supporting evidence for this in the given context.",
-      "Attributed": 0
-    },
-  ]
+{
+  "evaluations":
+  [
+    {
+      "Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
+      "Reason": "The date of birth of Einstein is mentioned clearly in the context.",
+      "Attributed": 1,
+    },
+    {
+      "Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
+      "Reason": "The exact sentence is present in the given context.",
+      "Attributed": 1,
+    },
+    {
+      "Statement": "He published 4 papers in 1905.",
+      "Reason": "There is no mention about papers he wrote in the given context.",
+      "Attributed": 0,
+    },
+    {
+      "Statement": "Einstein moved to Switzerland in 1895.",
+      "Reason": "There is no supporting evidence for this in the given context.",
+      "Attributed": 0
+    },
+  ]
+}
 
 question: who won 2020 icc world cup?
 context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
 answer: England
-classification:
-  [
-    {
-      "Statement": "England won the 2022 ICC Men's T20 World Cup.",
-      "Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
-      "Attributed": 1
-    },
-  ]
-
+{
+  "evaluations":
+  [
+    {
+      "Statement": "England won the 2022 ICC Men's T20 World Cup.",
+      "Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
+      "Attributed": 1
+    },
+  ]
+}
 
 question: What is the primary fuel for the Sun?
 context: NULL
 answer: Hydrogen
-classification:
+{
+  "evaluations":
   [
     {
       "Statement": "The Sun's primary fuel is hydrogen.",
@@ -51,14 +55,15 @@ classification:
       "Attributed": 0
     },
   ]
+}
 
 ## Instructions
 
-Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification.
+Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
 question: {{$question}}
 context: {{$context}}
 answer: {{$ground_truth}}
-classification: 
\ No newline at end of file
+classification: 
diff --git a/applications/evaluation/Prompts/Evaluation/Correctness.txt b/applications/evaluation/Prompts/Evaluation/Correctness.txt
index aeaaf3b7c..051c6d309 100644
--- a/applications/evaluation/Prompts/Evaluation/Correctness.txt
+++ b/applications/evaluation/Prompts/Evaluation/Correctness.txt
@@ -84,9 +84,9 @@ Given a ground truth and an answer statements, analyze each statement and classi
 - FP (false positive): statements present in the answer but not directly supported by any statement in ground truth,
 - FN (false negative): statements found in the ground truth but not present in answer.
 
-Each statement can only belong to one of the categories. Provide a reason for each classification.
+Each statement can only belong to one of the categories. Provide a reason for each classification. Your output should always be valid JSON.
 
 question: {{$question}}
 answer: {{$answer}}
 ground_truth: {{$ground_truth}}
-extracted_statements: 
\ No newline at end of file
+extracted_statements: 
diff --git a/applications/evaluation/Prompts/Evaluation/Faithfulness.txt b/applications/evaluation/Prompts/Evaluation/Faithfulness.txt
index 6beb92184..da28e7afb 100644
--- a/applications/evaluation/Prompts/Evaluation/Faithfulness.txt
+++ b/applications/evaluation/Prompts/Evaluation/Faithfulness.txt
@@ -7,45 +7,50 @@ statements: [
     "John is a dedicated student.",
     "John has a part-time job.",
 ]
-answer: [
-  {
-    "Statement": "John is majoring in Biology.",
-    "Reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
-    "Verdict": 0
-  },
-  {
-    "Statement": "John is taking a course on Artificial Intelligence.",
-    "Reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
-    "Verdict": 0
-  },
-  {
-    "Statement": "John is a dedicated student.",
-    "Reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
-    "Verdict": 1
-  },
-  {
-    "Statement": "John has a part-time job.",
-    "Reason": "There is no information given in the context about John having a part-time job.",
-    "Verdict": 0
-  }
-]
+{
+  "evaluations":
+  [
+    {
+      "Statement": "John is majoring in Biology.",
+      "Reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
+      "Verdict": 0
+    },
+    {
+      "Statement": "John is taking a course on Artificial Intelligence.",
+      "Reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
+      "Verdict": 0
+    },
+    {
+      "Statement": "John is a dedicated student.",
+      "Reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
+      "Verdict": 1
+    },
+    {
+      "Statement": "John has a part-time job.",
+      "Reason": "There is no information given in the context about John having a part-time job.",
+      "Verdict": 0
+    }
+  ]
+}
 
 context: Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.
 statements: ["Albert Einstein was a genius."],
-answer: [
-  {
-    "Statement": "Albert Einstein was a genius.",
-    "Reason": "The context and statement are unrelated",
-    "Verdict": 0,
-  }
-]
+{
+  "evaluations":
+  [
+    {
+      "Statement": "Albert Einstein was a genius.",
+      "Reason": "The context and statement are unrelated",
+      "Verdict": 0,
+    }
+  ]
+}
 
 ## Instructions
 
-Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context. Be sure to evaluate all statements.
+Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context. Be sure to evaluate all statements. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
 context: {{$context}}
-statements: {{$statements}}
-answer: 
+statements: {{$statements}}
diff --git a/applications/evaluation/Prompts/Extraction/Keyphrase.txt b/applications/evaluation/Prompts/Extraction/Keyphrase.txt
index 8b3144370..90b89b289 100644
--- a/applications/evaluation/Prompts/Extraction/Keyphrase.txt
+++ b/applications/evaluation/Prompts/Extraction/Keyphrase.txt
@@ -1,26 +1,29 @@
 ## Examples
 
 text: A black hole is a region of spacetime where gravity is so strong that nothing, including light and other electromagnetic waves, has enough energy to escape it. The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole.
-keyphrases: [
+{
+  "statements": [
     "Black hole",
     "Region of spacetime",
     "Strong gravity",
     "Light and electromagnetic waves",
     "Theory of general relativity"
   ]
+}
 
 text: The Great Wall of China is an ancient series of walls and fortifications located in northern China, built around 500 years ago. This immense wall stretches over 13,000 miles and is a testament to the skill and persistence of ancient Chinese engineers.
-keyphrases: [
+{
+  "statements": [
     "Great Wall of China",
     "Ancient fortifications",
     "Northern China"
-  ]
+  ]
+}
 
 ## Instructions
 
-Extract the top 3 to 5 keyphrases from the provided text, focusing on the most significant and distinctive aspects.
+Extract the top 3 to 5 keyphrases from the provided text, focusing on the most significant and distinctive aspects. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
 text: {{$input}}
-keyphrases: 
\ No newline at end of file
diff --git a/applications/evaluation/Prompts/Extraction/Question.txt b/applications/evaluation/Prompts/Extraction/Question.txt
index c4d49532f..56a6434c1 100644
--- a/applications/evaluation/Prompts/Extraction/Question.txt
+++ b/applications/evaluation/Prompts/Extraction/Question.txt
@@ -34,7 +34,7 @@ output:
 
 ## Instructions
 
-Generate a question for the given answer and Identify if answer is noncommittal. Give committal as 1 if the answer is committal and 0 if the answer is noncommittal. A noncommittal answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers
+Generate a question for the given answer and Identify if answer is noncommittal. Give committal as 1 if the answer is committal and 0 if the answer is noncommittal. A noncommittal answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
diff --git a/applications/evaluation/Prompts/Extraction/Statements.txt b/applications/evaluation/Prompts/Extraction/Statements.txt
index 093866048..b874b1c54 100644
--- a/applications/evaluation/Prompts/Extraction/Statements.txt
+++ b/applications/evaluation/Prompts/Extraction/Statements.txt
@@ -2,7 +2,7 @@
 
 question: Who was Albert Einstein and what is he best known for?
 answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
-output: [
+statements: [
     "Albert Einstein was a German-born theoretical physicist.",
     "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.",
     "Albert Einstein was best known for developing the theory of relativity.",
@@ -11,10 +11,10 @@
 
 ## Instructions
 
-Given a question, an answer, and sentences from the answer analyze the complexity of each sentence and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement.
+Given a question, an answer, and sentences from the answer analyze the complexity of each sentence and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Your output should always be valid JSON.
 
 Let's do it with a real data.
 
 question: {{$question}}
 answer: {{$answer}}
-output: 
\ No newline at end of file
+statements: 
diff --git a/applications/evaluation/Prompts/SyntheticData/ConditionalQuestion.txt b/applications/evaluation/Prompts/SyntheticData/ConditionalQuestion.txt
index dcf211843..499b9c0fd 100644
--- a/applications/evaluation/Prompts/SyntheticData/ConditionalQuestion.txt
+++ b/applications/evaluation/Prompts/SyntheticData/ConditionalQuestion.txt
@@ -17,9 +17,10 @@ Follow the rules given below while rewriting the question.
 2. The rewritten question must be reasonable and must be understood and responded by humans.
 3. The rewritten question must be fully answerable from information present context.
 4. phrases like 'provided context','according to the context?',etc are not allowed to appear in the question.
+5. Always answer in JSON format.
 
 Let's do it with a real data.
 
 question: {{$question}}
 context: {{$context}}
-output: 
\ No newline at end of file
+output: 
diff --git a/applications/evaluation/Prompts/SyntheticData/MultiContextQuestion.txt b/applications/evaluation/Prompts/SyntheticData/MultiContextQuestion.txt
index 038d99852..3430006e1 100644
--- a/applications/evaluation/Prompts/SyntheticData/MultiContextQuestion.txt
+++ b/applications/evaluation/Prompts/SyntheticData/MultiContextQuestion.txt
@@ -19,10 +19,11 @@
 3. The rewritten question must be fully answerable from information present in context1 and context2.
 4. Read and understand both contexts and rewrite the question so that answering requires insight from both context1 and context2.
 5. phrases like 'based on the provided context','according to the context?',etc are not allowed to appear in the question.
+6. Always answer in JSON format.
 
 Let's do it with a real data.
 
 question: {{$question}}
 context1: {{$context1}}
 context2: {{$context2}}
-output: 
\ No newline at end of file
+output: 
diff --git a/applications/evaluation/Prompts/SyntheticData/QuestionAnswer.txt b/applications/evaluation/Prompts/SyntheticData/QuestionAnswer.txt
index 2984d1bef..b72194762 100644
--- a/applications/evaluation/Prompts/SyntheticData/QuestionAnswer.txt
+++ b/applications/evaluation/Prompts/SyntheticData/QuestionAnswer.txt
@@ -25,9 +25,9 @@ answer: {
 
 ## Instructions
 
-Answer the question using the information from the given context. Output verdict as '1' if answer is present '-1' if answer is not present in the context.
+Answer the question using the information from the given context. Output verdict as '1' if answer is present '-1' if answer is not present in the context. Always answer in JSON format.
 
 Let's do it with a real data.
 
 context: {{$context}}}}
 question: {{$question}}
-answer: 
\ No newline at end of file
+answer: 
diff --git a/applications/evaluation/Prompts/SyntheticData/ReasoningQuestion.txt b/applications/evaluation/Prompts/SyntheticData/ReasoningQuestion.txt
index 5cb2a8f1e..9003d55aa 100644
--- a/applications/evaluation/Prompts/SyntheticData/ReasoningQuestion.txt
+++ b/applications/evaluation/Prompts/SyntheticData/ReasoningQuestion.txt
@@ -17,9 +17,10 @@ Complicate the given question by rewriting question into a multi-hop reasoning q
 2. Do not frame questions that contains more than 15 words. Use abbreviation wherever possible.
 3. Make sure the question is clear and unambiguous.
 4. phrases like 'based on the provided context','according to the context',etc are not allowed to appear in the question.
+5. Always answer in JSON format.
 
 Let's do it with a real data.
 
 question: {{$question}}
 context: {{$context}}
-output: 
\ No newline at end of file
+output: 
diff --git a/applications/evaluation/TestSetGenerator.cs b/applications/evaluation/TestSetGenerator.cs
index a85c6c36c..984e29b9c 100644
--- a/applications/evaluation/TestSetGenerator.cs
+++ b/applications/evaluation/TestSetGenerator.cs
@@ -8,6 +8,7 @@
 using System.Threading.Tasks;
 using Microsoft.Extensions.DependencyInjection;
 using Microsoft.KernelMemory.Evaluation.TestSet;
+using Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
 using Microsoft.KernelMemory.MemoryStorage;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
@@ -24,36 +25,45 @@ public sealed partial class TestSetGenerator : EvaluationEngine
     private KernelFunction Translate => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("Transmutation", "Translate"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0
     });
 
     private KernelFunction QuestionAnswerGeneration => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("SyntheticData", "QuestionAnswer"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     });
 
     private KernelFunction KeyPhraseExtraction => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Keyphrase"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0,
+        ResponseFormat = "json_object"
     });
 
     private KernelFunction SeedQuestionGeneration => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("SyntheticData", "SeedQuestion"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0
     });
 
     private KernelFunction ReasoningQuestionGeneration => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("SyntheticData", "ReasoningQuestion"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0
     });
 
     private KernelFunction MultiContextQuestionGeneration => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("SyntheticData", "MultiContextQuestion"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0
     });
 
     private KernelFunction ConditioningQuestionGeneration => this._evaluatorKernel.CreateFunctionFromPrompt(this.GetSKPrompt("SyntheticData", "ConditionalQuestion"), new OpenAIPromptExecutionSettings
     {
         Temperature = 1e-8f,
+        Seed = 0
     });
 
     internal TestSetGenerator(
@@ -343,10 +353,10 @@ private async Task<IEnumerable<string>> GetKeyPhrases(string context, int retryC
             { "input", context }
         }).ConfigureAwait(false);
 
-        return JsonSerializer.Deserialize<List<string>>(generatedKeyPhrases.GetValue<string>()!);
+        return JsonSerializer.Deserialize<StatementExtraction>(generatedKeyPhrases.GetValue<string>()!);
     }).ConfigureAwait(false);
 
-    return this.Shuffle(keyPhrases!);
+    return this.Shuffle(keyPhrases!.Statements!);
 }
 
 private Task<string> GetQuestionAnswerAsync(string context, string question, string language = null!, int retryCount = 3)
diff --git a/applications/tests/Evaluation.Tests/Evaluation.FunctionalTests.csproj b/applications/tests/Evaluation.Tests/Evaluation.FunctionalTests.csproj
index 80166cebe..4860c116d 100644
--- a/applications/tests/Evaluation.Tests/Evaluation.FunctionalTests.csproj
+++ b/applications/tests/Evaluation.Tests/Evaluation.FunctionalTests.csproj
@@ -11,6 +11,12 @@
     <IsPackable>false</IsPackable>
   </PropertyGroup>
 
+  <ItemGroup>
+    <None Update="...">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
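
Reviewer note, not part of the patch: every evaluator above now follows the same pattern: force json_object output, then deserialize the whole response into a small POCO instead of a bare list. The sketch below restates that pattern in isolation so it can be read without the surrounding diff. It reuses the StatementExtraction shape added by this PR, but the prompt text and the JsonModeSketch/ExtractStatementsAsync names are illustrative assumptions, not code from the repository.

```csharp
// Minimal sketch of the json_object + POCO pattern used throughout this PR.
// Assumes a Kernel already configured with an OpenAI-compatible chat model
// that supports JSON mode; names marked "hypothetical" are not in the repo.
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading.Tasks;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;

internal sealed class StatementExtraction // same shape as the POCO added by this PR
{
    [JsonPropertyName("statements")]
    public List<string> Statements { get; set; } = new();
}

internal static class JsonModeSketch // hypothetical helper, for illustration only
{
    internal static async Task<List<string>> ExtractStatementsAsync(Kernel kernel, string question, string answer)
    {
        // ResponseFormat = "json_object" makes the model emit a single JSON object,
        // so the few-shot examples in the prompts can promise {"statements": [...]}.
        // Seed = 0 plus a near-zero temperature keeps evaluation runs repeatable.
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        var extract = kernel.CreateFunctionFromPrompt(
            "Break the answer to {{$question}} into standalone statements: {{$answer}}", // placeholder prompt
            settings);

        var result = await extract.InvokeAsync(kernel, new KernelArguments
        {
            { "question", question },
            { "answer", answer }
        }).ConfigureAwait(false);

        // Deserialize the whole object, then unwrap the list, mirroring the evaluators above.
        var extraction = JsonSerializer.Deserialize<StatementExtraction>(result.GetValue<string>()!);
        return extraction?.Statements ?? new List<string>();
    }
}
```

The wrapper object is what makes json_object mode workable here: the mode requires a top-level JSON object rather than the bare arrays the old prompts produced, and a named property gives System.Text.Json a stable shape to bind to.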