diff --git a/evaluation/auto_evaluation/dataset/preprocess.py b/evaluation/auto_evaluation/dataset/preprocess.py
index 49c59f0f..3a7c6f9f 100644
--- a/evaluation/auto_evaluation/dataset/preprocess.py
+++ b/evaluation/auto_evaluation/dataset/preprocess.py
@@ -48,8 +48,12 @@ def read_deepeval_cache():
                 metric["metric_data"]["success"]
             )
 
-    print("Metric Scores: ", metric_scores)
-    print("Metric Passes: ", metric_passes)
+    print("Average Metric Scores: ")
+    for key, value in metric_scores.items():
+        print(key, sum(value) / len(value))
+    print("Metric Passrates: ")
+    for key, value in metric_passes.items():
+        print(key, value.count(True) / len(value))
 
 
 if __name__ == "__main__":
diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py
index ecffb108..31a64748 100644
--- a/evaluation/auto_evaluation/src/models/vertex_ai.py
+++ b/evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -6,8 +6,6 @@
 import instructor
 
 from typing import Any
-
-# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
 from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory  # type: ignore
 from deepeval.models.base_model import DeepEvalBaseLLM
 from pydantic import BaseModel