Experimentation plus CAII true-up #119

Merged
merged 13 commits on Jan 31, 2025
4 changes: 2 additions & 2 deletions llm-service/app/services/caii/CaiiEmbeddingModel.py
@@ -66,7 +66,7 @@ def _get_query_embedding(self, query: str) -> Embedding:
return self._get_embedding(query, "query")

def _get_embedding(self, query: str, input_type: str) -> Embedding:
-        model = self.endpoint.endpointmetadata.model_name
+        model = self.endpoint.model_name
body = json.dumps(
{
"input": query,
@@ -99,7 +99,7 @@ def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]:
if len(texts) == 1:
return [self._get_text_embedding(texts[0])]

-        model = self.endpoint.endpointmetadata.model_name
+        model = self.endpoint.model_name
body = json.dumps(
{
"input": texts,
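The two hunks above only swap where the model name is read from. For illustration, a minimal sketch of the request-body construction after the change; the helper name and any payload fields other than `input` and `model` are assumptions, not taken from the repository:

```python
import json
from typing import Any, Dict


def make_embedding_request(endpoint: Any, text: str, input_type: str) -> str:
    """Hypothetical helper mirroring CaiiEmbeddingModel._get_embedding.

    The model name is read straight off the flattened endpoint object
    (endpoint.model_name) instead of endpoint.endpointmetadata.model_name.
    """
    payload: Dict[str, Any] = {
        "input": text,
        "input_type": input_type,  # "query" or "passage"; assumed, the diff only shows "query"
        "model": endpoint.model_name,
    }
    return json.dumps(payload)
```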
2 changes: 1 addition & 1 deletion llm-service/app/services/caii/caii.py
@@ -95,7 +95,7 @@ def get_llm(
api_base = endpoint.url.removesuffix("/chat/completions")
headers = build_auth_headers()

-    model = endpoint.endpointmetadata.model_name
+    model = endpoint.model_name
if "mistral" in endpoint_name.lower():
llm = CaiiModelMistral(
model=model,
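Same true-up in `get_llm`: the model name now comes from `endpoint.model_name`, and the endpoint URL is stripped back to an API base before the LLM wrapper is built. A self-contained sketch; the `_LlmChoice` stand-in, the example URL, and the non-Mistral wrapper name are assumptions, only the suffix strip, the model-name lookup, and the Mistral check come from the diff:

```python
from dataclasses import dataclass


@dataclass
class _LlmChoice:
    """Stand-in for the CAII LLM wrappers constructed in caii.py."""
    wrapper: str
    model: str
    api_base: str


def pick_caii_llm(endpoint, endpoint_name: str) -> _LlmChoice:
    # e.g. "https://host/.../v1/chat/completions" (hypothetical URL) -> "https://host/.../v1"
    api_base = endpoint.url.removesuffix("/chat/completions")
    model = endpoint.model_name  # previously endpoint.endpointmetadata.model_name
    wrapper = "CaiiModelMistral" if "mistral" in endpoint_name.lower() else "CaiiModel"
    return _LlmChoice(wrapper=wrapper, model=model, api_base=api_base)
```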
22 changes: 7 additions & 15 deletions llm-service/app/services/caii/types.py
@@ -36,7 +36,7 @@
# DATA.
#
from dataclasses import dataclass
-from typing import List, Dict, Any, Optional
+from typing import Dict, Any, Optional

from pydantic import BaseModel, ConfigDict

@@ -69,14 +69,8 @@
#


-class EndpointMetadata(BaseModel):
-    model_config = ConfigDict(protected_namespaces=())
-    # current_model: Optional[RegistrySource]
-    # previous_model: Optional[RegistrySource]
-    model_name: str


class Endpoint(BaseModel):
model_config = ConfigDict(protected_namespaces=(), extra='ignore')
namespace: str
name: str
url: str
@@ -91,27 +85,25 @@ class Endpoint(BaseModel):
resources: Dict[str, str]
# source: Dict[str, RegistrySource]
autoscaling: Dict[str, Any]
-    endpointmetadata: EndpointMetadata
+    model_name: str
traffic: Dict[str, str]
api_standard: str
has_chat_template: bool
-    metricFormat: str
task: str
instance_type: str
+    metric_format: str


-@dataclass
-class ListEndpointEntry:
+class ListEndpointEntry(BaseModel):
+    model_config = ConfigDict(extra='ignore')
namespace: str
name: str
url: str
state: str
created_by: str
replica_count: int
-    replica_metadata: List[Any]
api_standard: str
has_chat_template: bool
-    metricFormat: str
+    metric_format: str


@dataclass
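With `EndpointMetadata` gone, `model_name` lives directly on `Endpoint`, and `extra='ignore'` lets the model validate a raw CAII response without declaring every field. A trimmed-down sketch; the field subset and the payload are hypothetical, only the config options and the top-level `model_name` mirror the diff:

```python
from pydantic import BaseModel, ConfigDict


class EndpointSketch(BaseModel):
    """Trimmed stand-in for the reworked Endpoint model (not the full field list)."""

    # protected_namespaces=() silences pydantic's warning about the model_ prefix;
    # extra='ignore' drops any response fields that are not declared here.
    model_config = ConfigDict(protected_namespaces=(), extra="ignore")

    namespace: str
    name: str
    url: str
    model_name: str


# Hypothetical describe-endpoint payload; the undeclared keys are silently ignored.
raw = {
    "namespace": "serving-default",
    "name": "llama-endpoint",
    "url": "https://caii.example.com/v1/chat/completions",
    "model_name": "meta/llama-3.1-8b-instruct",
    "endpointmetadata": {"model_name": "meta/llama-3.1-8b-instruct"},  # no longer modeled
}

endpoint = EndpointSketch.model_validate(raw)
print(endpoint.model_name)  # -> meta/llama-3.1-8b-instruct
```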
241 changes: 0 additions & 241 deletions llm-service/performance_testing/all_off.csv

This file was deleted.

164 changes: 92 additions & 72 deletions llm-service/performance_testing/performance_testing.py
@@ -35,6 +35,7 @@
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#
import itertools
import os
import sys
import time
@@ -55,98 +56,117 @@
from app.services.query.querier import CUSTOM_PROMPT
from app.services.query.chat_engine import FlexibleContextChatEngine

test_runtime_config = {
"reranking_model": [
model.model_id for model in models.get_available_rerank_models()
],
"synthesis_model": ["meta.llama3-1-8b-instruct-v1:0"],
"top_k": [5, 10],
"hyde": [True, False],
}
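Side note on the grid above: the reworked loop further down in `main()` expands `test_runtime_config` with `itertools.product`, one run per combination. A small standalone sketch (the rerank id is a placeholder for whatever `models.get_available_rerank_models()` returns):

```python
import itertools

# Placeholder grid; in the script the rerank ids come from models.get_available_rerank_models().
test_runtime_config = {
    "reranking_model": ["amazon.rerank-v1:0"],
    "synthesis_model": ["meta.llama3-1-8b-instruct-v1:0"],
    "top_k": [5, 10],
    "hyde": [True, False],
}

configs = [
    dict(zip(test_runtime_config.keys(), values))
    for values in itertools.product(*test_runtime_config.values())
]
print(len(configs))  # 1 * 1 * 2 * 2 = 4 combinations
print(configs[0])    # {'reranking_model': 'amazon.rerank-v1:0', ..., 'top_k': 5, 'hyde': True}
```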


# usage: uv run --env-file=../.env performance_testing/performance_testing.py <data_source_id> questions_mini.csv
def main():
data_source_id: int = int(sys.argv[1])
file: str = sys.argv[2]
metadata = get_metadata(data_source_id)
summarization_model = metadata.summarization_model
chunk_size = metadata.chunk_size
data_source_id = int(sys.argv[1])
with open(os.path.abspath(os.path.join(os.path.dirname(__file__), file)), "r") as f:
df = pd.read_csv(f)
questions: list[str] = df["Question"].tolist()

with open(
os.path.abspath(os.path.join(os.path.dirname(__file__), "raw_results.csv")), "a"
) as details:
for synthesis_model in ["meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0"]:
for reranking_model in models.get_available_rerank_models():
for top_k in [5, 10]:
for hyde in [True, False]:
print(f"Running with hyde={hyde}")
score_sum = 0
question_count = 0
max_score = 0
max_score_sum = 0
min_max_score = 10000000
relevance_sum = 0
faithfulness_sum = 0
for question in questions:
chat_engine = setup(
use_hyde=hyde,
data_source_id=data_source_id,
top_k=top_k,
reranking_model=reranking_model.model_id,
synthesis_model=synthesis_model
)
chat_response: AgentChatResponse = chat_engine.chat(
message=question, chat_history=None
)
# Relevance - Measures if the response and source nodes match the query. This is useful for measuring if the query was actually answered by the response.
# Faithfulness - Measures if the response from a query engine matches any source nodes. This is useful for measuring if the response was hallucinated.
relevance, faithfulness = evaluators.evaluate_response(
query=question,
chat_response=chat_response,
model_name=summarization_model,
)
relevance_sum += relevance
faithfulness_sum += faithfulness

nodes = chat_response.source_nodes

if nodes:
question_count += 1
question_max = max(node.score for node in nodes)
max_score = max(max_score, question_max)
avg_score = sum(node.score for node in nodes) / len(nodes)
score_sum += avg_score
max_score_sum += max_score
min_max_score = min(max_score, min_max_score)
# timestamp, hyde, summarization_model, top_k, file_name_1, max_score, relevance, faithfulness, question
details.write(
f'{time.time()},{hyde},{summarization_model},{top_k},{nodes[0].metadata.get("file_name")},{question_max},{relevance},{faithfulness},"{question}"\n'
)
details.flush()

average_average_score = score_sum / question_count
average_max_score = max_score_sum / question_count
relevance_average = relevance_sum / question_count
faithfulness_average = faithfulness_sum / question_count
print(f"{chat_engine._configuration=}")
# print(f"Average score: {average_average_score}")
with open(
os.path.abspath(
os.path.join(os.path.dirname(__file__), "results.csv")
),
"a",
) as f:
# chunk_size,summarization_model,reranking_model,synthesis_model,hyde,top_k,average_max_score,min_max_score,relevance_average,faithfulness_average
f.write(f"{chunk_size},{summarization_model},{reranking_model.model_id},{synthesis_model},{hyde},{top_k},{average_max_score},{min_max_score},{relevance_average},{faithfulness_average}\n")
f.flush()
for config in [
dict(zip(test_runtime_config.keys(), values))
for values in itertools.product(*test_runtime_config.values())
]:
print(f"Config: {config}")
top_k = config["top_k"]
hyde = config["hyde"]
reranking_model = config["reranking_model"]
synthesis_model = config["synthesis_model"]

metadata = get_metadata(data_source_id)
summarization_model = metadata.summarization_model
chunk_size = metadata.chunk_size

score_sum = 0
question_count = 0
max_score = 0
max_score_sum = 0
min_max_score = 10000000
relevance_sum = 0
faithfulness_sum = 0
for question in questions:
chat_engine = setup(
data_source_id=data_source_id,
hyde=hyde,
top_k=top_k,
synthesis_model=synthesis_model,
reranking_model=reranking_model,
)
chat_response: AgentChatResponse = chat_engine.chat(
message=question, chat_history=None
)
# Relevance - Measures if the response and source nodes match the query. This is useful for measuring if the query was actually answered by the response.
# Faithfulness - Measures if the response from a query engine matches any source nodes. This is useful for measuring if the response was hallucinated.
relevance, faithfulness = evaluators.evaluate_response(
query=question,
chat_response=chat_response,
model_name=summarization_model,
)
relevance_sum += relevance
faithfulness_sum += faithfulness

nodes = chat_response.source_nodes

if nodes:
question_count += 1
question_max = max(node.score for node in nodes)
max_score = max(max_score, question_max)
avg_score = sum(node.score for node in nodes) / len(nodes)
score_sum += avg_score
max_score_sum += max_score
min_max_score = min(max_score, min_max_score)
# timestamp,chunk_size, hyde, summarization_model,reranking_model,top_k, file_name_1, max_score, relevance, faithfulness, question
details.write(
f'{time.time()},{chunk_size},{hyde},{summarization_model},{reranking_model},{top_k},{nodes[0].metadata.get("file_name")},{question_max},{relevance},{faithfulness},"{question}"\n'
)
details.flush()

average_average_score = score_sum / question_count
average_max_score = max_score_sum / question_count
relevance_average = relevance_sum / question_count
faithfulness_average = faithfulness_sum / question_count
print(f"{chat_engine._configuration=}")
# print(f"Average score: {average_average_score}")
with open(
os.path.abspath(os.path.join(os.path.dirname(__file__), "results.csv")),
"a",
) as f:
# chunk_size,summarization_model,reranking_model,synthesis_model,hyde,top_k,average_max_score,min_max_score,relevance_average,faithfulness_average
f.write(
f"{chunk_size},{summarization_model},{reranking_model},{synthesis_model},{hyde},{top_k},{average_max_score},{min_max_score},{relevance_average},{faithfulness_average}\n"
)
f.flush()


def setup(
-    data_source_id: int, use_hyde=True, top_k: int = 5,
-    synthesis_model="meta.llama3-1-8b-instruct-v1:0", reranking_model="amazon.rerank-v1:0") -> FlexibleContextChatEngine:
+    data_source_id: int,
+    hyde=True,
+    top_k: int = 5,
+    synthesis_model="meta.llama3-1-8b-instruct-v1:0",
+    reranking_model="amazon.rerank-v1:0",
+) -> FlexibleContextChatEngine:
model_name = synthesis_model
rerank_model = reranking_model
query_configuration = QueryConfiguration(
top_k=5,
model_name=model_name,
use_question_condensing=True,
-        use_hyde=use_hyde,
-        rerank_model_name=rerank_model
+        use_hyde=hyde,
+        rerank_model_name=rerank_model,
)
llm = models.get_llm(model_name=query_configuration.model_name)
qdrant_store = QdrantVectorStore.for_chunks(data_source_id)
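Lastly, a hedged example of consuming the `results.csv` this script appends to. The column list is copied from the comment above the `f.write()` call in `main()`; reading without a header row and the ranking criterion are assumptions about how one might inspect the sweep, not part of the PR:

```python
import pandas as pd

# Column order follows the comment in main(); results.csv is written without a header row.
columns = [
    "chunk_size", "summarization_model", "reranking_model", "synthesis_model",
    "hyde", "top_k", "average_max_score", "min_max_score",
    "relevance_average", "faithfulness_average",
]
results = pd.read_csv("results.csv", names=columns)

# Rank configurations by faithfulness, then relevance (illustrative choice).
ranked = results.sort_values(
    ["faithfulness_average", "relevance_average"], ascending=False
)
print(ranked.head(10))
```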