Experimentation plus CAII true-up #119

Merged
merged 13 commits on Jan 31, 2025
4 changes: 2 additions & 2 deletions llm-service/app/services/caii/CaiiEmbeddingModel.py
@@ -66,7 +66,7 @@ def _get_query_embedding(self, query: str) -> Embedding:
return self._get_embedding(query, "query")

def _get_embedding(self, query: str, input_type: str) -> Embedding:
-        model = self.endpoint.endpointmetadata.model_name
+        model = self.endpoint.model_name
body = json.dumps(
{
"input": query,
@@ -99,7 +99,7 @@ def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]:
if len(texts) == 1:
return [self._get_text_embedding(texts[0])]

-        model = self.endpoint.endpointmetadata.model_name
+        model = self.endpoint.model_name
body = json.dumps(
{
"input": texts,
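The two hunks above only swap where the model name is read from. For illustration, a minimal sketch of the request-body construction after the change; the helper name and any payload fields other than `input` and `model` are assumptions, not taken from the repository:

```python
import json
from typing import Any, Dict


def make_embedding_request(endpoint: Any, text: str, input_type: str) -> str:
    """Hypothetical helper mirroring CaiiEmbeddingModel._get_embedding.

    The model name is read straight off the flattened endpoint object
    (endpoint.model_name) instead of endpoint.endpointmetadata.model_name.
    """
    payload: Dict[str, Any] = {
        "input": text,
        "input_type": input_type,  # "query" or "passage"; assumed, the diff only shows "query"
        "model": endpoint.model_name,
    }
    return json.dumps(payload)
```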
2 changes: 1 addition & 1 deletion llm-service/app/services/caii/caii.py
@@ -95,7 +95,7 @@ def get_llm(
api_base = endpoint.url.removesuffix("/chat/completions")
headers = build_auth_headers()

-    model = endpoint.endpointmetadata.model_name
+    model = endpoint.model_name
if "mistral" in endpoint_name.lower():
llm = CaiiModelMistral(
model=model,
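Same true-up in `get_llm`: the model name now comes from `endpoint.model_name`, and the endpoint URL is stripped back to an API base before the LLM wrapper is built. A self-contained sketch; the `_LlmChoice` stand-in, the example URL, and the non-Mistral wrapper name are assumptions, only the suffix strip, the model-name lookup, and the Mistral check come from the diff:

```python
from dataclasses import dataclass


@dataclass
class _LlmChoice:
    """Stand-in for the CAII LLM wrappers constructed in caii.py."""
    wrapper: str
    model: str
    api_base: str


def pick_caii_llm(endpoint, endpoint_name: str) -> _LlmChoice:
    # e.g. "https://host/.../v1/chat/completions" (hypothetical URL) -> "https://host/.../v1"
    api_base = endpoint.url.removesuffix("/chat/completions")
    model = endpoint.model_name  # previously endpoint.endpointmetadata.model_name
    wrapper = "CaiiModelMistral" if "mistral" in endpoint_name.lower() else "CaiiModel"
    return _LlmChoice(wrapper=wrapper, model=model, api_base=api_base)
```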
22 changes: 7 additions & 15 deletions llm-service/app/services/caii/types.py
@@ -36,7 +36,7 @@
# DATA.
#
from dataclasses import dataclass
-from typing import List, Dict, Any, Optional
+from typing import Dict, Any, Optional

from pydantic import BaseModel, ConfigDict

@@ -69,14 +69,8 @@
#


-class EndpointMetadata(BaseModel):
-    model_config = ConfigDict(protected_namespaces=())
-    # current_model: Optional[RegistrySource]
-    # previous_model: Optional[RegistrySource]
-    model_name: str


class Endpoint(BaseModel):
model_config = ConfigDict(protected_namespaces=(), extra='ignore')
namespace: str
name: str
url: str
@@ -91,27 +85,25 @@ class Endpoint(BaseModel):
resources: Dict[str, str]
# source: Dict[str, RegistrySource]
autoscaling: Dict[str, Any]
-    endpointmetadata: EndpointMetadata
+    model_name: str
traffic: Dict[str, str]
api_standard: str
has_chat_template: bool
-    metricFormat: str
task: str
instance_type: str
+    metric_format: str


-@dataclass
-class ListEndpointEntry:
+class ListEndpointEntry(BaseModel):
+    model_config = ConfigDict(extra='ignore')
namespace: str
name: str
url: str
state: str
created_by: str
replica_count: int
-    replica_metadata: List[Any]
api_standard: str
has_chat_template: bool
-    metricFormat: str
+    metric_format: str


@dataclass
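With `EndpointMetadata` gone, `model_name` lives directly on `Endpoint`, and `extra='ignore'` lets the model validate a raw CAII response without declaring every field. A trimmed-down sketch; the field subset and the payload are hypothetical, only the config options and the top-level `model_name` mirror the diff:

```python
from pydantic import BaseModel, ConfigDict


class EndpointSketch(BaseModel):
    """Trimmed stand-in for the reworked Endpoint model (not the full field list)."""

    # protected_namespaces=() silences pydantic's warning about the model_ prefix;
    # extra='ignore' drops any response fields that are not declared here.
    model_config = ConfigDict(protected_namespaces=(), extra="ignore")

    namespace: str
    name: str
    url: str
    model_name: str


# Hypothetical describe-endpoint payload; the undeclared keys are silently ignored.
raw = {
    "namespace": "serving-default",
    "name": "llama-endpoint",
    "url": "https://caii.example.com/v1/chat/completions",
    "model_name": "meta/llama-3.1-8b-instruct",
    "endpointmetadata": {"model_name": "meta/llama-3.1-8b-instruct"},  # no longer modeled
}

endpoint = EndpointSketch.model_validate(raw)
print(endpoint.model_name)  # -> meta/llama-3.1-8b-instruct
```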
241 changes: 0 additions & 241 deletions llm-service/performance_testing/all_off.csv

This file was deleted.

164 changes: 92 additions & 72 deletions llm-service/performance_testing/performance_testing.py
@@ -35,6 +35,7 @@
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#
import itertools
import os
import sys
import time
@@ -55,98 +56,117 @@
from app.services.query.querier import CUSTOM_PROMPT
from app.services.query.chat_engine import FlexibleContextChatEngine

test_runtime_config = {
"reranking_model": [
model.model_id for model in models.get_available_rerank_models()
],
"synthesis_model": ["meta.llama3-1-8b-instruct-v1:0"],
"top_k": [5, 10],
"hyde": [True, False],
}
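Side note on the grid above: the reworked loop further down in `main()` expands `test_runtime_config` with `itertools.product`, one run per combination. A small standalone sketch (the rerank id is a placeholder for whatever `models.get_available_rerank_models()` returns):

```python
import itertools

# Placeholder grid; in the script the rerank ids come from models.get_available_rerank_models().
test_runtime_config = {
    "reranking_model": ["amazon.rerank-v1:0"],
    "synthesis_model": ["meta.llama3-1-8b-instruct-v1:0"],
    "top_k": [5, 10],
    "hyde": [True, False],
}

configs = [
    dict(zip(test_runtime_config.keys(), values))
    for values in itertools.product(*test_runtime_config.values())
]
print(len(configs))  # 1 * 1 * 2 * 2 = 4 combinations
print(configs[0])    # {'reranking_model': 'amazon.rerank-v1:0', ..., 'top_k': 5, 'hyde': True}
```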


# usage: uv run --env-file=../.env performance_testing/performance_testing.py <data_source_id> questions_mini.csv
def main():
data_source_id: int = int(sys.argv[1])
file: str = sys.argv[2]
metadata = get_metadata(data_source_id)
summarization_model = metadata.summarization_model
chunk_size = metadata.chunk_size
data_source_id = int(sys.argv[1])
with open(os.path.abspath(os.path.join(os.path.dirname(__file__), file)), "r") as f:
df = pd.read_csv(f)
questions: list[str] = df["Question"].tolist()

with open(
os.path.abspath(os.path.join(os.path.dirname(__file__), "raw_results.csv")), "a"
) as details:
for synthesis_model in ["meta.llama3-1-8b-instruct-v1:0", "meta.llama3-1-70b-instruct-v1:0"]:
for reranking_model in models.get_available_rerank_models():
for top_k in [5, 10]:
for hyde in [True, False]:
print(f"Running with hyde={hyde}")
score_sum = 0
question_count = 0
max_score = 0
max_score_sum = 0
min_max_score = 10000000
relevance_sum = 0
faithfulness_sum = 0
for question in questions:
chat_engine = setup(
use_hyde=hyde,
data_source_id=data_source_id,
top_k=top_k,
reranking_model=reranking_model.model_id,
synthesis_model=synthesis_model
)
chat_response: AgentChatResponse = chat_engine.chat(
message=question, chat_history=None
)
# Relevance - Measures if the response and source nodes match the query. This is useful for measuring if the query was actually answered by the response.
# Faithfulness - Measures if the response from a query engine matches any source nodes. This is useful for measuring if the response was hallucinated.
relevance, faithfulness = evaluators.evaluate_response(
query=question,
chat_response=chat_response,
model_name=summarization_model,
)
relevance_sum += relevance
faithfulness_sum += faithfulness

nodes = chat_response.source_nodes

if nodes:
question_count += 1
question_max = max(node.score for node in nodes)
max_score = max(max_score, question_max)
avg_score = sum(node.score for node in nodes) / len(nodes)
score_sum += avg_score
max_score_sum += max_score
min_max_score = min(max_score, min_max_score)
# timestamp, hyde, summarization_model, top_k, file_name_1, max_score, relevance, faithfulness, question
details.write(
f'{time.time()},{hyde},{summarization_model},{top_k},{nodes[0].metadata.get("file_name")},{question_max},{relevance},{faithfulness},"{question}"\n'
)
details.flush()

average_average_score = score_sum / question_count
average_max_score = max_score_sum / question_count
relevance_average = relevance_sum / question_count
faithfulness_average = faithfulness_sum / question_count
print(f"{chat_engine._configuration=}")
# print(f"Average score: {average_average_score}")
with open(
os.path.abspath(
os.path.join(os.path.dirname(__file__), "results.csv")
),
"a",
) as f:
# chunk_size,summarization_model,reranking_model,synthesis_model,hyde,top_k,average_max_score,min_max_score,relevance_average,faithfulness_average
f.write(f"{chunk_size},{summarization_model},{reranking_model.model_id},{synthesis_model},{hyde},{top_k},{average_max_score},{min_max_score},{relevance_average},{faithfulness_average}\n")
f.flush()
for config in [
dict(zip(test_runtime_config.keys(), values))
for values in itertools.product(*test_runtime_config.values())
]:
print(f"Config: {config}")
top_k = config["top_k"]
hyde = config["hyde"]
reranking_model = config["reranking_model"]
synthesis_model = config["synthesis_model"]

metadata = get_metadata(data_source_id)
summarization_model = metadata.summarization_model
chunk_size = metadata.chunk_size

score_sum = 0
question_count = 0
max_score = 0
max_score_sum = 0
min_max_score = 10000000
relevance_sum = 0
faithfulness_sum = 0
for question in questions:
chat_engine = setup(
data_source_id=data_source_id,
hyde=hyde,
top_k=top_k,
synthesis_model=synthesis_model,
reranking_model=reranking_model,
)
chat_response: AgentChatResponse = chat_engine.chat(
message=question, chat_history=None
)
# Relevance - Measures if the response and source nodes match the query. This is useful for measuring if the query was actually answered by the response.
# Faithfulness - Measures if the response from a query engine matches any source nodes. This is useful for measuring if the response was hallucinated.
relevance, faithfulness = evaluators.evaluate_response(
query=question,
chat_response=chat_response,
model_name=summarization_model,
)
relevance_sum += relevance
faithfulness_sum += faithfulness

nodes = chat_response.source_nodes

if nodes:
question_count += 1
question_max = max(node.score for node in nodes)
max_score = max(max_score, question_max)
avg_score = sum(node.score for node in nodes) / len(nodes)
score_sum += avg_score
max_score_sum += max_score
min_max_score = min(max_score, min_max_score)
# timestamp,chunk_size, hyde, summarization_model,reranking_model,top_k, file_name_1, max_score, relevance, faithfulness, question
details.write(
f'{time.time()},{chunk_size},{hyde},{summarization_model},{reranking_model},{top_k},{nodes[0].metadata.get("file_name")},{question_max},{relevance},{faithfulness},"{question}"\n'
)
details.flush()

average_average_score = score_sum / question_count
average_max_score = max_score_sum / question_count
relevance_average = relevance_sum / question_count
faithfulness_average = faithfulness_sum / question_count
print(f"{chat_engine._configuration=}")
# print(f"Average score: {average_average_score}")
with open(
os.path.abspath(os.path.join(os.path.dirname(__file__), "results.csv")),
"a",
) as f:
# chunk_size,summarization_model,reranking_model,synthesis_model,hyde,top_k,average_max_score,min_max_score,relevance_average,faithfulness_average
f.write(
f"{chunk_size},{summarization_model},{reranking_model},{synthesis_model},{hyde},{top_k},{average_max_score},{min_max_score},{relevance_average},{faithfulness_average}\n"
)
f.flush()


def setup(
-    data_source_id: int, use_hyde=True, top_k: int = 5,
-    synthesis_model="meta.llama3-1-8b-instruct-v1:0", reranking_model="amazon.rerank-v1:0") -> FlexibleContextChatEngine:
+    data_source_id: int,
+    hyde=True,
+    top_k: int = 5,
+    synthesis_model="meta.llama3-1-8b-instruct-v1:0",
+    reranking_model="amazon.rerank-v1:0",
+) -> FlexibleContextChatEngine:
model_name = synthesis_model
rerank_model = reranking_model
query_configuration = QueryConfiguration(
top_k=5,
model_name=model_name,
use_question_condensing=True,
-        use_hyde=use_hyde,
-        rerank_model_name=rerank_model
+        use_hyde=hyde,
+        rerank_model_name=rerank_model,
)
llm = models.get_llm(model_name=query_configuration.model_name)
qdrant_store = QdrantVectorStore.for_chunks(data_source_id)
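Lastly, a hedged example of consuming the `results.csv` this script appends to. The column list is copied from the comment above the `f.write()` call in `main()`; reading without a header row and the ranking criterion are assumptions about how one might inspect the sweep, not part of the PR:

```python
import pandas as pd

# Column order follows the comment in main(); results.csv is written without a header row.
columns = [
    "chunk_size", "summarization_model", "reranking_model", "synthesis_model",
    "hyde", "top_k", "average_max_score", "min_max_score",
    "relevance_average", "faithfulness_average",
]
results = pd.read_csv("results.csv", names=columns)

# Rank configurations by faithfulness, then relevance (illustrative choice).
ranked = results.sort_values(
    ["faithfulness_average", "relevance_average"], ascending=False
)
print(ranked.head(10))
```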