# test_benchmark.py

from deepeval import assert_test
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import AnswerRelevancyMetric, GEval
from openai import OpenAI

def test_answer_relevancy():
    """Ask gpt-4o one question and evaluate its reply with two deepeval metrics.

    The reply is wrapped in an ``LLMTestCase`` and handed to ``assert_test``
    together with an ``AnswerRelevancyMetric`` and a "Correctness" ``GEval``
    metric, each configured with a threshold of 0.5.

    NOTE(review): an ``AssertionError`` raised by ``assert_test`` is caught
    and only printed, so this test does not fail when a metric fails. The
    ``expected_output`` text is unrelated to the question, which presumably
    is meant to provoke that failure -- confirm this is intentional.
    """
    question = "How do contract terms influence negotiations in software and SaaS?"
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant. Keep your answers concise and to the point."},
        {"role": "user", "content": question},
    ]

    # Single chat-completion request; its text becomes the actual output.
    llm = OpenAI()
    completion = llm.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0.9,
        max_tokens=600,
    )
    answer = completion.choices[0].message.content

    # Pair the question and the model's reply with the reference output.
    case = LLMTestCase(
        input=question,
        expected_output="paris is good",
        actual_output=answer,
    )

    # Relevancy metric first, then the GEval correctness metric.
    metrics = [
        AnswerRelevancyMetric(threshold=0.5, include_reason=True),
        GEval(
            name="Correctness",
            evaluation_params=[
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.INPUT,
            ],
            criteria="The actual output should be relevant to the input and match the expected output as closely as possible.",
            threshold=0.5,
        ),
    ]

    # Show the raw reply before evaluation for debugging.
    print(f"Actual output: {answer}")

    # An assertion failure from the evaluation is reported, not re-raised.
    try:
        assert_test(case, metrics)
    except AssertionError as failure:
        print(f"Test failed as expected: {failure}")