diff --git a/sdks/python/src/opik/evaluation/scorer.py b/sdks/python/src/opik/evaluation/scorer.py
index 80237371ba..1f6dda33d5 100644
--- a/sdks/python/src/opik/evaluation/scorer.py
+++ b/sdks/python/src/opik/evaluation/scorer.py
@@ -4,10 +4,13 @@
 from typing import List, Optional, Dict, Any, Union, Callable
 
 from .types import LLMTask
+from opik.types import ErrorInfoDict
+
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects.experiment import experiment, experiment_item
 from opik.api_objects import opik_client, trace
 from opik import context_storage, opik_context, exceptions
+from opik.decorator import error_info_collector
 
 from . import test_case, test_result
 from .metrics import arguments_helpers, score_result, base_metric
@@ -72,6 +75,8 @@ def _process_item(
     ],
 ) -> test_result.TestResult:
     try:
+        error_info: Optional[ErrorInfoDict] = None
+
         trace_data = trace.TraceData(
             input=item.get_content(),
             name="evaluation_task",
@@ -80,8 +85,13 @@ def _process_item(
         )
         context_storage.set_trace_data(trace_data)
         item_content = item.get_content()
+
         LOGGER.debug("Task started, input: %s", item_content)
-        task_output_ = task(item_content)
+        try:
+            task_output_ = task(item_content)
+        except Exception as exception:
+            error_info = error_info_collector.collect(exception)
+            raise
         LOGGER.debug("Task finished, output: %s", task_output_)
 
         opik_context.update_current_trace(output=task_output_)
@@ -107,7 +117,12 @@ def _process_item(
     finally:
         trace_data = context_storage.pop_trace_data()  # type: ignore
+        assert trace_data is not None
+
+        if error_info is not None:
+            trace_data.error_info = error_info
+
         trace_data.init_end_time()
         client.trace(**trace_data.__dict__)
 
         experiment_item_ = experiment_item.ExperimentItem(
diff --git a/sdks/python/tests/library_integration/llama_index/test_llama_index.py b/sdks/python/tests/library_integration/llama_index/test_llama_index.py
index c9b6962fa3..b0c8fafc12 100644
--- a/sdks/python/tests/library_integration/llama_index/test_llama_index.py
+++ b/sdks/python/tests/library_integration/llama_index/test_llama_index.py
@@ -7,6 +7,7 @@
 from opik.config import OPIK_PROJECT_DEFAULT_NAME
 from opik.integrations.llama_index import LlamaIndexCallbackHandler
+
 from ...testlib import ANY_BUT_NONE, TraceModel, assert_equal
 
diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py
index 722e2937eb..4df38a7baa 100644
--- a/sdks/python/tests/unit/evaluation/test_evaluate.py
+++ b/sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -1,12 +1,14 @@
 import mock
 import pytest
+
 from typing import Dict, Any
 
+import opik
 from opik.api_objects.dataset import dataset_item
 from opik.api_objects import opik_client
 from opik import evaluation, exceptions, url_helpers
 from opik.evaluation import metrics
-from ...testlib import ANY_BUT_NONE, assert_equal
+from ...testlib import ANY_BUT_NONE, ANY_STRING, assert_equal
 from ...testlib.models import (
     TraceModel,
     FeedbackScoreModel,
@@ -277,3 +279,73 @@ def say_task(dataset_item: Dict[str, Any]):
     )
 
     mock_dataset.__internal_api__get_items_as_dataclasses__.assert_called_once()
+
+
+def test_evaluate__exception_raised_from_the_task__error_info_added_to_the_trace(
+    fake_backend,
+):
+    mock_dataset = mock.MagicMock(spec=["__internal_api__get_items_as_dataclasses__"])
+    mock_dataset.name = "the-dataset-name"
+    mock_dataset.__internal_api__get_items_as_dataclasses__.return_value = [
+        dataset_item.DatasetItem(
+            id="dataset-item-id-1",
+            input={"message": "say hello"},
+            reference="hello",
+        ),
+    ]
+
+    def say_task(dataset_item: Dict[str, Any]):
+        raise Exception("some-error-message")
+
+    mock_experiment = mock.Mock()
+    mock_create_experiment = mock.Mock()
+    mock_create_experiment.return_value = mock_experiment
+
+    mock_get_experiment_url = mock.Mock()
+    mock_get_experiment_url.return_value = "any_url"
+
+    with mock.patch.object(
+        opik_client.Opik, "create_experiment", mock_create_experiment
+    ):
+        with mock.patch.object(
+            url_helpers, "get_experiment_url", mock_get_experiment_url
+        ):
+            with pytest.raises(Exception):
+                evaluation.evaluate(
+                    dataset=mock_dataset,
+                    task=say_task,
+                    experiment_name="the-experiment-name",
+                    scoring_metrics=[],
+                    task_threads=1,
+                )
+            opik.flush_tracker()
+
+    mock_dataset.__internal_api__get_items_as_dataclasses__.assert_called_once()
+
+    mock_create_experiment.assert_called_once_with(
+        dataset_name="the-dataset-name",
+        name="the-experiment-name",
+        experiment_config=None,
+        prompt=None,
+    )
+
+    mock_experiment.insert.assert_called_once_with(experiment_items=mock.ANY)
+    EXPECTED_TRACE_TREE = TraceModel(
+        id=ANY_BUT_NONE,
+        name="evaluation_task",
+        input={
+            "input": {"message": "say hello"},
+            "reference": "hello",
+        },
+        output=None,
+        start_time=ANY_BUT_NONE,
+        end_time=ANY_BUT_NONE,
+        error_info={
+            "exception_type": "Exception",
+            "message": "some-error-message",
+            "traceback": ANY_STRING(),
+        },
+        spans=[],
+    )
+
+    assert_equal(EXPECTED_TRACE_TREE, fake_backend.trace_trees[0])
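
Editor note (not part of the diff): the scorer.py change captures the task exception as structured error info, re-raises it, and attaches that info to the trace in the `finally` block so the failed trace is still logged with `error_info`. Below is a minimal standalone sketch of that pattern; `collect_error_info`, `TraceData`, `log_trace`, and `process_item` are illustrative stand-ins, not the actual Opik API.

import traceback
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional


def collect_error_info(exc: Exception) -> Dict[str, str]:
    # Stand-in for opik.decorator.error_info_collector.collect: builds the same
    # shape the new test asserts on (exception_type / message / traceback).
    return {
        "exception_type": type(exc).__name__,
        "message": str(exc),
        "traceback": traceback.format_exc(),
    }


@dataclass
class TraceData:
    # Stand-in for opik.api_objects.trace.TraceData.
    input: Dict[str, Any]
    output: Optional[Dict[str, Any]] = None
    error_info: Optional[Dict[str, str]] = None


def log_trace(trace_data: TraceData) -> None:
    # Stand-in for client.trace(**trace_data.__dict__).
    print("trace logged:", trace_data)


def process_item(
    task: Callable[[Dict[str, Any]], Dict[str, Any]], item: Dict[str, Any]
) -> None:
    error_info: Optional[Dict[str, str]] = None
    trace_data = TraceData(input=item)
    try:
        try:
            trace_data.output = task(item)
        except Exception as exception:
            # Capture structured error details, then re-raise so callers still
            # see the failure (mirrors the try/except added in _process_item).
            error_info = collect_error_info(exception)
            raise
    finally:
        # Mirror of the `finally` change: attach error info before logging.
        if error_info is not None:
            trace_data.error_info = error_info
        log_trace(trace_data)


def failing_task(item: Dict[str, Any]) -> Dict[str, Any]:
    raise Exception("some-error-message")


if __name__ == "__main__":
    try:
        process_item(failing_task, {"message": "say hello"})
    except Exception:
        pass  # the trace was already logged with error_info in the finally block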