From 231ed88e21ec53f40441ed643ef062966ba802b7 Mon Sep 17 00:00:00 2001 From: Aliaksandr Kuzmik <98702584+alexkuzmik@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:59:46 +0100 Subject: [PATCH] [OPIK-573] Implement get_experiment_by_[name, id] and implement Experiment.get_items() (#868) * Draft implementation of new Experiment API * Add Opik.get_experiment_by_name method * Fix lint errors * Add new properties to experiment * Fix lint errors * Add ExperimentItemContent to __all__ * Update the e2e test for experiment to include items content check * Fix lint errors * Update evaluation unit tests * Remove prompt property * Add docstrings, fix lint errors * Add docstrings to experiment methods * Make both get_experiment_by_* methods raise ExperimentNotFound error if experiment was not found * Update evaluation unit test * Fix lint errors * Rename fields in ExperimentItemContent * Fix lint errors --- sdks/python/src/opik/__init__.py | 6 + .../opik/api_objects/experiment/experiment.py | 76 +++++++++++-- .../api_objects/experiment/experiment_item.py | 40 ++++++- .../opik/api_objects/experiment/helpers.py | 21 ++++ .../src/opik/api_objects/opik_client.py | 53 +++++++++ sdks/python/src/opik/evaluation/evaluator.py | 1 + sdks/python/src/opik/evaluation/scorer.py | 4 +- sdks/python/src/opik/exceptions.py | 4 + .../opik/plugins/pytest/experiment_runner.py | 6 +- sdks/python/tests/e2e/test_experiment.py | 104 +++++++++++++++--- .../tests/unit/evaluation/test_evaluate.py | 14 ++- 11 files changed, 294 insertions(+), 35 deletions(-) diff --git a/sdks/python/src/opik/__init__.py b/sdks/python/src/opik/__init__.py index 2e057cbaf0..6dda699447 100644 --- a/sdks/python/src/opik/__init__.py +++ b/sdks/python/src/opik/__init__.py @@ -4,6 +4,10 @@ from .api_objects.trace import Trace from .api_objects.span import Span from .api_objects.dataset import Dataset +from .api_objects.experiment.experiment_item import ( + ExperimentItemReferences, + ExperimentItemContent, +) from . 
import _logging from .configurator.configure import configure from . import package_version @@ -18,6 +22,8 @@ "__version__", "evaluate", "evaluate_experiment", + "ExperimentItemContent", + "ExperimentItemReferences", "track", "flush_tracker", "Opik", diff --git a/sdks/python/src/opik/api_objects/experiment/experiment.py b/sdks/python/src/opik/api_objects/experiment/experiment.py index 392227ebce..6eab879f08 100644 --- a/sdks/python/src/opik/api_objects/experiment/experiment.py +++ b/sdks/python/src/opik/api_objects/experiment/experiment.py @@ -1,4 +1,5 @@ import logging +import functools from typing import List, Optional from opik.rest_api import client as rest_api_client @@ -21,21 +22,51 @@ def __init__( rest_client: rest_api_client.OpikApi, prompt: Optional[Prompt] = None, ) -> None: - self.id = id - self.name = name - self.dataset_name = dataset_name + self._id = id + self._name = name + self._dataset_name = dataset_name self._rest_client = rest_client - self.prompt = prompt + self._prompt = prompt - def insert(self, experiment_items: List[experiment_item.ExperimentItem]) -> None: + @property + def id(self) -> str: + return self._id + + @functools.cached_property + def dataset_id(self) -> str: + return self._rest_client.datasets.get_dataset_by_identifier( + dataset_name=self._dataset_name + ).id + + @functools.cached_property + def name(self) -> str: + if self._name is not None: + return self._name + + return self._rest_client.experiments.get_experiment_by_id(id=self.id).name + + def insert( + self, + experiment_items_references: List[experiment_item.ExperimentItemReferences], + ) -> None: + """ + Creates a new experiment item by linking the existing trace and dataset item. + + Args: + experiment_items_references: The list of ExperimentItemReferences objects, containing + trace id and dataset item id to link together into experiment item. 
+ + Returns: + None + """ rest_experiment_items = [ rest_experiment_item.ExperimentItem( - id=item.id if item.id is not None else helpers.generate_id(), + id=helpers.generate_id(), experiment_id=self.id, dataset_item_id=item.dataset_item_id, trace_id=item.trace_id, ) - for item in experiment_items + for item in experiment_items_references ] batches = sequence_splitter.split_into_batches( @@ -48,3 +79,34 @@ def insert(self, experiment_items: List[experiment_item.ExperimentItem]) -> None experiment_items=batch, ) LOGGER.debug("Sent experiment items batch of size %d", len(batch)) + + def get_items(self) -> List[experiment_item.ExperimentItemContent]: + """ + Returns: + List[ExperimentItemContent]: the list with contents of existing experiment items. + """ + result: List[experiment_item.ExperimentItemContent] = [] + + page = 0 + + while True: # TODO: refactor this logic when backend implements a proper streaming endpoint + page += 1 + dataset_items_page = ( + self._rest_client.datasets.find_dataset_items_with_experiment_items( + id=self.dataset_id, + experiment_ids=f'["{self.id}"]', + page=page, + size=100, + ) + ) + if len(dataset_items_page.content) == 0: + break + + for dataset_item in dataset_items_page.content: + rest_experiment_item_compare = dataset_item.experiment_items[0] + content = experiment_item.ExperimentItemContent.from_rest_experiment_item_compare( + value=rest_experiment_item_compare + ) + result.append(content) + + return result diff --git a/sdks/python/src/opik/api_objects/experiment/experiment_item.py b/sdks/python/src/opik/api_objects/experiment/experiment_item.py index 47e4251a7f..e66365dbeb 100644 --- a/sdks/python/src/opik/api_objects/experiment/experiment_item.py +++ b/sdks/python/src/opik/api_objects/experiment/experiment_item.py @@ -1,10 +1,44 @@ import dataclasses -from typing import Optional +from typing import Dict, Any, List, Optional +from opik.types import FeedbackScoreDict +from opik.rest_api.types import experiment_item_compare 
@dataclasses.dataclass -class ExperimentItem: +class ExperimentItemReferences: dataset_item_id: str trace_id: str - id: Optional[str] = None + + +@dataclasses.dataclass +class ExperimentItemContent: + id: str + dataset_item_id: str + trace_id: str + dataset_item_data: Optional[Dict[str, Any]] + evaluation_task_output: Optional[Dict[str, Any]] + feedback_scores: List[FeedbackScoreDict] + + @classmethod + def from_rest_experiment_item_compare( + cls, + value: experiment_item_compare.ExperimentItemCompare, + ) -> "ExperimentItemContent": + feedback_scores: List[FeedbackScoreDict] = [ + { + "category_name": rest_feedback_score.category_name, + "name": rest_feedback_score.name, + "reason": rest_feedback_score.reason, + "value": rest_feedback_score.value, + } + for rest_feedback_score in value.feedback_scores + ] + return ExperimentItemContent( + id=value.id, + trace_id=value.trace_id, + dataset_item_id=value.dataset_item_id, + dataset_item_data=value.input, + evaluation_task_output=value.output, + feedback_scores=feedback_scores, + ) diff --git a/sdks/python/src/opik/api_objects/experiment/helpers.py b/sdks/python/src/opik/api_objects/experiment/helpers.py index 4a02b6f10c..6874a9ea66 100644 --- a/sdks/python/src/opik/api_objects/experiment/helpers.py +++ b/sdks/python/src/opik/api_objects/experiment/helpers.py @@ -2,6 +2,9 @@ from .. 
import prompt import logging from opik import jsonable_encoder +from opik.rest_api import OpikApi +from opik.rest_api.types import experiment_public +from opik import exceptions LOGGER = logging.getLogger(__name__) @@ -38,3 +41,21 @@ def build_metadata_and_prompt_version( metadata = jsonable_encoder.jsonable_encoder(experiment_config) return metadata, prompt_version + + +def get_experiment_data_by_name( + rest_client: OpikApi, name: str +) -> experiment_public.ExperimentPublic: + page = 0 + + while True: + page += 1 + experiment_page_public = rest_client.experiments.find_experiments(name=name, page=page) + if len(experiment_page_public.content) == 0: + raise exceptions.ExperimentNotFound( + f"Experiment with the name {name} not found." + ) + + for experiment in experiment_page_public.content: + if experiment.name == name: + return experiment diff --git a/sdks/python/src/opik/api_objects/opik_client.py b/sdks/python/src/opik/api_objects/opik_client.py index 1bf1638c46..0c055f07a7 100644 --- a/sdks/python/src/opik/api_objects/opik_client.py +++ b/sdks/python/src/opik/api_objects/opik_client.py @@ -19,6 +19,7 @@ constants, validation_helpers, ) +from .experiment import helpers as experiment_helpers from ..message_processing import streamer_constructors, messages from ..message_processing.batching import sequence_splitter @@ -26,6 +27,7 @@ from ..rest_api.types import dataset_public, trace_public, span_public, project_public from ..rest_api.core.api_error import ApiError from .. import ( + exceptions, datetime_helpers, config, httpx_client, @@ -525,6 +527,57 @@ def create_experiment( return experiment_ + def get_experiment_by_name(self, name: str) -> experiment.Experiment: + """ + Returns an existing experiment by its name. + + Args: + name: The name of the experiment. + + Returns: + experiment.Experiment: the API object for an existing experiment. 
+ """ + experiment_public = experiment_helpers.get_experiment_data_by_name( + rest_client=self._rest_client, name=name + ) + + return experiment.Experiment( + id=experiment_public.id, + name=name, + dataset_name=experiment_public.dataset_name, + rest_client=self._rest_client, + # TODO: add prompt if exists + ) + + def get_experiment_by_id(self, id: str) -> experiment.Experiment: + """ + Returns an existing experiment by its id. + + Args: + id: The id of the experiment. + + Returns: + experiment.Experiment: the API object for an existing experiment. + """ + try: + experiment_public = self._rest_client.experiments.get_experiment_by_id( + id=id + ) + except ApiError as exception: + if exception.status_code == 404: + raise exceptions.ExperimentNotFound( + f"Experiment with the id {id} not found." + ) from exception + raise + + return experiment.Experiment( + id=experiment_public.id, + name=experiment_public.name, + dataset_name=experiment_public.dataset_name, + rest_client=self._rest_client, + # TODO: add prompt if exists + ) + def end(self, timeout: Optional[int] = None) -> None: """ End the Opik session and submit all pending messages. 
diff --git a/sdks/python/src/opik/evaluation/evaluator.py b/sdks/python/src/opik/evaluation/evaluator.py index 2c321f7851..0ac17a75dd 100644 --- a/sdks/python/src/opik/evaluation/evaluator.py +++ b/sdks/python/src/opik/evaluation/evaluator.py @@ -109,6 +109,7 @@ def evaluate( experiment_name=experiment.name, test_results=test_results, ) + return evaluation_result_ diff --git a/sdks/python/src/opik/evaluation/scorer.py b/sdks/python/src/opik/evaluation/scorer.py index 1f6dda33d5..649704f14d 100644 --- a/sdks/python/src/opik/evaluation/scorer.py +++ b/sdks/python/src/opik/evaluation/scorer.py @@ -125,12 +125,12 @@ def _process_item( trace_data.init_end_time() client.trace(**trace_data.__dict__) - experiment_item_ = experiment_item.ExperimentItem( + experiment_item_ = experiment_item.ExperimentItemReferences( dataset_item_id=item.id, trace_id=trace_data.id, ) - experiment_.insert(experiment_items=[experiment_item_]) + experiment_.insert(experiment_items_references=[experiment_item_]) def score_tasks( diff --git a/sdks/python/src/opik/exceptions.py b/sdks/python/src/opik/exceptions.py index dff0df5ba1..7566609cf6 100644 --- a/sdks/python/src/opik/exceptions.py +++ b/sdks/python/src/opik/exceptions.py @@ -36,3 +36,7 @@ def __str__(self) -> str: f"Format arguments: {list(self.format_arguments)}. " f"Difference: {list(self.symmetric_difference)}. 
" ) + + +class ExperimentNotFound(OpikException): + pass diff --git a/sdks/python/src/opik/plugins/pytest/experiment_runner.py b/sdks/python/src/opik/plugins/pytest/experiment_runner.py index 99f90ba8b2..e124a306e5 100644 --- a/sdks/python/src/opik/plugins/pytest/experiment_runner.py +++ b/sdks/python/src/opik/plugins/pytest/experiment_runner.py @@ -47,7 +47,7 @@ def run(client: opik_client.Opik, test_items: List[Item]) -> None: experiment = client.create_experiment(name=experiment_name, dataset_name="tests") - experiment_items: List[experiment_item.ExperimentItem] = [] + experiment_items: List[experiment_item.ExperimentItemReferences] = [] dataset_items_to_create: List[dataset_item.DatasetItem] = [] for test_item in test_items: @@ -70,12 +70,12 @@ def run(client: opik_client.Opik, test_items: List[Item]) -> None: dataset_items_to_create.append(dataset_item_) experiment_items.append( - experiment_item.ExperimentItem( + experiment_item.ExperimentItemReferences( dataset_item_id=dataset_item_id, trace_id=test_run_trace_id, ) ) dataset.__internal_api__insert_items_as_dataclasses__(items=dataset_items_to_create) - experiment.insert(experiment_items=experiment_items) + experiment.insert(experiment_items_references=experiment_items) client.flush() diff --git a/sdks/python/tests/e2e/test_experiment.py b/sdks/python/tests/e2e/test_experiment.py index beb57c8701..6db1b44921 100644 --- a/sdks/python/tests/e2e/test_experiment.py +++ b/sdks/python/tests/e2e/test_experiment.py @@ -2,11 +2,15 @@ import opik -from opik import Prompt, synchronization +from opik import Prompt, synchronization, exceptions from opik.api_objects.dataset import dataset_item from opik.evaluation import metrics +from opik.api_objects.experiment import experiment_item from . 
import verifiers from .conftest import _random_chars +from ..testlib import assert_equal, ANY_BUT_NONE + +import pytest def test_experiment_creation_via_evaluate_function__happyflow( @@ -75,22 +79,74 @@ def task(item: Dict[str, Any]): feedback_scores_amount=1, prompt=prompt, ) - # TODO: check more content of the experiment - # - # EXPECTED_DATASET_ITEMS = [ - # dataset_item.DatasetItem( - # input={"question": "What is the of capital of France?"}, - # expected_model_output={"output": "Paris"}, - # ), - # dataset_item.DatasetItem( - # input={"question": "What is the of capital of Germany?"}, - # expected_model_output={"output": "Berlin"}, - # ), - # dataset_item.DatasetItem( - # input={"question": "What is the of capital of Poland?"}, - # expected_model_output={"output": "Warsaw"}, - # ), - # ] + + retrieved_experiment = opik_client.get_experiment_by_name(experiment_name) + experiment_items_contents = retrieved_experiment.get_items() + assert len(experiment_items_contents) == 3 + + EXPECTED_EXPERIMENT_ITEMS_CONTENT = [ + experiment_item.ExperimentItemContent( + id=ANY_BUT_NONE, + dataset_item_id=ANY_BUT_NONE, + trace_id=ANY_BUT_NONE, + dataset_item_data={ + "input": {"question": "What is the of capital of France?"}, + "expected_model_output": {"output": "Paris"}, + }, + evaluation_task_output={"output": "Paris"}, + feedback_scores=[ + { + "category_name": None, + "name": "equals_metric", + "reason": None, + "value": 1.0, + } + ], + ), + experiment_item.ExperimentItemContent( + id=ANY_BUT_NONE, + dataset_item_id=ANY_BUT_NONE, + trace_id=ANY_BUT_NONE, + dataset_item_data={ + "input": {"question": "What is the of capital of Germany?"}, + "expected_model_output": {"output": "Berlin"}, + }, + evaluation_task_output={"output": "Berlin"}, + feedback_scores=[ + { + "category_name": None, + "name": "equals_metric", + "reason": None, + "value": 1.0, + } + ], + ), + experiment_item.ExperimentItemContent( + id=ANY_BUT_NONE, + dataset_item_id=ANY_BUT_NONE, + 
trace_id=ANY_BUT_NONE, + dataset_item_data={ + "input": {"question": "What is the of capital of Poland?"}, + "expected_model_output": {"output": "Warsaw"}, + }, + evaluation_task_output={"output": "Krakow"}, + feedback_scores=[ + { + "category_name": None, + "name": "equals_metric", + "reason": None, + "value": 0.0, + } + ], + ), + ] + assert_equal( + sorted( + EXPECTED_EXPERIMENT_ITEMS_CONTENT, + key=lambda item: str(item.dataset_item_data), + ), + sorted(experiment_items_contents, key=lambda item: str(item.dataset_item_data)), + ) def test_experiment_creation__experiment_config_not_set__None_metadata_sent_to_backend( @@ -323,3 +379,17 @@ def task(item: Dict[str, Any]): feedback_scores_amount=3, prompt=prompt, ) + + +def test_experiment__get_experiment_by_id__experiment_not_found__ExperimentNotFound_error_is_raised( + opik_client: opik.Opik, +): + with pytest.raises(exceptions.ExperimentNotFound): + opik_client.get_experiment_by_id("not-existing-id") + + +def test_experiment__get_experiment_by_name__experiment_not_found__ExperimentNotFound_error_is_raised( + opik_client: opik.Opik, +): + with pytest.raises(exceptions.ExperimentNotFound): + opik_client.get_experiment_by_name("not-existing-name") diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py index 4df38a7baa..4551e0e49d 100644 --- a/sdks/python/tests/unit/evaluation/test_evaluate.py +++ b/sdks/python/tests/unit/evaluation/test_evaluate.py @@ -71,7 +71,10 @@ def say_task(dataset_item: Dict[str, Any]): ) mock_experiment.insert.assert_has_calls( - [mock.call(experiment_items=mock.ANY), mock.call(experiment_items=mock.ANY)] + [ + mock.call(experiment_items_references=mock.ANY), + mock.call(experiment_items_references=mock.ANY), + ] ) EXPECTED_TRACE_TREES = [ TraceModel( @@ -182,7 +185,10 @@ def say_task(dataset_item: Dict[str, Any]): prompt=None, ) mock_experiment.insert.assert_has_calls( - [mock.call(experiment_items=mock.ANY), 
mock.call(experiment_items=mock.ANY)] + [ + mock.call(experiment_items_references=mock.ANY), + mock.call(experiment_items_references=mock.ANY), + ] ) EXPECTED_TRACE_TREES = [ @@ -329,7 +335,9 @@ def say_task(dataset_item: Dict[str, Any]): prompt=None, ) - mock_experiment.insert.assert_called_once_with(experiment_items=mock.ANY) + mock_experiment.insert.assert_called_once_with( + experiment_items_references=[mock.ANY] + ) EXPECTED_TRACE_TREE = TraceModel( id=ANY_BUT_NONE, name="evaluation_task",