[OPIK-641] create guardrails integration (#954)
* Draft implementation

* Add guardrails integration test

* Add docstring

* Add guardrails workflow

* Add protection from decorating the same object multiple times.

* Fix lint errors

* Update tests

* Move hub downloads to workflow file

* Update workflow file

* Add gr api key from environments

* Update workflow file

* Configure gh token in workflow via cli command

* Update gr token configuration

* Pop 3.9 py version from test matrix since it doesn't support guardrails CLI

* Separate hub installations from lib installation in workflow

* Update workflow to not install local models

* Remove obsolete mv command

* Update test to use LLM check only, update integration to get model from validator if available

* Fix lint errors
alexkuzmik authored Dec 24, 2024
1 parent f8d6678 commit af3dbcf
Showing 9 changed files with 267 additions and 3 deletions.
57 changes: 57 additions & 0 deletions .github/workflows/lib-guardrails-tests.yml
@@ -0,0 +1,57 @@
# Workflow to run Guardrails tests
#
# Please read inputs to provide correct values.
#
name: SDK Lib Guardrails Tests
run-name: "SDK Lib Guardrails Tests ${{ github.ref_name }} by @${{ github.actor }}"
env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  OPENAI_ORG_ID: ${{ secrets.OPENAI_ORG_ID }}
  GUARDRAILS_API_KEY: ${{ secrets.GUARDRAILS_API_KEY }}
on:
  workflow_call:

jobs:
  tests:
    name: Guardrails Python ${{matrix.python_version}}
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: sdks/python

    strategy:
      fail-fast: true
      matrix:
        python_version: ["3.10", "3.11", "3.12"]

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Setup Python ${{matrix.python_version}}
        uses: actions/setup-python@v5
        with:
          python-version: ${{matrix.python_version}}

      - name: Install opik
        run: pip install .

      - name: Install test tools
        run: |
          cd ./tests
          pip install --no-cache-dir --disable-pip-version-check -r test_requirements.txt
      - name: Install lib
        run: |
          cd ./tests
          pip install --no-cache-dir --disable-pip-version-check -r library_integration/guardrails/requirements.txt
      - name: Install checks from guardrails hub
        run: |
          guardrails configure --token $GUARDRAILS_API_KEY --disable-metrics --enable-remote-inferencing;
          guardrails hub install hub://guardrails/politeness_check
      - name: Run tests
        run: |
          cd ./tests/library_integration/guardrails/
          python -m pytest -vv .
7 changes: 7 additions & 0 deletions .github/workflows/lib-integration-tests-runner.yml
@@ -17,6 +17,7 @@ on:
          - anthropic
          - aisuite
          - haystack
          - guardrails
  schedule:
    - cron: "0 0 */1 * *"
  pull_request:
@@ -80,3 +81,9 @@ jobs:
    if: contains(fromJSON('["haystack", "all"]'), needs.init_environment.outputs.LIBS)
    uses: ./.github/workflows/lib-haystack-tests.yml
    secrets: inherit

  guardrails_tests:
    needs: [init_environment]
    if: contains(fromJSON('["guardrails", "all"]'), needs.init_environment.outputs.LIBS)
    uses: ./.github/workflows/lib-guardrails-tests.yml
    secrets: inherit
3 changes: 3 additions & 0 deletions sdks/python/src/opik/integrations/guardrails/__init__.py
@@ -0,0 +1,3 @@
from .guardrails_tracker import track_guardrails

__all__ = ["track_guardrails"]
86 changes: 86 additions & 0 deletions sdks/python/src/opik/integrations/guardrails/guardrails_decorator.py
@@ -0,0 +1,86 @@
import logging
from typing import (
    Any,
    AsyncGenerator,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Tuple,
    Union,
)

from guardrails import validators

from opik.decorator import arguments_helpers, base_track_decorator, inspect_helpers

LOGGER = logging.getLogger(__name__)

KWARGS_KEYS_TO_LOG_AS_INPUTS = ["value"]
RESPONSE_KEYS_TO_LOG_AS_OUTPUT = ["output"]


class GuardrailsValidatorValidateDecorator(base_track_decorator.BaseTrackDecorator):
    def _start_span_inputs_preprocessor(
        self,
        func: Callable,
        track_options: arguments_helpers.TrackOptions,
        args: Tuple,
        kwargs: Dict[str, Any],
    ) -> arguments_helpers.StartSpanParameters:
        name = track_options.name if track_options.name is not None else func.__name__
        metadata = track_options.metadata if track_options.metadata is not None else {}
        metadata.update({"created_from": "guardrails"})
        input = (
            inspect_helpers.extract_inputs(func, args, kwargs)
            if track_options.capture_input
            else None
        )

        validator_instance = func.__self__  # type: ignore
        model = (
            validator_instance.llm_callable
            if hasattr(validator_instance, "llm_callable")
            else None
        )
        if model is not None:
            metadata["model"] = model

        result = arguments_helpers.StartSpanParameters(
            name=name,
            input=input,
            type=track_options.type,
            metadata=metadata,
            project_name=track_options.project_name,
            model=model,
        )

        return result

    def _end_span_inputs_preprocessor(
        self, output: Any, capture_output: bool
    ) -> arguments_helpers.EndSpanParameters:
        assert isinstance(
            output,
            validators.ValidationResult,
        )
        tags = ["guardrails", output.outcome]

        result = arguments_helpers.EndSpanParameters(
            output=output,
            metadata=output.metadata,
            tags=tags,
        )

        return result

    def _generators_handler(
        self,
        output: Any,
        capture_output: bool,
        generations_aggregator: Optional[Callable[[List[Any]], str]],
    ) -> Optional[Union[Generator, AsyncGenerator]]:
        return super()._generators_handler(
            output, capture_output, generations_aggregator
        )
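
The commit log's "protection from decorating the same object multiple times" shows up in the tracker below as a check for an opik_tracked attribute on the validator's async_validate method. A minimal sketch of that marker pattern, assuming (as an illustration only, not the actual opik.decorator internals) that the real track decorator sets the flag on the wrappers it returns:

import functools

def track(func):
    # Skip functions that are already wrapped (marker set below).
    if getattr(func, "opik_tracked", False):
        return func

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Span start/end bookkeeping would happen here.
        return func(*args, **kwargs)

    wrapper.opik_tracked = True  # marker checked before re-decorating
    return wrapper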
39 changes: 39 additions & 0 deletions sdks/python/src/opik/integrations/guardrails/guardrails_tracker.py
@@ -0,0 +1,39 @@
from typing import Optional

import guardrails

from . import guardrails_decorator


def track_guardrails(
    guard: guardrails.Guard, project_name: Optional[str] = None
) -> guardrails.Guard:
    """
    Adds Opik tracking to a guardrails Guard instance.

    Every validation step will be logged as a trace.

    Args:
        guard: An instance of the Guard object.
        project_name: The name of the project to log data to.

    Returns:
        The modified Guard instance with Opik tracking enabled for its validators.
    """
    validators = guard._validators
    decorator_factory = guardrails_decorator.GuardrailsValidatorValidateDecorator()

    for validator in validators:
        if hasattr(validator.async_validate, "opik_tracked"):
            continue

        validate_decorator = decorator_factory.track(
            name=f"{validator.rail_alias}.validate",
            project_name=project_name,
            type="llm" if hasattr(validator, "llm_callable") else "general",
        )
        # Decorate the async version because it is the one called
        # under the hood by the guardrails engine.
        setattr(
            validator, "async_validate", validate_decorator(validator.async_validate)
        )

    return guard
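
For context, a minimal usage sketch of the new integration, mirroring the test below; the project name is illustrative, and a configured OpenAI key plus the hub-installed politeness check are assumed:

from guardrails import Guard, OnFailAction
from guardrails.hub import PolitenessCheck  # installed via `guardrails hub install`

from opik.integrations.guardrails import track_guardrails

politeness_check = PolitenessCheck(
    llm_callable="gpt-3.5-turbo", on_fail=OnFailAction.NOOP
)
guard = Guard().use_many(politeness_check)

# After this call, each validator's async_validate is wrapped,
# and every validation run is logged to Opik as a trace.
guard = track_guardrails(guard, project_name="my-project")

result = guard.validate("Would you be so kind to pass me a cup of tea?")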
Empty file.
1 change: 1 addition & 0 deletions sdks/python/tests/library_integration/guardrails/requirements.txt
@@ -0,0 +1 @@
guardrails-ai
@@ -0,0 +1,69 @@
import pytest
from guardrails import Guard, OnFailAction
from guardrails.hub import PolitenessCheck

import opik
from opik.config import OPIK_PROJECT_DEFAULT_NAME
from opik.integrations.guardrails.guardrails_tracker import track_guardrails

from ...testlib import ANY_BUT_NONE, ANY_DICT, SpanModel, TraceModel, assert_equal


@pytest.mark.parametrize(
    "project_name, expected_project_name",
    [
        (None, OPIK_PROJECT_DEFAULT_NAME),
        ("guardrails-integration-test", "guardrails-integration-test"),
    ],
)
def test_guardrails__trace_and_span_per_one_validation_check(
    fake_backend, ensure_openai_configured, project_name, expected_project_name
):
    politeness_check = PolitenessCheck(
        llm_callable="gpt-3.5-turbo", on_fail=OnFailAction.NOOP
    )

    guard: Guard = Guard().use_many(politeness_check)
    guard = track_guardrails(guard, project_name=project_name)

    result = guard.validate(
        "Would you be so kind to pass me a cup of tea?",
    )  # a polite request, so the check is expected to pass
    expected_result_tag = "pass" if result.validation_passed else "fail"
    opik.flush_tracker()

    EXPECTED_TRACE_TREE = TraceModel(
        id=ANY_BUT_NONE,
        name="guardrails/politeness_check.validate",
        input={
            "value": "Would you be so kind to pass me a cup of tea?",
            "metadata": ANY_DICT,
        },
        output=ANY_BUT_NONE,
        tags=["guardrails", expected_result_tag],
        metadata={"created_from": "guardrails", "model": "gpt-3.5-turbo"},
        start_time=ANY_BUT_NONE,
        end_time=ANY_BUT_NONE,
        project_name=expected_project_name,
        spans=[
            SpanModel(
                id=ANY_BUT_NONE,
                type="llm",
                name="guardrails/politeness_check.validate",
                input={
                    "value": "Would you be so kind to pass me a cup of tea?",
                    "metadata": ANY_DICT,
                },
                output=ANY_BUT_NONE,
                tags=["guardrails", expected_result_tag],
                metadata={"created_from": "guardrails", "model": "gpt-3.5-turbo"},
                start_time=ANY_BUT_NONE,
                end_time=ANY_BUT_NONE,
                project_name=expected_project_name,
                model="gpt-3.5-turbo",
                spans=[],
            )
        ],
    )

    assert_equal(EXPECTED_TRACE_TREE, fake_backend.trace_trees[0])
8 changes: 5 additions & 3 deletions sdks/python/tests/testlib/assert_helpers.py
@@ -40,9 +40,11 @@ def prepare_difference_report(expected: Any, actual: Any) -> str:


 def assert_equal(expected, actual):
-    # expected MUST be left argument so that __eq__ operators
-    # from our ANY* comparison helpers were called instead of __eq__ operators
-    # of the actual object
+    """
+    `expected` MUST be the left argument so that the __eq__ operators
+    from our ANY* comparison helpers are called instead of the __eq__
+    operators of the actual object.
+    """
     assert expected == actual, f"Details: {prepare_difference_report(actual, expected)}"
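
Why the argument order matters: Python tries the left operand's __eq__ first and only falls back to the right operand's reflected __eq__ when the left one returns NotImplemented. A self-contained sketch of the failure mode (the class names here are hypothetical, not the actual testlib helpers):

class AnyButNone:
    # Permissive comparison helper: equal to anything that is not None.
    def __eq__(self, other):
        return other is not None

class StrictValue:
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        # Returns False for foreign types instead of NotImplemented,
        # so the reflected comparison on the right operand never runs.
        return isinstance(other, StrictValue) and self.value == other.value

assert AnyButNone() == StrictValue(42)        # helper's __eq__ runs: passes
assert not (StrictValue(42) == AnyButNone())  # helper never consulted: False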

