Merge branch 'main' into robot-test-actions
rkubis authored Jan 17, 2024
2 parents 0fba52a + 97a976b commit 8a0d674
Showing 22 changed files with 556 additions and 173 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-tests.yml
@@ -81,7 +81,7 @@ jobs:

- name: Upload documentation
if: matrix.session == 'docs-build'
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: docs
path: clients/python/docs/_build
2 changes: 1 addition & 1 deletion api/openapi/model-registry.yaml
@@ -1576,7 +1576,7 @@ components:
runtime:
description: Model runtime.
type: string
state:
desiredState:
$ref: '#/components/schemas/InferenceServiceState'
InferenceServiceCreate:
description: >-
2 changes: 1 addition & 1 deletion clients/python/noxfile.py
@@ -49,7 +49,7 @@ def mypy(session: Session) -> None:
def tests(session: Session) -> None:
"""Run the test suite."""
session.install(".")
session.install("coverage[toml]", "pytest", "pytest-cov", "pygments")
session.install("coverage[toml]", "pytest", "pytest-cov", "pygments", "testcontainers")
try:
session.run(
"pytest",
234 changes: 210 additions & 24 deletions clients/python/poetry.lock

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions clients/python/pyproject.toml
@@ -10,6 +10,10 @@ readme = "README.md"
python = ">= 3.9, < 3.11"
attrs = "^21.0"
ml-metadata = "^1.14.0"
# Testcontainers for python, remote mlmd grpc tested with:
# ml-metadata = { url = "https://github.com/tarilabs/ml-metadata/releases/download/1.14.0/ml_metadata-1.14.0-py3-none-any.whl" }
# or might consider as well:
# ml-metadata = { url = "https://github.com/tarilabs/ml-metadata-remote/releases/download/1.14.0/ml_metadata-1.14.0-py3-none-any.whl" }
typing-extensions = "^4.8"

[tool.poetry.group.dev.dependencies]
@@ -22,6 +26,7 @@ pytest-cov = "^4.1.0"
sphinx-autobuild = "^2021.3.14"
ruff = "^0.1.6"
mypy = "^1.7.0"
testcontainers = "^3.7.1"

[tool.coverage.run]
branch = true
86 changes: 81 additions & 5 deletions clients/python/tests/conftest.py
@@ -1,24 +1,81 @@
import os
import time
from typing import Union

import pytest
from ml_metadata import errors, metadata_store
from ml_metadata.proto import (
ArtifactType,
ConnectionConfig,
ContextType,
metadata_store_pb2,
)
from ml_metadata.proto.metadata_store_pb2 import MetadataStoreClientConfig
from model_registry.core import ModelRegistryAPIClient
from model_registry.store.wrapper import MLMDStore
from model_registry.types import ModelArtifact, ModelVersion, RegisteredModel
from testcontainers.core.container import DockerContainer
from testcontainers.core.waiting_utils import wait_for_logs

ProtoTypeType = Union[ArtifactType, ContextType]

# ruff: noqa: PT021 supported
@pytest.fixture(scope="session")
def mlmd_conn(request) -> MetadataStoreClientConfig:
    model_registry_root_dir = model_registry_root(request)
    print("Assuming this is the Model Registry root directory:", model_registry_root_dir)
    shared_volume = model_registry_root_dir / "test/config/ml-metadata"
    sqlite_db_file = shared_volume / "metadata.sqlite.db"
    if sqlite_db_file.exists():
        msg = f"The file {sqlite_db_file} already exists; make sure to delete it before running these tests."
        raise FileExistsError(msg)
    container = DockerContainer("gcr.io/tfx-oss-public/ml_metadata_store_server:1.14.0")
    container.with_exposed_ports(8080)
    container.with_volume_mapping(shared_volume, "/tmp/shared", "rw")  # noqa this is file target in container
    container.with_env("METADATA_STORE_SERVER_CONFIG_FILE", "/tmp/shared/conn_config.pb")  # noqa this is target in container
    container.start()
    wait_for_logs(container, "Server listening on")
    os.system('docker container ls --format "table {{.ID}}\t{{.Names}}\t{{.Ports}}" -a')  # noqa governed test
    print("waited for logs and port")
    cfg = MetadataStoreClientConfig(
        host="localhost",
        port=int(container.get_exposed_port(8080)),
    )
    print(cfg)

    # this callback is needed in order to perform the container.stop();
    # removing this callback might result in the mlmd container shutting down
    # before the tests have had a chance to fully run, resulting in gRPC connection resets.
    def teardown():
        container.stop()
        print("teardown of plain_wrapper completed.")

    request.addfinalizer(teardown)

    time.sleep(3)  # allow some time for mlmd gRPC to fully stabilize (it is "spent" once per pytest session anyway)
    _throwaway_store = metadata_store.MetadataStore(cfg)
    wait_for_grpc(container, _throwaway_store)

    return cfg


def model_registry_root(request):
    return (request.config.rootpath / "../..").resolve()  # resolves to absolute path


@pytest.fixture()
def plain_wrapper() -> MLMDStore:
    config = ConnectionConfig()
    config.fake_database.SetInParent()
    return MLMDStore(config)
def plain_wrapper(request, mlmd_conn: MetadataStoreClientConfig) -> MLMDStore:
    sqlite_db_file = model_registry_root(request) / "test/config/ml-metadata/metadata.sqlite.db"

    def teardown():
        try:
            os.remove(sqlite_db_file)
            print(f"Removed {sqlite_db_file} successfully.")
        except Exception as e:
            print(f"An error occurred while removing {sqlite_db_file}: {e}")
        print("plain_wrapper_after_each done.")

    request.addfinalizer(teardown)

    return MLMDStore(mlmd_conn)


def set_type_attrs(mlmd_obj: ProtoTypeType, name: str, props: list[str]):
@@ -78,3 +135,22 @@ def mr_api(store_wrapper: MLMDStore) -> ModelRegistryAPIClient:
    mr = object.__new__(ModelRegistryAPIClient)
    mr._store = store_wrapper
    return mr


def wait_for_grpc(container: DockerContainer, store: metadata_store.MetadataStore, timeout=6, interval=2):
    start = time.time()
    while True:
        duration = time.time() - start
        results = None
        try:
            results = store.get_contexts()
        except errors.UnavailableError as e:
            print(e)
            print("Container logs:\n", container.get_logs())
            print("Container not ready. Retrying...")
        if results is not None:
            return duration
        if timeout and duration > timeout:
            raise TimeoutError("wait_for_grpc not ready after %.3f seconds" % timeout)
        time.sleep(interval)
126 changes: 126 additions & 0 deletions doc/remote_only_packaging_of_MLMD_Python_lib.md
@@ -0,0 +1,126 @@
# Remote-only packaging of MLMD Python lib

## Context and Problem statement

Google’s ML Metadata (MLMD) is a project composed of a C++ server and a Python client library.
The server exposes a gRPC interface and is distributed only for x86-64 architectures.
It is embedded in the client's wheel binary, providing an additional convenience [method for running the server locally (in memory)](https://www.tensorflow.org/tfx/guide/mlmd#metadata_storage_backends_and_store_connection_configuration),
whilst also making it [architecture-specific](https://pypi.org/project/ml-metadata/1.14.0/#files).

The [Model Registry project](https://docs.google.com/document/d/1G-pjdGaS2kLELsB5kYk_D4AmH-fTfnCnJOhJ8xENjx0/edit?usp=sharing) (MR) is built on top of MLMD.
The Go implementation interfaces with the MLMD server via gRPC, typically available as a Docker container.
The [MR Python client](https://github.com/opendatahub-io/model-registry/tree/main/clients/python#readme) wraps the MLMD client.

As the MLMD client is architecture-specific, so is the MR Python client, which **severely limits the targets it can run on**, as it supports only x86-64.
This **poses many challenges to contributors** using other CPU architectures, especially ARM, which has become more prevalent in recent years.

Since the Model Registry python client does _not_ leverage the embedded MLMD server's in-memory connection, we present options for a “soft-fork” and re-distribution of the MLMD client as a pure Python library, making it platform/architecture agnostic.

The resulting artifact, an MLMD Python wheel that supports only remote gRPC connections and is at the same time a “pure library” (hence not requiring specific CPU architectures, OSes, or CPython versions), could then be published on PyPI under the ODH org and become the dependency of the [MR python client](/clients/python/README.md).
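
To make the intended usage concrete: with a remote-only client, the Python library connects to an already-running MLMD gRPC server instead of spawning the embedded one. A minimal sketch, assuming an MLMD server (e.g. the `ml_metadata_store_server` Docker image used in the tests of this commit) is already listening on `localhost:8080`:

```python
from ml_metadata import metadata_store
from ml_metadata.proto.metadata_store_pb2 import MetadataStoreClientConfig

# Remote-only usage: connect to an external MLMD gRPC server rather than
# the embedded, architecture-specific in-memory server.
cfg = MetadataStoreClientConfig(host="localhost", port=8080)
store = metadata_store.MetadataStore(cfg)

# Any simple read call doubles as a connectivity check.
print(store.get_contexts())
```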


## Goals

* consider the changes required to “soft-fork” the MLMD wheel so that it supports only remote gRPC connections
* repackage it as a pure library


## Non-Goals

* “hard”-fork of MLMD
* maintaining original MLMD tests (those bound to the embedded server)


## Proposed solution

Refer to the conclusions section for the motivations behind selecting:
1. soft-fork the upstream repo and modify the pip+bazel build to produce the distributable Python client ("Alternative B", below)
2. create an `ml-metadata-remote` or similarly named package on PyPI, based on the distributable wheel and files from step 1 above ("Packaging Option1", below)

For documentation purposes, the exploration of the different alternatives is reported below.


## Alternative A: repackage the resulting wheel

This solution explores the steps required to simply repackage the existing MLMD Python wheel distributed by Google, removing all embedded binaries and repackaging the wheel as a pure library.

This has been experimented with success in this repository ([link](https://github.com/tarilabs/ml-metadata-remote)).

The steps required are recorded here ([link](https://github.com/tarilabs/ml-metadata-remote/commits/v1.14.0))

and mainly consist of:

1. Download one platform-specific MLMD v1.14.0 wheel and extract its content ([reference](https://github.com/tarilabs/ml-metadata-remote/commit/39dd0c7dcd063e0440a6354017445dada8423f0c#diff-b335630551682c19a781afebcf4d07bf978fb1f8ac04c6bf87428ed5106870f5))
2. Remove embedded code, apply required code changes ([reference](https://github.com/tarilabs/ml-metadata-remote/commit/bcb1f0ffd37600e056342aff39e154bb35422668#diff-f363c85a1cf3536a48a7b721b02a6999b80a08b9c305d185327e87e2769b6f21))
3. Recompute dist-info checksums before repackaging ([reference](https://github.com/tarilabs/ml-metadata-remote/commit/fda125fb742ab8ecf4a7153705717d8b50f59326#diff-53bdc596caf062825dbb42b65e5b2305db70d2e533c03bc677b13cc8c7cfd236))
4. Repackage the directories as a new pure library wheel ([reference](https://github.com/tarilabs/ml-metadata-remote/commit/5d199f808eea0cb7ba78a0702be8de3306477df8))

The resulting artifact has been [tested](https://github.com/tarilabs/ml-metadata-remote#readme:~:text=Testing%20with%20launching%20a%20local%20server) locally with a gRPC connection to an MLMD server made available via Docker. The resulting artifact is directly available for local download ([link](https://github.com/tarilabs/ml-metadata-remote/releases/tag/1.14.0)).
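
For orientation, here is a minimal Python sketch of steps 1-4 above; the wheel file names are illustrative assumptions, the authoritative changes are in the commits linked above, and the source-level edits of step 2 are only hinted at in comments:

```python
import base64
import csv
import hashlib
import shutil
import zipfile
from pathlib import Path

# Illustrative input: one of Google's platform-specific wheels for v1.14.0.
SRC = "ml_metadata-1.14.0-cp39-cp39-manylinux_2_17_x86_64.whl"
WORK = Path("wheel_work")

# Step 1: extract the platform-specific wheel.
with zipfile.ZipFile(SRC) as whl:
    whl.extractall(WORK)

# Step 2 (partial): drop the embedded native binaries; the accompanying
# Python-source edits (removing embedded-server code paths) are not shown.
# The dist-info WHEEL metadata must also be edited by hand:
# Root-Is-Purelib: true, Tag: py3-none-any.
for so in WORK.rglob("*.so"):
    so.unlink()

# Step 3: recompute the dist-info RECORD checksums for the remaining files.
dist_info = next(WORK.glob("*.dist-info"))
record = dist_info / "RECORD"
with record.open("w", newline="") as f:
    writer = csv.writer(f)
    for path in sorted(p for p in WORK.rglob("*") if p.is_file() and p != record):
        data = path.read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
        writer.writerow([path.relative_to(WORK).as_posix(), f"sha256={digest}", len(data)])
    writer.writerow([record.relative_to(WORK).as_posix(), "", ""])

# Step 4: repackage the tree as a pure-library wheel.
shutil.make_archive("ml_metadata-1.14.0-py3-none-any", "zip", WORK)
shutil.move("ml_metadata-1.14.0-py3-none-any.zip", "ml_metadata-1.14.0-py3-none-any.whl")
```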


## Alternative B: build by soft-fork upstream repo

This solution explores using the upstream MLMD repo by Google and making the necessary code changes, so that the pip+bazel build directly produces the wheel as a pure library.

This has been experimented with success on this fork ([link](https://github.com/tarilabs/ml-metadata/commits/remote-r1.14.0)).

The steps required mainly consist of:

1. Make changes to the bazel BUILD file ([reference](https://github.com/tarilabs/ml-metadata/commit/079aeb3a9da69eb960e428a7866e279d0bfb533b#diff-c8858dec4f58c1d8a280af8c117ff8480f7ed4ae863b96e1ba20b52f83222aab))
2. Make changes to the sh build script ([reference](https://github.com/tarilabs/ml-metadata/commit/079aeb3a9da69eb960e428a7866e279d0bfb533b#diff-125a2f247ce39f711e1c8a77f430bd5b1b865cd10b5c5fef0d9140d276c617f2))
3. Make changes to the setup.py build file ([reference](https://github.com/tarilabs/ml-metadata/commit/079aeb3a9da69eb960e428a7866e279d0bfb533b#diff-60f61ab7a8d1910d86d9fda2261620314edcae5894d5aaa236b821c7256badd7))
4. Apply required code changes analogously to “Alternative A” (see other changes in [this commit](https://github.com/tarilabs/ml-metadata/commit/079aeb3a9da69eb960e428a7866e279d0bfb533b))

The resulting artifact has been [tested](https://github.com/tarilabs/ml-metadata/commit/794ec39d97e3ac70db2ca18fcf5807c44f339f0b) locally with a gRPC connection to an MLMD server made available via Docker, similar to the instructions provided in “Alternative A”.
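
To illustrate the direction of step 3 above: a common way to make `setup.py` emit a pure `py3-none-any` wheel is to override `bdist_wheel`. This is a sketch of the general technique, not the exact upstream diff (see the linked commit for the real changes):

```python
from setuptools import setup
from wheel.bdist_wheel import bdist_wheel


class PureBdistWheel(bdist_wheel):
    """Force a pure-library wheel, i.e. a py3-none-any tag."""

    def finalize_options(self):
        super().finalize_options()
        # With no native ext_modules declared and root_is_pure set, the built
        # wheel is no longer bound to a CPU architecture, OS, or CPython ABI.
        self.root_is_pure = True


setup(
    name="ml-metadata",
    version="1.14.0",
    packages=["ml_metadata"],  # illustrative; the real package list is larger
    cmdclass={"bdist_wheel": PureBdistWheel},
)
```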


## Packaging

In this section we consider packaging and delivery options for the resulting artifact from the alternative selected above.


### Packaging Option1: separate repo on ODH

This delivery option considers having a separate repo on ODH, called “ml-metadata-remote” (or similar), repeating the exercise from the alternative selected above in that repo, and then delivering it as a package on PyPI.

Pros:

* Well isolated dependency
* also, if one day upstream Google MLMD becomes platform/arch agnostic, it is just a matter of switching the MR python client's consumed dependency back
* Google code (copyright header) is isolated from Model Registry code
* The resulting artifact could also be re-used by other communities/users

Cons:

* Additional artifact to publish on PyPI


### Packaging Option2: mix resulting artifact inside Model Registry repo

This delivery option considers executing the exercise from the alternative selected above and placing the resulting artifact directly inside the Model Registry repo, within the python client source [location](https://github.com/opendatahub-io/model-registry/tree/main/clients/python). (For analogy, this is similar to “shading”/“uberjar” in the Java world, for those familiar with the concept.)

Pros:

* Only one artifact to publish on PyPI

Cons:

* Google code (copyright header) is mixed with Model Registry code
* at this stage it is not clear whether there are any implications for uplifting the MR project into the KF community
* The resulting artifact cannot be re-used by other communities/users
* If one day upstream Google MLMD becomes platform/arch agnostic, changing the MR python client back to the original ml-metadata dependency could require extra work and effort


## Conclusion

Based on the analysis of the alternatives provided, we decided to further pursue:
- the repackaging per **Alternative B**, because it makes it easier to demonstrate the steps and modifications required, using the upstream repo as the baseline.
- the distribution per **Packaging Option1**, because it will make it easier to "revert" to the upstream `ml-metadata` dependency if upstream ever publishes for all architectures, OSes, etc., and because the pros outweigh the considered cons.

MR python client [tests](https://github.com/opendatahub-io/model-registry/blob/259b39320953bf05942dcec1fb5ec74f7eb5d4a7/clients/python/tests/conftest.py#L19) should be rewritten using Testcontainers, and not leveraging the embedded server (already done with [this PR](https://github.com/opendatahub-io/model-registry/pull/225)).

The group concurs this is a sensible approach ([recorded here](https://redhat-internal.slack.com/archives/C05LGBNUK9C/p1700763823505259?thread_ts=1700427888.670999&cid=C05LGBNUK9C)).

This change also aligns the test approach used for the MR python client with the strategy of the MR core Go layer's test framework, which already makes use of Testcontainers for Go ([reference](https://github.com/opendatahub-io/model-registry/blob/259b39320953bf05942dcec1fb5ec74f7eb5d4a7/internal/testutils/test_container_utils.go#L59)).

This would also allow updating the version constraint on the `attrs` dependency as part of this activity.
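
As an illustration of the intended test style, a test against the new Testcontainers-backed fixtures in this commit's `conftest.py` could look as follows; treat `upsert_registered_model` and the `RegisteredModel` constructor signature as assumptions about the MR python client's core API:

```python
from model_registry.types import RegisteredModel


def test_registered_model_roundtrip(mr_api):
    # mr_api (from conftest.py) talks to the MLMD gRPC server started by
    # Testcontainers; no embedded in-memory server is involved.
    rm = RegisteredModel("test-model")
    rm_id = mr_api.upsert_registered_model(rm)
    assert rm_id is not None
```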
14 changes: 7 additions & 7 deletions go.mod
@@ -3,16 +3,16 @@ module github.com/opendatahub-io/model-registry
go 1.19

require (
	github.com/go-chi/chi/v5 v5.0.10
	github.com/go-chi/chi/v5 v5.0.11
	github.com/go-chi/cors v1.2.1
	github.com/golang/glog v1.2.0
	github.com/spf13/cobra v1.8.0
	github.com/spf13/pflag v1.0.5
	github.com/spf13/viper v1.18.1
	github.com/spf13/viper v1.18.2
	github.com/stretchr/testify v1.8.4
	github.com/testcontainers/testcontainers-go v0.26.0
	google.golang.org/grpc v1.59.0
	google.golang.org/protobuf v1.31.0
	google.golang.org/grpc v1.60.1
	google.golang.org/protobuf v1.32.0
)

require (
@@ -31,9 +31,9 @@ require (
	dario.cat/mergo v1.0.0 // indirect
	github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
	github.com/Microsoft/go-winio v0.6.1 // indirect
	github.com/Microsoft/hcsshim v0.11.1 // indirect
	github.com/Microsoft/hcsshim v0.11.4 // indirect
	github.com/cenkalti/backoff/v4 v4.2.1 // indirect
	github.com/containerd/containerd v1.7.7 // indirect
	github.com/containerd/containerd v1.7.11 // indirect
	github.com/cpuguy83/dockercfg v0.3.1 // indirect
	github.com/docker/distribution v2.8.2+incompatible // indirect
	github.com/docker/docker v24.0.7+incompatible // indirect
@@ -43,7 +43,7 @@
	github.com/go-ole/go-ole v1.2.6 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/uuid v1.4.0
	github.com/google/uuid v1.5.0
	github.com/hashicorp/hcl v1.0.0 // indirect
	github.com/inconshreveable/mousetrap v1.1.0 // indirect
	github.com/klauspost/compress v1.17.0 // indirect