
commit 6eb6951
parent 6f98313
Author: mike0sv
Date: May 19, 2022

    fix batch and dataset types
Showing 8 changed files with 70 additions and 57 deletions.
16 changes: 4 additions & 12 deletions mlem/api/commands.py
@@ -101,13 +101,8 @@ def apply(
             return res[0]
         return res
     if len(res) == 1:
-        return save(
-            res[0], output, repo=target_repo, external=external, index=index
-        )
-
-    raise NotImplementedError(
-        "Saving several input data objects is not implemented yet"
-    )
+        res = res[0]
+    return save(res, output, repo=target_repo, external=external, index=index)
 
 
 def apply_remote(
@@ -152,11 +147,8 @@ def apply_remote(
             return res[0]
         return res
     if len(res) == 1:
-        return save(res[0], output, repo=target_repo, index=index)
-
-    raise NotImplementedError(
-        "Saving several input data objects is not implemented yet"
-    )
+        res = res[0]
+    return save(res, output, repo=target_repo, index=index)
 
 
 def clone(
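
Note: with this change a single result is unwrapped before saving, and multiple results fall through to save() as a list instead of raising NotImplementedError. A minimal sketch of the call this unblocks (the paths are hypothetical):

    from mlem.api import apply, load

    # One input dataset -> the single prediction batch is saved directly;
    # several inputs -> the list of results is saved as a list dataset.
    apply("models/rf", "datasets/iris", method="predict", output="preds")
    predictions = load("preds")
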
2 changes: 1 addition & 1 deletion mlem/api/utils.py
@@ -9,7 +9,7 @@
 
 def get_dataset_value(dataset: Any, batch_size: Optional[int] = None) -> Any:
     if isinstance(dataset, str):
-        return load(dataset)
+        return load(dataset, batch_size=batch_size)
     if isinstance(dataset, MlemDataset):
         # TODO: https://github.com/iterative/mlem/issues/29
         # fix discrepancies between model and data meta objects
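
Note: batch_size was previously dropped whenever the dataset was given as a string reference, so batched apply loaded everything eagerly. A sketch of the fixed path (the dataset location is hypothetical):

    from mlem.api.utils import get_dataset_value

    # load() now receives batch_size and can return a batch iterator
    # instead of materializing the full dataset in memory.
    batches = get_dataset_value("datasets/large", batch_size=1000)
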
3 changes: 2 additions & 1 deletion mlem/core/base.py
@@ -163,12 +163,13 @@ def build_mlem_object(
     **kwargs,
 ):
     not_links, links = parse_links(model, str_conf or [])
+    if model.__is_root__:
+        kwargs[model.__config__.type_field] = subtype
     return build_model(
         model,
         str_conf=not_links,
         file_conf=file_conf,
         conf=conf,
-        **{model.__config__.type_field: subtype},
         **kwargs,
         **links,
     )
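
Note: the type discriminator is now injected only for polymorphic root types, and through kwargs rather than a separate dict expansion, so building a concrete (non-root) type no longer receives an unexpected type field. The rule, restated as a small sketch (duck-typed model argument; type_field is typically "type"):

    def inject_type_field(model, subtype, kwargs):
        # Only a polymorphic root needs the discriminator to pick a subtype;
        # concrete classes are built as-is.
        if model.__is_root__:
            kwargs[model.__config__.type_field] = subtype
        return kwargs
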
23 changes: 13 additions & 10 deletions mlem/core/dataset_type.py
@@ -285,9 +285,9 @@ def write(
             res[str(i)] = art
             readers.append(elem_reader)
 
-        return ListReader(
-            dataset_type=dataset, readers=readers
-        ), flatdict.FlatterDict(res, delimiter="/")
+        return ListReader(dataset_type=dataset, readers=readers), dict(
+            flatdict.FlatterDict(res, delimiter="/")
+        )
 
 
 class ListReader(DatasetReader):
@@ -392,7 +392,7 @@ def write(
 
         return (
             _TupleLikeDatasetReader(dataset_type=dataset, readers=readers),
-            flatdict.FlatterDict(res, delimiter="/"),
+            dict(flatdict.FlatterDict(res, delimiter="/")),
         )
 
 
@@ -460,9 +460,12 @@ def process(cls, obj, **kwargs) -> DatasetType:
         if not py_types.intersection(
             PrimitiveType.PRIMITIVES
         ):  # py_types is guaranteed to be singleton set here
-            return TupleLikeListDatasetType(
-                items=[DatasetAnalyzer.analyze(o) for o in obj]
-            )
+            items_types = [DatasetAnalyzer.analyze(o) for o in obj]
+            first, *others = items_types
+            for other in others:
+                if first != other:
+                    return TupleLikeListDatasetType(items=items_types)
+            return ListDatasetType(dtype=first, size=len(obj))
 
         # optimization for large lists of same primitive type elements
         return ListDatasetType(
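
Note: the analyzer no longer assumes a non-primitive list is heterogeneous; it compares the analyzed item types and falls back to the tuple-like type only when they differ. Hedged examples of the outcome (assuming dicts analyze to a dict dataset type, as elsewhere in this module):

    from mlem.core.dataset_type import (
        DatasetAnalyzer,
        ListDatasetType,
        TupleLikeListDatasetType,
    )

    # Items analyze to the same type -> compact list type with one dtype.
    assert isinstance(
        DatasetAnalyzer.analyze([{"a": 1}, {"a": 2}]), ListDatasetType
    )
    # Same python type, different analyzed types -> per-item tuple-like type.
    assert isinstance(
        DatasetAnalyzer.analyze([{"a": 1}, {"b": "x"}]), TupleLikeListDatasetType
    )
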
@@ -552,9 +555,9 @@ def write(
             )
             res[key] = art
             readers[key] = dtype_reader
-        return DictReader(
-            dataset_type=dataset, item_readers=readers
-        ), flatdict.FlatterDict(res, delimiter="/")
+        return DictReader(dataset_type=dataset, item_readers=readers), dict(
+            flatdict.FlatterDict(res, delimiter="/")
+        )
 
 
 class DictReader(DatasetReader):
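
Note: all three writers now return a plain dict of flattened artifact paths, dict(flatdict.FlatterDict(...)), instead of the FlatterDict itself, which is what makes keys like "0/data" directly addressable; the test updates below assert on exactly those keys. The flattening, shown in isolation:

    import flatdict

    nested = {"0": {"data": "artifact-0"}, "1": {"data": "artifact-1"}}
    flat = dict(flatdict.FlatterDict(nested, delimiter="/"))
    assert flat == {"0/data": "artifact-0", "1/data": "artifact-1"}
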
5 changes: 4 additions & 1 deletion mlem/core/objects.py
@@ -690,7 +690,10 @@ def load_value(self):
         self.dataset = self.reader.read(self.relative_artifacts)
 
     def read_batch(self, batch_size: int) -> Iterator[DatasetType]:
-        assert isinstance(self.reader, DatasetReader)
+        if self.reader is None:
+            raise MlemObjectNotSavedError(
+                "Cannot read batch from not saved dataset"
+            )
         return self.reader.read_batch(self.relative_artifacts, batch_size)
 
     def get_value(self):
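
Note: an unsaved dataset has no reader, and the old bare assert produced an opaque AssertionError; the failure mode is now explicit. A hedged sketch (from_data as the in-memory constructor):

    from mlem.core.errors import MlemObjectNotSavedError
    from mlem.core.objects import MlemDataset

    ds = MlemDataset.from_data([1, 2, 3])  # in-memory; reader is None until saved
    try:
        ds.read_batch(batch_size=2)
    except MlemObjectNotSavedError:
        pass  # clear error instead of a bare AssertionError
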
2 changes: 1 addition & 1 deletion mlem/polydantic/core.py
@@ -67,7 +67,7 @@ def validate(cls, value):
         """Polymorphic magic goes here"""
         if isinstance(value, cls):
             return value
-        if not cls.__is_root__:
+        if not cls.__is_root__ and cls.__config__.type_field not in value:
             return super().validate(value)
         if isinstance(value, str):
             value = {cls.__config__.type_field: value}
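
Note: a non-root subclass used to claim any payload via super().validate(), even one whose type field named a different subtype; the added check lets such payloads go through polymorphic resolution instead. Roughly (a comment-only sketch; "type" stands in for type_field):

    # validating {"type": "bar", ...} against concrete subclass Foo:
    #   before: super().validate(value) -> silently parsed as Foo
    #   after:  falls through to the type-field dispatch -> Bar is built
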
14 changes: 12 additions & 2 deletions tests/cli/test_apply.py
@@ -9,7 +9,10 @@
 from sklearn.tree import DecisionTreeClassifier
 
 from mlem.api import load, save
+from mlem.core.dataset_type import ListDatasetType
 from mlem.core.errors import MlemRootNotFound
+from mlem.core.metadata import load_meta
+from mlem.core.objects import MlemDataset
 from mlem.runtime.client import HTTPClient
 from tests.conftest import MLEM_TEST_REPO, long, need_test_repo_auth
 
@@ -69,6 +72,7 @@ def test_apply_batch(runner, model_path_batch, data_path_batch):
         path = posixpath.join(dir, "data")
         result = runner.invoke(
             [
+                "--tb",
                 "apply",
                 model_path_batch,
                 data_path_batch,
@@ -82,8 +86,12 @@ def test_apply_batch(runner, model_path_batch, data_path_batch):
             ],
         )
         assert result.exit_code == 0, (result.output, result.exception)
-        predictions = load(path)
-        assert isinstance(predictions, ndarray)
+        predictions_meta = load_meta(
+            path, load_value=True, force_type=MlemDataset
+        )
+        assert isinstance(predictions_meta.dataset, ListDatasetType)
+        predictions = predictions_meta.get_value()
+        assert isinstance(predictions, list)
 
 
 def test_apply_with_import(runner, model_meta_saved_single, tmp_path_factory):
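
Note: the batch test now inspects the saved result through its metadata, which exposes both the dataset type and the value. The same pattern outside the test (the path is hypothetical):

    from mlem.core.dataset_type import ListDatasetType
    from mlem.core.metadata import load_meta
    from mlem.core.objects import MlemDataset

    meta = load_meta("preds", load_value=True, force_type=MlemDataset)
    assert isinstance(meta.dataset, ListDatasetType)  # batched results are lists now
    predictions = meta.get_value()
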
@@ -94,6 +102,7 @@ def test_apply_with_import(runner, model_meta_saved_single, tmp_path_factory):
         path = posixpath.join(dir, "data")
         result = runner.invoke(
             [
+                "--tb",
                 "apply",
                 model_meta_saved_single.loc.uri,
                 data_path,
@@ -122,6 +131,7 @@ def test_apply_batch_with_import(
         path = posixpath.join(dir, "data")
         result = runner.invoke(
             [
+                "--tb",
                 "apply",
                 model_meta_saved_single.loc.uri,
                 data_path,
62 changes: 33 additions & 29 deletions tests/core/test_dataset_type.py
@@ -3,6 +3,7 @@
 
 from mlem.core.dataset_type import (
     DatasetAnalyzer,
+    DatasetReader,
     DatasetType,
     DictDatasetType,
     DictReader,
@@ -11,6 +12,7 @@
     PrimitiveReader,
     PrimitiveType,
     TupleDatasetType,
+    TupleLikeListDatasetType,
     _TupleLikeDatasetReader,
     _TupleLikeDatasetWriter,
 )
@@ -93,11 +95,11 @@ def test_list_source():
     )
 
     assert list(artifacts.keys()) == [f"{x}/data" for x in range(len(l_value))]
-    assert artifacts["0"]["data"].uri.endswith("data/0")
-    assert artifacts["1"]["data"].uri.endswith("data/1")
-    assert artifacts["2"]["data"].uri.endswith("data/2")
-    assert artifacts["3"]["data"].uri.endswith("data/3")
-    assert artifacts["4"]["data"].uri.endswith("data/4")
+    assert artifacts["0/data"].uri.endswith("data/0")
+    assert artifacts["1/data"].uri.endswith("data/1")
+    assert artifacts["2/data"].uri.endswith("data/2")
+    assert artifacts["3/data"].uri.endswith("data/3")
+    assert artifacts["4/data"].uri.endswith("data/4")
 
 
 def test_tuple():
@@ -150,16 +152,22 @@ def test_tuple_source():
         "4/data",
         "5/data",
     ]
-    assert list(artifacts["1"].keys()) == [
-        f"{x}/data" for x in range(len(t_value[1]))
-    ]
-    assert artifacts["0"]["data"].uri.endswith("data/0")
-    assert artifacts["1"]["0"]["data"].uri.endswith("data/1/0")
-    assert artifacts["1"]["1"]["data"].uri.endswith("data/1/1")
-    assert artifacts["2"]["data"].uri.endswith("data/2")
-    assert artifacts["3"]["data"].uri.endswith("data/3")
-    assert artifacts["4"]["data"].uri.endswith("data/4")
-    assert artifacts["5"]["data"].uri.endswith("data/5")
+    assert artifacts["0/data"].uri.endswith("data/0")
+    assert artifacts["1/0/data"].uri.endswith("data/1/0")
+    assert artifacts["1/1/data"].uri.endswith("data/1/1")
+    assert artifacts["2/data"].uri.endswith("data/2")
+    assert artifacts["3/data"].uri.endswith("data/3")
+    assert artifacts["4/data"].uri.endswith("data/4")
+    assert artifacts["5/data"].uri.endswith("data/5")
+
+
+def test_tuple_reader():
+    dataset_type = TupleLikeListDatasetType(items=[])
+    assert dataset_type.dict()["type"] == "tuple_like_list"
+    reader = _TupleLikeDatasetReader(dataset_type=dataset_type, readers=[])
+    new_reader = parse_obj_as(DatasetReader, reader.dict())
+    res = new_reader.read({})
+    assert res.data == []
 
 
 def test_mixed_list_source():
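
Note: the new test_tuple_reader covers the serialize/deserialize round-trip for readers: reader.dict() embeds the type discriminator, and pydantic's parse_obj_as rebuilds the right subclass. The same round-trip for a simpler reader (a sketch; field names as used in this test file):

    from pydantic import parse_obj_as
    from mlem.core.dataset_type import (
        DatasetReader,
        PrimitiveReader,
        PrimitiveType,
    )

    reader = PrimitiveReader(dataset_type=PrimitiveType(ptype="int"))
    restored = parse_obj_as(DatasetReader, reader.dict())  # dispatched by "type"
    assert isinstance(restored, PrimitiveReader)
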
@@ -181,16 +189,13 @@ def test_mixed_list_source():
         "4/data",
         "5/data",
     ]
-    assert list(artifacts["1"].keys()) == [
-        f"{x}/data" for x in range(len(t_value[1]))
-    ]
-    assert artifacts["0"]["data"].uri.endswith("data/0")
-    assert artifacts["1"]["0"]["data"].uri.endswith("data/1/0")
-    assert artifacts["1"]["1"]["data"].uri.endswith("data/1/1")
-    assert artifacts["2"]["data"].uri.endswith("data/2")
-    assert artifacts["3"]["data"].uri.endswith("data/3")
-    assert artifacts["4"]["data"].uri.endswith("data/4")
-    assert artifacts["5"]["data"].uri.endswith("data/5")
+    assert artifacts["0/data"].uri.endswith("data/0")
+    assert artifacts["1/0/data"].uri.endswith("data/1/0")
+    assert artifacts["1/1/data"].uri.endswith("data/1/1")
+    assert artifacts["2/data"].uri.endswith("data/2")
+    assert artifacts["3/data"].uri.endswith("data/3")
+    assert artifacts["4/data"].uri.endswith("data/4")
+    assert artifacts["5/data"].uri.endswith("data/5")
 
 
 def test_dict():
@@ -238,7 +243,6 @@ def custom_assert(x, y):
     )
 
     assert list(artifacts.keys()) == ["1/data", "2/data", "3/1/data"]
-    assert list(artifacts["3"].keys()) == ["1/data"]
-    assert artifacts["1"]["data"].uri.endswith("data/1")
-    assert artifacts["2"]["data"].uri.endswith("data/2")
-    assert artifacts["3"]["1"]["data"].uri.endswith("data/3/1")
+    assert artifacts["1/data"].uri.endswith("data/1")
+    assert artifacts["2/data"].uri.endswith("data/2")
+    assert artifacts["3/1/data"].uri.endswith("data/3/1")
