Merge pull request #72 from tomasvanpottelbergh/fix/disable-uri-file-outputs

Make `uri_file` output limitation explicit
getindata · Aug 28, 2023 · 80b483e · 80b483e
2 parents 9f9f265 + 0c63ab9
commit 80b483e
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,8 @@
 ## [Unreleased]
 
 -   Added support for pydantic v2 and bumped minimum required pydantic version to `2.0.0` by [@fdroessler](https://github.com/fdroessler)
--   Added adbility to mark a node as deterministic (enables caching on AzureML) by [@tomasvanpottelbergh](https://github.com/tomasvanpottelbergh)
+-   Added ability to mark a node as deterministic (enables caching on Azure ML) by [@tomasvanpottelbergh](https://github.com/tomasvanpottelbergh)
+-   Explicitly disabled support for `AzureMLAssetDataSet` outputs of `uri_file` type by [@tomasvanpottelbergh](https://github.com/tomasvanpottelbergh)
 
 ## [0.5.0] - 2023-08-11
 

diff --git a/docs/source/05_data_assets.rst b/docs/source/05_data_assets.rst
@@ -3,7 +3,9 @@ Azure Data Assets
 
 ``kedro-azureml`` adds support for two new datasets that can be used in the Kedro catalog. Right now we support both Azure ML v1 SDK (direct Python) and Azure ML v2 SDK (fsspec-based) APIs.
 
-**For v2 API (fspec-based)** - use ``AzureMLAssetDataSet`` that enables to use Azure ML v2-sdk Folder/File datasets for remote and local runs.
+**For v2 API (fsspec-based)** - use ``AzureMLAssetDataSet``, which enables the use of Azure ML v2 SDK Folder/File datasets for remote and local runs.
+Currently only the ``uri_file`` and ``uri_folder`` types are supported. Because of limitations of the Azure ML SDK, the ``uri_file`` type can only be used for pipeline inputs,
+not for outputs. The ``uri_folder`` type can be used for both inputs and outputs.
 
 **For v1 API** (deprecated ⚠️) use the ``AzureMLFileDataSet`` and the ``AzureMLPandasDataSet`` which translate to `File/Folder dataset`_ and `Tabular dataset`_ respectively in
 Azure Machine Learning. Both fully support the Azure versioning mechanism and can be used in the same way as any

diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py
@@ -154,19 +154,32 @@ def _get_versioned_azureml_dataset_name(
             suffix = ":" + version
         return azureml_dataset_name + suffix
 
-    def _get_input_type(self, dataset_name: str, pipeline: Pipeline) -> Input:
+    def _get_input(self, dataset_name: str, pipeline: Pipeline) -> Input:
         if self._is_param_or_root_non_azureml_asset_dataset(dataset_name, pipeline):
-            return "string"
+            return Input(type="string")
         elif dataset_name in self.catalog.list() and isinstance(
             ds := self.catalog._get_dataset(dataset_name), AzureMLAssetDataSet
         ):
             if ds._azureml_type == "uri_file" and dataset_name not in pipeline.inputs():
                 raise ValueError(
                     "AzureMLAssetDataSets with azureml_type 'uri_file' can only be used as pipeline inputs"
                 )
-            return ds._azureml_type
+            return Input(type=ds._azureml_type)
         else:
-            return "uri_folder"
+            return Input(type="uri_folder")
+
+    def _get_output(self, name):
+        if name in self.catalog.list() and isinstance(
+            ds := self.catalog._get_dataset(name), AzureMLAssetDataSet
+        ):
+            if ds._azureml_type == "uri_file":
+                raise ValueError(
+                    "AzureMLAssetDataSets with azureml_type 'uri_file' cannot be used as outputs"
+                )
+            # TODO: add versioning
+            return Output(type=ds._azureml_type, name=ds._azureml_dataset)
+        else:
+            return Output(type="uri_folder")
 
     def _from_params_or_value(
         self,
@@ -231,21 +244,11 @@ def _construct_azure_command(
             },
             environment=self._resolve_azure_environment(),  # TODO: check whether Environment exists
             inputs={
-                self._sanitize_param_name(name): Input(
-                    type=self._get_input_type(name, pipeline)
-                )
+                self._sanitize_param_name(name): self._get_input(name, pipeline)
                 for name in node.inputs
             },
             outputs={
-                self._sanitize_param_name(name): (
-                    # TODO: add versioning
-                    Output(name=ds._azureml_dataset)
-                    if name in self.catalog.list()
-                    and isinstance(
-                        ds := self.catalog._get_dataset(name), AzureMLAssetDataSet
-                    )
-                    else Output()
-                )
+                self._sanitize_param_name(name): self._get_output(name)
                 for name in node.outputs
             },
             code=self.config.azure.code_directory,