diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
index 6f71243ec..01596c95c 100755
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -1,9 +1,19 @@
 # Upcoming Release
 ## Major features and improvements
+## Bug fixes and other changes
+## Community contributions
+
+# Release 1.7.0:
+## Major features and improvements
 * Added `polars.GenericDataSet`, a `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust.

 ## Bug fixes and other changes
+* Fixed broken links in docstrings.
+* Reverted PySpark pin to <4.0.
+
 ## Community contributions
+Many thanks to the following Kedroids for contributing PRs to this release:
+* [Walber Moreira](https://github.com/wmoreiraa)

 # Release 1.6.0:
diff --git a/kedro-datasets/docs/source/kedro_datasets.rst b/kedro-datasets/docs/source/kedro_datasets.rst
index 18bff8808..d1e06429c 100644
--- a/kedro-datasets/docs/source/kedro_datasets.rst
+++ b/kedro-datasets/docs/source/kedro_datasets.rst
@@ -41,6 +41,7 @@ kedro_datasets
     kedro_datasets.plotly.JSONDataSet
     kedro_datasets.plotly.PlotlyDataSet
     kedro_datasets.polars.CSVDataSet
+    kedro_datasets.polars.GenericDataSet
     kedro_datasets.redis.PickleDataSet
     kedro_datasets.snowflake.SnowparkTableDataSet
     kedro_datasets.spark.DeltaTableDataSet
diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py
index c0d5001a2..f06eb30db 100644
--- a/kedro-datasets/kedro_datasets/__init__.py
+++ b/kedro-datasets/kedro_datasets/__init__.py
@@ -1,3 +1,3 @@
 """``kedro_datasets`` is where you can find all of Kedro's data connectors."""

-__version__ = "1.6.0"
+__version__ = "1.7.0"
diff --git a/kedro-datasets/kedro_datasets/polars/generic_dataset.py b/kedro-datasets/kedro_datasets/polars/generic_dataset.py
index 0ba1fc43c..73660e746 100644
--- a/kedro-datasets/kedro_datasets/polars/generic_dataset.py
+++ b/kedro-datasets/kedro_datasets/polars/generic_dataset.py
@@ -20,26 +20,29 @@

 # pylint: disable=too-many-instance-attributes
 class GenericDataSet(AbstractVersionedDataSet[pl.DataFrame, pl.DataFrame]):
-    """`polars.GenericDataSet` loads/saves data from/to a data file using an underlying
-    filesystem (e.g.: local, S3, GCS). It uses polars to dynamically select the
-    appropriate type of read/write target on a best effort basis.
-    Example usage for the
-    `YAML API `_:
+    """``polars.GenericDataSet`` loads/saves data from/to a data file using an underlying
+    filesystem (e.g.: local, S3, GCS). It uses polars to dynamically select the
+    appropriate type of read/write target on a best effort basis.
+
+    Example adding a catalog entry with the
+    `YAML API `_:
+
     .. code-block:: yaml

-        cars:
-          type: polars.GenericDataSet
-          file_format: parquet
-          filepath: s3://data/01_raw/company/cars.parquet
-          load_args:
-            low_memory: True
-          save_args:
-            compression: "snappy"
-
-    Example usage for the
-    `Python API `_:
+
+        >>> cars:
+        >>>     type: polars.GenericDataSet
+        >>>     file_format: parquet
+        >>>     filepath: s3://data/01_raw/company/cars.parquet
+        >>>     load_args:
+        >>>         low_memory: True
+        >>>     save_args:
+        >>>         compression: "snappy"
+
+    Example using Python API:
     ::

     >>> from kedro_datasets.polars import GenericDataSet
     >>> import polars as pl
     >>>
@@ -50,6 +53,7 @@ class GenericDataSet(AbstractVersionedDataSet[pl.DataFrame, pl.DataFrame]):
     >>> data_set.save(data)
     >>> reloaded = data_set.load()
     >>> assert data.frame_equal(reloaded)
+
     """

     DEFAULT_LOAD_ARGS = {}  # type: Dict[str, Any]
@@ -67,8 +71,9 @@ def __init__(
         fs_args: Dict[str, Any] = None,
     ):
         """Creates a new instance of ``GenericDataSet`` pointing to a concrete data file
-        on a specific filesystem. The appropriate polars load/save methods are
-        dynamically identified by string matching on a best effort basis.
+        on a specific filesystem. The appropriate polars load/save methods are dynamically
+        identified by string matching on a best effort basis.
+
         Args:
             filepath: Filepath in POSIX format to a file prefixed with a protocol like
                 `s3://`.
@@ -76,16 +81,15 @@ def __init__(
                 will be used. The prefix should be any protocol supported by
                 ``fsspec``.
                 Key assumption: The first argument of either load/save method points to
-                a filepath/buffer/io type location. There are some read/write targets
-                such as 'clipboard' or 'records' that will fail since they do not take a
-                filepath like argument.
-            file_format: String which is used to match the appropriate load/save method
-                on a best effort basis. For example if 'csv' is passed the
-                `polars.read_csv` and
+                a filepath/buffer/io type location. There are some read/write targets such
+                as 'clipboard' or 'records' that will fail since they do not take a filepath
+                like argument.
+            file_format: String which is used to match the appropriate load/save method on a
+                best effort basis. For example if 'csv' is passed, the `polars.read_csv` and
                 `polars.DataFrame.write_csv` methods will be identified. An error will
-                be raised unless
-                at least one matching `read_{file_format}` or `write_{file_format}`.
-            load_args: polars options for loading files.
+                be raised unless there is at least one matching `read_{file_format}`
+                or `write_{file_format}` method.
+            load_args: Polars options for loading files.
                 Here you can find all available arguments:
                 https://pola-rs.github.io/polars/py-polars/html/reference/io.html
                 All defaults are preserved.
@@ -100,16 +104,12 @@ def __init__(
             credentials: Credentials required to get access to the underlying filesystem.
                 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
             fs_args: Extra arguments to pass into underlying filesystem class constructor
-                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
-                to pass to the filesystem's `open` method through nested keys
-                `open_args_load` and `open_args_save`.
-                Here you can find all available arguments for `open`:
-                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
-                All defaults are preserved, except `mode`, which is set to `r` when loading
-                and to `w` when saving.
+                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
+            metadata: Any arbitrary metadata.
+                This is ignored by Kedro, but may be consumed by users or external plugins.
         Raises:
-            DataSetError: Will be raised if at least less than one appropriate
-            read or write methods are identified.
+            DataSetError: Will be raised if no appropriate read or write
+                methods are identified.
         """

         self._file_format = file_format.lower()
diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py
index 429398b32..a35665ec9 100644
--- a/kedro-datasets/setup.py
+++ b/kedro-datasets/setup.py
@@ -7,7 +7,7 @@
 SPARK = "pyspark>=2.2, <4.0"
 HDFS = "hdfs>=2.5.8, <3.0"
 S3FS = "s3fs>=0.3.0, <0.5"
-POLARS = "polars~=0.18.0"
+POLARS = "polars>=0.18.0"
 DELTA = "delta-spark~=1.2.1"
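
Reviewer note: the docstring changed above describes the "best effort" matching contract (`read_{file_format}` / `write_{file_format}`) only in prose. The snippet below is a minimal, standalone sketch of that idea, not the dataset's actual implementation; the helper name `resolve_polars_io` and the local `cars.csv` path are illustrative only, and it assumes `polars>=0.18` per the setup.py pin in this diff.

```python
from typing import Callable, Optional, Tuple

import polars as pl
from polars.testing import assert_frame_equal


def resolve_polars_io(file_format: str) -> Tuple[Optional[Callable], Optional[Callable]]:
    """Best-effort lookup of polars I/O callables by name,
    e.g. 'csv' -> (polars.read_csv, polars.DataFrame.write_csv)."""
    reader = getattr(pl, f"read_{file_format}", None)
    writer = getattr(pl.DataFrame, f"write_{file_format}", None)
    if reader is None and writer is None:
        # Mirrors the documented contract: raise when neither a read_ nor a
        # write_ method can be matched for the requested format.
        raise ValueError(
            f"No polars 'read_{file_format}' or 'write_{file_format}' method found"
        )
    return reader, writer


if __name__ == "__main__":
    reader, writer = resolve_polars_io("csv")
    data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
    writer(data, "cars.csv")  # unbound method, so the frame is the first argument
    reloaded = reader("cars.csv")
    assert_frame_equal(data, reloaded)
```

The writer is resolved on `pl.DataFrame` (an unbound method) while the reader is a module-level function, matching the `polars.read_csv` / `polars.DataFrame.write_csv` pairing the docstring names; the real dataset routes both calls through `fsspec` rather than the local path used here.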