diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 112df2fdf9f..5c28b4e7e85 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -27,6 +27,7 @@ Top-level conversions cudf.to_numeric cudf.from_dlpack + cudf.from_pandas Top-level dealing with datetimelike ----------------------------------- diff --git a/docs/dask_cudf/Makefile b/docs/dask_cudf/Makefile new file mode 100644 index 00000000000..d0c3cbf1020 --- /dev/null +++ b/docs/dask_cudf/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/dask_cudf/make.bat b/docs/dask_cudf/make.bat new file mode 100644 index 00000000000..747ffb7b303 --- /dev/null +++ b/docs/dask_cudf/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 00000000000..d884e01374d Binary files /dev/null and b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png differ diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst new file mode 100644 index 00000000000..893f5dd7434 --- /dev/null +++ b/docs/dask_cudf/source/api.rst @@ -0,0 +1,79 @@ +=============== + API reference +=============== + +This page provides a list of all publicly accessible modules, methods, +and classes in the ``dask_cudf`` namespace. + + +Creating and storing DataFrames +=============================== + +:doc:`Like Dask `, Dask-cuDF supports creation +of DataFrames from a variety of storage formats. For on-disk data that +are not supported directly in Dask-cuDF, we recommend using Dask's +data reading facilities, followed by calling +:func:`.from_dask_dataframe` to obtain a Dask-cuDF object. + +.. automodule:: dask_cudf + :members: + from_cudf, + from_dask_dataframe, + read_csv, + read_json, + read_orc, + to_orc, + read_text, + read_parquet + +.. warning:: + + FIXME: where should the following live? + + .. autofunction:: dask_cudf.concat + + .. 
autofunction:: dask_cudf.from_delayed + +Grouping +======== + +As discussed in the :doc:`Dask documentation for groupby +`, ``groupby``, ``join``, and ``merge``, and +similar operations that require matching up rows of a DataFrame become +significantly more challenging in a parallel setting than they are in +serial. Dask-cuDF has the same challenges, however for certain groupby +operations, we can take advantage of functionality in cuDF that allows +us to compute multiple aggregations at once. There are therefore two +interfaces to grouping in Dask-cuDF, the general +:meth:`DataFrame.groupby` which returns a +:class:`.CudfDataFrameGroupBy` object, and a specialized +:func:`.groupby_agg`. Generally speaking, you should not need to call +:func:`.groupby_agg` directly, since Dask-cuDF will arrange to call it +if possible. + +.. autoclass:: dask_cudf.groupby.CudfDataFrameGroupBy + :members: + :inherited-members: + :show-inheritance: + +.. autofunction:: dask_cudf.groupby_agg + + +DataFrames and Series +===================== + +The core distributed objects provided by Dask-cuDF are the +:class:`.DataFrame` and :class:`.Series`. These inherit respectively +from :class:`dask.dataframe.DataFrame` and +:class:`dask.dataframe.Series`, and so the API is essentially +identical. The full API is provided below. + +.. autoclass:: dask_cudf.DataFrame + :members: + :inherited-members: + :show-inheritance: + +.. autoclass:: dask_cudf.Series + :members: + :inherited-members: + :show-inheritance: diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py new file mode 100644 index 00000000000..1341e7fd9e7 --- /dev/null +++ b/docs/dask_cudf/source/conf.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. + +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "dask-cudf" +copyright = "2018-2023, NVIDIA Corporation" +author = "NVIDIA Corporation" +version = "23.04" +release = "23.04.00" + +language = "en" + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx_copybutton", + "numpydoc", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "myst_nb", +] + +templates_path = ["_templates"] +exclude_patterns = [] + +copybutton_prompt_text = ">>> " + +# Enable automatic generation of systematic, namespaced labels for sections +myst_heading_anchors = 2 + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" +htmlhelp_basename = "dask-cudfdoc" +html_use_modindex = True + +html_static_path = ["_static"] + +pygments_style = "sphinx" + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} +include_pandas_compat = True + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "cupy": ("https://docs.cupy.dev/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), + "dask": ("https://docs.dask.org/en/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), +} + +numpydoc_show_inherited_class_members = True +numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False + + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst new file mode 100644 index 00000000000..0442ab0929a --- /dev/null +++ b/docs/dask_cudf/source/index.rst @@ -0,0 +1,112 @@ +.. dask-cudf documentation coordinating file, created by + sphinx-quickstart on Mon Feb 6 18:48:11 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to dask-cudf's documentation! +===================================== + +Dask-cuDF is an extension library for the `Dask `__ +parallel computing framework that provides a `cuDF +`__-backed distributed +dataframe with the same API as `Dask dataframes +`__. + +If you are familiar with Dask and `pandas `__ or +`cuDF `__, then Dask-cuDF +should feel familiar to you. If not, we recommend starting with `10 +minutes to Dask +`__ followed +by `10 minutes to cuDF and Dask-cuDF +`__. + +When running on multi-GPU systems, `Dask-CUDA +`__ is recommended to +simplify the setup of the cluster, taking advantage of all features of +the GPU and networking hardware. 
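+
+For example, a minimal single-node, multi-GPU setup might look like the
+following (this is only a sketch; it assumes Dask-CUDA and
+``dask.distributed`` are installed, and selecting the ``cudf`` backend
+is explained in the next section)::
+
+    from dask_cuda import LocalCUDACluster
+    from dask.distributed import Client
+
+    import dask
+
+    # One worker per visible GPU; Dask-CUDA handles device selection.
+    cluster = LocalCUDACluster()
+    client = Client(cluster)
+
+    # Ask Dask for cuDF-backed dataframes (see "Using Dask-cuDF" below).
+    dask.config.set({"dataframe.backend": "cudf"})
+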
+ +Using Dask-cuDF +--------------- + +When installed, Dask-cuDF registers itself as a dataframe backend for +Dask. This means that in many cases, using cuDF-backed dataframes requires +only small changes to an existing workflow. The minimal change is to +select cuDF as the dataframe backend in :doc:`Dask's +configuration `. To do so, we must set the option +``dataframe.backend`` to ``cudf``. From Python, this can be achieved +like so:: + + import dask + + dask.config.set({"dataframe.backend": "cudf"}) + +Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the +environment before running your code. + +Dataframe creation from on-disk formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your workflow creates Dask dataframes from on-disk formats +(for example using :func:`dask.dataframe.read_parquet`), then setting +the backend may well be enough to migrate your workflow. + +For example, consider reading a dataframe from parquet:: + + import dask.dataframe as dd + + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) + + +To obtain a cuDF-backed dataframe, we must set the +``dataframe.backend`` configuration option:: + + import dask + import dask.dataframe as dd + + dask.config.set({"dataframe.backend": "cudf"}) + # This gives us a cuDF-backed dataframe + df = dd.read_parquet("data.parquet", ...) + +This code will use cuDF's GPU-accelerated :func:`parquet reader +` to read partitions of the data. + +Dataframe creation from in-memory formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you already have a dataframe in memory and want to convert it to a +cuDF-backend one, there are two options depending on whether the +dataframe is already a Dask one or not. If you have a Dask dataframe, +then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` +as the backend; if you have a pandas dataframe then you can either +call :func:`dask.dataframe.from_pandas` followed by +:func:`~dask.dataframe.to_backend` or first convert the dataframe with +:func:`cudf.from_pandas` and then parallelise this with +:func:`dask_cudf.from_cudf`. + +API Reference +------------- + +Generally speaking, Dask-cuDF tries to offer exactly the same API as +Dask itself. There are, however, some minor differences mostly because +cuDF does not :doc:`perfectly mirror ` +the pandas API, or because cuDF provides additional configuration +flags (these mostly occur in data reading and writing interfaces). + +As a result, straightforward workflows can be migrated without too +much trouble, but more complex ones that utilise more features may +need a bit of tweaking. The API documentation describes details of the +differences and all functionality that Dask-cuDF supports. + +.. toctree:: + :maxdepth: 2 + + api + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 60bbe5d9571..d2858876fcd 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,6 +1,7 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import math +import textwrap import warnings import numpy as np @@ -68,6 +69,18 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. 
+ + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. + + """ + _partition_type = cudf.DataFrame @_dask_cudf_nvtx_annotate @@ -671,12 +684,35 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from_cudf.__doc__ = ( - "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ + textwrap.dedent( + """ + Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. + + This function is a thin wrapper around + :func:`dask.dataframe.from_pandas`, accepting the same + arguments (described below) excepting that it operates on cuDF + rather than pandas objects.\n + """ + ) + + textwrap.dedent(dd.from_pandas.__doc__) ) @_dask_cudf_nvtx_annotate def from_dask_dataframe(df): + """ + Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF + one. + + Parameters + ---------- + df : dask.dataframe.DataFrame + The Dask dataframe to convert + + Returns + ------- + dask_cudf.DataFrame : A new Dask collection backed by cuDF objects + """ return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f91738bdab0..f4bbcaf4dd1 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from functools import wraps from typing import Set @@ -433,22 +433,55 @@ def groupby_agg( ): """Optimized groupby aggregation for Dask-CuDF. - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + Parameters + ---------- + ddf : DataFrame + DataFrame object to perform grouping on. + gb_cols : str or list[str] + Column names to group by. + aggs_in : str, list, or dict + Aggregations to perform. + split_every : int (optional) + How to group intermediate aggregates. + dropna : bool + Drop grouping key values corresponding to NA values. + as_index : bool + Currently ignored. + sort : bool + Sort the group keys, better performance is obtained when + not sorting. + shuffle : str (optional) + Control how shuffling of the DataFrame is performed. + sep : str + Internal usage. + + + Notes + ----- + This "optimized" approach is more performant than the algorithm in + implemented in :meth:`DataFrame.apply` because it allows the cuDF + backend to perform multiple aggregations at once. + + This aggregation algorithm only supports the following options + + * "collect" + * "count" + * "first" + * "last" + * "max" + * "mean" + * "min" + * "std" + * "sum" + * "var" + + + See Also + -------- + DataFrame.groupby : generic groupby of a DataFrame + dask.dataframe.apply_concat_apply : for more description of the + split_every argument. 
+ """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index b4d080fd182..fd27083bbf4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -16,9 +16,10 @@ def read_csv(path, blocksize="default", **kwargs): """ - Read CSV files into a dask_cudf.DataFrame + Read CSV files into a :class:`.DataFrame`. - This API parallelizes the ``cudf.read_csv`` function in the following ways: + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: It supports loading many files at once using globstrings: @@ -34,23 +35,26 @@ def read_csv(path, blocksize="default", **kwargs): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - Internally ``dask_cudf.read_csv`` uses ``cudf.read_csv`` and supports - many of the same keyword arguments with the same performance guarantees. - See the docstring for ``cudf.read_csv()`` for more information on available + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available keyword arguments. Parameters ---------- path : str, path object, or file-like object - Either a path to a file (a str, pathlib.Path, or - py._path.local.LocalPath), URL (including http, ftp, and S3 locations), - or any object with a read() method (such as builtin open() file - handler function or StringIO). + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" - The target task partition size. If `None`, a single block + The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to ``cudf.read_csv``. + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. Examples -------- @@ -61,6 +65,7 @@ def read_csv(path, blocksize="default", **kwargs): 0 1 hi 1 2 hello 2 3 ai + """ # Handle `chunksize` deprecation diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index bb3d0f3c601..2a6ad603414 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -10,30 +10,33 @@ def read_json(url_path, engine="auto", **kwargs): - """Create a dask_cudf DataFrame collection from JSON data + """Read JSON data into a :class:`.DataFrame`. - This function wraps ``dask.dataframe.read_json``, and passes + This function wraps :func:`dask.dataframe.read_json`, and passes ``engine=partial(cudf.read_json, engine="auto")`` by default. Parameters ---------- - url_path: str, list of str + url_path : str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. engine : str or Callable, default "auto" - If str, this value will be used as the ``engine`` argument when - ``cudf.read_json`` is used to create each partition. If Callable, - this value will be used as the underlying function used to create - each partition from JSON data. 
The default value is "auto", so - that ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to ``dask.dataframe.read_json`` by default. + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~typing.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + **kwargs : - Key-word arguments to pass through to ``dask.dataframe.read_json``. + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. Returns ------- - dask_cudf.DataFrame + :class:`.DataFrame` Examples -------- @@ -53,7 +56,8 @@ def read_json(url_path, engine="auto", **kwargs): See Also -------- - dask.dataframe.io.json.read_json + dask.dataframe.read_json + """ # TODO: Add optimized code path to leverage the diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index e731057ed90..49fea0d7602 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -25,37 +25,45 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read cudf dataframe from ORC file(s). + """Read ORC files into a :class:`.DataFrame`. Note that this function is mostly borrowed from upstream Dask. Parameters ---------- - path: str or list(str) + path : str or list[str] Location of file(s), which can be a full URL with protocol specifier, and may include glob character if a single string. - columns: None or list(str) + columns : None or list[str] Columns to load. If None, loads all. filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row - groups that do not match the given filter predicate are not read. The - predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the outermost list combines - these filters as a disjunction (OR). Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. - storage_options: None or dict + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. 
+ Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict Further parameters to pass to the bytes backend. + See Also + -------- + dask.dataframe.read_orc + Returns ------- - cudf.DataFrame + dask_cudf.DataFrame + """ storage_options = storage_options or {} @@ -133,22 +141,25 @@ def to_orc( compute=True, **kwargs, ): - """Write a dask_cudf dataframe to ORC file(s) (one file per partition). + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). Parameters ---------- - df : dask_cudf.DataFrame - path: string or pathlib.Path + df : DataFrame + path : str or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. write_index : boolean, optional Whether or not to write the index. Defaults to True. - storage_options: None or dict + storage_options : None or dict Further parameters to pass to the bytes backend. compression : string or dict, optional compute : bool, optional - If True (default) then the result is computed immediately. If False - then a ``dask.delayed`` object is returned for future computation. + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. + """ from dask import compute as dask_compute, delayed diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 452f2f8914a..b03ac256b05 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -438,13 +438,14 @@ def set_object_dtypes_from_pa_schema(df, schema): def read_parquet(path, columns=None, **kwargs): - """Read parquet files into a Dask DataFrame + """ + Read parquet files into a :class:`.DataFrame`. - Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` - to coordinate the execution of ``cudf.read_parquet``, and to - ultimately create a ``dask_cudf.DataFrame`` collection. + Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. - See the ``dask.dataframe.read_parquet`` documentation for + See the :func:`dask.dataframe.read_parquet` documentation for all available options. Examples @@ -469,6 +470,7 @@ def read_parquet(path, columns=None, **kwargs): See Also -------- cudf.read_parquet + dask.dataframe.read_parquet """ if isinstance(columns, str): columns = [columns]
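For reference, a rough end-to-end sketch of the workflow these docs describe (the file path and column names below are placeholders; a CUDA-capable GPU with cuDF and Dask-cuDF installed is assumed):

    import dask_cudf

    # Read a parquet dataset into a cuDF-backed Dask collection
    # (dispatches to cudf.read_parquet for each partition).
    df = dask_cudf.read_parquet("data.parquet")

    # Grouped aggregation; the optimized groupby path lets cuDF
    # compute several aggregations in one pass where possible.
    out = df.groupby("key").agg({"value": ["mean", "std"]}).compute()

    # Convert back to a pandas-backed Dask dataframe if needed.
    pdf = df.to_dask_dataframe()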