Add skeleton API and prose documentation for dask-cudf #12725

Merged (11 commits) on Mar 21, 2023
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/general_functions.rst
@@ -25,6 +25,7 @@ Top-level conversions

cudf.to_numeric
cudf.from_dlpack
cudf.from_pandas
Contributor Author:
For some reason this has never been linked in the cuDF API docs!


Top-level dealing with datetimelike
-----------------------------------
20 changes: 20 additions & 0 deletions docs/dask_cudf/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/dask_cudf/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
79 changes: 79 additions & 0 deletions docs/dask_cudf/source/api.rst
@@ -0,0 +1,79 @@
===============
API reference
===============

This page provides a list of all publicly accessible modules, methods,
and classes in the ``dask_cudf`` namespace.


Creating and storing DataFrames
===============================

:doc:`Like Dask <dask:dataframe-create>`, Dask-cuDF supports creation
of DataFrames from a variety of storage formats. For on-disk data that
are not supported directly in Dask-cuDF, we recommend using Dask's
data reading facilities, followed by calling
:func:`.from_dask_dataframe` to obtain a Dask-cuDF object.
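
For example, a minimal sketch of this pattern, assuming a hypothetical
fixed-width file ``data.txt`` that you want to process on the GPU::

    import dask.dataframe as dd
    import dask_cudf

    # Read with Dask's CPU (pandas-backed) reader ...
    ddf = dd.read_fwf("data.txt")
    # ... then convert the result to a cuDF-backed dataframe
    gdf = dask_cudf.from_dask_dataframe(ddf)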

.. automodule:: dask_cudf
   :members:
      from_cudf,
      from_dask_dataframe,
      read_csv,
      read_json,
      read_orc,
      to_orc,
      read_text,
      read_parquet

.. warning::

   FIXME: where should the following live?

.. autofunction:: dask_cudf.concat

.. autofunction:: dask_cudf.from_delayed

Grouping
========

As discussed in the :doc:`Dask documentation for groupby
<dask:dataframe-groupby>`, ``groupby``, ``join``, ``merge``, and
similar operations that require matching up rows of a DataFrame become
significantly more challenging in a parallel setting than in serial.
Dask-cuDF faces the same challenges; however, for certain groupby
operations we can take advantage of functionality in cuDF that allows
us to compute multiple aggregations at once. There are therefore two
interfaces to grouping in Dask-cuDF: the general
:meth:`DataFrame.groupby`, which returns a
:class:`.CudfDataFrameGroupBy` object, and a specialized
:func:`.groupby_agg`. Generally speaking, you should not need to call
:func:`.groupby_agg` directly, since Dask-cuDF will arrange to call it
if possible.
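
For illustration, a small sketch (with made-up data) of a grouped
multi-aggregation; Dask-cuDF dispatches to the specialized path
internally where it can::

    import cudf
    import dask_cudf

    df = cudf.DataFrame({"key": [1, 1, 2], "x": [0.1, 0.2, 0.3]})
    ddf = dask_cudf.from_cudf(df, npartitions=2)

    # Both aggregations are computed in a single pass over each group
    result = ddf.groupby("key").agg({"x": ["mean", "max"]}).compute()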

.. autoclass:: dask_cudf.groupby.CudfDataFrameGroupBy
   :members:
   :inherited-members:
   :show-inheritance:

.. autofunction:: dask_cudf.groupby_agg


DataFrames and Series
=====================

The core distributed objects provided by Dask-cuDF are the
:class:`.DataFrame` and :class:`.Series`. These inherit respectively
from :class:`dask.dataframe.DataFrame` and
:class:`dask.dataframe.Series`, and so the API is essentially
identical. The full API is provided below.
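
For example, a quick sketch of this subclass relationship::

    import cudf
    import dask.dataframe as dd
    import dask_cudf

    ddf = dask_cudf.from_cudf(cudf.DataFrame({"a": range(10)}), npartitions=2)

    # A dask_cudf.DataFrame is also a dask.dataframe.DataFrame, so
    # backend-agnostic Dask code keeps working
    assert isinstance(ddf, dd.DataFrame)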

.. autoclass:: dask_cudf.DataFrame
   :members:
   :inherited-members:
   :show-inheritance:

.. autoclass:: dask_cudf.Series
   :members:
   :inherited-members:
   :show-inheritance:
82 changes: 82 additions & 0 deletions docs/dask_cudf/source/conf.py
@@ -0,0 +1,82 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "dask-cudf"
copyright = "2018-2023, NVIDIA Corporation"
author = "NVIDIA Corporation"
version = "23.04"
release = "23.04.00"

language = "en"


# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "sphinx.ext.intersphinx",
    "sphinx.ext.autodoc",
    "sphinx_copybutton",
    "numpydoc",
    "IPython.sphinxext.ipython_console_highlighting",
    "IPython.sphinxext.ipython_directive",
    "myst_nb",
]

templates_path = ["_templates"]
exclude_patterns = []

copybutton_prompt_text = ">>> "

# Enable automatic generation of systematic, namespaced labels for sections
myst_heading_anchors = 2


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "pydata_sphinx_theme"
html_logo = "_static/RAPIDS-logo-purple.png"
htmlhelp_basename = "dask-cudfdoc"
html_use_modindex = True

html_static_path = ["_static"]

pygments_style = "sphinx"

html_theme_options = {
    "external_links": [],
    "github_url": "https://github.com/rapidsai/cudf",
    "twitter_url": "https://twitter.com/rapidsai",
    "show_toc_level": 1,
    "navbar_align": "right",
}
include_pandas_compat = True

intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "cupy": ("https://docs.cupy.dev/en/stable/", None),
    "numpy": ("https://numpy.org/doc/stable", None),
    "pyarrow": ("https://arrow.apache.org/docs/", None),
    "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None),
    "dask": ("https://docs.dask.org/en/stable/", None),
    "pandas": ("https://pandas.pydata.org/docs/", None),
}

numpydoc_show_inherited_class_members = True
numpydoc_class_members_toctree = False
numpydoc_attributes_as_param_list = False


def setup(app):
    app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
    app.add_js_file(
        "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
    )
112 changes: 112 additions & 0 deletions docs/dask_cudf/source/index.rst
@@ -0,0 +1,112 @@
.. dask-cudf documentation coordinating file, created by
   sphinx-quickstart on Mon Feb 6 18:48:11 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to dask-cudf's documentation!
=====================================

Dask-cuDF is an extension library for the `Dask <https://dask.org>`__
parallel computing framework that provides a `cuDF
<https://docs.rapids.ai/api/cudf/stable/>`__-backed distributed
dataframe with the same API as `Dask dataframes
<https://docs.dask.org/en/stable/dataframe.html>`__.

If you are familiar with Dask and `pandas <https://pandas.pydata.org>`__ or
`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask-cuDF
should feel familiar to you. If not, we recommend starting with `10
minutes to Dask
<https://docs.dask.org/en/stable/10-minutes-to-dask.html>`__ followed
by `10 minutes to cuDF and Dask-cuDF
<https://docs.rapids.ai/api/cudf/stable/user_guide/10min.html>`__.

When running on multi-GPU systems, `Dask-CUDA
<https://docs.rapids.ai/api/dask-cuda/stable/>`__ is recommended to
simplify cluster setup and to take full advantage of the GPU and
networking hardware.
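
For example, a minimal sketch of a single-node, multi-GPU setup with
Dask-CUDA (assuming the ``dask-cuda`` package is installed)::

    from dask_cuda import LocalCUDACluster
    from distributed import Client

    # By default, one worker is started per visible GPU
    cluster = LocalCUDACluster()
    client = Client(cluster)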

Using Dask-cuDF
---------------

When installed, Dask-cuDF registers itself as a dataframe backend for
Dask. This means that in many cases, using cuDF-backed dataframes requires
only small changes to an existing workflow. The minimal change is to
select cuDF as the dataframe backend in :doc:`Dask's
configuration <dask:configuration>`. To do so, we must set the option
``dataframe.backend`` to ``cudf``. From Python, this can be achieved
like so::

    import dask

    dask.config.set({"dataframe.backend": "cudf"})

Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the
environment before running your code.

Dataframe creation from on-disk formats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If your workflow creates Dask dataframes from on-disk formats
(for example using :func:`dask.dataframe.read_parquet`), then setting
the backend may well be enough to migrate your workflow.

For example, consider reading a dataframe from parquet::

    import dask.dataframe as dd

    # By default, we obtain a pandas-backed dataframe
    df = dd.read_parquet("data.parquet", ...)


To obtain a cuDF-backed dataframe, we must set the
``dataframe.backend`` configuration option::

    import dask
    import dask.dataframe as dd

    dask.config.set({"dataframe.backend": "cudf"})
    # This gives us a cuDF-backed dataframe
    df = dd.read_parquet("data.parquet", ...)

This code will use cuDF's GPU-accelerated :func:`parquet reader
<cudf.read_parquet>` to read partitions of the data.

Dataframe creation from in-memory formats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you already have a dataframe in memory and want to convert it to a
cuDF-backed one, there are two options, depending on whether the
dataframe is already a Dask one. If you have a Dask dataframe, you can
call :func:`dask.dataframe.to_backend` passing ``"cudf"`` as the
backend; if you have a pandas dataframe, you can either call
:func:`dask.dataframe.from_pandas` followed by
:func:`~dask.dataframe.to_backend`, or first convert the dataframe
with :func:`cudf.from_pandas` and then parallelise it with
:func:`dask_cudf.from_cudf`.
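
Both routes are sketched below, with a small made-up pandas dataframe::

    import cudf
    import dask.dataframe as dd
    import dask_cudf
    import pandas as pd

    pdf = pd.DataFrame({"a": [1, 2, 3]})

    # Option 1: parallelise with Dask first, then switch backends
    ddf = dd.from_pandas(pdf, npartitions=2).to_backend("cudf")

    # Option 2: convert to cuDF first, then parallelise
    gdf = dask_cudf.from_cudf(cudf.from_pandas(pdf), npartitions=2)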

API Reference
-------------

Generally speaking, Dask-cuDF tries to offer exactly the same API as
Dask itself. There are, however, some minor differences, mostly
because cuDF does not :doc:`perfectly mirror <cudf:user_guide/PandasCompat>`
the pandas API, or because cuDF provides additional configuration
flags (these mostly occur in the data reading and writing interfaces).

As a result, straightforward workflows can be migrated without too
much trouble, but more complex ones that utilise more features may
need a bit of tweaking. The API documentation describes details of the
differences and all functionality that Dask-cuDF supports.

.. toctree::
   :maxdepth: 2

   api


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`