diff --git a/.gitignore b/.gitignore index b398cfc4f88..aee3d072de2 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,7 @@ dask-worker-space/ # protobuf **/*_pb2.py + +# Sphinx docs & build artifacts +docs/cudf/source/api_docs/generated/* +docs/cudf/source/api_docs/api/* \ No newline at end of file diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 70bbe88a00c..692ebe71794 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 6d2abdda449..ce82b870e16 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/docs/cudf/source/_static/RAPIDS-logo-purple.png b/docs/cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 00000000000..d884e01374d Binary files /dev/null and b/docs/cudf/source/_static/RAPIDS-logo-purple.png differ diff --git a/docs/cudf/source/_static/copybutton_pydocs.js b/docs/cudf/source/_static/copybutton_pydocs.js deleted file mode 100644 index cec05777e6b..00000000000 --- a/docs/cudf/source/_static/copybutton_pydocs.js +++ /dev/null @@ -1,65 +0,0 @@ -$(document).ready(function() { - /* Add a [>>>] 
button on the top-right corner of code samples to hide - * the >>> and ... prompts and the output and thus make the code - * copyable. */ - var div = $('.highlight-python .highlight,' + - '.highlight-python3 .highlight,' + - '.highlight-pycon .highlight,' + - '.highlight-default .highlight'); - var pre = div.find('pre'); - - // get the styles from the current theme - pre.parent().parent().css('position', 'relative'); - var hide_text = 'Hide the prompts and output'; - var show_text = 'Show the prompts and output'; - var border_width = pre.css('border-top-width'); - var border_style = pre.css('border-top-style'); - var border_color = pre.css('border-top-color'); - var button_styles = { - 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', - 'border-color': border_color, 'border-style': border_style, - 'border-width': border_width, 'text-size': '75%', - 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '1.5em', - 'border-radius': '0 3px 0 0', - 'transition': "0.5s" - } - - // create and add the button to all the code blocks that contain >>> - div.each(function(index) { - var jthis = $(this); - if (jthis.find('.gp').length > 0) { - var button = $('>>>'); - button.css(button_styles) - button.attr('title', hide_text); - button.data('hidden', 'false'); - jthis.prepend(button); - } - // tracebacks (.gt) contain bare text elements that need to be - // wrapped in a span to work with .nextUntil() (see later) - jthis.find('pre:has(.gt)').contents().filter(function() { - return ((this.nodeType == 3) && (this.data.trim().length > 0)); - }).wrap(''); - }); - - // define the behavior of the button when it's clicked - $('.copybutton').click(function(e){ - e.preventDefault(); - var button = $(this); - if (button.data('hidden') === 'false') { - // hide the code output - button.parent().find('.go, .gp, .gt').hide(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); - button.css('text-decoration', 'line-through'); 
- button.attr('title', show_text); - button.data('hidden', 'true'); - } else { - // show the code output - button.parent().find('.go, .gp, .gt').show(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); - button.css('text-decoration', 'none'); - button.attr('title', hide_text); - button.data('hidden', 'false'); - } - }); -}); - diff --git a/docs/cudf/source/_static/params.css b/docs/cudf/source/_static/params.css index 475b9dfb4ec..2bdd6f5a299 100644 --- a/docs/cudf/source/_static/params.css +++ b/docs/cudf/source/_static/params.css @@ -8,14 +8,6 @@ content: ":"; } -.highlight:hover span#strike_button { - color:#767676; -} - -span#strike_button { - color :#d0ced7; -} - /* Fix for text wrap in sphinx tables: * https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html */ @@ -40,3 +32,24 @@ table.io-supported-types-table { table.io-supported-types-table thead{ text-align: center !important; } + +:root { + + --pst-color-active-navigation: 114, 83, 237; + --pst-color-navbar-link: 77, 77, 77; + --pst-color-navbar-link-hover: var(--pst-color-active-navigation); + --pst-color-navbar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-link: 77, 77, 77; + --pst-color-sidebar-link-hover: var(--pst-color-active-navigation); + --pst-color-sidebar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-expander-background-hover: 244, 244, 244; + --pst-color-sidebar-caption: 77, 77, 77; + --pst-color-toc-link: 119, 117, 122; + --pst-color-toc-link-hover: var(--pst-color-active-navigation); + --pst-color-toc-link-active: var(--pst-color-active-navigation); + +} + +.special-table td, .special-table th { + border: 1px solid #dee2e6; +} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst new file mode 100644 index 00000000000..f86822bc567 --- /dev/null +++ 
b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst @@ -0,0 +1,33 @@ +{% extends "!autosummary/class.rst" %} + +{% block methods %} +{% if methods %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_methods %} + {%- if not item.startswith('_') or item in ['__call__'] %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} + +{% block attributes %} +{% if attributes %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_attributes %} + {%- if not item.startswith('_') %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst new file mode 100644 index 00000000000..b57a7ceebb0 --- /dev/null +++ b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} \ No newline at end of file diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst deleted file mode 100644 index d3042be2129..00000000000 --- a/docs/cudf/source/api.rst +++ /dev/null @@ -1,270 +0,0 @@ -~~~~~~~~~~~~~~~~~~~ -cuDF API Reference -~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: cudf.core.dataframe - -DataFrame ---------- -.. autoclass:: DataFrame - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, itertuples, iterrows - -Series ------- -.. currentmodule:: cudf.core.series - -.. 
autoclass:: Series - :members: - :inherited-members: - :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list - -Lists ------ -.. currentmodule:: cudf.core.column.lists - -.. autoclass:: ListMethods - :members: - -Strings -------- -.. currentmodule:: cudf.core.column.string - -.. autoclass:: StringMethods - :members: - -General Functions ------------------ -.. automodule:: cudf.core.reshape - :members: -.. autofunction:: cudf.to_datetime -.. autofunction:: cudf.to_numeric - -Index ------ -.. currentmodule:: cudf.core.index -.. autoclass:: Index - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -RangeIndex ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: RangeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -GenericIndex ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: GenericIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -MultiIndex ----------- -.. currentmodule:: cudf.core.multiindex -.. autoclass:: MultiIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int8Index ---------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int8Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int16Index ----------- -.. currentmodule:: cudf.core.index -.. 
autoclass:: Int16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int32Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int64Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt8Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: UInt8Index - :inherited-members: - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt16Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt32Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt64Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float32Index ------------- -.. currentmodule:: cudf.core.index -.. 
autoclass:: Float32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float64Index ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: Float64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -CategoricalIndex ----------------- -.. currentmodule:: cudf.core.index -.. autoclass:: CategoricalIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -StringIndex ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: StringIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -DatetimeIndex -------------- -.. currentmodule:: cudf.core.index -.. autoclass:: DatetimeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -TimedeltaIndex --------------- -.. currentmodule:: cudf.core.index -.. autoclass:: TimedeltaIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Categories ----------- -.. currentmodule:: cudf.core.column.categorical - -.. autoclass:: CategoricalAccessor - :members: - -GroupBy -------- -.. currentmodule:: cudf.core.groupby.groupby - -.. autoclass:: GroupBy - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize - -Window ------- -.. currentmodule:: cudf.core.window -.. autoclass:: Rolling - :members: - -SubwordTokenizer ----------------- -.. 
currentmodule:: cudf.core.subword_tokenizer - -.. autoclass:: SubwordTokenizer - :members: - :special-members: __call__ - -General utility functions -------------------------- -.. currentmodule:: cudf.testing - -.. automodule:: cudf.testing.testing - :members: - - -Timedelta Properties --------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: TimedeltaProperties - :members: - -Datetime Properties -------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: DatetimeProperties - :members: - -IO --- -.. currentmodule:: cudf.io - -.. automodule:: cudf.io.csv - :members: -.. automodule:: cudf.io.parquet - :members: -.. automodule:: cudf.io.orc - :members: -.. automodule:: cudf.io.json - :members: -.. automodule:: cudf.io.avro - :members: -.. automodule:: cudf.io.dlpack - :members: -.. automodule:: cudf.io.feather - :members: -.. automodule:: cudf.io.hdf - :members: - -Extending cuDF ----------------- -.. currentmodule:: cudf.api.extensions - -.. automodule:: cudf.api.extensions.accessor - :members: - -GpuArrowReader --------------- -.. currentmodule:: cudf.comm.gpuarrow -.. autoclass:: GpuArrowReader - :members: - :exclude-members: count, index diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst new file mode 100644 index 00000000000..12ff1f13bc4 --- /dev/null +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -0,0 +1,254 @@ +========= +DataFrame +========= +.. currentmodule:: cudf + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + DataFrame + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + +.. autosummary:: + :toctree: api/ + + DataFrame.index + DataFrame.columns + +.. autosummary:: + :toctree: api/ + + DataFrame.dtypes + DataFrame.info + DataFrame.select_dtypes + DataFrame.values + DataFrame.ndim + DataFrame.size + DataFrame.shape + DataFrame.memory_usage + DataFrame.empty + +Conversion +~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.astype + DataFrame.copy + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.head + DataFrame.at + DataFrame.iat + DataFrame.loc + DataFrame.iloc + DataFrame.insert + DataFrame.__iter__ + DataFrame.iteritems + DataFrame.keys + DataFrame.iterrows + DataFrame.itertuples + DataFrame.pop + DataFrame.tail + DataFrame.isin + DataFrame.where + DataFrame.mask + DataFrame.query + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.add + DataFrame.sub + DataFrame.mul + DataFrame.div + DataFrame.truediv + DataFrame.floordiv + DataFrame.mod + DataFrame.pow + DataFrame.radd + DataFrame.rsub + DataFrame.rmul + DataFrame.rdiv + DataFrame.rtruediv + DataFrame.rfloordiv + DataFrame.rmod + DataFrame.rpow + +Function application, GroupBy & window +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.apply + DataFrame.apply_chunks + DataFrame.apply_rows + DataFrame.pipe + DataFrame.agg + DataFrame.groupby + DataFrame.rolling + +.. _api.dataframe.stats: + +Computations / descriptive stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.all + DataFrame.any + DataFrame.clip + DataFrame.corr + DataFrame.count + DataFrame.cov + DataFrame.cummax + DataFrame.cummin + DataFrame.cumprod + DataFrame.cumsum + DataFrame.describe + DataFrame.kurt + DataFrame.kurtosis + DataFrame.max + DataFrame.mean + DataFrame.min + DataFrame.mode + DataFrame.prod + DataFrame.product + DataFrame.quantile + DataFrame.quantiles + DataFrame.rank + DataFrame.round + DataFrame.skew + DataFrame.sum + DataFrame.std + DataFrame.var + +Reindexing / selection / label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.drop + DataFrame.drop_duplicates + DataFrame.equals + DataFrame.head + DataFrame.reindex + DataFrame.rename + DataFrame.reset_index + DataFrame.sample + DataFrame.searchsorted + DataFrame.set_index + DataFrame.repeat + DataFrame.tail + DataFrame.take + DataFrame.tile + +.. _api.dataframe.missing: + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.dropna + DataFrame.fillna + DataFrame.isna + DataFrame.isnull + DataFrame.nans_to_nulls + DataFrame.notna + DataFrame.notnull + DataFrame.replace + +Reshaping, sorting, transposing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.argsort + DataFrame.interleave_columns + DataFrame.partition_by_hash + DataFrame.pivot + DataFrame.scatter_by_map + DataFrame.sort_values + DataFrame.sort_index + DataFrame.nlargest + DataFrame.nsmallest + DataFrame.stack + DataFrame.unstack + DataFrame.melt + DataFrame.explode + DataFrame.T + DataFrame.transpose + +Combining / comparing / joining / merging / encoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.append + DataFrame.assign + DataFrame.join + DataFrame.merge + DataFrame.update + DataFrame.label_encoding + DataFrame.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.acos + DataFrame.asin + DataFrame.atan + DataFrame.cos + DataFrame.exp + DataFrame.log + DataFrame.sin + DataFrame.sqrt + DataFrame.tan + +Time Series-related +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.shift + +Serialization / IO / conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.as_gpu_matrix + DataFrame.as_matrix + DataFrame.from_arrow + DataFrame.from_pandas + DataFrame.from_records + DataFrame.hash_columns + DataFrame.to_arrow + DataFrame.to_dlpack + DataFrame.to_parquet + DataFrame.to_csv + DataFrame.to_hdf + DataFrame.to_dict + DataFrame.to_json + DataFrame.to_pandas + DataFrame.to_feather + DataFrame.to_records + DataFrame.to_string diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst new file mode 100644 index 00000000000..226ae8acd32 --- /dev/null +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -0,0 +1,32 @@ +================= +General Functions +================= +.. currentmodule:: cudf + +Data manipulations +------------------ + +.. autosummary:: + :toctree: api/ + + cudf.concat + cudf.melt + cudf.get_dummies + cudf.merge_sorted + cudf.pivot + cudf.unstack + +Top-level conversions +--------------------- +.. autosummary:: + :toctree: api/ + + cudf.to_numeric + +Top-level dealing with datetimelike +----------------------------------- + +.. autosummary:: + :toctree: api/ + + cudf.to_datetime diff --git a/docs/cudf/source/api_docs/general_utilities.rst b/docs/cudf/source/api_docs/general_utilities.rst new file mode 100644 index 00000000000..d9c53c3fbbd --- /dev/null +++ b/docs/cudf/source/api_docs/general_utilities.rst @@ -0,0 +1,13 @@ +================= +General Utilities +================= + +Testing functions +----------------- +.. autosummary:: + :toctree: api/ + + cudf.testing.testing.assert_column_equal + cudf.testing.testing.assert_frame_equal + cudf.testing.testing.assert_index_equal + cudf.testing.testing.assert_series_equal diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst new file mode 100644 index 00000000000..27a314fa425 --- /dev/null +++ b/docs/cudf/source/api_docs/groupby.rst @@ -0,0 +1,96 @@ +.. _api.groupby: + +======= +GroupBy +======= +.. 
currentmodule:: cudf.core.groupby + +GroupBy objects are returned by groupby calls: :func:`cudf.DataFrame.groupby`, :func:`cudf.Series.groupby`, etc. + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.__iter__ + GroupBy.groups + +.. currentmodule:: cudf + +.. autosummary:: + :toctree: api/ + + Grouper + +.. currentmodule:: cudf.core.groupby.groupby + +Function application +-------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.apply + GroupBy.agg + SeriesGroupBy.aggregate + DataFrameGroupBy.aggregate + GroupBy.pipe + +Computations / descriptive stats +-------------------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.bfill + GroupBy.backfill + GroupBy.count + GroupBy.cumcount + GroupBy.cummax + GroupBy.cummin + GroupBy.cumsum + GroupBy.ffill + GroupBy.max + GroupBy.mean + GroupBy.median + GroupBy.min + GroupBy.nth + GroupBy.pad + GroupBy.prod + GroupBy.size + GroupBy.std + GroupBy.sum + GroupBy.var + +The following methods are available in both ``SeriesGroupBy`` and +``DataFrameGroupBy`` objects, but may differ slightly, usually in that +the ``DataFrameGroupBy`` version usually permits the specification of an +axis argument, and often an argument indicating whether to restrict +application to columns of a specific data type. + +.. autosummary:: + :toctree: api/ + + DataFrameGroupBy.backfill + DataFrameGroupBy.bfill + DataFrameGroupBy.count + DataFrameGroupBy.cumcount + DataFrameGroupBy.cummax + DataFrameGroupBy.cummin + DataFrameGroupBy.cumsum + DataFrameGroupBy.describe + DataFrameGroupBy.ffill + DataFrameGroupBy.fillna + DataFrameGroupBy.idxmax + DataFrameGroupBy.idxmin + DataFrameGroupBy.nunique + DataFrameGroupBy.pad + DataFrameGroupBy.quantile + DataFrameGroupBy.shift + DataFrameGroupBy.size + +The following methods are available only for ``SeriesGroupBy`` objects. + +.. 
autosummary:: + :toctree: api/ + + SeriesGroupBy.nunique + SeriesGroupBy.unique diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst new file mode 100644 index 00000000000..70b9563fc1d --- /dev/null +++ b/docs/cudf/source/api_docs/index.rst @@ -0,0 +1,19 @@ +============= +API reference +============= + +This page provides a list of all publicly accessible modules, methods and classes through +the ``cudf.*`` namespace. + +.. toctree:: + :maxdepth: 2 + :caption: API Documentation + + series + dataframe + index_objects + groupby + general_functions + general_utilities + window + diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst new file mode 100644 index 00000000000..c23c9a3f6c1 --- /dev/null +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -0,0 +1,296 @@ +============= +Index objects +============= + +Index +----- +.. currentmodule:: cudf + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/DataFrame) and those should most likely be +used before calling these methods directly.** + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Index + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.empty + Index.gpu_values + Index.is_monotonic + Index.is_monotonic_increasing + Index.is_monotonic_decreasing + Index.is_unique + Index.name + Index.names + Index.ndim + Index.nlevels + Index.shape + Index.size + Index.values + + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.any + Index.copy + Index.drop_duplicates + Index.equals + Index.factorize + Index.min + Index.max + Index.rename + Index.repeat + Index.where + Index.take + Index.unique + +Compatibility with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.set_names + +Missing values +~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + Index.fillna + Index.dropna + Index.isna + Index.notna + +Memory usage +~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.memory_usage + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.astype + Index.to_list + Index.to_series + Index.to_frame + +Sorting +~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.argsort + Index.searchsorted + Index.sort_values + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.shift + +Combining / joining / set operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.append + Index.join + Index.difference + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.get_level_values + Index.get_loc + Index.get_slice_bound + Index.isin + +.. _api.numericindex: + +Numeric Index +------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + RangeIndex + Int64Index + UInt64Index + Float64Index + + +.. _api.categoricalindex: + +CategoricalIndex +---------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CategoricalIndex + +Categorical components +~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.codes + CategoricalIndex.categories + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.equals + +.. _api.intervalindex: + +IntervalIndex +------------- +.. autosummary:: + :toctree: api/ + + IntervalIndex + +IntervalIndex components +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + IntervalIndex.from_breaks + IntervalIndex.values + IntervalIndex.get_loc + +.. _api.multiindex: + +MultiIndex +---------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + MultiIndex + + +MultiIndex constructors +~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + MultiIndex.from_tuples + MultiIndex.from_product + MultiIndex.from_frame + MultiIndex.from_arrow + +MultiIndex properties +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.names + MultiIndex.levels + MultiIndex.codes + MultiIndex.nlevels + +MultiIndex components +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.to_frame + MultiIndex.droplevel + +MultiIndex selecting +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.get_loc + MultiIndex.get_level_values + +.. _api.datetimeindex: + +DatetimeIndex +------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + DatetimeIndex + +Time/date components +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.year + DatetimeIndex.month + DatetimeIndex.day + DatetimeIndex.hour + DatetimeIndex.minute + DatetimeIndex.second + DatetimeIndex.dayofweek + DatetimeIndex.weekday + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.round + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.to_series + DatetimeIndex.to_frame + +TimedeltaIndex +-------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + TimedeltaIndex + +Components +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.days + TimedeltaIndex.seconds + TimedeltaIndex.microseconds + TimedeltaIndex.nanoseconds + TimedeltaIndex.components + TimedeltaIndex.inferred_freq + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.to_series + TimedeltaIndex.round + TimedeltaIndex.to_frame diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst new file mode 100644 index 00000000000..ffa809268f3 --- /dev/null +++ b/docs/cudf/source/api_docs/series.rst @@ -0,0 +1,478 @@ +====== +Series +====== +.. 
currentmodule:: cudf + +Constructor +----------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Series + +Attributes +---------- +**Axes** + +.. autosummary:: + :toctree: api/ + + Series.index + Series.values + Series.data + Series.dtype + Series.shape + Series.ndim + Series.nullable + Series.nullmask + Series.null_count + Series.size + Series.memory_usage + Series.has_nulls + Series.empty + Series.name + Series.valid_count + Series.values_host + +Conversion +---------- +.. autosummary:: + :toctree: api/ + + Series.astype + Series.copy + Series.to_list + Series.__array__ + Series.as_index + Series.as_mask + Series.scale + + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + Series.loc + Series.iloc + Series.__iter__ + Series.items + Series.iteritems + Series.keys + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +------------------------- +.. autosummary:: + :toctree: api/ + + Series.add + Series.sub + Series.subtract + Series.mul + Series.multiply + Series.truediv + Series.floordiv + Series.mod + Series.pow + Series.radd + Series.rsub + Series.rmul + Series.rtruediv + Series.rfloordiv + Series.rmod + Series.rpow + Series.round + Series.lt + Series.gt + Series.le + Series.ge + Series.ne + Series.eq + Series.product + +Function application, GroupBy & window +-------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.applymap + Series.map + Series.groupby + Series.rolling + Series.pipe + +.. _api.series.stats: + +Computations / descriptive stats +-------------------------------- +.. 
autosummary:: + :toctree: api/ + + Series.abs + Series.all + Series.any + Series.ceil + Series.clip + Series.corr + Series.count + Series.cov + Series.cummax + Series.cummin + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.digitize + Series.factorize + Series.floor + Series.kurt + Series.max + Series.mean + Series.median + Series.min + Series.mode + Series.nlargest + Series.nsmallest + Series.prod + Series.quantile + Series.rank + Series.skew + Series.std + Series.sum + Series.var + Series.kurtosis + Series.unique + Series.nunique + Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing + Series.value_counts + +Reindexing / selection / label manipulation +------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.drop + Series.drop_duplicates + Series.equals + Series.head + Series.isin + Series.reindex + Series.rename + Series.reset_index + Series.reverse + Series.sample + Series.set_index + Series.set_mask + Series.take + Series.tail + Series.tile + Series.where + Series.mask + +Missing data handling +--------------------- +.. autosummary:: + :toctree: api/ + + Series.dropna + Series.fillna + Series.isna + Series.isnull + Series.nans_to_nulls + Series.notna + Series.notnull + Series.replace + +Reshaping, sorting +------------------ +.. autosummary:: + :toctree: api/ + + Series.argsort + Series.interleave_columns + Series.sort_values + Series.sort_index + Series.explode + Series.scatter_by_map + Series.searchsorted + Series.repeat + +Combining / comparing / joining / merging / encoding +---------------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.append + Series.update + Series.label_encoding + Series.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + Series.acos + Series.asin + Series.atan + Series.cos + Series.exp + Series.log + Series.sin + Series.sqrt + Series.tan + +Time Series-related +------------------- +.. autosummary:: + :toctree: api/ + + Series.shift + +Accessors +--------- + +cuDF provides dtype-specific methods under various accessors. +These are separate namespaces within :class:`Series` that only apply +to specific data types. + +=========================== ================================= +Data Type Accessor +=========================== ================================= +Datetime, Timedelta :ref:`dt <api.series.dt>` +String :ref:`str <api.series.str>` +Categorical :ref:`cat <api.series.cat>` +List :ref:`list <api.series.list>` +=========================== ================================= + +.. _api.series.dt: + +Datetimelike properties +~~~~~~~~~~~~~~~~~~~~~~~ + +``Series.dt`` can be used to access the values of the series as +datetimelike and return several properties. +These can be accessed like ``Series.dt.<property>``. + +Datetime properties +^^^^^^^^^^^^^^^^^^^ +.. currentmodule:: cudf.core.series.DatetimeProperties + +.. autosummary:: + :toctree: api/ + + day + dayofweek + hour + minute + month + second + weekday + year + +Datetime methods +^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: api/ + + strftime + + +Timedelta properties +^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: cudf.core.series.TimedeltaProperties +.. autosummary:: + :toctree: api/ + + components + days + microseconds + nanoseconds + seconds + + +.. _api.series.str: + +String handling +~~~~~~~~~~~~~~~ + +``Series.str`` can be used to access the values of the series as +strings and apply several methods to it. These can be accessed like +``Series.str.<function/property>``. + +.. currentmodule:: cudf.core.column.string.StringMethods +.. 
autosummary:: + :toctree: api/ + + byte_count + capitalize + cat + center + character_ngrams + character_tokenize + code_points + contains + count + detokenize + edit_distance + endswith + extract + filter_alphanum + filter_characters + filter_tokens + find + findall + get + get_json_object + htoi + index + insert + ip2int + is_consonant + is_vowel + isalnum + isalpha + isdecimal + isdigit + isempty + isfloat + ishex + isinteger + isipv4 + isspace + islower + isnumeric + isupper + istimestamp + join + len + ljust + lower + lstrip + match + ngrams + ngrams_tokenize + normalize_characters + pad + partition + porter_stemmer_measure + replace + replace_tokens + replace_with_backrefs + rfind + rindex + rjust + rpartition + rstrip + slice + slice_from + slice_replace + split + rsplit + startswith + strip + subword_tokenize + swapcase + title + token_count + tokenize + translate + upper + url_decode + url_encode + wrap + zfill + + + +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series/Index class page) + +.. + .. currentmodule:: cudf + .. autosummary:: + :toctree: api/ + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + Index.str + +.. _api.series.cat: + +Categorical accessor +~~~~~~~~~~~~~~~~~~~~ + +Categorical-dtype specific methods and attributes are available under +the ``Series.cat`` accessor. + +.. currentmodule:: cudf.core.column.categorical.CategoricalAccessor +.. autosummary:: + :toctree: api/ + + categories + ordered + codes + reorder_categories + add_categories + remove_categories + set_categories + as_ordered + as_unordered + + +.. _api.series.list: + +List handling +~~~~~~~~~~~~~ + +``Series.list`` can be used to access the values of the series as +lists and apply list methods to it. These can be accessed like +``Series.list.<function/property>``. + +.. currentmodule:: cudf.core.column.lists.ListMethods +.. 
autosummary:: + :toctree: api/ + + concat + contains + get + len + sort_values + take + unique + + +Serialization / IO / conversion +------------------------------- +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.to_array + Series.to_arrow + Series.to_dlpack + Series.to_frame + Series.to_gpu_array + Series.to_hdf + Series.to_json + Series.to_pandas + Series.to_string + Series.from_arrow + Series.from_categorical + Series.from_masked_array + Series.from_pandas + Series.hash_encode + Series.hash_values + \ No newline at end of file diff --git a/docs/cudf/source/api_docs/window.rst b/docs/cudf/source/api_docs/window.rst new file mode 100644 index 00000000000..9f94f620949 --- /dev/null +++ b/docs/cudf/source/api_docs/window.rst @@ -0,0 +1,24 @@ +.. _api.window: + +====== +Window +====== + +Rolling objects are returned by ``.rolling`` calls: :func:`cudf.DataFrame.rolling`, :func:`cudf.Series.rolling`, etc. + +.. _api.functions_rolling: + +Rolling window functions +------------------------ +.. currentmodule:: cudf.core.window.rolling + +.. autosummary:: + :toctree: api/ + + Rolling.count + Rolling.sum + Rolling.mean + Rolling.min + Rolling.max + Rolling.apply + diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst deleted file mode 100644 index 15b4b43662b..00000000000 --- a/docs/cudf/source/basics.rst +++ /dev/null @@ -1,54 +0,0 @@ -Basics -====== - - -Supported Dtypes ----------------- - -cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, -``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, -``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). - - -The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. 
See the respective documentation sections for more on each type. - - -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Kind of Data | Data Type | Scalar | String Aliases | -+========================+==================+=====================================================================================+=============================================+ -| Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | -| | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | -| | | | ``'uint32'``, ``'uint64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Strings | | `str `_ | ``'string'``, ``'object'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | -| | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| -| (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| 
-+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Categorical | CategoricalDtype | (none) | ``'category'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Boolean | | np.bool_ | ``'bool'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Decimal | Decimal64Dtype | (none) | (none) | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - -**Note: All dtypes above are Nullable** - -.. _np.int8: -.. _np.int16: -.. _np.int32: -.. _np.int64: -.. _np.uint8: -.. _np.uint16: -.. _np.uint32: -.. _np.uint64: -.. _np.float32: -.. _np.float64: -.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html -.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes -.. _np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/PandasCompat.rst b/docs/cudf/source/basics/PandasCompat.rst similarity index 100% rename from docs/cudf/source/PandasCompat.rst rename to docs/cudf/source/basics/PandasCompat.rst diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst new file mode 100644 index 00000000000..ee63f67daa2 --- /dev/null +++ b/docs/cudf/source/basics/basics.rst @@ -0,0 +1,56 @@ +Basics +====== + + +Supported Dtypes +---------------- + +cuDF uses dtypes for Series or individual columns of a DataFrame. 
cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, +``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, +``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). + + +The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. + +.. rst-class:: special-table +.. table:: + + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +========================+==================+=====================================================================================+=============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, 
``'datetime64[ns]'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Decimal | Decimal64Dtype | (none) | (none) | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +**Note: All dtypes above are Nullable** + +.. _np.int8: +.. _np.int16: +.. _np.int32: +.. _np.int64: +.. _np.uint8: +.. _np.uint16: +.. _np.uint32: +.. _np.uint64: +.. _np.float32: +.. _np.float64: +.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html +.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes +.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/dask-cudf.rst b/docs/cudf/source/basics/dask-cudf.rst similarity index 100% rename from docs/cudf/source/dask-cudf.rst rename to docs/cudf/source/basics/dask-cudf.rst diff --git a/docs/cudf/source/groupby.rst b/docs/cudf/source/basics/groupby.rst similarity index 51% rename from docs/cudf/source/groupby.rst rename to docs/cudf/source/basics/groupby.rst index a6ce9db6817..04c4d42fa2a 100644 --- a/docs/cudf/source/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -131,41 +131,44 @@ Aggregations on groups is supported via the ``agg`` method: The following table summarizes the available aggregations and the types that support them: -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | -+====================================+===========+============+==========+===============+========+==========+============+===========+ -| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| sum | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmin | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmax | ✅ | ✅ | | | | | | ✅ | 
-+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| min | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| max | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| mean | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| var | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| std | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| quantile | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| median | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nth | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| unique | ✅ | ✅ | ✅ | ✅ | | | | | 
-+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ +.. rst-class:: special-table +.. table:: + + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | + +====================================+===========+============+==========+===============+========+==========+============+===========+ + | count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | sum | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmin | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmax | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | min | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | max | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | mean | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | var | ✅ | ✅ | | | | | | | + 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | std | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | quantile | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | median | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nth | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | unique | ✅ | ✅ | ✅ | ✅ | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/docs/cudf/source/basics/index.rst b/docs/cudf/source/basics/index.rst new file mode 100644 index 00000000000..a29866d7e32 --- /dev/null +++ b/docs/cudf/source/basics/index.rst @@ -0,0 +1,15 @@ +====== +Basics +====== + + +.. 
toctree:: + :maxdepth: 2 + + basics + io.rst + groupby.rst + PandasCompat.rst + dask-cudf.rst + internals.rst + \ No newline at end of file diff --git a/docs/cudf/source/internals.rst b/docs/cudf/source/basics/internals.rst similarity index 100% rename from docs/cudf/source/internals.rst rename to docs/cudf/source/basics/internals.rst diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst similarity index 100% rename from docs/cudf/source/io-gds-integration.rst rename to docs/cudf/source/basics/io-gds-integration.rst diff --git a/docs/cudf/source/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst similarity index 99% rename from docs/cudf/source/io-supported-types.rst rename to docs/cudf/source/basics/io-supported-types.rst index 739c1634ca7..78c1bfb6554 100644 --- a/docs/cudf/source/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -3,7 +3,7 @@ I/O Supported dtypes The following table lists are compatible cudf types for each supported IO format. -.. rst-class:: io-supported-types-table +.. rst-class:: io-supported-types-table special-table .. 
table:: :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/basics/io.rst similarity index 100% rename from docs/cudf/source/io.rst rename to docs/cudf/source/basics/io.rst diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c764b64da60..c5f1233d022 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -24,7 +24,10 @@ from docutils.nodes import Text from sphinx.addnodes import pending_xref +import cudf +sys.path.insert(0, os.path.abspath(cudf.__path__[0])) +sys.path.insert(0, os.path.abspath(".")) sys.path.insert(0, os.path.abspath("../..")) sys.path.append(os.path.abspath("./_ext")) @@ -43,7 +46,6 @@ "sphinx.ext.autosummary", "sphinx_copybutton", "numpydoc", - "sphinx_markdown_tables", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", "nbsphinx", @@ -51,9 +53,11 @@ ] copybutton_prompt_text = ">>> " - +autosummary_generate = True ipython_mplbackend = "str" +html_use_modindex = True + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -61,7 +65,7 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +source_suffix = {".rst": "restructuredtext"} # The master toctree document. master_doc = "index" @@ -90,21 +94,30 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] +exclude_patterns = ['venv', "**/includes/**",] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} include_pandas_compat = True -# -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" - +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" # on_rtd is whether we are on readthedocs.org on_rtd = os.environ.get("READTHEDOCS", None) == "True" @@ -112,10 +125,10 @@ # only import and set the theme if we're building docs locally # otherwise, readthedocs.org uses their theme by default, # so no need to specify it - import sphinx_rtd_theme + import pydata_sphinx_theme - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + html_theme = "pydata_sphinx_theme" + html_theme_path = pydata_sphinx_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a theme @@ -201,8 +214,9 @@ # Config numpydoc numpydoc_show_inherited_class_members = True numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False -autoclass_content = "init" +autoclass_content = "class" # Replace API shorthands with fullname _reftarget_aliases = { @@ -234,10 +248,27 @@ def ignore_internal_references(app, env, node, contnode): node["reftarget"] = "" return contnode +def process_class_docstrings(app, what, name, obj, options, lines): + """ + For those classes for which we use :: + :template: autosummary/class_without_autosummary.rst + the documented attributes/methods have to be listed in the class + docstring. However, if one of those lists is empty, we use 'None', + which then generates warnings in sphinx / ugly html output. 
+ This "autodoc-process-docstring" event connector removes that part + from the processed docstring. + """ + if what == "class": + if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: + + cut_index = lines.index('.. rubric:: Attributes') + lines[:] = lines[:cut_index] + + + def setup(app): - app.add_js_file("copybutton_pydocs.js") app.add_css_file("params.css") - app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", ignore_internal_references) + app.connect("autodoc-process-docstring", process_class_docstrings) diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 5a6d9a2617d..90b287bd1b6 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -1,25 +1,25 @@ Welcome to cuDF's documentation! ================================= +cuDF is a Python GPU DataFrame library (built on the `Apache Arrow +`_ columnar memory format) for loading, joining, +aggregating, filtering, and otherwise manipulating data. cuDF also provides a +pandas-like API that will be familiar to data engineers & data scientists, so +they can use it to easily accelerate their workflows without going into +the details of CUDA programming. + + .. 
toctree:: :maxdepth: 2 :caption: Contents: - api.rst - 10min.ipynb - basics.rst - io.rst - groupby.rst - dask-cudf.rst - 10min-cudf-cupy.ipynb - guide-to-udfs.ipynb - internals.rst - Working-with-missing-data.ipynb - PandasCompat.rst + user_guide/index + basics/index + api_docs/index + Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` diff --git a/docs/cudf/source/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb similarity index 100% rename from docs/cudf/source/10min-cudf-cupy.ipynb rename to docs/cudf/source/user_guide/10min-cudf-cupy.ipynb diff --git a/docs/cudf/source/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb similarity index 100% rename from docs/cudf/source/10min.ipynb rename to docs/cudf/source/user_guide/10min.ipynb diff --git a/docs/cudf/source/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb similarity index 100% rename from docs/cudf/source/Working-with-missing-data.ipynb rename to docs/cudf/source/user_guide/Working-with-missing-data.ipynb diff --git a/docs/cudf/source/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb similarity index 100% rename from docs/cudf/source/guide-to-udfs.ipynb rename to docs/cudf/source/user_guide/guide-to-udfs.ipynb diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst new file mode 100644 index 00000000000..1061008eb3c --- /dev/null +++ b/docs/cudf/source/user_guide/index.rst @@ -0,0 +1,12 @@ +========== +User Guide +========== + + +.. 
toctree:: + :maxdepth: 2 + + 10min.ipynb + 10min-cudf-cupy.ipynb + guide-to-udfs.ipynb + Working-with-missing-data.ipynb diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2d52b517242..13c20d8bcd4 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -31,6 +31,7 @@ IntervalIndex, MultiIndex, RangeIndex, + StringIndex, Scalar, Series, TimedeltaIndex, @@ -73,7 +74,14 @@ tan, true_divide, ) -from cudf.core.reshape import concat, get_dummies, melt, merge_sorted +from cudf.core.reshape import ( + concat, + get_dummies, + melt, + merge_sorted, + pivot, + unstack, +) from cudf.core.series import isclose from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 1d9c8fa58e6..a15a180d466 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -359,7 +359,7 @@ def read_csv( See Also -------- - cudf.io.csv.read_csv + cudf.read_csv """ if not isinstance(datasource, (BytesIO, StringIO, bytes, @@ -429,7 +429,7 @@ cpdef write_csv( See Also -------- - cudf.io.csv.to_csv + cudf.to_csv """ cdef table_view input_table_view = \ table.view() if index is True else table.data_view() diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index e15b569ed85..b888f213921 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -84,7 +84,7 @@ cpdef read_orc(object filepaths_or_buffers, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( filepaths_or_buffers, @@ -142,7 +142,7 @@ cpdef write_orc(Table table, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) cdef table_metadata metadata_ = table_metadata() diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py 
index 5eaa5b52fd4..016aba2edb3 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -17,6 +17,7 @@ Int64Index, IntervalIndex, RangeIndex, + StringIndex, TimedeltaIndex, UInt8Index, UInt16Index, diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 9f26ac8ee78..38b6f8789bb 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -35,7 +35,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): See Also -------- - cudf.core.series.Series.factorize : Encode the input values of Series. + cudf.Series.factorize : Encode the input values of Series. """ if sort: diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index c6875052685..8d80e488e2e 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -15,6 +15,23 @@ class Buffer(Serializable): + """ + A Buffer represents a device memory allocation. + + Parameters + ---------- + data : Buffer, array_like, int + An array-like object or integer representing a + device or host pointer to pre-allocated memory. + size : int, optional + Size of memory allocation. Required if a pointer + is passed for `data`. + owner : object, optional + Python object to which the lifetime of the memory + allocation is tied. If provided, a reference to this + object is kept in this Buffer. + """ + ptr: int size: int _owner: Any @@ -22,22 +39,7 @@ class Buffer(Serializable): def __init__( self, data: Any = None, size: Optional[int] = None, owner: Any = None ): - """ - A Buffer represents a device memory allocation. - - Parameters - ---------- - data : Buffer, array_like, int - An array-like object or integer representing a - device or host pointer to pre-allocated memory. - size : int, optional - Size of memory allocation. Required if a pointer - is passed for `data`. - owner : object, optional - Python object to which the lifetime of the memory - allocation is tied. 
If provided, a reference to this - object is kept in this Buffer. - """ + if isinstance(data, Buffer): self.ptr = data.ptr self.size = data.size diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 48398e03b2d..f435e0fa88c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -49,62 +49,63 @@ class CategoricalAccessor(ColumnMethods): + """ + Accessor object for categorical properties of the Series values. + Be aware that assigning to `categories` is a inplace operation, + while all methods return new categorical data per default. + + Parameters + ---------- + column : Column + parent : Series or CategoricalIndex + + Examples + -------- + >>> s = cudf.Series([1,2,3], dtype='category') + >>> s + >>> s + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + >>> s.cat.categories + Int64Index([1, 2, 3], dtype='int64') + >>> s.cat.reorder_categories([3,2,1]) + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [3, 2, 1] + >>> s.cat.remove_categories([1]) + 0 + 1 2 + 2 3 + dtype: category + Categories (2, int64): [2, 3] + >>> s.cat.set_categories(list('abcde')) + 0 + 1 + 2 + dtype: category + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] + >>> s.cat.as_ordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1 < 2 < 3] + >>> s.cat.as_unordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + """ + _column: CategoricalColumn def __init__(self, parent: SeriesOrIndex): - """ - Accessor object for categorical properties of the Series values. - Be aware that assigning to `categories` is a inplace operation, - while all methods return new categorical data per default. 
- - Parameters - ---------- - column : Column - parent : Series or CategoricalIndex - - Examples - -------- - >>> s = cudf.Series([1,2,3], dtype='category') - >>> s - >>> s - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') - >>> s.cat.reorder_categories([3,2,1]) - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [3, 2, 1] - >>> s.cat.remove_categories([1]) - 0 - 1 2 - 2 3 - dtype: category - Categories (2, int64): [2, 3] - >>> s.cat.set_categories(list('abcde')) - 0 - 1 - 2 - dtype: category - Categories (5, object): ['a', 'b', 'c', 'd', 'e'] - >>> s.cat.as_ordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1 < 2 < 3] - >>> s.cat.as_unordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - """ if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" @@ -648,7 +649,19 @@ def reorder_categories( class CategoricalColumn(column.ColumnBase): - """Implements operations for Columns of Categorical type + """ + Implements operations for Columns of Categorical type + + Parameters + ---------- + dtype : CategoricalDtype + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[ColumnBase] + Two non-null columns containing the categories and codes + respectively """ dtype: cudf.core.dtypes.CategoricalDtype @@ -664,18 +677,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - dtype : CategoricalDtype - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[ColumnBase] - Two non-null columns containing the categories and codes - respectively - """ + if size is None: for child in children: assert child.offset == 0 diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f3d1880b290..623d0e43f5d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -52,6 +52,19 @@ class DatetimeColumn(column.ColumnBase): + """ + A Column implementation for Date-time types. + + Parameters + ---------- + data : Buffer + The datetime values + dtype : np.dtype + The data type + mask : Buffer; optional + The validity mask + """ + def __init__( self, data: Buffer, @@ -61,16 +74,7 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The datetime values - dtype : np.dtype - The data type - mask : Buffer; optional - The validity mask - """ + dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a3f4a82a7dc..29211b0f855 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -36,6 +36,17 @@ class NumericalColumn(NumericalBaseColumn): + """ + A Column object for Numeric types. 
+ + Parameters + ---------- + data : Buffer + dtype : np.dtype + The dtype associated with the data Buffer + mask : Buffer, optional + """ + def __init__( self, data: Buffer, @@ -45,14 +56,6 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - dtype : np.dtype - The dtype associated with the data Buffer - mask : Buffer, optional - """ dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7d6afbb4056..50cd6c764cd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -98,17 +98,18 @@ def str_to_boolean(column: StringColumn): class StringMethods(ColumnMethods): + """ + Vectorized string functions for Series and Index. + + This mimics pandas ``df.str`` interface. nulls stay null + unless handled otherwise by a particular method. + Patterned after Python’s string methods, with some + inspiration from R’s stringr package. + """ + _column: StringColumn def __init__(self, parent): - """ - Vectorized string functions for Series and Index. - - This mimics pandas ``df.str`` interface. nulls stay null - unless handled otherwise by a particular method. - Patterned after Python’s string methods, with some - inspiration from R’s stringr package. 
- """ value_type = ( parent.dtype.leaf_type if is_list_dtype(parent.dtype) @@ -2555,7 +2556,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -2622,7 +2623,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -3294,7 +3295,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: This is also available on Index. - >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat']) + >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 @@ -4922,7 +4923,18 @@ def _expected_types_format(types): class StringColumn(column.ColumnBase): - """Implements operations for Columns of String type + """ + Implements operations for Columns of String type + + Parameters + ---------- + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[Column] + Two non-null columns containing the string data and offsets + respectively """ _start_offset: Optional[int] @@ -4937,17 +4949,6 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively - """ dtype = np.dtype("object") if size is None: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a27c20cc50c..b73353dd720 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -34,6 +34,24 @@ class TimeDeltaColumn(column.ColumnBase): + """ + Parameters + ---------- + data : Buffer + The Timedelta values + dtype : np.dtype + The data type + size : int + Size of memory allocation. + mask : Buffer; optional + The validity mask + offset : int + Data offset + null_count : int, optional + The number of null values. + If None, it is calculated automatically. + """ + def __init__( self, data: Buffer, @@ -43,23 +61,6 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The Timedelta values - dtype : np.dtype - The data type - size : int - Size of memory allocation. - mask : Buffer; optional - The validity mask - offset : int - Data offset - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - """ dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 607b8ac307b..56882f89af8 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -80,6 +80,19 @@ def _to_flat_dict(d): class ColumnAccessor(MutableMapping): + """ + Parameters + ---------- + data : mapping + Mapping of keys to column values. + multiindex : bool, optional + Whether tuple keys represent a hierarchical + index with multiple "levels" (default=False). 
+ level_names : tuple, optional + Tuple containing names for each of the levels. + For a non-hierarchical index, a tuple of size 1 + may be passed. + """ _data: "Dict[Any, ColumnBase]" multiindex: bool @@ -91,19 +104,6 @@ def __init__( multiindex: bool = False, level_names=None, ): - """ - Parameters - ---------- - data : mapping - Mapping of keys to column values. - multiindex : bool, optional - Whether tuple keys represent a hierarchical - index with multiple "levels" (default=False). - level_names : tuple, optional - Tuple containing names for each of the levels. - For a non-hierarchical index, a tuple of size 1 - may be passe. - """ if data is None: data = {} # TODO: we should validate the keys of `data` diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8cdc6eebaee..6c5932e600b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -69,100 +69,101 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): + """ + A GPU Dataframe object. - _PROTECTED_KEYS = frozenset(("_data", "_index")) - - @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") - def __init__(self, data=None, index=None, columns=None, dtype=None): - """ - A GPU Dataframe object. + Parameters + ---------- + data : array-like, Iterable, dict, or DataFrame. + Dict can contain Series, arrays, constants, or list-like objects. - Parameters - ---------- - data : array-like, Iterable, dict, or DataFrame. - Dict can contain Series, arrays, constants, or list-like objects. + index : Index or array-like + Index to use for resulting frame. Will default to + RangeIndex if no indexing information part of input data and + no index provided. - index : Index or array-like - Index to use for resulting frame. Will default to - RangeIndex if no indexing information part of input data and - no index provided. + columns : Index or array-like + Column labels to use for resulting frame.
+ Will default to RangeIndex (0, 1, 2, …, n) if no column + labels are provided. - columns : Index or array-like - Column labels to use for resulting frame. - Will default to RangeIndex (0, 1, 2, …, n) if no column - labels are provided. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. + If None, infer. - dtype : dtype, default None - Data type to force. Only a single dtype is allowed. - If None, infer. - - Examples - -------- + Examples + -------- - Build dataframe with ``__setitem__``: + Build dataframe with ``__setitem__``: - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df - key val - 0 0 10.0 - 1 1 11.0 - 2 2 12.0 - 3 3 13.0 - 4 4 14.0 + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df + key val + 0 0 10.0 + 1 1 11.0 + 2 2 12.0 + 3 3 13.0 + 4 4 14.0 + + Build DataFrame via dict of columns: + + >>> import numpy as np + >>> from datetime import datetime, timedelta + >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') + >>> n = 5 + >>> df = cudf.DataFrame({ + ... 'id': np.arange(n), + ... 'datetimes': np.array( + ... [(t0+ timedelta(seconds=x)) for x in range(n)]) + ... }) + >>> df + id datetimes + 0 0 2018-10-07T12:00:00.000 + 1 1 2018-10-07T12:00:01.000 + 2 2 2018-10-07T12:00:02.000 + 3 3 2018-10-07T12:00:03.000 + 4 4 2018-10-07T12:00:04.000 + + Build DataFrame via list of rows as tuples: + + >>> df = cudf.DataFrame([ + ... (5, "cats", "jump", np.nan), + ... (2, "dogs", "dig", 7.5), + ... (3, "cows", "moo", -2.1, "occasionally"), + ... 
]) + >>> df + 0 1 2 3 4 + 0 5 cats jump + 1 2 dogs dig 7.5 + 2 3 cows moo -2.1 occasionally + + Convert from a Pandas DataFrame: - Build DataFrame via dict of columns: + >>> import pandas as pd + >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) + >>> pdf + a b + 0 0 0.1 + 1 1 0.2 + 2 2 NaN + 3 3 0.3 + >>> df = cudf.from_pandas(pdf) + >>> df + a b + 0 0 0.1 + 1 1 0.2 + 2 2 + 3 3 0.3 + """ - >>> import numpy as np - >>> from datetime import datetime, timedelta - >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') - >>> n = 5 - >>> df = cudf.DataFrame({ - ... 'id': np.arange(n), - ... 'datetimes': np.array( - ... [(t0+ timedelta(seconds=x)) for x in range(n)]) - ... }) - >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 - - Build DataFrame via list of rows as tuples: - - >>> df = cudf.DataFrame([ - ... (5, "cats", "jump", np.nan), - ... (2, "dogs", "dig", 7.5), - ... (3, "cows", "moo", -2.1, "occasionally"), - ... 
]) - >>> df - 0 1 2 3 4 - 0 5 cats jump - 1 2 dogs dig 7.5 - 2 3 cows moo -2.1 occasionally + _PROTECTED_KEYS = frozenset(("_data", "_index")) - Convert from a Pandas DataFrame: + @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") + def __init__(self, data=None, index=None, columns=None, dtype=None): - >>> import pandas as pd - >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) - >>> pdf - a b - 0 0 0.1 - 1 1 0.2 - 2 2 NaN - 3 3 0.3 - >>> df = cudf.from_pandas(pdf) - >>> df - a b - 0 0 0.1 - 1 1 0.2 - 2 2 - 3 3 0.3 - """ super().__init__() if isinstance(columns, (Series, cudf.BaseIndex)): @@ -3462,7 +3463,7 @@ def rename( if index: if ( any(type(item) == str for item in index.values()) - and type(self.index) != cudf.core.index.StringIndex + and type(self.index) != cudf.StringIndex ): raise NotImplementedError( "Implicit conversion of index to " @@ -4455,6 +4456,7 @@ def join( ) return df + @copy_docstring(DataFrameGroupBy) def groupby( self, by=None, @@ -4499,6 +4501,7 @@ def groupby( sort=sort, ) + @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -6482,9 +6485,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): See Also -------- - cudf.core.series.Series.mode : Return the highest frequency value + cudf.Series.mode : Return the highest frequency value in a Series. - cudf.core.series.Series.value_counts : Return the counts of values + cudf.Series.value_counts : Return the counts of values in a Series. Notes diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6dbe55d0bb8..2d8bf9e5a2c 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -27,14 +27,14 @@ class _BaseDtype(ExtensionDtype, Serializable): class CategoricalDtype(_BaseDtype): + """ + dtype similar to pd.CategoricalDtype with the categories + stored on the GPU. 
+ """ ordered: Optional[bool] def __init__(self, categories=None, ordered: bool = None) -> None: - """ - dtype similar to pd.CategoricalDtype with the categories - stored on the GPU. - """ self._categories = self._init_categories(categories) self.ordered = ordered @@ -223,14 +223,14 @@ def deserialize(cls, header: dict, frames: list): class StructDtype(_BaseDtype): + """ + fields : dict + A mapping of field names to dtypes + """ name = "struct" def __init__(self, fields): - """ - fields : dict - A mapping of field names to dtypes - """ pa_fields = { k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) for k, v in fields.items() @@ -309,34 +309,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal32Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal32Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ name = "decimal32" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal32Dtype. See Notes below. 
- - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -417,34 +417,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal64Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ name = "decimal64" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal64Dtype. See Notes below. 
- - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -525,16 +525,17 @@ def deserialize(cls, header: dict, frames: list): class IntervalDtype(StructDtype): + """ + subtype: str, np.dtype + The dtype of the Interval bounds. + closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ + Whether the interval is closed on the left-side, right-side, + both or neither. See the Notes for more detailed explanation. + """ + name = "interval" def __init__(self, subtype, closed="right"): - """ - subtype: str, np.dtype - The dtype of the Interval bounds. - closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ - Whether the interval is closed on the left-side, right-side, - both or neither. See the Notes for more detailed explanation. - """ super().__init__(fields={"left": subtype, "right": subtype}) if closed in ["left", "right", "neither", "both"]: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6a976f54c2b..daa42d994ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1116,19 +1116,19 @@ def dropna( See also -------- - cudf.core.dataframe.DataFrame.isna + cudf.DataFrame.isna Indicate null values. - cudf.core.dataframe.DataFrame.notna + cudf.DataFrame.notna Indicate non-null values. - cudf.core.dataframe.DataFrame.fillna + cudf.DataFrame.fillna Replace null values. 
- cudf.core.series.Series.dropna + cudf.Series.dropna Drop null values. - cudf.core.index.Index.dropna + cudf.Index.dropna Drop null indices. Examples @@ -4191,6 +4191,12 @@ def shape(self): return (len(self),) def __iter__(self): + """ + Iterating over a GPU object is not efficient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ cudf.utils.utils.raise_iteration_error(obj=self) def __len__(self): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4b063e7e57c..5b009984cf7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -352,10 +352,10 @@ def pipe(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.pipe + cudf.Series.pipe Apply a function with arguments to a series. - cudf.core.dataframe.DataFrame.pipe + cudf.DataFrame.pipe Apply a function with arguments to a dataframe. apply @@ -1017,93 +1017,93 @@ def _mimic_pandas_order( class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + """ + Group DataFrame using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If a cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key.
+ level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + dropna : bool, optional + If True (default), do not include the "null" group. + + Returns + ------- + DataFrameGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + >>> df.groupby(['Animal']).mean() + Max Speed + Animal + Falcon 375.0 + Parrot 25.0 + + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... ['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, + index=index) + >>> df + Max Speed + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + >>> df.groupby(level=0).mean() + Max Speed + Animal + Falcon 370.0 + Parrot 25.0 + >>> df.groupby(level="Type").mean() + Max Speed + Type + Wild 185.0 + Captive 210.0 + """ + _PROTECTED_KEYS = frozenset(("obj",)) def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group DataFrame using a mapper or by a Series of columns. 
- - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If a cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - dropna : bool, optional - If True (default), do not include the "null" group. - - Returns - ------- - DataFrameGroupBy - Returns a groupby object that contains information - about the groups. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 
'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level="Type").mean() - Max Speed - Type - Wild 185.0 - Captive 210.0 - - """ super().__init__( obj=obj, by=by, @@ -1126,68 +1126,68 @@ def nunique(self): class SeriesGroupBy(GroupBy): + """ + Group Series using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If an cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key. + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. 
Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + + Returns + ------- + SeriesGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> ser = cudf.Series([390., 350., 30., 20.], + ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Parrot 30.0 + Parrot 20.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() + Falcon 370.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(ser > 100).mean() + Max Speed + False 25.0 + True 370.0 + Name: Max Speed, dtype: float64 + """ + def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group Series using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If an cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. 
- level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - - Returns - ------- - SeriesGroupBy - Returns a groupby object that contains information - about the groups. - - Examples - -------- - >>> ser = cudf.Series([390., 350., 30., 20.], - ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... name="Max Speed") - >>> ser - Falcon 390.0 - Falcon 350.0 - Parrot 30.0 - Parrot 20.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(level=0).mean() - Falcon 370.0 - Parrot 25.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(ser > 100).mean() - Max Speed - False 25.0 - True 370.0 - Name: Max Speed, dtype: float64 - - """ super().__init__( obj=obj, by=by, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a2f13daf44c..b3ca6f7973b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -326,7 +326,7 @@ def set_names(self, names, level=None, inplace=False): See Also -------- - cudf.core.index.Index.rename : Able to set new names without level. + cudf.Index.rename : Able to set new names without level. Examples -------- @@ -717,8 +717,8 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): See Also -------- - cudf.core.series.Series.min : Sort values of a Series. - cudf.core.dataframe.DataFrame.sort_values : Sort values in a DataFrame. + cudf.Series.sort_values : Sort values of a Series. + cudf.DataFrame.sort_values : Sort values in a DataFrame.
Examples -------- @@ -1287,9 +1287,9 @@ def from_pandas(cls, index, nan_as_null=None): >>> import numpy as np >>> data = [10, 20, 30, np.nan] >>> pdi = pd.Index(data) - >>> cudf.core.index.Index.from_pandas(pdi) + >>> cudf.Index.from_pandas(pdi) Float64Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.core.index.Index.from_pandas(pdi, nan_as_null=False) + >>> cudf.Index.from_pandas(pdi, nan_as_null=False) Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if not isinstance(index, pd.Index): @@ -1709,25 +1709,25 @@ def __mul__(self, other): class GenericIndex(BaseIndex): - """An array of orderable values that represent the indices of another Column + """ + An array of orderable values that represent the indices of another Column Attributes ---------- _values: A Column object name: A string + + Parameters + ---------- + data : Column + The Column of data for this index + name : str optional + The name of the Index. If not provided, the Index adopts the value + Column's name. Otherwise if this name is different from the value + Column's, the data Column will be cloned to adopt this name. """ def __init__(self, data, **kwargs): - """ - Parameters - ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. - """ kwargs = _setdefault_name(data, **kwargs) # normalize the input @@ -1933,42 +1933,252 @@ def __init__(self, data=None, dtype=None, copy=False, name=None): class Int8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int8Index is a special case of Index with purely + integer(``int8``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. 
+ name : object + Name to be stored in the index. + + Returns + ------- + Int8Index + """ + _dtype = np.int8 class Int16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int16Index is a special case of Index with purely + integer(``int16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int16Index + """ + _dtype = np.int16 class Int32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int32Index is a special case of Index with purely + integer(``int32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int32Index + """ + _dtype = np.int32 class Int64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int64Index is a special case of Index with purely + integer(``int64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int64Index + """ + _dtype = np.int64 class UInt8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt8Index is a special case of Index with purely + integer(``uint8``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data.
+ name : object + Name to be stored in the index. + + Returns + ------- + UInt8Index + """ + _dtype = np.uint8 class UInt16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt16Index is a special case of Index with purely + integer(``uint16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt16Index + """ + _dtype = np.uint16 class UInt32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt32Index is a special case of Index with purely + integer(``uint32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt32Index + """ + _dtype = np.uint32 class UInt64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt64Index is a special case of Index with purely + integer(``uint64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt64Index + """ + _dtype = np.uint64 class Float32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float32Index is a special case of Index with purely + float(``float32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. 
+ name : object + Name to be stored in the index. + + Returns + ------- + Float32Index + """ + _dtype = np.float32 class Float64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float64Index is a special case of Index with purely + float(``float64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Float64Index + """ + _dtype = np.float64 @@ -2419,6 +2629,13 @@ def components(self): @property def inferred_freq(self): + """ + Infers frequency of TimedeltaIndex. + + Notes + ----- + This property is currently not supported. + """ raise NotImplementedError("inferred_freq is not yet supported") @@ -2724,7 +2941,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): Construct an IntervalIndex from an array of splits. Parameters - --------- + ---------- breaks : array-like (1-dimensional) Left and right bounds for each interval. closed : {"left", "right", "both", "neither"}, default "right" @@ -2804,7 +3021,7 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 26a893a4676..cdc80b6ef32 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -609,6 +609,30 @@ def to_arrow(self): @property def codes(self): + """ + Returns the codes of the underlying MultiIndex. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.codes + a b + 0 0 0 + 1 1 1 + 2 2 2 + """ if self._codes is None: self._compute_levels_and_codes() return self._codes @@ -622,6 +646,37 @@ def nlevels(self): @property def levels(self): + """ + Returns list of levels in the MultiIndex + + Returns + ------- + List of Series objects + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.levels + [0 1 + 1 2 + 2 3 + dtype: int64, 0 10 + 1 11 + 2 12 + dtype: int64] + """ if self._levels is None: self._compute_levels_and_codes() return self._levels @@ -1123,6 +1178,37 @@ def _concat(cls, objs): @classmethod def from_tuples(cls, tuples, names=None): + """ + Convert list of tuples to MultiIndex. + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... 
(2, 'red'), (2, 'blue')] + >>> cudf.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) result = cls.from_pandas(pdi) @@ -1187,11 +1273,97 @@ def values(self): return self._source_data.values @classmethod - def from_frame(cls, dataframe, names=None): - return cls(source_data=dataframe, names=names) + def from_frame(cls, df, names=None): + """ + Make a MultiIndex from a DataFrame. + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> cudf.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + return cls(source_data=df, names=names) @classmethod def from_product(cls, arrays, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables. 
+ + Parameters + ---------- + arrays : list / sequence of iterables + Each iterable has unique labels for each level of the index. + names : list / sequence of str, optional + Names for the levels in the index. + If not explicitly provided, names will be inferred from the + elements of ``arrays`` if an element has a name attribute. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> cudf.MultiIndex.from_product([numbers, colors], + ... names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_product(arrays, names=names) result = cls.from_pandas(pdi) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 54571ebb31d..1b8405af1a4 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -803,9 +803,9 @@ def _pivot(df, index, columns): Parameters ---------- df : DataFrame - index : cudf.core.index.Index + index : cudf.Index Index labels of the result - columns : cudf.core.index.Index + columns : cudf.Index Column labels of the result """ columns_labels, columns_idx = columns._encode() diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index c6663a25684..4403a58dd30 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -17,45 +17,46 @@ class Scalar(object): + """ + A GPU-backed scalar object with NumPy scalar like properties + May be used in binary operations against other scalars, cuDF + Series, DataFrame, and Index objects.
+ + Examples + -------- + >>> import cudf + >>> cudf.Scalar(42, dtype='int64') + Scalar(42, dtype=int64) + >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') + Scalar(84.0, dtype=float64) + >>> cudf.Scalar(42, dtype='int64') + np.int8(21) + Scalar(63, dtype=int64) + >>> x = cudf.Scalar(42, dtype='datetime64[s]') + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') + >>> x - y + Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) + >>> cudf.Series([1,2,3]) + cudf.Scalar(1) + 0 2 + 1 3 + 2 4 + dtype: int64 + >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) + >>> slr = cudf.Scalar(10, dtype='uint8') + >>> df - slr + a b + 0 -9 -5.5 + 1 -8 -4.5 + 2 -7 -3.5 + + Parameters + ---------- + value : Python Scalar, NumPy Scalar, or cuDF Scalar + The scalar value to be converted to a GPU backed scalar object + dtype : np.dtype or string specifier + The data type + """ + def __init__(self, value, dtype=None): - """ - A GPU-backed scalar object with NumPy scalar like properties - May be used in binary operations against other scalars, cuDF - Series, DataFrame, and Index objects.
- - Examples - -------- - >>> import cudf - >>> cudf.Scalar(42, dtype='int64') - Scalar(42, dtype=int64) - >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') - Scalar(84.0, dtype=float64) - >>> cudf.Scalar(42, dtype='int64') + np.int8(21) - Scalar(63, dtype=int64) - >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) - >>> x - y - Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) - >>> cudf.Series([1,2,3]) + cudf.Scalar(1) - 0 2 - 1 3 - 2 4 - dtype: int64 - >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) - >>> slr = cudf.Scalar(10, dtype='uint8') - >>> df - slr - a b - 0 -9 -5.5 - 1 -8 -4.5 - 2 -7 -3.5 - - Parameters - ---------- - value : Python Scalar, NumPy Scalar, or cuDF Scalar - The scalar value to be converted to a GPU backed scalar object - dtype : np.dtype or string specifier - The data type - """ self._host_value = None self._host_dtype = None diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cb7a82bd4c8..e7a58be62b5 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -64,6 +64,48 @@ class Series(SingleColumnFrame, Serializable): + """ + One-dimensional GPU array (including time series). + + Labels need not be unique but must be a hashable type. The object + supports both integer- and label-based indexing and provides a + host of methods for performing operations involving the index. + Statistical methods from ndarray have been overridden to + automatically exclude missing data (currently represented + as null/NaN). + + Operations between Series (`+`, `-`, `/`, `*`, `**`) align + values based on their associated index values-– they need + not be the same length. The result index will be the + sorted union of the two indexes. + + ``Series`` objects are used as columns of ``DataFrame``. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. 
+ + index : array-like or Index (1d) + Values must be hashable and have the same length + as data. Non-unique index values are allowed. Will + default to RangeIndex (0, 1, 2, …, n) if not provided. + If both a dict and index sequence are used, the index will + override the keys found in the dict. + + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, + this will be inferred from data. + + name : str, optional + The name to give to the Series. + + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + """ + # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -171,47 +213,6 @@ def from_masked_array(cls, data, mask, null_count=None): def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): - """ - One-dimensional GPU array (including time series). - - Labels need not be unique but must be a hashable type. The object - supports both integer- and label-based indexing and provides a - host of methods for performing operations involving the index. - Statistical methods from ndarray have been overridden to - automatically exclude missing data (currently represented - as null/NaN). - - Operations between Series (`+`, `-`, `/`, `*`, `**`) align - values based on their associated index values-– they need - not be the same length. The result index will be the - sorted union of the two indexes. - - ``Series`` objects are used as columns of ``DataFrame``. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - index : array-like or Index (1d) - Values must be hashable and have the same length - as data. Non-unique index values are allowed. Will - default to RangeIndex (0, 1, 2, …, n) if not provided. 
- If both a dict and index sequence are used, the index will - override the keys found in the dict. - - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Series. If not specified, - this will be inferred from data. - - name : str, optional - The name to give to the Series. - - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - """ if isinstance(data, pd.Series): if name is None: name = data.name @@ -457,7 +458,7 @@ def drop( Return series without null values Series.drop_duplicates Return series with duplicate values removed - cudf.core.dataframe.DataFrame.drop + cudf.DataFrame.drop Drop specified labels from rows or columns in dataframe Examples @@ -879,7 +880,7 @@ def memory_usage(self, index=True, deep=False): See Also -------- - cudf.core.dataframe.DataFrame.memory_usage : Bytes consumed by + cudf.DataFrame.memory_usage : Bytes consumed by a DataFrame. Examples @@ -2344,22 +2345,22 @@ def __invert__(self): f"Operation `~` not supported on {self.dtype.type.__name__}" ) - @copy_docstring(CategoricalAccessor.__init__) # type: ignore + @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): return CategoricalAccessor(parent=self) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) - @copy_docstring(ListMethods.__init__) # type: ignore + @copy_docstring(ListMethods) # type: ignore @property def list(self): return ListMethods(parent=self) - @copy_docstring(StructMethods.__init__) # type: ignore + @copy_docstring(StructMethods) # type: ignore @property def struct(self): return StructMethods(parent=self) @@ -2503,10 +2504,10 @@ def dropna(self, axis=0, inplace=False, how=None): Series.fillna : Replace null values. 
- cudf.core.dataframe.DataFrame.dropna : Drop rows or columns which + cudf.DataFrame.dropna : Drop rows or columns which contain null values. - cudf.core.index.Index.dropna : Drop null indices. + cudf.Index.dropna : Drop null indices. Examples -------- @@ -2845,7 +2846,7 @@ def loc(self): See also -------- - cudf.core.dataframe.DataFrame.loc + cudf.DataFrame.loc Examples -------- @@ -2868,7 +2869,7 @@ def iloc(self): See also -------- - cudf.core.dataframe.DataFrame.iloc + cudf.DataFrame.iloc Examples -------- @@ -4609,7 +4610,7 @@ def value_counts( Series.count Number of non-NA elements in a Series. - cudf.core.dataframe.DataFrame.count + cudf.DataFrame.count Number of non-NA elements in a DataFrame. Examples @@ -5216,7 +5217,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) - @copy_docstring(SeriesGroupBy.__init__) + @copy_docstring(SeriesGroupBy) def groupby( self, by=None, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 00f60cfc8b5..181fa64240e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -345,6 +345,66 @@ def get_units(value): class DateOffset: + """ + An object used for binary ops where calendrical arithmetic + is desired rather than absolute time arithmetic. Used to + add or subtract a whole number of periods, such as several + months or years, to a series or index of datetime dtype. + Works similarly to pd.DateOffset, but stores the offset + on the device (GPU). + + Parameters + ---------- + n : int, default 1 + The number of time periods the offset represents. + **kwds + Temporal parameter that add to or replace the offset value. 
+ Parameters that **add** to the offset (like Timedelta): + - months + + See Also + -------- + pandas.DateOffset : The equivalent Pandas object that this + object replicates + + Examples + -------- + >>> from cudf import DateOffset + >>> ts = cudf.Series([ + ... "2000-01-01 00:00:00.012345678", + ... "2000-01-31 00:00:00.012345678", + ... "2000-02-29 00:00:00.012345678", + ... ], dtype='datetime64[ns]') + >>> ts + DateOffset(months=3) + 0 2000-04-01 00:00:00.012345678 + 1 2000-04-30 00:00:00.012345678 + 2 2000-05-29 00:00:00.012345678 + dtype: datetime64[ns] + >>> ts - DateOffset(months=12) + 0 1999-01-01 00:00:00.012345678 + 1 1999-01-31 00:00:00.012345678 + 2 1999-02-28 00:00:00.012345678 + dtype: datetime64[ns] + + Notes + ----- + Note that cuDF does not yet support DateOffset arguments + that 'replace' units in the datetime data being operated on + such as + - year + - month + - week + - day + - hour + - minute + - second + - microsecond + - millisecond + - nanosecond + + cuDF does not yet support rounding via a `normalize` + keyword argument. + """ _UNITS_TO_CODES = { "nanoseconds": "ns", @@ -362,66 +422,6 @@ class DateOffset: _CODES_TO_UNITS = {v: k for k, v in _UNITS_TO_CODES.items()} def __init__(self, n=1, normalize=False, **kwds): - """ - An object used for binary ops where calendrical arithmetic - is desired rather than absolute time arithmetic. Used to - add or subtract a whole number of periods, such as several - months or years, to a series or index of datetime dtype. - Works similarly to pd.DateOffset, but stores the offset - on the device (GPU). - - Parameters - ---------- - n : int, default 1 - The number of time periods the offset represents. - **kwds - Temporal parameter that add to or replace the offset value.
- Parameters that **add** to the offset (like Timedelta): - - months - - See Also - -------- - pandas.DateOffset : The equivalent Pandas object that this - object replicates - - Examples - -------- - >>> from cudf import DateOffset - >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) - >>> ts + DateOffset(months=3) - 0 2000-04-01 00:00:00.012345678 - 1 2000-04-30 00:00:00.012345678 - 2 2000-05-29 00:00:00.012345678 - dtype: datetime64[ns] - >>> ts - DateOffset(months=12) - 0 1999-01-01 00:00:00.012345678 - 1 1999-01-31 00:00:00.012345678 - 2 1999-02-28 00:00:00.012345678 - dtype: datetime64[ns] - - Notes - ----- - Note that cuDF does not yet support DateOffset arguments - that 'replace' units in the datetime data being operated on - such as - - year - - month - - week - - day - - hour - - minute - - second - - microsecond - - millisecond - - nanosecond - - cuDF does not yet support rounding via a `normalize` - keyword argument. - """ if normalize: raise NotImplementedError( "normalize not yet supported for DateOffset" diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d9a2fd89165..d2f120a7bb9 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -258,12 +258,12 @@ def apply(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.applymap : Apply an elementwise function to + cudf.Series.applymap : Apply an elementwise function to transform the values in the Column. Notes ----- - See notes of the :meth:`cudf.core.series.Series.applymap` + See notes of the :meth:`cudf.Series.applymap` """ has_nulls = False @@ -353,14 +353,15 @@ def __repr__(self): class RollingGroupby(Rolling): - def __init__(self, groupby, window, min_periods=None, center=False): - """ - Grouped rolling window calculation. + """ + Grouped rolling window calculation. 
- See also - -------- - cudf.core.window.Rolling - """ + See also + -------- + cudf.core.window.Rolling + """ + + def __init__(self, groupby, window, min_periods=None, center=False): sort_order = groupby.grouping.keys.argsort() # TODO: there may be overlap between the columns diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 76d24dcd5d2..8744238a062 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3613,9 +3613,7 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = cudf.Series( - cudf.core.index.StringIndex(["1", "18", "9"]), dtype="int" - ) + gds = cudf.Series(cudf.StringIndex(["1", "18", "9"]), dtype="int") assert_eq(pds, gds) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index de7d8e35bce..7f402762730 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -985,7 +985,7 @@ def test_groupby_index_type(): df["string_col"] = ["a", "b", "c"] df["counts"] = [1, 2, 3] res = df.groupby(by="string_col").counts.sum() - assert isinstance(res.index, cudf.core.index.StringIndex) + assert isinstance(res.index, cudf.StringIndex) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 38b924006bf..f80bdec0ab5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -332,7 +332,7 @@ def test_index_copy_datetime(name, dtype, deep=True): @pytest.mark.parametrize("name", ["x"]) @pytest.mark.parametrize("dtype", ["category", "object"]) def test_index_copy_string(name, dtype, deep=True): - cidx = cudf.core.index.StringIndex(["a", "b", "c"]) + cidx = cudf.StringIndex(["a", "b", "c"]) pidx = cidx.to_pandas() pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) @@ -389,7 +389,7 @@ def test_index_copy_category(name, dtype, 
deep=True): "idx", [ cudf.DatetimeIndex(["2001", "2002", "2003"]), - cudf.core.index.StringIndex(["a", "b", "c"]), + cudf.StringIndex(["a", "b", "c"]), cudf.Int64Index([1, 2, 3]), cudf.Float64Index([1.0, 2.0, 3.0]), cudf.CategoricalIndex([1, 2, 3]), @@ -434,7 +434,7 @@ def test_index_copy_deep(idx, deep): idx._values.categories.base_data.ptr == idx_copy._values.categories.base_data.ptr ) == same_ref - elif isinstance(idx, cudf.core.index.StringIndex): + elif isinstance(idx, cudf.StringIndex): children = idx._values._base_children copy_children = idx_copy._values._base_children assert all( @@ -479,7 +479,7 @@ def test_rangeindex_slice_attr_name(): def test_from_pandas_str(): idx = ["a", "b", "c"] pidx = pd.Index(idx, name="idx") - gidx_1 = cudf.core.index.StringIndex(idx, name="idx") + gidx_1 = cudf.StringIndex(idx, name="idx") gidx_2 = cudf.from_pandas(pidx) assert_eq(gidx_1, gidx_2) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 6c3fdd4640a..d0b1ba0758e 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -507,8 +507,8 @@ def test_character_tokenize_index(): actual = sr.str.character_tokenize() assert_eq(expected, actual) - sr = cudf.core.index.as_index([""]) - expected = cudf.core.index.StringIndex([], dtype="object") + sr = cudf.Index([""]) + expected = cudf.StringIndex([], dtype="object") actual = sr.str.character_tokenize() assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 97a472fc132..1927ef96e6f 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -67,8 +67,8 @@ See Also -------- -cudf.io.csv.read_csv -cudf.io.json.read_json +cudf.read_csv +cudf.read_json """.format( remote_data_sources=_docstring_remote_sources ) @@ -175,7 +175,7 @@ -------- cudf.io.parquet.read_parquet_metadata cudf.io.parquet.to_parquet -cudf.io.orc.read_orc +cudf.read_orc """.format( 
remote_data_sources=_docstring_remote_sources ) @@ -217,7 +217,7 @@ See Also -------- cudf.io.parquet.read_parquet -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) @@ -276,7 +276,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_metadata = docfmt_partial(docstring=_docstring_read_orc_metadata) @@ -302,7 +302,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_statistics = docfmt_partial( docstring=_docstring_read_orc_statistics @@ -391,7 +391,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) @@ -693,7 +693,7 @@ See Also -------- -cudf.io.hdf.read_hdf : Read from HDF file. +cudf.read_hdf : Read from HDF file. cudf.io.parquet.to_parquet : Write a DataFrame to the binary parquet format. cudf.io.feather.to_feather : Write out feather-format for DataFrames. """ @@ -904,7 +904,7 @@ See Also -------- -cudf.io.csv.to_csv +cudf.to_csv """.format( remote_data_sources=_docstring_remote_sources ) @@ -969,7 +969,7 @@ See Also -------- -cudf.io.csv.read_csv +cudf.read_csv """ doc_to_csv = docfmt_partial( docstring=_docstring_to_csv.format( diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index efb22ddd5a4..f69c246832b 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index cb88f74399f..e55dc2f921a 100644 --- 
a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 53543b9e886..6fb5efbdf0f 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -51,8 +51,8 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.StringIndex): - return cudf.core.index.StringIndex(["cat", "dog"], name=idx.name) + elif isinstance(idx, cudf.StringIndex): + return cudf.StringIndex(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1