From 4116d8730ea4dca4f4f8d617feb5a29dfbaf9b7d Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Tue, 19 Apr 2022 15:26:31 -0400 Subject: [PATCH 01/14] Migrated user guide notebooks to MyST-NB and added sphinx extension --- conda/environments/cudf_dev_cuda11.5.yml | 5 + docs/cudf/source/conf.py | 5 +- .../source/user_guide/10min-cudf-cupy.ipynb | 1387 ---- .../cudf/source/user_guide/10min-cudf-cupy.md | 217 + docs/cudf/source/user_guide/10min.ipynb | 6640 ----------------- docs/cudf/source/user_guide/10min.md | 733 ++ .../Working-with-missing-data.ipynb | 3466 --------- .../user_guide/Working-with-missing-data.md | 489 ++ .../source/user_guide/guide-to-udfs.ipynb | 2313 ------ docs/cudf/source/user_guide/guide-to-udfs.md | 558 ++ docs/cudf/source/user_guide/index.rst | 8 +- 11 files changed, 2010 insertions(+), 13811 deletions(-) delete mode 100644 docs/cudf/source/user_guide/10min-cudf-cupy.ipynb create mode 100644 docs/cudf/source/user_guide/10min-cudf-cupy.md delete mode 100644 docs/cudf/source/user_guide/10min.ipynb create mode 100644 docs/cudf/source/user_guide/10min.md delete mode 100644 docs/cudf/source/user_guide/Working-with-missing-data.ipynb create mode 100644 docs/cudf/source/user_guide/Working-with-missing-data.md delete mode 100644 docs/cudf/source/user_guide/guide-to-udfs.ipynb create mode 100644 docs/cudf/source/user_guide/guide-to-udfs.md diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index bdde007e33e..a97f208d3fc 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -54,6 +54,11 @@ dependencies: - hypothesis - sphinx-markdown-tables - sphinx-copybutton + - sphinx-autobuild + - myst-nb + - jupytext + - scipy + - dask-cuda - mimesis<4.1 - packaging - protobuf diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index d65b77ef74b..c8b30120924 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -46,10 +46,13 @@ 
"numpydoc", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", - "nbsphinx", "PandasCompat", + "myst_nb", ] +jupyter_execute_notebooks = "force" +execution_timeout = 300 + copybutton_prompt_text = ">>> " autosummary_generate = True ipython_mplbackend = "str" diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb deleted file mode 100644 index 1bcb9335256..00000000000 --- a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb +++ /dev/null @@ -1,1387 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 10 Minutes to cuDF and CuPy\n", - "\n", - "This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import timeit\n", - "from packaging import version\n", - "\n", - "import cupy as cp\n", - "import cudf\n", - "\n", - "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", - " cupy_from_dlpack = cp.from_dlpack\n", - "else:\n", - " cupy_from_dlpack = cp.fromDlpack" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting a cuDF DataFrame to a CuPy Array\n", - "\n", - "If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it:\n", - "\n", - "1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface.\n", - "\n", - "2. We can also use `DataFrame.values`.\n", - "\n", - "3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `as_gpu_matrix` and CuPy's `asarray` functionality." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "183 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "553 µs ± 6.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", - "546 µs ± 2.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" - ] - } - ], - "source": [ - "nelem = 10000\n", - "df = cudf.DataFrame({'a':range(nelem),\n", - " 'b':range(500, nelem + 500),\n", - " 'c':range(1000, nelem + 1000)}\n", - " )\n", - "\n", - "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", - "%timeit arr_cupy = df.values\n", - "%timeit arr_cupy = df.to_cupy()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 0, 500, 1000],\n", - " [ 1, 501, 1001],\n", - " [ 2, 502, 1002],\n", - " ...,\n", - " [ 9997, 10497, 10997],\n", - " [ 9998, 10498, 10998],\n", - " [ 9999, 10499, 10999]])" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", - "arr_cupy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting a cuDF Series to a CuPy Array" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are also multiple ways to convert a cuDF Series to a CuPy array:\n", - "\n", - "1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html).\n", - "2. We can leverage the dlpack interface `to_dlpack()`. \n", - "3. We can also use `Series.values` \n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "76.8 µs ± 636 ns per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n", - "198 µs ± 2.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "181 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" - ] - } - ], - "source": [ - "col = 'a'\n", - "\n", - "%timeit cola_cupy = cp.asarray(df[col])\n", - "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", - "%timeit cola_cupy = df[col].values" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, ..., 9997, 9998, 9999])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cola_cupy = cp.asarray(df[col])\n", - "cola_cupy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 0, 1, 2, ..., 197, 198, 199],\n", - " [ 200, 201, 202, ..., 397, 398, 399],\n", - " [ 400, 401, 402, ..., 597, 598, 599],\n", - " ...,\n", - " [9400, 9401, 9402, ..., 9597, 9598, 9599],\n", - " [9600, 9601, 9602, ..., 9797, 9798, 9799],\n", - " [9800, 9801, 9802, ..., 9997, 9998, 9999]])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reshaped_arr = cola_cupy.reshape(50, 200)\n", - "reshaped_arr" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0, 201, 402, 603, 804, 1005, 1206, 1407, 1608, 1809, 2010,\n", - " 2211, 2412, 2613, 2814, 3015, 3216, 3417, 3618, 3819, 4020, 4221,\n", - " 4422, 4623, 4824, 5025, 5226, 5427, 5628, 5829, 6030, 6231, 6432,\n", - " 6633, 6834, 7035, 7236, 7437, 7638, 7839, 8040, 8241, 8442, 8643,\n", - " 8844, 9045, 9246, 9447, 9648, 
9849])" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reshaped_arr.diagonal()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(577306.967739)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cp.linalg.norm(reshaped_arr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting a CuPy Array to a cuDF DataFrame\n", - "\n", - "We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it:\n", - "\n", - "1. **Easiest;** We can directly use the `DataFrame` constructor.\n", - "\n", - "2. We can use CUDA array interface with the `DataFrame` constructor.\n", - "\n", - "3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface.\n", - "\n", - "For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23.9 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], - "source": [ - "%timeit reshaped_df = cudf.DataFrame(reshaped_arr)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", - "

5 rows × 200 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", - "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", - "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", - "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", - "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", - "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", - "\n", - " 194 195 196 197 198 199 \n", - "0 194 195 196 197 198 199 \n", - "1 394 395 396 397 398 399 \n", - "2 594 595 596 597 598 599 \n", - "3 794 795 796 797 798 799 \n", - "4 994 995 996 997 998 999 \n", - "\n", - "[5 rows x 200 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reshaped_df = cudf.DataFrame(reshaped_arr)\n", - "reshaped_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cp.isfortran(reshaped_arr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9.15 ms ± 131 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit\n", - "\n", - "fortran_arr = cp.asfortranarray(reshaped_arr)\n", - "reshaped_df = cudf.DataFrame(fortran_arr)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5.74 ms ± 29.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" - ] - } - ], - "source": [ - "%%timeit\n", - "\n", - "fortran_arr = cp.asfortranarray(reshaped_arr)\n", - "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", - "

5 rows × 200 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", - "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", - "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", - "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", - "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", - "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", - "\n", - " 194 195 196 197 198 199 \n", - "0 194 195 196 197 198 199 \n", - "1 394 395 396 397 398 399 \n", - "2 594 595 596 597 598 599 \n", - "3 794 795 796 797 798 799 \n", - "4 994 995 996 997 998 999 \n", - "\n", - "[5 rows x 200 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fortran_arr = cp.asfortranarray(reshaped_arr)\n", - "reshaped_df = cudf.DataFrame(fortran_arr)\n", - "reshaped_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting a CuPy Array to a cuDF Series\n", - "\n", - "To convert an array to a Series, we can directly pass the array to the `Series` constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 201\n", - "2 402\n", - "3 603\n", - "4 804\n", - "dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series(reshaped_arr.diagonal()).head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n", - "\n", - "RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. 
cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.\n", - "\n", - "By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", - "

5 rows × 200 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", - "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", - "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", - "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", - "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", - "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", - "\n", - " 194 195 196 197 198 199 \n", - "0 194 195 196 197 198 199 \n", - "1 394 395 396 397 398 399 \n", - "2 594 595 596 597 598 599 \n", - "3 794 795 796 797 798 799 \n", - "4 994 995 996 997 998 999 \n", - "\n", - "[5 rows x 200 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reshaped_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can just transform it into a CuPy array and use the `axis` argument of `sum`." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 19900, 59900, 99900, 139900, 179900, 219900, 259900,\n", - " 299900, 339900, 379900, 419900, 459900, 499900, 539900,\n", - " 579900, 619900, 659900, 699900, 739900, 779900, 819900,\n", - " 859900, 899900, 939900, 979900, 1019900, 1059900, 1099900,\n", - " 1139900, 1179900, 1219900, 1259900, 1299900, 1339900, 1379900,\n", - " 1419900, 1459900, 1499900, 1539900, 1579900, 1619900, 1659900,\n", - " 1699900, 1739900, 1779900, 1819900, 1859900, 1899900, 1939900,\n", - " 1979900])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_arr = cupy_from_dlpack(reshaped_df.to_dlpack())\n", - "new_arr.sum(axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", - "\n", - "We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input.\n", - "\n", - "The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "def cudf_to_cupy_sparse_matrix(data, sparseformat='column'):\n", - " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\n", - " \"\"\"\n", - " if sparseformat not in ('row', 'column',):\n", - " raise ValueError(\"Let's focus on column and row formats for now.\")\n", - " \n", - " _sparse_constructor = cp.sparse.csc_matrix\n", - " if sparseformat == 'row':\n", - " _sparse_constructor = cp.sparse.csr_matrix\n", - "\n", - " return _sparse_constructor(cp.from_dlpack(data.to_dlpack()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "df = cudf.DataFrame()\n", - "nelem = 10000\n", - "nonzero = 1000\n", - "for i in range(20):\n", - " arr = cp.random.normal(5, 5, nelem)\n", - " arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0\n", - " df['a' + str(i)] = arr" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
a0a1a2a3a4a5a6a7a8a9a10a11a12a13a14a15a16a17a18a19
00.0000000.00.00.0000000.09.374760.0000000.00.00.0000006.2378590.00.00.0000000.00.00.000000.00.00.000000
10.0000000.00.00.0000000.00.000000.0000000.00.00.0000000.0000000.00.00.0658780.00.012.357050.00.00.000000
23.2327510.00.00.0000000.00.000008.3419150.00.00.0000000.0000000.00.00.0000000.00.00.000000.00.03.110362
30.0000000.00.00.0000000.00.000000.0000000.00.00.0000000.0000000.00.00.0000000.00.00.000000.00.00.000000
40.0000000.00.07.7430240.00.000000.0000000.00.05.9870980.0000000.00.00.0000000.00.00.000000.00.00.000000
\n", - "
" - ], - "text/plain": [ - " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 \\\n", - "0 0.000000 0.0 0.0 0.000000 0.0 9.37476 0.000000 0.0 0.0 0.000000 \n", - "1 0.000000 0.0 0.0 0.000000 0.0 0.00000 0.000000 0.0 0.0 0.000000 \n", - "2 3.232751 0.0 0.0 0.000000 0.0 0.00000 8.341915 0.0 0.0 0.000000 \n", - "3 0.000000 0.0 0.0 0.000000 0.0 0.00000 0.000000 0.0 0.0 0.000000 \n", - "4 0.000000 0.0 0.0 7.743024 0.0 0.00000 0.000000 0.0 0.0 5.987098 \n", - "\n", - " a10 a11 a12 a13 a14 a15 a16 a17 a18 a19 \n", - "0 6.237859 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 \n", - "1 0.000000 0.0 0.0 0.065878 0.0 0.0 12.35705 0.0 0.0 0.000000 \n", - "2 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 3.110362 \n", - "3 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 \n", - "4 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (2, 0)\t3.2327506467190874\n", - " (259, 0)\t10.723428115951062\n", - " (643, 0)\t0.47763624588488707\n", - " (899, 0)\t8.857065309921685\n", - " (516, 0)\t8.792407143276648\n", - " (262, 0)\t2.1900894573805396\n", - " (390, 0)\t5.007630701229646\n", - " (646, 0)\t6.630703075588639\n", - " (392, 0)\t5.573713453854357\n", - " (776, 0)\t10.501281989515688\n", - " (904, 0)\t8.261890175181366\n", - " (1033, 0)\t-0.41106824704220446\n", - " (522, 0)\t12.619952511457068\n", - " (139, 0)\t12.753348070606792\n", - " (141, 0)\t4.936902335394504\n", - " (270, 0)\t-1.7695949916946174\n", - " (782, 0)\t4.378746787324408\n", - " (15, 0)\t8.554141682891935\n", - " (527, 0)\t5.1994882136423\n", - " (912, 0)\t2.6101212854793125\n", - " (401, 0)\t5.614628764689268\n", - " (403, 0)\t9.999468341523317\n", - " (787, 0)\t7.6170790481600985\n", - " (404, 0)\t5.105328903336744\n", - " 
(916, 0)\t1.395526391114967\n", - " :\t:\n", - " (9328, 19)\t5.938629381103238\n", - " (9457, 19)\t4.463547879031807\n", - " (9458, 19)\t-0.8034946631917106\n", - " (8051, 19)\t-1.904327616912268\n", - " (8819, 19)\t8.314944347687199\n", - " (7543, 19)\t1.4303204025224376\n", - " (8824, 19)\t5.1559713157589\n", - " (7673, 19)\t7.478681299798863\n", - " (7802, 19)\t0.502526238006068\n", - " (8186, 19)\t-3.824944685072472\n", - " (8570, 19)\t8.442324394481236\n", - " (8571, 19)\t6.204199957873215\n", - " (7420, 19)\t0.297737356585836\n", - " (9212, 19)\t3.934797966994188\n", - " (7421, 19)\t14.26161925450462\n", - " (8574, 19)\t5.826108027573207\n", - " (9214, 19)\t7.209975861932724\n", - " (9825, 19)\t11.155342644729613\n", - " (9702, 19)\t3.55144040779287\n", - " (9578, 19)\t12.638681362546228\n", - " (9712, 19)\t2.3542852760656348\n", - " (9969, 19)\t-2.645175092587592\n", - " (9973, 19)\t-2.2666402312025213\n", - " (9851, 19)\t-4.293381721466055\n", - " (9596, 19)\t6.6580506888430415\n" - ] - } - ], - "source": [ - "sparse_data = cudf_to_cupy_sparse_matrix(df)\n", - "print(sparse_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From here, we could continue our workflow with a CuPy sparse matrix.\n", - "\n", - "For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html)." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.md b/docs/cudf/source/user_guide/10min-cudf-cupy.md new file mode 100644 index 00000000000..a087a3f3a0e --- /dev/null +++ b/docs/cudf/source/user_guide/10min-cudf-cupy.md @@ -0,0 +1,217 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# 10 Minutes to cuDF and CuPy + +This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations). + +```{code-cell} ipython3 +import timeit +from packaging import version + +import cupy as cp +import cudf + +if version.parse(cp.__version__) >= version.parse("10.0.0"): + cupy_from_dlpack = cp.from_dlpack +else: + cupy_from_dlpack = cp.fromDlpack +``` + +### Converting a cuDF DataFrame to a CuPy Array + +If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it: + +1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface. + +2. We can also use `DataFrame.values`. + +3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `as_gpu_matrix` and CuPy's `asarray` functionality. 
+ +```{code-cell} ipython3 +nelem = 10000 +df = cudf.DataFrame({'a':range(nelem), + 'b':range(500, nelem + 500), + 'c':range(1000, nelem + 1000)} + ) + +%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack()) +%timeit arr_cupy = df.values +%timeit arr_cupy = df.to_cupy() +``` + +```{code-cell} ipython3 +arr_cupy = cupy_from_dlpack(df.to_dlpack()) +arr_cupy +``` + +### Converting a cuDF Series to a CuPy Array + ++++ + +There are also multiple ways to convert a cuDF Series to a CuPy array: + +1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html). +2. We can leverage the dlpack interface `to_dlpack()`. +3. We can also use `Series.values` + +```{code-cell} ipython3 +col = 'a' + +%timeit cola_cupy = cp.asarray(df[col]) +%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack()) +%timeit cola_cupy = df[col].values +``` + +```{code-cell} ipython3 +cola_cupy = cp.asarray(df[col]) +cola_cupy +``` + +From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm. + +```{code-cell} ipython3 +reshaped_arr = cola_cupy.reshape(50, 200) +reshaped_arr +``` + +```{code-cell} ipython3 +reshaped_arr.diagonal() +``` + +```{code-cell} ipython3 +cp.linalg.norm(reshaped_arr) +``` + +### Converting a CuPy Array to a cuDF DataFrame + +We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it: + +1. **Easiest;** We can directly use the `DataFrame` constructor. + +2. We can use CUDA array interface with the `DataFrame` constructor. + +3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface. + +For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand. 
+
+```{code-cell} ipython3
+%timeit reshaped_df = cudf.DataFrame(reshaped_arr)
+```
+
+```{code-cell} ipython3
+reshaped_df = cudf.DataFrame(reshaped_arr)
+reshaped_df.head()
+```
+
+We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array.
+
+```{code-cell} ipython3
+cp.isfortran(reshaped_arr)
+```
+
+In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging the CUDA array interface and dlpack, respectively.
+
+```{code-cell} ipython3
+%%timeit
+
+fortran_arr = cp.asfortranarray(reshaped_arr)
+reshaped_df = cudf.DataFrame(fortran_arr)
+```
+
+```{code-cell} ipython3
+%%timeit
+
+fortran_arr = cp.asfortranarray(reshaped_arr)
+reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())
+```
+
+```{code-cell} ipython3
+fortran_arr = cp.asfortranarray(reshaped_arr)
+reshaped_df = cudf.DataFrame(fortran_arr)
+reshaped_df.head()
+```
+
+### Converting a CuPy Array to a cuDF Series
+
+To convert an array to a Series, we can directly pass the array to the `Series` constructor.
+
+```{code-cell} ipython3
+cudf.Series(reshaped_arr.diagonal()).head()
+```
+
+### Interweaving cuDF and CuPy for Smooth PyData Workflows
+
+RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.
+ +By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame. + +```{code-cell} ipython3 +reshaped_df.head() +``` + +We can just transform it into a CuPy array and use the `axis` argument of `sum`. + +```{code-cell} ipython3 +new_arr = cupy_from_dlpack(reshaped_df.to_dlpack()) +new_arr.sum(axis=1) +``` + +With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed. + ++++ + +### Converting a cuDF DataFrame to a CuPy Sparse Matrix + +We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input. + +The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness. + +```{code-cell} ipython3 +def cudf_to_cupy_sparse_matrix(data, sparseformat='column'): + """Converts a cuDF object to a CuPy Sparse Column matrix. + """ + if sparseformat not in ('row', 'column',): + raise ValueError("Let's focus on column and row formats for now.") + + _sparse_constructor = cp.sparse.csc_matrix + if sparseformat == 'row': + _sparse_constructor = cp.sparse.csr_matrix + + return _sparse_constructor(cp.from_dlpack(data.to_dlpack())) +``` + +We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format. + +```{code-cell} ipython3 +df = cudf.DataFrame() +nelem = 10000 +nonzero = 1000 +for i in range(20): + arr = cp.random.normal(5, 5, nelem) + arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0 + df['a' + str(i)] = arr +``` + +```{code-cell} ipython3 +df.head() +``` + +```{code-cell} ipython3 +sparse_data = cudf_to_cupy_sparse_matrix(df) +print(sparse_data) +``` + +From here, we could continue our workflow with a CuPy sparse matrix. 
+ +For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html). diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb deleted file mode 100644 index ab006847fc6..00000000000 --- a/docs/cudf/source/user_guide/10min.ipynb +++ /dev/null @@ -1,6640 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "10 Minutes to cuDF and Dask-cuDF\n", - "=======================\n", - "\n", - "Modeled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly for new users.\n", - "\n", - "### What are these Libraries?\n", - "\n", - "[cuDF](https://github.com/rapidsai/cudf) is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.\n", - "\n", - "[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.\n", - "\n", - "[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed by cuDF GPU DataFrames as opposed to Pandas DataFrames. For instance, when you call dask_cudf.read_csv(...), your cluster’s GPUs do the work of parsing the CSV file(s) with underlying cudf.read_csv().\n", - "\n", - "\n", - "### When to use cuDF and Dask-cuDF\n", - "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. 
If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import cupy as cp\n", - "import pandas as pd\n", - "import cudf\n", - "import dask_cudf\n", - "\n", - "cp.random.seed(12)\n", - "\n", - "#### Portions of this were borrowed and adapted from the\n", - "#### cuDF cheatsheet, existing cuDF documentation,\n", - "#### and 10 Minutes to Pandas." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Object Creation\n", - "---------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating a `cudf.Series` and `dask_cudf.Series`." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 4\n", - "dtype: int64" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s = cudf.Series([1,2,3,None,4])\n", - "s" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 4\n", - "dtype: int64" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = dask_cudf.from_cudf(s, npartitions=2) \n", - "ds.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating a `cudf.DataFrame` and a `dask_cudf.DataFrame` by specifying values for each column." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
22172
33163
44154
55145
66136
77127
88118
99109
1010910
1111811
1212712
1313613
1414514
1515415
1616316
1717217
1818118
1919019
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1\n", - "2 2 17 2\n", - "3 3 16 3\n", - "4 4 15 4\n", - "5 5 14 5\n", - "6 6 13 6\n", - "7 7 12 7\n", - "8 8 11 8\n", - "9 9 10 9\n", - "10 10 9 10\n", - "11 11 8 11\n", - "12 12 7 12\n", - "13 13 6 13\n", - "14 14 5 14\n", - "15 15 4 15\n", - "16 16 3 16\n", - "17 17 2 17\n", - "18 18 1 18\n", - "19 19 0 19" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = cudf.DataFrame({'a': list(range(20)),\n", - " 'b': list(reversed(range(20))),\n", - " 'c': list(range(20))\n", - " })\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
22172
33163
44154
55145
66136
77127
88118
99109
1010910
1111811
1212712
1313613
1414514
1515415
1616316
1717217
1818118
1919019
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1\n", - "2 2 17 2\n", - "3 3 16 3\n", - "4 4 15 4\n", - "5 5 14 5\n", - "6 6 13 6\n", - "7 7 12 7\n", - "8 8 11 8\n", - "9 9 10 9\n", - "10 10 9 10\n", - "11 11 8 11\n", - "12 12 7 12\n", - "13 13 6 13\n", - "14 14 5 14\n", - "15 15 4 15\n", - "16 16 3 16\n", - "17 17 2 17\n", - "18 18 1 18\n", - "19 19 0 19" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf = dask_cudf.from_cudf(df, npartitions=2) \n", - "ddf.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Creating a `cudf.DataFrame` from a pandas `Dataframe` and a `dask_cudf.Dataframe` from a `cudf.Dataframe`.\n", - "\n", - "*Note that best practice for using Dask-cuDF is to read data directly into a `dask_cudf.DataFrame` with something like `read_csv` (discussed below).*" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
000.1
110.2
22<NA>
330.3
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 0 0.1\n", - "1 1 0.2\n", - "2 2 \n", - "3 3 0.3" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})\n", - "gdf = cudf.DataFrame.from_pandas(pdf)\n", - "gdf" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
000.1
110.2
22<NA>
330.3
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 0 0.1\n", - "1 1 0.2\n", - "2 2 \n", - "3 3 0.3" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", - "dask_gdf.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Viewing Data\n", - "-------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Viewing the top rows of a GPU dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.head(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sorting by values." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1919019
1818118
1717217
1616316
1515415
1414514
1313613
1212712
1111811
1010910
99109
88118
77127
66136
55145
44154
33163
22172
11181
00190
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "19 19 0 19\n", - "18 18 1 18\n", - "17 17 2 17\n", - "16 16 3 16\n", - "15 15 4 15\n", - "14 14 5 14\n", - "13 13 6 13\n", - "12 12 7 12\n", - "11 11 8 11\n", - "10 10 9 10\n", - "9 9 10 9\n", - "8 8 11 8\n", - "7 7 12 7\n", - "6 6 13 6\n", - "5 5 14 5\n", - "4 4 15 4\n", - "3 3 16 3\n", - "2 2 17 2\n", - "1 1 18 1\n", - "0 0 19 0" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.sort_values(by='b')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1919019
1818118
1717217
1616316
1515415
1414514
1313613
1212712
1111811
1010910
99109
88118
77127
66136
55145
44154
33163
22172
11181
00190
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "19 19 0 19\n", - "18 18 1 18\n", - "17 17 2 17\n", - "16 16 3 16\n", - "15 15 4 15\n", - "14 14 5 14\n", - "13 13 6 13\n", - "12 12 7 12\n", - "11 11 8 11\n", - "10 10 9 10\n", - "9 9 10 9\n", - "8 8 11 8\n", - "7 7 12 7\n", - "6 6 13 6\n", - "5 5 14 5\n", - "4 4 15 4\n", - "3 3 16 3\n", - "2 2 17 2\n", - "1 1 18 1\n", - "0 0 19 0" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.sort_values(by='b').compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selection\n", - "------------\n", - "\n", - "## Getting" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selecting a single column, which initially yields a `cudf.Series` or `dask_cudf.Series`. Calling `compute` results in a `cudf.Series` (equivalent to `df.a`)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5\n", - "6 6\n", - "7 7\n", - "8 8\n", - "9 9\n", - "10 10\n", - "11 11\n", - "12 12\n", - "13 13\n", - "14 14\n", - "15 15\n", - "16 16\n", - "17 17\n", - "18 18\n", - "19 19\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['a']" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4\n", - "5 5\n", - "6 6\n", - "7 7\n", - "8 8\n", - "9 9\n", - "10 10\n", - "11 11\n", - "12 12\n", - "13 13\n", - "14 14\n", - "15 15\n", - "16 16\n", - "17 17\n", - "18 18\n", - "19 19\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf['a'].compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "## Selection by Label" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selecting rows from index 2 to index 5 from columns 'a' and 'b'." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
2217
3316
4415
5514
\n", - "
" - ], - "text/plain": [ - " a b\n", - "2 2 17\n", - "3 3 16\n", - "4 4 15\n", - "5 5 14" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[2:5, ['a', 'b']]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
2217
3316
4415
5514
\n", - "
" - ], - "text/plain": [ - " a b\n", - "2 2 17\n", - "3 3 16\n", - "4 4 15\n", - "5 5 14" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.loc[2:5, ['a', 'b']].compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Selection by Position" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selecting via integers and integer slices, like numpy/pandas. Note that this functionality is not available for Dask-cuDF DataFrames." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "a 0\n", - "b 19\n", - "c 0\n", - "Name: 0, dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0019
1118
2217
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 0 19\n", - "1 1 18\n", - "2 2 17" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[0:3, 0:2]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also select elements of a `DataFrame` or `Series` with direct index access." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
33163
44154
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "3 3 16 3\n", - "4 4 15 4" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[3:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3 \n", - "4 4\n", - "dtype: int64" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s[3:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Boolean Indexing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selecting rows in a `DataFrame` or `Series` by direct Boolean indexing." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
22172
33163
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1\n", - "2 2 17 2\n", - "3 3 16 3" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.b > 15]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
11181
22172
33163
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1\n", - "2 2 17 2\n", - "3 3 16 3" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf[ddf.b > 15].compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selecting values from a `DataFrame` where a Boolean condition is met, via the `query` API." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1616316
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "16 16 3 16" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.query(\"b == 3\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1616316
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "16 16 3 16" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.query(\"b == 3\").compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also pass local variables to Dask-cuDF queries, via the `local_dict` keyword. With standard cuDF, you may either use the `local_dict` keyword or directly pass the variable via the `@` keyword. Supported logical operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1616316
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "16 16 3 16" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf_comparator = 3\n", - "df.query(\"b == @cudf_comparator\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1616316
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "16 16 3 16" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dask_cudf_comparator = 3\n", - "ddf.query(\"b == @val\", local_dict={'val':dask_cudf_comparator}).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the `isin` method for filtering." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00190
55145
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 19 0\n", - "5 5 14 5" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.a.isin([0, 5])]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MultiIndex" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "cuDF supports hierarchical indexing of DataFrames using MultiIndex. Grouping hierarchically (see `Grouping` below) automatically produces a DataFrame with a MultiIndex." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MultiIndex([('a', 1),\n", - " ('a', 2),\n", - " ('b', 3),\n", - " ('b', 4)],\n", - " )" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]]\n", - "tuples = list(zip(*arrays))\n", - "idx = cudf.MultiIndex.from_tuples(tuples)\n", - "idx" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This index can back either axis of a DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
firstsecond
a10.0826540.967955
20.3994170.441425
b30.7842970.793582
40.0703030.271711
\n", - "
" - ], - "text/plain": [ - " first second\n", - "a 1 0.082654 0.967955\n", - " 2 0.399417 0.441425\n", - "b 3 0.784297 0.793582\n", - " 4 0.070303 0.271711" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)})\n", - "gdf1.index = idx\n", - "gdf1" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
1234
first0.3433820.0037000.200430.581614
second0.9078120.1015120.241790.224180
\n", - "
" - ], - "text/plain": [ - " a b \n", - " 1 2 3 4\n", - "first 0.343382 0.003700 0.20043 0.581614\n", - "second 0.907812 0.101512 0.24179 0.224180" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T\n", - "gdf2.columns = idx\n", - "gdf2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Accessing values of a DataFrame with a MultiIndex. Note that slicing is not yet supported." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "first 0.784297\n", - "second 0.793582\n", - "Name: ('b', 3), dtype: float64" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf1.loc[('b', 3)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Missing Data\n", - "------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Missing data can be replaced by using the `fillna` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 999\n", - "4 4\n", - "dtype: int64" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s.fillna(999)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 999\n", - "4 4\n", - "dtype: int64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds.fillna(999).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Operations\n", - "------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculating descriptive statistics for a `Series`." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.5, 1.666666666666666)" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s.mean(), s.var()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.5, 1.6666666666666667)" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds.mean().compute(), ds.var().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Applymap" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. 
For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 10\n", - "1 11\n", - "2 12\n", - "3 13\n", - "4 14\n", - "5 15\n", - "6 16\n", - "7 17\n", - "8 18\n", - "9 19\n", - "10 20\n", - "11 21\n", - "12 22\n", - "13 23\n", - "14 24\n", - "15 25\n", - "16 26\n", - "17 27\n", - "18 28\n", - "19 29\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def add_ten(num):\n", - " return num + 10\n", - "\n", - "df['a'].applymap(add_ten)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 10\n", - "1 11\n", - "2 12\n", - "3 13\n", - "4 14\n", - "5 15\n", - "6 16\n", - "7 17\n", - "8 18\n", - "9 19\n", - "10 20\n", - "11 21\n", - "12 22\n", - "13 23\n", - "14 24\n", - "15 25\n", - "16 26\n", - "17 27\n", - "18 28\n", - "19 29\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf['a'].map_partitions(add_ten).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Histogramming" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Counting the number of occurrences of each unique value of variable." 
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "15 1\n", - "6 1\n", - "1 1\n", - "14 1\n", - "2 1\n", - "5 1\n", - "11 1\n", - "7 1\n", - "17 1\n", - "13 1\n", - "8 1\n", - "16 1\n", - "0 1\n", - "10 1\n", - "4 1\n", - "9 1\n", - "19 1\n", - "18 1\n", - "3 1\n", - "12 1\n", - "Name: a, dtype: int32" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.a.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "15 1\n", - "6 1\n", - "1 1\n", - "14 1\n", - "2 1\n", - "5 1\n", - "11 1\n", - "7 1\n", - "17 1\n", - "13 1\n", - "8 1\n", - "16 1\n", - "0 1\n", - "10 1\n", - "4 1\n", - "9 1\n", - "19 1\n", - "18 1\n", - "3 1\n", - "12 1\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.a.value_counts().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## String Methods" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the cuDF API documentation for more information." 
- ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 a\n", - "1 b\n", - "2 c\n", - "3 aaba\n", - "4 baca\n", - "5 \n", - "6 caba\n", - "7 dog\n", - "8 cat\n", - "dtype: object" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'])\n", - "s.str.lower()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 a\n", - "1 b\n", - "2 c\n", - "3 aaba\n", - "4 baca\n", - "5 \n", - "6 caba\n", - "7 dog\n", - "8 cat\n", - "dtype: object" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = dask_cudf.from_cudf(s, npartitions=2)\n", - "ds.str.lower().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Concat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Concatenating `Series` and `DataFrames` row-wise." 
- ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "dtype: int64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s = cudf.Series([1, 2, 3, None, 5])\n", - "cudf.concat([s, s])" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "dtype: int64" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds2 = dask_cudf.from_cudf(s, npartitions=2)\n", - "dask_cudf.concat([ds2, ds2]).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Join" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Performing SQL style merges. Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyvals_avals_b
0a10.0100.0
1c12.0101.0
2e14.0102.0
3b11.0<NA>
4d13.0<NA>
\n", - "
" - ], - "text/plain": [ - " key vals_a vals_b\n", - "0 a 10.0 100.0\n", - "1 c 12.0 101.0\n", - "2 e 14.0 102.0\n", - "3 b 11.0 \n", - "4 d 13.0 " - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_a = cudf.DataFrame()\n", - "df_a['key'] = ['a', 'b', 'c', 'd', 'e']\n", - "df_a['vals_a'] = [float(i + 10) for i in range(5)]\n", - "\n", - "df_b = cudf.DataFrame()\n", - "df_b['key'] = ['a', 'c', 'e']\n", - "df_b['vals_b'] = [float(i+100) for i in range(3)]\n", - "\n", - "merged = df_a.merge(df_b, on=['key'], how='left')\n", - "merged" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyvals_avals_b
0a10.0100.0
1c12.0101.0
2b11.0<NA>
0e14.0102.0
1d13.0<NA>
\n", - "
" - ], - "text/plain": [ - " key vals_a vals_b\n", - "0 a 10.0 100.0\n", - "1 c 12.0 101.0\n", - "2 b 11.0 \n", - "0 e 14.0 102.0\n", - "1 d 13.0 " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf_a = dask_cudf.from_cudf(df_a, npartitions=2)\n", - "ddf_b = dask_cudf.from_cudf(df_b, npartitions=2)\n", - "\n", - "merged = ddf_a.merge(ddf_b, on=['key'], how='left').compute()\n", - "merged" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Append" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Appending values from another `Series` or array-like object." - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/core/indexed_frame.py:2271: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "dtype: int64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s.append(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 \n", - "4 5\n", - "dtype: int64" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds2.append(ds2).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Grouping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like pandas, cuDF and Dask-cuDF support the Split-Apply-Combine groupby paradigm." 
- ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "df['agg_col1'] = [1 if x % 2 == 0 else 0 for x in range(len(df))]\n", - "df['agg_col2'] = [1 if x % 3 == 0 else 0 for x in range(len(df))]\n", - "\n", - "ddf = dask_cudf.from_cudf(df, npartitions=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Grouping and then applying the `sum` function to the grouped data." - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col2
agg_col1
190100904
0100901003
\n", - "
" - ], - "text/plain": [ - " a b c agg_col2\n", - "agg_col1 \n", - "1 90 100 90 4\n", - "0 100 90 100 3" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby('agg_col1').sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col2
agg_col1
190100904
0100901003
\n", - "
" - ], - "text/plain": [ - " a b c agg_col2\n", - "agg_col1 \n", - "1 90 100 90 4\n", - "0 100 90 100 3" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.groupby('agg_col1').sum().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Grouping hierarchically then applying the `sum` function to grouped data." - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
agg_col1agg_col2
10546054
00736073
11364036
01273027
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "agg_col1 agg_col2 \n", - "1 0 54 60 54\n", - "0 0 73 60 73\n", - "1 1 36 40 36\n", - "0 1 27 30 27" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(['agg_col1', 'agg_col2']).sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
agg_col1agg_col2
11364036
00736073
10546054
01273027
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "agg_col1 agg_col2 \n", - "1 1 36 40 36\n", - "0 0 73 60 73\n", - "1 0 54 60 54\n", - "0 1 27 30 27" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.groupby(['agg_col1', 'agg_col2']).sum().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Grouping and applying statistical functions to specific columns, using `agg`." - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
agg_col1
11810.090
0199.0100
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "agg_col1 \n", - "1 18 10.0 90\n", - "0 19 9.0 100" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'})" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
agg_col1
11810.090
0199.0100
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "agg_col1 \n", - "1 18 10.0 90\n", - "0 19 9.0 100" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Transpose" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
014
125
236
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 4\n", - "1 2 5\n", - "2 3 6" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n", - "sample" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
a123
b456
\n", - "
" - ], - "text/plain": [ - " 0 1 2\n", - "a 1 2 3\n", - "b 4 5 6" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample.transpose()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Time Series\n", - "------------\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`DataFrames` supports `datetime` typed columns, which allow users to interact with and filter data based on specific timestamps." - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.986051
12018-11-210.232034
22018-11-220.397617
32018-11-230.103839
\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.986051\n", - "1 2018-11-21 0.232034\n", - "2 2018-11-22 0.397617\n", - "3 2018-11-23 0.103839" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import datetime as dt\n", - "\n", - "date_df = cudf.DataFrame()\n", - "date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D')\n", - "date_df['value'] = cp.random.sample(len(date_df))\n", - "\n", - "search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d')\n", - "date_df.query('date <= @search_date')" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.986051
12018-11-210.232034
22018-11-220.397617
32018-11-230.103839
\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.986051\n", - "1 2018-11-21 0.232034\n", - "2 2018-11-22 0.397617\n", - "3 2018-11-23 0.103839" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_ddf = dask_cudf.from_cudf(date_df, npartitions=2)\n", - "date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Categoricals\n", - "------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`DataFrames` support categorical columns." - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idgrade
01a
12b
23b
34a
45a
56e
\n", - "
" - ], - "text/plain": [ - " id grade\n", - "0 1 a\n", - "1 2 b\n", - "2 3 b\n", - "3 4 a\n", - "4 5 a\n", - "5 6 e" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf = cudf.DataFrame({\"id\": [1, 2, 3, 4, 5, 6], \"grade\":['a', 'b', 'b', 'a', 'a', 'e']})\n", - "gdf['grade'] = gdf['grade'].astype('category')\n", - "gdf" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idgrade
01a
12b
23b
34a
45a
56e
\n", - "
" - ], - "text/plain": [ - " id grade\n", - "0 1 a\n", - "1 2 b\n", - "2 3 b\n", - "3 4 a\n", - "4 5 a\n", - "5 6 e" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dgdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", - "dgdf.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Accessing the categories of a column. Note that this is currently not supported in Dask-cuDF." - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "StringIndex(['a' 'b' 'e'], dtype='object')" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf.grade.cat.categories" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Accessing the underlying code values of each categorical observation." - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 1\n", - "2 1\n", - "3 0\n", - "4 0\n", - "5 2\n", - "dtype: uint8" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gdf.grade.cat.codes" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 1\n", - "2 1\n", - "3 0\n", - "4 0\n", - "5 2\n", - "dtype: uint8" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dgdf.grade.cat.codes.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting Data Representation\n", - "--------------------------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pandas" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting a cuDF and Dask-cuDF `DataFrame` to a pandas 
`DataFrame`." - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head().to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.compute().head().to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Numpy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting a cuDF or Dask-cuDF `DataFrame` to a numpy `ndarray`." - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 0, 19, 0, 1, 1],\n", - " [ 1, 18, 1, 0, 0],\n", - " [ 2, 17, 2, 1, 0],\n", - " [ 3, 16, 3, 0, 1],\n", - " [ 4, 15, 4, 1, 0],\n", - " [ 5, 14, 5, 0, 0],\n", - " [ 6, 13, 6, 1, 1],\n", - " [ 7, 12, 7, 0, 0],\n", - " [ 8, 11, 8, 1, 0],\n", - " [ 9, 10, 9, 0, 1],\n", - " [10, 9, 10, 1, 0],\n", - " [11, 8, 11, 0, 0],\n", - " [12, 7, 12, 1, 1],\n", - " [13, 6, 13, 0, 0],\n", - " [14, 5, 14, 1, 0],\n", - " [15, 4, 15, 0, 1],\n", - " [16, 3, 16, 1, 0],\n", - " [17, 2, 17, 0, 0],\n", - " [18, 1, 18, 1, 1],\n", - " [19, 0, 19, 0, 0]])" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.to_numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 0, 19, 0, 1, 1],\n", - " [ 1, 18, 1, 0, 0],\n", - " [ 2, 17, 2, 1, 0],\n", - " [ 3, 16, 3, 0, 1],\n", - " [ 4, 15, 4, 1, 0],\n", - " [ 5, 14, 5, 0, 0],\n", - " [ 6, 13, 6, 1, 1],\n", - " [ 7, 12, 7, 0, 0],\n", - " [ 8, 11, 8, 1, 0],\n", - " [ 9, 10, 9, 0, 1],\n", - " [10, 9, 10, 1, 0],\n", - " [11, 8, 11, 0, 0],\n", - " [12, 7, 12, 1, 1],\n", - " [13, 6, 13, 0, 0],\n", - " [14, 5, 14, 1, 0],\n", - " [15, 4, 15, 0, 1],\n", - " [16, 3, 16, 1, 0],\n", - " [17, 2, 17, 0, 0],\n", - " [18, 1, 18, 1, 1],\n", - " [19, 0, 19, 0, 0]])" - 
] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.compute().to_numpy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting a cuDF or Dask-cuDF `Series` to a numpy `ndarray`." - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", - " 17, 18, 19])" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['a'].to_numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", - " 17, 18, 19])" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf['a'].compute().to_numpy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Arrow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting a cuDF or Dask-cuDF `DataFrame` to a PyArrow `Table`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: int64\n", - "c: int64\n", - "agg_col1: int64\n", - "agg_col2: int64\n", - "----\n", - "a: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", - "b: [[19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]]\n", - "c: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", - "agg_col1: [[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]]\n", - "agg_col2: [[1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0]]" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.to_arrow()" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: int64\n", - "c: int64\n", - "agg_col1: int64\n", - "agg_col2: int64\n", - "----\n", - "a: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", - "b: [[19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]]\n", - "c: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", - "agg_col1: [[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]]\n", - "agg_col2: [[1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0]]" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.compute().to_arrow()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Getting Data In/Out\n", - "------------------------\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## CSV" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Writing to a CSV file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('example_output'):\n", - " os.mkdir('example_output')\n", - " \n", - "df.to_csv('example_output/foo.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "ddf.compute().to_csv('example_output/foo_dask.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reading from a csv file." - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0\n", - "5 5 14 5 0 0\n", - "6 6 13 6 1 1\n", - "7 7 12 7 0 0\n", - "8 8 11 8 1 0\n", - "9 9 10 9 0 1\n", - "10 10 9 10 1 0\n", - "11 11 8 11 0 0\n", - "12 12 7 12 1 1\n", - "13 13 6 13 0 0\n", - "14 14 5 14 1 0\n", - "15 15 4 15 0 1\n", - "16 16 3 16 1 0\n", - "17 17 2 17 0 0\n", - "18 18 1 18 1 1\n", - "19 19 0 19 0 0" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = cudf.read_csv('example_output/foo.csv')\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0\n", - "5 5 14 5 0 0\n", - "6 6 13 6 1 1\n", - "7 7 12 7 0 0\n", - "8 8 11 8 1 0\n", - "9 9 10 9 0 1\n", - "10 10 9 10 1 0\n", - "11 11 8 11 0 0\n", - "12 12 7 12 1 1\n", - "13 13 6 13 0 0\n", - "14 14 5 14 1 0\n", - "15 15 4 15 0 1\n", - "16 16 3 16 1 0\n", - "17 17 2 17 0 0\n", - "18 18 1 18 1 1\n", - "19 19 0 19 0 0" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf = dask_cudf.read_csv('example_output/foo_dask.csv')\n", - "ddf.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reading all CSV files in a directory into a single `dask_cudf.DataFrame`, using the star wildcard." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0\n", - "5 5 14 5 0 0\n", - "6 6 13 6 1 1\n", - "7 7 12 7 0 0\n", - "8 8 11 8 1 0\n", - "9 9 10 9 0 1\n", - "10 10 9 10 1 0\n", - "11 11 8 11 0 0\n", - "12 12 7 12 1 1\n", - "13 13 6 13 0 0\n", - "14 14 5 14 1 0\n", - "15 15 4 15 0 1\n", - "16 16 3 16 1 0\n", - "17 17 2 17 0 0\n", - "18 18 1 18 1 1\n", - "19 19 0 19 0 0\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0\n", - "5 5 14 5 0 0\n", - "6 6 13 6 1 1\n", - "7 7 12 7 0 0\n", - "8 8 11 8 1 0\n", - "9 9 10 9 0 1\n", - "10 10 9 10 1 0\n", - "11 11 8 11 0 0\n", - "12 12 7 12 1 1\n", - "13 13 6 13 0 0\n", - "14 14 5 14 1 0\n", - "15 15 4 15 0 1\n", - "16 16 3 16 1 0\n", - "17 17 2 17 0 0\n", - "18 18 1 18 1 1\n", - "19 19 0 19 0 0" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf = dask_cudf.read_csv('example_output/*.csv')\n", - "ddf.compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Writing to parquet files, using the CPU via PyArrow." - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_parquet('example_output/temp_parquet')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reading parquet files with a GPU-accelerated parquet reader." - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", - "
" - ], - "text/plain": [ - " a b c agg_col1 agg_col2\n", - "0 0 19 0 1 1\n", - "1 1 18 1 0 0\n", - "2 2 17 2 1 0\n", - "3 3 16 3 0 1\n", - "4 4 15 4 1 0\n", - "5 5 14 5 0 0\n", - "6 6 13 6 1 1\n", - "7 7 12 7 0 0\n", - "8 8 11 8 1 0\n", - "9 9 10 9 0 1\n", - "10 10 9 10 1 0\n", - "11 11 8 11 0 0\n", - "12 12 7 12 1 1\n", - "13 13 6 13 0 0\n", - "14 14 5 14 1 0\n", - "15 15 4 15 0 1\n", - "16 16 3 16 1 0\n", - "17 17 2 17 0 0\n", - "18 18 1 18 1 1\n", - "19 19 0 19 0 0" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = cudf.read_parquet('example_output/temp_parquet')\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Writing to parquet files from a `dask_cudf.DataFrame` using PyArrow under the hood." - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(None,)" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf.to_parquet('example_files') " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ORC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reading ORC files." - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
boolean1byte1short1int1long1float1double1bytes1string1middlelistmap
0False110246553692233720368547758071.0-15.0\u0000\u0001\u0002\u0003\u0004hi{'list': [{'int1': 1, 'string1': 'bye'}, {'int...[{'int1': 3, 'string1': 'good'}, {'int1': 4, '...[]
1True10020486553692233720368547758072.0-5.0bye{'list': [{'int1': 1, 'string1': 'bye'}, {'int...[{'int1': 100000000, 'string1': 'cat'}, {'int1...[{'key': 'chani', 'value': {'int1': 5, 'string...
\n", - "
" - ], - "text/plain": [ - " boolean1 byte1 short1 int1 long1 float1 double1 \\\n", - "0 False 1 1024 65536 9223372036854775807 1.0 -15.0 \n", - "1 True 100 2048 65536 9223372036854775807 2.0 -5.0 \n", - "\n", - " bytes1 string1 middle \\\n", - "0 \u0000\u0001\u0002\u0003\u0004 hi {'list': [{'int1': 1, 'string1': 'bye'}, {'int... \n", - "1 bye {'list': [{'int1': 1, 'string1': 'bye'}, {'int... \n", - "\n", - " list \\\n", - "0 [{'int1': 3, 'string1': 'good'}, {'int1': 4, '... \n", - "1 [{'int1': 100000000, 'string1': 'cat'}, {'int1... \n", - "\n", - " map \n", - "0 [] \n", - "1 [{'key': 'chani', 'value': {'int1': 5, 'string... " - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2 = cudf.read_orc('/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc')\n", - "df2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dask Performance Tips\n", - "--------------------------------\n", - "\n", - "Like Apache Spark, Dask operations are [lazy](https://en.wikipedia.org/wiki/Lazy_evaluation). Instead of being executed at that moment, most operations are added to a task graph and the actual evaluation is delayed until the result is needed.\n", - "\n", - "Sometimes, though, we want to force the execution of operations. Calling `persist` on a Dask collection fully computes it (or actively computes it in the background), persisting the result into memory. When we're using distributed systems, we may want to wait until `persist` is finished before beginning any downstream operations. We can enforce this contract by using `wait`. Wrapping an operation with `wait` will ensure it doesn't begin executing until all necessary upstream operations have finished.\n", - "\n", - "The snippets below provide basic examples, using `LocalCUDACluster` to create one dask-worker per GPU on the local machine. 
For more detailed information about `persist` and `wait`, please see the Dask documentation for [persist](https://docs.dask.org/en/latest/api.html#dask.persist) and [wait](https://docs.dask.org/en/latest/futures.html#distributed.wait). Wait relies on the concept of Futures, which is beyond the scope of this tutorial. For more information on Futures, see the Dask [Futures](https://docs.dask.org/en/latest/futures.html) documentation. For more information about multi-GPU clusters, please see the [dask-cuda](https://github.com/rapidsai/dask-cuda) library (documentation is in progress)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we set up a GPU cluster. With our `client` set up, Dask-cuDF computation will be distributed across the GPUs in the cluster." - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-03-29 12:21:32,328 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", - "2022-03-29 12:21:32,394 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "

Client

\n", - "

Client-4be800f5-af7c-11ec-8df8-c8d9d2247354

\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "
Connection method: Cluster objectCluster type: dask_cuda.LocalCUDACluster
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - "
\n", - "\n", - " \n", - "
\n", - "

Cluster Info

\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

LocalCUDACluster

\n", - "

137d0882

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - " \n", - " Workers: 2\n", - "
\n", - " Total threads: 2\n", - " \n", - " Total memory: 45.79 GiB\n", - "
Status: runningUsing processes: True
\n", - "\n", - "
\n", - " \n", - "

Scheduler Info

\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

Scheduler

\n", - "

Scheduler-08f95e9e-2c10-4d66-a103-955ab4218e91

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Comm: tcp://127.0.0.1:35157\n", - " \n", - " Workers: 2\n", - "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - " \n", - " Total threads: 2\n", - "
\n", - " Started: Just now\n", - " \n", - " Total memory: 45.79 GiB\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "

Workers

\n", - "
\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 0

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:41411\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://127.0.0.1:40997/status\n", - " \n", - " Memory: 22.89 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:42959\n", - "
\n", - " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ruvvgno2\n", - "
\n", - " GPU: Quadro GV100\n", - " \n", - " GPU memory: 31.75 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 1

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:41341\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://127.0.0.1:39963/status\n", - " \n", - " Memory: 22.89 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:33675\n", - "
\n", - " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-phx0wjv_\n", - "
\n", - " GPU: Quadro GV100\n", - " \n", - " GPU memory: 31.74 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import time\n", - "\n", - "from dask.distributed import Client, wait\n", - "from dask_cuda import LocalCUDACluster\n", - "\n", - "cluster = LocalCUDACluster()\n", - "client = Client(cluster)\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Persisting Data\n", - "Next, we create our Dask-cuDF DataFrame and apply a transformation, storing the result as a new column." - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Dask DataFrame Structure:
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
npartitions=5
0int64int64int64
2000000.........
............
8000000.........
9999999.........
\n", - "
\n", - "
Dask Name: assign, 20 tasks
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nrows = 10000000\n", - "\n", - "df2 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)})\n", - "ddf2 = dask_cudf.from_cudf(df2, npartitions=5)\n", - "ddf2['c'] = ddf2['a'] + 5\n", - "ddf2" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tue Mar 29 12:21:33 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|===============================+======================+======================|\n", - "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", - "| 36% 49C P2 50W / 250W | 1113MiB / 32508MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", - "| 40% 54C P2 50W / 250W | 306MiB / 32498MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "+-----------------------------------------------------------------------------+\n" - ] - } - ], - "source": [ - "!nvidia-smi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because 
Dask is lazy, the computation has not yet occurred. We can see that there are twenty tasks in the task graph and we've used about 800 MB of memory. We can force computation by using `persist`. By forcing execution, the result is now explicitly in memory and our task graph only contains one task per partition (the baseline)." - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Dask DataFrame Structure:
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
npartitions=5
0int64int64int64
2000000.........
............
8000000.........
9999999.........
\n", - "
\n", - "
Dask Name: assign, 5 tasks
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ddf2 = ddf2.persist()\n", - "ddf2" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tue Mar 29 12:21:34 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|===============================+======================+======================|\n", - "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", - "| 36% 49C P2 50W / 250W | 1113MiB / 32508MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", - "| 40% 54C P2 50W / 250W | 306MiB / 32498MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "+-----------------------------------------------------------------------------+\n" - ] - } - ], - "source": [ - "!nvidia-smi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because we forced computation, we now have a larger object in distributed GPU memory." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Wait\n", - "Depending on our workflow or distributed computing setup, we may want to `wait` until all upstream tasks have finished before proceeding with a specific function. This section shows an example of this behavior, adapted from the Dask documentation.\n", - "\n", - "First, we create a new Dask DataFrame and define a function that we'll map to every partition in the dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "\n", - "nrows = 10000000\n", - "\n", - "df1 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)})\n", - "ddf1 = dask_cudf.from_cudf(df1, npartitions=100)\n", - "\n", - "def func(df):\n", - " time.sleep(random.randint(1, 60))\n", - " return (df + 5) * 3 - 11" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function will do a basic transformation of every column in the dataframe, but the time spent in the function will vary due to the `time.sleep` statement randomly adding 1-60 seconds of time. We'll run this on every partition of our dataframe using `map_partitions`, which adds the task to our task-graph, and store the result. We can then call `persist` to force execution." - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "results_ddf = ddf2.map_partitions(func)\n", - "results_ddf = results_ddf.persist()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, some partitions will be done **much** sooner than others. If we had downstream processes that should wait for all partitions to be completed, we can enforce that behavior using `wait`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wait(results_ddf)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## With `wait`, we can safely proceed on in our workflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/cudf/source/user_guide/10min.md b/docs/cudf/source/user_guide/10min.md new file mode 100644 index 00000000000..d156be3d13c --- /dev/null +++ b/docs/cudf/source/user_guide/10min.md @@ -0,0 +1,733 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +10 Minutes to cuDF and Dask-cuDF +======================= + +Modeled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly for new users. + +### What are these Libraries? 
+ +[cuDF](https://github.com/rapidsai/cudf) is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API. + +[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions. + +[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed by cuDF GPU DataFrames as opposed to Pandas DataFrames. For instance, when you call dask_cudf.read_csv(...), your cluster’s GPUs do the work of parsing the CSV file(s) with underlying cudf.read_csv(). + + +### When to use cuDF and Dask-cuDF + +If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF. + +```{code-cell} ipython3 +import os + +import cupy as cp +import pandas as pd +import cudf +import dask_cudf + +cp.random.seed(12) + +#### Portions of this were borrowed and adapted from the +#### cuDF cheatsheet, existing cuDF documentation, +#### and 10 Minutes to Pandas. +``` + +Object Creation +--------------- + ++++ + +Creating a `cudf.Series` and `dask_cudf.Series`. + +```{code-cell} ipython3 +s = cudf.Series([1,2,3,None,4]) +s +``` + +```{code-cell} ipython3 +ds = dask_cudf.from_cudf(s, npartitions=2) +ds.compute() +``` + +Creating a `cudf.DataFrame` and a `dask_cudf.DataFrame` by specifying values for each column. 
+ +```{code-cell} ipython3 +df = cudf.DataFrame({'a': list(range(20)), + 'b': list(reversed(range(20))), + 'c': list(range(20)) + }) +df +``` + +```{code-cell} ipython3 +ddf = dask_cudf.from_cudf(df, npartitions=2) +ddf.compute() +``` + +Creating a `cudf.DataFrame` from a pandas `Dataframe` and a `dask_cudf.Dataframe` from a `cudf.Dataframe`. + +*Note that best practice for using Dask-cuDF is to read data directly into a `dask_cudf.DataFrame` with something like `read_csv` (discussed below).* + +```{code-cell} ipython3 +pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) +gdf = cudf.DataFrame.from_pandas(pdf) +gdf +``` + +```{code-cell} ipython3 +dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2) +dask_gdf.compute() +``` + +Viewing Data +------------- + ++++ + +Viewing the top rows of a GPU dataframe. + +```{code-cell} ipython3 +df.head(2) +``` + +```{code-cell} ipython3 +ddf.head(2) +``` + +Sorting by values. + +```{code-cell} ipython3 +df.sort_values(by='b') +``` + +```{code-cell} ipython3 +ddf.sort_values(by='b').compute() +``` + +Selection +------------ + +## Getting + ++++ + +Selecting a single column, which initially yields a `cudf.Series` or `dask_cudf.Series`. Calling `compute` results in a `cudf.Series` (equivalent to `df.a`). + +```{code-cell} ipython3 +df['a'] +``` + +```{code-cell} ipython3 +ddf['a'].compute() +``` + +## Selection by Label + ++++ + +Selecting rows from index 2 to index 5 from columns 'a' and 'b'. + +```{code-cell} ipython3 +df.loc[2:5, ['a', 'b']] +``` + +```{code-cell} ipython3 +ddf.loc[2:5, ['a', 'b']].compute() +``` + +## Selection by Position + ++++ + +Selecting via integers and integer slices, like numpy/pandas. Note that this functionality is not available for Dask-cuDF DataFrames. + +```{code-cell} ipython3 +df.iloc[0] +``` + +```{code-cell} ipython3 +df.iloc[0:3, 0:2] +``` + +You can also select elements of a `DataFrame` or `Series` with direct index access. 
+ +```{code-cell} ipython3 +df[3:5] +``` + +```{code-cell} ipython3 +s[3:5] +``` + +## Boolean Indexing + ++++ + +Selecting rows in a `DataFrame` or `Series` by direct Boolean indexing. + +```{code-cell} ipython3 +df[df.b > 15] +``` + +```{code-cell} ipython3 +ddf[ddf.b > 15].compute() +``` + +Selecting values from a `DataFrame` where a Boolean condition is met, via the `query` API. + +```{code-cell} ipython3 +df.query("b == 3") +``` + +```{code-cell} ipython3 +ddf.query("b == 3").compute() +``` + +You can also pass local variables to Dask-cuDF queries, via the `local_dict` keyword. With standard cuDF, you may either use the `local_dict` keyword or directly pass the variable via the `@` keyword. Supported logical operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`. + +```{code-cell} ipython3 +cudf_comparator = 3 +df.query("b == @cudf_comparator") +``` + +```{code-cell} ipython3 +dask_cudf_comparator = 3 +ddf.query("b == @val", local_dict={'val':dask_cudf_comparator}).compute() +``` + +Using the `isin` method for filtering. + +```{code-cell} ipython3 +df[df.a.isin([0, 5])] +``` + +## MultiIndex + ++++ + +cuDF supports hierarchical indexing of DataFrames using MultiIndex. Grouping hierarchically (see `Grouping` below) automatically produces a DataFrame with a MultiIndex. + +```{code-cell} ipython3 +arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]] +tuples = list(zip(*arrays)) +idx = cudf.MultiIndex.from_tuples(tuples) +idx +``` + +This index can back either axis of a DataFrame. + +```{code-cell} ipython3 +gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}) +gdf1.index = idx +gdf1 +``` + +```{code-cell} ipython3 +gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T +gdf2.columns = idx +gdf2 +``` + +Accessing values of a DataFrame with a MultiIndex. Note that slicing is not yet supported. 
+ +```{code-cell} ipython3 +gdf1.loc[('b', 3)] +``` + +Missing Data +------------ + ++++ + +Missing data can be replaced by using the `fillna` method. + +```{code-cell} ipython3 +s.fillna(999) +``` + +```{code-cell} ipython3 +ds.fillna(999).compute() +``` + +Operations +------------ + ++++ + +## Stats + ++++ + +Calculating descriptive statistics for a `Series`. + +```{code-cell} ipython3 +s.mean(), s.var() +``` + +```{code-cell} ipython3 +ds.mean().compute(), ds.var().compute() +``` + +## Applymap + ++++ + +Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe. + +```{code-cell} ipython3 +def add_ten(num): + return num + 10 + +df['a'].applymap(add_ten) +``` + +```{code-cell} ipython3 +ddf['a'].map_partitions(add_ten).compute() +``` + +## Histogramming + ++++ + +Counting the number of occurrences of each unique value of variable. + +```{code-cell} ipython3 +df.a.value_counts() +``` + +```{code-cell} ipython3 +ddf.a.value_counts().compute() +``` + +## String Methods + ++++ + +Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the cuDF API documentation for more information. + +```{code-cell} ipython3 +s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat']) +s.str.lower() +``` + +```{code-cell} ipython3 +ds = dask_cudf.from_cudf(s, npartitions=2) +ds.str.lower().compute() +``` + +## Concat + ++++ + +Concatenating `Series` and `DataFrames` row-wise. 
+ +```{code-cell} ipython3 +s = cudf.Series([1, 2, 3, None, 5]) +cudf.concat([s, s]) +``` + +```{code-cell} ipython3 +ds2 = dask_cudf.from_cudf(s, npartitions=2) +dask_cudf.concat([ds2, ds2]).compute() +``` + +## Join + ++++ + +Performing SQL style merges. Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index. + +```{code-cell} ipython3 +df_a = cudf.DataFrame() +df_a['key'] = ['a', 'b', 'c', 'd', 'e'] +df_a['vals_a'] = [float(i + 10) for i in range(5)] + +df_b = cudf.DataFrame() +df_b['key'] = ['a', 'c', 'e'] +df_b['vals_b'] = [float(i+100) for i in range(3)] + +merged = df_a.merge(df_b, on=['key'], how='left') +merged +``` + +```{code-cell} ipython3 +ddf_a = dask_cudf.from_cudf(df_a, npartitions=2) +ddf_b = dask_cudf.from_cudf(df_b, npartitions=2) + +merged = ddf_a.merge(ddf_b, on=['key'], how='left').compute() +merged +``` + +## Append + ++++ + +Appending values from another `Series` or array-like object. + +```{code-cell} ipython3 +s.append(s) +``` + +```{code-cell} ipython3 +ds2.append(ds2).compute() +``` + +## Grouping + ++++ + +Like pandas, cuDF and Dask-cuDF support the Split-Apply-Combine groupby paradigm. + +```{code-cell} ipython3 +df['agg_col1'] = [1 if x % 2 == 0 else 0 for x in range(len(df))] +df['agg_col2'] = [1 if x % 3 == 0 else 0 for x in range(len(df))] + +ddf = dask_cudf.from_cudf(df, npartitions=2) +``` + +Grouping and then applying the `sum` function to the grouped data. + +```{code-cell} ipython3 +df.groupby('agg_col1').sum() +``` + +```{code-cell} ipython3 +ddf.groupby('agg_col1').sum().compute() +``` + +Grouping hierarchically then applying the `sum` function to grouped data. + +```{code-cell} ipython3 +df.groupby(['agg_col1', 'agg_col2']).sum() +``` + +```{code-cell} ipython3 +ddf.groupby(['agg_col1', 'agg_col2']).sum().compute() +``` + +Grouping and applying statistical functions to specific columns, using `agg`. 
+ +```{code-cell} ipython3 +df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}) +``` + +```{code-cell} ipython3 +ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute() +``` + +## Transpose + ++++ + +Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF. + +```{code-cell} ipython3 +sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) +sample +``` + +```{code-cell} ipython3 +sample.transpose() +``` + +Time Series +------------ + ++++ + +`DataFrames` supports `datetime` typed columns, which allow users to interact with and filter data based on specific timestamps. + +```{code-cell} ipython3 +import datetime as dt + +date_df = cudf.DataFrame() +date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D') +date_df['value'] = cp.random.sample(len(date_df)) + +search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d') +date_df.query('date <= @search_date') +``` + +```{code-cell} ipython3 +date_ddf = dask_cudf.from_cudf(date_df, npartitions=2) +date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute() +``` + +Categoricals +------------ + ++++ + +`DataFrames` support categorical columns. + +```{code-cell} ipython3 +gdf = cudf.DataFrame({"id": [1, 2, 3, 4, 5, 6], "grade":['a', 'b', 'b', 'a', 'a', 'e']}) +gdf['grade'] = gdf['grade'].astype('category') +gdf +``` + +```{code-cell} ipython3 +dgdf = dask_cudf.from_cudf(gdf, npartitions=2) +dgdf.compute() +``` + +Accessing the categories of a column. Note that this is currently not supported in Dask-cuDF. + +```{code-cell} ipython3 +gdf.grade.cat.categories +``` + +Accessing the underlying code values of each categorical observation. 
+ +```{code-cell} ipython3 +gdf.grade.cat.codes +``` + +```{code-cell} ipython3 +dgdf.grade.cat.codes.compute() +``` + +Converting Data Representation +-------------------------------- + ++++ + +## Pandas + ++++ + +Converting a cuDF and Dask-cuDF `DataFrame` to a pandas `DataFrame`. + +```{code-cell} ipython3 +df.head().to_pandas() +``` + +```{code-cell} ipython3 +ddf.compute().head().to_pandas() +``` + +## Numpy + ++++ + +Converting a cuDF or Dask-cuDF `DataFrame` to a numpy `ndarray`. + +```{code-cell} ipython3 +df.to_numpy() +``` + +```{code-cell} ipython3 +ddf.compute().to_numpy() +``` + +Converting a cuDF or Dask-cuDF `Series` to a numpy `ndarray`. + +```{code-cell} ipython3 +df['a'].to_numpy() +``` + +```{code-cell} ipython3 +ddf['a'].compute().to_numpy() +``` + +## Arrow + ++++ + +Converting a cuDF or Dask-cuDF `DataFrame` to a PyArrow `Table`. + +```{code-cell} ipython3 +df.to_arrow() +``` + +```{code-cell} ipython3 +ddf.compute().to_arrow() +``` + +Getting Data In/Out +------------------------ + ++++ + +## CSV + ++++ + +Writing to a CSV file. + +```{code-cell} ipython3 +if not os.path.exists('example_output'): + os.mkdir('example_output') + +df.to_csv('example_output/foo.csv', index=False) +``` + +```{code-cell} ipython3 +ddf.compute().to_csv('example_output/foo_dask.csv', index=False) +``` + +Reading from a csv file. + +```{code-cell} ipython3 +df = cudf.read_csv('example_output/foo.csv') +df +``` + +```{code-cell} ipython3 +ddf = dask_cudf.read_csv('example_output/foo_dask.csv') +ddf.compute() +``` + +Reading all CSV files in a directory into a single `dask_cudf.DataFrame`, using the star wildcard. + +```{code-cell} ipython3 +ddf = dask_cudf.read_csv('example_output/*.csv') +ddf.compute() +``` + +## Parquet + ++++ + +Writing to parquet files, using the CPU via PyArrow. + +```{code-cell} ipython3 +df.to_parquet('example_output/temp_parquet') +``` + +Reading parquet files with a GPU-accelerated parquet reader. 
+ +```{code-cell} ipython3 +df = cudf.read_parquet('example_output/temp_parquet') +df +``` + +Writing to parquet files from a `dask_cudf.DataFrame` using PyArrow under the hood. + +```{code-cell} ipython3 +ddf.to_parquet('example_files') +``` + +## ORC + ++++ + +Reading ORC files. + +```{code-cell} ipython3 +import os +from pathlib import Path +current_dir = os.path.dirname(os.path.realpath("__file__")) +cudf_root = Path(current_dir).parents[3] +file_path = os.path.join(cudf_root, "python", "cudf", "cudf", "tests", "data", "orc", "TestOrcFile.test1.orc") +file_path +``` + +```{code-cell} ipython3 +df2 = cudf.read_orc(file_path) +df2 +``` + +Dask Performance Tips +-------------------------------- + +Like Apache Spark, Dask operations are [lazy](https://en.wikipedia.org/wiki/Lazy_evaluation). Instead of being executed at that moment, most operations are added to a task graph and the actual evaluation is delayed until the result is needed. + +Sometimes, though, we want to force the execution of operations. Calling `persist` on a Dask collection fully computes it (or actively computes it in the background), persisting the result into memory. When we're using distributed systems, we may want to wait until `persist` is finished before beginning any downstream operations. We can enforce this contract by using `wait`. Wrapping an operation with `wait` will ensure it doesn't begin executing until all necessary upstream operations have finished. + +The snippets below provide basic examples, using `LocalCUDACluster` to create one dask-worker per GPU on the local machine. For more detailed information about `persist` and `wait`, please see the Dask documentation for [persist](https://docs.dask.org/en/latest/api.html#dask.persist) and [wait](https://docs.dask.org/en/latest/futures.html#distributed.wait). Wait relies on the concept of Futures, which is beyond the scope of this tutorial. 
For more information on Futures, see the Dask [Futures](https://docs.dask.org/en/latest/futures.html) documentation. For more information about multi-GPU clusters, please see the [dask-cuda](https://github.com/rapidsai/dask-cuda) library (documentation is in progress). + ++++ + +First, we set up a GPU cluster. With our `client` set up, Dask-cuDF computation will be distributed across the GPUs in the cluster. + +```{code-cell} ipython3 +import time + +from dask.distributed import Client, wait +from dask_cuda import LocalCUDACluster + +cluster = LocalCUDACluster() +client = Client(cluster) +client +``` + +### Persisting Data +Next, we create our Dask-cuDF DataFrame and apply a transformation, storing the result as a new column. + +```{code-cell} ipython3 +nrows = 10000000 + +df2 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)}) +ddf2 = dask_cudf.from_cudf(df2, npartitions=5) +ddf2['c'] = ddf2['a'] + 5 +ddf2 +``` + +```{code-cell} ipython3 +!nvidia-smi +``` + +Because Dask is lazy, the computation has not yet occurred. We can see that there are twenty tasks in the task graph and we've used about 800 MB of memory. We can force computation by using `persist`. By forcing execution, the result is now explicitly in memory and our task graph only contains one task per partition (the baseline). + +```{code-cell} ipython3 +ddf2 = ddf2.persist() +ddf2 +``` + +```{code-cell} ipython3 +!nvidia-smi +``` + +Because we forced computation, we now have a larger object in distributed GPU memory. + ++++ + +### Wait +Depending on our workflow or distributed computing setup, we may want to `wait` until all upstream tasks have finished before proceeding with a specific function. This section shows an example of this behavior, adapted from the Dask documentation. + +First, we create a new Dask DataFrame and define a function that we'll map to every partition in the dataframe. 
+
+```{code-cell} ipython3
+import random
+
+nrows = 10000000
+
+df1 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)})
+ddf1 = dask_cudf.from_cudf(df1, npartitions=100)
+
+def func(df):
+    time.sleep(random.randint(1, 60))
+    return (df + 5) * 3 - 11
+```
+
+This function will do a basic transformation of every column in the dataframe, but the time spent in the function will vary due to the `time.sleep` statement randomly adding 1-60 seconds of time. We'll run this on every partition of our dataframe using `map_partitions`, which adds the task to our task-graph, and store the result. We can then call `persist` to force execution.
+
+```{code-cell} ipython3
+results_ddf = ddf1.map_partitions(func)
+results_ddf = results_ddf.persist()
+```
+
+However, some partitions will be done **much** sooner than others. If we had downstream processes that should wait for all partitions to be completed, we can enforce that behavior using `wait`.
+
+```{code-cell} ipython3
+wait(results_ddf)
+```
+
+With `wait`, we can safely proceed with our workflow.
+
+```{code-cell} ipython3
+
+```
diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb
deleted file mode 100644
index 54fe774060e..00000000000
--- a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb
+++ /dev/null
@@ -1,3466 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Working with missing data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this section, we will discuss missing (also referred to as `NA`) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by ``. These values are also referenced as \"null values\"."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "1. [How to Detect missing values](#How-to-Detect-missing-values)\n",
-    "2. 
[Float dtypes and missing data](#Float-dtypes-and-missing-data)\n", - "3. [Datetimes](#Datetimes)\n", - "4. [Calculations with missing data](#Calculations-with-missing-data)\n", - "5. [Sum/product of Null/nans](#Sum/product-of-Null/nans)\n", - "6. [NA values in GroupBy](#NA-values-in-GroupBy)\n", - "7. [Inserting missing data](#Inserting-missing-data)\n", - "8. [Filling missing values: fillna](#Filling-missing-values:-fillna)\n", - "9. [Filling with cudf Object](#Filling-with-cudf-Object)\n", - "10. [Dropping axis labels with missing data: dropna](#Dropping-axis-labels-with-missing-data:-dropna)\n", - "11. [Replacing generic values](#Replacing-generic-values)\n", - "12. [String/regular expression replacement](#String/regular-expression-replacement)\n", - "13. [Numeric replacement](#Numeric-replacement)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to Detect missing values" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To detect missing values, you can use `isna()` and `notna()` functions." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import cudf\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]})" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
010.1
12<NA>
2<NA>2.3
3417.17
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 0.1\n", - "1 2 \n", - "2 2.3\n", - "3 4 17.17" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0FalseFalse
1FalseTrue
2TrueFalse
3FalseFalse
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 False False\n", - "1 False True\n", - "2 True False\n", - "3 False False" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isna()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 True\n", - "1 True\n", - "2 False\n", - "3 True\n", - "Name: a, dtype: bool" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['a'].notna()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do. Note that cudf/NumPy uses the fact that `np.nan != np.nan`, and treats `None` like `np.nan`." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "None == None" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.nan == np.nan" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 False\n", - "1 False\n", - "2 False\n", - "3 False\n", - "Name: b, dtype: bool" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['b'] == np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": 
[], - "source": [ - "s = cudf.Series([None, 1, 2])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 \n", - "1 1\n", - "2 2\n", - "dtype: int64" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 False\n", - "1 False\n", - "2 False\n", - "dtype: bool" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s == None" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "s = cudf.Series([1, 2, np.nan], nan_as_null=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1.0\n", - "1 2.0\n", - "2 NaN\n", - "dtype: float64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 False\n", - "1 False\n", - "2 False\n", - "dtype: bool" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s == np.nan" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Float dtypes and missing data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because ``NaN`` is a float, a column of integers with even one missing values is cast to floating-point dtype. However this doesn't happen by default.\n", - "\n", - "By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 \n", - "dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([1, 2, np.nan])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Hence to consider a ``NaN`` as ``NaN`` you will have to pass `nan_as_null=False` parameter into `Series` constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1.0\n", - "1 2.0\n", - "2 NaN\n", - "dtype: float64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([1, 2, np.nan], nan_as_null=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Datetimes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. 
The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2012-01-01 00:00:00.000000\n", - "1 \n", - "2 2012-01-01 00:00:00.000000\n", - "dtype: datetime64[us]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", - "datetime_series" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2012-01-01\n", - "1 NaT\n", - "2 2012-01-01\n", - "dtype: datetime64[ns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "datetime_series.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "any operations on rows having `` values in `datetime` column will result in `` value at the same location in resulting column:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0 days 00:00:00\n", - "1 \n", - "2 0 days 00:00:00\n", - "dtype: timedelta64[us]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "datetime_series - datetime_series" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculations with missing data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Null values propagate naturally through arithmetic operations between pandas objects." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)})" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])})" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 NaN\n", - "1 2.0\n", - "2 2 3.2\n", - "3 3 0.1\n", - "4 1.0" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
010.23
11122.0
223.2
334<NA>
4101.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 0.23\n", - "1 11 22.0\n", - "2 2 3.2\n", - "3 34 \n", - "4 10 1.0" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
02NaN
1<NA>24.0
246.4
337<NA>
4<NA>2.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 2 NaN\n", - "1 24.0\n", - "2 4 6.4\n", - "3 37 \n", - "4 2.0" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1 + df2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While summing the data along a series, `NA` values will be treated as `0`." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 \n", - "2 2\n", - "3 3\n", - "4 \n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a']" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since `NA` values are treated as `0`, the mean would result to 2 in this case `(1 + 0 + 2 + 3 + 0)/5 = 2`" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.0" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To preserve `NA` values in the above calculations, `sum` & `mean` support `skipna` parameter.\n", - "By default it's value is\n", - "set to `True`, we can change it to `False` to preserve `NA` values." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nan" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].sum(skipna=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nan" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].mean(skipna=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cumulative methods like `cumsum` and `cumprod` ignore `NA` values by default." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 \n", - "2 3\n", - "3 6\n", - "4 \n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].cumsum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To preserve `NA` values in cumulative methods, provide `skipna=False`." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].cumsum(skipna=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sum/product of Null/nans" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sum of an empty or all-NA Series of a DataFrame is 0." 
- ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.0" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([np.nan], nan_as_null=False).sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nan" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([np.nan], nan_as_null=False).sum(skipna=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.0" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([], dtype='float64').sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The product of an empty or all-NA Series of a DataFrame is 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([np.nan], nan_as_null=False).prod()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nan" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([np.nan], nan_as_null=False).prod(skipna=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cudf.Series([], dtype='float64').prod()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NA values in GroupBy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`NA` groups in GroupBy are automatically excluded. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 NaN\n", - "1 2.0\n", - "2 2 3.2\n", - "3 3 0.1\n", - "4 1.0" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
b
a
23.2
1NaN
30.1
\n", - "
" - ], - "text/plain": [ - " b\n", - "a \n", - "2 3.2\n", - "1 NaN\n", - "3 0.1" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.groupby('a').mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is also possible to include `NA` in groups by passing `dropna=False`" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
b
a
23.2
1NaN
30.1
<NA>1.5
\n", - "
" - ], - "text/plain": [ - " b\n", - "a \n", - "2 3.2\n", - "1 NaN\n", - "3 0.1\n", - " 1.5" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.groupby('a', dropna=False).mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inserting missing data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All dtypes support insertion of missing value by assignment. Any specific location in series can made null by assigning it to `None`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "series = cudf.Series([1, 2, 3, 4])" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "3 4\n", - "dtype: int64" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "series[2] = None" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 \n", - "3 4\n", - "dtype: int64" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filling missing values: fillna" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`fillna()` can fill in `NA` & `NaN` values with non-NA data." - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 NaN\n", - "1 2.0\n", - "2 2 3.2\n", - "3 3 0.1\n", - "4 1.0" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 10.0\n", - "1 2.0\n", - "2 3.2\n", - "3 0.1\n", - "4 1.0\n", - "Name: b, dtype: float64" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['b'].fillna(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filling with cudf Object" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column." - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "import cupy as cp\n", - "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC'))" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "dff.iloc[3:5, 0] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "dff.iloc[4:6, 1] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "dff.iloc[5:8, 2] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ABC
00.7712450.0510241.199239
1-1.1680410.702664-0.270806
2-1.467009-0.143080-0.806151
3NaN-0.610798-0.272895
4NaNNaN1.396784
5-0.439343NaNNaN
61.093102-0.764758NaN
70.003098-0.722648NaN
8-0.095899-1.285156-0.300566
90.1094652.497843-1.199856
\n", - "
" - ], - "text/plain": [ - " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 NaN -0.610798 -0.272895\n", - "4 NaN NaN 1.396784\n", - "5 -0.439343 NaN NaN\n", - "6 1.093102 -0.764758 NaN\n", - "7 0.003098 -0.722648 NaN\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dff" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ABC
00.7712450.0510241.199239
1-1.1680410.702664-0.270806
2-1.467009-0.143080-0.806151
3-0.149173-0.610798-0.272895
4-0.149173-0.0343641.396784
5-0.439343-0.034364-0.036322
61.093102-0.764758-0.036322
70.003098-0.722648-0.036322
8-0.095899-1.285156-0.300566
90.1094652.497843-1.199856
\n", - "
" - ], - "text/plain": [ - " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 -0.149173 -0.610798 -0.272895\n", - "4 -0.149173 -0.034364 1.396784\n", - "5 -0.439343 -0.034364 -0.036322\n", - "6 1.093102 -0.764758 -0.036322\n", - "7 0.003098 -0.722648 -0.036322\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dff.fillna(dff.mean())" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ABC
00.7712450.0510241.199239
1-1.1680410.702664-0.270806
2-1.467009-0.143080-0.806151
3NaN-0.610798-0.272895
4NaN-0.0343641.396784
5-0.439343-0.034364-0.036322
61.093102-0.764758-0.036322
70.003098-0.722648-0.036322
8-0.095899-1.285156-0.300566
90.1094652.497843-1.199856
\n", - "
" - ], - "text/plain": [ - " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 NaN -0.610798 -0.272895\n", - "4 NaN -0.034364 1.396784\n", - "5 -0.439343 -0.034364 -0.036322\n", - "6 1.093102 -0.764758 -0.036322\n", - "7 0.003098 -0.722648 -0.036322\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dff.fillna(dff.mean()[1:3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dropping axis labels with missing data: dropna" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Missing data can be excluded using `dropna()`:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 NaN\n", - "1 2.0\n", - "2 2 3.2\n", - "3 3 0.1\n", - "4 1.0" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
223.2
330.1
\n", - "
" - ], - "text/plain": [ - " a b\n", - "2 2 3.2\n", - "3 3 0.1" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.dropna(axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0
1
2
3
4
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: [0, 1, 2, 3, 4]" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.dropna(axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An equivalent `dropna()` is available for Series. " - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "2 2\n", - "3 3\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1['a'].dropna()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Replacing generic values" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Often times we want to replace arbitrary values with other values.\n", - "\n", - "`replace()` in Series and `replace()` in DataFrame provides an efficient yet flexible way to perform such replacements." 
- ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "series = cudf.Series([0.0, 1.0, 2.0, 3.0, 4.0])" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0.0\n", - "1 1.0\n", - "2 2.0\n", - "3 3.0\n", - "4 4.0\n", - "dtype: float64" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 5.0\n", - "1 1.0\n", - "2 2.0\n", - "3 3.0\n", - "4 4.0\n", - "dtype: float64" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series.replace(0, 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also replace any value with a `` value." - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 \n", - "1 1.0\n", - "2 2.0\n", - "3 3.0\n", - "4 4.0\n", - "dtype: float64" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series.replace(0, None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can replace a list of values by a list of other values:" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 4.0\n", - "1 3.0\n", - "2 2.0\n", - "3 1.0\n", - "4 0.0\n", - "dtype: float64" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also specify a mapping dict:" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - 
"metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 10.0\n", - "1 100.0\n", - "2 2.0\n", - "3 3.0\n", - "4 4.0\n", - "dtype: float64" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "series.replace({0: 10, 1: 100})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For a DataFrame, you can specify individual values by column:" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "df = cudf.DataFrame({\"a\": [0, 1, 2, 3, 4], \"b\": [5, 6, 7, 8, 9]})" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
005
116
227
338
449
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 0 5\n", - "1 1 6\n", - "2 2 7\n", - "3 3 8\n", - "4 4 9" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0100100
116
227
338
449
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 100 100\n", - "1 1 6\n", - "2 2 7\n", - "3 3 8\n", - "4 4 9" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace({\"a\": 0, \"b\": 5}, 100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## String/regular expression replacement" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "cudf supports replacing string values using `replace` API:" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "d = {\"a\": list(range(4)), \"b\": list(\"ab..\"), \"c\": [\"a\", \"b\", None, \"d\"]}" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "df = cudf.DataFrame(d)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00aa
11bb
22.<NA>
33.d
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 a a\n", - "1 1 b b\n", - "2 2 . \n", - "3 3 . d" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00aa
11bb
22A Dot<NA>
33A Dotd
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 a a\n", - "1 1 b b\n", - "2 2 A Dot \n", - "3 3 A Dot d" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace(\".\", \"A Dot\")" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00aa
11<NA><NA>
22A Dot<NA>
33A Dotd
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 a a\n", - "1 1 \n", - "2 2 A Dot \n", - "3 3 A Dot d" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace([\".\", \"b\"], [\"A Dot\", None])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Replace a few different values (list -> list):\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00bb
11bb
22--<NA>
33--d
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 b b\n", - "1 1 b b\n", - "2 2 -- \n", - "3 3 -- d" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace([\"a\", \".\"], [\"b\", \"--\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Only search in column 'b' (dict -> dict):" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
00aa
11bb
22replacement value<NA>
33replacement valued
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 0 a a\n", - "1 1 b b\n", - "2 2 replacement value \n", - "3 3 replacement value d" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace({\"b\": \".\"}, {\"b\": \"replacement value\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Numeric replacement" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`replace()` can also be used similar to `fillna()`." - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [], - "source": [ - "df = cudf.DataFrame(cp.random.randn(10, 2))" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [], - "source": [ - "df[np.random.rand(df.shape[0]) > 0.5] = 1.5" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
0<NA><NA>
1<NA><NA>
20.1231607461.09464783
3<NA><NA>
4<NA><NA>
50.68137677-0.357346253
6<NA><NA>
7<NA><NA>
81.173285961-0.968616065
90.147922362-0.154880098
\n", - "
" - ], - "text/plain": [ - " 0 1\n", - "0 \n", - "1 \n", - "2 0.123160746 1.09464783\n", - "3 \n", - "4 \n", - "5 0.68137677 -0.357346253\n", - "6 \n", - "7 \n", - "8 1.173285961 -0.968616065\n", - "9 0.147922362 -0.154880098" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace(1.5, None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Replacing more than one value is possible by passing a list.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [], - "source": [ - "df00 = df.iloc[0, 0]" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
05.0000005.000000
15.0000005.000000
20.1231611.094648
35.0000005.000000
45.0000005.000000
50.681377-0.357346
65.0000005.000000
75.0000005.000000
81.173286-0.968616
90.147922-0.154880
\n", - "
" - ], - "text/plain": [ - " 0 1\n", - "0 5.000000 5.000000\n", - "1 5.000000 5.000000\n", - "2 0.123161 1.094648\n", - "3 5.000000 5.000000\n", - "4 5.000000 5.000000\n", - "5 0.681377 -0.357346\n", - "6 5.000000 5.000000\n", - "7 5.000000 5.000000\n", - "8 1.173286 -0.968616\n", - "9 0.147922 -0.154880" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.replace([1.5, df00], [5, 10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also operate on the DataFrame in place:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [], - "source": [ - "df.replace(1.5, None, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
0<NA><NA>
1<NA><NA>
20.1231607461.09464783
3<NA><NA>
4<NA><NA>
50.68137677-0.357346253
6<NA><NA>
7<NA><NA>
81.173285961-0.968616065
90.147922362-0.154880098
\n", - "
" - ], - "text/plain": [ - " 0 1\n", - "0 \n", - "1 \n", - "2 0.123160746 1.09464783\n", - "3 \n", - "4 \n", - "5 0.68137677 -0.357346253\n", - "6 \n", - "7 \n", - "8 1.173285961 -0.968616065\n", - "9 0.147922362 -0.154880098" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.md b/docs/cudf/source/user_guide/Working-with-missing-data.md new file mode 100644 index 00000000000..6932d0fa9f1 --- /dev/null +++ b/docs/cudf/source/user_guide/Working-with-missing-data.md @@ -0,0 +1,489 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Working with missing data + ++++ + +In this section, we will discuss missing (also referred to as `NA`) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by ``. These values are also referenced as "null values". + ++++ + +1. [How to Detect missing values](#How-to-Detect-missing-values) +2. [Float dtypes and missing data](#Float-dtypes-and-missing-data) +3. [Datetimes](#Datetimes) +4. [Calculations with missing data](#Calculations-with-missing-data) +5. [Sum/product of Null/nans](#Sum/product-of-Null/nans) +6. [NA values in GroupBy](#NA-values-in-GroupBy) +7. [Inserting missing data](#Inserting-missing-data) +8. 
[Filling missing values: fillna](#Filling-missing-values:-fillna) +9. [Filling with cudf Object](#Filling-with-cudf-Object) +10. [Dropping axis labels with missing data: dropna](#Dropping-axis-labels-with-missing-data:-dropna) +11. [Replacing generic values](#Replacing-generic-values) +12. [String/regular expression replacement](#String/regular-expression-replacement) +13. [Numeric replacement](#Numeric-replacement) + ++++ + +## How to Detect missing values + ++++ + +To detect missing values, you can use `isna()` and `notna()` functions. + +```{code-cell} ipython3 +import cudf +import numpy as np +``` + +```{code-cell} ipython3 +df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]}) +``` + +```{code-cell} ipython3 +df +``` + +```{code-cell} ipython3 +df.isna() +``` + +```{code-cell} ipython3 +df['a'].notna() +``` + +One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do. Note that cudf/NumPy uses the fact that `np.nan != np.nan`, and treats `None` like `np.nan`. + +```{code-cell} ipython3 +None == None +``` + +```{code-cell} ipython3 +np.nan == np.nan +``` + +So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information. + + +```{code-cell} ipython3 +df['b'] == np.nan +``` + +```{code-cell} ipython3 +s = cudf.Series([None, 1, 2]) +``` + +```{code-cell} ipython3 +s +``` + +```{code-cell} ipython3 +s == None +``` + +```{code-cell} ipython3 +s = cudf.Series([1, 2, np.nan], nan_as_null=False) +``` + +```{code-cell} ipython3 +s +``` + +```{code-cell} ipython3 +s == np.nan +``` + +## Float dtypes and missing data + ++++ + +Because ``NaN`` is a float, a column of integers with even one missing values is cast to floating-point dtype. However this doesn't happen by default. + +By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value. 
+ +```{code-cell} ipython3 +cudf.Series([1, 2, np.nan]) +``` + +Hence to consider a ``NaN`` as ``NaN`` you will have to pass `nan_as_null=False` parameter into `Series` constructor. + +```{code-cell} ipython3 +cudf.Series([1, 2, np.nan], nan_as_null=False) +``` + +## Datetimes + ++++ + +For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object. + + +```{code-cell} ipython3 +import pandas as pd +datetime_series = cudf.Series([pd.Timestamp("20120101"), pd.NaT, pd.Timestamp("20120101")]) +datetime_series +``` + +```{code-cell} ipython3 +datetime_series.to_pandas() +``` + +any operations on rows having `` values in `datetime` column will result in `` value at the same location in resulting column: + +```{code-cell} ipython3 +datetime_series - datetime_series +``` + +## Calculations with missing data + ++++ + +Null values propagate naturally through arithmetic operations between pandas objects. + +```{code-cell} ipython3 +df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)}) +``` + +```{code-cell} ipython3 +df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])}) +``` + +```{code-cell} ipython3 +df1 +``` + +```{code-cell} ipython3 +df2 +``` + +```{code-cell} ipython3 +df1 + df2 +``` + +While summing the data along a series, `NA` values will be treated as `0`. + +```{code-cell} ipython3 +df1['a'] +``` + +```{code-cell} ipython3 +df1['a'].sum() +``` + +Since `NA` values are treated as `0`, the mean would result to 2 in this case `(1 + 0 + 2 + 3 + 0)/5 = 2` + +```{code-cell} ipython3 +df1['a'].mean() +``` + +To preserve `NA` values in the above calculations, `sum` & `mean` support `skipna` parameter. 
+By default it's value is +set to `True`, we can change it to `False` to preserve `NA` values. + +```{code-cell} ipython3 +df1['a'].sum(skipna=False) +``` + +```{code-cell} ipython3 +df1['a'].mean(skipna=False) +``` + +Cumulative methods like `cumsum` and `cumprod` ignore `NA` values by default. + +```{code-cell} ipython3 +df1['a'].cumsum() +``` + +To preserve `NA` values in cumulative methods, provide `skipna=False`. + +```{code-cell} ipython3 +df1['a'].cumsum(skipna=False) +``` + +## Sum/product of Null/nans + ++++ + +The sum of an empty or all-NA Series of a DataFrame is 0. + +```{code-cell} ipython3 +cudf.Series([np.nan], nan_as_null=False).sum() +``` + +```{code-cell} ipython3 +cudf.Series([np.nan], nan_as_null=False).sum(skipna=False) +``` + +```{code-cell} ipython3 +cudf.Series([], dtype='float64').sum() +``` + +The product of an empty or all-NA Series of a DataFrame is 1. + +```{code-cell} ipython3 +cudf.Series([np.nan], nan_as_null=False).prod() +``` + +```{code-cell} ipython3 +cudf.Series([np.nan], nan_as_null=False).prod(skipna=False) +``` + +```{code-cell} ipython3 +cudf.Series([], dtype='float64').prod() +``` + +## NA values in GroupBy + ++++ + +`NA` groups in GroupBy are automatically excluded. For example: + +```{code-cell} ipython3 +df1 +``` + +```{code-cell} ipython3 +df1.groupby('a').mean() +``` + +It is also possible to include `NA` in groups by passing `dropna=False` + +```{code-cell} ipython3 +df1.groupby('a', dropna=False).mean() +``` + +## Inserting missing data + ++++ + +All dtypes support insertion of missing value by assignment. Any specific location in series can made null by assigning it to `None`. + +```{code-cell} ipython3 +series = cudf.Series([1, 2, 3, 4]) +``` + +```{code-cell} ipython3 +series +``` + +```{code-cell} ipython3 +series[2] = None +``` + +```{code-cell} ipython3 +series +``` + +## Filling missing values: fillna + ++++ + +`fillna()` can fill in `NA` & `NaN` values with non-NA data. 
+ +```{code-cell} ipython3 +df1 +``` + +```{code-cell} ipython3 +df1['b'].fillna(10) +``` + +## Filling with cudf Object + ++++ + +You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column. + +```{code-cell} ipython3 +import cupy as cp +dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC')) +``` + +```{code-cell} ipython3 +dff.iloc[3:5, 0] = np.nan +``` + +```{code-cell} ipython3 +dff.iloc[4:6, 1] = np.nan +``` + +```{code-cell} ipython3 +dff.iloc[5:8, 2] = np.nan +``` + +```{code-cell} ipython3 +dff +``` + +```{code-cell} ipython3 +dff.fillna(dff.mean()) +``` + +```{code-cell} ipython3 +dff.fillna(dff.mean()[1:3]) +``` + +## Dropping axis labels with missing data: dropna + ++++ + +Missing data can be excluded using `dropna()`: + + +```{code-cell} ipython3 +df1 +``` + +```{code-cell} ipython3 +df1.dropna(axis=0) +``` + +```{code-cell} ipython3 +df1.dropna(axis=1) +``` + +An equivalent `dropna()` is available for Series. + +```{code-cell} ipython3 +df1['a'].dropna() +``` + +## Replacing generic values + ++++ + +Often times we want to replace arbitrary values with other values. + +`replace()` in Series and `replace()` in DataFrame provides an efficient yet flexible way to perform such replacements. + +```{code-cell} ipython3 +series = cudf.Series([0.0, 1.0, 2.0, 3.0, 4.0]) +``` + +```{code-cell} ipython3 +series +``` + +```{code-cell} ipython3 +series.replace(0, 5) +``` + +We can also replace any value with a `` value. 
+ +```{code-cell} ipython3 +series.replace(0, None) +``` + +You can replace a list of values by a list of other values: + +```{code-cell} ipython3 +series.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) +``` + +You can also specify a mapping dict: + +```{code-cell} ipython3 +series.replace({0: 10, 1: 100}) +``` + +For a DataFrame, you can specify individual values by column: + +```{code-cell} ipython3 +df = cudf.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) +``` + +```{code-cell} ipython3 +df +``` + +```{code-cell} ipython3 +df.replace({"a": 0, "b": 5}, 100) +``` + +## String/regular expression replacement + ++++ + +cudf supports replacing string values using `replace` API: + +```{code-cell} ipython3 +d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", None, "d"]} +``` + +```{code-cell} ipython3 +df = cudf.DataFrame(d) +``` + +```{code-cell} ipython3 +df +``` + +```{code-cell} ipython3 +df.replace(".", "A Dot") +``` + +```{code-cell} ipython3 +df.replace([".", "b"], ["A Dot", None]) +``` + +Replace a few different values (list -> list): + +```{code-cell} ipython3 +df.replace(["a", "."], ["b", "--"]) +``` + +Only search in column 'b' (dict -> dict): + +```{code-cell} ipython3 +df.replace({"b": "."}, {"b": "replacement value"}) +``` + +## Numeric replacement + ++++ + +`replace()` can also be used similar to `fillna()`. + +```{code-cell} ipython3 +df = cudf.DataFrame(cp.random.randn(10, 2)) +``` + +```{code-cell} ipython3 +df[np.random.rand(df.shape[0]) > 0.5] = 1.5 +``` + +```{code-cell} ipython3 +df.replace(1.5, None) +``` + +Replacing more than one value is possible by passing a list. 
+ + +```{code-cell} ipython3 +df00 = df.iloc[0, 0] +``` + +```{code-cell} ipython3 +df.replace([1.5, df00], [5, 10]) +``` + +You can also operate on the DataFrame in place: + + +```{code-cell} ipython3 +df.replace(1.5, None, inplace=True) +``` + +```{code-cell} ipython3 +df +``` diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb deleted file mode 100644 index 0d05ddb00b4..00000000000 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ /dev/null @@ -1,2313 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Overview of User Defined Functions with cuDF" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import cudf\n", - "from cudf.datasets import randomdata\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame.\n", - "\n", - "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures:\n", - "\n", - "- Series\n", - "- DataFrame\n", - "- Rolling Windows Series\n", - "- Groupby DataFrames\n", - "- CuPy NDArrays\n", - "- Numba DeviceNDArrays\n", - "\n", - "It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Series UDFs\n", - "\n", - "You can execute UDFs on Series in two ways:\n", - "\n", - "- Writing a standard python function and using `cudf.Series.apply`\n", - "- Writing a Numba kernel and using Numba's `forall` syntax\n", - "\n", - "Using `apply` or is simpler, but writing a Numba kernel offers the flexibility to build more complex functions (we'll be writing only simple kernels in this guide)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `cudf.Series.apply`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Here is a very basic example." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a cuDF series\n", - "sr = cudf.Series([1, 2, 3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "UDFs destined for `cudf.Series.apply` might look something like this:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# define a scalar function\n", - "def f(x):\n", - " return x + 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`cudf.Series.apply` is called like `pd.Series.apply` and returns a new `Series` object:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2\n", - "1 3\n", - "2 4\n", - "dtype: int64" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sr.apply(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Functions with Additional Scalar Arguments" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition, `cudf.Series.apply` supports `args=` just like 
pandas, allowing you to write UDFs that accept an arbitrary number of scalar arguments. Here is an example of such a function and it's API call in both pandas and cuDF:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def g(x, const):\n", - " return x + const" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 43\n", - "1 44\n", - "2 45\n", - "dtype: int64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# cuDF apply\n", - "sr.apply(g, args=(42,))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a final note, `**kwargs` is not yet supported." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nullable Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The null value `NA` an propagates through unary and binary operations. Thus, `NA + 1`, `abs(NA)`, and `NA == NA` all return `NA`. 
To make this concrete, let's look at the same example from above, this time using nullable data:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 \n", - "2 3\n", - "dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create a cuDF series with nulls\n", - "sr = cudf.Series([1, cudf.NA, 3])\n", - "sr" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# redefine the same function from above\n", - "def f(x):\n", - " return x + 1" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2\n", - "1 \n", - "2 4\n", - "dtype: int64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# cuDF result\n", - "sr.apply(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Often however you want explicit null handling behavior inside the function. cuDF exposes this capability the same way as pandas, by interacting directly with the `NA` singleton object. 
Here's an example of a function with explicit null handling:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def f_null_sensitive(x):\n", - " # do something if the input is null\n", - " if x is cudf.NA:\n", - " return 42\n", - " else:\n", - " return x + 1" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2\n", - "1 42\n", - "2 4\n", - "dtype: int64" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# cuDF result\n", - "sr.apply(f_null_sensitive)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lower level control with custom `numba` kernels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html).\n", - "Note that this section requires basic CUDA knowledge. Refer to [numba's CUDA documentation](https://numba.pydata.org/numba-doc/latest/cuda/index.html) for details.\n", - "\n", - "The easiest way to write a Numba kernel is to use `cuda.grid(1)` to manage thread indices, and then leverage Numba's `forall` method to configure the kernel for us. Below, define a basic multiplication kernel as an example and use `@cuda.jit` to compile it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from numba import cuda\n", - "\n", - "@cuda.jit\n", - "def multiply(in_col, out_col, multiplier):\n", - " i = cuda.grid(1)\n", - " if i < in_col.size: # boundary guard\n", - " out_col[i] = in_col[i] * multiplier" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. Leaving this out can result in undefined behavior.\n", - "\n", - "To execute our kernel, must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply in the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "size = len(df['a'])\n", - "df['e'] = 0.0\n", - "multiply.forall(size)(df['a'], df['e'], 10.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After calling our kernel, our DataFrame is now populated with the result." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abce
096310059979630.0
197710269809770.0
210481026101910480.0
3107896098510780.0
497998210119790.0
\n", - "
" - ], - "text/plain": [ - " a b c e\n", - "0 963 1005 997 9630.0\n", - "1 977 1026 980 9770.0\n", - "2 1048 1026 1019 10480.0\n", - "3 1078 960 985 10780.0\n", - "4 979 982 1011 9790.0" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This API allows a you to theoretically write arbitrary kernel logic, potentially accessing and using elements of the series at arbitrary indices and use them on cuDF data structures. Advanced developers with some CUDA experience can often use this capability to implement iterative transformations, or spot treat problem areas of a data pipeline with a custom kernel that does the same job faster." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DataFrame UDFs\n", - "\n", - "Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input:\n", - "\n", - "- `cudf.DataFrame.apply`, which functions like `pd.DataFrame.apply` and expects a row udf\n", - "- `cudf.DataFrame.apply_rows`, which is a thin wrapper around numba and expects a numba kernel\n", - "- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `cudf.DataFrame.apply`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`cudf.DataFrame.apply` is the main entrypoint for UDFs that expect multiple columns as input and produce a single output column. Functions intended to be consumed by this API are written in terms of a \"row\" argument. The \"row\" is considered to be like a dictionary and contains all of the column values at a certain `iloc` in a `DataFrame`. 
The function can access these values by key within the function, the keys being the column names corresponding to the desired value. Below is an example function that would be used to add column `A` and column `B` together inside a UDF." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def f(row):\n", - " return row['A'] + row['B']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's create some very basic toy data containing at least one null." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AB
014
12<NA>
236
\n", - "
" - ], - "text/plain": [ - " A B\n", - "0 1 4\n", - "1 2 \n", - "2 3 6" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = cudf.DataFrame({\n", - " 'A': [1,2,3],\n", - " 'B': [4,cudf.NA,6]\n", - "})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally call the function as you would in pandas - by using a lambda function to map the UDF onto \"rows\" of the DataFrame: " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 5\n", - "1 \n", - "2 9\n", - "dtype: int64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The same function should produce the same result as pandas:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 5\n", - "1 \n", - "2 9\n", - "dtype: object" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.to_pandas(nullable=True).apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that Pandas returns `object` dtype - see notes on this in the caveats section." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Like `cudf.Series.apply`, these functions support generalized null handling. Here's a function that conditionally returns a different value if a certain input is null:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
a
01
1<NA>
23
\n", - "
" - ], - "text/plain": [ - " a\n", - "0 1\n", - "1 \n", - "2 3" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def f(row):\n", - " x = row['a']\n", - " if x is cudf.NA:\n", - " return 0\n", - " else:\n", - " return x + 1\n", - "\n", - "df = cudf.DataFrame({'a': [1, cudf.NA, 3]})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2\n", - "1 0\n", - "2 4\n", - "dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`cudf.NA` can also be directly returned from a function resulting in data that has the the correct nulls in the end, just as if it were run in Pandas. For the following data, the last row fulfills the condition that `1 + 3 > 3` and returns `NA` for that row:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
012
121
231
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 2\n", - "1 2 1\n", - "2 3 1" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def f(row):\n", - " x = row['a']\n", - " y = row['b']\n", - " if x + y > 3:\n", - " return cudf.NA\n", - " else:\n", - " return x + y\n", - "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [2, 1, 1]\n", - "})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 3\n", - "1 3\n", - "2 \n", - "dtype: int64" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mixed types are allowed, but will return the common type, rather than object as in Pandas. Here's a null aware op between an int and a float column:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
010.5
12<NA>
233.14
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 1 0.5\n", - "1 2 \n", - "2 3 3.14" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def f(row):\n", - " return row['a'] + row['b']\n", - "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [0.5, cudf.NA, 3.14]\n", - "})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1.5\n", - "1 \n", - "2 6.14\n", - "dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Functions may also return scalar values, however the result will be promoted to a safe type regardless of the data. This means even if you have a function like:\n", - "\n", - "```python\n", - "def f(x):\n", - " if x > 1000:\n", - " return 1.5\n", - " else:\n", - " return 2\n", - "```\n", - "And your data is:\n", - "```python\n", - "[1,2,3,4,5]\n", - "```\n", - "You will get floats in the final data even though a float is never returned. This is because Numba ultimately needs to produce one function that can handle any data, which means if there's any possibility a float could result, you must always assume it will happen. Here's an example of a function that returns a scalar in some cases:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
a
01
13
25
\n", - "
" - ], - "text/plain": [ - " a\n", - "0 1\n", - "1 3\n", - "2 5" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def f(row):\n", - " x = row['a']\n", - " if x > 3:\n", - " return x\n", - " else:\n", - " return 1.5\n", - "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 3, 5]\n", - "})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1.5\n", - "1 1.5\n", - "2 5.0\n", - "dtype: float64" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Any number of columns and many arithmetic operators are supported, allowing for complex UDFs:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcde
014<NA>87
125471
236486
\n", - "
" - ], - "text/plain": [ - " a b c d e\n", - "0 1 4 8 7\n", - "1 2 5 4 7 1\n", - "2 3 6 4 8 6" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def f(row):\n", - " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", - "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3],\n", - " 'b': [4, 5, 6],\n", - " 'c': [cudf.NA, 4, 4],\n", - " 'd': [8, 7, 8],\n", - " 'e': [7, 1, 6]\n", - "})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 \n", - "1 2.428571429\n", - "2 8.5\n", - "dtype: float64" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.apply(f, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Numba kernels for DataFrames" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "We could apply a UDF on a DataFrame like we did above with `forall`. We'd need to write a kernel that expects multiple inputs, and pass multiple Series as arguments when we execute our kernel. Because this is fairly common and can be difficult to manage, cuDF provides two APIs to streamline this: `apply_rows` and `apply_chunks`. Below, we walk through an example of using `apply_rows`. `apply_chunks` works in a similar way, but also offers more control over low-level kernel behavior.\n", - "\n", - "Now that we have two numeric columns in our DataFrame, let's write a kernel that uses both of them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def conditional_add(x, y, out):\n", - " for i, (a, e) in enumerate(zip(x, y)):\n", - " if a > 0:\n", - " out[i] = a + e\n", - " else:\n", - " out[i] = a" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that we need to `enumerate` through our `zipped` function arguments (which either match or are mapped to our input column names). We can pass this kernel to `apply_rows`. We'll need to specify a few arguments:\n", - "- incols\n", - " - A list of names of input columns that match the function arguments. Or, a dictionary mapping input column names to their corresponding function arguments such as `{'col1': 'arg1'}`.\n", - "- outcols\n", - " - A dictionary defining our output column names and their data types. These names must match our function arguments.\n", - "- kwargs (optional)\n", - " - We can optionally pass keyword arguments as a dictionary. Since we don't need any, we pass an empty one.\n", - " \n", - "While it looks like our function is looping sequentially through our columns, it actually executes in parallel in multiple threads on the GPU. This parallelism is the heart of GPU-accelerated computing. With that background, we're ready to use our UDF." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcdeout
014<NA>878.0
1254713.0
2364869.0
\n", - "
" - ], - "text/plain": [ - " a b c d e out\n", - "0 1 4 8 7 8.0\n", - "1 2 5 4 7 1 3.0\n", - "2 3 6 4 8 6 9.0" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.apply_rows(conditional_add, \n", - " incols={'a':'x', 'e':'y'},\n", - " outcols={'out': np.float64},\n", - " kwargs={}\n", - " )\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, we see our conditional addition worked. At this point, we've successfully executed UDFs on the core data structures of cuDF." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Null Handling in `apply_rows` and `apply_chunks`\n", - "\n", - "By default, DataFrame methods for applying UDFs like `apply_rows` will handle nulls pessimistically (all rows with a null value will be removed from the output if they are used in the kernel). Exploring how not handling not pessimistically can lead to undefined behavior is outside the scope of this guide. Suffice it to say, pessimistic null handling is the safe and consistent approach. You can see an example below." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
09631005997
19771026<NA>
2<NA>10261019
31078<NA>985
49799821011
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "0 963 1005 997\n", - "1 977 1026 \n", - "2 1026 1019\n", - "3 1078 985\n", - "4 979 982 1011" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def gpu_add(a, b, out):\n", - " for i, (x, y) in enumerate(zip(a, b)):\n", - " out[i] = x + y\n", - "\n", - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)\n", - "df.loc[2, 'a'] = None\n", - "df.loc[3, 'b'] = None\n", - "df.loc[1, 'c'] = None\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the dataframe above, there are three null values. Each column has a null in a different row. When we use our UDF with `apply_rows`, our output should have two nulls due to pessimistic null handling (because we're not using column `c`, the null value there does not matter to us)." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcout
096310059971968.0
19771026<NA>2003.0
2<NA>10261019<NA>
31078<NA>985<NA>
497998210111961.0
\n", - "
" - ], - "text/plain": [ - " a b c out\n", - "0 963 1005 997 1968.0\n", - "1 977 1026 2003.0\n", - "2 1026 1019 \n", - "3 1078 985 \n", - "4 979 982 1011 1961.0" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.apply_rows(gpu_add, \n", - " incols=['a', 'b'],\n", - " outcols={'out':np.float64},\n", - " kwargs={})\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, we end up with two nulls in our output. The null values from the columns we used propogated to our output, but the null from the column we ignored did not." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Rolling Window UDFs\n", - "\n", - "For time-series data, we may need to operate on a small \\\"window\\\" of our column at a time, processing each portion independently. We could slide (\\\"roll\\\") this window over the entire column to answer questions like \\\"What is the 3-day moving average of a stock price over the past year?\"\n", - "\n", - "We can apply more complex functions to rolling windows to `rolling` Series and DataFrames using `apply`. This example is adapted from cuDF's [API documentation](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.DataFrame.rolling.html). First, we'll create an example Series and then create a `rolling` object from the Series." 
- ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 16.0\n", - "1 25.0\n", - "2 36.0\n", - "3 49.0\n", - "4 64.0\n", - "5 81.0\n", - "dtype: float64" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", - "ser" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Rolling [window=3,min_periods=3,center=False]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rolling = ser.rolling(window=3, min_periods=3, center=False)\n", - "rolling" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll define a function to use on our rolling windows. We created this one to highlight how you can include things like loops, mathematical functions, and conditionals. Rolling window UDFs do not yet support null values." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "\n", - "def example_func(window):\n", - " b = 0\n", - " for a in window:\n", - " b = max(b, math.sqrt(a))\n", - " if b == 8:\n", - " return 100 \n", - " return b" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can execute the function by passing it to `apply`. With `window=3`, `min_periods=3`, and `center=False`, our first two values are `null`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 \n", - "1 \n", - "2 6.0\n", - "3 7.0\n", - "4 100.0\n", - "5 9.0\n", - "dtype: float64" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rolling.apply(example_func)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can apply this function to every column in a DataFrame, too." - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
055.055.0
156.056.0
257.057.0
358.058.0
459.059.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 55.0 55.0\n", - "1 56.0 56.0\n", - "2 57.0 57.0\n", - "3 58.0 58.0\n", - "4 59.0 59.0" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2 = cudf.DataFrame()\n", - "df2['a'] = np.arange(55, 65, dtype='float64')\n", - "df2['b'] = np.arange(55, 65, dtype='float64')\n", - "df2.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0<NA><NA>
1<NA><NA>
27.5498344357.549834435
37.6157731067.615773106
47.6811457487.681145748
57.7459666927.745966692
67.8102496767.810249676
77.8740078747.874007874
87.9372539337.937253933
9100.0100.0
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 \n", - "1 \n", - "2 7.549834435 7.549834435\n", - "3 7.615773106 7.615773106\n", - "4 7.681145748 7.681145748\n", - "5 7.745966692 7.745966692\n", - "6 7.810249676 7.810249676\n", - "7 7.874007874 7.874007874\n", - "8 7.937253933 7.937253933\n", - "9 100.0 100.0" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rolling = df2.rolling(window=3, min_periods=3, center=False)\n", - "rolling.apply(example_func)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GroupBy DataFrame UDFs\n", - "\n", - "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", - "\n", - "First, we'll group our DataFrame based on column `b`, which is either True or False." - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abce
0-0.691674TrueDan-0.958380
10.480099FalseBob-0.729580
2-0.473370TrueXavier-0.767454
30.067479TrueAlice-0.380205
4-0.970850FalseSarah0.342905
\n", - "
" - ], - "text/plain": [ - " a b c e\n", - "0 -0.691674 True Dan -0.958380\n", - "1 0.480099 False Bob -0.729580\n", - "2 -0.473370 True Xavier -0.767454\n", - "3 0.067479 True Alice -0.380205\n", - "4 -0.970850 False Sarah 0.342905" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "grouped = df.groupby(['b'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next we'll define a function to apply to each group independently. In this case, we'll take the rolling average of column `e`, and call that new column `rolling_avg_e`." - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "def rolling_avg(e, rolling_avg_e):\n", - " win_size = 3\n", - " for i in range(cuda.threadIdx.x, len(e), cuda.blockDim.x):\n", - " if i < win_size - 1:\n", - " # If there is not enough data to fill the window,\n", - " # take the average to be NaN\n", - " rolling_avg_e[i] = np.nan\n", - " else:\n", - " total = 0\n", - " for j in range(i - win_size + 1, i + 1):\n", - " total += e[j]\n", - " rolling_avg_e[i] = total / win_size" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can execute this with a very similar API to `apply_rows`. This time, though, it's going to execute independently for each group." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcerolling_avg_e
10.480099FalseBob-0.729580NaN
4-0.970850FalseSarah0.342905NaN
60.801430FalseSarah0.6323370.081887
7-0.933157FalseQuinn-0.4208260.184805
0-0.691674TrueDan-0.958380NaN
2-0.473370TrueXavier-0.767454NaN
30.067479TrueAlice-0.380205-0.702013
50.837494TrueWendy-0.057540-0.401733
80.913899TrueUrsula0.4662520.009502
9-0.725581TrueGeorge0.4052450.271319
\n", - "
" - ], - "text/plain": [ - " a b c e rolling_avg_e\n", - "1 0.480099 False Bob -0.729580 NaN\n", - "4 -0.970850 False Sarah 0.342905 NaN\n", - "6 0.801430 False Sarah 0.632337 0.081887\n", - "7 -0.933157 False Quinn -0.420826 0.184805\n", - "0 -0.691674 True Dan -0.958380 NaN\n", - "2 -0.473370 True Xavier -0.767454 NaN\n", - "3 0.067479 True Alice -0.380205 -0.702013\n", - "5 0.837494 True Wendy -0.057540 -0.401733\n", - "8 0.913899 True Ursula 0.466252 0.009502\n", - "9 -0.725581 True George 0.405245 0.271319" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results = grouped.apply_grouped(rolling_avg,\n", - " incols=['e'],\n", - " outcols=dict(rolling_avg_e=np.float64))\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice how, with a window size of three in the kernel, the first two values in each group for our output column are null." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Numba Kernels on CuPy Arrays\n", - "\n", - "We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 1., 2., 3., 4., 10.])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import cupy as cp\n", - "\n", - "s = cudf.Series([1.0, 2, 3, 4, 10])\n", - "arr = cp.asarray(s)\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we define a UDF and execute it on our Series. We need to allocate a Series of the same size for our output, which we'll call `out`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 5\n", - "1 10\n", - "2 15\n", - "3 20\n", - "4 50\n", - "dtype: int32" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@cuda.jit\n", - "def multiply_by_5(x, out):\n", - " i = cuda.grid(1)\n", - " if i < x.size:\n", - " out[i] = x[i] * 5\n", - " \n", - "out = cudf.Series(cp.zeros(len(s), dtype='int32'))\n", - "multiply_by_5.forall(s.shape[0])(s, out)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the same function on our array. We allocate an empty array `out` to store our results." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 5., 10., 15., 20., 50.])" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "out = cp.empty_like(arr)\n", - "multiply_by_5.forall(arr.size)(arr, out)\n", - "out" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Caveats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`.\n", - "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "This guide has covered a lot of content. 
At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on\n", - "\n", - "- Series\n", - "- DataFrame\n", - "- Rolling Windows\n", - "- GroupBy DataFrames\n", - "- CuPy NDArrays\n", - "- Numba DeviceNDArrays\n", - "- Generalized NA UDFs\n", - "\n", - "\n", - "For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.pydata.org/numba-doc/dev/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/cudf/source/user_guide/guide-to-udfs.md b/docs/cudf/source/user_guide/guide-to-udfs.md new file mode 100644 index 00000000000..b4f8d7b6f75 --- /dev/null +++ b/docs/cudf/source/user_guide/guide-to-udfs.md @@ -0,0 +1,558 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Overview of User Defined Functions with cuDF + +```{code-cell} ipython3 +import cudf +from cudf.datasets import randomdata +import numpy as np +``` + +Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame. 
+
+In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures:
+
+- Series
+- DataFrame
+- Rolling Windows Series
+- Groupby DataFrames
+- CuPy NDArrays
+- Numba DeviceNDArrays
+
+It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values.
+
++++
+
+## Series UDFs
+
+You can execute UDFs on Series in two ways:
+
+- Writing a standard python function and using `cudf.Series.apply`
+- Writing a Numba kernel and using Numba's `forall` syntax
+
+Using `apply` is simpler, but writing a Numba kernel offers the flexibility to build more complex functions (we'll be writing only simple kernels in this guide).
+
++++
+
+# `cudf.Series.apply`
+
++++
+
+cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Here is a very basic example.
+
+```{code-cell} ipython3
+# Create a cuDF series
+sr = cudf.Series([1, 2, 3])
+```
+
+UDFs destined for `cudf.Series.apply` might look something like this:
+
+```{code-cell} ipython3
+# define a scalar function
+def f(x):
+    return x + 1
+```
+
+`cudf.Series.apply` is called like `pd.Series.apply` and returns a new `Series` object:
+
+```{code-cell} ipython3
+sr.apply(f)
+```
+
+### Functions with Additional Scalar Arguments
+
++++
+
+In addition, `cudf.Series.apply` supports `args=` just like pandas, allowing you to write UDFs that accept an arbitrary number of scalar arguments. Here is an example of such a function and its API call in both pandas and cuDF:
+
+```{code-cell} ipython3
+def g(x, const):
+    return x + const
+```
+
+```{code-cell} ipython3
+# cuDF apply
+sr.apply(g, args=(42,))
+```
+
+As a final note, `**kwargs` is not yet supported. 
+
++++
+
+### Nullable Data
+
++++
+
+The null value `NA` propagates through unary and binary operations. Thus, `NA + 1`, `abs(NA)`, and `NA == NA` all return `NA`. To make this concrete, let's look at the same example from above, this time using nullable data:
+
+```{code-cell} ipython3
+# Create a cuDF series with nulls
+sr = cudf.Series([1, cudf.NA, 3])
+sr
+```
+
+```{code-cell} ipython3
+# redefine the same function from above
+def f(x):
+    return x + 1
+```
+
+```{code-cell} ipython3
+# cuDF result
+sr.apply(f)
+```
+
+Often however you want explicit null handling behavior inside the function. cuDF exposes this capability the same way as pandas, by interacting directly with the `NA` singleton object. Here's an example of a function with explicit null handling:
+
+```{code-cell} ipython3
+def f_null_sensitive(x):
+    # do something if the input is null
+    if x is cudf.NA:
+        return 42
+    else:
+        return x + 1
+```
+
+```{code-cell} ipython3
+# cuDF result
+sr.apply(f_null_sensitive)
+```
+
+In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases.
+
++++
+
+### Lower level control with custom `numba` kernels
+
++++
+
+In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html).
+Note that this section requires basic CUDA knowledge. Refer to [numba's CUDA documentation](https://numba.pydata.org/numba-doc/latest/cuda/index.html) for details.
+
+The easiest way to write a Numba kernel is to use `cuda.grid(1)` to manage thread indices, and then leverage Numba's `forall` method to configure the kernel for us. Below, we define a basic multiplication kernel as an example and use `@cuda.jit` to compile it. 
+
+```{code-cell} ipython3
+df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)
+```
+
+```{code-cell} ipython3
+from numba import cuda
+
+@cuda.jit
+def multiply(in_col, out_col, multiplier):
+    i = cuda.grid(1)
+    if i < in_col.size: # boundary guard
+        out_col[i] = in_col[i] * multiplier
+```
+
+This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. Leaving this out can result in undefined behavior.
+
+To execute our kernel, we must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series.
+
+```{code-cell} ipython3
+size = len(df['a'])
+df['e'] = 0.0
+multiply.forall(size)(df['a'], df['e'], 10.0)
+```
+
+After calling our kernel, our DataFrame is now populated with the result.
+
+```{code-cell} ipython3
+df.head()
+```
+
+This API allows you to theoretically write arbitrary kernel logic, potentially accessing and using elements of the series at arbitrary indices and use them on cuDF data structures. Advanced developers with some CUDA experience can often use this capability to implement iterative transformations, or spot treat problem areas of a data pipeline with a custom kernel that does the same job faster. 
+ ++++ + +## DataFrame UDFs + +Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input: + +- `cudf.DataFrame.apply`, which functions like `pd.DataFrame.apply` and expects a row udf +- `cudf.DataFrame.apply_rows`, which is a thin wrapper around numba and expects a numba kernel +- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control. + ++++ + +# `cudf.DataFrame.apply` + ++++ + +`cudf.DataFrame.apply` is the main entrypoint for UDFs that expect multiple columns as input and produce a single output column. Functions intended to be consumed by this API are written in terms of a "row" argument. The "row" is considered to be like a dictionary and contains all of the column values at a certain `iloc` in a `DataFrame`. The function can access these values by key within the function, the keys being the column names corresponding to the desired value. Below is an example function that would be used to add column `A` and column `B` together inside a UDF. + +```{code-cell} ipython3 +def f(row): + return row['A'] + row['B'] +``` + +Let's create some very basic toy data containing at least one null. + +```{code-cell} ipython3 +df = cudf.DataFrame({ + 'A': [1,2,3], + 'B': [4,cudf.NA,6] +}) +df +``` + +Finally call the function as you would in pandas - by using a lambda function to map the UDF onto "rows" of the DataFrame: + +```{code-cell} ipython3 +df.apply(f, axis=1) +``` + +The same function should produce the same result as pandas: + +```{code-cell} ipython3 +df.to_pandas(nullable=True).apply(f, axis=1) +``` + +Notice that Pandas returns `object` dtype - see notes on this in the caveats section. + ++++ + +Like `cudf.Series.apply`, these functions support generalized null handling. 
Here's a function that conditionally returns a different value if a certain input is null:
+
+```{code-cell} ipython3
+def f(row):
+    x = row['a']
+    if x is cudf.NA:
+        return 0
+    else:
+        return x + 1
+
+df = cudf.DataFrame({'a': [1, cudf.NA, 3]})
+df
+```
+
+```{code-cell} ipython3
+df.apply(f, axis=1)
+```
+
+`cudf.NA` can also be directly returned from a function resulting in data that has the correct nulls in the end, just as if it were run in Pandas. For the following data, the last row fulfills the condition that `1 + 3 > 3` and returns `NA` for that row:
+
+```{code-cell} ipython3
+def f(row):
+    x = row['a']
+    y = row['b']
+    if x + y > 3:
+        return cudf.NA
+    else:
+        return x + y
+
+df = cudf.DataFrame({
+    'a': [1, 2, 3],
+    'b': [2, 1, 1]
+})
+df
+```
+
+```{code-cell} ipython3
+df.apply(f, axis=1)
+```
+
+Mixed types are allowed, but will return the common type, rather than object as in Pandas. Here's a null aware op between an int and a float column:
+
+```{code-cell} ipython3
+def f(row):
+    return row['a'] + row['b']
+
+df = cudf.DataFrame({
+    'a': [1, 2, 3],
+    'b': [0.5, cudf.NA, 3.14]
+})
+df
+```
+
+```{code-cell} ipython3
+df.apply(f, axis=1)
+```
+
+Functions may also return scalar values, however the result will be promoted to a safe type regardless of the data. This means even if you have a function like:
+
+```python
+def f(x):
+    if x > 1000:
+        return 1.5
+    else:
+        return 2
+```
+And your data is:
+```python
+[1,2,3,4,5]
+```
+You will get floats in the final data even though a float is never returned. This is because Numba ultimately needs to produce one function that can handle any data, which means if there's any possibility a float could result, you must always assume it will happen. 
Here's an example of a function that returns a scalar in some cases: + +```{code-cell} ipython3 +def f(row): + x = row['a'] + if x > 3: + return x + else: + return 1.5 + +df = cudf.DataFrame({ + 'a': [1, 3, 5] +}) +df +``` + +```{code-cell} ipython3 +df.apply(f, axis=1) +``` + +Any number of columns and many arithmetic operators are supported, allowing for complex UDFs: + +```{code-cell} ipython3 +def f(row): + return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e'] + +df = cudf.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': [cudf.NA, 4, 4], + 'd': [8, 7, 8], + 'e': [7, 1, 6] +}) +df +``` + +```{code-cell} ipython3 +df.apply(f, axis=1) +``` + +# Numba kernels for DataFrames + ++++ + + +We could apply a UDF on a DataFrame like we did above with `forall`. We'd need to write a kernel that expects multiple inputs, and pass multiple Series as arguments when we execute our kernel. Because this is fairly common and can be difficult to manage, cuDF provides two APIs to streamline this: `apply_rows` and `apply_chunks`. Below, we walk through an example of using `apply_rows`. `apply_chunks` works in a similar way, but also offers more control over low-level kernel behavior. + +Now that we have two numeric columns in our DataFrame, let's write a kernel that uses both of them. + +```{code-cell} ipython3 +def conditional_add(x, y, out): + for i, (a, e) in enumerate(zip(x, y)): + if a > 0: + out[i] = a + e + else: + out[i] = a +``` + +Notice that we need to `enumerate` through our `zipped` function arguments (which either match or are mapped to our input column names). We can pass this kernel to `apply_rows`. We'll need to specify a few arguments: +- incols + - A list of names of input columns that match the function arguments. Or, a dictionary mapping input column names to their corresponding function arguments such as `{'col1': 'arg1'}`. +- outcols + - A dictionary defining our output column names and their data types. 
These names must match our function arguments.
+- kwargs (optional)
+    - We can optionally pass keyword arguments as a dictionary. Since we don't need any, we pass an empty one.
+
+While it looks like our function is looping sequentially through our columns, it actually executes in parallel in multiple threads on the GPU. This parallelism is the heart of GPU-accelerated computing. With that background, we're ready to use our UDF.
+
+```{code-cell} ipython3
+df = df.apply_rows(conditional_add,
+                   incols={'a':'x', 'e':'y'},
+                   outcols={'out': np.float64},
+                   kwargs={}
+                  )
+df.head()
+```
+
+As expected, we see our conditional addition worked. At this point, we've successfully executed UDFs on the core data structures of cuDF.
+
++++
+
+## Null Handling in `apply_rows` and `apply_chunks`
+
+By default, DataFrame methods for applying UDFs like `apply_rows` will handle nulls pessimistically (all rows with a null value will be removed from the output if they are used in the kernel). Exploring how not handling nulls pessimistically can lead to undefined behavior is outside the scope of this guide. Suffice it to say, pessimistic null handling is the safe and consistent approach. You can see an example below.
+
+```{code-cell} ipython3
+def gpu_add(a, b, out):
+    for i, (x, y) in enumerate(zip(a, b)):
+        out[i] = x + y
+
+df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)
+df.loc[2, 'a'] = None
+df.loc[3, 'b'] = None
+df.loc[1, 'c'] = None
+df.head()
+```
+
+In the dataframe above, there are three null values. Each column has a null in a different row. When we use our UDF with `apply_rows`, our output should have two nulls due to pessimistic null handling (because we're not using column `c`, the null value there does not matter to us).
+
+```{code-cell} ipython3
+df = df.apply_rows(gpu_add,
+                   incols=['a', 'b'],
+                   outcols={'out':np.float64},
+                   kwargs={})
+df.head()
+```
+
+As expected, we end up with two nulls in our output. 
The null values from the columns we used propagated to our output, but the null from the column we ignored did not.
+
++++
+
+## Rolling Window UDFs
+
+For time-series data, we may need to operate on a small "window" of our column at a time, processing each portion independently. We could slide ("roll") this window over the entire column to answer questions like "What is the 3-day moving average of a stock price over the past year?"
+
+We can apply more complex functions to rolling windows to `rolling` Series and DataFrames using `apply`. This example is adapted from cuDF's [API documentation](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.DataFrame.rolling.html). First, we'll create an example Series and then create a `rolling` object from the Series.
+
+```{code-cell} ipython3
+ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')
+ser
+```
+
+```{code-cell} ipython3
+rolling = ser.rolling(window=3, min_periods=3, center=False)
+rolling
+```
+
+Next, we'll define a function to use on our rolling windows. We created this one to highlight how you can include things like loops, mathematical functions, and conditionals. Rolling window UDFs do not yet support null values.
+
+```{code-cell} ipython3
+import math
+
+def example_func(window):
+    b = 0
+    for a in window:
+        b = max(b, math.sqrt(a))
+    if b == 8:
+        return 100
+    return b
+```
+
+We can execute the function by passing it to `apply`. With `window=3`, `min_periods=3`, and `center=False`, our first two values are `null`.
+
+```{code-cell} ipython3
+rolling.apply(example_func)
+```
+
+We can apply this function to every column in a DataFrame, too.
+ +```{code-cell} ipython3 +df2 = cudf.DataFrame() +df2['a'] = np.arange(55, 65, dtype='float64') +df2['b'] = np.arange(55, 65, dtype='float64') +df2.head() +``` + +```{code-cell} ipython3 +rolling = df2.rolling(window=3, min_periods=3, center=False) +rolling.apply(example_func) +``` + +## GroupBy DataFrame UDFs + +We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation](). + +First, we'll group our DataFrame based on column `b`, which is either True or False. + +```{code-cell} ipython3 +df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12) +df.head() +``` + +```{code-cell} ipython3 +grouped = df.groupby(['b']) +``` + +Next we'll define a function to apply to each group independently. In this case, we'll take the rolling average of column `e`, and call that new column `rolling_avg_e`. + +```{code-cell} ipython3 +def rolling_avg(e, rolling_avg_e): + win_size = 3 + for i in range(cuda.threadIdx.x, len(e), cuda.blockDim.x): + if i < win_size - 1: + # If there is not enough data to fill the window, + # take the average to be NaN + rolling_avg_e[i] = np.nan + else: + total = 0 + for j in range(i - win_size + 1, i + 1): + total += e[j] + rolling_avg_e[i] = total / win_size +``` + +We can execute this with a very similar API to `apply_rows`. This time, though, it's going to execute independently for each group. + +```{code-cell} ipython3 +results = grouped.apply_grouped(rolling_avg, + incols=['e'], + outcols=dict(rolling_avg_e=np.float64)) +results +``` + +Notice how, with a window size of three in the kernel, the first two values in each group for our output column are null. + ++++ + +## Numba Kernels on CuPy Arrays + +We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series. 
+ +```{code-cell} ipython3 +import cupy as cp + +s = cudf.Series([1.0, 2, 3, 4, 10]) +arr = cp.asarray(s) +arr +``` + +Next, we define a UDF and execute it on our Series. We need to allocate a Series of the same size for our output, which we'll call `out`. + +```{code-cell} ipython3 +@cuda.jit +def multiply_by_5(x, out): + i = cuda.grid(1) + if i < x.size: + out[i] = x[i] * 5 + +out = cudf.Series(cp.zeros(len(s), dtype='int32')) +multiply_by_5.forall(s.shape[0])(s, out) +out +``` + +Finally, we execute the same function on our array. We allocate an empty array `out` to store our results. + +```{code-cell} ipython3 +out = cp.empty_like(arr) +multiply_by_5.forall(arr.size)(arr, out) +out +``` + +## Caveats + ++++ + +- Only numeric, non-decimal scalar types are currently supported; strings and structured types are planned for a future release. Attempting to use this API with those types will throw a `TypeError`. +- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a GitHub issue should be raised so that it can be properly prioritized and implemented. + ++++ + +## Summary + +This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on + +- Series +- DataFrame +- Rolling Windows +- GroupBy DataFrames +- CuPy NDArrays +- Numba DeviceNDArrays +- Generalized NA UDFs + + +For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.pydata.org/numba-doc/dev/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation. diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst index 1061008eb3c..11cfc8a50a7 100644 --- a/docs/cudf/source/user_guide/index.rst +++ b/docs/cudf/source/user_guide/index.rst @@ -6,7 +6,7 @@ User Guide .. 
toctree:: :maxdepth: 2 - 10min.ipynb - 10min-cudf-cupy.ipynb - guide-to-udfs.ipynb - Working-with-missing-data.ipynb + 10min.md + 10min-cudf-cupy.md + guide-to-udfs.md + Working-with-missing-data.md From bb807e1849508406341fc045c2ff6f12531a6d81 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 13:02:07 -0400 Subject: [PATCH 02/14] Reorg --- docs/cudf/source/basics/basics.rst | 62 -------------- docs/cudf/source/basics/index.rst | 15 ---- .../cudf/source/user_guide/10min-cudf-cupy.md | 2 +- .../{basics => user_guide}/PandasCompat.rst | 0 .../{basics => user_guide}/dask-cudf.rst | 0 docs/cudf/source/user_guide/data_types.rst | 80 +++++++++++++++++++ .../source/{basics => user_guide}/groupby.rst | 0 docs/cudf/source/user_guide/index.rst | 11 ++- .../{basics => user_guide}/internals.rst | 0 .../io-gds-integration.rst | 0 .../io-nvcomp-integration.rst | 0 .../io-supported-types.rst | 0 .../cudf/source/{basics => user_guide}/io.rst | 0 13 files changed, 90 insertions(+), 80 deletions(-) delete mode 100644 docs/cudf/source/basics/basics.rst delete mode 100644 docs/cudf/source/basics/index.rst rename docs/cudf/source/{basics => user_guide}/PandasCompat.rst (100%) rename docs/cudf/source/{basics => user_guide}/dask-cudf.rst (100%) create mode 100644 docs/cudf/source/user_guide/data_types.rst rename docs/cudf/source/{basics => user_guide}/groupby.rst (100%) rename docs/cudf/source/{basics => user_guide}/internals.rst (100%) rename docs/cudf/source/{basics => user_guide}/io-gds-integration.rst (100%) rename docs/cudf/source/{basics => user_guide}/io-nvcomp-integration.rst (100%) rename docs/cudf/source/{basics => user_guide}/io-supported-types.rst (100%) rename docs/cudf/source/{basics => user_guide}/io.rst (100%) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst deleted file mode 100644 index 9b8983fba49..00000000000 --- a/docs/cudf/source/basics/basics.rst +++ /dev/null @@ -1,62 +0,0 @@ -Basics -====== - - -Supported 
Dtypes ----------------- - -cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, -``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, -``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). - - -The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. - -.. rst-class:: special-table -.. table:: - - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +=================+==================+==============================================================+==============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - 
+-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Lists | ListDtype | list | ``'list'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Structs | StructDtype | dict | ``'struct'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - -**Note: All dtypes above are Nullable** - -.. _np.int8: -.. _np.int16: -.. _np.int32: -.. _np.int64: -.. _np.uint8: -.. _np.uint16: -.. _np.uint32: -.. _np.uint64: -.. _np.float32: -.. _np.float64: -.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html -.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes -.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/basics/index.rst b/docs/cudf/source/basics/index.rst deleted file mode 100644 index a29866d7e32..00000000000 --- a/docs/cudf/source/basics/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -====== -Basics -====== - - -.. toctree:: - :maxdepth: 2 - - basics - io.rst - groupby.rst - PandasCompat.rst - dask-cudf.rst - internals.rst - \ No newline at end of file diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.md b/docs/cudf/source/user_guide/10min-cudf-cupy.md index a087a3f3a0e..0397507c7b3 100644 --- a/docs/cudf/source/user_guide/10min-cudf-cupy.md +++ b/docs/cudf/source/user_guide/10min-cudf-cupy.md @@ -11,7 +11,7 @@ kernelspec: name: python3 --- -# 10 Minutes to cuDF and CuPy +# Interoperability of cuDF with CuPy This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations). diff --git a/docs/cudf/source/basics/PandasCompat.rst b/docs/cudf/source/user_guide/PandasCompat.rst similarity index 100% rename from docs/cudf/source/basics/PandasCompat.rst rename to docs/cudf/source/user_guide/PandasCompat.rst diff --git a/docs/cudf/source/basics/dask-cudf.rst b/docs/cudf/source/user_guide/dask-cudf.rst similarity index 100% rename from docs/cudf/source/basics/dask-cudf.rst rename to docs/cudf/source/user_guide/dask-cudf.rst diff --git a/docs/cudf/source/user_guide/data_types.rst b/docs/cudf/source/user_guide/data_types.rst new file mode 100644 index 00000000000..afa4a1202eb --- /dev/null +++ b/docs/cudf/source/user_guide/data_types.rst @@ -0,0 +1,80 @@ +.. _basics.datatypes: + +Supported Data Types +==================== + +cuDF lets you store and operate on many different types of data on the +GPU. Each type of data is associated with a data type (or "dtype"). 
+cuDF supports many data types supported by NumPy and Pandas, including +numeric, datetime, timedelta, categorical and string data types. In +addition cuDF supports special data types for decimals and "nested +types" (lists and structs). + +Unlike in Pandas, all data types in cuDF are nullable. +See :doc:`Working With Missing Data `. + + +.. rst-class:: special-table +.. table:: + + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +=================+============================+==============================================================+==============================================+ + | Integer |np.dtype(...) | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Float |np.dtype(...) 
| np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Strings |np.dtype('object') | `str `_ | ``'string'``, ``'object'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Datetime |np.dtype('datetime64[...]') | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Timedelta |np.dtype('timedelta64[...]')| np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Categorical |cudf.CategoricalDtype(...) |(none) | ``'category'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Boolean |np.dtype('bool') | np.bool_ | ``'bool'`` | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Decimal |cudf.Decimal32Dtype(...), |(none) |(none) | + | |cudf.Decimal64Dtype(...), | | | + | |cudf.Decimal128Dtype(...) | | | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Lists |cudf.ListDtype(...) 
| list |(none) | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + | Structs |cudf.StructDtype(...) | dict |(none) | + +-----------------+----------------------------+--------------------------------------------------------------+----------------------------------------------+ + + +A note on strings +----------------- + +The data type associated with string data in cuDF is ``"object"``. + +.. code:: python + >>> import cudf + >>> s = cudf.Series(["abc", "def", "ghi"]) + >>> s.dtype + dtype("object") + +This is for compatibility with Pandas, but it can be misleading. In +both NumPy and Pandas, ``"object"`` is the data type associated data +composed of arbitrary Python objects (not just strings). However, +cuDF does not support storing arbitrary Python objects. + + +.. _np.int8: +.. _np.int16: +.. _np.int32: +.. _np.int64: +.. _np.uint8: +.. _np.uint16: +.. _np.uint32: +.. _np.uint64: +.. _np.float32: +.. _np.float64: +.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html +.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes +.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/user_guide/groupby.rst similarity index 100% rename from docs/cudf/source/basics/groupby.rst rename to docs/cudf/source/user_guide/groupby.rst diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst index 11cfc8a50a7..ba0ba05bfaf 100644 --- a/docs/cudf/source/user_guide/index.rst +++ b/docs/cudf/source/user_guide/index.rst @@ -7,6 +7,13 @@ User Guide :maxdepth: 2 10min.md - 10min-cudf-cupy.md - guide-to-udfs.md + data_types.rst + io.rst Working-with-missing-data.md + groupby.rst + guide-to-udfs.md + cupy-interop.md + guide-to-udfs.md + dask-cudf.rst + internals.rst + PandasCompat.rst diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/user_guide/internals.rst similarity index 100% rename from docs/cudf/source/basics/internals.rst rename to docs/cudf/source/user_guide/internals.rst diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/user_guide/io-gds-integration.rst similarity index 100% rename from docs/cudf/source/basics/io-gds-integration.rst rename to docs/cudf/source/user_guide/io-gds-integration.rst diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/user_guide/io-nvcomp-integration.rst similarity index 100% rename from docs/cudf/source/basics/io-nvcomp-integration.rst rename to docs/cudf/source/user_guide/io-nvcomp-integration.rst diff --git a/docs/cudf/source/basics/io-supported-types.rst b/docs/cudf/source/user_guide/io-supported-types.rst similarity index 100% rename from docs/cudf/source/basics/io-supported-types.rst rename to docs/cudf/source/user_guide/io-supported-types.rst diff --git a/docs/cudf/source/basics/io.rst b/docs/cudf/source/user_guide/io.rst similarity index 100% rename from docs/cudf/source/basics/io.rst rename to docs/cudf/source/user_guide/io.rst From 
fa5c52409c9b6195712b185f01bedbf78d3f7a22 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 13:02:23 -0400 Subject: [PATCH 03/14] Change CuPy notebook name --- docs/cudf/source/index.rst | 1 - .../source/user_guide/{10min-cudf-cupy.md => cupy-interop.md} | 0 2 files changed, 1 deletion(-) rename docs/cudf/source/user_guide/{10min-cudf-cupy.md => cupy-interop.md} (100%) diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 90b287bd1b6..2c1df4a0c12 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -14,7 +14,6 @@ the details of CUDA programming. :caption: Contents: user_guide/index - basics/index api_docs/index diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.md b/docs/cudf/source/user_guide/cupy-interop.md similarity index 100% rename from docs/cudf/source/user_guide/10min-cudf-cupy.md rename to docs/cudf/source/user_guide/cupy-interop.md From 40e96e2dad954a14033ba7ad0e063412a8d062b5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 13:11:29 -0400 Subject: [PATCH 04/14] Remove duplicate --- docs/cudf/source/user_guide/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst index ba0ba05bfaf..6bf2aea6003 100644 --- a/docs/cudf/source/user_guide/index.rst +++ b/docs/cudf/source/user_guide/index.rst @@ -13,7 +13,6 @@ User Guide groupby.rst guide-to-udfs.md cupy-interop.md - guide-to-udfs.md dask-cudf.rst internals.rst PandasCompat.rst From 1fad8a251ab9a23310718eeb7dc06ce2299170d0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 13:23:14 -0400 Subject: [PATCH 05/14] Change title --- docs/cudf/source/user_guide/cupy-interop.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/cupy-interop.md b/docs/cudf/source/user_guide/cupy-interop.md index 0397507c7b3..880537c703c 100644 --- a/docs/cudf/source/user_guide/cupy-interop.md +++ 
b/docs/cudf/source/user_guide/cupy-interop.md @@ -11,7 +11,7 @@ kernelspec: name: python3 --- -# Interoperability of cuDF with CuPy +# Interoperability between cuDF and CuPy This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations). From 43ddc2d968c094c02f1fbc6e6e3037b6f6ffaff5 Mon Sep 17 00:00:00 2001 From: Mike McCarty Date: Thu, 21 Apr 2022 10:16:36 -0400 Subject: [PATCH 06/14] Using MyST-NB sphinx extension for Notebook execution and rendering --- conda/environments/cudf_dev_cuda11.5.yml | 4 + docs/cudf/source/conf.py | 5 +- docs/cudf/source/user_guide/10min.ipynb | 153 +++++++++++++++-------- 3 files changed, 111 insertions(+), 51 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index bdde007e33e..15f4bff583e 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -54,6 +54,10 @@ dependencies: - hypothesis - sphinx-markdown-tables - sphinx-copybutton + - sphinx-autobuild + - myst-nb + - scipy + - dask-cuda - mimesis<4.1 - packaging - protobuf diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index d65b77ef74b..c8b30120924 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -46,10 +46,13 @@ "numpydoc", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", - "nbsphinx", "PandasCompat", + "myst_nb", ] +jupyter_execute_notebooks = "force" +execution_timeout = 300 + copybutton_prompt_text = ">>> " autosummary_generate = True ipython_mplbackend = "str" diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index ab006847fc6..9bb95406e8a 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -2484,6 +2484,14 @@ "execution_count": 35, "metadata": {}, "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/series.py:2223: FutureWarning: Series.applymap is deprecated and will be removed in a future cuDF release. Use Series.apply instead.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -3024,7 +3032,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/core/indexed_frame.py:2271: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", + "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/indexed_frame.py:2329: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", " warnings.warn(\n" ] }, @@ -5850,7 +5858,32 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/mmccarty/sandbox/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "current_dir = os.path.dirname(os.path.realpath(\"__file__\"))\n", + "cudf_root = Path(current_dir).parents[3]\n", + "file_path = os.path.join(cudf_root, \"python\", \"cudf\", \"cudf\", \"tests\", \"data\", \"orc\", \"TestOrcFile.test1.orc\")\n", + "file_path" + ] + }, + { + "cell_type": "code", + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -5941,13 +5974,13 @@ "1 [{'key': 'chani', 'value': {'int1': 5, 'string... 
" ] }, - "execution_count": 79, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df2 = cudf.read_orc('/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc')\n", + "df2 = cudf.read_orc(file_path)\n", "df2" ] }, @@ -5974,15 +6007,17 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-03-29 12:21:32,328 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", - "2022-03-29 12:21:32,394 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" + "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ghcx5g0e', purging\n", + "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-wh16f0h3', purging\n", + "2022-04-21 10:11:07,360 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", + "2022-04-21 10:11:07,388 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" ] }, { @@ -5992,7 +6027,7 @@ "
\n", "
\n", "

Client

\n", - "

Client-4be800f5-af7c-11ec-8df8-c8d9d2247354

\n", + "

Client-e3492c89-c17c-11ec-813e-fc3497a62adc

\n", " \n", "\n", " \n", @@ -6021,7 +6056,7 @@ " \n", "
\n", "

LocalCUDACluster

\n", - "

137d0882

\n", + "

db2501e1

\n", "
\n", " \n", " \n", " \n", " \n", " \n", @@ -6058,11 +6093,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-08f95e9e-2c10-4d66-a103-955ab4218e91

\n", + "

Scheduler-6f476508-e52f-49e9-8f1f-6a8641e177bd

\n", "
\n", @@ -6036,7 +6071,7 @@ " Total threads: 2\n", " \n", - " Total memory: 45.79 GiB\n", + " Total memory: 125.65 GiB\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:35157\n", + " Comm: tcp://127.0.0.1:39755\n", " \n", " Workers: 2\n", @@ -6081,7 +6116,7 @@ " Started: Just now\n", " \n", - " Total memory: 45.79 GiB\n", + " Total memory: 125.65 GiB\n", "
\n", @@ -6104,7 +6139,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -6158,7 +6193,7 @@ "
\n", - " Comm: tcp://127.0.0.1:41411\n", + " Comm: tcp://127.0.0.1:33491\n", " \n", " Total threads: 1\n", @@ -6112,31 +6147,31 @@ "
\n", - " Dashboard: http://127.0.0.1:40997/status\n", + " Dashboard: http://127.0.0.1:34333/status\n", " \n", - " Memory: 22.89 GiB\n", + " Memory: 62.82 GiB\n", "
\n", - " Nanny: tcp://127.0.0.1:42959\n", + " Nanny: tcp://127.0.0.1:43093\n", "
\n", - " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ruvvgno2\n", + " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-jsuvfju4\n", "
\n", - " GPU: Quadro GV100\n", + " GPU: NVIDIA RTX A6000\n", " \n", - " GPU memory: 31.75 GiB\n", + " GPU memory: 47.51 GiB\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -6216,10 +6251,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 80, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -6245,7 +6280,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "metadata": {}, "outputs": [ { @@ -6321,7 +6356,7 @@ "" ] }, - "execution_count": 81, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -6337,14 +6372,14 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tue Mar 29 12:21:33 2022 \n", + "Thu Apr 21 10:11:07 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", "|-------------------------------+----------------------+----------------------+\n", @@ -6352,12 +6387,12 @@ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", - "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", - "| 36% 49C P2 50W / 250W | 1113MiB / 32508MiB | 0% Default |\n", + "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", + "| 30% 48C P2 83W / 300W | 2970MiB / 48651MiB | 7% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", - "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", - "| 40% 54C P2 50W / 250W | 306MiB / 32498MiB | 0% Default |\n", + "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", + "| 30% 36C P2 25W / 300W | 265MiB / 48685MiB | 5% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", @@ -6366,6 +6401,15 @@ "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", + "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", + "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", + "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", + "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", + "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", + "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", + "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", + "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", + "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } @@ -6383,7 +6427,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -6459,7 +6503,7 @@ "" ] }, - "execution_count": 83, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } @@ -6471,14 +6515,14 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 86, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tue Mar 29 12:21:34 2022 \n", + "Thu Apr 21 10:11:08 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", "|-------------------------------+----------------------+----------------------+\n", @@ -6486,12 +6530,12 @@ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", - "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", - "| 36% 49C P2 50W / 250W | 1113MiB / 32508MiB | 0% Default |\n", + "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", + "| 30% 48C P2 84W / 300W | 2970MiB / 48651MiB | 3% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", - "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", - "| 40% 54C P2 50W / 250W | 306MiB / 32498MiB | 0% Default |\n", + "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", + "| 30% 36C P2 37W / 300W | 265MiB / 48685MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", @@ -6500,6 +6544,15 @@ "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", + "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", + "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", + "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", + "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", + "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", + "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", + "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", + "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", + "| 1 
N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } @@ -6527,7 +6580,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -6552,7 +6605,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -6569,16 +6622,16 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" + "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" ] }, - "execution_count": 87, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } From 5e70c13740c89b9be41688946c4bb3e28ebf1712 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 11:42:27 -0400 Subject: [PATCH 07/14] Add Pandas comparison docs --- .../{data_types.rst => data-types.rst} | 2 - docs/cudf/source/user_guide/index.rst | 3 +- .../source/user_guide/pandas-comparison.rst | 103 ++++++++++++++++++ 3 files changed, 105 insertions(+), 3 deletions(-) rename docs/cudf/source/user_guide/{data_types.rst => data-types.rst} (99%) create mode 100644 docs/cudf/source/user_guide/pandas-comparison.rst diff --git a/docs/cudf/source/user_guide/data_types.rst b/docs/cudf/source/user_guide/data-types.rst similarity index 99% rename from docs/cudf/source/user_guide/data_types.rst rename to docs/cudf/source/user_guide/data-types.rst index afa4a1202eb..336e578955e 100644 --- a/docs/cudf/source/user_guide/data_types.rst +++ b/docs/cudf/source/user_guide/data-types.rst @@ -1,5 +1,3 @@ -.. 
_basics.datatypes: - Supported Data Types ==================== diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst index 6bf2aea6003..4e9c97cfae3 100644 --- a/docs/cudf/source/user_guide/index.rst +++ b/docs/cudf/source/user_guide/index.rst @@ -7,7 +7,8 @@ User Guide :maxdepth: 2 10min.md - data_types.rst + pandas-comparison.rst + data-types.rst io.rst Working-with-missing-data.md groupby.rst diff --git a/docs/cudf/source/user_guide/pandas-comparison.rst b/docs/cudf/source/user_guide/pandas-comparison.rst new file mode 100644 index 00000000000..0b17c03642f --- /dev/null +++ b/docs/cudf/source/user_guide/pandas-comparison.rst @@ -0,0 +1,103 @@ +Comparison of cuDF and Pandas +============================= + +cuDF is a DataFrame library that closely matches the Pandas API, but +leverages NVIDIA GPUs for performing computations for speed. However, +there are some differences between cuDF and Pandas, both in terms API +and behavior. This page documents the similarities and differences +between cuDF and Pandas. + +Data types +---------- + +cuDF supports many common data types supported by Pandas, including +numeric, datetime, timestamp, string, and categorical data types. In +addition, we support special data types for decimal, list and "struct" +values. See the section on :doc:`Data Types ` for +details. + +Note that we do not support custom data types like Pandas' +``ExtensionDtype``. + +Result ordering +--------------- + +By default, ``join`` (or ``merge``) and ``groupby`` operations in cuDF +do *not* guarantee output ordering by default. +Compare the results obtained from Pandas and cuDF below: + +.. 
code:: python + + >>> import cupy as cp + >>> df = cudf.DataFrame({'a': cp.random.randint(0, 1000, 1000), 'b': range(1000)}) + >>> df.groupby("a").mean().head() + b + a + 742 694.5 + 29 840.0 + 459 525.5 + 442 363.0 + 666 7.0 + >>> df.to_pandas().groupby("a").mean().head() + b + a + 2 643.75 + 6 48.00 + 7 631.00 + 9 906.00 + 10 640.00 + +To match Pandas behavior, you must explicitly pass ``sort=True``: + +.. code:: python + + >>> df.to_pandas().groupby("a", sort=True).mean().head() + b + a + 2 643.75 + 6 48.00 + 7 631.00 + 9 906.00 + 10 640.00 + +Column names +------------ + +Unlike Pandas, cuDF does not support duplicate column names. +It is best to use strings for column names. + +No true ``"object"`` data type +------------------------------ + +In Pandas and NumPy, the ``"object"`` data type is used for +collections of arbitrary Python objects. For example, in Pandas you +can do the following: + +.. code:: python + >>> import pandas as pd + >>> s = pd.Series(["a", 1, [1, 2, 3]]) + 0 a + 1 1 + 2 [1, 2, 3] + dtype: object + +For compatibility with Pandas, cuDF reports the data type for strings +as ``"object"``, but we do *not* support storing or operating on +collections of arbitrary Python objects. + +``.apply()`` function limitations +--------------------------------- + +The ``.apply()`` function in Pandas accepts a user-defined function +(UDF) that can include arbitrary operations that are applied to each +value of a ``Series``, ``DataFrame``, or in the case of a groupby, +each group. cuDF also supports ``apply()``, but it relies on Numba to +JIT compile the UDF and execute it on the GPU. This can be extremely +fast, but imposes a few limitations on what operations are allowed in +the UDF. See our :doc:`UDF docs ` for details. + +How to check if a particular Pandas feature is available in cuDF? +----------------------------------------------------------------- + +The best way to see if we support a particular feature is to search +our `API docs `_. 
From dd7c4806973321ed180033dd0aa9dfdbf8af0f2e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 11:57:06 -0400 Subject: [PATCH 08/14] Move back to ipynb --- docs/cudf/source/user_guide/10min.ipynb | 1698 +++++++++++++++++ .../Working-with-missing-data.ipynb | 1227 ++++++++++++ .../cudf/source/user_guide/cupy-interop.ipynb | 430 +++++ .../source/user_guide/guide-to-udfs.ipynb | 1110 +++++++++++ 4 files changed, 4465 insertions(+) create mode 100644 docs/cudf/source/user_guide/10min.ipynb create mode 100644 docs/cudf/source/user_guide/Working-with-missing-data.ipynb create mode 100644 docs/cudf/source/user_guide/cupy-interop.ipynb create mode 100644 docs/cudf/source/user_guide/guide-to-udfs.ipynb diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb new file mode 100644 index 00000000000..d516ed618d6 --- /dev/null +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -0,0 +1,1698 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e9357872", + "metadata": {}, + "source": [ + "10 Minutes to cuDF and Dask-cuDF\n", + "=======================\n", + "\n", + "Modeled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly for new users.\n", + "\n", + "### What are these Libraries?\n", + "\n", + "[cuDF](https://github.com/rapidsai/cudf) is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.\n", + "\n", + "[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. 
On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.\n", + "\n", + "[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed by cuDF GPU DataFrames as opposed to Pandas DataFrames. For instance, when you call dask_cudf.read_csv(...), your cluster’s GPUs do the work of parsing the CSV file(s) with underlying cudf.read_csv().\n", + "\n", + "\n", + "### When to use cuDF and Dask-cuDF\n", + "\n", + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92eed4cb", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import cupy as cp\n", + "import pandas as pd\n", + "import cudf\n", + "import dask_cudf\n", + "\n", + "cp.random.seed(12)\n", + "\n", + "#### Portions of this were borrowed and adapted from the\n", + "#### cuDF cheatsheet, existing cuDF documentation,\n", + "#### and 10 Minutes to Pandas." + ] + }, + { + "cell_type": "markdown", + "id": "ed6c6047", + "metadata": {}, + "source": [ + "Object Creation\n", + "---------------" + ] + }, + { + "cell_type": "markdown", + "id": "aeedd961", + "metadata": {}, + "source": [ + "Creating a `cudf.Series` and `dask_cudf.Series`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8b08e5", + "metadata": {}, + "outputs": [], + "source": [ + "s = cudf.Series([1,2,3,None,4])\n", + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "083a5898", + "metadata": {}, + "outputs": [], + "source": [ + "ds = dask_cudf.from_cudf(s, npartitions=2) \n", + "ds.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "6346e1b1", + "metadata": {}, + "source": [ + "Creating a `cudf.DataFrame` and a `dask_cudf.DataFrame` by specifying values for each column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83d1e7f5", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame({'a': list(range(20)),\n", + " 'b': list(reversed(range(20))),\n", + " 'c': list(range(20))\n", + " })\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71b61d62", + "metadata": {}, + "outputs": [], + "source": [ + "ddf = dask_cudf.from_cudf(df, npartitions=2) \n", + "ddf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "c7cb5abc", + "metadata": {}, + "source": [ + "Creating a `cudf.DataFrame` from a pandas `Dataframe` and a `dask_cudf.Dataframe` from a `cudf.Dataframe`.\n", + "\n", + "*Note that best practice for using Dask-cuDF is to read data directly into a `dask_cudf.DataFrame` with something like `read_csv` (discussed below).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07a62244", + "metadata": {}, + "outputs": [], + "source": [ + "pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})\n", + "gdf = cudf.DataFrame.from_pandas(pdf)\n", + "gdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5cb0c65", + "metadata": {}, + "outputs": [], + "source": [ + "dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", + "dask_gdf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "025eac40", + "metadata": {}, + "source": [ + "Viewing Data\n", + 
"-------------" + ] + }, + { + "cell_type": "markdown", + "id": "47a567e8", + "metadata": {}, + "source": [ + "Viewing the top rows of a GPU dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab8cbdb8", + "metadata": {}, + "outputs": [], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e923d8a", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "61257b4b", + "metadata": {}, + "source": [ + "Sorting by values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "512770f9", + "metadata": {}, + "outputs": [], + "source": [ + "df.sort_values(by='b')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a13993f", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.sort_values(by='b').compute()" + ] + }, + { + "cell_type": "markdown", + "id": "19bce4c4", + "metadata": {}, + "source": [ + "Selection\n", + "------------\n", + "\n", + "## Getting" + ] + }, + { + "cell_type": "markdown", + "id": "ba55980e", + "metadata": {}, + "source": [ + "Selecting a single column, which initially yields a `cudf.Series` or `dask_cudf.Series`. Calling `compute` results in a `cudf.Series` (equivalent to `df.a`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885989a6", + "metadata": {}, + "outputs": [], + "source": [ + "df['a']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14a74255", + "metadata": {}, + "outputs": [], + "source": [ + "ddf['a'].compute()" + ] + }, + { + "cell_type": "markdown", + "id": "498d79f2", + "metadata": {}, + "source": [ + "## Selection by Label" + ] + }, + { + "cell_type": "markdown", + "id": "4b8b8e13", + "metadata": {}, + "source": [ + "Selecting rows from index 2 to index 5 from columns 'a' and 'b'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40bc19c", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[2:5, ['a', 'b']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7688535b", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.loc[2:5, ['a', 'b']].compute()" + ] + }, + { + "cell_type": "markdown", + "id": "8a64ce7a", + "metadata": {}, + "source": [ + "## Selection by Position" + ] + }, + { + "cell_type": "markdown", + "id": "dfba2bb2", + "metadata": {}, + "source": [ + "Selecting via integers and integer slices, like numpy/pandas. Note that this functionality is not available for Dask-cuDF DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb8d6d43", + "metadata": {}, + "outputs": [], + "source": [ + "df.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "263231da", + "metadata": {}, + "outputs": [], + "source": [ + "df.iloc[0:3, 0:2]" + ] + }, + { + "cell_type": "markdown", + "id": "2223b089", + "metadata": {}, + "source": [ + "You can also select elements of a `DataFrame` or `Series` with direct index access." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13f6158b", + "metadata": {}, + "outputs": [], + "source": [ + "df[3:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cf4aa26", + "metadata": {}, + "outputs": [], + "source": [ + "s[3:5]" + ] + }, + { + "cell_type": "markdown", + "id": "ff633b2d", + "metadata": {}, + "source": [ + "## Boolean Indexing" + ] + }, + { + "cell_type": "markdown", + "id": "bbdef48f", + "metadata": {}, + "source": [ + "Selecting rows in a `DataFrame` or `Series` by direct Boolean indexing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "becb916f", + "metadata": {}, + "outputs": [], + "source": [ + "df[df.b > 15]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9475c43", + "metadata": {}, + "outputs": [], + "source": [ + "ddf[ddf.b > 15].compute()" + ] + }, + { + "cell_type": "markdown", + "id": "ecf982f5", + "metadata": {}, + "source": [ + "Selecting values from a `DataFrame` where a Boolean condition is met, via the `query` API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc2fc9f9", + "metadata": {}, + "outputs": [], + "source": [ + "df.query(\"b == 3\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a05a07f", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.query(\"b == 3\").compute()" + ] + }, + { + "cell_type": "markdown", + "id": "7f8955a0", + "metadata": {}, + "source": [ + "You can also pass local variables to Dask-cuDF queries, via the `local_dict` keyword. With standard cuDF, you may either use the `local_dict` keyword or directly pass the variable via the `@` keyword. Supported logical operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49485a4b", + "metadata": {}, + "outputs": [], + "source": [ + "cudf_comparator = 3\n", + "df.query(\"b == @cudf_comparator\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f3a9116", + "metadata": {}, + "outputs": [], + "source": [ + "dask_cudf_comparator = 3\n", + "ddf.query(\"b == @val\", local_dict={'val':dask_cudf_comparator}).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "c355af07", + "metadata": {}, + "source": [ + "Using the `isin` method for filtering." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f44a5a57", + "metadata": {}, + "outputs": [], + "source": [ + "df[df.a.isin([0, 5])]" + ] + }, + { + "cell_type": "markdown", + "id": "79a50beb", + "metadata": {}, + "source": [ + "## MultiIndex" + ] + }, + { + "cell_type": "markdown", + "id": "14e70234", + "metadata": {}, + "source": [ + "cuDF supports hierarchical indexing of DataFrames using MultiIndex. Grouping hierarchically (see `Grouping` below) automatically produces a DataFrame with a MultiIndex." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "882973ed", + "metadata": {}, + "outputs": [], + "source": [ + "arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]]\n", + "tuples = list(zip(*arrays))\n", + "idx = cudf.MultiIndex.from_tuples(tuples)\n", + "idx" + ] + }, + { + "cell_type": "markdown", + "id": "c10971cc", + "metadata": {}, + "source": [ + "This index can back either axis of a DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5417aeb9", + "metadata": {}, + "outputs": [], + "source": [ + "gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)})\n", + "gdf1.index = idx\n", + "gdf1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d6fb4ff", + "metadata": {}, + "outputs": [], + "source": [ + "gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T\n", + "gdf2.columns = idx\n", + "gdf2" + ] + }, + { + "cell_type": "markdown", + "id": "63dc11d8", + "metadata": {}, + "source": [ + "Accessing values of a DataFrame with a MultiIndex. Note that slicing is not yet supported." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3644920c", + "metadata": {}, + "outputs": [], + "source": [ + "gdf1.loc[('b', 3)]" + ] + }, + { + "cell_type": "markdown", + "id": "697a9a36", + "metadata": {}, + "source": [ + "Missing Data\n", + "------------" + ] + }, + { + "cell_type": "markdown", + "id": "86655274", + "metadata": {}, + "source": [ + "Missing data can be replaced by using the `fillna` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28b06c52", + "metadata": {}, + "outputs": [], + "source": [ + "s.fillna(999)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fb6a126", + "metadata": {}, + "outputs": [], + "source": [ + "ds.fillna(999).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "7a0b732f", + "metadata": {}, + "source": [ + "Operations\n", + "------------" + ] + }, + { + "cell_type": "markdown", + "id": "1e8b0464", + "metadata": {}, + "source": [ + "## Stats" + ] + }, + { + "cell_type": "markdown", + "id": "7523512b", + "metadata": {}, + "source": [ + "Calculating descriptive statistics for a `Series`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7cb604e", + "metadata": {}, + "outputs": [], + "source": [ + "s.mean(), s.var()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8957a5f", + "metadata": {}, + "outputs": [], + "source": [ + "ds.mean().compute(), ds.var().compute()" + ] + }, + { + "cell_type": "markdown", + "id": "71fa928a", + "metadata": {}, + "source": [ + "## Applymap" + ] + }, + { + "cell_type": "markdown", + "id": "d98d6f7b", + "metadata": {}, + "source": [ + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2924276b", + "metadata": {}, + "outputs": [], + "source": [ + "def add_ten(num):\n", + " return num + 10\n", + "\n", + "df['a'].applymap(add_ten)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96cf628e", + "metadata": {}, + "outputs": [], + "source": [ + "ddf['a'].map_partitions(add_ten).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "cd69c00a", + "metadata": {}, + "source": [ + "## Histogramming" + ] + }, + { + "cell_type": "markdown", + "id": "39982866", + "metadata": {}, + "source": [ + "Counting the number of occurrences of each unique value of variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62808675", + "metadata": {}, + "outputs": [], + "source": [ + "df.a.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b2a42ce", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.a.value_counts().compute()" + ] + }, + { + "cell_type": "markdown", + "id": "2d7e62e4", + "metadata": {}, + "source": [ + "## String Methods" + ] + }, + { + "cell_type": "markdown", + "id": "4e704eca", + "metadata": {}, + "source": [ + "Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the cuDF API documentation for more information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73e70bb", + "metadata": {}, + "outputs": [], + "source": [ + "s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'])\n", + "s.str.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "697c1c94", + "metadata": {}, + "outputs": [], + "source": [ + "ds = dask_cudf.from_cudf(s, npartitions=2)\n", + "ds.str.lower().compute()" + ] + }, + { + "cell_type": "markdown", + "id": "dfc1371e", + "metadata": {}, + "source": [ + "## Concat" + ] + }, + { + "cell_type": "markdown", + "id": "f6fb9b53", + "metadata": {}, + "source": [ + "Concatenating `Series` and `DataFrames` row-wise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60538bbd", + "metadata": {}, + "outputs": [], + "source": [ + "s = cudf.Series([1, 2, 3, None, 5])\n", + "cudf.concat([s, s])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17953847", + "metadata": {}, + "outputs": [], + "source": [ + "ds2 = dask_cudf.from_cudf(s, npartitions=2)\n", + "dask_cudf.concat([ds2, ds2]).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "27f0d621", + "metadata": {}, + "source": [ + "## Join" + ] + }, + { + "cell_type": "markdown", + "id": "fd35f1a7", + "metadata": {}, + "source": [ + "Performing SQL style merges. Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52ada00a", + "metadata": {}, + "outputs": [], + "source": [ + "df_a = cudf.DataFrame()\n", + "df_a['key'] = ['a', 'b', 'c', 'd', 'e']\n", + "df_a['vals_a'] = [float(i + 10) for i in range(5)]\n", + "\n", + "df_b = cudf.DataFrame()\n", + "df_b['key'] = ['a', 'c', 'e']\n", + "df_b['vals_b'] = [float(i+100) for i in range(3)]\n", + "\n", + "merged = df_a.merge(df_b, on=['key'], how='left')\n", + "merged" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "409fcf92", + "metadata": {}, + "outputs": [], + "source": [ + "ddf_a = dask_cudf.from_cudf(df_a, npartitions=2)\n", + "ddf_b = dask_cudf.from_cudf(df_b, npartitions=2)\n", + "\n", + "merged = ddf_a.merge(ddf_b, on=['key'], how='left').compute()\n", + "merged" + ] + }, + { + "cell_type": "markdown", + "id": "d9dcb86b", + "metadata": {}, + "source": [ + "## Append" + ] + }, + { + "cell_type": "markdown", + "id": "1f896819", + "metadata": {}, + "source": [ + "Appending values from another `Series` or array-like object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03ac8c1a", + "metadata": {}, + "outputs": [], + "source": [ + "s.append(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe5c54ab", + "metadata": {}, + "outputs": [], + "source": [ + "ds2.append(ds2).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "9fa10ef3", + "metadata": {}, + "source": [ + "## Grouping" + ] + }, + { + "cell_type": "markdown", + "id": "8a6e41f5", + "metadata": {}, + "source": [ + "Like pandas, cuDF and Dask-cuDF support the Split-Apply-Combine groupby paradigm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a8cafa7", + "metadata": {}, + "outputs": [], + "source": [ + "df['agg_col1'] = [1 if x % 2 == 0 else 0 for x in range(len(df))]\n", + "df['agg_col2'] = [1 if x % 3 == 0 else 0 for x in range(len(df))]\n", + "\n", + "ddf = dask_cudf.from_cudf(df, npartitions=2)" + ] + }, + { + "cell_type": "markdown", + "id": "0179d60c", + "metadata": {}, + "source": [ + "Grouping and then applying the `sum` function to the grouped data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c56d186", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby('agg_col1').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8823b30", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.groupby('agg_col1').sum().compute()" + ] + }, + { + "cell_type": "markdown", + "id": "a84cb883", + "metadata": {}, + "source": [ + "Grouping hierarchically then applying the `sum` function to grouped data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2184e3ad", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby(['agg_col1', 'agg_col2']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ec311c1", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.groupby(['agg_col1', 'agg_col2']).sum().compute()" + ] + }, + { + "cell_type": "markdown", + "id": "dedfeb1b", + "metadata": {}, + "source": [ + "Grouping and applying statistical functions to specific columns, using `agg`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2563d8b2", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c77e75", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "6d074822", + "metadata": {}, + "source": [ + "## Transpose" + ] + }, + { + "cell_type": "markdown", + "id": "16c0f0a8", + "metadata": {}, + "source": [ + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e265861e", + "metadata": {}, + "outputs": [], + "source": [ + "sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n", + "sample" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fe9b972", + "metadata": {}, + "outputs": [], + "source": [ + "sample.transpose()" + ] + }, + { + "cell_type": "markdown", + "id": "9ce02827", + "metadata": {}, + "source": [ + "Time Series\n", + "------------" + ] + }, + { + "cell_type": "markdown", + "id": "fec907ff", + "metadata": {}, + "source": [ + "`DataFrames` supports `datetime` typed columns, which allow users to interact with and filter data based on specific timestamps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a425d3f", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime as dt\n", + "\n", + "date_df = cudf.DataFrame()\n", + "date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D')\n", + "date_df['value'] = cp.random.sample(len(date_df))\n", + "\n", + "search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d')\n", + "date_df.query('date <= @search_date')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87f0e56e", + "metadata": {}, + "outputs": [], + "source": [ + "date_ddf = dask_cudf.from_cudf(date_df, npartitions=2)\n", + "date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "0d0e541c", + "metadata": {}, + "source": [ + "Categoricals\n", + "------------" + ] + }, + { + "cell_type": "markdown", + "id": "a36f9543", + "metadata": {}, + "source": [ + "`DataFrames` support categorical columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05bd8be8", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = cudf.DataFrame({\"id\": [1, 2, 3, 4, 5, 6], \"grade\":['a', 'b', 'b', 'a', 'a', 'e']})\n", + "gdf['grade'] = gdf['grade'].astype('category')\n", + "gdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "676b4963", + "metadata": {}, + "outputs": [], + "source": [ + "dgdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", + "dgdf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "e24f2e7b", + "metadata": {}, + "source": [ + "Accessing the categories of a column. Note that this is currently not supported in Dask-cuDF." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06310c36", + "metadata": {}, + "outputs": [], + "source": [ + "gdf.grade.cat.categories" + ] + }, + { + "cell_type": "markdown", + "id": "4eb6f858", + "metadata": {}, + "source": [ + "Accessing the underlying code values of each categorical observation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6db260", + "metadata": {}, + "outputs": [], + "source": [ + "gdf.grade.cat.codes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b87c4375", + "metadata": {}, + "outputs": [], + "source": [ + "dgdf.grade.cat.codes.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "3f816916", + "metadata": {}, + "source": [ + "Converting Data Representation\n", + "--------------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "64a17f6d", + "metadata": {}, + "source": [ + "## Pandas" + ] + }, + { + "cell_type": "markdown", + "id": "3acdcacc", + "metadata": {}, + "source": [ + "Converting a cuDF and Dask-cuDF `DataFrame` to a pandas `DataFrame`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1fed919", + "metadata": {}, + "outputs": [], + "source": [ + "df.head().to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "567c7363", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.compute().head().to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "c2121453", + "metadata": {}, + "source": [ + "## Numpy" + ] + }, + { + "cell_type": "markdown", + "id": "a9faa2c5", + "metadata": {}, + "source": [ + "Converting a cuDF or Dask-cuDF `DataFrame` to a numpy `ndarray`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5490d226", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b77ac8ae", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.compute().to_numpy()" + ] + }, + { + "cell_type": "markdown", + "id": "1d24d30f", + "metadata": {}, + "source": [ + "Converting a cuDF or Dask-cuDF `Series` to a numpy `ndarray`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f71a0ba3", + "metadata": {}, + "outputs": [], + "source": [ + "df['a'].to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a45a74b5", + "metadata": {}, + "outputs": [], + "source": [ + "ddf['a'].compute().to_numpy()" + ] + }, + { + "cell_type": "markdown", + "id": "0d78a4d2", + "metadata": {}, + "source": [ + "## Arrow" + ] + }, + { + "cell_type": "markdown", + "id": "7e35b829", + "metadata": {}, + "source": [ + "Converting a cuDF or Dask-cuDF `DataFrame` to a PyArrow `Table`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb9e9a2a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_arrow()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d020de7", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.compute().to_arrow()" + ] + }, + { + "cell_type": "markdown", + "id": "ace7b4f9", + "metadata": {}, + "source": [ + "Getting Data In/Out\n", + "------------------------" + ] + }, + { + "cell_type": "markdown", + "id": "161abb12", + "metadata": {}, + "source": [ + "## CSV" + ] + }, + { + "cell_type": "markdown", + "id": "7e5dc381", + "metadata": {}, + "source": [ + "Writing to a CSV file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a59715f", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('example_output'):\n", + " os.mkdir('example_output')\n", + " \n", + "df.to_csv('example_output/foo.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ebe98ed", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.compute().to_csv('example_output/foo_dask.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "0479fc4f", + "metadata": {}, + "source": [ + "Reading from a csv file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a70e831", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.read_csv('example_output/foo.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c3d9ca3", + "metadata": {}, + "outputs": [], + "source": [ + "ddf = dask_cudf.read_csv('example_output/foo_dask.csv')\n", + "ddf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "3d739c6e", + "metadata": {}, + "source": [ + "Reading all CSV files in a directory into a single `dask_cudf.DataFrame`, using the star wildcard." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb7187d2", + "metadata": {}, + "outputs": [], + "source": [ + "ddf = dask_cudf.read_csv('example_output/*.csv')\n", + "ddf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "c0939a1e", + "metadata": {}, + "source": [ + "## Parquet" + ] + }, + { + "cell_type": "markdown", + "id": "14e6a634", + "metadata": {}, + "source": [ + "Writing to parquet files, using the CPU via PyArrow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1812346f", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_parquet('example_output/temp_parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "093cd0fe", + "metadata": {}, + "source": [ + "Reading parquet files with a GPU-accelerated parquet reader." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2354b20b", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.read_parquet('example_output/temp_parquet')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "132c3ff2", + "metadata": {}, + "source": [ + "Writing to parquet files from a `dask_cudf.DataFrame` using PyArrow under the hood." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5d7686c", + "metadata": {}, + "outputs": [], + "source": [ + "ddf.to_parquet('example_files') " + ] + }, + { + "cell_type": "markdown", + "id": "0d73d1dd", + "metadata": {}, + "source": [ + "## ORC" + ] + }, + { + "cell_type": "markdown", + "id": "61b5f466", + "metadata": {}, + "source": [ + "Reading ORC files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4903923f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "current_dir = os.path.dirname(os.path.realpath(\"__file__\"))\n", + "cudf_root = Path(current_dir).parents[3]\n", + "file_path = os.path.join(cudf_root, \"python\", \"cudf\", \"cudf\", \"tests\", \"data\", \"orc\", \"TestOrcFile.test1.orc\")\n", + "file_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc3862ff", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = cudf.read_orc(file_path)\n", + "df2" + ] + }, + { + "cell_type": "markdown", + "id": "238ce6a4", + "metadata": {}, + "source": [ + "Dask Performance Tips\n", + "--------------------------------\n", + "\n", + "Like Apache Spark, Dask operations are [lazy](https://en.wikipedia.org/wiki/Lazy_evaluation). Instead of being executed at that moment, most operations are added to a task graph and the actual evaluation is delayed until the result is needed.\n", + "\n", + "Sometimes, though, we want to force the execution of operations. 
Calling `persist` on a Dask collection fully computes it (or actively computes it in the background), persisting the result into memory. When we're using distributed systems, we may want to wait until `persist` is finished before beginning any downstream operations. We can enforce this contract by using `wait`. Wrapping an operation with `wait` will ensure it doesn't begin executing until all necessary upstream operations have finished.\n", + "\n", + "The snippets below provide basic examples, using `LocalCUDACluster` to create one dask-worker per GPU on the local machine. For more detailed information about `persist` and `wait`, please see the Dask documentation for [persist](https://docs.dask.org/en/latest/api.html#dask.persist) and [wait](https://docs.dask.org/en/latest/futures.html#distributed.wait). Wait relies on the concept of Futures, which is beyond the scope of this tutorial. For more information on Futures, see the Dask [Futures](https://docs.dask.org/en/latest/futures.html) documentation. For more information about multi-GPU clusters, please see the [dask-cuda](https://github.com/rapidsai/dask-cuda) library (documentation is in progress)." + ] + }, + { + "cell_type": "markdown", + "id": "3de9aeca", + "metadata": {}, + "source": [ + "First, we set up a GPU cluster. With our `client` set up, Dask-cuDF computation will be distributed across the GPUs in the cluster." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1504a73d", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "from dask.distributed import Client, wait\n", + "from dask_cuda import LocalCUDACluster\n", + "\n", + "cluster = LocalCUDACluster()\n", + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "id": "181e4d10", + "metadata": {}, + "source": [ + "### Persisting Data\n", + "Next, we create our Dask-cuDF DataFrame and apply a transformation, storing the result as a new column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b446d789", + "metadata": {}, + "outputs": [], + "source": [ + "nrows = 10000000\n", + "\n", + "df2 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)})\n", + "ddf2 = dask_cudf.from_cudf(df2, npartitions=5)\n", + "ddf2['c'] = ddf2['a'] + 5\n", + "ddf2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ff93ffe", + "metadata": {}, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "b98810c4", + "metadata": {}, + "source": [ + "Because Dask is lazy, the computation has not yet occurred. We can see that there are twenty tasks in the task graph and we've used about 800 MB of memory. We can force computation by using `persist`. By forcing execution, the result is now explicitly in memory and our task graph only contains one task per partition (the baseline)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "519cfb36", + "metadata": {}, + "outputs": [], + "source": [ + "ddf2 = ddf2.persist()\n", + "ddf2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be570522", + "metadata": {}, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "ff9e14b6", + "metadata": {}, + "source": [ + "Because we forced computation, we now have a larger object in distributed GPU memory." + ] + }, + { + "cell_type": "markdown", + "id": "bb3b3dee", + "metadata": {}, + "source": [ + "### Wait\n", + "Depending on our workflow or distributed computing setup, we may want to `wait` until all upstream tasks have finished before proceeding with a specific function. This section shows an example of this behavior, adapted from the Dask documentation.\n", + "\n", + "First, we create a new Dask DataFrame and define a function that we'll map to every partition in the dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f8df429", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "nrows = 10000000\n", + "\n", + "df1 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)})\n", + "ddf1 = dask_cudf.from_cudf(df1, npartitions=100)\n", + "\n", + "def func(df):\n", + " time.sleep(random.randint(1, 60))\n", + " return (df + 5) * 3 - 11" + ] + }, + { + "cell_type": "markdown", + "id": "e1099ec0", + "metadata": {}, + "source": [ + "This function will do a basic transformation of every column in the dataframe, but the time spent in the function will vary due to the `time.sleep` statement randomly adding 1-60 seconds of time. We'll run this on every partition of our dataframe using `map_partitions`, which adds the task to our task-graph, and store the result. We can then call `persist` to force execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a0313a3", + "metadata": {}, + "outputs": [], + "source": [ + "results_ddf = ddf2.map_partitions(func)\n", + "results_ddf = results_ddf.persist()" + ] + }, + { + "cell_type": "markdown", + "id": "5eb83a7e", + "metadata": {}, + "source": [ + "However, some partitions will be done **much** sooner than others. If we had downstream processes that should wait for all partitions to be completed, we can enforce that behavior using `wait`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79426902", + "metadata": {}, + "outputs": [], + "source": [ + "wait(results_ddf)" + ] + }, + { + "cell_type": "markdown", + "id": "447301f5", + "metadata": {}, + "source": [ + "## With `wait`, we can safely proceed on in our workflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e06fcf4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb new file mode 100644 index 00000000000..b261ebe785e --- /dev/null +++ b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb @@ -0,0 +1,1227 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f8ffbea7", + "metadata": {}, + "source": [ + "# Working with missing data" + ] + }, + { + "cell_type": "markdown", + "id": "7e3ab093", + "metadata": {}, + "source": [ + "In this section, we will discuss missing (also referred to as `NA`) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by ``. These values are also referenced as \"null values\"." + ] + }, + { + "cell_type": "markdown", + "id": "d970a34a", + "metadata": {}, + "source": [ + "1. [How to Detect missing values](#How-to-Detect-missing-values)\n", + "2. [Float dtypes and missing data](#Float-dtypes-and-missing-data)\n", + "3. [Datetimes](#Datetimes)\n", + "4. [Calculations with missing data](#Calculations-with-missing-data)\n", + "5. [Sum/product of Null/nans](#Sum/product-of-Null/nans)\n", + "6. [NA values in GroupBy](#NA-values-in-GroupBy)\n", + "7. [Inserting missing data](#Inserting-missing-data)\n", + "8. [Filling missing values: fillna](#Filling-missing-values:-fillna)\n", + "9. [Filling with cudf Object](#Filling-with-cudf-Object)\n", + "10. [Dropping axis labels with missing data: dropna](#Dropping-axis-labels-with-missing-data:-dropna)\n", + "11. [Replacing generic values](#Replacing-generic-values)\n", + "12. [String/regular expression replacement](#String/regular-expression-replacement)\n", + "13. 
[Numeric replacement](#Numeric-replacement)" + ] + }, + { + "cell_type": "markdown", + "id": "8d657a82", + "metadata": {}, + "source": [ + "## How to Detect missing values" + ] + }, + { + "cell_type": "markdown", + "id": "9ea9f672", + "metadata": {}, + "source": [ + "To detect missing values, you can use `isna()` and `notna()` functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58050adb", + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "416d73da", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dfc6bc3", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7f7a6d", + "metadata": {}, + "outputs": [], + "source": [ + "df.isna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40edca67", + "metadata": {}, + "outputs": [], + "source": [ + "df['a'].notna()" + ] + }, + { + "cell_type": "markdown", + "id": "acdf29d7", + "metadata": {}, + "source": [ + "One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do. Note that cudf/NumPy uses the fact that `np.nan != np.nan`, and treats `None` like `np.nan`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c269c1f5", + "metadata": {}, + "outputs": [], + "source": [ + "None == None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99fb083a", + "metadata": {}, + "outputs": [], + "source": [ + "np.nan == np.nan" + ] + }, + { + "cell_type": "markdown", + "id": "4fdb8bc7", + "metadata": {}, + "source": [ + "So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "630ef6bb", + "metadata": {}, + "outputs": [], + "source": [ + "df['b'] == np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8162e383", + "metadata": {}, + "outputs": [], + "source": [ + "s = cudf.Series([None, 1, 2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "199775b3", + "metadata": {}, + "outputs": [], + "source": [ + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd09d80c", + "metadata": {}, + "outputs": [], + "source": [ + "s == None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b23bb0c", + "metadata": {}, + "outputs": [], + "source": [ + "s = cudf.Series([1, 2, np.nan], nan_as_null=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cafb79ee", + "metadata": {}, + "outputs": [], + "source": [ + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13363897", + "metadata": {}, + "outputs": [], + "source": [ + "s == np.nan" + ] + }, + { + "cell_type": "markdown", + "id": "208a3776", + "metadata": {}, + "source": [ + "## Float dtypes and missing data" + ] + }, + { + "cell_type": "markdown", + "id": "2c174b88", + "metadata": {}, + "source": [ + "Because ``NaN`` is a float, a column of integers with even one missing values is cast to floating-point dtype. However this doesn't happen by default.\n", + "\n", + "By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c59c3c54", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([1, 2, np.nan])" + ] + }, + { + "cell_type": "markdown", + "id": "a9eb2d9c", + "metadata": {}, + "source": [ + "Hence to consider a ``NaN`` as ``NaN`` you will have to pass `nan_as_null=False` parameter into `Series` constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecc5ae92", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([1, 2, np.nan], nan_as_null=False)" + ] + }, + { + "cell_type": "markdown", + "id": "d1db7b08", + "metadata": {}, + "source": [ + "## Datetimes" + ] + }, + { + "cell_type": "markdown", + "id": "548d3734", + "metadata": {}, + "source": [ + "For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de70f244", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", + "datetime_series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8411a914", + "metadata": {}, + "outputs": [], + "source": [ + "datetime_series.to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "df664145", + "metadata": {}, + "source": [ + "any operations on rows having `` values in `datetime` column will result in `` value at the same location in resulting column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "829c32d0", + "metadata": {}, + "outputs": [], + "source": [ + "datetime_series - datetime_series" + ] + }, + { + "cell_type": "markdown", + "id": "aa8031ef", + "metadata": {}, + "source": [ + "## Calculations with missing data" + ] + }, + { + "cell_type": "markdown", + "id": "c587fae2", + "metadata": {}, + "source": [ + "Null values propagate naturally through arithmetic operations between pandas objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8f2aec7", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c8a3011", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "052f6c2b", + "metadata": {}, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fb0a083", + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8152c0", + "metadata": {}, + "outputs": [], + "source": [ + "df1 + df2" + ] + }, + { + "cell_type": "markdown", + "id": "11170d49", + "metadata": {}, + "source": [ + "While summing the data along a series, `NA` values will be treated as `0`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45081790", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39922658", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "6e99afe0", + "metadata": {}, + "source": [ + "Since `NA` values are treated as `0`, the mean would result to 2 in this case `(1 + 0 + 2 + 3 + 0)/5 = 2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2f16ddb", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].mean()" + ] + }, + { + "cell_type": "markdown", + "id": "07f2ec5a", + "metadata": {}, + "source": [ + "To preserve `NA` values in the above calculations, `sum` & `mean` support `skipna` parameter.\n", + "By default it's value is\n", + "set to `True`, we can change it to `False` to preserve `NA` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4a463a0", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].sum(skipna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a944c42e", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].mean(skipna=False)" + ] + }, + { + "cell_type": "markdown", + "id": "fb8c8f18", + "metadata": {}, + "source": [ + "Cumulative methods like `cumsum` and `cumprod` ignore `NA` values by default." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f2a7306", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].cumsum()" + ] + }, + { + "cell_type": "markdown", + "id": "c8f6054b", + "metadata": {}, + "source": [ + "To preserve `NA` values in cumulative methods, provide `skipna=False`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4c46776", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].cumsum(skipna=False)" + ] + }, + { + "cell_type": "markdown", + "id": "67077d65", + "metadata": {}, + "source": [ + "## Sum/product of Null/nans" + ] + }, + { + "cell_type": "markdown", + "id": "ffbb9ca1", + "metadata": {}, + "source": [ + "The sum of an empty or all-NA Series of a DataFrame is 0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f430c9ce", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([np.nan], nan_as_null=False).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fde514b", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([np.nan], nan_as_null=False).sum(skipna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56cedd17", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([], dtype='float64').sum()" + ] + }, + { + "cell_type": "markdown", + "id": "cb188adb", + "metadata": {}, + "source": [ + "The product of an empty or all-NA Series of a DataFrame is 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d20bbbef", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([np.nan], nan_as_null=False).prod()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75abbcfa", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([np.nan], nan_as_null=False).prod(skipna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "becce0cc", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series([], dtype='float64').prod()" + ] + }, + { + "cell_type": "markdown", + "id": "0e899e03", + "metadata": {}, + "source": [ + "## NA values in GroupBy" + ] + }, + { + "cell_type": "markdown", + "id": "7fb20874", + "metadata": {}, + "source": [ + "`NA` groups in GroupBy are automatically excluded. 
For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1379037c", + "metadata": {}, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b91e6f", + "metadata": {}, + "outputs": [], + "source": [ + "df1.groupby('a').mean()" + ] + }, + { + "cell_type": "markdown", + "id": "cb83fb11", + "metadata": {}, + "source": [ + "It is also possible to include `NA` in groups by passing `dropna=False`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768c3e50", + "metadata": {}, + "outputs": [], + "source": [ + "df1.groupby('a', dropna=False).mean()" + ] + }, + { + "cell_type": "markdown", + "id": "133816b4", + "metadata": {}, + "source": [ + "## Inserting missing data" + ] + }, + { + "cell_type": "markdown", + "id": "306082ad", + "metadata": {}, + "source": [ + "All dtypes support insertion of missing value by assignment. Any specific location in series can made null by assigning it to `None`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ddde1fe", + "metadata": {}, + "outputs": [], + "source": [ + "series = cudf.Series([1, 2, 3, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16e54597", + "metadata": {}, + "outputs": [], + "source": [ + "series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f628f94d", + "metadata": {}, + "outputs": [], + "source": [ + "series[2] = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b30590b7", + "metadata": {}, + "outputs": [], + "source": [ + "series" + ] + }, + { + "cell_type": "markdown", + "id": "a1b123d0", + "metadata": {}, + "source": [ + "## Filling missing values: fillna" + ] + }, + { + "cell_type": "markdown", + "id": "114aa23a", + "metadata": {}, + "source": [ + "`fillna()` can fill in `NA` & `NaN` values with non-NA data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e22668", + "metadata": {}, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c221ee", + "metadata": {}, + "outputs": [], + "source": [ + "df1['b'].fillna(10)" + ] + }, + { + "cell_type": "markdown", + "id": "401f91b2", + "metadata": {}, + "source": [ + "## Filling with cudf Object" + ] + }, + { + "cell_type": "markdown", + "id": "e79346d6", + "metadata": {}, + "source": [ + "You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f52c5d8f", + "metadata": {}, + "outputs": [], + "source": [ + "import cupy as cp\n", + "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6affebe9", + "metadata": {}, + "outputs": [], + "source": [ + "dff.iloc[3:5, 0] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce1b96f", + "metadata": {}, + "outputs": [], + "source": [ + "dff.iloc[4:6, 1] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90829195", + "metadata": {}, + "outputs": [], + "source": [ + "dff.iloc[5:8, 2] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0feac14", + "metadata": {}, + "outputs": [], + "source": [ + "dff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a07c1260", + "metadata": {}, + "outputs": [], + "source": [ + "dff.fillna(dff.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e70d61a", + "metadata": {}, + "outputs": [], + "source": [ + "dff.fillna(dff.mean()[1:3])" + ] + }, + { + "cell_type": "markdown", + "id": "0ace728d", + 
"metadata": {}, + "source": [ + "## Dropping axis labels with missing data: dropna" + ] + }, + { + "cell_type": "markdown", + "id": "2ccd7115", + "metadata": {}, + "source": [ + "Missing data can be excluded using `dropna()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98c57be7", + "metadata": {}, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc3f273a", + "metadata": {}, + "outputs": [], + "source": [ + "df1.dropna(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a48d4de0", + "metadata": {}, + "outputs": [], + "source": [ + "df1.dropna(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "0b1954f9", + "metadata": {}, + "source": [ + "An equivalent `dropna()` is available for Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dd8f660", + "metadata": {}, + "outputs": [], + "source": [ + "df1['a'].dropna()" + ] + }, + { + "cell_type": "markdown", + "id": "121eb6d7", + "metadata": {}, + "source": [ + "## Replacing generic values" + ] + }, + { + "cell_type": "markdown", + "id": "3cc4c5f1", + "metadata": {}, + "source": [ + "Often times we want to replace arbitrary values with other values.\n", + "\n", + "`replace()` in Series and `replace()` in DataFrame provides an efficient yet flexible way to perform such replacements." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6c14e8a", + "metadata": {}, + "outputs": [], + "source": [ + "series = cudf.Series([0.0, 1.0, 2.0, 3.0, 4.0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a852f0cb", + "metadata": {}, + "outputs": [], + "source": [ + "series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6ac12eb", + "metadata": {}, + "outputs": [], + "source": [ + "series.replace(0, 5)" + ] + }, + { + "cell_type": "markdown", + "id": "a6e1b6d7", + "metadata": {}, + "source": [ + "We can also replace any value with a `` value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0156bff", + "metadata": {}, + "outputs": [], + "source": [ + "series.replace(0, None)" + ] + }, + { + "cell_type": "markdown", + "id": "6673eefb", + "metadata": {}, + "source": [ + "You can replace a list of values by a list of other values:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3110f5b", + "metadata": {}, + "outputs": [], + "source": [ + "series.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])" + ] + }, + { + "cell_type": "markdown", + "id": "61521e8b", + "metadata": {}, + "source": [ + "You can also specify a mapping dict:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45862d05", + "metadata": {}, + "outputs": [], + "source": [ + "series.replace({0: 10, 1: 100})" + ] + }, + { + "cell_type": "markdown", + "id": "04a34549", + "metadata": {}, + "source": [ + "For a DataFrame, you can specify individual values by column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "348caa64", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame({\"a\": [0, 1, 2, 3, 4], \"b\": [5, 6, 7, 8, 9]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cca41ec4", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "64334693", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace({\"a\": 0, \"b\": 5}, 100)" + ] + }, + { + "cell_type": "markdown", + "id": "2f0ceec7", + "metadata": {}, + "source": [ + "## String/regular expression replacement" + ] + }, + { + "cell_type": "markdown", + "id": "c6f44740", + "metadata": {}, + "source": [ + "cudf supports replacing string values using `replace` API:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "031d3533", + "metadata": {}, + "outputs": [], + "source": [ + "d = {\"a\": list(range(4)), \"b\": list(\"ab..\"), \"c\": [\"a\", \"b\", None, \"d\"]}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12b41efb", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame(d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d450df49", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f823bc46", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace(\".\", \"A Dot\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc52f6e9", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace([\".\", \"b\"], [\"A Dot\", None])" + ] + }, + { + "cell_type": "markdown", + "id": "7c1087be", + "metadata": {}, + "source": [ + "Replace a few different values (list -> list):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e23eba9", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace([\"a\", \".\"], [\"b\", \"--\"])" + ] + }, + { + "cell_type": "markdown", + "id": "42845a9c", + "metadata": {}, + "source": [ + "Only search in column 'b' (dict -> dict):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e79805", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace({\"b\": \".\"}, {\"b\": \"replacement value\"})" + ] + }, + { + "cell_type": "markdown", + "id": "774b42a6", + "metadata": 
{}, + "source": [ + "## Numeric replacement" + ] + }, + { + "cell_type": "markdown", + "id": "1c1926ac", + "metadata": {}, + "source": [ + "`replace()` can also be used similar to `fillna()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "355a2f0d", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame(cp.random.randn(10, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9eed372", + "metadata": {}, + "outputs": [], + "source": [ + "df[np.random.rand(df.shape[0]) > 0.5] = 1.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae944244", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace(1.5, None)" + ] + }, + { + "cell_type": "markdown", + "id": "0f32607c", + "metadata": {}, + "source": [ + "Replacing more than one value is possible by passing a list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59b81c60", + "metadata": {}, + "outputs": [], + "source": [ + "df00 = df.iloc[0, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01a71d4c", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace([1.5, df00], [5, 10])" + ] + }, + { + "cell_type": "markdown", + "id": "1080e97b", + "metadata": {}, + "source": [ + "You can also operate on the DataFrame in place:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0859d7", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace(1.5, None, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cf28369", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb new file mode 100644 index 00000000000..309fb71542f --- 
/dev/null +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8e5e6878", + "metadata": {}, + "source": [ + "# Interoperability between cuDF and CuPy\n", + "\n", + "This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b2d45c3", + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "from packaging import version\n", + "\n", + "import cupy as cp\n", + "import cudf\n", + "\n", + "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", + " cupy_from_dlpack = cp.from_dlpack\n", + "else:\n", + " cupy_from_dlpack = cp.fromDlpack" + ] + }, + { + "cell_type": "markdown", + "id": "e7e64b1a", + "metadata": {}, + "source": [ + "### Converting a cuDF DataFrame to a CuPy Array\n", + "\n", + "If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it:\n", + "\n", + "1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface.\n", + "\n", + "2. We can also use `DataFrame.values`.\n", + "\n", + "3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `as_gpu_matrix` and CuPy's `asarray` functionality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45c482ab", + "metadata": {}, + "outputs": [], + "source": [ + "nelem = 10000\n", + "df = cudf.DataFrame({'a':range(nelem),\n", + " 'b':range(500, nelem + 500),\n", + " 'c':range(1000, nelem + 1000)}\n", + " )\n", + "\n", + "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", + "%timeit arr_cupy = df.values\n", + "%timeit arr_cupy = df.to_cupy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565effc", + "metadata": {}, + "outputs": [], + "source": [ + "arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", + "arr_cupy" + ] + }, + { + "cell_type": "markdown", + "id": "0759ab29", + "metadata": {}, + "source": [ + "### Converting a cuDF Series to a CuPy Array" + ] + }, + { + "cell_type": "markdown", + "id": "4f35ffbd", + "metadata": {}, + "source": [ + "There are also multiple ways to convert a cuDF Series to a CuPy array:\n", + "\n", + "1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html).\n", + "2. We can leverage the dlpack interface `to_dlpack()`. \n", + "3. We can also use `Series.values`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f97f304", + "metadata": {}, + "outputs": [], + "source": [ + "col = 'a'\n", + "\n", + "%timeit cola_cupy = cp.asarray(df[col])\n", + "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", + "%timeit cola_cupy = df[col].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f96d5676", + "metadata": {}, + "outputs": [], + "source": [ + "cola_cupy = cp.asarray(df[col])\n", + "cola_cupy" + ] + }, + { + "cell_type": "markdown", + "id": "c36e5b88", + "metadata": {}, + "source": [ + "From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a7ae43f", + "metadata": {}, + "outputs": [], + "source": [ + "reshaped_arr = cola_cupy.reshape(50, 200)\n", + "reshaped_arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b442a30c", + "metadata": {}, + "outputs": [], + "source": [ + "reshaped_arr.diagonal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7f4d32", + "metadata": {}, + "outputs": [], + "source": [ + "cp.linalg.norm(reshaped_arr)" + ] + }, + { + "cell_type": "markdown", + "id": "b353bded", + "metadata": {}, + "source": [ + "### Converting a CuPy Array to a cuDF DataFrame\n", + "\n", + "We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it:\n", + "\n", + "1. **Easiest;** We can directly use the `DataFrame` constructor.\n", + "\n", + "2. We can use CUDA array interface with the `DataFrame` constructor.\n", + "\n", + "3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface.\n", + "\n", + "For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8887b253", + "metadata": {}, + "outputs": [], + "source": [ + "%timeit reshaped_df = cudf.DataFrame(reshaped_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ec4ffa", + "metadata": {}, + "outputs": [], + "source": [ + "reshaped_df = cudf.DataFrame(reshaped_arr)\n", + "reshaped_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6804d291", + "metadata": {}, + "source": [ + "We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b8bd0d", + "metadata": {}, + "outputs": [], + "source": [ + "cp.isfortran(reshaped_arr)" + ] + }, + { + "cell_type": "markdown", + "id": "151982ad", + "metadata": {}, + "source": [ + "In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27b2f563", + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit\n", + "\n", + "fortran_arr = cp.asfortranarray(reshaped_arr)\n", + "reshaped_df = cudf.DataFrame(fortran_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a0cc290", + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit\n", + "\n", + "fortran_arr = cp.asfortranarray(reshaped_arr)\n", + "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d2c5beb", + "metadata": {}, + "outputs": [], + "source": [ + "fortran_arr = cp.asfortranarray(reshaped_arr)\n", + "reshaped_df = cudf.DataFrame(fortran_arr)\n", + "reshaped_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "395e2bba", + "metadata": {}, + "source": [ + "### Converting a CuPy Array to a cuDF Series\n", + "\n", + "To convert an array to a Series, we can directly pass the array to the `Series` constructor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8518208", + "metadata": {}, + "outputs": [], + "source": [ + "cudf.Series(reshaped_arr.diagonal()).head()" + ] + }, + { + "cell_type": "markdown", + "id": "7e159619", + "metadata": {}, + "source": [ + "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n", + "\n", + "RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. 
One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.\n", + "\n", + "By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb8ed81", + "metadata": {}, + "outputs": [], + "source": [ + "reshaped_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2f3d4e78", + "metadata": {}, + "source": [ + "We can just transform it into a CuPy array and use the `axis` argument of `sum`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dde030d", + "metadata": {}, + "outputs": [], + "source": [ + "new_arr = cupy_from_dlpack(reshaped_df.to_dlpack())\n", + "new_arr.sum(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "4450dcc3", + "metadata": {}, + "source": [ + "With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed." + ] + }, + { + "cell_type": "markdown", + "id": "61bfb868", + "metadata": {}, + "source": [ + "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", + "\n", + "We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input.\n", + "\n", + "The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e531fd15", + "metadata": {}, + "outputs": [], + "source": [ + "def cudf_to_cupy_sparse_matrix(data, sparseformat='column'):\n", + " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\n", + " \"\"\"\n", + " if sparseformat not in ('row', 'column',):\n", + " raise ValueError(\"Let's focus on column and row formats for now.\")\n", + " \n", + " _sparse_constructor = cp.sparse.csc_matrix\n", + " if sparseformat == 'row':\n", + " _sparse_constructor = cp.sparse.csr_matrix\n", + "\n", + " return _sparse_constructor(cp.from_dlpack(data.to_dlpack()))" + ] + }, + { + "cell_type": "markdown", + "id": "3f5e6ade", + "metadata": {}, + "source": [ + "We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58c7e074", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame()\n", + "nelem = 10000\n", + "nonzero = 1000\n", + "for i in range(20):\n", + " arr = cp.random.normal(5, 5, nelem)\n", + " arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0\n", + " df['a' + str(i)] = arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9265228d", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ba1a551", + "metadata": {}, + "outputs": [], + "source": [ + "sparse_data = cudf_to_cupy_sparse_matrix(df)\n", + "print(sparse_data)" + ] + }, + { + "cell_type": "markdown", + "id": "e8e58cd5", + "metadata": {}, + "source": [ + "From here, we could continue our workflow with a CuPy sparse matrix.\n", + "\n", + "For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb new file mode 100644 index 00000000000..8ea088a1d72 --- /dev/null +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -0,0 +1,1110 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "77149e57", + "metadata": {}, + "source": [ + "# Overview of User Defined Functions with cuDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c6b65ce", + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "from cudf.datasets import randomdata\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "8826af13", + "metadata": {}, + "source": [ + "Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame.\n", + "\n", + "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures:\n", + "\n", + "- Series\n", + "- DataFrame\n", + "- Rolling Windows Series\n", + "- Groupby DataFrames\n", + "- CuPy NDArrays\n", + "- Numba DeviceNDArrays\n", + "\n", + "It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values." 
+ ] + }, + { + "cell_type": "markdown", + "id": "32a8f4fb", + "metadata": {}, + "source": [ + "## Series UDFs\n", + "\n", + "You can execute UDFs on Series in two ways:\n", + "\n", + "- Writing a standard python function and using `cudf.Series.apply`\n", + "- Writing a Numba kernel and using Numba's `forall` syntax\n", + "\n", + "Using `apply` is simpler, but writing a Numba kernel offers the flexibility to build more complex functions (we'll be writing only simple kernels in this guide)." + ] + }, + { + "cell_type": "markdown", + "id": "49399a84", + "metadata": {}, + "source": [ + "### `cudf.Series.apply`" + ] + }, + { + "cell_type": "markdown", + "id": "0a209ea2", + "metadata": {}, + "source": [ + "cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Here is a very basic example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e28d5b82", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cuDF series\n", + "sr = cudf.Series([1, 2, 3])" + ] + }, + { + "cell_type": "markdown", + "id": "48a9fa5e", + "metadata": {}, + "source": [ + "UDFs destined for `cudf.Series.apply` might look something like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96aeb19f", + "metadata": {}, + "outputs": [], + "source": [ + "# define a scalar function\n", + "def f(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "markdown", + "id": "e61d0169", + "metadata": {}, + "source": [ + "`cudf.Series.apply` is called like `pd.Series.apply` and returns a new `Series` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ca08834", + "metadata": {}, + "outputs": [], + "source": [ + "sr.apply(f)" + ] + }, + { + "cell_type": "markdown", + "id": "c98dab03", + "metadata": {}, + "source": [ + "### Functions with Additional Scalar Arguments" + ] + }, + { + "cell_type": "markdown", + "id": "2aa3df6f", + "metadata": {}, + "source": [ + "In addition, 
`cudf.Series.apply` supports `args=` just like pandas, allowing you to write UDFs that accept an arbitrary number of scalar arguments. Here is an example of such a function and its API call in both pandas and cuDF:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d156d01", + "metadata": {}, + "outputs": [], + "source": [ + "def g(x, const):\n", + " return x + const" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dee82d7", + "metadata": {}, + "outputs": [], + "source": [ + "# cuDF apply\n", + "sr.apply(g, args=(42,))" + ] + }, + { + "cell_type": "markdown", + "id": "22739e28", + "metadata": {}, + "source": [ + "As a final note, `**kwargs` is not yet supported." + ] + }, + { + "cell_type": "markdown", + "id": "afbf33dc", + "metadata": {}, + "source": [ + "### Nullable Data" + ] + }, + { + "cell_type": "markdown", + "id": "5dc06e8c", + "metadata": {}, + "source": [ + "The null value `NA` propagates through unary and binary operations. Thus, `NA + 1`, `abs(NA)`, and `NA == NA` all return `NA`. To make this concrete, let's look at the same example from above, this time using nullable data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bda261dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cuDF series with nulls\n", + "sr = cudf.Series([1, cudf.NA, 3])\n", + "sr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0123ae07", + "metadata": {}, + "outputs": [], + "source": [ + "# redefine the same function from above\n", + "def f(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e95868dd", + "metadata": {}, + "outputs": [], + "source": [ + "# cuDF result\n", + "sr.apply(f)" + ] + }, + { + "cell_type": "markdown", + "id": "97372e15", + "metadata": {}, + "source": [ + "Often however you want explicit null handling behavior inside the function. 
cuDF exposes this capability the same way as pandas, by interacting directly with the `NA` singleton object. Here's an example of a function with explicit null handling:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c65241b", + "metadata": {}, + "outputs": [], + "source": [ + "def f_null_sensitive(x):\n", + " # do something if the input is null\n", + " if x is cudf.NA:\n", + " return 42\n", + " else:\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0f4dbf", + "metadata": {}, + "outputs": [], + "source": [ + "# cuDF result\n", + "sr.apply(f_null_sensitive)" + ] + }, + { + "cell_type": "markdown", + "id": "bdddc4e8", + "metadata": {}, + "source": [ + "In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases." + ] + }, + { + "cell_type": "markdown", + "id": "54cafbc0", + "metadata": {}, + "source": [ + "### Lower level control with custom `numba` kernels" + ] + }, + { + "cell_type": "markdown", + "id": "00914f2a", + "metadata": {}, + "source": [ + "In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html).\n", + "Note that this section requires basic CUDA knowledge. Refer to [numba's CUDA documentation](https://numba.pydata.org/numba-doc/latest/cuda/index.html) for details.\n", + "\n", + "The easiest way to write a Numba kernel is to use `cuda.grid(1)` to manage thread indices, and then leverage Numba's `forall` method to configure the kernel for us. Below, define a basic multiplication kernel as an example and use `@cuda.jit` to compile it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "732434f6", + "metadata": {}, + "outputs": [], + "source": [ + "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f5997e5", + "metadata": {}, + "outputs": [], + "source": [ + "from numba import cuda\n", + "\n", + "@cuda.jit\n", + "def multiply(in_col, out_col, multiplier):\n", + " i = cuda.grid(1)\n", + " if i < in_col.size: # boundary guard\n", + " out_col[i] = in_col[i] * multiplier" + ] + }, + { + "cell_type": "markdown", + "id": "d9667a55", + "metadata": {}, + "source": [ + "This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. Leaving this out can result in undefined behavior.\n", + "\n", + "To execute our kernel, must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply in the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea6008a6", + "metadata": {}, + "outputs": [], + "source": [ + "size = len(df['a'])\n", + "df['e'] = 0.0\n", + "multiply.forall(size)(df['a'], df['e'], 10.0)" + ] + }, + { + "cell_type": "markdown", + "id": "3fb69909", + "metadata": {}, + "source": [ + "After calling our kernel, our DataFrame is now populated with the result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "183a82ed", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ab9c305e", + "metadata": {}, + "source": [ + "This API allows a you to theoretically write arbitrary kernel logic, potentially accessing and using elements of the series at arbitrary indices and use them on cuDF data structures. Advanced developers with some CUDA experience can often use this capability to implement iterative transformations, or spot treat problem areas of a data pipeline with a custom kernel that does the same job faster." + ] + }, + { + "cell_type": "markdown", + "id": "0acc6ef2", + "metadata": {}, + "source": [ + "## DataFrame UDFs\n", + "\n", + "Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input:\n", + "\n", + "- `cudf.DataFrame.apply`, which functions like `pd.DataFrame.apply` and expects a row udf\n", + "- `cudf.DataFrame.apply_rows`, which is a thin wrapper around numba and expects a numba kernel\n", + "- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control." + ] + }, + { + "cell_type": "markdown", + "id": "2102c3ed", + "metadata": {}, + "source": [ + "### `cudf.DataFrame.apply`" + ] + }, + { + "cell_type": "markdown", + "id": "238bec41", + "metadata": {}, + "source": [ + "`cudf.DataFrame.apply` is the main entrypoint for UDFs that expect multiple columns as input and produce a single output column. Functions intended to be consumed by this API are written in terms of a \"row\" argument. The \"row\" is considered to be like a dictionary and contains all of the column values at a certain `iloc` in a `DataFrame`. The function can access these values by key within the function, the keys being the column names corresponding to the desired value. 
Below is an example function that would be used to add column `A` and column `B` together inside a UDF." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73653918", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " return row['A'] + row['B']" + ] + }, + { + "cell_type": "markdown", + "id": "b5eb32dd", + "metadata": {}, + "source": [ + "Let's create some very basic toy data containing at least one null." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "077feb75", + "metadata": {}, + "outputs": [], + "source": [ + "df = cudf.DataFrame({\n", + " 'A': [1,2,3],\n", + " 'B': [4,cudf.NA,6]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "609a3da5", + "metadata": {}, + "source": [ + "Finally call the function as you would in pandas - by using a lambda function to map the UDF onto \"rows\" of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "091e39e1", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "44e54c31", + "metadata": {}, + "source": [ + "The same function should produce the same result as pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd345fab", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_pandas(nullable=True).apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "004fbbba", + "metadata": {}, + "source": [ + "Notice that Pandas returns `object` dtype - see notes on this in the caveats section." + ] + }, + { + "cell_type": "markdown", + "id": "0b11c172", + "metadata": {}, + "source": [ + "Like `cudf.Series.apply`, these functions support generalized null handling. 
Here's a function that conditionally returns a different value if a certain input is null:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b70f4b3b", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " x = row['a']\n", + " if x is cudf.NA:\n", + " return 0\n", + " else:\n", + " return x + 1\n", + "\n", + "df = cudf.DataFrame({'a': [1, cudf.NA, 3]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0313c8df", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "313c77f3", + "metadata": {}, + "source": [ + "`cudf.NA` can also be directly returned from a function resulting in data that has the correct nulls in the end, just as if it were run in Pandas. For the following data, the last row fulfills the condition that `1 + 3 > 3` and returns `NA` for that row:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96a7952a", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " x = row['a']\n", + " y = row['b']\n", + " if x + y > 3:\n", + " return cudf.NA\n", + " else:\n", + " return x + y\n", + "\n", + "df = cudf.DataFrame({\n", + " 'a': [1, 2, 3], \n", + " 'b': [2, 1, 1]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0815f60", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "b9c674f4", + "metadata": {}, + "source": [ + "Mixed types are allowed, but will return the common type, rather than object as in Pandas. 
Here's a null aware op between an int and a float column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495efd14", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " return row['a'] + row['b']\n", + "\n", + "df = cudf.DataFrame({\n", + " 'a': [1, 2, 3], \n", + " 'b': [0.5, cudf.NA, 3.14]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678b0b5a", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "ce0897c0", + "metadata": {}, + "source": [ + "Functions may also return scalar values, however the result will be promoted to a safe type regardless of the data. This means even if you have a function like:\n", + "\n", + "```python\n", + "def f(x):\n", + " if x > 1000:\n", + " return 1.5\n", + " else:\n", + " return 2\n", + "```\n", + "And your data is:\n", + "```python\n", + "[1,2,3,4,5]\n", + "```\n", + "You will get floats in the final data even though a float is never returned. This is because Numba ultimately needs to produce one function that can handle any data, which means if there's any possibility a float could result, you must always assume it will happen. 
Here's an example of a function that returns a scalar in some cases:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acf48d56", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " x = row['a']\n", + " if x > 3:\n", + " return x\n", + " else:\n", + " return 1.5\n", + "\n", + "df = cudf.DataFrame({\n", + " 'a': [1, 3, 5]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78a98172", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "2ceaece4", + "metadata": {}, + "source": [ + "Any number of columns and many arithmetic operators are supported, allowing for complex UDFs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142c30a9", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", + "\n", + "df = cudf.DataFrame({\n", + " 'a': [1, 2, 3],\n", + " 'b': [4, 5, 6],\n", + " 'c': [cudf.NA, 4, 4],\n", + " 'd': [8, 7, 8],\n", + " 'e': [7, 1, 6]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fee9198a", + "metadata": {}, + "outputs": [], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "9c587bd2", + "metadata": {}, + "source": [ + "### Numba kernels for DataFrames" + ] + }, + { + "cell_type": "markdown", + "id": "adc6a459", + "metadata": {}, + "source": [ + "We could apply a UDF on a DataFrame like we did above with `forall`. We'd need to write a kernel that expects multiple inputs, and pass multiple Series as arguments when we execute our kernel. Because this is fairly common and can be difficult to manage, cuDF provides two APIs to streamline this: `apply_rows` and `apply_chunks`. Below, we walk through an example of using `apply_rows`. 
`apply_chunks` works in a similar way, but also offers more control over low-level kernel behavior.\n", + "\n", + "Now that we have two numeric columns in our DataFrame, let's write a kernel that uses both of them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90cbcd85", + "metadata": {}, + "outputs": [], + "source": [ + "def conditional_add(x, y, out):\n", + " for i, (a, e) in enumerate(zip(x, y)):\n", + " if a > 0:\n", + " out[i] = a + e\n", + " else:\n", + " out[i] = a" + ] + }, + { + "cell_type": "markdown", + "id": "bce045f2", + "metadata": {}, + "source": [ + "Notice that we need to `enumerate` through our `zipped` function arguments (which either match or are mapped to our input column names). We can pass this kernel to `apply_rows`. We'll need to specify a few arguments:\n", + "- incols\n", + " - A list of names of input columns that match the function arguments. Or, a dictionary mapping input column names to their corresponding function arguments such as `{'col1': 'arg1'}`.\n", + "- outcols\n", + " - A dictionary defining our output column names and their data types. These names must match our function arguments.\n", + "- kwargs (optional)\n", + " - We can optionally pass keyword arguments as a dictionary. Since we don't need any, we pass an empty one.\n", + " \n", + "While it looks like our function is looping sequentially through our columns, it actually executes in parallel in multiple threads on the GPU. This parallelism is the heart of GPU-accelerated computing. With that background, we're ready to use our UDF." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e782daff", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.apply_rows(conditional_add, \n", + " incols={'a':'x', 'e':'y'},\n", + " outcols={'out': np.float64},\n", + " kwargs={}\n", + " )\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6b838b89", + "metadata": {}, + "source": [ + "As expected, we see our conditional addition worked. At this point, we've successfully executed UDFs on the core data structures of cuDF." + ] + }, + { + "cell_type": "markdown", + "id": "fca97003", + "metadata": {}, + "source": [ + "### Null Handling in `apply_rows` and `apply_chunks`\n", + "\n", + "By default, DataFrame methods for applying UDFs like `apply_rows` will handle nulls pessimistically (all rows with a null value will be removed from the output if they are used in the kernel). Exploring how not handling not pessimistically can lead to undefined behavior is outside the scope of this guide. Suffice it to say, pessimistic null handling is the safe and consistent approach. You can see an example below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "befd8333", + "metadata": {}, + "outputs": [], + "source": [ + "def gpu_add(a, b, out):\n", + " for i, (x, y) in enumerate(zip(a, b)):\n", + " out[i] = x + y\n", + "\n", + "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)\n", + "df.loc[2, 'a'] = None\n", + "df.loc[3, 'b'] = None\n", + "df.loc[1, 'c'] = None\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c710ce86", + "metadata": {}, + "source": [ + "In the dataframe above, there are three null values. Each column has a null in a different row. When we use our UDF with `apply_rows`, our output should have two nulls due to pessimistic null handling (because we're not using column `c`, the null value there does not matter to us)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1f3dcaf", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.apply_rows(gpu_add, \n", + " incols=['a', 'b'],\n", + " outcols={'out':np.float64},\n", + " kwargs={})\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "53b9a2f8", + "metadata": {}, + "source": [ + "As expected, we end up with two nulls in our output. The null values from the columns we used propogated to our output, but the null from the column we ignored did not." + ] + }, + { + "cell_type": "markdown", + "id": "4bbefa67", + "metadata": {}, + "source": [ + "## Rolling Window UDFs\n", + "\n", + "For time-series data, we may need to operate on a small \\\"window\\\" of our column at a time, processing each portion independently. We could slide (\\\"roll\\\") this window over the entire column to answer questions like \\\"What is the 3-day moving average of a stock price over the past year?\"\n", + "\n", + "We can apply more complex functions to rolling windows to `rolling` Series and DataFrames using `apply`. This example is adapted from cuDF's [API documentation](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.DataFrame.rolling.html). First, we'll create an example Series and then create a `rolling` object from the Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bc6aea3", + "metadata": {}, + "outputs": [], + "source": [ + "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", + "ser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4c31df1", + "metadata": {}, + "outputs": [], + "source": [ + "rolling = ser.rolling(window=3, min_periods=3, center=False)\n", + "rolling" + ] + }, + { + "cell_type": "markdown", + "id": "ff40d863", + "metadata": {}, + "source": [ + "Next, we'll define a function to use on our rolling windows. 
We created this one to highlight how you can include things like loops, mathematical functions, and conditionals. Rolling window UDFs do not yet support null values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb5a081b", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "\n", + "def example_func(window):\n", + " b = 0\n", + " for a in window:\n", + " b = max(b, math.sqrt(a))\n", + " if b == 8:\n", + " return 100 \n", + " return b" + ] + }, + { + "cell_type": "markdown", + "id": "df8ba31d", + "metadata": {}, + "source": [ + "We can execute the function by passing it to `apply`. With `window=3`, `min_periods=3`, and `center=False`, our first two values are `null`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddec3263", + "metadata": {}, + "outputs": [], + "source": [ + "rolling.apply(example_func)" + ] + }, + { + "cell_type": "markdown", + "id": "187478db", + "metadata": {}, + "source": [ + "We can apply this function to every column in a DataFrame, too." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b61094a", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = cudf.DataFrame()\n", + "df2['a'] = np.arange(55, 65, dtype='float64')\n", + "df2['b'] = np.arange(55, 65, dtype='float64')\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb8c3019", + "metadata": {}, + "outputs": [], + "source": [ + "rolling = df2.rolling(window=3, min_periods=3, center=False)\n", + "rolling.apply(example_func)" + ] + }, + { + "cell_type": "markdown", + "id": "d4785060", + "metadata": {}, + "source": [ + "## GroupBy DataFrame UDFs\n", + "\n", + "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", + "\n", + "First, we'll group our DataFrame based on column `b`, which is either True or False." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dc272ab", + "metadata": {}, + "outputs": [], + "source": [ + "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0578e0a", + "metadata": {}, + "outputs": [], + "source": [ + "grouped = df.groupby(['b'])" + ] + }, + { + "cell_type": "markdown", + "id": "4808726f", + "metadata": {}, + "source": [ + "Next we'll define a function to apply to each group independently. In this case, we'll take the rolling average of column `e`, and call that new column `rolling_avg_e`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19f0f7fe", + "metadata": {}, + "outputs": [], + "source": [ + "def rolling_avg(e, rolling_avg_e):\n", + " win_size = 3\n", + " for i in range(cuda.threadIdx.x, len(e), cuda.blockDim.x):\n", + " if i < win_size - 1:\n", + " # If there is not enough data to fill the window,\n", + " # take the average to be NaN\n", + " rolling_avg_e[i] = np.nan\n", + " else:\n", + " total = 0\n", + " for j in range(i - win_size + 1, i + 1):\n", + " total += e[j]\n", + " rolling_avg_e[i] = total / win_size" + ] + }, + { + "cell_type": "markdown", + "id": "7566f359", + "metadata": {}, + "source": [ + "We can execute this with a very similar API to `apply_rows`. This time, though, it's going to execute independently for each group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c43426c3", + "metadata": {}, + "outputs": [], + "source": [ + "results = grouped.apply_grouped(rolling_avg,\n", + " incols=['e'],\n", + " outcols=dict(rolling_avg_e=np.float64))\n", + "results" + ] + }, + { + "cell_type": "markdown", + "id": "c8511306", + "metadata": {}, + "source": [ + "Notice how, with a window size of three in the kernel, the first two values in each group for our output column are null." 
+ ] + }, + { + "cell_type": "markdown", + "id": "0060678c", + "metadata": {}, + "source": [ + "## Numba Kernels on CuPy Arrays\n", + "\n", + "We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa6a8509", + "metadata": {}, + "outputs": [], + "source": [ + "import cupy as cp\n", + "\n", + "s = cudf.Series([1.0, 2, 3, 4, 10])\n", + "arr = cp.asarray(s)\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "id": "0fed556f", + "metadata": {}, + "source": [ + "Next, we define a UDF and execute it on our Series. We need to allocate a Series of the same size for our output, which we'll call `out`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bb8bf93", + "metadata": {}, + "outputs": [], + "source": [ + "@cuda.jit\n", + "def multiply_by_5(x, out):\n", + " i = cuda.grid(1)\n", + " if i < x.size:\n", + " out[i] = x[i] * 5\n", + " \n", + "out = cudf.Series(cp.zeros(len(s), dtype='int32'))\n", + "multiply_by_5.forall(s.shape[0])(s, out)\n", + "out" + ] + }, + { + "cell_type": "markdown", + "id": "a857b169", + "metadata": {}, + "source": [ + "Finally, we execute the same function on our array. We allocate an empty array `out` to store our results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce60b639", + "metadata": {}, + "outputs": [], + "source": [ + "out = cp.empty_like(arr)\n", + "multiply_by_5.forall(arr.size)(arr, out)\n", + "out" + ] + }, + { + "cell_type": "markdown", + "id": "b899d51c", + "metadata": {}, + "source": [ + "## Caveats" + ] + }, + { + "cell_type": "markdown", + "id": "fe7eb68b", + "metadata": {}, + "source": [ + "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. 
Attempting to use this API with those types will throw a `TypeError`.\n", + "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a GitHub issue should be raised so that it can be properly prioritized and implemented." + ] + }, + { + "cell_type": "markdown", + "id": "c690563b", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on\n", + "\n", + "- Series\n", + "- DataFrame\n", + "- Rolling Windows\n", + "- GroupBy DataFrames\n", + "- CuPy NDArrays\n", + "- Numba DeviceNDArrays\n", + "- Generalized NA UDFs\n", + "\n", + "\n", + "For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.pydata.org/numba-doc/dev/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9828b01336506cf88165036c78d2576955b3894e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 11:57:16 -0400 Subject: [PATCH 09/14] Delete md --- docs/cudf/source/user_guide/10min.md | 733 ------------------ .../user_guide/Working-with-missing-data.md | 489 ------------ docs/cudf/source/user_guide/cupy-interop.md | 217 ------ docs/cudf/source/user_guide/guide-to-udfs.md | 558 ------------- 4 files changed, 1997 deletions(-) delete mode 100644 docs/cudf/source/user_guide/10min.md delete mode 100644 docs/cudf/source/user_guide/Working-with-missing-data.md delete mode 100644 docs/cudf/source/user_guide/cupy-interop.md delete mode 100644 docs/cudf/source/user_guide/guide-to-udfs.md diff --git a/docs/cudf/source/user_guide/10min.md b/docs/cudf/source/user_guide/10min.md deleted file mode 100644 index d156be3d13c..00000000000 --- a/docs/cudf/source/user_guide/10min.md +++ /dev/null @@ -1,733 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -10 Minutes to cuDF and Dask-cuDF -======================= - -Modeled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly for new users. - -### What are these Libraries? - -[cuDF](https://github.com/rapidsai/cudf) is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API. - -[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. 
On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions. - -[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed by cuDF GPU DataFrames as opposed to Pandas DataFrames. For instance, when you call dask_cudf.read_csv(...), your cluster’s GPUs do the work of parsing the CSV file(s) with underlying cudf.read_csv(). - - -### When to use cuDF and Dask-cuDF - -If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF. - -```{code-cell} ipython3 -import os - -import cupy as cp -import pandas as pd -import cudf -import dask_cudf - -cp.random.seed(12) - -#### Portions of this were borrowed and adapted from the -#### cuDF cheatsheet, existing cuDF documentation, -#### and 10 Minutes to Pandas. -``` - -Object Creation ---------------- - -+++ - -Creating a `cudf.Series` and `dask_cudf.Series`. - -```{code-cell} ipython3 -s = cudf.Series([1,2,3,None,4]) -s -``` - -```{code-cell} ipython3 -ds = dask_cudf.from_cudf(s, npartitions=2) -ds.compute() -``` - -Creating a `cudf.DataFrame` and a `dask_cudf.DataFrame` by specifying values for each column. - -```{code-cell} ipython3 -df = cudf.DataFrame({'a': list(range(20)), - 'b': list(reversed(range(20))), - 'c': list(range(20)) - }) -df -``` - -```{code-cell} ipython3 -ddf = dask_cudf.from_cudf(df, npartitions=2) -ddf.compute() -``` - -Creating a `cudf.DataFrame` from a pandas `Dataframe` and a `dask_cudf.Dataframe` from a `cudf.Dataframe`. 
- -*Note that best practice for using Dask-cuDF is to read data directly into a `dask_cudf.DataFrame` with something like `read_csv` (discussed below).* - -```{code-cell} ipython3 -pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) -gdf = cudf.DataFrame.from_pandas(pdf) -gdf -``` - -```{code-cell} ipython3 -dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2) -dask_gdf.compute() -``` - -Viewing Data -------------- - -+++ - -Viewing the top rows of a GPU dataframe. - -```{code-cell} ipython3 -df.head(2) -``` - -```{code-cell} ipython3 -ddf.head(2) -``` - -Sorting by values. - -```{code-cell} ipython3 -df.sort_values(by='b') -``` - -```{code-cell} ipython3 -ddf.sort_values(by='b').compute() -``` - -Selection ------------- - -## Getting - -+++ - -Selecting a single column, which initially yields a `cudf.Series` or `dask_cudf.Series`. Calling `compute` results in a `cudf.Series` (equivalent to `df.a`). - -```{code-cell} ipython3 -df['a'] -``` - -```{code-cell} ipython3 -ddf['a'].compute() -``` - -## Selection by Label - -+++ - -Selecting rows from index 2 to index 5 from columns 'a' and 'b'. - -```{code-cell} ipython3 -df.loc[2:5, ['a', 'b']] -``` - -```{code-cell} ipython3 -ddf.loc[2:5, ['a', 'b']].compute() -``` - -## Selection by Position - -+++ - -Selecting via integers and integer slices, like numpy/pandas. Note that this functionality is not available for Dask-cuDF DataFrames. - -```{code-cell} ipython3 -df.iloc[0] -``` - -```{code-cell} ipython3 -df.iloc[0:3, 0:2] -``` - -You can also select elements of a `DataFrame` or `Series` with direct index access. - -```{code-cell} ipython3 -df[3:5] -``` - -```{code-cell} ipython3 -s[3:5] -``` - -## Boolean Indexing - -+++ - -Selecting rows in a `DataFrame` or `Series` by direct Boolean indexing. - -```{code-cell} ipython3 -df[df.b > 15] -``` - -```{code-cell} ipython3 -ddf[ddf.b > 15].compute() -``` - -Selecting values from a `DataFrame` where a Boolean condition is met, via the `query` API. 
- -```{code-cell} ipython3 -df.query("b == 3") -``` - -```{code-cell} ipython3 -ddf.query("b == 3").compute() -``` - -You can also pass local variables to Dask-cuDF queries, via the `local_dict` keyword. With standard cuDF, you may either use the `local_dict` keyword or directly pass the variable via the `@` keyword. Supported logical operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`. - -```{code-cell} ipython3 -cudf_comparator = 3 -df.query("b == @cudf_comparator") -``` - -```{code-cell} ipython3 -dask_cudf_comparator = 3 -ddf.query("b == @val", local_dict={'val':dask_cudf_comparator}).compute() -``` - -Using the `isin` method for filtering. - -```{code-cell} ipython3 -df[df.a.isin([0, 5])] -``` - -## MultiIndex - -+++ - -cuDF supports hierarchical indexing of DataFrames using MultiIndex. Grouping hierarchically (see `Grouping` below) automatically produces a DataFrame with a MultiIndex. - -```{code-cell} ipython3 -arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]] -tuples = list(zip(*arrays)) -idx = cudf.MultiIndex.from_tuples(tuples) -idx -``` - -This index can back either axis of a DataFrame. - -```{code-cell} ipython3 -gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}) -gdf1.index = idx -gdf1 -``` - -```{code-cell} ipython3 -gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T -gdf2.columns = idx -gdf2 -``` - -Accessing values of a DataFrame with a MultiIndex. Note that slicing is not yet supported. - -```{code-cell} ipython3 -gdf1.loc[('b', 3)] -``` - -Missing Data ------------- - -+++ - -Missing data can be replaced by using the `fillna` method. - -```{code-cell} ipython3 -s.fillna(999) -``` - -```{code-cell} ipython3 -ds.fillna(999).compute() -``` - -Operations ------------- - -+++ - -## Stats - -+++ - -Calculating descriptive statistics for a `Series`. 
- -```{code-cell} ipython3 -s.mean(), s.var() -``` - -```{code-cell} ipython3 -ds.mean().compute(), ds.var().compute() -``` - -## Applymap - -+++ - -Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe. - -```{code-cell} ipython3 -def add_ten(num): - return num + 10 - -df['a'].applymap(add_ten) -``` - -```{code-cell} ipython3 -ddf['a'].map_partitions(add_ten).compute() -``` - -## Histogramming - -+++ - -Counting the number of occurrences of each unique value of variable. - -```{code-cell} ipython3 -df.a.value_counts() -``` - -```{code-cell} ipython3 -ddf.a.value_counts().compute() -``` - -## String Methods - -+++ - -Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the cuDF API documentation for more information. - -```{code-cell} ipython3 -s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat']) -s.str.lower() -``` - -```{code-cell} ipython3 -ds = dask_cudf.from_cudf(s, npartitions=2) -ds.str.lower().compute() -``` - -## Concat - -+++ - -Concatenating `Series` and `DataFrames` row-wise. - -```{code-cell} ipython3 -s = cudf.Series([1, 2, 3, None, 5]) -cudf.concat([s, s]) -``` - -```{code-cell} ipython3 -ds2 = dask_cudf.from_cudf(s, npartitions=2) -dask_cudf.concat([ds2, ds2]).compute() -``` - -## Join - -+++ - -Performing SQL style merges. Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index. 
- -```{code-cell} ipython3 -df_a = cudf.DataFrame() -df_a['key'] = ['a', 'b', 'c', 'd', 'e'] -df_a['vals_a'] = [float(i + 10) for i in range(5)] - -df_b = cudf.DataFrame() -df_b['key'] = ['a', 'c', 'e'] -df_b['vals_b'] = [float(i+100) for i in range(3)] - -merged = df_a.merge(df_b, on=['key'], how='left') -merged -``` - -```{code-cell} ipython3 -ddf_a = dask_cudf.from_cudf(df_a, npartitions=2) -ddf_b = dask_cudf.from_cudf(df_b, npartitions=2) - -merged = ddf_a.merge(ddf_b, on=['key'], how='left').compute() -merged -``` - -## Append - -+++ - -Appending values from another `Series` or array-like object. - -```{code-cell} ipython3 -s.append(s) -``` - -```{code-cell} ipython3 -ds2.append(ds2).compute() -``` - -## Grouping - -+++ - -Like pandas, cuDF and Dask-cuDF support the Split-Apply-Combine groupby paradigm. - -```{code-cell} ipython3 -df['agg_col1'] = [1 if x % 2 == 0 else 0 for x in range(len(df))] -df['agg_col2'] = [1 if x % 3 == 0 else 0 for x in range(len(df))] - -ddf = dask_cudf.from_cudf(df, npartitions=2) -``` - -Grouping and then applying the `sum` function to the grouped data. - -```{code-cell} ipython3 -df.groupby('agg_col1').sum() -``` - -```{code-cell} ipython3 -ddf.groupby('agg_col1').sum().compute() -``` - -Grouping hierarchically then applying the `sum` function to grouped data. - -```{code-cell} ipython3 -df.groupby(['agg_col1', 'agg_col2']).sum() -``` - -```{code-cell} ipython3 -ddf.groupby(['agg_col1', 'agg_col2']).sum().compute() -``` - -Grouping and applying statistical functions to specific columns, using `agg`. - -```{code-cell} ipython3 -df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}) -``` - -```{code-cell} ipython3 -ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute() -``` - -## Transpose - -+++ - -Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF. 
- -```{code-cell} ipython3 -sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) -sample -``` - -```{code-cell} ipython3 -sample.transpose() -``` - -Time Series ------------- - -+++ - -`DataFrames` supports `datetime` typed columns, which allow users to interact with and filter data based on specific timestamps. - -```{code-cell} ipython3 -import datetime as dt - -date_df = cudf.DataFrame() -date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D') -date_df['value'] = cp.random.sample(len(date_df)) - -search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d') -date_df.query('date <= @search_date') -``` - -```{code-cell} ipython3 -date_ddf = dask_cudf.from_cudf(date_df, npartitions=2) -date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute() -``` - -Categoricals ------------- - -+++ - -`DataFrames` support categorical columns. - -```{code-cell} ipython3 -gdf = cudf.DataFrame({"id": [1, 2, 3, 4, 5, 6], "grade":['a', 'b', 'b', 'a', 'a', 'e']}) -gdf['grade'] = gdf['grade'].astype('category') -gdf -``` - -```{code-cell} ipython3 -dgdf = dask_cudf.from_cudf(gdf, npartitions=2) -dgdf.compute() -``` - -Accessing the categories of a column. Note that this is currently not supported in Dask-cuDF. - -```{code-cell} ipython3 -gdf.grade.cat.categories -``` - -Accessing the underlying code values of each categorical observation. - -```{code-cell} ipython3 -gdf.grade.cat.codes -``` - -```{code-cell} ipython3 -dgdf.grade.cat.codes.compute() -``` - -Converting Data Representation --------------------------------- - -+++ - -## Pandas - -+++ - -Converting a cuDF and Dask-cuDF `DataFrame` to a pandas `DataFrame`. - -```{code-cell} ipython3 -df.head().to_pandas() -``` - -```{code-cell} ipython3 -ddf.compute().head().to_pandas() -``` - -## Numpy - -+++ - -Converting a cuDF or Dask-cuDF `DataFrame` to a numpy `ndarray`. 
- -```{code-cell} ipython3 -df.to_numpy() -``` - -```{code-cell} ipython3 -ddf.compute().to_numpy() -``` - -Converting a cuDF or Dask-cuDF `Series` to a numpy `ndarray`. - -```{code-cell} ipython3 -df['a'].to_numpy() -``` - -```{code-cell} ipython3 -ddf['a'].compute().to_numpy() -``` - -## Arrow - -+++ - -Converting a cuDF or Dask-cuDF `DataFrame` to a PyArrow `Table`. - -```{code-cell} ipython3 -df.to_arrow() -``` - -```{code-cell} ipython3 -ddf.compute().to_arrow() -``` - -Getting Data In/Out ------------------------- - -+++ - -## CSV - -+++ - -Writing to a CSV file. - -```{code-cell} ipython3 -if not os.path.exists('example_output'): - os.mkdir('example_output') - -df.to_csv('example_output/foo.csv', index=False) -``` - -```{code-cell} ipython3 -ddf.compute().to_csv('example_output/foo_dask.csv', index=False) -``` - -Reading from a csv file. - -```{code-cell} ipython3 -df = cudf.read_csv('example_output/foo.csv') -df -``` - -```{code-cell} ipython3 -ddf = dask_cudf.read_csv('example_output/foo_dask.csv') -ddf.compute() -``` - -Reading all CSV files in a directory into a single `dask_cudf.DataFrame`, using the star wildcard. - -```{code-cell} ipython3 -ddf = dask_cudf.read_csv('example_output/*.csv') -ddf.compute() -``` - -## Parquet - -+++ - -Writing to parquet files, using the CPU via PyArrow. - -```{code-cell} ipython3 -df.to_parquet('example_output/temp_parquet') -``` - -Reading parquet files with a GPU-accelerated parquet reader. - -```{code-cell} ipython3 -df = cudf.read_parquet('example_output/temp_parquet') -df -``` - -Writing to parquet files from a `dask_cudf.DataFrame` using PyArrow under the hood. - -```{code-cell} ipython3 -ddf.to_parquet('example_files') -``` - -## ORC - -+++ - -Reading ORC files. 
- -```{code-cell} ipython3 -import os -from pathlib import Path -current_dir = os.path.dirname(os.path.realpath("__file__")) -cudf_root = Path(current_dir).parents[3] -file_path = os.path.join(cudf_root, "python", "cudf", "cudf", "tests", "data", "orc", "TestOrcFile.test1.orc") -file_path -``` - -```{code-cell} ipython3 -df2 = cudf.read_orc(file_path) -df2 -``` - -Dask Performance Tips --------------------------------- - -Like Apache Spark, Dask operations are [lazy](https://en.wikipedia.org/wiki/Lazy_evaluation). Instead of being executed at that moment, most operations are added to a task graph and the actual evaluation is delayed until the result is needed. - -Sometimes, though, we want to force the execution of operations. Calling `persist` on a Dask collection fully computes it (or actively computes it in the background), persisting the result into memory. When we're using distributed systems, we may want to wait until `persist` is finished before beginning any downstream operations. We can enforce this contract by using `wait`. Wrapping an operation with `wait` will ensure it doesn't begin executing until all necessary upstream operations have finished. - -The snippets below provide basic examples, using `LocalCUDACluster` to create one dask-worker per GPU on the local machine. For more detailed information about `persist` and `wait`, please see the Dask documentation for [persist](https://docs.dask.org/en/latest/api.html#dask.persist) and [wait](https://docs.dask.org/en/latest/futures.html#distributed.wait). Wait relies on the concept of Futures, which is beyond the scope of this tutorial. For more information on Futures, see the Dask [Futures](https://docs.dask.org/en/latest/futures.html) documentation. For more information about multi-GPU clusters, please see the [dask-cuda](https://github.com/rapidsai/dask-cuda) library (documentation is in progress). - -+++ - -First, we set up a GPU cluster. 
With our `client` set up, Dask-cuDF computation will be distributed across the GPUs in the cluster. - -```{code-cell} ipython3 -import time - -from dask.distributed import Client, wait -from dask_cuda import LocalCUDACluster - -cluster = LocalCUDACluster() -client = Client(cluster) -client -``` - -### Persisting Data -Next, we create our Dask-cuDF DataFrame and apply a transformation, storing the result as a new column. - -```{code-cell} ipython3 -nrows = 10000000 - -df2 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)}) -ddf2 = dask_cudf.from_cudf(df2, npartitions=5) -ddf2['c'] = ddf2['a'] + 5 -ddf2 -``` - -```{code-cell} ipython3 -!nvidia-smi -``` - -Because Dask is lazy, the computation has not yet occurred. We can see that there are twenty tasks in the task graph and we've used about 800 MB of memory. We can force computation by using `persist`. By forcing execution, the result is now explicitly in memory and our task graph only contains one task per partition (the baseline). - -```{code-cell} ipython3 -ddf2 = ddf2.persist() -ddf2 -``` - -```{code-cell} ipython3 -!nvidia-smi -``` - -Because we forced computation, we now have a larger object in distributed GPU memory. - -+++ - -### Wait -Depending on our workflow or distributed computing setup, we may want to `wait` until all upstream tasks have finished before proceeding with a specific function. This section shows an example of this behavior, adapted from the Dask documentation. - -First, we create a new Dask DataFrame and define a function that we'll map to every partition in the dataframe. 
- -```{code-cell} ipython3 -import random - -nrows = 10000000 - -df1 = cudf.DataFrame({'a': cp.arange(nrows), 'b': cp.arange(nrows)}) -ddf1 = dask_cudf.from_cudf(df1, npartitions=100) - -def func(df): - time.sleep(random.randint(1, 60)) - return (df + 5) * 3 - 11 -``` - -This function will do a basic transformation of every column in the dataframe, but the time spent in the function will vary due to the `time.sleep` statement randomly adding 1-60 seconds of time. We'll run this on every partition of our dataframe using `map_partitions`, which adds the task to our task-graph, and store the result. We can then call `persist` to force execution. - -```{code-cell} ipython3 -results_ddf = ddf2.map_partitions(func) -results_ddf = results_ddf.persist() -``` - -However, some partitions will be done **much** sooner than others. If we had downstream processes that should wait for all partitions to be completed, we can enforce that behavior using `wait`. - -```{code-cell} ipython3 -wait(results_ddf) -``` - -## With `wait`, we can safely proceed on in our workflow. - -```{code-cell} ipython3 - -``` diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.md b/docs/cudf/source/user_guide/Working-with-missing-data.md deleted file mode 100644 index 6932d0fa9f1..00000000000 --- a/docs/cudf/source/user_guide/Working-with-missing-data.md +++ /dev/null @@ -1,489 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Working with missing data - -+++ - -In this section, we will discuss missing (also referred to as `NA`) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by ``. These values are also referenced as "null values". - -+++ - -1. [How to Detect missing values](#How-to-Detect-missing-values) -2. 
[Float dtypes and missing data](#Float-dtypes-and-missing-data) -3. [Datetimes](#Datetimes) -4. [Calculations with missing data](#Calculations-with-missing-data) -5. [Sum/product of Null/nans](#Sum/product-of-Null/nans) -6. [NA values in GroupBy](#NA-values-in-GroupBy) -7. [Inserting missing data](#Inserting-missing-data) -8. [Filling missing values: fillna](#Filling-missing-values:-fillna) -9. [Filling with cudf Object](#Filling-with-cudf-Object) -10. [Dropping axis labels with missing data: dropna](#Dropping-axis-labels-with-missing-data:-dropna) -11. [Replacing generic values](#Replacing-generic-values) -12. [String/regular expression replacement](#String/regular-expression-replacement) -13. [Numeric replacement](#Numeric-replacement) - -+++ - -## How to Detect missing values - -+++ - -To detect missing values, you can use `isna()` and `notna()` functions. - -```{code-cell} ipython3 -import cudf -import numpy as np -``` - -```{code-cell} ipython3 -df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]}) -``` - -```{code-cell} ipython3 -df -``` - -```{code-cell} ipython3 -df.isna() -``` - -```{code-cell} ipython3 -df['a'].notna() -``` - -One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do. Note that cudf/NumPy uses the fact that `np.nan != np.nan`, and treats `None` like `np.nan`. - -```{code-cell} ipython3 -None == None -``` - -```{code-cell} ipython3 -np.nan == np.nan -``` - -So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information. 
- - -```{code-cell} ipython3 -df['b'] == np.nan -``` - -```{code-cell} ipython3 -s = cudf.Series([None, 1, 2]) -``` - -```{code-cell} ipython3 -s -``` - -```{code-cell} ipython3 -s == None -``` - -```{code-cell} ipython3 -s = cudf.Series([1, 2, np.nan], nan_as_null=False) -``` - -```{code-cell} ipython3 -s -``` - -```{code-cell} ipython3 -s == np.nan -``` - -## Float dtypes and missing data - -+++ - -Because ``NaN`` is a float, a column of integers with even one missing values is cast to floating-point dtype. However this doesn't happen by default. - -By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value. - -```{code-cell} ipython3 -cudf.Series([1, 2, np.nan]) -``` - -Hence to consider a ``NaN`` as ``NaN`` you will have to pass `nan_as_null=False` parameter into `Series` constructor. - -```{code-cell} ipython3 -cudf.Series([1, 2, np.nan], nan_as_null=False) -``` - -## Datetimes - -+++ - -For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object. - - -```{code-cell} ipython3 -import pandas as pd -datetime_series = cudf.Series([pd.Timestamp("20120101"), pd.NaT, pd.Timestamp("20120101")]) -datetime_series -``` - -```{code-cell} ipython3 -datetime_series.to_pandas() -``` - -any operations on rows having `` values in `datetime` column will result in `` value at the same location in resulting column: - -```{code-cell} ipython3 -datetime_series - datetime_series -``` - -## Calculations with missing data - -+++ - -Null values propagate naturally through arithmetic operations between pandas objects. 
- -```{code-cell} ipython3 -df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)}) -``` - -```{code-cell} ipython3 -df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])}) -``` - -```{code-cell} ipython3 -df1 -``` - -```{code-cell} ipython3 -df2 -``` - -```{code-cell} ipython3 -df1 + df2 -``` - -While summing the data along a series, `NA` values will be treated as `0`. - -```{code-cell} ipython3 -df1['a'] -``` - -```{code-cell} ipython3 -df1['a'].sum() -``` - -Since `NA` values are treated as `0`, the mean would result to 2 in this case `(1 + 0 + 2 + 3 + 0)/5 = 2` - -```{code-cell} ipython3 -df1['a'].mean() -``` - -To preserve `NA` values in the above calculations, `sum` & `mean` support `skipna` parameter. -By default it's value is -set to `True`, we can change it to `False` to preserve `NA` values. - -```{code-cell} ipython3 -df1['a'].sum(skipna=False) -``` - -```{code-cell} ipython3 -df1['a'].mean(skipna=False) -``` - -Cumulative methods like `cumsum` and `cumprod` ignore `NA` values by default. - -```{code-cell} ipython3 -df1['a'].cumsum() -``` - -To preserve `NA` values in cumulative methods, provide `skipna=False`. - -```{code-cell} ipython3 -df1['a'].cumsum(skipna=False) -``` - -## Sum/product of Null/nans - -+++ - -The sum of an empty or all-NA Series of a DataFrame is 0. - -```{code-cell} ipython3 -cudf.Series([np.nan], nan_as_null=False).sum() -``` - -```{code-cell} ipython3 -cudf.Series([np.nan], nan_as_null=False).sum(skipna=False) -``` - -```{code-cell} ipython3 -cudf.Series([], dtype='float64').sum() -``` - -The product of an empty or all-NA Series of a DataFrame is 1. 
- -```{code-cell} ipython3 -cudf.Series([np.nan], nan_as_null=False).prod() -``` - -```{code-cell} ipython3 -cudf.Series([np.nan], nan_as_null=False).prod(skipna=False) -``` - -```{code-cell} ipython3 -cudf.Series([], dtype='float64').prod() -``` - -## NA values in GroupBy - -+++ - -`NA` groups in GroupBy are automatically excluded. For example: - -```{code-cell} ipython3 -df1 -``` - -```{code-cell} ipython3 -df1.groupby('a').mean() -``` - -It is also possible to include `NA` in groups by passing `dropna=False` - -```{code-cell} ipython3 -df1.groupby('a', dropna=False).mean() -``` - -## Inserting missing data - -+++ - -All dtypes support insertion of missing value by assignment. Any specific location in series can made null by assigning it to `None`. - -```{code-cell} ipython3 -series = cudf.Series([1, 2, 3, 4]) -``` - -```{code-cell} ipython3 -series -``` - -```{code-cell} ipython3 -series[2] = None -``` - -```{code-cell} ipython3 -series -``` - -## Filling missing values: fillna - -+++ - -`fillna()` can fill in `NA` & `NaN` values with non-NA data. - -```{code-cell} ipython3 -df1 -``` - -```{code-cell} ipython3 -df1['b'].fillna(10) -``` - -## Filling with cudf Object - -+++ - -You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column. 
- -```{code-cell} ipython3 -import cupy as cp -dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC')) -``` - -```{code-cell} ipython3 -dff.iloc[3:5, 0] = np.nan -``` - -```{code-cell} ipython3 -dff.iloc[4:6, 1] = np.nan -``` - -```{code-cell} ipython3 -dff.iloc[5:8, 2] = np.nan -``` - -```{code-cell} ipython3 -dff -``` - -```{code-cell} ipython3 -dff.fillna(dff.mean()) -``` - -```{code-cell} ipython3 -dff.fillna(dff.mean()[1:3]) -``` - -## Dropping axis labels with missing data: dropna - -+++ - -Missing data can be excluded using `dropna()`: - - -```{code-cell} ipython3 -df1 -``` - -```{code-cell} ipython3 -df1.dropna(axis=0) -``` - -```{code-cell} ipython3 -df1.dropna(axis=1) -``` - -An equivalent `dropna()` is available for Series. - -```{code-cell} ipython3 -df1['a'].dropna() -``` - -## Replacing generic values - -+++ - -Often times we want to replace arbitrary values with other values. - -`replace()` in Series and `replace()` in DataFrame provides an efficient yet flexible way to perform such replacements. - -```{code-cell} ipython3 -series = cudf.Series([0.0, 1.0, 2.0, 3.0, 4.0]) -``` - -```{code-cell} ipython3 -series -``` - -```{code-cell} ipython3 -series.replace(0, 5) -``` - -We can also replace any value with a `` value. 
- -```{code-cell} ipython3 -series.replace(0, None) -``` - -You can replace a list of values by a list of other values: - -```{code-cell} ipython3 -series.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) -``` - -You can also specify a mapping dict: - -```{code-cell} ipython3 -series.replace({0: 10, 1: 100}) -``` - -For a DataFrame, you can specify individual values by column: - -```{code-cell} ipython3 -df = cudf.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) -``` - -```{code-cell} ipython3 -df -``` - -```{code-cell} ipython3 -df.replace({"a": 0, "b": 5}, 100) -``` - -## String/regular expression replacement - -+++ - -cudf supports replacing string values using `replace` API: - -```{code-cell} ipython3 -d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", None, "d"]} -``` - -```{code-cell} ipython3 -df = cudf.DataFrame(d) -``` - -```{code-cell} ipython3 -df -``` - -```{code-cell} ipython3 -df.replace(".", "A Dot") -``` - -```{code-cell} ipython3 -df.replace([".", "b"], ["A Dot", None]) -``` - -Replace a few different values (list -> list): - -```{code-cell} ipython3 -df.replace(["a", "."], ["b", "--"]) -``` - -Only search in column 'b' (dict -> dict): - -```{code-cell} ipython3 -df.replace({"b": "."}, {"b": "replacement value"}) -``` - -## Numeric replacement - -+++ - -`replace()` can also be used similar to `fillna()`. - -```{code-cell} ipython3 -df = cudf.DataFrame(cp.random.randn(10, 2)) -``` - -```{code-cell} ipython3 -df[np.random.rand(df.shape[0]) > 0.5] = 1.5 -``` - -```{code-cell} ipython3 -df.replace(1.5, None) -``` - -Replacing more than one value is possible by passing a list. 
- - -```{code-cell} ipython3 -df00 = df.iloc[0, 0] -``` - -```{code-cell} ipython3 -df.replace([1.5, df00], [5, 10]) -``` - -You can also operate on the DataFrame in place: - - -```{code-cell} ipython3 -df.replace(1.5, None, inplace=True) -``` - -```{code-cell} ipython3 -df -``` diff --git a/docs/cudf/source/user_guide/cupy-interop.md b/docs/cudf/source/user_guide/cupy-interop.md deleted file mode 100644 index 880537c703c..00000000000 --- a/docs/cudf/source/user_guide/cupy-interop.md +++ /dev/null @@ -1,217 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Interoperability between cuDF and CuPy - -This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations). - -```{code-cell} ipython3 -import timeit -from packaging import version - -import cupy as cp -import cudf - -if version.parse(cp.__version__) >= version.parse("10.0.0"): - cupy_from_dlpack = cp.from_dlpack -else: - cupy_from_dlpack = cp.fromDlpack -``` - -### Converting a cuDF DataFrame to a CuPy Array - -If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it: - -1. We can use the [dlpack](https://github.com/dmlc/dlpack) interface. - -2. We can also use `DataFrame.values`. - -3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `as_gpu_matrix` and CuPy's `asarray` functionality. 
- -```{code-cell} ipython3 -nelem = 10000 -df = cudf.DataFrame({'a':range(nelem), - 'b':range(500, nelem + 500), - 'c':range(1000, nelem + 1000)} - ) - -%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack()) -%timeit arr_cupy = df.values -%timeit arr_cupy = df.to_cupy() -``` - -```{code-cell} ipython3 -arr_cupy = cupy_from_dlpack(df.to_dlpack()) -arr_cupy -``` - -### Converting a cuDF Series to a CuPy Array - -+++ - -There are also multiple ways to convert a cuDF Series to a CuPy array: - -1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html). -2. We can leverage the dlpack interface `to_dlpack()`. -3. We can also use `Series.values` - -```{code-cell} ipython3 -col = 'a' - -%timeit cola_cupy = cp.asarray(df[col]) -%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack()) -%timeit cola_cupy = df[col].values -``` - -```{code-cell} ipython3 -cola_cupy = cp.asarray(df[col]) -cola_cupy -``` - -From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm. - -```{code-cell} ipython3 -reshaped_arr = cola_cupy.reshape(50, 200) -reshaped_arr -``` - -```{code-cell} ipython3 -reshaped_arr.diagonal() -``` - -```{code-cell} ipython3 -cp.linalg.norm(reshaped_arr) -``` - -### Converting a CuPy Array to a cuDF DataFrame - -We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it: - -1. **Easiest;** We can directly use the `DataFrame` constructor. - -2. We can use CUDA array interface with the `DataFrame` constructor. - -3. We can also use the [dlpack](https://github.com/dmlc/dlpack) interface. - -For the latter two cases, we'll need to make sure that our CuPy array is Fortran contiguous in memory (if it's not already). We can either transpose the array or simply coerce it to be Fortran contiguous beforehand. 
- -```{code-cell} ipython3 -%timeit reshaped_df = cudf.DataFrame(reshaped_arr) -``` - -```{code-cell} ipython3 -reshaped_df = cudf.DataFrame(reshaped_arr) -reshaped_df.head() -``` - -We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array. - -```{code-cell} ipython3 -cp.isfortran(reshaped_arr) -``` - -In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively. - -```{code-cell} ipython3 -%%timeit - -fortran_arr = cp.asfortranarray(reshaped_arr) -reshaped_df = cudf.DataFrame(fortran_arr) -``` - -```{code-cell} ipython3 -%%timeit - -fortran_arr = cp.asfortranarray(reshaped_arr) -reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack()) -``` - -```{code-cell} ipython3 -fortran_arr = cp.asfortranarray(reshaped_arr) -reshaped_df = cudf.DataFrame(fortran_arr) -reshaped_df.head() -``` - -### Converting a CuPy Array to a cuDF Series - -To convert an array to a Series, we can directly pass the array to the `Series` constructor. - -```{code-cell} ipython3 -cudf.Series(reshaped_arr.diagonal()).head() -``` - -### Interweaving CuDF and CuPy for Smooth PyData Workflows - -RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive. 
- -By leveraging the interoperability of the GPU PyData ecosystem, this operation becomes very easy. Let's take the row-wise sum of our previously reshaped cuDF DataFrame. - -```{code-cell} ipython3 -reshaped_df.head() -``` - -We can just transform it into a CuPy array and use the `axis` argument of `sum`. - -```{code-cell} ipython3 -new_arr = cupy_from_dlpack(reshaped_df.to_dlpack()) -new_arr.sum(axis=1) -``` - -With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed. - -+++ - -### Converting a cuDF DataFrame to a CuPy Sparse Matrix - -We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input. - -The sparse matrix data structure is defined by three dense arrays. We'll define a small helper function for cleanliness. - -```{code-cell} ipython3 -def cudf_to_cupy_sparse_matrix(data, sparseformat='column'): - """Converts a cuDF object to a CuPy Sparse Column matrix. - """ - if sparseformat not in ('row', 'column',): - raise ValueError("Let's focus on column and row formats for now.") - - _sparse_constructor = cp.sparse.csc_matrix - if sparseformat == 'row': - _sparse_constructor = cp.sparse.csr_matrix - - return _sparse_constructor(cp.from_dlpack(data.to_dlpack())) -``` - -We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format. - -```{code-cell} ipython3 -df = cudf.DataFrame() -nelem = 10000 -nonzero = 1000 -for i in range(20): - arr = cp.random.normal(5, 5, nelem) - arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0 - df['a' + str(i)] = arr -``` - -```{code-cell} ipython3 -df.head() -``` - -```{code-cell} ipython3 -sparse_data = cudf_to_cupy_sparse_matrix(df) -print(sparse_data) -``` - -From here, we could continue our workflow with a CuPy sparse matrix. 
- -For a full list of the functionality built into these libraries, we encourage you to check out the API docs for [cuDF](https://docs.rapids.ai/api/cudf/nightly/) and [CuPy](https://docs-cupy.chainer.org/en/stable/index.html). diff --git a/docs/cudf/source/user_guide/guide-to-udfs.md b/docs/cudf/source/user_guide/guide-to-udfs.md deleted file mode 100644 index 396f67303f0..00000000000 --- a/docs/cudf/source/user_guide/guide-to-udfs.md +++ /dev/null @@ -1,558 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Overview of User Defined Functions with cuDF - -```{code-cell} ipython3 -import cudf -from cudf.datasets import randomdata -import numpy as np -``` - -Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame. - -In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures: - -- Series -- DataFrame -- Rolling Windows Series -- Groupby DataFrames -- CuPy NDArrays -- Numba DeviceNDArrays - -It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values. 
- -+++ - -## Series UDFs - -You can execute UDFs on Series in two ways: - -- Writing a standard python function and using `cudf.Series.apply` -- Writing a Numba kernel and using Numba's `forall` syntax - -Using `apply` or is simpler, but writing a Numba kernel offers the flexibility to build more complex functions (we'll be writing only simple kernels in this guide). - -+++ - -### `cudf.Series.apply` - -+++ - -cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Here is a very basic example. - -```{code-cell} ipython3 -# Create a cuDF series -sr = cudf.Series([1, 2, 3]) -``` - -UDFs destined for `cudf.Series.apply` might look something like this: - -```{code-cell} ipython3 -# define a scalar function -def f(x): - return x + 1 -``` - -`cudf.Series.apply` is called like `pd.Series.apply` and returns a new `Series` object: - -```{code-cell} ipython3 -sr.apply(f) -``` - -### Functions with Additional Scalar Arguments - -+++ - -In addition, `cudf.Series.apply` supports `args=` just like pandas, allowing you to write UDFs that accept an arbitrary number of scalar arguments. Here is an example of such a function and it's API call in both pandas and cuDF: - -```{code-cell} ipython3 -def g(x, const): - return x + const -``` - -```{code-cell} ipython3 -# cuDF apply -sr.apply(g, args=(42,)) -``` - -As a final note, `**kwargs` is not yet supported. - -+++ - -### Nullable Data - -+++ - -The null value `NA` an propagates through unary and binary operations. Thus, `NA + 1`, `abs(NA)`, and `NA == NA` all return `NA`. 
To make this concrete, let's look at the same example from above, this time using nullable data: - -```{code-cell} ipython3 -# Create a cuDF series with nulls -sr = cudf.Series([1, cudf.NA, 3]) -sr -``` - -```{code-cell} ipython3 -# redefine the same function from above -def f(x): - return x + 1 -``` - -```{code-cell} ipython3 -# cuDF result -sr.apply(f) -``` - -Often however you want explicit null handling behavior inside the function. cuDF exposes this capability the same way as pandas, by interacting directly with the `NA` singleton object. Here's an example of a function with explicit null handling: - -```{code-cell} ipython3 -def f_null_sensitive(x): - # do something if the input is null - if x is cudf.NA: - return 42 - else: - return x + 1 -``` - -```{code-cell} ipython3 -# cuDF result -sr.apply(f_null_sensitive) -``` - -In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases. - -+++ - -### Lower level control with custom `numba` kernels - -+++ - -In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html). -Note that this section requires basic CUDA knowledge. Refer to [numba's CUDA documentation](https://numba.pydata.org/numba-doc/latest/cuda/index.html) for details. - -The easiest way to write a Numba kernel is to use `cuda.grid(1)` to manage thread indices, and then leverage Numba's `forall` method to configure the kernel for us. Below, define a basic multiplication kernel as an example and use `@cuda.jit` to compile it. 
- -```{code-cell} ipython3 -df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12) -``` - -```{code-cell} ipython3 -from numba import cuda - -@cuda.jit -def multiply(in_col, out_col, multiplier): - i = cuda.grid(1) - if i < in_col.size: # boundary guard - out_col[i] = in_col[i] * multiplier -``` - -This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. Leaving this out can result in undefined behavior. - -To execute our kernel, must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply in the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series. - -```{code-cell} ipython3 -size = len(df['a']) -df['e'] = 0.0 -multiply.forall(size)(df['a'], df['e'], 10.0) -``` - -After calling our kernel, our DataFrame is now populated with the result. - -```{code-cell} ipython3 -df.head() -``` - -This API allows a you to theoretically write arbitrary kernel logic, potentially accessing and using elements of the series at arbitrary indices and use them on cuDF data structures. Advanced developers with some CUDA experience can often use this capability to implement iterative transformations, or spot treat problem areas of a data pipeline with a custom kernel that does the same job faster. 
- -+++ - -## DataFrame UDFs - -Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input: - -- `cudf.DataFrame.apply`, which functions like `pd.DataFrame.apply` and expects a row udf -- `cudf.DataFrame.apply_rows`, which is a thin wrapper around numba and expects a numba kernel -- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control. - -+++ - -### `cudf.DataFrame.apply` - -+++ - -`cudf.DataFrame.apply` is the main entrypoint for UDFs that expect multiple columns as input and produce a single output column. Functions intended to be consumed by this API are written in terms of a "row" argument. The "row" is considered to be like a dictionary and contains all of the column values at a certain `iloc` in a `DataFrame`. The function can access these values by key within the function, the keys being the column names corresponding to the desired value. Below is an example function that would be used to add column `A` and column `B` together inside a UDF. - -```{code-cell} ipython3 -def f(row): - return row['A'] + row['B'] -``` - -Let's create some very basic toy data containing at least one null. - -```{code-cell} ipython3 -df = cudf.DataFrame({ - 'A': [1,2,3], - 'B': [4,cudf.NA,6] -}) -df -``` - -Finally call the function as you would in pandas - by using a lambda function to map the UDF onto "rows" of the DataFrame: - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -The same function should produce the same result as pandas: - -```{code-cell} ipython3 -df.to_pandas(nullable=True).apply(f, axis=1) -``` - -Notice that Pandas returns `object` dtype - see notes on this in the caveats section. - -+++ - -Like `cudf.Series.apply`, these functions support generalized null handling. 
Here's a function that conditionally returns a different value if a certain input is null: - -```{code-cell} ipython3 -def f(row): - x = row['a'] - if x is cudf.NA: - return 0 - else: - return x + 1 - -df = cudf.DataFrame({'a': [1, cudf.NA, 3]}) -df -``` - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -`cudf.NA` can also be directly returned from a function resulting in data that has the the correct nulls in the end, just as if it were run in Pandas. For the following data, the last row fulfills the condition that `1 + 3 > 3` and returns `NA` for that row: - -```{code-cell} ipython3 -def f(row): - x = row['a'] - y = row['b'] - if x + y > 3: - return cudf.NA - else: - return x + y - -df = cudf.DataFrame({ - 'a': [1, 2, 3], - 'b': [2, 1, 1] -}) -df -``` - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -Mixed types are allowed, but will return the common type, rather than object as in Pandas. Here's a null aware op between an int and a float column: - -```{code-cell} ipython3 -def f(row): - return row['a'] + row['b'] - -df = cudf.DataFrame({ - 'a': [1, 2, 3], - 'b': [0.5, cudf.NA, 3.14] -}) -df -``` - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -Functions may also return scalar values, however the result will be promoted to a safe type regardless of the data. This means even if you have a function like: - -```python -def f(x): - if x > 1000: - return 1.5 - else: - return 2 -``` -And your data is: -```python -[1,2,3,4,5] -``` -You will get floats in the final data even though a float is never returned. This is because Numba ultimately needs to produce one function that can handle any data, which means if there's any possibility a float could result, you must always assume it will happen. 
Here's an example of a function that returns a scalar in some cases: - -```{code-cell} ipython3 -def f(row): - x = row['a'] - if x > 3: - return x - else: - return 1.5 - -df = cudf.DataFrame({ - 'a': [1, 3, 5] -}) -df -``` - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -Any number of columns and many arithmetic operators are supported, allowing for complex UDFs: - -```{code-cell} ipython3 -def f(row): - return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e'] - -df = cudf.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [cudf.NA, 4, 4], - 'd': [8, 7, 8], - 'e': [7, 1, 6] -}) -df -``` - -```{code-cell} ipython3 -df.apply(f, axis=1) -``` - -### Numba kernels for DataFrames - -+++ - - -We could apply a UDF on a DataFrame like we did above with `forall`. We'd need to write a kernel that expects multiple inputs, and pass multiple Series as arguments when we execute our kernel. Because this is fairly common and can be difficult to manage, cuDF provides two APIs to streamline this: `apply_rows` and `apply_chunks`. Below, we walk through an example of using `apply_rows`. `apply_chunks` works in a similar way, but also offers more control over low-level kernel behavior. - -Now that we have two numeric columns in our DataFrame, let's write a kernel that uses both of them. - -```{code-cell} ipython3 -def conditional_add(x, y, out): - for i, (a, e) in enumerate(zip(x, y)): - if a > 0: - out[i] = a + e - else: - out[i] = a -``` - -Notice that we need to `enumerate` through our `zipped` function arguments (which either match or are mapped to our input column names). We can pass this kernel to `apply_rows`. We'll need to specify a few arguments: -- incols - - A list of names of input columns that match the function arguments. Or, a dictionary mapping input column names to their corresponding function arguments such as `{'col1': 'arg1'}`. -- outcols - - A dictionary defining our output column names and their data types. 
These names must match our function arguments. -- kwargs (optional) - - We can optionally pass keyword arguments as a dictionary. Since we don't need any, we pass an empty one. - -While it looks like our function is looping sequentially through our columns, it actually executes in parallel in multiple threads on the GPU. This parallelism is the heart of GPU-accelerated computing. With that background, we're ready to use our UDF. - -```{code-cell} ipython3 -df = df.apply_rows(conditional_add, - incols={'a':'x', 'e':'y'}, - outcols={'out': np.float64}, - kwargs={} - ) -df.head() -``` - -As expected, we see our conditional addition worked. At this point, we've successfully executed UDFs on the core data structures of cuDF. - -+++ - -### Null Handling in `apply_rows` and `apply_chunks` - -By default, DataFrame methods for applying UDFs like `apply_rows` will handle nulls pessimistically (all rows with a null value will be removed from the output if they are used in the kernel). Exploring how not handling not pessimistically can lead to undefined behavior is outside the scope of this guide. Suffice it to say, pessimistic null handling is the safe and consistent approach. You can see an example below. - -```{code-cell} ipython3 -def gpu_add(a, b, out): - for i, (x, y) in enumerate(zip(a, b)): - out[i] = x + y - -df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12) -df.loc[2, 'a'] = None -df.loc[3, 'b'] = None -df.loc[1, 'c'] = None -df.head() -``` - -In the dataframe above, there are three null values. Each column has a null in a different row. When we use our UDF with `apply_rows`, our output should have two nulls due to pessimistic null handling (because we're not using column `c`, the null value there does not matter to us). - -```{code-cell} ipython3 -df = df.apply_rows(gpu_add, - incols=['a', 'b'], - outcols={'out':np.float64}, - kwargs={}) -df.head() -``` - -As expected, we end up with two nulls in our output. 
The null values from the columns we used propogated to our output, but the null from the column we ignored did not. - -+++ - -## Rolling Window UDFs - -For time-series data, we may need to operate on a small \"window\" of our column at a time, processing each portion independently. We could slide (\"roll\") this window over the entire column to answer questions like \"What is the 3-day moving average of a stock price over the past year?" - -We can apply more complex functions to rolling windows to `rolling` Series and DataFrames using `apply`. This example is adapted from cuDF's [API documentation](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.DataFrame.rolling.html). First, we'll create an example Series and then create a `rolling` object from the Series. - -```{code-cell} ipython3 -ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64') -ser -``` - -```{code-cell} ipython3 -rolling = ser.rolling(window=3, min_periods=3, center=False) -rolling -``` - -Next, we'll define a function to use on our rolling windows. We created this one to highlight how you can include things like loops, mathematical functions, and conditionals. Rolling window UDFs do not yet support null values. - -```{code-cell} ipython3 -import math - -def example_func(window): - b = 0 - for a in window: - b = max(b, math.sqrt(a)) - if b == 8: - return 100 - return b -``` - -We can execute the function by passing it to `apply`. With `window=3`, `min_periods=3`, and `center=False`, our first two values are `null`. - -```{code-cell} ipython3 -rolling.apply(example_func) -``` - -We can apply this function to every column in a DataFrame, too. 
- -```{code-cell} ipython3 -df2 = cudf.DataFrame() -df2['a'] = np.arange(55, 65, dtype='float64') -df2['b'] = np.arange(55, 65, dtype='float64') -df2.head() -``` - -```{code-cell} ipython3 -rolling = df2.rolling(window=3, min_periods=3, center=False) -rolling.apply(example_func) -``` - -## GroupBy DataFrame UDFs - -We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation](). - -First, we'll group our DataFrame based on column `b`, which is either True or False. - -```{code-cell} ipython3 -df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12) -df.head() -``` - -```{code-cell} ipython3 -grouped = df.groupby(['b']) -``` - -Next we'll define a function to apply to each group independently. In this case, we'll take the rolling average of column `e`, and call that new column `rolling_avg_e`. - -```{code-cell} ipython3 -def rolling_avg(e, rolling_avg_e): - win_size = 3 - for i in range(cuda.threadIdx.x, len(e), cuda.blockDim.x): - if i < win_size - 1: - # If there is not enough data to fill the window, - # take the average to be NaN - rolling_avg_e[i] = np.nan - else: - total = 0 - for j in range(i - win_size + 1, i + 1): - total += e[j] - rolling_avg_e[i] = total / win_size -``` - -We can execute this with a very similar API to `apply_rows`. This time, though, it's going to execute independently for each group. - -```{code-cell} ipython3 -results = grouped.apply_grouped(rolling_avg, - incols=['e'], - outcols=dict(rolling_avg_e=np.float64)) -results -``` - -Notice how, with a window size of three in the kernel, the first two values in each group for our output column are null. - -+++ - -## Numba Kernels on CuPy Arrays - -We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series. 
- -```{code-cell} ipython3 -import cupy as cp - -s = cudf.Series([1.0, 2, 3, 4, 10]) -arr = cp.asarray(s) -arr -``` - -Next, we define a UDF and execute it on our Series. We need to allocate a Series of the same size for our output, which we'll call `out`. - -```{code-cell} ipython3 -@cuda.jit -def multiply_by_5(x, out): - i = cuda.grid(1) - if i < x.size: - out[i] = x[i] * 5 - -out = cudf.Series(cp.zeros(len(s), dtype='int32')) -multiply_by_5.forall(s.shape[0])(s, out) -out -``` - -Finally, we execute the same function on our array. We allocate an empty array `out` to store our results. - -```{code-cell} ipython3 -out = cp.empty_like(arr) -multiply_by_5.forall(arr.size)(arr, out) -out -``` - -## Caveats - -+++ - -- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`. -- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented. - -+++ - -## Summary - -This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on - -- Series -- DataFrame -- Rolling Windows -- GroupBy DataFrames -- CuPy NDArrays -- Numba DeviceNDArrays -- Generalized NA UDFs - - -For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.pydata.org/numba-doc/dev/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation. 
From bb87e732ceeabc100341f23895372580c9cb7bda Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 13:08:55 -0400 Subject: [PATCH 10/14] Add more Pandas comparison notes --- .../source/user_guide/pandas-comparison.rst | 58 ++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/user_guide/pandas-comparison.rst b/docs/cudf/source/user_guide/pandas-comparison.rst index 0b17c03642f..a7fb2f75f15 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.rst +++ b/docs/cudf/source/user_guide/pandas-comparison.rst @@ -7,6 +7,18 @@ there are some differences between cuDF and Pandas, both in terms API and behavior. This page documents the similarities and differences between cuDF and Pandas. +Supported operations +-------------------- + +cuDF supports many of the same data structures and operations as +Pandas. This includes ``Series``, ``DataFrame``, ``Index`` and +operations on them such as unary and binary operations, indexing, +filtering, concatenating, joining, groupby and window operations - +among many others. + +The best way to see if we support a particular Pandas API is to search +our `API docs `_. + Data types ---------- @@ -19,6 +31,44 @@ details. Note that we do not support custom data types like Pandas' ``ExtensionDtype``. +Null (or "missing") values +-------------------------- + +Unlike Pandas, *all* data types in cuDF are nullable, +meaning they can contain missing values (represented by ``cudf.NA``). + +.. code:: python + >>> s = cudf.Series([1, 2, cudf.NA]) + >>> s + >>> s + 0 1 + 1 2 + 2 + dtype: int64 + +Nulls are not coerced to ``nan`` in any situation; +compare the behaviour of cuDF with Pandas below: + +.. 
code:: python + >>> s = cudf.Series([1, 2, cudf.NA], dtype="category") + >>> s + 0 1 + 1 2 + 2 + dtype: category + Categories (2, int64): [1, 2] + + >>> s = pd.Series([1, 2, pd.NA], dtype="category") + >>> s + 0 1 + 1 2 + 2 NaN + dtype: category + Categories (2, int64): [1, 2] + +See our :doc:`docs on missing data` +for details. + Result ordering --------------- @@ -94,10 +144,4 @@ value of a ``Series``, ``DataFrame``, or in the case of a groupby, each group. cuDF also supports ``apply()``, but it relies on Numba to JIT compile the UDF and execute it on the GPU. This can be extremely fast, but imposes a few limitations on what operations are allowed in -the UDF. See our :doc:`UDF docs ` for details. - -How to check if a particular Pandas feature is available in cuDF? ------------------------------------------------------------------ - -The best way to see if we support a particular feature is to search -our `API docs `_. +the UDF. See our :doc:`UDF docs ` for details. From 3f9fcf1c8146270695d8e5f8ad4bfe07377d878c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 13:42:44 -0400 Subject: [PATCH 11/14] Run notebooks --- docs/cudf/source/user_guide/10min.ipynb | 5076 ++++++++++++++++- .../Working-with-missing-data.ipynb | 2639 ++++++++- .../cudf/source/user_guide/cupy-interop.ipynb | 1071 +++- .../source/user_guide/guide-to-udfs.ipynb | 1456 ++++- 4 files changed, 9686 insertions(+), 556 deletions(-) diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index e9ded302874..02e1ba40f1f 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "92eed4cb", "metadata": {}, "outputs": [], @@ -64,10 +64,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "cf8b08e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 4\n", + "dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s = cudf.Series([1,2,3,None,4])\n", "s" @@ -75,10 +91,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "083a5898", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 4\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2) \n", "ds.compute()" @@ -94,10 +126,190 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "83d1e7f5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", - " Comm: tcp://127.0.0.1:41341\n", + " Comm: tcp://127.0.0.1:44033\n", " \n", " Total threads: 1\n", @@ -6166,31 +6201,31 @@ "
\n", - " Dashboard: http://127.0.0.1:39963/status\n", + " Dashboard: http://127.0.0.1:45225/status\n", " \n", - " Memory: 22.89 GiB\n", + " Memory: 62.82 GiB\n", "
\n", - " Nanny: tcp://127.0.0.1:33675\n", + " Nanny: tcp://127.0.0.1:46529\n", "
\n", - " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-phx0wjv_\n", + " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-zlsacw8_\n", "
\n", - " GPU: Quadro GV100\n", + " GPU: NVIDIA RTX A6000\n", " \n", - " GPU memory: 31.74 GiB\n", + " GPU memory: 47.54 GiB\n", "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
22172
33163
44154
55145
66136
77127
88118
99109
1010910
1111811
1212712
1313613
1414514
1515415
1616316
1717217
1818118
1919019
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3\n", + "4 4 15 4\n", + "5 5 14 5\n", + "6 6 13 6\n", + "7 7 12 7\n", + "8 8 11 8\n", + "9 9 10 9\n", + "10 10 9 10\n", + "11 11 8 11\n", + "12 12 7 12\n", + "13 13 6 13\n", + "14 14 5 14\n", + "15 15 4 15\n", + "16 16 3 16\n", + "17 17 2 17\n", + "18 18 1 18\n", + "19 19 0 19" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = cudf.DataFrame({'a': list(range(20)),\n", " 'b': list(reversed(range(20))),\n", @@ -108,10 +320,190 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "71b61d62", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
22172
33163
44154
55145
66136
77127
88118
99109
1010910
1111811
1212712
1313613
1414514
1515415
1616316
1717217
1818118
1919019
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3\n", + "4 4 15 4\n", + "5 5 14 5\n", + "6 6 13 6\n", + "7 7 12 7\n", + "8 8 11 8\n", + "9 9 10 9\n", + "10 10 9 10\n", + "11 11 8 11\n", + "12 12 7 12\n", + "13 13 6 13\n", + "14 14 5 14\n", + "15 15 4 15\n", + "16 16 3 16\n", + "17 17 2 17\n", + "18 18 1 18\n", + "19 19 0 19" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf = dask_cudf.from_cudf(df, npartitions=2) \n", "ddf.compute()" @@ -129,60 +521,294 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "07a62244", "metadata": {}, - "outputs": [], - "source": [ - "pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})\n", - "gdf = cudf.DataFrame.from_pandas(pdf)\n", - "gdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5cb0c65", - "metadata": {}, - "outputs": [], - "source": [ - "dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", - "dask_gdf.compute()" - ] - }, - { - "cell_type": "markdown", - "id": "025eac40", - "metadata": {}, - "source": [ - "Viewing Data\n", - "-------------" - ] - }, - { - "cell_type": "markdown", - "id": "47a567e8", - "metadata": {}, - "source": [ - "Viewing the top rows of a GPU dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab8cbdb8", - "metadata": {}, - "outputs": [], - "source": [ - "df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
000.1
110.2
22<NA>
330.3
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 0 0.1\n", + "1 1 0.2\n", + "2 2 \n", + "3 3 0.3" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})\n", + "gdf = cudf.DataFrame.from_pandas(pdf)\n", + "gdf" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f5cb0c65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
000.1
110.2
22<NA>
330.3
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 0 0.1\n", + "1 1 0.2\n", + "2 2 \n", + "3 3 0.3" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", + "dask_gdf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "025eac40", + "metadata": {}, + "source": [ + "Viewing Data\n", + "-------------" + ] + }, + { + "cell_type": "markdown", + "id": "47a567e8", + "metadata": {}, + "source": [ + "Viewing the top rows of a GPU dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ab8cbdb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "id": "2e923d8a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.head(2)" ] @@ -197,20 +823,380 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "512770f9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1919019
1818118
1717217
1616316
1515415
1414514
1313613
1212712
1111811
1010910
99109
88118
77127
66136
55145
44154
33163
22172
11181
00190
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "19 19 0 19\n", + "18 18 1 18\n", + "17 17 2 17\n", + "16 16 3 16\n", + "15 15 4 15\n", + "14 14 5 14\n", + "13 13 6 13\n", + "12 12 7 12\n", + "11 11 8 11\n", + "10 10 9 10\n", + "9 9 10 9\n", + "8 8 11 8\n", + "7 7 12 7\n", + "6 6 13 6\n", + "5 5 14 5\n", + "4 4 15 4\n", + "3 3 16 3\n", + "2 2 17 2\n", + "1 1 18 1\n", + "0 0 19 0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.sort_values(by='b')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "1a13993f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1919019
1818118
1717217
1616316
1515415
1414514
1313613
1212712
1111811
1010910
99109
88118
77127
66136
55145
44154
33163
22172
11181
00190
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "19 19 0 19\n", + "18 18 1 18\n", + "17 17 2 17\n", + "16 16 3 16\n", + "15 15 4 15\n", + "14 14 5 14\n", + "13 13 6 13\n", + "12 12 7 12\n", + "11 11 8 11\n", + "10 10 9 10\n", + "9 9 10 9\n", + "8 8 11 8\n", + "7 7 12 7\n", + "6 6 13 6\n", + "5 5 14 5\n", + "4 4 15 4\n", + "3 3 16 3\n", + "2 2 17 2\n", + "1 1 18 1\n", + "0 0 19 0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.sort_values(by='b').compute()" ] @@ -236,20 +1222,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "885989a6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4\n", + "5 5\n", + "6 6\n", + "7 7\n", + "8 8\n", + "9 9\n", + "10 10\n", + "11 11\n", + "12 12\n", + "13 13\n", + "14 14\n", + "15 15\n", + "16 16\n", + "17 17\n", + "18 18\n", + "19 19\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df['a']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "14a74255", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4\n", + "5 5\n", + "6 6\n", + "7 7\n", + "8 8\n", + "9 9\n", + "10 10\n", + "11 11\n", + "12 12\n", + "13 13\n", + "14 14\n", + "15 15\n", + "16 16\n", + "17 17\n", + "18 18\n", + "19 19\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf['a'].compute()" ] @@ -272,20 +1320,146 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "d40bc19c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
2217
3316
4415
5514
\n", + "
" + ], + "text/plain": [ + " a b\n", + "2 2 17\n", + "3 3 16\n", + "4 4 15\n", + "5 5 14" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.loc[2:5, ['a', 'b']]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "7688535b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
2217
3316
4415
5514
\n", + "
" + ], + "text/plain": [ + " a b\n", + "2 2 17\n", + "3 3 16\n", + "4 4 15\n", + "5 5 14" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.loc[2:5, ['a', 'b']].compute()" ] @@ -308,20 +1482,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "fb8d6d43", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "a 0\n", + "b 19\n", + "c 0\n", + "Name: 0, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.iloc[0]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "263231da", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
0019
1118
2217
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 0 19\n", + "1 1 18\n", + "2 2 17" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.iloc[0:3, 0:2]" ] @@ -336,20 +1581,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "13f6158b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
33163
44154
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "3 3 16 3\n", + "4 4 15 4" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df[3:5]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "3cf4aa26", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "3 \n", + "4 4\n", + "dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s[3:5]" ] @@ -372,20 +1684,156 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "becb916f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
22172
33163
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df[df.b > 15]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "b9475c43", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
11181
22172
33163
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf[ddf.b > 15].compute()" ] @@ -400,20 +1848,114 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "fc2fc9f9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1616316
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "16 16 3 16" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.query(\"b == 3\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "1a05a07f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1616316
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "16 16 3 16" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.query(\"b == 3\").compute()" ] @@ -428,10 +1970,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "49485a4b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1616316
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "16 16 3 16" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf_comparator = 3\n", "df.query(\"b == @cudf_comparator\")" @@ -439,10 +2028,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "0f3a9116", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
1616316
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "16 16 3 16" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dask_cudf_comparator = 3\n", "ddf.query(\"b == @val\", local_dict={'val':dask_cudf_comparator}).compute()" @@ -458,10 +2094,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "f44a5a57", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00190
55145
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 19 0\n", + "5 5 14 5" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df[df.a.isin([0, 5])]" ] @@ -484,10 +2174,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "882973ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex([('a', 1),\n", + " ('a', 2),\n", + " ('b', 3),\n", + " ('b', 4)],\n", + " )" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]]\n", "tuples = list(zip(*arrays))\n", @@ -505,10 +2210,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "5417aeb9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
firstsecond
a10.0826540.967955
20.3994170.441425
b30.7842970.793582
40.0703030.271711
\n", + "
" + ], + "text/plain": [ + " first second\n", + "a 1 0.082654 0.967955\n", + " 2 0.399417 0.441425\n", + "b 3 0.784297 0.793582\n", + " 4 0.070303 0.271711" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)})\n", "gdf1.index = idx\n", @@ -517,10 +2288,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "4d6fb4ff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
1234
first0.3433820.0037000.200430.581614
second0.9078120.1015120.241790.224180
\n", + "
" + ], + "text/plain": [ + " a b \n", + " 1 2 3 4\n", + "first 0.343382 0.003700 0.20043 0.581614\n", + "second 0.907812 0.101512 0.24179 0.224180" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T\n", "gdf2.columns = idx\n", @@ -537,10 +2371,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "3644920c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "first 0.784297\n", + "second 0.793582\n", + "Name: ('b', 3), dtype: float64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf1.loc[('b', 3)]" ] @@ -564,20 +2411,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "28b06c52", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 999\n", + "4 4\n", + "dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s.fillna(999)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "7fb6a126", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 999\n", + "4 4\n", + "dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds.fillna(999).compute()" ] @@ -609,20 +2488,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "f7cb604e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(2.5, 1.666666666666666)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s.mean(), s.var()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
34, "id": "b8957a5f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(2.5, 1.6666666666666667)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds.mean().compute(), ds.var().compute()" ] @@ -646,13 +2547,14 @@ { "cell_type": "code", "execution_count": 35, + "id": "5e627811", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/series.py:2223: FutureWarning: Series.applymap is deprecated and will be removed in a future cuDF release. Use Series.apply instead.\n", + "/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/core/series.py:2223: FutureWarning: Series.applymap is deprecated and will be removed in a future cuDF release. Use Series.apply instead.\n", " warnings.warn(\n" ] }, @@ -696,10 +2598,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "96cf628e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 10\n", + "1 11\n", + "2 12\n", + "3 13\n", + "4 14\n", + "5 15\n", + "6 16\n", + "7 17\n", + "8 18\n", + "9 19\n", + "10 20\n", + "11 21\n", + "12 22\n", + "13 23\n", + "14 24\n", + "15 25\n", + "16 26\n", + "17 27\n", + "18 28\n", + "19 29\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf['a'].map_partitions(add_ten).compute()" ] @@ -722,20 +2655,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "62808675", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "15 1\n", + "6 1\n", + "1 1\n", + "14 1\n", + "2 1\n", + "5 1\n", + "11 1\n", + "7 1\n", + "17 1\n", + "13 1\n", + "8 1\n", + "16 1\n", + "0 1\n", + "10 1\n", + "4 1\n", + "9 1\n", + "19 1\n", + "18 1\n", + "3 1\n", + "12 1\n", + "Name: a, dtype: int32" + ] + }, + 
"execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.a.value_counts()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "5b2a42ce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "15 1\n", + "6 1\n", + "1 1\n", + "14 1\n", + "2 1\n", + "5 1\n", + "11 1\n", + "7 1\n", + "17 1\n", + "13 1\n", + "8 1\n", + "16 1\n", + "0 1\n", + "10 1\n", + "4 1\n", + "9 1\n", + "19 1\n", + "18 1\n", + "3 1\n", + "12 1\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.a.value_counts().compute()" ] @@ -758,10 +2753,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "c73e70bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 a\n", + "1 b\n", + "2 c\n", + "3 aaba\n", + "4 baca\n", + "5 \n", + "6 caba\n", + "7 dog\n", + "8 cat\n", + "dtype: object" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'])\n", "s.str.lower()" @@ -769,10 +2784,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "697c1c94", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 a\n", + "1 b\n", + "2 c\n", + "3 aaba\n", + "4 baca\n", + "5 \n", + "6 caba\n", + "7 dog\n", + "8 cat\n", + "dtype: object" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "ds.str.lower().compute()" @@ -796,10 +2831,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "60538bbd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + 
"0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + "dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s = cudf.Series([1, 2, 3, None, 5])\n", "cudf.concat([s, s])" @@ -807,10 +2863,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "17953847", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + "dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds2 = dask_cudf.from_cudf(s, npartitions=2)\n", "dask_cudf.concat([ds2, ds2]).compute()" @@ -834,10 +2911,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "52ada00a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keyvals_avals_b
0a10.0100.0
1c12.0101.0
2e14.0102.0
3b11.0<NA>
4d13.0<NA>
\n", + "
" + ], + "text/plain": [ + " key vals_a vals_b\n", + "0 a 10.0 100.0\n", + "1 c 12.0 101.0\n", + "2 e 14.0 102.0\n", + "3 b 11.0 \n", + "4 d 13.0 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_a = cudf.DataFrame()\n", "df_a['key'] = ['a', 'b', 'c', 'd', 'e']\n", @@ -853,10 +3005,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "409fcf92", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keyvals_avals_b
0a10.0100.0
1c12.0101.0
2b11.0<NA>
0e14.0102.0
1d13.0<NA>
\n", + "
" + ], + "text/plain": [ + " key vals_a vals_b\n", + "0 a 10.0 100.0\n", + "1 c 12.0 101.0\n", + "2 b 11.0 \n", + "0 e 14.0 102.0\n", + "1 d 13.0 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf_a = dask_cudf.from_cudf(df_a, npartitions=2)\n", "ddf_b = dask_cudf.from_cudf(df_b, npartitions=2)\n", @@ -884,13 +3111,14 @@ { "cell_type": "code", "execution_count": 45, + "id": "9976c1ce", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/indexed_frame.py:2329: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", + "/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/core/indexed_frame.py:2329: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", " warnings.warn(\n" ] }, @@ -921,10 +3149,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "fe5c54ab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 5\n", + "dtype: int64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds2.append(ds2).compute()" ] @@ -947,7 +3196,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "2a8cafa7", "metadata": {}, "outputs": [], @@ -968,20 +3217,150 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "id": "7c56d186", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col2
agg_col1
190100904
0100901003
\n", + "
" + ], + "text/plain": [ + " a b c agg_col2\n", + "agg_col1 \n", + "1 90 100 90 4\n", + "0 100 90 100 3" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.groupby('agg_col1').sum()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "f8823b30", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col2
agg_col1
190100904
0100901003
\n", + "
" + ], + "text/plain": [ + " a b c agg_col2\n", + "agg_col1 \n", + "1 90 100 90 4\n", + "0 100 90 100 3" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.groupby('agg_col1').sum().compute()" ] @@ -996,20 +3375,182 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "2184e3ad", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
agg_col1agg_col2
10546054
00736073
11364036
01273027
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "agg_col1 agg_col2 \n", + "1 0 54 60 54\n", + "0 0 73 60 73\n", + "1 1 36 40 36\n", + "0 1 27 30 27" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.groupby(['agg_col1', 'agg_col2']).sum()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "4ec311c1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
agg_col1agg_col2
11364036
00736073
10546054
01273027
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "agg_col1 agg_col2 \n", + "1 1 36 40 36\n", + "0 0 73 60 73\n", + "1 0 54 60 54\n", + "0 1 27 30 27" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.groupby(['agg_col1', 'agg_col2']).sum().compute()" ] @@ -1024,20 +3565,142 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "2563d8b2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
agg_col1
11810.090
0199.0100
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "agg_col1 \n", + "1 18 10.0 90\n", + "0 19 9.0 100" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'})" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "id": "22c77e75", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
agg_col1
11810.090
0199.0100
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "agg_col1 \n", + "1 18 10.0 90\n", + "0 19 9.0 100" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute()" ] @@ -1060,10 +3723,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "e265861e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
014
125
236
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 4\n", + "1 2 5\n", + "2 3 6" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n", "sample" @@ -1071,10 +3791,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "1fe9b972", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
a123
b456
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "a 1 2 3\n", + "b 4 5 6" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sample.transpose()" ] @@ -1098,10 +3872,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "7a425d3f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datevalue
02018-11-200.986051
12018-11-210.232034
22018-11-220.397617
32018-11-230.103839
\n", + "
" + ], + "text/plain": [ + " date value\n", + "0 2018-11-20 0.986051\n", + "1 2018-11-21 0.232034\n", + "2 2018-11-22 0.397617\n", + "3 2018-11-23 0.103839" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import datetime as dt\n", "\n", @@ -1115,10 +3952,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "id": "87f0e56e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datevalue
02018-11-200.986051
12018-11-210.232034
22018-11-220.397617
32018-11-230.103839
\n", + "
" + ], + "text/plain": [ + " date value\n", + "0 2018-11-20 0.986051\n", + "1 2018-11-21 0.232034\n", + "2 2018-11-22 0.397617\n", + "3 2018-11-23 0.103839" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "date_ddf = dask_cudf.from_cudf(date_df, npartitions=2)\n", "date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute()" @@ -1143,10 +4043,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "05bd8be8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgrade
01a
12b
23b
34a
45a
56e
\n", + "
" + ], + "text/plain": [ + " id grade\n", + "0 1 a\n", + "1 2 b\n", + "2 3 b\n", + "3 4 a\n", + "4 5 a\n", + "5 6 e" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf = cudf.DataFrame({\"id\": [1, 2, 3, 4, 5, 6], \"grade\":['a', 'b', 'b', 'a', 'a', 'e']})\n", "gdf['grade'] = gdf['grade'].astype('category')\n", @@ -1155,10 +4130,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "id": "676b4963", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgrade
01a
12b
23b
34a
45a
56e
\n", + "
" + ], + "text/plain": [ + " id grade\n", + "0 1 a\n", + "1 2 b\n", + "2 3 b\n", + "3 4 a\n", + "4 5 a\n", + "5 6 e" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dgdf = dask_cudf.from_cudf(gdf, npartitions=2)\n", "dgdf.compute()" @@ -1174,10 +4224,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "06310c36", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "StringIndex(['a' 'b' 'e'], dtype='object')" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf.grade.cat.categories" ] @@ -1192,20 +4253,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "id": "0f6db260", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 1\n", + "2 1\n", + "3 0\n", + "4 0\n", + "5 2\n", + "dtype: uint8" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gdf.grade.cat.codes" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "id": "b87c4375", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 1\n", + "2 1\n", + "3 0\n", + "4 0\n", + "5 2\n", + "dtype: uint8" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dgdf.grade.cat.codes.compute()" ] @@ -1237,20 +4332,194 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "d1fed919", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.head().to_pandas()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "567c7363", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.compute().head().to_pandas()" ] @@ -1273,20 +4542,80 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "5490d226", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 19, 0, 1, 1],\n", + " [ 1, 18, 1, 0, 0],\n", + " [ 2, 17, 2, 1, 0],\n", + " [ 3, 16, 3, 0, 1],\n", + " [ 4, 15, 4, 1, 0],\n", + " [ 5, 14, 5, 0, 0],\n", + " [ 6, 13, 6, 1, 1],\n", + " [ 7, 12, 7, 0, 0],\n", + " [ 8, 11, 8, 1, 0],\n", + " [ 9, 10, 9, 0, 1],\n", + " [10, 9, 10, 1, 0],\n", + " [11, 8, 11, 0, 0],\n", + " [12, 7, 12, 1, 1],\n", + " [13, 6, 13, 0, 0],\n", + " [14, 5, 14, 1, 0],\n", + " [15, 4, 15, 0, 1],\n", + " [16, 3, 16, 1, 0],\n", + " [17, 2, 17, 0, 0],\n", + " [18, 1, 18, 1, 1],\n", + " [19, 0, 19, 0, 0]])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.to_numpy()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "b77ac8ae", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 19, 0, 1, 1],\n", + " [ 1, 18, 1, 0, 0],\n", + " [ 2, 17, 2, 1, 0],\n", + " [ 3, 16, 3, 0, 1],\n", + " [ 4, 15, 4, 1, 0],\n", + " [ 5, 14, 5, 0, 0],\n", + " [ 6, 13, 6, 1, 1],\n", + " [ 7, 12, 7, 0, 0],\n", + " [ 8, 11, 8, 1, 0],\n", + " [ 9, 10, 9, 0, 1],\n", + " [10, 9, 10, 1, 0],\n", + " [11, 8, 11, 0, 0],\n", + " [12, 7, 12, 1, 1],\n", + " [13, 6, 13, 0, 0],\n", + " [14, 5, 14, 1, 0],\n", + " [15, 4, 15, 0, 1],\n", + " [16, 3, 16, 1, 0],\n", + " [17, 2, 17, 0, 0],\n", + " [18, 1, 18, 1, 1],\n", + " [19, 0, 19, 0, 0]])" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ 
"ddf.compute().to_numpy()" ] @@ -1301,20 +4630,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "id": "f71a0ba3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", + " 17, 18, 19])" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df['a'].to_numpy()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "id": "a45a74b5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", + " 17, 18, 19])" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf['a'].compute().to_numpy()" ] @@ -1337,20 +4690,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "id": "bb9e9a2a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: int64\n", + "c: int64\n", + "agg_col1: int64\n", + "agg_col2: int64\n", + "----\n", + "a: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", + "b: [[19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]]\n", + "c: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", + "agg_col1: [[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]]\n", + "agg_col2: [[1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0]]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.to_arrow()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "id": "4d020de7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: int64\n", + "c: int64\n", + "agg_col1: int64\n", + "agg_col2: int64\n", + "----\n", + "a: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", + 
"b: [[19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]]\n", + "c: [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]]\n", + "agg_col1: [[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]]\n", + "agg_col2: [[1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0]]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.compute().to_arrow()" ] @@ -1382,7 +4779,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "id": "3a59715f", "metadata": {}, "outputs": [], @@ -1395,7 +4792,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "id": "4ebe98ed", "metadata": {}, "outputs": [], @@ -1413,10 +4810,232 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "id": "1a70e831", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", + "10 10 9 10 1 0\n", + "11 11 8 11 0 0\n", + "12 12 7 12 1 1\n", + "13 13 6 13 0 0\n", + "14 14 5 14 1 0\n", + "15 15 4 15 0 1\n", + "16 16 3 16 1 0\n", + "17 17 2 17 0 0\n", + "18 18 1 18 1 1\n", + "19 19 0 19 0 0" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = cudf.read_csv('example_output/foo.csv')\n", "df" @@ -1424,10 +5043,232 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "id": "4c3d9ca3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", + "10 10 9 10 1 0\n", + "11 11 8 11 0 0\n", + "12 12 7 12 1 1\n", + "13 13 6 13 0 0\n", + "14 14 5 14 1 0\n", + "15 15 4 15 0 1\n", + "16 16 3 16 1 0\n", + "17 17 2 17 0 0\n", + "18 18 1 18 1 1\n", + "19 19 0 19 0 0" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf = dask_cudf.read_csv('example_output/foo_dask.csv')\n", "ddf.compute()" @@ -1443,10 +5284,412 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "id": "cb7187d2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", + "10 10 9 10 1 0\n", + "11 11 8 11 0 0\n", + "12 12 7 12 1 1\n", + "13 13 6 13 0 0\n", + "14 14 5 14 1 0\n", + "15 15 4 15 0 1\n", + "16 16 3 16 1 0\n", + "17 17 2 17 0 0\n", + "18 18 1 18 1 1\n", + "19 19 0 19 0 0\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", + "10 10 9 10 1 0\n", + "11 11 8 11 0 0\n", + "12 12 7 12 1 1\n", + "13 13 6 13 0 0\n", + "14 14 5 14 1 0\n", + "15 15 4 15 0 1\n", + "16 16 3 16 1 0\n", + "17 17 2 17 0 0\n", + "18 18 1 18 1 1\n", + "19 19 0 19 0 0" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf = dask_cudf.read_csv('example_output/*.csv')\n", "ddf.compute()" @@ -1470,7 +5713,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "id": "1812346f", "metadata": {}, "outputs": [], @@ -1488,10 +5731,232 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "id": "2354b20b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcagg_col1agg_col2
0019011
1118100
2217210
3316301
4415410
5514500
6613611
7712700
8811810
9910901
101091010
111181100
121271211
131361300
141451410
151541501
161631610
171721700
181811811
191901900
\n", + "
" + ], + "text/plain": [ + " a b c agg_col1 agg_col2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", + "10 10 9 10 1 0\n", + "11 11 8 11 0 0\n", + "12 12 7 12 1 1\n", + "13 13 6 13 0 0\n", + "14 14 5 14 1 0\n", + "15 15 4 15 0 1\n", + "16 16 3 16 1 0\n", + "17 17 2 17 0 0\n", + "18 18 1 18 1 1\n", + "19 19 0 19 0 0" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = cudf.read_parquet('example_output/temp_parquet')\n", "df" @@ -1507,10 +5972,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "id": "c5d7686c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(None,)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ddf.to_parquet('example_files') " ] @@ -1533,16 +6009,17 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 79, + "id": "93364ff3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/home/mmccarty/sandbox/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" + "'/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" ] }, - "execution_count": 80, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1558,7 +6035,8 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 80, + "id": "2b6785c7", "metadata": {}, "outputs": [ { @@ -1649,7 +6127,7 @@ "1 [{'key': 'chani', 'value': {'int1': 5, 'string... 
" ] }, - "execution_count": 81, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -1684,17 +6162,16 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 81, + "id": "e4852d48", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ghcx5g0e', purging\n", - "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-wh16f0h3', purging\n", - "2022-04-21 10:11:07,360 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", - "2022-04-21 10:11:07,388 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" + "2022-04-21 13:26:06,860 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", + "2022-04-21 13:26:06,904 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" ] }, { @@ -1704,7 +6181,7 @@ "
\n", "
\n", "

Client

\n", - "

Client-e3492c89-c17c-11ec-813e-fc3497a62adc

\n", + "

Client-20d00fd5-c198-11ec-906c-c8d9d2247354

\n", " \n", "\n", " \n", @@ -1733,7 +6210,7 @@ " \n", "
\n", "

LocalCUDACluster

\n", - "

db2501e1

\n", + "

47648c26

\n", "
\n", " \n", " \n", " \n", " \n", " \n", @@ -1770,11 +6247,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-6f476508-e52f-49e9-8f1f-6a8641e177bd

\n", + "

Scheduler-f28bff16-cb70-452c-b8af-b9299a8d7b20

\n", "
\n", @@ -1748,7 +6225,7 @@ " Total threads: 2\n", " \n", - " Total memory: 125.65 GiB\n", + " Total memory: 45.79 GiB\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:39755\n", + " Comm: tcp://127.0.0.1:33995\n", " \n", " Workers: 2\n", @@ -1793,7 +6270,7 @@ " Started: Just now\n", " \n", - " Total memory: 125.65 GiB\n", + " Total memory: 45.79 GiB\n", "
\n", @@ -1816,7 +6293,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1870,7 +6347,7 @@ "
\n", - " Comm: tcp://127.0.0.1:33491\n", + " Comm: tcp://127.0.0.1:40479\n", " \n", " Total threads: 1\n", @@ -1824,31 +6301,31 @@ "
\n", - " Dashboard: http://127.0.0.1:34333/status\n", + " Dashboard: http://127.0.0.1:38985/status\n", " \n", - " Memory: 62.82 GiB\n", + " Memory: 22.89 GiB\n", "
\n", - " Nanny: tcp://127.0.0.1:43093\n", + " Nanny: tcp://127.0.0.1:33447\n", "
\n", - " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-jsuvfju4\n", + " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-be7zg92w\n", "
\n", - " GPU: NVIDIA RTX A6000\n", + " GPU: Quadro GV100\n", " \n", - " GPU memory: 47.51 GiB\n", + " GPU memory: 31.75 GiB\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1928,10 +6405,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 82, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -1958,7 +6435,8 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 82, + "id": "d47a1142", "metadata": {}, "outputs": [ { @@ -2034,7 +6512,7 @@ "" ] }, - "execution_count": 83, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -2050,45 +6528,37 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 83, + "id": "c3cb612a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Thu Apr 21 10:11:07 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. 
|\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", - "| 30% 48C P2 83W / 300W | 2970MiB / 48651MiB | 7% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", - "| 30% 36C P2 25W / 300W | 265MiB / 48685MiB | 5% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", - "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", - "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", - "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", - "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", - "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", - "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", - "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", - "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", - "+-----------------------------------------------------------------------------+\n" + "Thu Apr 21 13:26:07 2022 \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", + "|-------------------------------+----------------------+----------------------+\r\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", + "| | | MIG M. 
|\r\n", + "|===============================+======================+======================|\r\n", + "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\r\n", + "| 39% 52C P2 51W / 250W | 1115MiB / 32508MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\r\n", + "| 43% 57C P2 52W / 250W | 306MiB / 32498MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + " \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| Processes: |\r\n", + "| GPU GI CI PID Type Process name GPU Memory |\r\n", + "| ID ID Usage |\r\n", + "|=============================================================================|\r\n", + "+-----------------------------------------------------------------------------+\r\n" ] } ], @@ -2106,7 +6576,8 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 84, + "id": "a929577c", "metadata": {}, "outputs": [ { @@ -2182,7 +6653,7 @@ "" ] }, - "execution_count": 85, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -2194,45 +6665,37 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 85, + "id": "8aa7c079", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Thu Apr 21 10:11:08 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. 
|\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", - "| 30% 48C P2 84W / 300W | 2970MiB / 48651MiB | 3% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", - "| 30% 36C P2 37W / 300W | 265MiB / 48685MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", - "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", - "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", - "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", - "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", - "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", - "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", - "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", - "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", - "+-----------------------------------------------------------------------------+\n" + "Thu Apr 21 13:26:08 2022 \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", + "|-------------------------------+----------------------+----------------------+\r\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", + "| | | MIG M. 
|\r\n", + "|===============================+======================+======================|\r\n", + "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\r\n", + "| 39% 52C P2 52W / 250W | 1115MiB / 32508MiB | 3% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\r\n", + "| 43% 57C P2 51W / 250W | 306MiB / 32498MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + " \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| Processes: |\r\n", + "| GPU GI CI PID Type Process name GPU Memory |\r\n", + "| ID ID Usage |\r\n", + "|=============================================================================|\r\n", + "+-----------------------------------------------------------------------------+\r\n" ] } ], @@ -2261,7 +6724,8 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 86, + "id": "ef71bf00", "metadata": {}, "outputs": [], "source": [ @@ -2287,7 +6751,8 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 87, + "id": "700dd799", "metadata": {}, "outputs": [], "source": [ @@ -2305,16 +6770,17 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 88, + "id": "73bccf94", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" + "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" ] }, - "execution_count": 89, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -2328,7 +6794,7 @@ "id": "447301f5", "metadata": {}, "source": [ - "## With `wait`, we can safely proceed on in our workflow." + "With `wait`, we can safely proceed on in our workflow." 
] }, { @@ -2345,6 +6811,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb index b261ebe785e..e57aec25fed 100644 --- a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb +++ b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "58050adb", "metadata": {}, "outputs": [], @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "416d73da", "metadata": {}, "outputs": [], @@ -75,30 +75,171 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5dfc6bc3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", - " Comm: tcp://127.0.0.1:44033\n", + " Comm: tcp://127.0.0.1:40519\n", " \n", " Total threads: 1\n", @@ -1878,31 +6355,31 @@ "
\n", - " Dashboard: http://127.0.0.1:45225/status\n", + " Dashboard: http://127.0.0.1:40951/status\n", " \n", - " Memory: 62.82 GiB\n", + " Memory: 22.89 GiB\n", "
\n", - " Nanny: tcp://127.0.0.1:46529\n", + " Nanny: tcp://127.0.0.1:39133\n", "
\n", - " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-zlsacw8_\n", + " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-3v0c20ux\n", "
\n", - " GPU: NVIDIA RTX A6000\n", + " GPU: Quadro GV100\n", " \n", - " GPU memory: 47.54 GiB\n", + " GPU memory: 31.74 GiB\n", "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
010.1
12<NA>
2<NA>2.3
3417.17
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 0.1\n", + "1 2 \n", + "2 2.3\n", + "3 4 17.17" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "4d7f7a6d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
0FalseFalse
1FalseTrue
2TrueFalse
3FalseFalse
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 False False\n", + "1 False True\n", + "2 True False\n", + "3 False False" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.isna()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "40edca67", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 True\n", + "Name: a, dtype: bool" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df['a'].notna()" ] @@ -113,20 +254,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "c269c1f5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "None == None" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "99fb083a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "np.nan == np.nan" ] @@ -141,17 +304,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "630ef6bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 \n", + "2 False\n", + "3 False\n", + "Name: b, dtype: bool" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df['b'] == np.nan" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "8162e383", "metadata": {}, "outputs": [], @@ -161,27 +339,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "199775b3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ 
+ "0 \n", + "1 1\n", + "2 2\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "cd09d80c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 \n", + "1 \n", + "2 \n", + "dtype: bool" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s == None" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "6b23bb0c", "metadata": {}, "outputs": [], @@ -191,20 +397,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "cafb79ee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.0\n", + "1 2.0\n", + "2 NaN\n", + "dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "13363897", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "dtype: bool" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s == np.nan" ] @@ -229,10 +463,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "c59c3c54", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 \n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([1, 2, np.nan])" ] @@ -247,10 +495,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "ecc5ae92", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.0\n", + "1 2.0\n", + "2 NaN\n", + 
"dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([1, 2, np.nan], nan_as_null=False)" ] @@ -273,10 +535,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "de70f244", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2012-01-01 00:00:00.000000\n", + "1 \n", + "2 2012-01-01 00:00:00.000000\n", + "dtype: datetime64[us]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", @@ -285,10 +561,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "8411a914", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2012-01-01\n", + "1 NaT\n", + "2 2012-01-01\n", + "dtype: datetime64[ns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "datetime_series.to_pandas()" ] @@ -303,10 +593,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "829c32d0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0 days 00:00:00\n", + "1 \n", + "2 0 days 00:00:00\n", + "dtype: timedelta64[us]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "datetime_series - datetime_series" ] @@ -329,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "f8f2aec7", "metadata": {}, "outputs": [], @@ -339,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "0c8a3011", "metadata": {}, "outputs": [], @@ -349,30 +653,237 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "052f6c2b", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 NaN\n", + "1 2.0\n", + "2 2 3.2\n", + "3 3 0.1\n", + "4 1.0" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "0fb0a083", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
010.23
11122.0
223.2
334<NA>
4101.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 0.23\n", + "1 11 22.0\n", + "2 2 3.2\n", + "3 34 \n", + "4 10 1.0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df2" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "6f8152c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
02NaN
1<NA>24.0
246.4
337<NA>
4<NA>2.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 2 NaN\n", + "1 24.0\n", + "2 4 6.4\n", + "3 37 \n", + "4 2.0" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1 + df2" ] @@ -387,20 +898,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "45081790", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 \n", + "2 2\n", + "3 3\n", + "4 \n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "39922658", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].sum()" ] @@ -415,10 +953,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "b2f16ddb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "2.0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].mean()" ] @@ -435,20 +984,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "d4a463a0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].sum(skipna=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "a944c42e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].mean(skipna=False)" ] @@ -463,10 +1034,26 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 30, "id": "4f2a7306", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 \n", + "2 3\n", + "3 6\n", + "4 \n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].cumsum()" ] @@ -481,10 +1068,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "d4c46776", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].cumsum(skipna=False)" ] @@ -507,30 +1110,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "f430c9ce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([np.nan], nan_as_null=False).sum()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "7fde514b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([np.nan], nan_as_null=False).sum(skipna=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "56cedd17", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([], dtype='float64').sum()" ] @@ -545,30 +1181,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "d20bbbef", "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([np.nan], nan_as_null=False).prod()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "75abbcfa", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([np.nan], nan_as_null=False).prod(skipna=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "becce0cc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series([], dtype='float64').prod()" ] @@ -591,20 +1260,147 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "1379037c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 NaN\n", + "1 2.0\n", + "2 2 3.2\n", + "3 3 0.1\n", + "4 1.0" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "d6b91e6f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b
a
23.2
1NaN
30.1
\n", + "
" + ], + "text/plain": [ + " b\n", + "a \n", + "2 3.2\n", + "1 NaN\n", + "3 0.1" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1.groupby('a').mean()" ] @@ -619,10 +1415,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "768c3e50", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b
a
23.2
1NaN
30.1
<NA>1.5
\n", + "
" + ], + "text/plain": [ + " b\n", + "a \n", + "2 3.2\n", + "1 NaN\n", + "3 0.1\n", + " 1.5" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1.groupby('a', dropna=False).mean()" ] @@ -645,7 +1504,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "7ddde1fe", "metadata": {}, "outputs": [], @@ -655,17 +1514,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "16e54597", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 4\n", + "dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "f628f94d", "metadata": {}, "outputs": [], @@ -675,10 +1549,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "b30590b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 \n", + "3 4\n", + "dtype: int64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series" ] @@ -701,20 +1590,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "59e22668", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 NaN\n", + "1 2.0\n", + "2 2 3.2\n", + "3 3 0.1\n", + "4 1.0" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "05c221ee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 10.0\n", + "1 2.0\n", + "2 3.2\n", + "3 0.1\n", + "4 1.0\n", + "Name: b, dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['b'].fillna(10)" ] @@ -737,7 +1711,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "f52c5d8f", "metadata": {}, "outputs": [], @@ -748,7 +1722,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "id": "6affebe9", "metadata": {}, "outputs": [], @@ -758,7 +1732,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "1ce1b96f", "metadata": {}, "outputs": [], @@ -768,7 +1742,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "90829195", "metadata": {}, "outputs": [], @@ -778,30 +1752,360 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "c0feac14", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0-0.408268-0.676643-1.274743
1-0.029322-0.873593-1.214105
2-0.8663711.081735-0.226840
3NaN0.8122781.074973
4NaNNaN-0.366725
5-1.016239NaNNaN
60.6751231.067536NaN
70.2215682.025961NaN
8-0.3172411.0112750.674891
9-0.877041-1.919394-1.029201
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 NaN 0.812278 1.074973\n", + "4 NaN NaN -0.366725\n", + "5 -1.016239 NaN NaN\n", + "6 0.675123 1.067536 NaN\n", + "7 0.221568 2.025961 NaN\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dff" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "a07c1260", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0-0.408268-0.676643-1.274743
1-0.029322-0.873593-1.214105
2-0.8663711.081735-0.226840
3-0.3272240.8122781.074973
4-0.3272240.316145-0.366725
5-1.0162390.316145-0.337393
60.6751231.067536-0.337393
70.2215682.025961-0.337393
8-0.3172411.0112750.674891
9-0.877041-1.919394-1.029201
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 -0.327224 0.812278 1.074973\n", + "4 -0.327224 0.316145 -0.366725\n", + "5 -1.016239 0.316145 -0.337393\n", + "6 0.675123 1.067536 -0.337393\n", + "7 0.221568 2.025961 -0.337393\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dff.fillna(dff.mean())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "id": "9e70d61a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0-0.408268-0.676643-1.274743
1-0.029322-0.873593-1.214105
2-0.8663711.081735-0.226840
3NaN0.8122781.074973
4NaN0.316145-0.366725
5-1.0162390.316145-0.337393
60.6751231.067536-0.337393
70.2215682.025961-0.337393
8-0.3172411.0112750.674891
9-0.877041-1.919394-1.029201
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 NaN 0.812278 1.074973\n", + "4 NaN 0.316145 -0.366725\n", + "5 -1.016239 0.316145 -0.337393\n", + "6 0.675123 1.067536 -0.337393\n", + "7 0.221568 2.025961 -0.337393\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dff.fillna(dff.mean()[1:3])" ] @@ -824,30 +2128,204 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "98c57be7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
01NaN
1<NA>2.0
223.2
330.1
4<NA>1.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 NaN\n", + "1 2.0\n", + "2 2 3.2\n", + "3 3 0.1\n", + "4 1.0" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "bc3f273a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
223.2
330.1
\n", + "
" + ], + "text/plain": [ + " a b\n", + "2 2 3.2\n", + "3 3 0.1" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1.dropna(axis=0)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "a48d4de0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
1
2
3
4
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: [0, 1, 2, 3, 4]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1.dropna(axis=1)" ] @@ -862,10 +2340,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "id": "2dd8f660", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "2 2\n", + "3 3\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df1['a'].dropna()" ] @@ -890,7 +2382,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "e6c14e8a", "metadata": {}, "outputs": [], @@ -900,20 +2392,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "id": "a852f0cb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.0\n", + "1 1.0\n", + "2 2.0\n", + "3 3.0\n", + "4 4.0\n", + "dtype: float64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "f6ac12eb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 5.0\n", + "1 1.0\n", + "2 2.0\n", + "3 3.0\n", + "4 4.0\n", + "dtype: float64" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series.replace(0, 5)" ] @@ -928,10 +2452,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "id": "f0156bff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 \n", + "1 1.0\n", + "2 2.0\n", + "3 3.0\n", + "4 4.0\n", + "dtype: float64" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series.replace(0, None)" ] @@ -946,10 +2486,26 @@ 
}, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "id": "f3110f5b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 4.0\n", + "1 3.0\n", + "2 2.0\n", + "3 1.0\n", + "4 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])" ] @@ -964,10 +2520,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "45862d05", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 10.0\n", + "1 100.0\n", + "2 2.0\n", + "3 3.0\n", + "4 4.0\n", + "dtype: float64" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series.replace({0: 10, 1: 100})" ] @@ -982,7 +2554,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "348caa64", "metadata": {}, "outputs": [], @@ -992,20 +2564,158 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "cca41ec4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
005
116
227
338
449
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 0 5\n", + "1 1 6\n", + "2 2 7\n", + "3 3 8\n", + "4 4 9" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "64334693", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
0100100
116
227
338
449
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 100 100\n", + "1 1 6\n", + "2 2 7\n", + "3 3 8\n", + "4 4 9" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace({\"a\": 0, \"b\": 5}, 100)" ] @@ -1028,7 +2738,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "id": "031d3533", "metadata": {}, "outputs": [], @@ -1038,7 +2748,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "id": "12b41efb", "metadata": {}, "outputs": [], @@ -1048,30 +2758,234 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "id": "d450df49", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00aa
11bb
22.<NA>
33.d
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 a a\n", + "1 1 b b\n", + "2 2 . \n", + "3 3 . d" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "id": "f823bc46", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00aa
11bb
22A Dot<NA>
33A Dotd
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 a a\n", + "1 1 b b\n", + "2 2 A Dot \n", + "3 3 A Dot d" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace(\".\", \"A Dot\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "id": "bc52f6e9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00aa
11<NA><NA>
22A Dot<NA>
33A Dotd
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 a a\n", + "1 1 \n", + "2 2 A Dot \n", + "3 3 A Dot d" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace([\".\", \"b\"], [\"A Dot\", None])" ] @@ -1086,10 +3000,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "id": "7e23eba9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00bb
11bb
22--<NA>
33--d
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 b b\n", + "1 1 b b\n", + "2 2 -- \n", + "3 3 -- d" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace([\"a\", \".\"], [\"b\", \"--\"])" ] @@ -1104,10 +3086,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "id": "d2e79805", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
00aa
11bb
22replacement value<NA>
33replacement valued
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 0 a a\n", + "1 1 b b\n", + "2 2 replacement value \n", + "3 3 replacement value d" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace({\"b\": \".\"}, {\"b\": \"replacement value\"})" ] @@ -1130,7 +3180,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "id": "355a2f0d", "metadata": {}, "outputs": [], @@ -1140,7 +3190,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "id": "d9eed372", "metadata": {}, "outputs": [], @@ -1150,10 +3200,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "id": "ae944244", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0-0.089358787-0.728419386
1-2.141612003-0.574415182
2<NA><NA>
30.7746434622.07287721
40.93799853-1.054129436
5<NA><NA>
6-0.4352930121.163009584
71.3466232870.31961371
8<NA><NA>
9<NA><NA>
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 -0.089358787 -0.728419386\n", + "1 -2.141612003 -0.574415182\n", + "2 \n", + "3 0.774643462 2.07287721\n", + "4 0.93799853 -1.054129436\n", + "5 \n", + "6 -0.435293012 1.163009584\n", + "7 1.346623287 0.31961371\n", + "8 \n", + "9 " + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace(1.5, None)" ] @@ -1168,7 +3317,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "id": "59b81c60", "metadata": {}, "outputs": [], @@ -1178,10 +3327,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "id": "01a71d4c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
010.000000-0.728419
1-2.141612-0.574415
25.0000005.000000
30.7746432.072877
40.937999-1.054129
55.0000005.000000
6-0.4352931.163010
71.3466230.319614
85.0000005.000000
95.0000005.000000
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 10.000000 -0.728419\n", + "1 -2.141612 -0.574415\n", + "2 5.000000 5.000000\n", + "3 0.774643 2.072877\n", + "4 0.937999 -1.054129\n", + "5 5.000000 5.000000\n", + "6 -0.435293 1.163010\n", + "7 1.346623 0.319614\n", + "8 5.000000 5.000000\n", + "9 5.000000 5.000000" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.replace([1.5, df00], [5, 10])" ] @@ -1196,7 +3444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "id": "5f0859d7", "metadata": {}, "outputs": [], @@ -1206,10 +3454,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "id": "5cf28369", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0-0.089358787-0.728419386
1-2.141612003-0.574415182
2<NA><NA>
30.7746434622.07287721
40.93799853-1.054129436
5<NA><NA>
6-0.4352930121.163009584
71.3466232870.31961371
8<NA><NA>
9<NA><NA>
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 -0.089358787 -0.728419386\n", + "1 -2.141612003 -0.574415182\n", + "2 \n", + "3 0.774643462 2.07287721\n", + "4 0.93799853 -1.054129436\n", + "5 \n", + "6 -0.435293012 1.163009584\n", + "7 1.346623287 0.31961371\n", + "8 \n", + "9 " + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] @@ -1220,6 +3567,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 309fb71542f..3f444fe16a5 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "8b2d45c3", "metadata": {}, "outputs": [], @@ -47,10 +47,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "45c482ab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "118 µs ± 77.2 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "360 µs ± 6.04 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", + "355 µs ± 722 ns per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" + ] + } + ], "source": [ "nelem = 10000\n", "df = cudf.DataFrame({'a':range(nelem),\n", @@ -65,10 +75,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a565effc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 500, 1000],\n", + " [ 1, 501, 1001],\n", + " [ 2, 502, 1002],\n", + " ...,\n", + " [ 9997, 10497, 10997],\n", + " [ 9998, 10498, 10998],\n", + " [ 9999, 10499, 10999]])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", "arr_cupy" @@ -96,10 +123,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8f97f304", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "54.4 µs ± 66 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "125 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "119 µs ± 805 ns per loop (mean ± std. dev. 
of 7 runs, 10,000 loops each)\n" + ] + } + ], "source": [ "col = 'a'\n", "\n", @@ -110,10 +147,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "f96d5676", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 9997, 9998, 9999])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cola_cupy = cp.asarray(df[col])\n", "cola_cupy" @@ -129,10 +177,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2a7ae43f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2, ..., 197, 198, 199],\n", + " [ 200, 201, 202, ..., 397, 398, 399],\n", + " [ 400, 401, 402, ..., 597, 598, 599],\n", + " ...,\n", + " [9400, 9401, 9402, ..., 9597, 9598, 9599],\n", + " [9600, 9601, 9602, ..., 9797, 9798, 9799],\n", + " [9800, 9801, 9802, ..., 9997, 9998, 9999]])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "reshaped_arr = cola_cupy.reshape(50, 200)\n", "reshaped_arr" @@ -140,20 +205,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b442a30c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 201, 402, 603, 804, 1005, 1206, 1407, 1608, 1809, 2010,\n", + " 2211, 2412, 2613, 2814, 3015, 3216, 3417, 3618, 3819, 4020, 4221,\n", + " 4422, 4623, 4824, 5025, 5226, 5427, 5628, 5829, 6030, 6231, 6432,\n", + " 6633, 6834, 7035, 7236, 7437, 7638, 7839, 8040, 8241, 8442, 8643,\n", + " 8844, 9045, 9246, 9447, 9648, 9849])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "reshaped_arr.diagonal()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "be7f4d32", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + 
"array(577306.967739)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cp.linalg.norm(reshaped_arr)" ] @@ -178,20 +269,221 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "8887b253", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.3 ms ± 33.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], "source": [ "%timeit reshaped_df = cudf.DataFrame(reshaped_arr)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "08ec4ffa", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", + "

5 rows × 200 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", + "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", + "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", + "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", + "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", + "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", + "\n", + " 194 195 196 197 198 199 \n", + "0 194 195 196 197 198 199 \n", + "1 394 395 396 397 398 399 \n", + "2 594 595 596 597 598 599 \n", + "3 794 795 796 797 798 799 \n", + "4 994 995 996 997 998 999 \n", + "\n", + "[5 rows x 200 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "reshaped_df = cudf.DataFrame(reshaped_arr)\n", "reshaped_df.head()" @@ -207,10 +499,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "65b8bd0d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cp.isfortran(reshaped_arr)" ] @@ -225,10 +528,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "27b2f563", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.57 ms ± 9.08 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], "source": [ "%%timeit\n", "\n", @@ -238,10 +549,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "0a0cc290", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.48 ms ± 7.89 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" + ] + } + ], "source": [ "%%timeit\n", "\n", @@ -251,10 +570,203 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "0d2c5beb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", + "

5 rows × 200 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", + "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", + "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", + "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", + "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", + "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", + "\n", + " 194 195 196 197 198 199 \n", + "0 194 195 196 197 198 199 \n", + "1 394 395 396 397 398 399 \n", + "2 594 595 596 597 598 599 \n", + "3 794 795 796 797 798 799 \n", + "4 994 995 996 997 998 999 \n", + "\n", + "[5 rows x 200 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "fortran_arr = cp.asfortranarray(reshaped_arr)\n", "reshaped_df = cudf.DataFrame(fortran_arr)\n", @@ -273,10 +785,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "d8518208", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 201\n", + "2 402\n", + "3 603\n", + "4 804\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cudf.Series(reshaped_arr.diagonal()).head()" ] @@ -295,10 +823,203 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "2bb8ed81", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...190191192193194195196197198199
00123456789...190191192193194195196197198199
1200201202203204205206207208209...390391392393394395396397398399
2400401402403404405406407408409...590591592593594595596597598599
3600601602603604605606607608609...790791792793794795796797798799
4800801802803804805806807808809...990991992993994995996997998999
\n", + "

5 rows × 200 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \\\n", + "0 0 1 2 3 4 5 6 7 8 9 ... 190 191 192 193 \n", + "1 200 201 202 203 204 205 206 207 208 209 ... 390 391 392 393 \n", + "2 400 401 402 403 404 405 406 407 408 409 ... 590 591 592 593 \n", + "3 600 601 602 603 604 605 606 607 608 609 ... 790 791 792 793 \n", + "4 800 801 802 803 804 805 806 807 808 809 ... 990 991 992 993 \n", + "\n", + " 194 195 196 197 198 199 \n", + "0 194 195 196 197 198 199 \n", + "1 394 395 396 397 398 399 \n", + "2 594 595 596 597 598 599 \n", + "3 794 795 796 797 798 799 \n", + "4 994 995 996 997 998 999 \n", + "\n", + "[5 rows x 200 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "reshaped_df.head()" ] @@ -313,10 +1034,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "2dde030d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 19900, 59900, 99900, 139900, 179900, 219900, 259900,\n", + " 299900, 339900, 379900, 419900, 459900, 499900, 539900,\n", + " 579900, 619900, 659900, 699900, 739900, 779900, 819900,\n", + " 859900, 899900, 939900, 979900, 1019900, 1059900, 1099900,\n", + " 1139900, 1179900, 1219900, 1259900, 1299900, 1339900, 1379900,\n", + " 1419900, 1459900, 1499900, 1539900, 1579900, 1619900, 1659900,\n", + " 1699900, 1739900, 1779900, 1819900, 1859900, 1899900, 1939900,\n", + " 1979900])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "new_arr = cupy_from_dlpack(reshaped_df.to_dlpack())\n", "new_arr.sum(axis=1)" @@ -344,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "e531fd15", "metadata": {}, "outputs": [], @@ -372,7 +1111,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "58c7e074", "metadata": {}, "outputs": [], @@ -388,20 +1127,262 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 20, "id": "9265228d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a0a1a2a3a4a5a6a7a8a9a10a11a12a13a14a15a16a17a18a19
00.00.00.00.00.00.00.0000000.00.00.0000000.00.00.00.000000.0000000.00.00.00.011.308953
10.00.00.00.00.00.00.0000000.00.0-5.2412970.00.00.017.584760.0000000.00.00.00.00.000000
20.00.00.00.00.00.00.0000000.00.00.0000000.00.00.00.000000.0000000.00.00.00.00.000000
30.00.00.00.00.00.00.0000000.00.00.0000000.00.00.00.0000010.8692790.00.00.00.00.000000
40.00.00.00.00.00.02.5262740.00.00.0000000.00.00.00.000000.0000000.00.00.00.00.000000
\n", + "
" + ], + "text/plain": [ + " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 -5.241297 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 2.526274 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "\n", + " a13 a14 a15 a16 a17 a18 a19 \n", + "0 0.00000 0.000000 0.0 0.0 0.0 0.0 11.308953 \n", + "1 17.58476 0.000000 0.0 0.0 0.0 0.0 0.000000 \n", + "2 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 \n", + "3 0.00000 10.869279 0.0 0.0 0.0 0.0 0.000000 \n", + "4 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "5ba1a551", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (770, 0)\t-1.373354548007899\n", + " (771, 0)\t11.641890592020793\n", + " (644, 0)\t-1.4820515981598015\n", + " (773, 0)\t4.374245789758399\n", + " (646, 0)\t4.58071340724814\n", + " (776, 0)\t5.115792716318899\n", + " (649, 0)\t8.676941295251092\n", + " (522, 0)\t-0.11573951593420229\n", + " (396, 0)\t8.124303607236273\n", + " (652, 0)\t9.359339954077681\n", + " (141, 0)\t8.50710863345112\n", + " (272, 0)\t7.440244879175392\n", + " (1042, 0)\t4.286859524587998\n", + " (275, 0)\t-0.6091666840632348\n", + " (787, 0)\t10.124449357828695\n", + " (915, 0)\t11.391560911074649\n", + " (1043, 0)\t11.478396096078907\n", + " (408, 0)\t11.204049991287349\n", + " (536, 0)\t13.239689100708974\n", + " (26, 0)\t4.951917355877771\n", + " (794, 0)\t2.736556006961319\n", + " (539, 0)\t12.553519350929216\n", + " (412, 0)\t2.8682583361020786\n", + " (540, 0)\t-1.2121388231076713\n", + " (796, 0)\t6.986443354019786\n", + " 
:\t:\n", + " (9087, 19)\t-2.9543770156500395\n", + " (9440, 19)\t3.903613949374532\n", + " (9186, 19)\t0.3141028170017329\n", + " (9571, 19)\t1.7347840594688502\n", + " (9188, 19)\t14.68745562157488\n", + " (9316, 19)\t13.808308442016436\n", + " (9957, 19)\t9.705810918221086\n", + " (9318, 19)\t9.984168186940485\n", + " (9446, 19)\t5.173000114288142\n", + " (9830, 19)\t3.2442816093793607\n", + " (9835, 19)\t5.713078257113576\n", + " (9580, 19)\t5.373437384911853\n", + " (9326, 19)\t10.736403419943093\n", + " (9711, 19)\t-4.003216472911014\n", + " (9200, 19)\t5.560182026578174\n", + " (9844, 19)\t6.17251145210342\n", + " (9333, 19)\t7.085353006324948\n", + " (9208, 19)\t6.789030498520347\n", + " (9464, 19)\t4.314887636528589\n", + " (9720, 19)\t12.446300974563027\n", + " (9594, 19)\t4.317523130615451\n", + " (9722, 19)\t-2.3257161477576336\n", + " (9723, 19)\t1.9288133227037407\n", + " (9469, 19)\t0.268312217498608\n", + " (9599, 19)\t4.100996763787237\n" + ] + } + ], "source": [ "sparse_data = cudf_to_cupy_sparse_matrix(df)\n", "print(sparse_data)" @@ -423,6 +1404,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 8ea088a1d72..ef7500a2be9 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "0c6b65ce", "metadata": {}, "outputs": [], @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e28d5b82", "metadata": {}, "outputs": [], @@ -91,7 
+91,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "96aeb19f", "metadata": {}, "outputs": [], @@ -111,10 +111,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8ca08834", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 3\n", + "2 4\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sr.apply(f)" ] @@ -137,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "8d156d01", "metadata": {}, "outputs": [], @@ -148,10 +162,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "1dee82d7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 43\n", + "1 44\n", + "2 45\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# cuDF apply\n", "sr.apply(g, args=(42,))" @@ -183,10 +211,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bda261dd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 \n", + "2 3\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Create a cuDF series with nulls\n", "sr = cudf.Series([1, cudf.NA, 3])\n", @@ -195,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "0123ae07", "metadata": {}, "outputs": [], @@ -207,10 +249,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "e95868dd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 \n", + "2 4\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# cuDF 
result\n", "sr.apply(f)" @@ -226,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "6c65241b", "metadata": {}, "outputs": [], @@ -241,10 +297,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "ab0f4dbf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 42\n", + "2 4\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# cuDF result\n", "sr.apply(f_null_sensitive)" @@ -279,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "732434f6", "metadata": {}, "outputs": [], @@ -289,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "4f5997e5", "metadata": {}, "outputs": [], @@ -315,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "ea6008a6", "metadata": {}, "outputs": [], @@ -335,10 +405,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "183a82ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abce
096310059979630.0
197710269809770.0
210481026101910480.0
3107896098510780.0
497998210119790.0
\n", + "
" + ], + "text/plain": [ + " a b c e\n", + "0 963 1005 997 9630.0\n", + "1 977 1026 980 9770.0\n", + "2 1048 1026 1019 10480.0\n", + "3 1078 960 985 10780.0\n", + "4 979 982 1011 9790.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.head()" ] @@ -383,7 +534,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "73653918", "metadata": {}, "outputs": [], @@ -402,10 +553,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "077feb75", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
014
12<NA>
236
\n", + "
" + ], + "text/plain": [ + " A B\n", + "0 1 4\n", + "1 2 \n", + "2 3 6" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = cudf.DataFrame({\n", " 'A': [1,2,3],\n", @@ -424,10 +632,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "091e39e1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 5\n", + "1 \n", + "2 9\n", + "dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -442,10 +664,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "bd345fab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 5\n", + "1 \n", + "2 9\n", + "dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.to_pandas(nullable=True).apply(f, axis=1)" ] @@ -468,10 +704,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "b70f4b3b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a
01
1<NA>
23
\n", + "
" + ], + "text/plain": [ + " a\n", + "0 1\n", + "1 \n", + "2 3" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def f(row):\n", " x = row['a']\n", @@ -486,10 +775,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "0313c8df", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 0\n", + "2 4\n", + "dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -504,10 +807,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "96a7952a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
012
121
231
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 2\n", + "1 2 1\n", + "2 3 1" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def f(row):\n", " x = row['a']\n", @@ -526,10 +886,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "e0815f60", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 3\n", + "2 \n", + "dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -544,10 +918,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "495efd14", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
010.5
12<NA>
233.14
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 1 0.5\n", + "1 2 \n", + "2 3 3.14" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def f(row):\n", " return row['a'] + row['b']\n", @@ -561,10 +992,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "678b0b5a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.5\n", + "1 \n", + "2 6.14\n", + "dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -592,10 +1037,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "acf48d56", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a
01
13
25
\n", + "
" + ], + "text/plain": [ + " a\n", + "0 1\n", + "1 3\n", + "2 5" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def f(row):\n", " x = row['a']\n", @@ -612,10 +1110,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "78a98172", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.5\n", + "1 1.5\n", + "2 5.0\n", + "dtype: float64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -630,10 +1142,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "142c30a9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcde
014<NA>87
125471
236486
\n", + "
" + ], + "text/plain": [ + " a b c d e\n", + "0 1 4 8 7\n", + "1 2 5 4 7 1\n", + "2 3 6 4 8 6" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def f(row):\n", " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", @@ -650,10 +1231,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "fee9198a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 \n", + "1 2.428571429\n", + "2 8.5\n", + "dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.apply(f, axis=1)" ] @@ -678,7 +1273,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "90cbcd85", "metadata": {}, "outputs": [], @@ -709,10 +1304,83 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "e782daff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdeout
014<NA>878.0
1254713.0
2364869.0
\n", + "
" + ], + "text/plain": [ + " a b c d e out\n", + "0 1 4 8 7 8.0\n", + "1 2 5 4 7 1 3.0\n", + "2 3 6 4 8 6 9.0" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = df.apply_rows(conditional_add, \n", " incols={'a':'x', 'e':'y'},\n", @@ -742,10 +1410,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "befd8333", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
09631005997
19771026<NA>
2<NA>10261019
31078<NA>985
49799821011
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 963 1005 997\n", + "1 977 1026 \n", + "2 1026 1019\n", + "3 1078 985\n", + "4 979 982 1011" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def gpu_add(a, b, out):\n", " for i, (x, y) in enumerate(zip(a, b)):\n", @@ -768,10 +1511,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "d1f3dcaf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcout
096310059971968.0
19771026<NA>2003.0
2<NA>10261019<NA>
31078<NA>985<NA>
497998210111961.0
\n", + "
" + ], + "text/plain": [ + " a b c out\n", + "0 963 1005 997 1968.0\n", + "1 977 1026 2003.0\n", + "2 1026 1019 \n", + "3 1078 985 \n", + "4 979 982 1011 1961.0" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = df.apply_rows(gpu_add, \n", " incols=['a', 'b'],\n", @@ -802,10 +1626,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "6bc6aea3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 16.0\n", + "1 25.0\n", + "2 36.0\n", + "3 49.0\n", + "4 64.0\n", + "5 81.0\n", + "dtype: float64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", "ser" @@ -813,10 +1654,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "a4c31df1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Rolling [window=3,min_periods=3,center=False]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "rolling = ser.rolling(window=3, min_periods=3, center=False)\n", "rolling" @@ -832,7 +1684,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "eb5a081b", "metadata": {}, "outputs": [], @@ -858,10 +1710,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "ddec3263", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 \n", + "1 \n", + "2 6.0\n", + "3 7.0\n", + "4 100.0\n", + "5 9.0\n", + "dtype: float64" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "rolling.apply(example_func)" ] @@ -876,10 +1745,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "8b61094a", "metadata": {}, - "outputs": [], + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
055.055.0
156.056.0
257.057.0
358.058.0
459.059.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 55.0 55.0\n", + "1 56.0 56.0\n", + "2 57.0 57.0\n", + "3 58.0 58.0\n", + "4 59.0 59.0" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df2 = cudf.DataFrame()\n", "df2['a'] = np.arange(55, 65, dtype='float64')\n", @@ -889,10 +1827,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "bb8c3019", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
0<NA><NA>
1<NA><NA>
27.5498344357.549834435
37.6157731067.615773106
47.6811457487.681145748
57.7459666927.745966692
67.8102496767.810249676
77.8740078747.874007874
87.9372539337.937253933
9100.0100.0
\n", + "
" + ], + "text/plain": [ + " a b\n", + "0 \n", + "1 \n", + "2 7.549834435 7.549834435\n", + "3 7.615773106 7.615773106\n", + "4 7.681145748 7.681145748\n", + "5 7.745966692 7.745966692\n", + "6 7.810249676 7.810249676\n", + "7 7.874007874 7.874007874\n", + "8 7.937253933 7.937253933\n", + "9 100.0 100.0" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "rolling = df2.rolling(window=3, min_periods=3, center=False)\n", "rolling.apply(example_func)" @@ -912,10 +1949,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "3dc272ab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abce
0-0.691674TrueDan-0.958380
10.480099FalseBob-0.729580
2-0.473370TrueXavier-0.767454
30.067479TrueAlice-0.380205
4-0.970850FalseSarah0.342905
\n", + "
" + ], + "text/plain": [ + " a b c e\n", + "0 -0.691674 True Dan -0.958380\n", + "1 0.480099 False Bob -0.729580\n", + "2 -0.473370 True Xavier -0.767454\n", + "3 0.067479 True Alice -0.380205\n", + "4 -0.970850 False Sarah 0.342905" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", "df.head()" @@ -923,7 +2041,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "c0578e0a", "metadata": {}, "outputs": [], @@ -941,7 +2059,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "19f0f7fe", "metadata": {}, "outputs": [], @@ -970,10 +2088,142 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "c43426c3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcerolling_avg_e
10.480099FalseBob-0.729580NaN
4-0.970850FalseSarah0.342905NaN
60.801430FalseSarah0.6323370.081887
7-0.933157FalseQuinn-0.4208260.184805
0-0.691674TrueDan-0.958380NaN
2-0.473370TrueXavier-0.767454NaN
30.067479TrueAlice-0.380205-0.702013
50.837494TrueWendy-0.057540-0.401733
80.913899TrueUrsula0.4662520.009502
9-0.725581TrueGeorge0.4052450.271319
\n", + "
" + ], + "text/plain": [ + " a b c e rolling_avg_e\n", + "1 0.480099 False Bob -0.729580 NaN\n", + "4 -0.970850 False Sarah 0.342905 NaN\n", + "6 0.801430 False Sarah 0.632337 0.081887\n", + "7 -0.933157 False Quinn -0.420826 0.184805\n", + "0 -0.691674 True Dan -0.958380 NaN\n", + "2 -0.473370 True Xavier -0.767454 NaN\n", + "3 0.067479 True Alice -0.380205 -0.702013\n", + "5 0.837494 True Wendy -0.057540 -0.401733\n", + "8 0.913899 True Ursula 0.466252 0.009502\n", + "9 -0.725581 True George 0.405245 0.271319" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "results = grouped.apply_grouped(rolling_avg,\n", " incols=['e'],\n", @@ -1001,10 +2251,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "aa6a8509", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1., 2., 3., 4., 10.])" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import cupy as cp\n", "\n", @@ -1023,10 +2284,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "0bb8bf93", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 5\n", + "1 10\n", + "2 15\n", + "3 20\n", + "4 50\n", + "dtype: int32" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "@cuda.jit\n", "def multiply_by_5(x, out):\n", @@ -1049,10 +2326,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "ce60b639", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 5., 10., 15., 20., 50.])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "out = cp.empty_like(arr)\n", "multiply_by_5.forall(arr.size)(arr, out)\n", @@ -1103,6 +2391,18 @@ "display_name": "Python 3 (ipykernel)", "language": 
"python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, From dcf91f524874749ca9220e80d62f27342f1837b2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 14:57:21 -0400 Subject: [PATCH 12/14] Add a note about iteration --- docs/cudf/source/user_guide/pandas-comparison.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/cudf/source/user_guide/pandas-comparison.rst b/docs/cudf/source/user_guide/pandas-comparison.rst index a7fb2f75f15..038c45c5853 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.rst +++ b/docs/cudf/source/user_guide/pandas-comparison.rst @@ -69,6 +69,20 @@ compare the behaviour of cuDF with Pandas below: See our :doc:`docs on missing data` for details. +Iteration +--------- + +Iterating over a cuDF ``Series``, ``DataFrame`` or ``Index`` is not +supported. This is because iterating over data that resides on the GPU +will yield *extremely* poor performance, as GPUs are optimized for +highly parallel operations rather than sequential operations. + +In the vast majority of cases, it is possible to avoid iteration and +use an existing function or method to accomplish the same task. If you +absolutely must iterate, copy the data from GPU to CPU by using +``.to_arrow()`` or ``.to_pandas()``, then copy the result back to GPU +using ``.from_arrow()`` or ``.from_pandas()``. 
+ Result ordering --------------- From 1f4b8845e717da0a29d322ed3bda12f5c24d4b8e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 15:31:31 -0400 Subject: [PATCH 13/14] Add whitespace --- docs/cudf/source/user_guide/pandas-comparison.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/cudf/source/user_guide/pandas-comparison.rst b/docs/cudf/source/user_guide/pandas-comparison.rst index 038c45c5853..23cc66ce7e6 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.rst +++ b/docs/cudf/source/user_guide/pandas-comparison.rst @@ -38,6 +38,7 @@ Unlike Pandas, *all* data types in cuDF are nullable, meaning they can contain missing values (represented by ``cudf.NA``). .. code:: python + >>> s = cudf.Series([1, 2, cudf.NA]) >>> s >>> s @@ -50,6 +51,7 @@ Nulls are not coerced to ``nan`` in any situation; compare the behaviour of cuDF with Pandas below: .. code:: python + >>> s = cudf.Series([1, 2, cudf.NA], dtype="category") >>> s 0 1 @@ -138,6 +140,7 @@ collections of arbitrary Python objects. For example, in Pandas you can do the following: .. 
code:: python + >>> import pandas as pd >>> s = pd.Series(["a", 1, [1, 2, 3]]) 0 a From 705365bff82ed4ee5441938e45e00d955acbb730 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 22 Apr 2022 14:35:12 -0400 Subject: [PATCH 14/14] Change from rst -> md --- .../source/user_guide/pandas-comparison.md | 155 +++++++++++++++++ .../source/user_guide/pandas-comparison.rst | 164 ------------------ 2 files changed, 155 insertions(+), 164 deletions(-) create mode 100644 docs/cudf/source/user_guide/pandas-comparison.md delete mode 100644 docs/cudf/source/user_guide/pandas-comparison.rst diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md new file mode 100644 index 00000000000..e0e4dc0157e --- /dev/null +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -0,0 +1,155 @@ +# Comparison of cuDF and Pandas + +cuDF is a DataFrame library that closely matches the Pandas API, but +leverages NVIDIA GPUs for performing computations for speed. However, +there are some differences between cuDF and Pandas, both in terms API +and behavior. This page documents the similarities and differences +between cuDF and Pandas. + +## Supported operations + +cuDF supports many of the same data structures and operations as +Pandas. This includes `Series`, `DataFrame`, `Index` and +operations on them such as unary and binary operations, indexing, +filtering, concatenating, joining, groupby and window operations - +among many others. + +The best way to see if we support a particular Pandas API is to search +our [API docs](/api_docs/index). + +## Data types + +cuDF supports many common data types supported by Pandas, including +numeric, datetime, timestamp, string, and categorical data types. In +addition, we support special data types for decimal, list and "struct" +values. See the section on [Data Types](data-types) for +details. + +Note that we do not support custom data types like Pandas' +`ExtensionDtype`. 
+ +## Null (or "missing") values + +Unlike Pandas, *all* data types in cuDF are nullable, +meaning they can contain missing values (represented by `cudf.NA`). + +```{code} python +>>> s = cudf.Series([1, 2, cudf.NA]) +>>> s +>>> s +0 1 +1 2 +2 +dtype: int64 +``` + +Nulls are not coerced to `nan` in any situation; +compare the behaviour of cuDF with Pandas below: + +```{code} python +>>> s = cudf.Series([1, 2, cudf.NA], dtype="category") +>>> s +0 1 +1 2 +2 +dtype: category +Categories (2, int64): [1, 2] + +>>> s = pd.Series([1, 2, pd.NA], dtype="category") +>>> s +0 1 +1 2 +2 NaN +dtype: category +Categories (2, int64): [1, 2] +``` + +See the docs on [missing data](Working-with-missing-data) for +details. + +## Iteration + +Iterating over a cuDF `Series`, `DataFrame` or `Index` is not +supported. This is because iterating over data that resides on the GPU +will yield *extremely* poor performance, as GPUs are optimized for +highly parallel operations rather than sequential operations. + +In the vast majority of cases, it is possible to avoid iteration and +use an existing function or method to accomplish the same task. If you +absolutely must iterate, copy the data from GPU to CPU by using +`.to_arrow()` or `.to_pandas()`, then copy the result back to GPU +using `.from_arrow()` or `.from_pandas()`. + +## Result ordering + +By default, `join` (or `merge`) and `groupby` operations in cuDF +do *not* guarantee output ordering by default. 
+Compare the results obtained from Pandas and cuDF below:
+
+```{code} python
+    >>> import cupy as cp
+    >>> df = cudf.DataFrame({'a': cp.random.randint(0, 1000, 1000), 'b': range(1000)})
+    >>> df.groupby("a").mean().head()
+             b
+    a
+    742  694.5
+    29   840.0
+    459  525.5
+    442  363.0
+    666    7.0
+    >>> df.to_pandas().groupby("a").mean().head()
+            b
+    a
+    2   643.75
+    6    48.00
+    7   631.00
+    9   906.00
+    10  640.00
+```
+
+To match Pandas behavior, you must explicitly pass `sort=True`:
+
+```{code} python
+>>> df.to_pandas().groupby("a", sort=True).mean().head()
+         b
+a
+2   643.75
+6    48.00
+7   631.00
+9   906.00
+10  640.00
+```
+
+## Column names
+
+Unlike Pandas, cuDF does not support duplicate column names.
+It is best to use strings for column names.
+
+## No true `"object"` data type
+
+In Pandas and NumPy, the `"object"` data type is used for
+collections of arbitrary Python objects. For example, in Pandas you
+can do the following:
+
+```{code} python
+>>> import pandas as pd
+>>> s = pd.Series(["a", 1, [1, 2, 3]])
+0            a
+1            1
+2    [1, 2, 3]
+dtype: object
+```
+
+For compatibility with Pandas, cuDF reports the data type for strings
+as `"object"`, but we do *not* support storing or operating on
+collections of arbitrary Python objects.
+
+## `.apply()` function limitations
+
+The `.apply()` function in Pandas accepts a user-defined function
+(UDF) that can include arbitrary operations that are applied to each
+value of a `Series`, `DataFrame`, or in the case of a groupby,
+each group. cuDF also supports `apply()`, but it relies on Numba to
+JIT compile the UDF and execute it on the GPU. This can be extremely
+fast, but imposes a few limitations on what operations are allowed in
+the UDF. See the docs on [UDFs](guide-to-udfs) for details.
diff --git a/docs/cudf/source/user_guide/pandas-comparison.rst b/docs/cudf/source/user_guide/pandas-comparison.rst deleted file mode 100644 index 23cc66ce7e6..00000000000 --- a/docs/cudf/source/user_guide/pandas-comparison.rst +++ /dev/null @@ -1,164 +0,0 @@ -Comparison of cuDF and Pandas -============================= - -cuDF is a DataFrame library that closely matches the Pandas API, but -leverages NVIDIA GPUs for performing computations for speed. However, -there are some differences between cuDF and Pandas, both in terms API -and behavior. This page documents the similarities and differences -between cuDF and Pandas. - -Supported operations --------------------- - -cuDF supports many of the same data structures and operations as -Pandas. This includes ``Series``, ``DataFrame``, ``Index`` and -operations on them such as unary and binary operations, indexing, -filtering, concatenating, joining, groupby and window operations - -among many others. - -The best way to see if we support a particular Pandas API is to search -our `API docs `_. - -Data types ----------- - -cuDF supports many common data types supported by Pandas, including -numeric, datetime, timestamp, string, and categorical data types. In -addition, we support special data types for decimal, list and "struct" -values. See the section on :doc:`Data Types ` for -details. - -Note that we do not support custom data types like Pandas' -``ExtensionDtype``. - -Null (or "missing") values --------------------------- - -Unlike Pandas, *all* data types in cuDF are nullable, -meaning they can contain missing values (represented by ``cudf.NA``). - -.. code:: python - - >>> s = cudf.Series([1, 2, cudf.NA]) - >>> s - >>> s - 0 1 - 1 2 - 2 - dtype: int64 - -Nulls are not coerced to ``nan`` in any situation; -compare the behaviour of cuDF with Pandas below: - -.. 
code:: python - - >>> s = cudf.Series([1, 2, cudf.NA], dtype="category") - >>> s - 0 1 - 1 2 - 2 - dtype: category - Categories (2, int64): [1, 2] - - >>> s = pd.Series([1, 2, pd.NA], dtype="category") - >>> s - 0 1 - 1 2 - 2 NaN - dtype: category - Categories (2, int64): [1, 2] - -See our :doc:`docs on missing data` -for details. - -Iteration ---------- - -Iterating over a cuDF ``Series``, ``DataFrame`` or ``Index`` is not -supported. This is because iterating over data that resides on the GPU -will yield *extremely* poor performance, as GPUs are optimized for -highly parallel operations rather than sequential operations. - -In the vast majority of cases, it is possible to avoid iteration and -use an existing function or method to accomplish the same task. If you -absolutely must iterate, copy the data from GPU to CPU by using -``.to_arrow()`` or ``.to_pandas()``, then copy the result back to GPU -using ``.from_arrow()`` or ``.from_pandas()``. - -Result ordering ---------------- - -By default, ``join`` (or ``merge``) and ``groupby`` operations in cuDF -do *not* guarantee output ordering by default. -Compare the results obtained from Pandas and cuDF below: - -.. code:: python - - >>> import cupy as cp - >>> df = cudf.DataFrame({'a': cp.random.randint(0, 1000, 1000), 'b': range(1000)}) - >>> df.groupby("a").mean().head() - b - a - 742 694.5 - 29 840.0 - 459 525.5 - 442 363.0 - 666 7.0 - >>> df.to_pandas().groupby("a").mean().head() - b - a - 2 643.75 - 6 48.00 - 7 631.00 - 9 906.00 - 10 640.00 - -To match Pandas behavior, you must explicitly pass ``sort=True``: - -.. code:: python - - >>> df.to_pandas().groupby("a", sort=True).mean().head() - b - a - 2 643.75 - 6 48.00 - 7 631.00 - 9 906.00 - 10 640.00 - -Column names ------------- - -Unlike Pandas, cuDF does not support duplicate column names. -It is best to use strings for column names. 
- -No true ``"object"`` data type ------------------------------- - -In Pandas and NumPy, the ``"object"`` data type is used for -collections of arbitrary Python objects. For example, in Pandas you -can do the following: - -.. code:: python - - >>> import pandas as pd - >>> s = pd.Series(["a", 1, [1, 2, 3]]) - 0 a - 1 1 - 2 [1, 2, 3] - dtype: object - -For compatibilty with Pandas, cuDF reports the data type for strings -as ``"object"``, but we do *not* support storing or operating on -collections of arbitrary Python objects. - -``.apply()`` function limitations ---------------------------------- - -The ``.apply()`` function in Pandas accecpts a user-defined function -(UDF) that can include arbitrary operations that are applied to each -value of a ``Series``, ``DataFrame``, or in the case of a groupby, -each group. cuDF also supports ``apply()``, but it relies on Numba to -JIT compile the UDF and execute it on the GPU. This can be extremely -fast, but imposes a few limitations on what operations are allowed in -the UDF. See our :doc:`UDF docs ` for details.