From 34432ad51078f7ceec498dad1e908b140436f9ff Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 6 Feb 2023 18:54:24 +0000 Subject: [PATCH 1/9] WIP: dask-cudf docs skeleton --- docs/dask_cudf/Makefile | 20 +++++++++ docs/dask_cudf/make.bat | 35 +++++++++++++++ docs/dask_cudf/source/conf.py | 80 +++++++++++++++++++++++++++++++++ docs/dask_cudf/source/index.rst | 20 +++++++++ 4 files changed, 155 insertions(+) create mode 100644 docs/dask_cudf/Makefile create mode 100644 docs/dask_cudf/make.bat create mode 100644 docs/dask_cudf/source/conf.py create mode 100644 docs/dask_cudf/source/index.rst diff --git a/docs/dask_cudf/Makefile b/docs/dask_cudf/Makefile new file mode 100644 index 00000000000..d0c3cbf1020 --- /dev/null +++ b/docs/dask_cudf/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/dask_cudf/make.bat b/docs/dask_cudf/make.bat new file mode 100644 index 00000000000..747ffb7b303 --- /dev/null +++ b/docs/dask_cudf/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py new file mode 100644 index 00000000000..750ff2f57bc --- /dev/null +++ b/docs/dask_cudf/source/conf.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. + +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "dask-cudf" +copyright = "2018-2023, NVIDIA Corporation" +author = "NVIDIA Corporation" +version = "23.04" +release = "23.04.00" + +language = "en" + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx_copybutton", + "numpydoc", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "PandasCompat", + "myst_nb", +] + +templates_path = ["_templates"] +exclude_patterns = [] + +# Enable automatic generation of systematic, namespaced labels for sections +myst_heading_anchors = 2 + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" +htmlhelp_basename = "dask-cudfdoc" + +html_theme = "alabaster" +html_static_path = ["_static"] + +pygments_style = "sphinx" + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} +include_pandas_compat = True + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "cupy": ("https://docs.cupy.dev/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), +} + +numpydoc_show_inherited_class_members = True +numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False + + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst new file mode 100644 index 00000000000..5ccbc7f0d5f --- /dev/null +++ b/docs/dask_cudf/source/index.rst @@ -0,0 +1,20 @@ +.. dask-cudf documentation master file, created by + sphinx-quickstart on Mon Feb 6 18:48:11 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to dask-cudf's documentation! +===================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` From aefc0acbb5a7098ee6a26c5146c3740cbb07e8ca Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 7 Feb 2023 18:09:18 +0000 Subject: [PATCH 2/9] More skeleton pages Now with autodoc and some prose. 
---
 .../source/_static/RAPIDS-logo-purple.png |  Bin 0 -> 22593 bytes
 docs/dask_cudf/source/api.rst             |   71 ++++++++++++++++++
 docs/dask_cudf/source/conf.py             |    4 +-
 docs/dask_cudf/source/index.rst           |   18 ++++-
 4 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644 docs/dask_cudf/source/_static/RAPIDS-logo-purple.png
 create mode 100644 docs/dask_cudf/source/api.rst

diff --git a/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png
new file mode 100644
index 0000000000000000000000000000000000000000..d884e01374dcd5e62db937b24990074d2f584ff3
GIT binary patch
literal 22593
[22593 bytes of base85-encoded PNG data omitted]

diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst
new file mode 100644
index 00000000000..9fa83ec846b
--- /dev/null
+++ b/docs/dask_cudf/source/api.rst
@@ -0,0 +1,71 @@
+=============
+API reference
+=============
+
+Creating and storing DataFrames
+===============================
+
+:doc:`Like Dask <dask:dataframe-create>`, Dask-cuDF supports creation
+of DataFrames from a variety of storage formats. For on-disk data that
+are not supported directly in Dask-cuDF, we recommend using Dask's
+data reading facilities, followed by calling
+:func:`.from_dask_dataframe` to obtain a Dask-cuDF object.
+
+.. automodule:: dask_cudf
+    :members:
+      from_cudf,
+      from_dask_dataframe,
+      read_csv,
+      read_json,
+      read_orc,
+      to_orc,
+      read_text,
+      read_parquet
+
+Grouping
+========
+
+As discussed in the :doc:`Dask documentation for groupby
+<dask:dataframe-groupby>`, ``groupby``, ``join``, ``merge``, and
+similar operations that require matching up rows of a DataFrame become
+significantly more challenging in a parallel setting than they are in
+serial.
 Dask-cuDF has the same challenges; however, for certain groupby
+operations, we can take advantage of functionality in cuDF that allows
+us to compute multiple aggregations at once. There are therefore two
+interfaces to grouping in Dask-cuDF: the general
+:meth:`DataFrame.groupby`, which returns a
+:class:`.CudfDataFrameGroupBy` object, and a specialized
+:func:`.groupby_agg`. Generally speaking, you should not need to call
+:func:`.groupby_agg` directly, since Dask-cuDF will arrange to call it
+if possible.
+
+.. autoclass:: dask_cudf.groupby.CudfDataFrameGroupBy
+    :members:
+    :inherited-members:
+    :show-inheritance:
+
+.. autofunction:: dask_cudf.groupby_agg
+
+
+Dask Collections
+================
+
+The core distributed objects provided by Dask-cuDF are the
+:class:`.DataFrame` and :class:`.Series`. These inherit respectively
+from :class:`dask.dataframe.DataFrame` and
+:class:`dask.dataframe.Series`, and so the API is essentially
+identical. The full API is provided below.
+
+.. autoclass:: dask_cudf.DataFrame
+    :members:
+    :inherited-members:
+    :show-inheritance:
+
+.. autoclass:: dask_cudf.Series
+    :members:
+    :inherited-members:
+    :show-inheritance:
diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py
index 750ff2f57bc..acf2ecc8bfe 100644
--- a/docs/dask_cudf/source/conf.py
+++ b/docs/dask_cudf/source/conf.py
@@ -23,12 +23,10 @@
 extensions = [
     "sphinx.ext.intersphinx",
     "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
     "sphinx_copybutton",
     "numpydoc",
     "IPython.sphinxext.ipython_console_highlighting",
     "IPython.sphinxext.ipython_directive",
-    "PandasCompat",
     "myst_nb",
 ]
 
@@ -46,7 +44,6 @@
 html_logo = "_static/RAPIDS-logo-purple.png"
 htmlhelp_basename = "dask-cudfdoc"
 
-html_theme = "alabaster"
 html_static_path = ["_static"]
 
 pygments_style = "sphinx"
@@ -66,6 +63,7 @@
     "numpy": ("https://numpy.org/doc/stable", None),
     "pyarrow": ("https://arrow.apache.org/docs/", None),
     "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None),
+    "dask": ("https://docs.dask.org/en/stable/", None),
 }
 
 numpydoc_show_inherited_class_members = True
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 5ccbc7f0d5f..b38034a9453 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -1,4 +1,4 @@
-.. dask-cudf documentation master file, created by
+.. dask-cudf documentation coordinating file, created by
    sphinx-quickstart on Mon Feb 6 18:48:11 2023.
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
@@ -6,10 +6,26 @@
 Welcome to dask-cudf's documentation!
 =====================================
 
+Dask-cuDF is an extension library for the `Dask <https://dask.org>`__
+parallel computing framework that provides a `cuDF
+<https://docs.rapids.ai/api/cudf/stable/>`__-backed distributed
+dataframe with the same API as `Dask dataframes
+<https://docs.dask.org/en/stable/dataframe.html>`__.
+
+If you are familiar with Dask and `pandas <https://pandas.pydata.org>`__ or
+`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask-cuDF
+should feel familiar to you. If not, we recommend starting with `10
+minutes to Dask
+<https://docs.dask.org/en/stable/10-minutes-to-dask.html>`__ followed
+by `10 minutes to cuDF and Dask-cuDF
+<https://docs.rapids.ai/api/cudf/stable/user_guide/10min.html>`__.
+
+
 .. 
toctree:: :maxdepth: 2 :caption: Contents: + api Indices and tables From 20641450b49cd25c18f8aa7978f7f538e0d81f95 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 7 Feb 2023 18:13:04 +0000 Subject: [PATCH 3/9] Improve docstrings for public API --- python/dask_cudf/dask_cudf/core.py | 40 +++++++++++++- python/dask_cudf/dask_cudf/groupby.py | 67 ++++++++++++++++++------ python/dask_cudf/dask_cudf/io/csv.py | 27 ++++++---- python/dask_cudf/dask_cudf/io/json.py | 28 +++++----- python/dask_cudf/dask_cudf/io/orc.py | 61 ++++++++++++--------- python/dask_cudf/dask_cudf/io/parquet.py | 12 +++-- 6 files changed, 163 insertions(+), 72 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 60bbe5d9571..d2858876fcd 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,6 +1,7 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import math +import textwrap import warnings import numpy as np @@ -68,6 +69,18 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. + + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. + + """ + _partition_type = cudf.DataFrame @_dask_cudf_nvtx_annotate @@ -671,12 +684,35 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from_cudf.__doc__ = ( - "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ + textwrap.dedent( + """ + Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. + + This function is a thin wrapper around + :func:`dask.dataframe.from_pandas`, accepting the same + arguments (described below) excepting that it operates on cuDF + rather than pandas objects.\n + """ + ) + + textwrap.dedent(dd.from_pandas.__doc__) ) @_dask_cudf_nvtx_annotate def from_dask_dataframe(df): + """ + Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF + one. + + Parameters + ---------- + df : dask.dataframe.DataFrame + The Dask dataframe to convert + + Returns + ------- + dask_cudf.DataFrame : A new Dask collection backed by cuDF objects + """ return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f91738bdab0..f4bbcaf4dd1 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from functools import wraps from typing import Set @@ -433,22 +433,55 @@ def groupby_agg( ): """Optimized groupby aggregation for Dask-CuDF. - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + Parameters + ---------- + ddf : DataFrame + DataFrame object to perform grouping on. + gb_cols : str or list[str] + Column names to group by. + aggs_in : str, list, or dict + Aggregations to perform. + split_every : int (optional) + How to group intermediate aggregates. 
+ dropna : bool + Drop grouping key values corresponding to NA values. + as_index : bool + Currently ignored. + sort : bool + Sort the group keys, better performance is obtained when + not sorting. + shuffle : str (optional) + Control how shuffling of the DataFrame is performed. + sep : str + Internal usage. + + + Notes + ----- + This "optimized" approach is more performant than the algorithm in + implemented in :meth:`DataFrame.apply` because it allows the cuDF + backend to perform multiple aggregations at once. + + This aggregation algorithm only supports the following options + + * "collect" + * "count" + * "first" + * "last" + * "max" + * "mean" + * "min" + * "std" + * "sum" + * "var" + + + See Also + -------- + DataFrame.groupby : generic groupby of a DataFrame + dask.dataframe.apply_concat_apply : for more description of the + split_every argument. + """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index b4d080fd182..fd27083bbf4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -16,9 +16,10 @@ def read_csv(path, blocksize="default", **kwargs): """ - Read CSV files into a dask_cudf.DataFrame + Read CSV files into a :class:`.DataFrame`. - This API parallelizes the ``cudf.read_csv`` function in the following ways: + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: It supports loading many files at once using globstrings: @@ -34,23 +35,26 @@ def read_csv(path, blocksize="default", **kwargs): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - Internally ``dask_cudf.read_csv`` uses ``cudf.read_csv`` and supports - many of the same keyword arguments with the same performance guarantees. - See the docstring for ``cudf.read_csv()`` for more information on available + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available keyword arguments. Parameters ---------- path : str, path object, or file-like object - Either a path to a file (a str, pathlib.Path, or - py._path.local.LocalPath), URL (including http, ftp, and S3 locations), - or any object with a read() method (such as builtin open() file - handler function or StringIO). + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" - The target task partition size. If `None`, a single block + The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to ``cudf.read_csv``. + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. 
Examples -------- @@ -61,6 +65,7 @@ def read_csv(path, blocksize="default", **kwargs): 0 1 hi 1 2 hello 2 3 ai + """ # Handle `chunksize` deprecation diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index bb3d0f3c601..2a6ad603414 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -10,30 +10,33 @@ def read_json(url_path, engine="auto", **kwargs): - """Create a dask_cudf DataFrame collection from JSON data + """Read JSON data into a :class:`.DataFrame`. - This function wraps ``dask.dataframe.read_json``, and passes + This function wraps :func:`dask.dataframe.read_json`, and passes ``engine=partial(cudf.read_json, engine="auto")`` by default. Parameters ---------- - url_path: str, list of str + url_path : str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. engine : str or Callable, default "auto" - If str, this value will be used as the ``engine`` argument when - ``cudf.read_json`` is used to create each partition. If Callable, - this value will be used as the underlying function used to create - each partition from JSON data. The default value is "auto", so - that ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to ``dask.dataframe.read_json`` by default. + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~typing.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + **kwargs : - Key-word arguments to pass through to ``dask.dataframe.read_json``. + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. Returns ------- - dask_cudf.DataFrame + :class:`.DataFrame` Examples -------- @@ -53,7 +56,8 @@ def read_json(url_path, engine="auto", **kwargs): See Also -------- - dask.dataframe.io.json.read_json + dask.dataframe.read_json + """ # TODO: Add optimized code path to leverage the diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index e731057ed90..49fea0d7602 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -25,37 +25,45 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read cudf dataframe from ORC file(s). + """Read ORC files into a :class:`.DataFrame`. Note that this function is mostly borrowed from upstream Dask. Parameters ---------- - path: str or list(str) + path : str or list[str] Location of file(s), which can be a full URL with protocol specifier, and may include glob character if a single string. - columns: None or list(str) + columns : None or list[str] Columns to load. If None, loads all. filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row - groups that do not match the given filter predicate are not read. 
The - predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the outermost list combines - these filters as a disjunction (OR). Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. - storage_options: None or dict + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict Further parameters to pass to the bytes backend. + See Also + -------- + dask.dataframe.read_orc + Returns ------- - cudf.DataFrame + dask_cudf.DataFrame + """ storage_options = storage_options or {} @@ -133,22 +141,25 @@ def to_orc( compute=True, **kwargs, ): - """Write a dask_cudf dataframe to ORC file(s) (one file per partition). + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). Parameters ---------- - df : dask_cudf.DataFrame - path: string or pathlib.Path + df : DataFrame + path : str or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. write_index : boolean, optional Whether or not to write the index. Defaults to True. - storage_options: None or dict + storage_options : None or dict Further parameters to pass to the bytes backend. compression : string or dict, optional compute : bool, optional - If True (default) then the result is computed immediately. If False - then a ``dask.delayed`` object is returned for future computation. + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. + """ from dask import compute as dask_compute, delayed diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 1e3ff63ce76..55058190b16 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -411,13 +411,14 @@ def set_object_dtypes_from_pa_schema(df, schema): def read_parquet(path, columns=None, **kwargs): - """Read parquet files into a Dask DataFrame + """ + Read parquet files into a :class:`.DataFrame`. - Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` - to coordinate the execution of ``cudf.read_parquet``, and to - ultimately create a ``dask_cudf.DataFrame`` collection. 
+ Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. - See the ``dask.dataframe.read_parquet`` documentation for + See the :func:`dask.dataframe.read_parquet` documentation for all available options. Examples @@ -442,6 +443,7 @@ def read_parquet(path, columns=None, **kwargs): See Also -------- cudf.read_parquet + dask.dataframe.read_parquet """ if isinstance(columns, str): columns = [columns] From 64595114faa96cd12da61c7db77f6dd6f29a127b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 7 Feb 2023 18:16:50 +0000 Subject: [PATCH 4/9] Link to Dask-CUDA on landing page --- docs/dask_cudf/source/index.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index b38034a9453..30f535d6f93 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -20,6 +20,10 @@ minutes to Dask by `10 minutes to cuDF and Dask-cuDF `__. +When running on multi-GPU systems, `Dask-CUDA +`__ is recommended to +simplify the setup of the cluster, taking advantage of all features of +the GPU and networking hardware. .. toctree:: :maxdepth: 2 From 18c00e2e8da55f8ca5517c4e1f38d343c50ed04d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 7 Feb 2023 18:22:23 +0000 Subject: [PATCH 5/9] Pandas intersphinx --- docs/dask_cudf/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index acf2ecc8bfe..2ad79c32854 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -64,6 +64,7 @@ "pyarrow": ("https://arrow.apache.org/docs/", None), "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), "dask": ("https://docs.dask.org/en/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), } numpydoc_show_inherited_class_members = True From cdf6e19e269be76841efff51f06171575da1743d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Feb 2023 12:27:19 +0000 Subject: [PATCH 6/9] More prose --- docs/dask_cudf/source/api.rst | 12 +++++- docs/dask_cudf/source/index.rst | 74 ++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst index 9fa83ec846b..893f5dd7434 100644 --- a/docs/dask_cudf/source/api.rst +++ b/docs/dask_cudf/source/api.rst @@ -26,6 +26,14 @@ data reading facilities, followed by calling read_text, read_parquet +.. warning:: + + FIXME: where should the following live? + + .. autofunction:: dask_cudf.concat + + .. autofunction:: dask_cudf.from_delayed + Grouping ======== @@ -51,8 +59,8 @@ if possible. .. autofunction:: dask_cudf.groupby_agg -Dask Collections -================ +DataFrames and Series +===================== The core distributed objects provided by Dask-cuDF are the :class:`.DataFrame` and :class:`.Series`. These inherit respectively diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 30f535d6f93..66aedcbbd48 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -25,9 +25,81 @@ When running on multi-GPU systems, `Dask-CUDA simplify the setup of the cluster, taking advantage of all features of the GPU and networking hardware. +Using Dask-cuDF +--------------- + +When installed, Dask-cuDF registers itself as a dataframe backend for +Dask. 
This means that in many cases, using cuDF-backed dataframes requires
+only small changes to an existing workflow. The minimal change is to
+select cuDF as the dataframe backend in :doc:`Dask's
+configuration <dask:configuration>`. To do so, we must set the option
+``dataframe.backend`` to ``cudf``. From Python, this can be achieved
+like so::
+
+   import dask
+
+   dask.config.set({"dataframe.backend": "cudf"})
+
+Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the
+environment before running your code.
+
+Dataframe creation from on-disk formats
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your workflow creates Dask dataframes from on-disk formats
+(for example using :func:`dask.dataframe.read_parquet`), then setting
+the backend may well be enough to migrate your workflow.
+
+For example, consider reading a dataframe from parquet::
+
+   import dask.dataframe as dd
+
+   # By default, we obtain a pandas-backed dataframe
+   df = dd.read_parquet("data.parquet", ...)
+
+
+To obtain a cuDF-backed dataframe, we must set the
+``dataframe.backend`` configuration option::
+
+   import dask
+   import dask.dataframe as dd
+
+   dask.config.set({"dataframe.backend": "cudf"})
+   # This gives us a cuDF-backed dataframe
+   df = dd.read_parquet("data.parquet", ...)
+
+This code will use cuDF's GPU-accelerated :func:`parquet reader
+<cudf.read_parquet>` to read partitions of the data.
+
+Dataframe creation from in-memory formats
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you already have a dataframe in memory and want to convert it to a
+cuDF-backend one, there are two options depending on whether the
+dataframe is already a Dask one or not. If you have a Dask dataframe,
+then :func:`dask_cudf.from_dask_dataframe` will convert for you; if
+you have a pandas dataframe then you can either call
+:func:`dask.dataframe.from_pandas` followed by
+:func:`~dask_cudf.from_dask_dataframe` or first convert the dataframe
+with :func:`cudf.from_pandas` and then parallelise this with
+:func:`dask_cudf.from_cudf`.
+
+API Reference
+-------------
+
+Generally speaking, Dask-cuDF tries to offer exactly the same API as
+Dask itself. There are, however, some minor differences, mostly because
+cuDF does not perfectly mirror the pandas API, or because cuDF provides
+additional configuration flags (these mostly occur in data reading and
+writing interfaces).
+
+As a result, straightforward workflows can be migrated without too
+much trouble, but more complex ones that utilise more features may
+need a bit of tweaking. The API documentation describes details of the
+differences and all functionality that Dask-cuDF supports.
 
 .. 
toctree:: :maxdepth: 2 - :caption: Contents: api From bb0e896337eb1f1fe05ed83b857d1e6f39d04b9e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Feb 2023 12:31:44 +0000 Subject: [PATCH 7/9] Add cudf.from_pandas to API docs list --- docs/cudf/source/api_docs/general_functions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 40e1b766dc9..d7034423b26 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -25,6 +25,7 @@ Top-level conversions cudf.to_numeric cudf.from_dlpack + cudf.from_pandas Top-level dealing with datetimelike ----------------------------------- From 3dcca4b7702be90e253b9b52ed445983c52b3a86 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Feb 2023 16:11:07 +0000 Subject: [PATCH 8/9] Minor config changes --- docs/dask_cudf/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 2ad79c32854..1341e7fd9e7 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -33,6 +33,8 @@ templates_path = ["_templates"] exclude_patterns = [] +copybutton_prompt_text = ">>> " + # Enable automatic generation of systematic, namespaced labels for sections myst_heading_anchors = 2 @@ -43,6 +45,7 @@ html_theme = "pydata_sphinx_theme" html_logo = "_static/RAPIDS-logo-purple.png" htmlhelp_basename = "dask-cudfdoc" +html_use_modindex = True html_static_path = ["_static"] From 3334addaeb16336e5d2ad153d5ddb2060d700272 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 20 Mar 2023 17:53:45 +0000 Subject: [PATCH 9/9] Mention dask.dataframe.to_backend --- docs/dask_cudf/source/index.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 66aedcbbd48..0442ab0929a 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -77,11 +77,11 @@ Dataframe creation from in-memory formats If you already have a dataframe in memory and want to convert it to a cuDF-backend one, there are two options depending on whether the dataframe is already a Dask one or not. If you have a Dask dataframe, -then :func:`dask_cudf.from_dask_dataframe` will convert for you; if -you have a pandas dataframe then you can either call -:func:`dask.dataframe.from_pandas` followed by -:func:`~dask_cudf.from_dask_dataframe` or first convert the dataframe -with :func:`cudf.from_pandas` and then parallelise this with +then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` +as the backend; if you have a pandas dataframe then you can either +call :func:`dask.dataframe.from_pandas` followed by +:func:`~dask.dataframe.to_backend` or first convert the dataframe with +:func:`cudf.from_pandas` and then parallelise this with :func:`dask_cudf.from_cudf`. API Reference
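
A minimal end-to-end sketch of the conversion paths described in the new
``index.rst`` prose (an illustration only, not part of the patches above):
it assumes a CUDA-capable GPU, ``cudf`` and ``dask_cudf`` installed, and a
Dask version that provides ``DataFrame.to_backend``; the small pandas frame
below is made up purely for demonstration::

   import pandas as pd

   import dask
   import dask.dataframe as dd

   import cudf
   import dask_cudf

   # Per the new docs, selecting the cuDF backend makes creation
   # functions such as dd.read_parquet produce cuDF-backed collections.
   dask.config.set({"dataframe.backend": "cudf"})

   # A small, made-up pandas frame standing in for real data.
   pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10.0, 20.0, 30.0, 40.0]})

   # Path 1: pandas -> Dask collection -> cuDF-backed collection.
   ddf = dd.from_pandas(pdf, npartitions=2).to_backend("cudf")

   # Path 2: pandas -> cuDF -> cuDF-backed Dask collection.
   gdf = cudf.from_pandas(pdf)
   gddf = dask_cudf.from_cudf(gdf, npartitions=2)

   print(ddf.head())
   print(gddf.head())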