From 21ed2519e20f95bad680bbd8d8fc6d379e320cee Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 14:42:48 -0700 Subject: [PATCH] Use list of columns for methods in `Groupby.pyx` (#10419) Part of #10153 This PR changes the APIs in `groupby.pyx` to accept a list of columns as input, not a Frame. This change affects both keys and values. The `GroupBy` object now only stores a list of columns in the `keys` attribute, and other APIs (`groups`, `aggregate`, `shift`, `replace_nulls`) now only accept a list of columns as their value columns. The `aggregation` communication protocol has changed from a dictionary mapping column names to lists of agg names, to a list of lists of agg names. See changes in `_normalize_aggs` for details. This PR also tries to simplify post-processing of the `result` frame in the `agg` method now that we have finer control in pure Python. I attempted to rewrite `aggregate_internal` and `scan_internal`, but this proved futile because the unified aggregation object is a cdef type, which precludes moving the aggregation filtering step out of its current place. I also tried unifying aggregation and scan with a Cython fused type, but this did not work due to limitations of using fused types with C++ templated types in Cython. Overall, the performance of the `agg` call is on par with the main branch, with a performance difference of -3% to 13% depending on the agg types.
Raw Benchmark ``` ========================================================================== 36 passed in 33.48s ========================================================================== (rapids) rapids@compose:~/scratch/cudf_benchmarks$ ./compare.sh bench_groupby.py --------------------------------------------------------------- benchmark 'False-False-agg1-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg1-100] (afte) 2.5090 (1.0) 2.8418 (1.0) 2.5280 (1.0) 0.0290 (2.40) 2.5229 (1.0) 0.0103 (1.05) 15;19 273 groupby_agg[False-False-agg1-100] (befo) 2.7681 (1.10) 2.8441 (1.00) 2.7877 (1.10) 0.0121 (1.0) 2.7849 (1.10) 0.0098 (1.0) 60;26 252 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-False-agg1-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg1-10000] (afte) 2.7803 (1.0) 3.4156 (1.05) 2.8131 (1.0) 0.0548 (1.57) 2.8007 (1.0) 0.0253 (1.0) 10;12 252 groupby_agg[False-False-agg1-10000] (befo) 3.0402 (1.09) 3.2407 (1.0) 3.1571 (1.12) 0.0348 (1.0) 3.1535 (1.13) 0.0509 (2.01) 39;6 236 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
----------------------------------------------------------------- benchmark 'False-False-agg1-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg1-1000000] (afte) 13.2601 (1.0) 14.0128 (1.01) 13.4242 (1.0) 0.1056 (1.28) 13.4004 (1.0) 0.0286 (1.0) 5;8 68 groupby_agg[False-False-agg1-1000000] (befo) 13.5150 (1.02) 13.9165 (1.0) 13.6015 (1.01) 0.0826 (1.0) 13.5944 (1.01) 0.0696 (2.43) 8;5 66 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-False-agg2-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg2-100] (afte) 2.5342 (1.0) 2.8621 (1.0) 2.5591 (1.0) 0.0431 (3.18) 2.5509 (1.0) 0.0106 (1.01) 13;18 273 groupby_agg[False-False-agg2-100] (befo) 2.8797 (1.14) 2.9507 (1.03) 2.8997 (1.13) 0.0136 (1.0) 2.8965 (1.14) 0.0105 (1.0) 52;28 227 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-False-agg2-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg2-10000] (afte) 2.7922 (1.0) 3.2884 (1.0) 2.8205 (1.0) 0.0473 (1.40) 2.8118 (1.0) 0.0096 (1.0) 10;18 251 groupby_agg[False-False-agg2-10000] (befo) 3.1491 (1.13) 3.4791 (1.06) 3.1752 (1.13) 0.0338 (1.0) 3.1687 (1.13) 0.0108 (1.12) 6;17 172 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'False-False-agg2-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-agg2-1000000] (afte) 13.4699 (1.0) 14.6287 (1.0) 13.6020 (1.0) 0.1359 (1.0) 13.5769 (1.0) 0.0270 (1.0) 3;8 69 groupby_agg[False-False-agg2-1000000] (befo) 13.6079 (1.01) 29.8318 (2.04) 14.0777 (1.03) 1.9806 (14.57) 13.7795 (1.01) 0.0567 (2.10) 2;6 68 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-False-sum-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ groupby_agg[False-False-sum-100] (afte) 2.1667 (1.0) 2.2855 (1.0) 2.1831 (1.0) 0.0146 (1.49) 2.1802 (1.0) 
0.0111 (1.14) 25;16 301 groupby_agg[False-False-sum-100] (befo) 2.4142 (1.11) 2.4782 (1.08) 2.4319 (1.11) 0.0098 (1.0) 2.4309 (1.11) 0.0097 (1.0) 65;15 278 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ --------------------------------------------------------------- benchmark 'False-False-sum-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-sum-10000] (afte) 2.4293 (1.0) 2.6593 (1.0) 2.4493 (1.0) 0.0206 (1.66) 2.4455 (1.0) 0.0115 (1.10) 17;19 278 groupby_agg[False-False-sum-10000] (befo) 2.6646 (1.10) 2.7706 (1.04) 2.6832 (1.10) 0.0124 (1.0) 2.6811 (1.10) 0.0105 (1.0) 49;14 257 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ---------------------------------------------------------------- benchmark 'False-False-sum-1000000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-False-sum-1000000] (afte) 9.3678 (1.0) 21.0480 (2.07) 9.6817 (1.0) 1.2252 (16.49) 9.5286 (1.0) 0.0342 (1.28) 1;9 89 groupby_agg[False-False-sum-1000000] (befo) 9.6830 (1.03) 10.1832 (1.0) 9.7434 (1.01) 0.0743 (1.0) 9.7238 (1.02) 0.0266 (1.0) 6;6 86 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-True-agg1-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ groupby_agg[False-True-agg1-100] (afte) 2.4392 (1.0) 2.7474 (1.06) 2.4598 (1.0) 0.0287 (2.07) 2.4545 (1.0) 0.0103 (1.0) 10;17 278 groupby_agg[False-True-agg1-100] (befo) 2.5183 (1.03) 2.6017 (1.0) 2.5354 (1.03) 0.0139 (1.0) 2.5332 (1.03) 0.0134 (1.30) 51;18 268 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ --------------------------------------------------------------- benchmark 'False-True-agg1-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-agg1-10000] (afte) 2.7196 (1.0) 3.2290 (1.06) 2.7446 (1.0) 0.0462 (2.17) 2.7359 (1.0) 0.0106 (1.00) 11;17 257 groupby_agg[False-True-agg1-10000] (befo) 2.7807 (1.02) 3.0590 (1.0) 2.8039 (1.02) 0.0213 (1.0) 2.8004 (1.02) 0.0106 (1.0) 16;18 251 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'False-True-agg1-1000000': 2 tests 
----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-agg1-1000000] (afte) 13.2259 (1.01) 13.7344 (1.0) 13.3449 (1.00) 0.0797 (1.0) 13.3288 (1.00) 0.0322 (1.41) 5;8 69 groupby_agg[False-True-agg1-1000000] (befo) 13.0875 (1.0) 14.1552 (1.03) 13.3135 (1.0) 0.1325 (1.66) 13.2901 (1.0) 0.0229 (1.0) 4;7 68 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-True-agg2-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ groupby_agg[False-True-agg2-100] (afte) 2.4580 (1.0) 2.5791 (1.0) 2.4792 (1.0) 0.0174 (1.92) 2.4756 (1.0) 0.0121 (1.37) 21;14 277 groupby_agg[False-True-agg2-100] (befo) 2.6094 (1.06) 2.6686 (1.03) 2.6260 (1.06) 0.0091 (1.0) 2.6255 (1.06) 0.0088 (1.0) 66;21 264 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ --------------------------------------------------------------- benchmark 'False-True-agg2-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
groupby_agg[False-True-agg2-10000] (afte) 2.7218 (1.0) 2.8843 (1.0) 2.7415 (1.0) 0.0180 (1.0) 2.7383 (1.0) 0.0116 (1.12) 21;16 257 groupby_agg[False-True-agg2-10000] (befo) 2.8771 (1.06) 3.1227 (1.08) 2.8956 (1.06) 0.0185 (1.03) 2.8922 (1.06) 0.0104 (1.0) 16;16 244 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'False-True-agg2-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-agg2-1000000] (afte) 13.4555 (1.01) 13.7924 (1.0) 13.5244 (1.00) 0.0601 (1.0) 13.5099 (1.00) 0.0362 (1.0) 7;6 70 groupby_agg[False-True-agg2-1000000] (befo) 13.3841 (1.0) 13.9437 (1.01) 13.4948 (1.0) 0.0773 (1.29) 13.4768 (1.0) 0.0443 (1.22) 5;5 68 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-True-sum-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-sum-100] (afte) 2.1270 (1.0) 2.2397 (1.0) 2.1435 (1.0) 0.0158 (1.01) 2.1407 (1.0) 0.0105 (1.0) 27;22 302 groupby_agg[False-True-sum-100] (befo) 2.1881 (1.03) 2.3309 (1.04) 2.2048 (1.03) 0.0156 (1.0) 2.2014 (1.03) 0.0111 (1.06) 35;30 297 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-True-sum-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-sum-10000] (afte) 2.4018 (1.0) 2.6107 (1.0) 2.4183 (1.0) 0.0198 (1.16) 2.4149 (1.0) 0.0108 (1.12) 14;14 277 groupby_agg[False-True-sum-10000] (befo) 2.4406 (1.02) 2.6840 (1.03) 2.4606 (1.02) 0.0170 (1.0) 2.4585 (1.02) 0.0097 (1.0) 15;14 274 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'False-True-sum-1000000': 2 tests ---------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[False-True-sum-1000000] (afte) 9.4459 (1.01) 10.0397 (1.0) 9.4983 (1.0) 0.0706 (1.0) 9.4846 (1.0) 0.0216 (1.0) 4;6 89 groupby_agg[False-True-sum-1000000] (befo) 9.3064 (1.0) 10.2732 (1.02) 9.5150 (1.00) 0.1107 (1.57) 9.4933 (1.00) 0.0239 (1.10) 6;10 88 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ---------------------------------------------------------------- benchmark 'True-False-agg1-100': 2 tests 
--------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-agg1-100] (afte) 4.3327 (1.0) 4.4800 (1.0) 4.3504 (1.0) 0.0202 (1.0) 4.3457 (1.0) 0.0103 (1.0) 10;16 181 groupby_agg[True-False-agg1-100] (befo) 4.6486 (1.07) 12.4651 (2.78) 4.8006 (1.10) 0.7100 (35.18) 4.6664 (1.07) 0.0191 (1.86) 10;19 170 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-False-agg1-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-agg1-10000] (afte) 4.9246 (1.0) 5.1165 (1.0) 4.9491 (1.0) 0.0269 (1.0) 4.9407 (1.0) 0.0133 (1.06) 16;19 164 groupby_agg[True-False-agg1-10000] (befo) 5.2464 (1.07) 5.6002 (1.09) 5.2700 (1.06) 0.0370 (1.38) 5.2623 (1.07) 0.0126 (1.0) 10;17 154 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-False-agg1-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
groupby_agg[True-False-agg1-1000000] (afte) 36.5089 (1.00) 37.2874 (1.0) 36.8305 (1.0) 0.2321 (1.0) 36.7404 (1.0) 0.2208 (1.0) 7;5 28 groupby_agg[True-False-agg1-1000000] (befo) 36.3558 (1.0) 47.0329 (1.26) 37.7670 (1.03) 2.7313 (11.77) 36.8183 (1.00) 0.8527 (3.86) 2;3 26 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-False-agg2-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ groupby_agg[True-False-agg2-100] (afte) 4.6287 (1.0) 5.2921 (1.02) 4.6918 (1.0) 0.1017 (4.64) 4.6526 (1.0) 0.0496 (3.27) 21;23 167 groupby_agg[True-False-agg2-100] (befo) 4.9776 (1.08) 5.1737 (1.0) 5.0060 (1.07) 0.0219 (1.0) 4.9995 (1.07) 0.0152 (1.0) 18;10 161 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ --------------------------------------------------------------- benchmark 'True-False-agg2-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-agg2-10000] (afte) 5.2022 (1.0) 6.7622 (1.16) 5.2405 (1.0) 0.1267 (2.98) 5.2219 (1.0) 0.0157 (1.0) 2;16 155 groupby_agg[True-False-agg2-10000] (befo) 5.5802 (1.07) 5.8531 (1.0) 5.6166 (1.07) 0.0424 (1.0) 5.6041 (1.07) 0.0206 (1.31) 11;14 147 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-False-agg2-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-agg2-1000000] (afte) 37.9639 (1.0) 38.7598 (1.0) 38.2381 (1.0) 0.1221 (1.0) 38.2346 (1.00) 0.0583 (1.0) 2;2 27 groupby_agg[True-False-agg2-1000000] (befo) 38.0569 (1.00) 41.5735 (1.07) 38.7983 (1.01) 1.1968 (9.80) 38.1696 (1.0) 0.6344 (10.88) 5;5 26 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-False-sum-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-sum-100] (afte) 3.6893 (1.0) 4.2792 (1.03) 3.7130 (1.0) 0.0580 (4.15) 3.7022 (1.0) 0.0079 (1.0) 10;16 206 groupby_agg[True-False-sum-100] (befo) 4.0016 (1.08) 4.1370 (1.0) 4.0218 (1.08) 0.0140 (1.0) 4.0180 (1.09) 0.0097 (1.23) 27;17 188 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-False-sum-10000': 2 tests 
--------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-sum-10000] (afte) 4.2660 (1.0) 4.6651 (1.0) 4.2913 (1.0) 0.0493 (2.97) 4.2799 (1.0) 0.0097 (1.0) 10;21 185 groupby_agg[True-False-sum-10000] (befo) 4.5702 (1.07) 4.7321 (1.01) 4.5904 (1.07) 0.0166 (1.0) 4.5858 (1.07) 0.0134 (1.37) 24;8 172 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-False-sum-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-False-sum-1000000] (afte) 30.5871 (1.00) 30.9527 (1.0) 30.6797 (1.00) 0.0628 (1.0) 30.6720 (1.00) 0.0421 (1.0) 4;3 32 groupby_agg[True-False-sum-1000000] (befo) 30.5386 (1.0) 31.8930 (1.03) 30.6654 (1.0) 0.2383 (3.80) 30.6013 (1.0) 0.0573 (1.36) 1;4 31 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-True-agg1-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
groupby_agg[True-True-agg1-100] (afte) 4.2812 (1.0) 4.5815 (1.0) 4.3304 (1.0) 0.0495 (1.43) 4.3134 (1.0) 0.0647 (4.80) 22;4 173 groupby_agg[True-True-agg1-100] (befo) 4.4126 (1.03) 4.7356 (1.03) 4.4357 (1.02) 0.0348 (1.0) 4.4253 (1.03) 0.0135 (1.0) 14;18 158 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-True-agg1-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-agg1-10000] (afte) 4.8505 (1.0) 5.3411 (1.0) 4.8882 (1.0) 0.0596 (1.49) 4.8693 (1.0) 0.0240 (1.41) 12;15 166 groupby_agg[True-True-agg1-10000] (befo) 4.9857 (1.03) 5.3869 (1.01) 5.0191 (1.03) 0.0399 (1.0) 5.0089 (1.03) 0.0170 (1.0) 9;15 160 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-True-agg1-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-agg1-1000000] (afte) 36.5387 (1.01) 55.8017 (1.52) 37.3622 (1.03) 3.6965 (48.22) 36.5756 (1.00) 0.0882 (2.75) 1;3 27 groupby_agg[True-True-agg1-1000000] (befo) 36.3456 (1.0) 36.7584 (1.0) 36.4209 (1.0) 0.0767 (1.0) 36.4014 (1.0) 0.0320 (1.0) 1;4 27 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-True-agg2-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-agg2-100] (afte) 4.5713 (1.0) 5.1548 (1.06) 4.6064 (1.0) 0.0621 (4.49) 4.5886 (1.0) 0.0203 (1.51) 13;22 170 groupby_agg[True-True-agg2-100] (befo) 4.7628 (1.04) 4.8752 (1.0) 4.7832 (1.04) 0.0138 (1.0) 4.7795 (1.04) 0.0134 (1.0) 29;9 167 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-True-agg2-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-agg2-10000] (afte) 5.1343 (1.0) 5.4159 (1.0) 5.1769 (1.0) 0.0517 (1.36) 5.1590 (1.0) 0.0179 (1.21) 16;22 157 groupby_agg[True-True-agg2-10000] (befo) 5.3567 (1.04) 5.6432 (1.04) 5.3858 (1.04) 0.0379 (1.0) 5.3785 (1.04) 0.0147 (1.0) 7;12 152 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-True-agg2-1000000': 2 tests 
----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-agg2-1000000] (afte) 38.0357 (1.00) 38.2935 (1.00) 38.1159 (1.00) 0.0597 (1.0) 38.1014 (1.00) 0.0846 (1.0) 6;1 27 groupby_agg[True-True-agg2-1000000] (befo) 37.9134 (1.0) 38.2851 (1.0) 38.0201 (1.0) 0.0929 (1.55) 37.9944 (1.0) 0.1066 (1.26) 7;1 26 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- --------------------------------------------------------------- benchmark 'True-True-sum-100': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- groupby_agg[True-True-sum-100] (afte) 3.7452 (1.0) 4.0287 (1.0) 3.8009 (1.0) 0.0408 (1.0) 3.7968 (1.0) 0.0503 (1.0) 29;3 131 groupby_agg[True-True-sum-100] (befo) 3.8752 (1.03) 4.4384 (1.10) 3.9316 (1.03) 0.0608 (1.49) 3.9265 (1.03) 0.0504 (1.00) 4;3 148 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- ---------------------------------------------------------------- benchmark 'True-True-sum-10000': 2 tests --------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
groupby_agg[True-True-sum-10000] (afte) 4.4442 (1.0) 11.3511 (2.35) 4.5582 (1.0) 0.5829 (24.78) 4.4741 (1.0) 0.0323 (2.85) 3;19 171 groupby_agg[True-True-sum-10000] (befo) 4.5676 (1.03) 4.8264 (1.0) 4.5913 (1.01) 0.0235 (1.0) 4.5871 (1.03) 0.0114 (1.0) 15;16 168 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------- benchmark 'True-True-sum-1000000': 2 tests ----------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ groupby_agg[True-True-sum-1000000] (afte) 30.5326 (1.00) 33.6395 (1.02) 31.2355 (1.0) 0.9563 (1.20) 30.6933 (1.0) 0.9663 (1.0) 5;3 30 groupby_agg[True-True-sum-1000000] (befo) 30.4080 (1.0) 33.0341 (1.0) 31.2527 (1.00) 0.7946 (1.0) 30.9808 (1.01) 1.2781 (1.32) 11;0 30 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ ```
[Benchmark code](https://github.com/isVoid/cudf_benchmarks/blob/9d9644eaa5301df7894c2fe4b1ba317396240518/bench_groupby.py#L23-L42) Authors: - Michael Wang (https://github.com/isVoid) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10419 --- python/cudf/cudf/_lib/groupby.pyx | 222 +++++++++++------------ python/cudf/cudf/_typing.py | 10 +- python/cudf/cudf/core/frame.py | 5 +- python/cudf/cudf/core/groupby/groupby.py | 211 +++++++++++---------- 4 files changed, 221 insertions(+), 227 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 49a924c9104..48f566b846d 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -26,7 +26,12 @@ import cudf from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) from cudf._lib.scalar import as_device_scalar @@ -46,7 +51,6 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.utilities.host_span cimport host_span -from cudf._lib.utils cimport data_from_unique_ptr # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. 
@@ -62,11 +66,39 @@ _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar + +cdef _agg_result_from_columns( + vector[libcudf_groupby.aggregation_result]& c_result_columns, + set column_included, + int n_input_columns +): + """Construct the list of result columns from libcudf result. The result + contains the same number of lists as the number of input columns. Result + for an input column that has no applicable aggregations is an empty list. + """ + cdef: + int i + int j + int result_index = 0 + vector[unique_ptr[column]]* c_result + result_columns = [] + for i in range(n_input_columns): + if i in column_included: + c_result = &c_result_columns[result_index].results + result_columns.append([ + Column.from_unique_ptr(move(c_result[0][j])) + for j in range(c_result[0].size()) + ]) + result_index += 1 + else: + result_columns.append([]) + return result_columns + cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj cdef dict __dict__ - def __cinit__(self, keys, bool dropna=True, *args, **kwargs): + def __cinit__(self, list keys, bool dropna=True, *args, **kwargs): cdef libcudf_types.null_policy c_null_handling if dropna: @@ -74,7 +106,7 @@ cdef class GroupBy: else: c_null_handling = libcudf_types.null_policy.INCLUDE - cdef table_view keys_view = table_view_from_table(keys) + cdef table_view keys_view = table_view_from_columns(keys) with nogil: self.c_obj.reset( @@ -84,46 +116,42 @@ cdef class GroupBy: ) ) - def __init__(self, keys, bool dropna=True): + def __init__(self, list keys, bool dropna=True): self.keys = keys self.dropna = dropna - def groups(self, values): - - cdef table_view values_view = table_view_from_table(values) + def groups(self, list values): + cdef table_view values_view = table_view_from_columns(values) with nogil: c_groups = move(self.c_obj.get()[0].get_groups(values_view)) - c_grouped_keys = move(c_groups.keys) - 
c_grouped_values = move(c_groups.values) - c_group_offsets = c_groups.offsets - - grouped_keys = cudf.core.index._index_from_data( - *data_from_unique_ptr( - move(c_grouped_keys), - column_names=range(c_grouped_keys.get()[0].num_columns()) - ) - ) - grouped_values = data_from_unique_ptr( - move(c_grouped_values), - index_names=values._index_names, - column_names=values._column_names - ) - return grouped_keys, grouped_values, c_group_offsets + grouped_key_cols = columns_from_unique_ptr(move(c_groups.keys)) + grouped_value_cols = columns_from_unique_ptr(move(c_groups.values)) + return grouped_key_cols, grouped_value_cols, c_groups.offsets def aggregate_internal(self, values, aggregations): - from cudf.core.column_accessor import ColumnAccessor + """`values` is a list of columns and `aggregations` is a list of list + of aggregations. `aggregations[i]` is a list of aggregations for + `values[i]`. Returns a tuple containing 1) list of list of aggregation + results, 2) a list of grouped keys, and 3) a list of list of + aggregations performed. 
+ """ cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef libcudf_groupby.aggregation_request c_agg_request cdef Column col cdef GroupbyAggregation agg_obj - allow_empty = all(len(v) == 0 for v in aggregations.values()) + cdef pair[ + unique_ptr[table], + vector[libcudf_groupby.aggregation_result] + ] c_result - included_aggregations = defaultdict(list) - for i, (col_name, aggs) in enumerate(aggregations.items()): - col = values._data[col_name] + allow_empty = all(len(v) == 0 for v in aggregations) + + included_aggregations = [] + column_included = set() + for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype valid_aggregations = ( @@ -135,36 +163,27 @@ cdef class GroupBy: else _DECIMAL_AGGS if is_decimal_dtype(dtype) else "ALL" ) - if (valid_aggregations is _DECIMAL_AGGS - and rmm._cuda.gpu.runtimeGetVersion() < 11000): - raise RuntimeError( - "Decimal aggregations are only supported on CUDA >= 11 " - "due to an nvcc compiler bug." - ) + included_aggregations_i = [] c_agg_request = move(libcudf_groupby.aggregation_request()) for agg in aggs: agg_obj = make_groupby_aggregation(agg) if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): - included_aggregations[col_name].append(agg) + included_aggregations_i.append(agg) c_agg_request.aggregations.push_back( move(agg_obj.c_obj) ) + included_aggregations.append(included_aggregations_i) if not c_agg_request.aggregations.empty(): c_agg_request.values = col.view() c_agg_requests.push_back( move(c_agg_request) ) - + column_included.add(i) if c_agg_requests.empty() and not allow_empty: raise DataError("All requested aggregations are unsupported.") - cdef pair[ - unique_ptr[table], - vector[libcudf_groupby.aggregation_result] - ] c_result - with nogil: c_result = move( self.c_obj.get()[0].aggregate( @@ -172,37 +191,38 @@ cdef class GroupBy: ) ) - grouped_keys, _ = data_from_unique_ptr( - move(c_result.first), - column_names=self.keys._column_names + grouped_keys = 
columns_from_unique_ptr( + move(c_result.first) ) - result_data = ColumnAccessor(multiindex=True) - # Note: This loop relies on the included_aggregations dict being - # insertion ordered to map results to requested aggregations by index. - for i, col_name in enumerate(included_aggregations): - for j, agg_name in enumerate(included_aggregations[col_name]): - if callable(agg_name): - agg_name = agg_name.__name__ - result_data[(col_name, agg_name)] = ( - Column.from_unique_ptr(move(c_result.second[i].results[j])) - ) + result_columns = _agg_result_from_columns( + c_result.second, column_included, len(values) + ) - return result_data, cudf.core.index._index_from_data( - grouped_keys) + return result_columns, grouped_keys, included_aggregations def scan_internal(self, values, aggregations): - from cudf.core.column_accessor import ColumnAccessor + """`values` is a list of columns and `aggregations` is a list of list + of aggregations. `aggregations[i]` is a list of aggregations for + `values[i]`. Returns a tuple containing 1) list of list of aggregation + results, 2) a list of grouped keys, and 3) a list of list of + aggregations performed. 
+ """ cdef vector[libcudf_groupby.scan_request] c_agg_requests cdef libcudf_groupby.scan_request c_agg_request cdef Column col cdef GroupbyScanAggregation agg_obj - allow_empty = all(len(v) == 0 for v in aggregations.values()) + cdef pair[ + unique_ptr[table], + vector[libcudf_groupby.aggregation_result] + ] c_result + + allow_empty = all(len(v) == 0 for v in aggregations) - included_aggregations = defaultdict(list) - for i, (col_name, aggs) in enumerate(aggregations.items()): - col = values._data[col_name] + included_aggregations = [] + column_included = set() + for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype valid_aggregations = ( @@ -214,36 +234,27 @@ cdef class GroupBy: else _DECIMAL_AGGS if is_decimal_dtype(dtype) else "ALL" ) - if (valid_aggregations is _DECIMAL_AGGS - and rmm._cuda.gpu.runtimeGetVersion() < 11000): - raise RuntimeError( - "Decimal aggregations are only supported on CUDA >= 11 " - "due to an nvcc compiler bug." - ) + included_aggregations_i = [] c_agg_request = move(libcudf_groupby.scan_request()) for agg in aggs: agg_obj = make_groupby_scan_aggregation(agg) if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): - included_aggregations[col_name].append(agg) + included_aggregations_i.append(agg) c_agg_request.aggregations.push_back( move(agg_obj.c_obj) ) + included_aggregations.append(included_aggregations_i) if not c_agg_request.aggregations.empty(): c_agg_request.values = col.view() c_agg_requests.push_back( move(c_agg_request) ) - + column_included.add(i) if c_agg_requests.empty() and not allow_empty: raise DataError("All requested aggregations are unsupported.") - cdef pair[ - unique_ptr[table], - vector[libcudf_groupby.aggregation_result] - ] c_result - with nogil: c_result = move( self.c_obj.get()[0].scan( @@ -251,24 +262,15 @@ cdef class GroupBy: ) ) - grouped_keys, _ = data_from_unique_ptr( - move(c_result.first), - column_names=self.keys._column_names + grouped_keys = 
columns_from_unique_ptr( + move(c_result.first) ) - result_data = ColumnAccessor(multiindex=True) - # Note: This loop relies on the included_aggregations dict being - # insertion ordered to map results to requested aggregations by index. - for i, col_name in enumerate(included_aggregations): - for j, agg_name in enumerate(included_aggregations[col_name]): - if callable(agg_name): - agg_name = agg_name.__name__ - result_data[(col_name, agg_name)] = ( - Column.from_unique_ptr(move(c_result.second[i].results[j])) - ) + result_columns = _agg_result_from_columns( + c_result.second, column_included, len(values) + ) - return result_data, cudf.core.index._index_from_data( - grouped_keys) + return result_columns, grouped_keys, included_aggregations def aggregate(self, values, aggregations): """ @@ -292,8 +294,8 @@ cdef class GroupBy: return self.aggregate_internal(values, aggregations) - def shift(self, values, int periods, list fill_values): - cdef table_view view = table_view_from_table(values) + def shift(self, list values, int periods, list fill_values): + cdef table_view view = table_view_from_columns(values) cdef size_type num_col = view.num_columns() cdef vector[size_type] offsets = vector[size_type](num_col, periods) @@ -301,7 +303,7 @@ cdef class GroupBy: cdef DeviceScalar d_slr d_slrs = [] c_fill_values.reserve(num_col) - for val, col in zip(fill_values, values._columns): + for val, col in zip(fill_values, values): d_slr = as_device_scalar(val, dtype=col.dtype) d_slrs.append(d_slr) c_fill_values.push_back( @@ -315,21 +317,13 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = cudf.core.index._index_from_data( - *data_from_unique_ptr( - move(c_result.first), - column_names=self.keys._column_names - ) - ) - - shifted, _ = data_from_unique_ptr( - move(c_result.second), column_names=values._column_names - ) + grouped_keys = columns_from_unique_ptr(move(c_result.first)) + shifted = 
columns_from_unique_ptr(move(c_result.second)) return shifted, grouped_keys - def replace_nulls(self, values, object method): - cdef table_view val_view = table_view_from_table(values) + def replace_nulls(self, list values, object method): + cdef table_view val_view = table_view_from_columns(values) cdef pair[unique_ptr[table], unique_ptr[table]] c_result cdef replace_policy policy = ( replace_policy.PRECEDING @@ -344,15 +338,13 @@ cdef class GroupBy: self.c_obj.get()[0].replace_nulls(val_view, policies) ) - return data_from_unique_ptr( - move(c_result.second), column_names=values._column_names - )[0] + return columns_from_unique_ptr(move(c_result.second)) _GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} -def _is_all_scan_aggregate(aggs): +def _is_all_scan_aggregate(all_aggs): """ Returns true if all are scan aggregations. Raises @@ -365,16 +357,12 @@ def _is_all_scan_aggregate(aggs): return agg.__name__ if callable(agg) else agg all_scan = all( - all( - get_name(agg_name) in _GROUPBY_SCANS for agg_name in aggs[col_name] - ) - for col_name in aggs + get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs + for agg_name in aggs ) any_scan = any( - any( - get_name(agg_name) in _GROUPBY_SCANS for agg_name in aggs[col_name] - ) - for col_name in aggs + get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs + for agg_name in aggs ) if not all_scan and any_scan: diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 793a5d1d9e8..ca2024929f3 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,6 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
-from typing import TYPE_CHECKING, Any, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -32,3 +32,9 @@ SeriesOrSingleColumnIndex = Union[ "cudf.Series", "cudf.core.index.GenericIndex" ] + +# Groupby aggregation +AggType = Union[str, Callable] +MultiColumnAggType = Union[ + AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a847c0b5d3b..2802009b848 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -229,14 +229,17 @@ def _from_columns( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: abc.Iterable[str], + column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, ): """Construct a `Frame` from a list of columns with metadata from self. + If `column_names` is None, use column names from self. If `index_names` is set, the first `len(index_names)` columns are used to construct the index of the frame. 
""" + if column_names is None: + column_names = self._column_names frame = self.__class__._from_columns( columns, column_names, index_names ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a1a4596ba45..0c274911f3d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -5,16 +5,18 @@ import pickle import warnings from functools import cached_property +from typing import Any, Iterable, List, Tuple, Union import numpy as np import cudf from cudf._lib import groupby as libgroupby from cudf._lib.reshape import interleave_columns -from cudf._typing import DataFrameOrSeries +from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import arange, as_column +from cudf.core.column.column import ColumnBase, arange, as_column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate @@ -37,6 +39,8 @@ def _quantile_75(x): class GroupBy(Serializable, Reducible, Scannable): + obj: "cudf.core.indexed_frame.IndexedFrame" + _VALID_REDUCTIONS = { "sum", "prod", @@ -107,6 +111,7 @@ def __init__( self._dropna = dropna if isinstance(by, _Grouping): + by._obj = self.obj self.grouping = by else: self.grouping = _Grouping(obj, by, level) @@ -204,7 +209,9 @@ def cumcount(self): @cached_property def _groupby(self): - return libgroupby.GroupBy(self.grouping.keys, dropna=self._dropna) + return libgroupby.GroupBy( + [*self.grouping.keys._columns], dropna=self._dropna + ) @_cudf_nvtx_annotate def agg(self, func): @@ -274,55 +281,48 @@ def agg(self, func): 1 1.5 1.75 2.0 2.0 2 3.0 3.00 1.0 1.0 """ - normalized_aggs = self._normalize_aggs(func) + column_names, columns, normalized_aggs = self._normalize_aggs(func) # 
Note: When there are no key columns, the below produces # a Float64Index, while Pandas returns an Int64Index # (GH: 6945) - result = cudf.DataFrame._from_data( - *self._groupby.aggregate(self.obj, normalized_aggs) + ( + result_columns, + grouped_key_cols, + included_aggregations, + ) = self._groupby.aggregate(columns, normalized_aggs) + + result_index = self.grouping.keys._from_columns_like_self( + grouped_key_cols, ) + multilevel = _is_multi_agg(func) + data = {} + for col_name, aggs, cols in zip( + column_names, included_aggregations, result_columns + ): + for agg, col in zip(aggs, cols): + if multilevel: + agg_name = agg.__name__ if callable(agg) else agg + key = (col_name, agg_name) + else: + key = col_name + data[key] = col + data = ColumnAccessor(data, multiindex=multilevel) + if not multilevel: + data = data.rename_levels({np.nan: None}, level=0) + result = cudf.DataFrame._from_data(data, index=result_index) + if self._sort: result = result.sort_index() - if not _is_multi_agg(func): - if result._data.nlevels <= 1: # 0 or 1 levels - # make sure it's a flat index: - result._data.multiindex = False - - if result._data.nlevels > 1: - result._data.droplevel(-1) - - # if, after dropping the last level, the only - # remaining key is `NaN`, we need to convert to `None` - # for Pandas compat: - if result._data.names == (np.nan,): - result._data = result._data.rename_levels( - {np.nan: None}, level=0 - ) + if not self._as_index: + result = result.reset_index() if libgroupby._is_all_scan_aggregate(normalized_aggs): # Scan aggregations return rows in original index order return self._mimic_pandas_order(result) - # set index names to be group key names - if len(result): - result.index.names = self.grouping.names - - # copy categorical information from keys to the result index: - result.index._copy_type_metadata(self.grouping.keys) - result._index = cudf.Index(result._index) - - if not self._as_index: - for col_name in reversed(self.grouping._named_columns): - 
result._insert( - 0, - col_name, - result.index.get_level_values(col_name)._values, - ) - result.index = cudf.core.index.RangeIndex(len(result)) - return result def _reduce( @@ -417,43 +417,50 @@ def deserialize(cls, header, frames): return cls(obj, grouping, **kwargs) def _grouped(self): - grouped_keys, grouped_values, offsets = self._groupby.groups(self.obj) - grouped_values = self.obj.__class__._from_data(*grouped_values) - grouped_values._copy_type_metadata(self.obj) + grouped_key_cols, grouped_value_cols, offsets = self._groupby.groups( + [*self.obj._index._columns, *self.obj._columns] + ) + grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols) + grouped_values = self.obj._from_columns_like_self( + grouped_value_cols, + column_names=self.obj._column_names, + index_names=self.obj._index_names, + ) group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) - def _normalize_aggs(self, aggs): + def _normalize_aggs( + self, aggs: MultiColumnAggType + ) -> Tuple[Iterable[Any], Tuple[ColumnBase, ...], List[List[AggType]]]: """ - Normalize aggs to a dict mapping column names - to a list of aggregations. + Normalize aggs to a list of list of aggregations, where `out[i]` + is a list of aggregations for column `self.obj[i]`. We support three + different form of `aggs` input here: + - A single agg, such as "sum". This agg is applied to all value + columns. + - A list of aggs, such as ["sum", "mean"]. All aggs are applied to all + value columns. + - A mapping of column name to aggs, such as + {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified + column. + Each agg can be string or lambda functions. """ - if not isinstance(aggs, collections.abc.Mapping): - # Make col_name->aggs mapping from aggs. 
- # Do not include named key columns - - # Can't do set arithmetic here as sets are - # not ordered - if isinstance(self, SeriesGroupBy): - columns = [self.obj.name] - else: - columns = [ - col_name - for col_name in self.obj._data - if col_name not in self.grouping._named_columns - ] - out = dict.fromkeys(columns, aggs) - else: - out = aggs.copy() - # Convert all values to list-like: - for col, agg in out.items(): - if not is_list_like(agg): - out[col] = [agg] - else: - out[col] = list(agg) - - return out + aggs_per_column: Iterable[Union[AggType, Iterable[AggType]]] + if isinstance(aggs, dict): + column_names, aggs_per_column = aggs.keys(), aggs.values() + columns = tuple(self.obj._data[col] for col in column_names) + else: + values = self.grouping.values + column_names = values._column_names + columns = values._columns + aggs_per_column = (aggs,) * len(columns) + + normalized_aggs = [ + list(agg) if is_list_like(agg) else [agg] + for agg in aggs_per_column + ] + return column_names, columns, normalized_aggs def pipe(self, func, *args, **kwargs): """ @@ -1201,29 +1208,20 @@ def diff(self, periods=1, axis=0): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - # grouped values - value_columns = self.grouping.values - _, (data, index), _ = self._groupby.groups( - cudf.core.frame.Frame(value_columns._data) - ) - grouped = self.obj.__class__._from_data(data, index) - grouped = self._mimic_pandas_order(grouped)._copy_type_metadata( - value_columns + values = self.obj.__class__._from_data( + self.grouping.values._data, self.obj.index ) - - result = grouped - self.shift(periods=periods) - return result._copy_type_metadata(value_columns) + return values - self.shift(periods=periods) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: """Internal implementation for `ffill` and `bfill`""" - value_columns = self.grouping.values - result = self.obj.__class__._from_data( - self._groupby.replace_nulls( - 
cudf.core.frame.Frame(value_columns._data), method - ) + values = self.grouping.values + result = self.obj._from_columns( + self._groupby.replace_nulls([*values._columns], method), + values._column_names, ) result = self._mimic_pandas_order(result) - return result._copy_type_metadata(value_columns) + return result._copy_type_metadata(values) def pad(self, limit=None): """Forward fill NA values. @@ -1334,17 +1332,12 @@ def fillna( ) return getattr(self, method, limit)() - value_columns = self.grouping.values - _, (data, index), _ = self._groupby.groups( - cudf.core.frame.Frame(value_columns._data) + values = self.obj.__class__._from_data( + self.grouping.values._data, self.obj.index ) - - grouped = self.obj.__class__._from_data(data, index) - result = grouped.fillna( + return values.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) - result = self._mimic_pandas_order(result) - return result._copy_type_metadata(value_columns) def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ @@ -1385,22 +1378,21 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - value_columns = self.grouping.values + values = self.grouping.values if is_list_like(fill_value): - if not len(fill_value) == len(value_columns._data): + if len(fill_value) != len(values._data): raise ValueError( "Mismatched number of columns and values to fill." 
) else: - fill_value = [fill_value] * len(value_columns._data) + fill_value = [fill_value] * len(values._data) - result = self.obj.__class__._from_data( - *self._groupby.shift( - cudf.core.frame.Frame(value_columns._data), periods, fill_value - ) + result = self.obj.__class__._from_columns( + self._groupby.shift([*values._columns], periods, fill_value)[0], + values._column_names, ) result = self._mimic_pandas_order(result) - return result._copy_type_metadata(value_columns) + return result._copy_type_metadata(values) def _mimic_pandas_order( self, result: DataFrameOrSeries @@ -1408,11 +1400,12 @@ def _mimic_pandas_order( """Given a groupby result from libcudf, reconstruct the row orders matching that of pandas. This also adds appropriate indices. """ - sorted_order_column = arange(0, result._data.nrows) - _, (order, _), _ = self._groupby.groups( - cudf.core.frame.Frame({"sorted_order_column": sorted_order_column}) + # TODO: copy metadata after this method is a common pattern, should + # merge in this method. + _, order_cols, _ = self._groupby.groups( + [arange(0, result._data.nrows)] ) - gather_map = order["sorted_order_column"].argsort() + gather_map = order_cols[0].argsort() result = result.take(gather_map) result.index = self.obj.index return result @@ -1502,6 +1495,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): Captive 210.0 """ + obj: "cudf.core.dataframe.DataFrame" + _PROTECTED_KEYS = frozenset(("obj",)) def __getitem__(self, key): @@ -1570,6 +1565,8 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ + obj: "cudf.core.series.Series" + def agg(self, func): result = super().agg(func) @@ -1667,7 +1664,7 @@ def keys(self): ) @property - def values(self): + def values(self) -> cudf.core.frame.Frame: """Return value columns as a frame. Note that in aggregation, value columns can be arbitrarily