From 0930f712e6651594e0e642114866961c9aedea25 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 9 Dec 2021 13:30:19 -0500 Subject: [PATCH] [HOTFIX] Fix indexing into a single row of a MultiIndex (#9870) * fix print outputs * Fix indexing into a single row of a MultiIndex * Don't compare dtypes with string * Better document * Only downcast dataframes * Document * Fix multiindex duplicate names. * Add test. * Fix cmake formatting. * Change FORMAT_FILE_URL Co-authored-by: Taurean Dyer <46935140+taureandyernv@users.noreply.github.com> Co-authored-by: Ashwin Srinath Co-authored-by: Vyas Ramasubramani --- ci/checks/style.sh | 2 +- cpp/cmake/thirdparty/get_dlpack.cmake | 3 +- cpp/cmake/thirdparty/get_jitify.cmake | 3 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 5 +- .../source/user_guide/10min-cudf-cupy.ipynb | 169 ++++++++++++------ python/cudf/cudf/core/multiindex.py | 28 ++- python/cudf/cudf/tests/test_multiindex.py | 25 +++ 7 files changed, 167 insertions(+), 68 deletions(-) diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 67e926a0768..13f7f0e6267 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . /opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/main/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/cpp/cmake/thirdparty/get_dlpack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake index aeffd64f371..252d50c7af8 100644 --- a/cpp/cmake/thirdparty/get_dlpack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -21,7 +21,8 @@ function(find_and_configure_dlpack VERSION) dlpack ${VERSION} GIT_REPOSITORY https://github.com/dmlc/dlpack.git GIT_TAG v${VERSION} - GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE OPTIONS "BUILD_MOCK OFF" ) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index 7c4526107a3..51bd41ea079 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -20,7 +20,8 @@ function(find_and_configure_jitify) jitify 2.0.0 GIT_REPOSITORY https://github.com/rapidsai/jitify.git GIT_TAG cudf_0.19 - GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR "${jitify_SOURCE_DIR}" diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 290c4f61e41..0917adcd764 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -17,8 +17,9 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx( - BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch - --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch --reject-file=- -p1 -N < + ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true ) set(LIBCUDACXX_INCLUDE_DIR diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb index 0985291f3c2..169eec07914 100644 --- a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb +++ b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb @@ -45,9 +45,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "44.1 µs ± 689 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "209 µs ± 2.77 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", - "208 µs ± 3.14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" + "158 µs ± 306 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", + "419 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py:3044: FutureWarning: The as_gpu_matrix method will be removed in a future cuDF release. Consider using `to_cupy` instead.\n", + " FutureWarning,\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "339 µs ± 282 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], @@ -117,9 +131,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "22.1 µs ± 518 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "58.3 µs ± 647 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "80.2 µs ± 647 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" + "45.4 µs ± 63.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", + "127 µs ± 351 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", + "135 µs ± 5.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], @@ -256,7 +270,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "13.1 ms ± 193 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "15.5 ms ± 7.55 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -510,7 +524,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4.9 ms ± 26.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "7.26 ms ± 3.32 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -530,7 +544,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "5.1 ms ± 23.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "4.87 ms ± 2.08 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -1139,135 +1153,135 @@ " 0\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 4.704433\n", " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.00000\n", " 0.0\n", - " 16.822959\n", " 0.0\n", " 0.000000\n", " 0.0\n", + " -1.162275\n", + " 0.000000\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " \n", " \n", " 1\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.000000\n", " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.00000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", - " 0.000000\n", + " 11.460403\n", " 0.0\n", + " 0.000000\n", + " 0.000000\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " \n", " \n", " 2\n", " 0.0\n", " 0.0\n", - " 6.618972\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.000000\n", " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.000000\n", - " 2.25678\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", + " 0.000000\n", + " 0.407392\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " \n", " \n", " 3\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.000000\n", " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.000000\n", - " 0.00000\n", " 0.0\n", - " 0.000000\n", " 0.0\n", - " 2.715802\n", + " 0.000000\n", " 0.0\n", + " 0.000000\n", + " 0.000000\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " \n", " \n", " 4\n", " 0.0\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.000000\n", " 0.0\n", + " 8.299425\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 4.296568\n", - " 0.00000\n", + " 2.096401\n", " 0.0\n", - " 0.000000\n", " 0.0\n", " 0.000000\n", " 0.0\n", + " 0.000000\n", + " 0.000000\n", " 0.0\n", " 0.0\n", - " 4.865495\n", " \n", " \n", "\n", "" ], "text/plain": [ - " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 \\\n", - "0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.00000 \n", - "1 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.00000 \n", - "2 0.0 0.0 6.618972 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 2.25678 \n", - "3 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.00000 \n", - "4 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.296568 0.00000 \n", + " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 \\\n", + "0 0.0 0.0 0.0 0.0 0.0 4.704433 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 8.299425 0.0 0.0 0.0 2.096401 \n", "\n", - " a12 a13 a14 a15 a16 a17 a18 a19 \n", - "0 0.0 16.822959 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", - "1 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", - "2 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 0.000000 \n", - "3 0.0 0.000000 0.0 2.715802 0.0 0.0 0.0 0.000000 \n", - "4 0.0 0.000000 0.0 0.000000 0.0 0.0 0.0 4.865495 " + " a12 a13 a14 a15 a16 a17 a18 a19 \n", + "0 0.0 0.0 0.000000 0.0 -1.162275 0.000000 0.0 0.0 \n", + "1 0.0 0.0 11.460403 0.0 0.000000 0.000000 0.0 0.0 \n", + "2 0.0 0.0 0.000000 0.0 0.000000 0.407392 0.0 0.0 \n", + "3 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 \n", + "4 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 " ] }, "execution_count": 20, @@ -1285,19 +1299,66 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " (41, 0)\t8.237732918475851\n", + " (49, 0)\t-4.161219849238402\n", + " (70, 0)\t-1.646588718395583\n", + " (80, 0)\t11.607048248828713\n", + " (81, 0)\t11.387095517746493\n", + " (105, 0)\t4.059008225609349\n", + " (107, 0)\t9.299030876304984\n", + " (108, 0)\t10.652087054434446\n", + " (127, 0)\t2.442578989241219\n", + " (133, 0)\t-0.7674141633646347\n", + " (135, 0)\t-6.091151515788713\n", + " (145, 0)\t2.968949150266586\n", + " (148, 0)\t5.649147779687932\n", + " (158, 0)\t7.7809955768930745\n", + " (166, 0)\t5.801884262747882\n", + " (175, 0)\t7.3205065025042\n", + " (181, 0)\t13.704683370645277\n", + " (204, 0)\t15.915619596241733\n", + " (207, 0)\t-0.2205888963107494\n", + " (209, 0)\t3.565578265020142\n", + " (215, 0)\t4.1493767841754154\n", + " (231, 0)\t3.4286524053271803\n", + " (233, 0)\t6.021200022977307\n", + " (241, 0)\t4.247163658236771\n", + " (249, 0)\t1.8502158424149273\n", + " :\t:\n", + " (9729, 19)\t7.226429647432215\n", + " (9762, 19)\t-0.6042314722021014\n", + " (9764, 19)\t-1.4827372788735615\n", + " (9769, 19)\t4.140245505599609\n", + " (9776, 19)\t-0.3441145182655059\n", + " (9781, 19)\t-0.235562982602191\n", + " (9782, 19)\t2.1458765970993223\n", + " (9791, 19)\t7.219427633840467\n", + " (9803, 19)\t6.6874487362355115\n", + " (9807, 19)\t5.1769501512294465\n", + " (9823, 19)\t-1.1040045399744103\n", + " (9828, 19)\t3.074156937033751\n", + " (9849, 19)\t0.4663962936122451\n", + " (9851, 19)\t10.302861735090476\n", + " (9862, 19)\t1.9377857550195872\n", + " (9893, 19)\t8.991541850619656\n", + " (9896, 19)\t-0.9003118390325282\n", + " (9919, 19)\t2.4984693551284587\n", + " (9934, 19)\t1.6161057487404191\n", + " (9944, 19)\t6.063387997554039\n", + " (9945, 19)\t11.038782286791717\n", + " (9954, 19)\t13.750186699958661\n", + " (9979, 19)\t0.9225731640357893\n", + " (9995, 19)\t-1.775155437069923\n", + " (9998, 19)\t12.265785237649636\n" + ] } ], "source": [ "sparse_data = cudf_to_cupy_sparse_matrix(df)\n", - "sparse_data" + "print(sparse_data)" ] }, { @@ -1326,7 +1387,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.7.12" } }, "nbformat": 4, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8c4f87d5f67..f21a8dcbae4 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -658,11 +658,11 @@ def _compute_levels_and_codes(self): def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() - for name, row in zip(index.names, row_tuple): + for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[name] = cudf.Series(row) - frame = index.to_frame(index=False) + lookup[i] = cudf.Series(row) + frame = cudf.DataFrame(dict(enumerate(index._data.columns))) data_table = cudf.concat( [ frame, @@ -729,16 +729,26 @@ def _index_and_downcast(self, result, index, index_key): for k in range(size, len(index._data)): out_index.insert( out_index._num_columns, - k if index.names is None else index.names[k], + k, cudf.Series._from_data({None: index._data.columns[k]}), ) - if len(result) == 1 and size == 0 and not slice_access: - # If the final result is one row and it was not mapped into - # directly, return a Series with a tuple as name. + # determine if we should downcast from a DataFrame to a Series + need_downcast = ( + isinstance(result, cudf.DataFrame) + and len(result) == 1 # only downcast if we have a single row + and not slice_access # never downcast if we sliced + and ( + size == 0 # index_key was an integer + # we indexed into a single row directly, using its label: + or len(index_key) == self.nlevels + ) + ) + if need_downcast: result = result.T - result = result[result._data.names[0]] - elif len(result) == 0 and not slice_access: + return result[result._data.names[0]] + + if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d409a099806..d2df14e19bc 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -830,6 +830,19 @@ def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): assert_eq(presult, gresult, check_index_type=False, check_dtype=False) +def test_multiindex_iloc_scalar(): + arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] + tuples = list(zip(*arrays)) + idx = cudf.MultiIndex.from_tuples(tuples) + gdf = cudf.DataFrame( + {"first": cp.random.rand(4), "second": cp.random.rand(4)} + ) + gdf.index = idx + + pdf = gdf.to_pandas() + assert_eq(pdf.iloc[3], gdf.iloc[3]) + + @pytest.mark.parametrize( "iloc_rows", [ @@ -1742,3 +1755,15 @@ def test_multiIndex_type_methods(pidx, func): assert_eq(False, actual) else: assert_eq(expected, actual) + + +def test_multiindex_index_single_row(): + arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] + tuples = list(zip(*arrays)) + idx = cudf.MultiIndex.from_tuples(tuples) + gdf = cudf.DataFrame( + {"first": cp.random.rand(4), "second": cp.random.rand(4)} + ) + gdf.index = idx + pdf = gdf.to_pandas() + assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)])