diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e05b152191..99708a27aa0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,7 +48,9 @@ - PR #5658 Add `filter_tokens` nvtext API - PR #5666 Add `filter_characters_of_type` strings API - PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build +- PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function - PR #5572 Add `cudf::encode` API. +- PR #5568 Add support for `Series.keys()` and `DataFrame.keys()` ## Improvements @@ -136,7 +138,12 @@ - PR #5702 Add inherited methods to python docs and other docs fixes - PR #5733 Add support for `size` property in `DataFrame`/ `Series` / `Index`/ `MultiIndex` - PR #5743 Reduce number of test cases in concatenate benchmark +- PR #5748 Disable `tolist` API in `Series` & `Index` and add `tolist` dispatch in `dask-cudf` +- PR #5756 Switch JNI code to use the RMM owning wrapper +- PR #5725 Integrate Gbenchmarks into CI - PR #5752 Add cuDF internals documentation (ColumnAccessor) +- PR #5759 Fix documentation describing JIT cache default location +- PR #5775 Update dask_cudf.read_parquet to align with upstream improvements ## Bug Fixes @@ -202,10 +209,13 @@ - PR #5692 Fix compilation issue with gcc 7.4.0 and CUDA 10.1 - PR #5693 Add fix missing from PR 5656 to update local docker image to py3.7 - PR #5703 Small fix for dataframe constructor with cuda array interface objects that don't have `descr` field +- PR #5727 Fix `Index.__repr__` to allow representation of null values - PR #5719 Fix Frame._concat() with categorical columns - PR #5736 Disable unsigned type in ORC writer benchmarks - PR #5745 Update JNI cast for inability to cast timestamp and integer types - PR #5750 Add RMM_ROOT/include to the spdlog search path in JNI build +- PR #5763 Update Java slf4j version to match Spark 3.0 +- PR #5766 Fix issue related to `iloc` and slicing a `DataFrame` - PR #5319 Disallow SUM and specialize MEAN of timestamp types diff 
--git a/ci/benchmark/build.sh b/ci/benchmark/build.sh new file mode 100644 index 00000000000..40167ce259e --- /dev/null +++ b/ci/benchmark/build.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. +######################################### +# cuDF GPU build and test script for CI # +######################################### +set -e +NUMARGS=$# +ARGS=$* + +# Logger function for build status output +function logger() { + echo -e "\n>>>> $@\n" +} + +# Arg parsing function +function hasArg { + (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") +} + +# Set path and build parallel level +export PATH=/conda/bin:/usr/local/cuda/bin:$PATH +export PARALLEL_LEVEL=4 +export CUDA_REL=${CUDA_VERSION%.*} +export HOME=$WORKSPACE + +# Parse git describe +cd $WORKSPACE +export GIT_DESCRIBE_TAG=`git describe --tags` +export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +# Set Benchmark Vars +export ASVRESULTS_DIR=${WORKSPACE}/ci/artifacts/asv/results +export GBENCH_BENCHMARKS_DIR=${WORKSPACE}/cpp/build/gbenchmarks/ + +# Ensure ASV results directory exists +mkdir -p ${ASVRESULTS_DIR} + +# Set `LIBCUDF_KERNEL_CACHE_PATH` environment variable to $HOME/.jitify-cache because +# it's local to the container's virtual file system, and not shared with other CI jobs +# like `/tmp` is. +export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" + +function remove_libcudf_kernel_cache_dir { + EXITCODE=$? 
+ logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" + rm -rf "$LIBCUDF_KERNEL_CACHE_PATH" || logger "could not rm -rf $LIBCUDF_KERNEL_CACHE_PATH" + exit $EXITCODE +} + +trap remove_libcudf_kernel_cache_dir EXIT + +mkdir -p "$LIBCUDF_KERNEL_CACHE_PATH" || logger "could not mkdir -p $LIBCUDF_KERNEL_CACHE_PATH" + +################################################################################ +# SETUP - Check environment +################################################################################ + +logger "Check environment..." +env + +logger "Check GPU usage..." +nvidia-smi + +logger "Activate conda env..." +source activate rapids + +# Enter dependencies to be shown in ASV tooltips. +CUDF_DEPS=(librmm) +LIBCUDF_DEPS=(librmm) + +conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ + "rapids-build-env=$MINOR_VERSION.*" \ + "rapids-notebook-env=$MINOR_VERSION.*" \ + rapids-pytest-benchmark + +# https://docs.rapids.ai/maintainers/depmgmt/ +# conda remove -f rapids-build-env rapids-notebook-env +# conda install "your-pkg=1.0.0" + +# Install the master version of dask, distributed, and streamz +logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps" +pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps +logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps +logger "pip install git+https://github.com/python-streamz/streamz.git --upgrade --no-deps" +pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps + +logger "Check versions..." 
+python --version +$CC --version +$CXX --version +conda list + +################################################################################ +# BUILD - Build libcudf, cuDF and dask_cudf from source +################################################################################ + +logger "Build libcudf..." +if [[ ${BUILD_MODE} == "pull-request" ]]; then + $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests --ptds +else + $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests -l --ptds +fi + +################################################################################ +# BENCHMARK - Run and parse libcudf and cuDF benchmarks +################################################################################ + +logger "Running benchmarks..." + +#Download GBench results Parser +curl -L https://raw.githubusercontent.com/rapidsai/benchmark/main/parser/GBenchToASV.py --output GBenchToASV.py + +### +# Generate Metadata for dependencies +### + +# Concatenate dependency arrays, convert to JSON array, +# and remove duplicates. +X=("${CUDF_DEPS[@]}" "${LIBCUDF_DEPS[@]}") +DEPS=$(printf '%s\n' "${X[@]}" | jq -R . | jq -s 'unique') + +# Build object with k/v pairs of "dependency:version" +DEP_VER_DICT=$(jq -n '{}') +for DEP in $(echo "${DEPS}" | jq -r '.[]'); do + VER=$(conda list | grep "^${DEP}" | awk '{print $2"-"$3}') + DEP_VER_DICT=$(echo "${DEP_VER_DICT}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. + { ($DEP): $VER }') +done + +# Pass in an array of dependencies to get a dict of "dependency:version" +function getReqs() { + local DEPS_ARR=("$@") + local REQS="{}" + for DEP in "${DEPS_ARR[@]}"; do + VER=$(echo "${DEP_VER_DICT}" | jq -r --arg DEP "${DEP}" '.[$DEP]') + REQS=$(echo "${REQS}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. 
+ { ($DEP): $VER }') + done + + echo "${REQS}" +} + +### +# Run LIBCUDF Benchmarks +### + +REQS=$(getReqs "${LIBCUDF_DEPS[@]}") + +mkdir -p ${WORKSPACE}/tmp/benchmark +touch ${WORKSPACE}/tmp/benchmark/benchmarks.txt +ls ${GBENCH_BENCHMARKS_DIR} > ${WORKSPACE}/tmp/benchmark/benchmarks.txt + +#Disable error aborting while tests run, failed tests will not generate data +logger "Running libcudf GBenchmarks..." +cd ${GBENCH_BENCHMARKS_DIR} +set +e +while read BENCH; +do + nvidia-smi + ./${BENCH} --benchmark_out=${BENCH}.json --benchmark_out_format=json + EXITCODE=$? + if [[ ${EXITCODE} != 0 ]]; then + rm ./${BENCH}.json + JOBEXITCODE=1 + fi +done < ${WORKSPACE}/tmp/benchmark/benchmarks.txt +set -e + +rm ${WORKSPACE}/tmp/benchmark/benchmarks.txt +cd ${WORKSPACE} +mv ${GBENCH_BENCHMARKS_DIR}/*.json ${WORKSPACE}/tmp/benchmark/ +python GBenchToASV.py -d ${WORKSPACE}/tmp/benchmark/ -t ${ASVRESULTS_DIR} -n libcudf -b branch-${MINOR_VERSION} -r "${REQS}" + +### +# Run Python Benchmarks +### + +#REQS=$(getReqs "${CUDF_DEPS[@]}") + +#BENCHMARK_META=$(jq -n \ +# --arg NODE "${NODE_NAME}" \ +# --arg BRANCH "branch-${MINOR_VERSION}" \ +# --argjson REQS "${REQS}" ' +# { +# "machineName": $NODE, +# "commitBranch": $BRANCH, +# "requirements": $REQS +# } +#') + +#echo "Benchmark meta:" +#echo "${BENCHMARK_META}" | jq "." diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index cd1f0b3f3b9..35aebab1765 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -28,9 +28,7 @@ requirements: - cudf {{ version }} - dask >=2.15.0 - distributed >=2.15.0 -test: - imports: - - dask_cudf + about: home: http://rapids.ai/ diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh new file mode 100644 index 00000000000..69efd2e6f58 --- /dev/null +++ b/conda/recipes/dask-cudf/run_test.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. 
+ +set -e + +# Logger function for build status output +function logger() { + echo -e "\n>>>> $@\n" +} + +# Install the master version of dask and distributed +logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps" +pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps + +logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps + +logger "python -c 'import dask_cudf'" +python -c "import dask_cudf" diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 329a0073985..a8fbe8851f8 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -51,5 +51,54 @@ std::unique_ptr normalize_spaces( cudf::strings_column_view const& strings, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Normalizes strings characters for tokenizing. + * + * This uses the normalizer that is built into the nvtext::subword_tokenize function + * which includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process here adds a single space before and after the character. + * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes the accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. 
+ * However, if the accented character is already lower-case, then only the + * accent is removed. + * + * @code{.pseudo} + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,true) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * s2 = normalize_characters(s,false) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. + * + * This function requires 8x the number of bytes in the input strings + * column as working memory. + * + * @param strings The input strings to normalize. + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param mr Memory resource to allocate any returned objects. + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& strings, + bool do_lower_case, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of group } // namespace nvtext diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 59a1c25fd94..8f7bdb10cf3 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -24,6 +24,11 @@ namespace nvtext { +/** + * @addtogroup nvtext_tokenize + * @{ + */ + /** * @brief The vocabulary data for use with the subword_tokenize function. 
*/ @@ -171,4 +176,5 @@ tokenizer_result subword_tokenize( uint32_t max_rows_tensor, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/jit/cache.h b/cpp/src/jit/cache.h index 666a8dbbbe9..e2af78cd817 100644 --- a/cpp/src/jit/cache.h +++ b/cpp/src/jit/cache.h @@ -40,7 +40,9 @@ using named_prog = std::pair>; * This function returns a path to the cache directory, creating it if it * doesn't exist. * - * The default cache directory `$TEMPDIR/cudf_$CUDF_VERSION`. + * The default cache directory is `$HOME/.cudf/$CUDF_VERSION`. If no overrides + * are used and if $HOME is not defined, returns an empty path and file + * caching is not used. **/ boost::filesystem::path getCacheDir(); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 385cf3247d5..ef6f55606da 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,9 +26,12 @@ #include #include +#include #include #include +#include +#include namespace nvtext { namespace detail { @@ -72,6 +76,76 @@ struct normalize_spaces_fn { } }; +// code-point to multi-byte range limits +constexpr uint32_t UTF8_1BYTE = 0x0080; +constexpr uint32_t UTF8_2BYTE = 0x0800; +constexpr uint32_t UTF8_3BYTE = 0x010000; + +/** + * @brief Convert code-point arrays into UTF-8 bytes for each string. + */ +struct codepoint_to_utf8_fn { + cudf::column_device_view const d_strings; // input strings + uint32_t const* cp_data; // full code-point array + int32_t const* d_cp_offsets{}; // offsets to each string's code-point array + int32_t const* d_offsets{}; // offsets for the output strings + char* d_chars{}; // buffer for the output strings column + + /** + * @brief Return the number of bytes for the output string given its code-point array. 
+ * + * @param str_cps code-points for the string + * @param count number of code-points in `str_cps` + * @return Number of bytes required for the output + */ + __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) + { + return thrust::transform_reduce( + thrust::seq, + str_cps, + str_cps + count, + [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, + 0, + thrust::plus()); + } + + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (d_strings.is_null(idx)) return 0; + auto const d_str = d_strings.element(idx); + auto const offset = d_cp_offsets[idx]; + auto const count = d_cp_offsets[idx + 1] - offset; // number of code-points + auto str_cps = cp_data + offset; // code-points for this string + if (!d_chars) return compute_output_size(str_cps, count); + // convert each code-point to 1-4 UTF-8 encoded bytes + char* out_ptr = d_chars + d_offsets[idx]; + for (uint32_t jdx = 0; jdx < count; ++jdx) { + uint32_t code_point = *str_cps++; + if (code_point < UTF8_1BYTE) // ASCII range + *out_ptr++ = static_cast(code_point); + else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 + // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy + *out_ptr++ = static_cast((((code_point << 2) & 0x001F00) | 0x00C000) >> 8); + *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); + } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 + // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy + *out_ptr++ = static_cast((((code_point << 4) & 0x0F0000) | 0x00E00000) >> 16); + *out_ptr++ = static_cast((((code_point << 2) & 0x003F00) | 0x008000) >> 8); + *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); + } else { // create four-byte UTF-8 + // maximum code-point value is 0x00110000 + // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz + *out_ptr++ = + static_cast((((code_point << 6) & 0x07000000) | unsigned{0xF0000000}) >> 24); + *out_ptr++ = 
static_cast((((code_point << 4) & 0x003F0000) | 0x00800000) >> 16); + *out_ptr++ = static_cast((((code_point << 2) & 0x003F00) | 0x008000) >> 8); + *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); + } + } + return 0; + } +}; + } // namespace // details API @@ -119,6 +193,69 @@ std::unique_ptr normalize_spaces( mr); } +/** + * @copydoc nvtext::normalize_characters + */ +std::unique_ptr normalize_characters(cudf::strings_column_view const& strings, + bool do_lower_case, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = strings.size(); + if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + + // create the normalizer and call it + data_normalizer normalizer(strings_count, strings.chars_size(), stream, do_lower_case); + auto result = [&strings, &normalizer, stream] { + auto const offsets = strings.offsets(); + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; + return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); + }(); + + CUDF_EXPECTS(result.first.length <= std::numeric_limits::max(), + "output too large for strings column"); + + // convert the result into a strings column + // - the cp_chars are the new 4-byte code-point values for all the characters in the output + // - the cp_offsets identify which code-points go with which strings + uint32_t const* cp_chars = result.first.gpu_ptr; + int32_t const* cp_offsets = reinterpret_cast(result.second.gpu_ptr); + auto strings_column = cudf::column_device_view::create(strings.parent(), stream); + + // build the output offsets column: compute the output size of each string + auto offsets_transformer_itr = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets}); + auto offsets_column = 
cudf::strings::detail::make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + auto d_offsets = offsets_column->view().data(); + + // create the output chars column + cudf::size_type output_bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = cudf::strings::detail::create_chars_child_column( + strings_count, strings.null_count(), output_bytes, mr, stream); + auto d_chars = chars_column->mutable_view().data(); + + // build the chars output data: convert the 4-byte code-point values into UTF-8 chars + thrust::for_each_n( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + strings_count, + codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); + chars_column->set_null_count(0); // reset null count for child column + + return cudf::make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + strings.null_count(), + copy_bitmask(strings.parent(), stream, mr), + stream, + mr); +} + } // namespace detail // external APIs @@ -130,4 +267,15 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& return detail::normalize_spaces(strings, mr); } +/** + * @copydoc nvtext::normalize_characters + */ +std::unique_ptr normalize_characters(cudf::strings_column_view const& strings, + bool do_lower_case, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(strings, do_lower_case, 0, mr); +} + } // namespace nvtext diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 7aa0116186d..2dc95013918 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -29,7 +29,7 @@ struct TextNormalizeTest : public cudf::test::BaseFixture { }; -TEST_F(TextNormalizeTest, Normalize) +TEST_F(TextNormalizeTest, NormalizeSpaces) { std::vector h_strings{"the\t fox jumped over the dog", "the 
dog\f chased the cat\r", @@ -67,7 +67,59 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto const results = nvtext::normalize_spaces(strings_view); + auto results = nvtext::normalize_spaces(strings_view); + EXPECT_EQ(results->size(), 0); + results = nvtext::normalize_characters(strings_view, true); EXPECT_EQ(results->size(), 0); - EXPECT_EQ(results->has_nulls(), false); + results = nvtext::normalize_characters(strings_view, false); + EXPECT_EQ(results->size(), 0); +} + +TEST_F(TextNormalizeTest, NormalizeCharacters) +{ + // These include punctuation, accents, whitespace, and CJK characters + std::vector h_strings{"abc£def", + nullptr, + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); + cudf::strings_column_view strings_view(strings); + { + auto results = nvtext::normalize_characters(strings_view, true); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + validity); + cudf::test::expect_columns_equal(*results, expected); + } + { + auto results = nvtext::normalize_characters(cudf::strings_column_view(strings), false); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + validity); + cudf::test::expect_columns_equal(*results, expected); + } } diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst index a3d40e874c1..65e2cc120de 100644 --- a/docs/cudf/source/api.rst +++ b/docs/cudf/source/api.rst @@ -17,7 +17,7 @@ Series .. 
autoclass:: Series :members: :inherited-members: - :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict + :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list Strings ------- @@ -39,7 +39,7 @@ Index .. autoclass:: Index :members: :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list RangeIndex ---------- @@ -47,7 +47,7 @@ RangeIndex .. autoclass:: RangeIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list GenericIndex ------------ @@ -55,7 +55,7 @@ GenericIndex .. autoclass:: GenericIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list MultiIndex ---------- @@ -63,7 +63,7 @@ MultiIndex .. autoclass:: MultiIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Int8Index --------- @@ -71,7 +71,7 @@ Int8Index .. 
autoclass:: Int8Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Int16Index ---------- @@ -79,7 +79,7 @@ Int16Index .. autoclass:: Int16Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Int32Index ---------- @@ -87,7 +87,7 @@ Int32Index .. autoclass:: Int32Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Int64Index ---------- @@ -95,7 +95,7 @@ Int64Index .. autoclass:: Int64Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list UInt8Index ---------- @@ -103,7 +103,7 @@ UInt8Index .. autoclass:: UInt8Index :inherited-members: :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list UInt16Index ----------- @@ -111,7 +111,7 @@ UInt16Index .. 
autoclass:: UInt16Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list UInt32Index ----------- @@ -119,7 +119,7 @@ UInt32Index .. autoclass:: UInt32Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list UInt64Index ----------- @@ -127,7 +127,7 @@ UInt64Index .. autoclass:: UInt64Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Float32Index ------------ @@ -135,7 +135,7 @@ Float32Index .. autoclass:: Float32Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Float64Index ------------ @@ -143,7 +143,7 @@ Float64Index .. autoclass:: Float64Index :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list CategoricalIndex ---------------- @@ -151,7 +151,7 @@ CategoricalIndex .. 
autoclass:: CategoricalIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list StringIndex ----------- @@ -159,7 +159,7 @@ StringIndex .. autoclass:: StringIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list DatetimeIndex ------------- @@ -167,7 +167,7 @@ DatetimeIndex .. autoclass:: DatetimeIndex :members: :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize + :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list Categories ---------- diff --git a/java/pom.xml b/java/pom.xml index b33406807b1..c1da507f684 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -105,7 +105,7 @@ org.slf4j slf4j-api - 1.7.9 + ${slf4j.version} compile @@ -123,7 +123,7 @@ org.slf4j slf4j-simple - 1.7.9 + ${slf4j.version} test @@ -148,6 +148,7 @@ OFF OFF ${project.build.directory}/cmake-build + 1.7.30 diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d1c426fe14d..a317287735d 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -343,7 +344,7 @@ void set_java_device_memory_resource(JNIEnv *env, jobject handler_obj, jlongArra // Need to keep both separate so we can shut them down appropriately std::unique_ptr> Logging_memory_resource{}; -std::unique_ptr Initialized_resource{}; +std::shared_ptr Initialized_resource{}; } // anonymous 
namespace extern "C" { @@ -363,29 +364,23 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j bool use_managed_mem = allocation_mode & 2; if (use_pool_alloc) { if (use_managed_mem) { - using managed_mr = rmm::mr::managed_memory_resource; - using managed_pool = rmm::mr::pool_memory_resource; - auto tmp = new managed_pool(new managed_mr(), pool_size, pool_size); - Initialized_resource.reset(tmp); - auto wrapped = make_tracking_adaptor(tmp, RMM_ALLOC_SIZE_ALIGNMENT); + Initialized_resource = rmm::mr::make_owning_wrapper( + std::make_shared(), pool_size, pool_size); + auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); Tracking_memory_resource.reset(wrapped); } else { - using cuda_mr = rmm::mr::cuda_memory_resource; - using cuda_pool = rmm::mr::pool_memory_resource; - auto tmp = new cuda_pool(new cuda_mr(), pool_size, pool_size); - Initialized_resource.reset(tmp); - auto wrapped = make_tracking_adaptor(tmp, RMM_ALLOC_SIZE_ALIGNMENT); + Initialized_resource = rmm::mr::make_owning_wrapper( + std::make_shared(), pool_size, pool_size); + auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); Tracking_memory_resource.reset(wrapped); } } else if (use_managed_mem) { - auto tmp = new rmm::mr::managed_memory_resource(); - Initialized_resource.reset(tmp); - auto wrapped = make_tracking_adaptor(tmp, RMM_ALLOC_SIZE_ALIGNMENT); + Initialized_resource = std::make_shared(); + auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); Tracking_memory_resource.reset(wrapped); } else { - auto tmp = new rmm::mr::cuda_memory_resource(); - Initialized_resource.reset(tmp); - auto wrapped = make_tracking_adaptor(tmp, RMM_ALLOC_SIZE_ALIGNMENT); + Initialized_resource = std::make_shared(); + auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); Tracking_memory_resource.reset(wrapped); } auto resource = 
Tracking_memory_resource.get(); @@ -439,7 +434,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_shutdownInternal(JNIEnv *env, jcl // we just reset the base adaptor so the others will not be called any more // and then clean them up in really any order. There should be no interaction with // RMM during this time anyways. - Initialized_resource.reset(new rmm::mr::cuda_memory_resource()); + Initialized_resource = std::make_shared(); rmm::mr::set_default_resource(Initialized_resource.get()); Logging_memory_resource.reset(nullptr); Tracking_memory_resource.reset(nullptr); diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 61c8573815b..d953f517e4a 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -8,7 +8,7 @@ import rmm -from cudf import core, datasets +from cudf import core, datasets, testing from cudf._version import get_versions from cudf.core import ( CategoricalIndex, @@ -62,10 +62,9 @@ read_orc, read_parquet, ) +from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import set_allocator -from cudf import testing - cuda.set_memory_manager(rmm.RMMNumbaManager) cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd index a6a892edb0d..7d8ec891692 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column @@ -10,3 +11,8 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] normalize_spaces( const column_view & strings ) except + + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + bool do_lower_case + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 67a94a32b5b..56c5bac9589 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -1,11 +1,13 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.move cimport move from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, normalize_spaces as cpp_normalize_spaces ) from cudf._lib.column cimport Column @@ -19,3 +21,13 @@ def normalize_spaces(Column strings): c_result = move(cpp_normalize_spaces(c_strings)) return Column.from_unique_ptr(move(c_result)) + + +def normalize_characters(Column strings, bool do_lower=True): + cdef column_view c_strings = strings.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_normalize_characters(c_strings, do_lower)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index edbadf439ad..5b27047b336 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1088,7 +1088,7 @@ def fillna(self, fill_value): codes=column.as_column(result.base_data, dtype=result.dtype), offset=result.offset, size=result.size, - mask=None, + mask=result.base_mask, ordered=self.dtype.ordered, ) diff --git 
a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2c79dd8a1b4..43585174834 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -210,13 +210,6 @@ def fillna(self, fill_value): fill_value = column.as_column(fill_value, nan_as_null=False) result = libcudf.replace.replace_nulls(self, fill_value) - result = column.build_column( - result.base_data, - result.dtype, - mask=None, - offset=result.offset, - size=result.size, - ) return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1611ba438fb..d36f24cef81 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -308,13 +308,6 @@ def fillna(self, fill_value): else: fill_value = fill_value.astype(self.dtype) result = libcudf.replace.replace_nulls(self, fill_value) - result = column.build_column( - result.base_data, - result.dtype, - mask=None, - offset=result.offset, - size=result.size, - ) return result diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 944b64feeb7..a82963ef37e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -18,7 +18,10 @@ from cudf._lib.nvtext.ngrams_tokenize import ( ngrams_tokenize as cpp_ngrams_tokenize, ) -from cudf._lib.nvtext.normalize import normalize_spaces as cpp_normalize_spaces +from cudf._lib.nvtext.normalize import ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) from cudf._lib.nvtext.replace import ( filter_tokens as cpp_filter_tokens, replace_tokens as cpp_replace_tokens, @@ -3616,6 +3619,55 @@ def normalize_spaces(self, **kwargs): cpp_normalize_spaces(self._column), **kwargs ) + def normalize_characters(self, do_lower=True, **kwargs): + """ + Normalizes strings characters for tokenizing. 
+ + This uses the normalizer that is built into the + subword_tokenize function which includes: + + - adding padding around punctuation (unicode category starts with + "P") as well as certain ASCII symbols like "^" and "$" + - adding padding around the CJK Unicode block characters + - changing whitespace (e.g. ``\\t``, ``\\n``, ``\\r``) to space + - removing control characters (unicode categories "Cc" and "Cf") + + If `do_lower = True`, lower-casing also removes the accents. + The accents cannot be removed from upper-case characters without + lower-casing and lower-casing cannot be performed without also + removing accents. However, if the accented character is already + lower-case, then only the accent is removed. + + Parameters + ---------- + do_lower : bool, Default is True + If set to True, characters will be lower-cased and accents + will be removed. If False, accented and upper-case characters + are not transformed. + + Returns + ------- + Series or Index of object. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series(["héllo, \\tworld","ĂĆCĖÑTED","$99"]) + >>> ser.str.normalize_characters() + 0 hello , world + 1 accented + 2 $ 99 + dtype: object + >>> ser.str.normalize_characters(do_lower=False) + 0 héllo , world + 1 ĂĆCĖÑTED + 2 $ 99 + dtype: object + """ + return self._return_or_inplace( + cpp_normalize_characters(self._column, do_lower), **kwargs + ) + def tokenize(self, delimiter=" ", **kwargs): """ Each string is split into tokens using the provided delimiter(s). 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ac12ddb6ee6..1892022f38c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -17,7 +17,9 @@ import pandas as pd import pyarrow as pa from numba import cuda +from pandas._config import get_option from pandas.api.types import is_dict_like +from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing import cudf @@ -39,7 +41,6 @@ from cudf.utils.dtypes import ( cudf_dtype_from_pydata_dtype, is_categorical_dtype, - is_datetime_dtype, is_list_like, is_scalar, is_string_dtype, @@ -1119,14 +1120,30 @@ def _repr_pandas025_formatting(self, ncols, nrows, dtype=None): def _clean_renderable_dataframe(self, output): """ - the below is permissible: null in a datetime to_pandas() becomes - NaT, which is then replaced with null in this processing step. - It is not possible to have a mix of nulls and NaTs in datetime - columns because we do not support NaT - pyarrow as_column - preprocessing converts NaT input values from numpy or pandas into - null. 
+ This method takes in partial/preprocessed dataframe + and returns correct representation of it with correct + dimensions (rows x columns) """ - output = output.to_pandas().__repr__().replace(" NaT", "null") + + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + max_colwidth = get_option("display.max_colwidth") + show_dimensions = get_option("display.show_dimensions") + if get_option("display.expand_frame_repr"): + width, _ = console.get_console_size() + else: + width = None + + output = output.to_pandas().to_string( + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + line_width=width, + max_colwidth=max_colwidth, + show_dimensions=show_dimensions, + ) + lines = output.split("\n") if lines[-1].startswith("["): @@ -1136,6 +1153,23 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) + def _clean_nulls_from_dataframe(self, df): + """ + This function converts all ``null`` values to ```` for + representation as a string in `__repr__`. + + Since we utilize Pandas `__repr__` at all places in our code + for formatting purposes, we convert columns to `str` dtype for + filling with `` values. + """ + for col in df._data: + if self._data[col].has_nulls: + df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) + else: + df[col] = df._data[col] + + return df + def _get_renderable_dataframe(self): """ takes rows and columns from pandas settings or estimation from size. @@ -1143,7 +1177,8 @@ def _get_renderable_dataframe(self): multiindex as well producing an efficient representative string for printing with the dataframe. 
""" - nrows = np.max([pd.options.display.max_rows, 1]) + max_rows = pd.options.display.max_rows + nrows = np.max([len(self) if max_rows is None else max_rows, 1]) if pd.options.display.max_rows == 0: nrows = len(self) ncols = ( @@ -1154,6 +1189,24 @@ def _get_renderable_dataframe(self): if len(self) <= nrows and len(self._data.names) <= ncols: output = self.copy(deep=False) + elif self.empty and len(self.index) > 0: + max_seq_items = pd.options.display.max_seq_items + # Incase of Empty DataFrame with index, Pandas prints + # first `pd.options.display.max_seq_items` index values + # followed by ... To obtain ... at the end of index list, + # adding 1 extra value. + # If `pd.options.display.max_seq_items` is None, + # entire sequence/Index is to be printed. + # Note : Pandas truncates the dimensions at the end of + # the resulting dataframe when `display.show_dimensions` + # is set to truncate. Hence to display the dimentions we + # need to extract maximum of `max_seq_items` and `nrows` + # and have 1 extra value for ... to show up in the output + # string. 
+ if max_seq_items is not None: + output = self.head(max(max_seq_items, nrows) + 1) + else: + output = self.copy(deep=False) else: left_cols = len(self._data.names) right_cols = 0 @@ -1193,15 +1246,8 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - for col in output._data: - if ( - self._data[col].has_nulls - and not self._data[col].dtype == "O" - and not is_datetime_dtype(self._data[col].dtype) - ): - output[col] = output._data[col].astype("str").fillna("null") - else: - output[col] = output._data[col] + output = self._clean_nulls_from_dataframe(output) + output._index = output._index._clean_nulls_from_index() return output @@ -2758,7 +2804,7 @@ def copy(self, deep=True): def __copy__(self): return self.copy(deep=True) - def __deepcopy__(self, memo={}): + def __deepcopy__(self, memo=None): """ Parameters ---------- @@ -4374,7 +4420,7 @@ def _verbose_repr(): column_head, space ) if show_counts: - counts = self.count().tolist() + counts = self.count().to_pandas().tolist() if len(cols) != len(counts): raise AssertionError( f"Columns must equal " @@ -4481,57 +4527,6 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) - def fillna(self, value, method=None, axis=None, inplace=False, limit=None): - """Fill null values with ``value``. - - Parameters - ---------- - value : scalar, Series-like or dict - Value to use to fill nulls. If Series-like, null values - are filled with values in corresponding indices. - A dict can be used to provide different values to fill nulls - in different columns. - - Returns - ------- - result : DataFrame - Copy with nulls filled. 
- - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({'a': [1, 2, None], 'b': [3, None, 5]}) - >>> gdf.fillna(4).to_pandas() - a b - 0 1 3 - 1 2 4 - 2 4 5 - >>> gdf.fillna({'a': 3, 'b': 4}).to_pandas() - a b - 0 1 3 - 1 2 4 - 2 3 5 - """ - if inplace: - outdf = {} # this dict will just hold Nones - else: - outdf = self.copy() - - if not is_dict_like(value): - value = dict.fromkeys(self.columns, value) - - for k in value: - outdf[k] = self[k].fillna( - value[k], - method=method, - axis=axis, - inplace=inplace, - limit=limit, - ) - - if not inplace: - return outdf - def describe(self, percentiles=None, include=None, exclude=None): """Compute summary statistics of a DataFrame's columns. For numeric data, the output includes the minimum, maximum, mean, median, @@ -6527,11 +6522,42 @@ def corr(self): def to_dict(self, orient="dict", into=dict): raise TypeError( - "Implicit conversion to a host memory via to_dict() is not " - "allowed, To explicitly construct a dictionary object, " - "consider using .to_pandas().to_dict()" + "cuDF does not support conversion to host memory " + "via `to_dict()` method. Consider using " + "`.to_pandas().to_dict()` to construct a Python dictionary." ) + def keys(self): + """ + Get the columns. + This is index for Series, columns for DataFrame. + + Returns + ------- + Index + Columns of DataFrame. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'one' : [1, 2, 3], 'five' : ['a', 'b', 'c']}) + >>> df + one five + 0 1 a + 1 2 b + 2 3 c + >>> df.keys() + Index(['one', 'five'], dtype='object') + >>> df = cudf.DataFrame(columns=[0, 1, 2, 3]) + >>> df + Empty DataFrame + Columns: [0, 1, 2, 3] + Index: [] + >>> df.keys() + Int64Index([0, 1, 2, 3], dtype='int64') + """ + return self.columns + def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c6e1bcd622b..f40527c52e2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,17 +1,19 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +import copy import functools import warnings -from collections import OrderedDict +from collections import OrderedDict, abc as abc import cupy import numpy as np import pandas as pd -from pandas.api.types import is_dtype_equal +from pandas.api.types import is_dict_like, is_dtype_equal import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate from cudf.core.column import as_column, build_categorical_column +from cudf.utils import utils from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, @@ -545,6 +547,10 @@ def _slice(self, arg): # it from materializing unnecessarily keep_index = True if self.index is not None and isinstance(self.index, RangeIndex): + if self._num_columns == 0: + result = self._empty_like(keep_index) + result._index = self.index[start:stop] + return result keep_index = False if start < 0: @@ -552,9 +558,7 @@ def _slice(self, arg): if stop < 0: stop = stop + num_rows - if (start > stop and (stride is None or stride == 1)) or ( - len(self._data) == 0 and keep_index is False - ): + if start > stop and (stride is None or stride == 1): return self._empty_like(keep_index) else: start = len(self) if start > num_rows else start @@ -1233,6 +1237,101 @@ def dropna( return 
self._mimic_inplace(result, inplace=inplace) + def fillna(self, value, method=None, axis=None, inplace=False, limit=None): + """Fill null values with ``value``. + + Parameters + ---------- + value : scalar, Series-like or dict + Value to use to fill nulls. If Series-like, null values + are filled with values in corresponding indices. + A dict can be used to provide different values to fill nulls + in different columns. + + Returns + ------- + result : DataFrame + Copy with nulls filled. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, None], 'b': [3, None, 5]}) + >>> df + a b + 0 1 3 + 1 2 null + 2 null 5 + >>> df.fillna(4) + a b + 0 1 3 + 1 2 4 + 2 4 5 + >>> df.fillna({'a': 3, 'b': 4}) + a b + 0 1 3 + 1 2 4 + 2 3 5 + + ``fillna`` on a Series object: + + >>> ser = cudf.Series(['a', 'b', None, 'c']) + >>> ser + 0 a + 1 b + 2 None + 3 c + dtype: object + >>> ser.fillna('z') + 0 a + 1 b + 2 z + 3 c + dtype: object + + ``fillna`` can also supports inplace operation: + + >>> ser.fillna('z', inplace=True) + >>> ser + 0 a + 1 b + 2 z + 3 c + dtype: object + >>> df.fillna({'a': 3, 'b': 4}, inplace=True) + >>> df + a b + 0 1 3 + 1 2 4 + 2 3 5 + """ + if method is not None: + raise NotImplementedError("The method keyword is not supported") + if limit is not None: + raise NotImplementedError("The limit keyword is not supported") + if axis: + raise NotImplementedError("The axis keyword is not supported") + + if isinstance(value, cudf.Series): + value = value.reindex(self._data.names) + elif isinstance(value, cudf.DataFrame): + if not self.index.equals(value.index): + value = value.reindex(self.index) + else: + value = value + elif not isinstance(value, abc.Mapping): + value = {name: copy.deepcopy(value) for name in self._data.names} + + copy_data = self._data.copy(deep=True) + + for name, col in copy_data.items(): + if name in value and value[name] is not None: + copy_data[name] = copy_data[name].fillna(value[name],) + + result = 
self._from_table(Frame(copy_data, self._index)) + + return self._mimic_inplace(result, inplace=inplace) + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drops null rows from `self`. @@ -2772,13 +2871,13 @@ def _is_sorted(self, ascending=None, null_position=None): def _get_replacement_values(to_replace, replacement, col_name, column): - from pandas.api.types import is_dict_like - - from cudf.utils import utils all_nan = False - - if is_dict_like(to_replace) and replacement is None: + if ( + is_dict_like(to_replace) + and not isinstance(to_replace, cudf.Series) + and replacement is None + ): replacement = list(to_replace.values()) to_replace = list(to_replace.keys()) elif not is_scalar(to_replace): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0914908a8cc..843d6fd3eaa 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -347,6 +347,56 @@ def dropna(self, how="any"): """ return super().dropna(how=how) + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `` as a preprocessing step to `__repr__` methods. + + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. + """ + if self._values.has_nulls: + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) + else: + return self + + def fillna(self, value, downcast=None): + """ + Fill null values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill nulls. This value cannot be a + list-likes. + + downcast : dict, default is None + This Parameter is currently NON-FUNCTIONAL. 
+ + Returns + ------- + filled : Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, None, 4]) + >>> index + Int64Index([1, 2, null, 4], dtype='int64') + >>> index.fillna(3) + Int64Index([1, 2, 3, 4], dtype='int64') + """ + if downcast is not None: + raise NotImplementedError( + "`downcast` parameter is not yet supported" + ) + + return super().fillna(value=value) + def take(self, indices): """Gather only the specific subset of indices @@ -407,7 +457,7 @@ def to_pandas(self): Examples -------- >>> import cudf - >>> idx = cudf.core.index.as_index([-3, 10, 15, 20]) + >>> idx = cudf.Index([-3, 10, 15, 20]) >>> idx Int64Index([-3, 10, 15, 20], dtype='int64') >>> idx.to_pandas() @@ -426,7 +476,7 @@ def to_arrow(self): Examples -------- >>> import cudf - >>> idx = cudf.core.index.as_index([-3, 10, 15, 20]) + >>> idx = cudf.Index([-3, 10, 15, 20]) >>> idx.to_arrow() [ @@ -439,16 +489,12 @@ def to_arrow(self): return self._values.to_arrow() def tolist(self): - """ - Return a list type from index data. - Returns - ------- - list - """ - # TODO: Raise error as part - # of https://github.com/rapidsai/cudf/issues/5689 - return self.to_arrow().to_pylist() + raise TypeError( + "cuDF does not support conversion to host memory " + "via `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." 
+ ) to_list = tolist @@ -485,7 +531,7 @@ def min(self): Examples -------- >>> import cudf - >>> idx = cudf.core.index.as_index([3, 2, 1]) + >>> idx = cudf.Index([3, 2, 1]) >>> idx.min() 1 """ @@ -510,7 +556,7 @@ def max(self): Examples -------- >>> import cudf - >>> idx = cudf.core.index.as_index([3, 2, 1]) + >>> idx = cudf.Index([3, 2, 1]) >>> idx.max() 3 """ @@ -528,7 +574,7 @@ def sum(self): Examples -------- >>> import cudf - >>> idx = cudf.core.index.as_index([3, 2, 1]) + >>> idx = cudf.Index([3, 2, 1]) >>> idx.sum() 6 """ @@ -1387,7 +1433,7 @@ def equals(self, other): elif isinstance(other, cudf.core.index.RangeIndex): return self._start == other._start and self._stop == other._stop else: - return (self == other)._values.all() + return super().equals(other) def serialize(self): header = {} @@ -1630,27 +1676,46 @@ def __repr__(self): preprocess = concat([top, bottom]) else: preprocess = self - if preprocess._values.nullable: - output = ( - self.__class__(preprocess._values.astype("O").fillna("null")) - .to_pandas() - .__repr__() - ) + + # TODO: Change below usages accordingly to + # utilize `Index.to_string` once it is implemented + # related issue : https://github.com/pandas-dev/pandas/issues/35389 + if isinstance(preprocess, CategoricalIndex): + output = preprocess.to_pandas().__repr__() + output = output.replace("nan", cudf._NA_REP) + elif preprocess._values.nullable: + output = self._clean_nulls_from_index().to_pandas().__repr__() + + if not isinstance(self, StringIndex): + # We should remove all the single quotes + # from the output due to the type-cast to + # object dtype happening above. + # Note : The replacing of single quotes has + # to happen only incase of non-StringIndex types, + # as we want to preserve single quotes incase + # of StringIndex and it is valid to have them. 
+ output = output.replace("'", "") else: output = preprocess.to_pandas().__repr__() + # Fix and correct the class name of the output + # string by finding first occurrence of "(" in the output + index_class_split_index = output.find("(") + output = self.__class__.__name__ + output[index_class_split_index:] + lines = output.split("\n") - if len(lines) > 1: - tmp_meta = lines[-1] - prior_to_dtype = lines[-1].split("dtype")[0] - lines = lines[:-1] - lines.append(prior_to_dtype + "dtype='%s'" % self.dtype) - if self.name is not None: - lines[-1] = lines[-1] + ", name='%s'" % self.name - if "length" in tmp_meta: - lines[-1] = lines[-1] + ", length=%d)" % len(self) - else: - lines[-1] = lines[-1] + ")" + + tmp_meta = lines[-1] + dtype_index = lines[-1].rfind(" dtype=") + prior_to_dtype = lines[-1][:dtype_index] + lines = lines[:-1] + lines.append(prior_to_dtype + " dtype='%s'" % self.dtype) + if self.name is not None: + lines[-1] = lines[-1] + ", name='%s'" % self.name + if "length" in tmp_meta: + lines[-1] = lines[-1] + ", length=%d)" % len(self) + else: + lines[-1] = lines[-1] + ")" return "\n".join(lines) @@ -2130,6 +2195,16 @@ def str(self): def _constructor_expanddim(self): return cudf.MultiIndex + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `` as a preprocessing step to `__repr__` methods. 
+ """ + if self._values.has_nulls: + return self.fillna(cudf._NA_REP) + else: + return self + def as_index(arbitrary, **kwargs): """Create an Index from an arbitrary object diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 00f6d1bd64f..ce3c6806d54 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -73,9 +73,9 @@ def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) data = self._sr._column[arg] - index = self._sr.index.take(arg) if is_scalar(data) or data is None: return data + index = self._sr.index.take(arg) return self._sr._copy_construct(data=data, index=index) def __setitem__(self, key, value): @@ -432,11 +432,7 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice): - from cudf.core.index import RangeIndex - - slice_len = len(self._df) - start, stop, step = arg[0].indices(slice_len) - df._index = RangeIndex(start, stop) + df._index = as_index(self._df.index[arg[0]]) return df @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c9364f3d2be..1ebdd5101af 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -941,9 +941,75 @@ def is_monotonic_decreasing(self): def argsort(self, ascending=True, **kwargs): return self._source_data.argsort(ascending=ascending, **kwargs) + def fillna(self, value): + """ + Fill null values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill nulls. This value cannot be a + list-likes. + + Returns + ------- + filled : MultiIndex + + Examples + -------- + >>> import cudf + >>> index = cudf.MultiIndex( + ... levels=[["a", "b", "c", None], ["1", None, "5"]], + ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + ... names=["x", "y"], + ... 
) + >>> index + MultiIndex(levels=[0 a + 1 b + 2 c + 3 None + dtype: object, 0 1 + 1 None + 2 5 + dtype: object], + codes= x y + 0 0 0 + 1 0 2 + 2 1 1 + 3 2 1 + 4 3 0) + >>> index.fillna('hello') + MultiIndex(levels=[0 a + 1 b + 2 c + 3 hello + dtype: object, 0 1 + 1 5 + 2 hello + dtype: object], + codes= x y + 0 0 0 + 1 0 1 + 2 1 2 + 3 2 2 + 4 3 0) + """ + + return super().fillna(value=value) + def unique(self): return MultiIndex.from_frame(self._source_data.drop_duplicates()) + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in MultiIndex object + to `` as a preprocessing step to `__repr__` methods. + """ + index_df = self._source_data + return MultiIndex.from_frame( + index_df._clean_nulls_from_dataframe(index_df), names=self.names + ) + def memory_usage(self, deep=False): n = 0 for col in self._source_data._columns: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 742cfbd7053..d322a3335be 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
import pickle import warnings +from collections import abc as abc from numbers import Number from shutil import get_terminal_size @@ -37,7 +38,6 @@ from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, - is_datetime_dtype, is_list_dtype, is_list_like, is_mixed_with_object_dtype, @@ -512,7 +512,9 @@ def copy(self, deep=True): def __copy__(self, deep=True): return self.copy(deep) - def __deepcopy__(self): + def __deepcopy__(self, memo=None): + if memo is None: + memo = {} return self.copy() def append(self, to_append, ignore_index=False, verify_integrity=False): @@ -810,9 +812,9 @@ def __iter__(self): def to_dict(self, into=dict): raise TypeError( - "Implicit conversion to a host memory via to_dict() is not " - "allowed, To explicitly construct a dictionary object, " - "consider using .to_pandas().to_dict()" + "cuDF does not support conversion to host memory " + "via `to_dict()` method. Consider using " + "`.to_pandas().to_dict()` to construct a Python dictionary." ) def __setitem__(self, key, value): @@ -848,16 +850,12 @@ def values_to_string(self, nrows=None): return out def tolist(self): - """ - Return a list type from series data. - Returns - ------- - list - """ - # TODO: Raise error as part - # of https://github.com/rapidsai/cudf/issues/5689 - return self.to_arrow().to_pylist() + raise TypeError( + "cuDF does not support conversion to host memory " + "via `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." 
+ ) to_list = tolist @@ -975,17 +973,21 @@ def __repr__(self): preprocess = cudf.concat([top, bottom]) else: preprocess = self + + preprocess.index = preprocess.index._clean_nulls_from_index() + if ( preprocess.nullable - and not preprocess.dtype == "O" and not isinstance( preprocess._column, cudf.core.column.CategoricalColumn ) - and not is_datetime_dtype(preprocess.dtype) and not is_list_dtype(preprocess.dtype) ): output = ( - preprocess.astype("O").fillna("null").to_pandas().__repr__() + preprocess.astype("O") + .fillna(cudf._NA_REP) + .to_pandas() + .__repr__() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn @@ -1002,10 +1004,8 @@ def __repr__(self): min_rows=min_rows, max_rows=max_rows, length=show_dimensions, - na_rep="null", + na_rep=cudf._NA_REP, ) - elif is_datetime_dtype(preprocess.dtype): - output = preprocess.to_pandas().fillna("null").__repr__() else: output = preprocess.to_pandas().__repr__() @@ -1813,34 +1813,25 @@ def fill(self, fill_value, begin=0, end=-1, inplace=False): return self._fill([fill_value], begin, end, inplace) def fillna(self, value, method=None, axis=None, inplace=False, limit=None): - """Fill null values with ``value`` without changing the series' type. - - Parameters - ---------- - value : scalar or Series-like - Value to use to fill nulls. If `value`'s dtype differs from the - series, the fill value will be cast to the column's dtype before - applying the fill. If Series-like, null values are filled with the - values in corresponding indices of the given Series. + if isinstance(value, pd.Series): + value = Series.from_pandas(value) - Returns - ------- - result : Series - Copy with nulls filled. 
- """ - if method is not None: - raise NotImplementedError("The method keyword is not supported") - if limit is not None: - raise NotImplementedError("The limit keyword is not supported") - if axis: - raise NotImplementedError("The axis keyword is not supported") + if not (is_scalar(value) or isinstance(value, (abc.Mapping, Series))): + raise TypeError( + f'"value" parameter must be a scalar, dict ' + f"or Series, but you passed a " + f'"{type(value).__name__}"' + ) - data = self._column.fillna(value) + if isinstance(value, (abc.Mapping, Series)): + value = Series(value) + if not self.index.equals(value.index): + value = value.reindex(self.index) + value = value._column - if inplace: - self._column._mimic_inplace(data, inplace=True) - else: - return self._copy_construct(data=data) + return super().fillna( + value=value, method=method, axis=axis, inplace=inplace, limit=limit + ) def to_array(self, fillna=None): """Get a dense numpy array for the data. @@ -4211,6 +4202,49 @@ def merge( return result + def keys(self): + """ + Return alias for index. + + Returns + ------- + Index + Index of the Series. + + Examples + -------- + >>> import cudf + >>> sr = cudf.Series([10, 11, 12, 13, 14, 15]) + >>> sr + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + 5 15 + dtype: int64 + + >>> sr.keys() + RangeIndex(start=0, stop=6) + >>> sr = cudf.Series(['a', 'b', 'c']) + >>> sr + 0 a + 1 b + 2 c + dtype: object + >>> sr.keys() + RangeIndex(start=0, stop=3) + >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) + >>> sr + a 1 + b 2 + c 3 + dtype: int64 + >>> sr.keys() + StringIndex(['a' 'b' 'c'], dtype='object') + """ + return self.index + truediv_int_dtype_corrections = { "int8": "float32", diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 1552594178b..e5f4a05156f 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -62,8 +62,8 @@ def test_categorical_integer(): string = str(sr) expect_str = """ 0 a -1 null -2 null +1 +2 3 c 4 a dtype: category @@ -424,7 +424,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 assert "d" in pd_sr_1.cat.categories.to_list() - assert "d" in cd_sr_1.cat.categories.to_list() + assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() assert_eq(pd_sr_1, cd_sr_1) @@ -445,7 +445,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 assert "a" not in pd_sr_1.cat.categories.to_list() - assert "a" not in cd_sr_1.cat.categories.to_list() + assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() assert_eq(pd_sr_1, cd_sr_1) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index dd4a8075742..86fcec63fa0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -524,10 +524,10 @@ def test_dataframe_to_string(): expect = """ a b c 0 1 11 0 -1 2 12 null +1 2 12 2 3 13 2 3 4 14 3 -4 5 15 null +4 5 15 5 6 16 5 """ # values should match despite whitespace difference @@ -4171,59 +4171,6 @@ def test_constructor_properties(): df._constructor_expanddim -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5], - [1, 2, None, 4, 5], - [1.0, 2.0, 3.0, 4.0, 5.0], - [1.0, 2.0, None, 4.0, 5.0], - ["a", "b", "c", "d", "e"], - ["a", "b", None, "d", "e"], - [None, None, None, None, None], - np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), - np.array(["1991-11-20", None], dtype=np.datetime64), - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 - ), - np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), - ], -) -def test_tolist(data): - psr = pd.Series(data) - gsr = Series.from_pandas(psr) - - got = gsr.tolist() - 
expected = [x if not pd.isnull(x) else None for x in psr.tolist()] - - np.testing.assert_array_equal(got, expected) - - -def test_tolist_mixed_nulls(): - num_data = pa.array([1.0, None, np.float64("nan")]) - num_data_expect = [1.0, None, np.float64("nan")] - - time_data = pa.array( - [1, None, -9223372036854775808], type=pa.timestamp("ns") - ) - time_data_expect = [ - pd.Timestamp("1970-01-01T00:00:00.000000001"), - None, - pd.NaT, - ] - - df = DataFrame() - df["num_data"] = num_data - df["time_data"] = time_data - - num_data_got = df["num_data"].tolist() - time_data_got = df["time_data"].tolist() - - np.testing.assert_equal(num_data_got, num_data_expect) - for got, exp in zip(time_data_got, time_data_expect): # deal with NaT - assert (got == exp) or (pd.isnull(got) and pd.isnull(exp)) - - @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @pytest.mark.parametrize("as_dtype", ALL_TYPES) def test_df_astype_numeric_to_all(dtype, as_dtype): @@ -6139,9 +6086,9 @@ def test_dataframe_to_dict_error(): with pytest.raises( TypeError, match=re.escape( - r"Implicit conversion to a host memory via to_dict() is not " - r"allowed, To explicitly construct a dictionary object, " - r"consider using .to_pandas().to_dict()" + r"cuDF does not support conversion to host memory " + r"via `to_dict()` method. Consider using " + r"`.to_pandas().to_dict()` to construct a Python dictionary." ), ): df.to_dict() @@ -6149,14 +6096,89 @@ def test_dataframe_to_dict_error(): with pytest.raises( TypeError, match=re.escape( - r"Implicit conversion to a host memory via to_dict() is not " - r"allowed, To explicitly construct a dictionary object, " - r"consider using .to_pandas().to_dict()" + r"cuDF does not support conversion to host memory " + r"via `to_dict()` method. Consider using " + r"`.to_pandas().to_dict()` to construct a Python dictionary." 
), ): df["a"].to_dict() +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]}), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + } + ), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + }, + index=[10, 20, 30, 40, 50, 60], + ), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + }, + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame(index=["a", "b", "c", "d", "e", "f"]), + pd.DataFrame(columns=["a", "b", "c", "d", "e", "f"]), + pd.DataFrame(index=[10, 11, 12]), + pd.DataFrame(columns=[10, 11, 12]), + pd.DataFrame(), + pd.DataFrame({"one": [], "two": []}), + pd.DataFrame({2: [], 1: []}), + pd.DataFrame( + { + 0: [1, 2, 3, 4, 5, 10], + 1: ["abc", "def", "ghi", "xyz", "pqr", "abc"], + 100: ["a", "b", "b", "x", "z", "a"], + }, + index=[10, 20, 30, 40, 50, 60], + ), + ], +) +def test_dataframe_keys(df): + gdf = gd.from_pandas(df) + + assert_eq(df.keys(), gdf.keys()) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series([1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]), + pd.Series(["abc", "def", "ghi", "xyz", "pqr", "abc"]), + pd.Series( + [1, 2, 3, 4, 5, 10], + index=["abc", "def", "ghi", "xyz", "pqr", "abc"], + ), + pd.Series( + ["abc", "def", "ghi", "xyz", "pqr", "abc"], + index=[1, 2, 3, 4, 5, 10], + ), + pd.Series(index=["a", "b", "c", "d", "e", "f"]), + pd.Series(index=[10, 11, 12]), + pd.Series(), + pd.Series([]), + ], +) +def test_series_keys(ps): + gds = gd.from_pandas(ps) + + if len(ps) == 0 and not isinstance(ps.index, pd.RangeIndex): + assert_eq(ps.keys().astype("float64"), gds.keys()) + else: + assert_eq(ps.keys(), gds.keys()) + + @pytest.mark.parametrize( "df", [ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5d7a3b1f000..ef211454edb 100644 --- a/python/cudf/cudf/tests/test_index.py +++ 
b/python/cudf/cudf/tests/test_index.py @@ -1304,15 +1304,22 @@ def test_index_drop_duplicates(data, dtype): assert_eq(pdi.drop_duplicates(), gdi.drop_duplicates()) -@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) +@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] ) def test_index_tolist(data, dtype): - pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) - assert_eq(pdi.tolist(), gdi.tolist()) + with pytest.raises( + TypeError, + match=re.escape( + r"cuDF does not support conversion to host memory " + r"via `tolist()` method. Consider using " + r"`.to_arrow().to_pylist()` to construct a Python list." + ), + ): + gdi.tolist() @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) @@ -1342,3 +1349,31 @@ def test_index_values_host(data, dtype): pdi = pd.Index(data, dtype=dtype) np.testing.assert_array_equal(gdi.values_host, pdi.values) + + +@pytest.mark.parametrize( + "data,fill_value", + [ + ([1, 2, 3, 1, None, None], 1), + ([None, None, 3.2, 1, None, None], 10.0), + ([None, "a", "3.2", "z", None, None], "helloworld"), + (pd.Series(["a", "b", None], dtype="category"), "b"), + (pd.Series([None, None, 1.0], dtype="category"), 1.0), + ( + np.array([1, 2, 3, None], dtype="datetime64[s]"), + np.datetime64("2005-02-25"), + ), + ( + np.array( + [None, None, 122, 3242234, None, 6237846], + dtype="datetime64[ms]", + ), + np.datetime64("2005-02-25"), + ), + ], +) +def test_index_fillna(data, fill_value): + pdi = pd.Index(data) + gdi = cudf.Index(data) + + assert_eq(pdi.fillna(fill_value), gdi.fillna(fill_value)) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 727817d5f2f..21dbfd70f81 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1134,3 +1134,97 @@ def test_loc_datetime_index(sli, is_dataframe): got = gd_data.loc[sli] 
assert_eq(expect, got) + + +@pytest.mark.parametrize( + "gdf", + [ + cudf.DataFrame({"a": range(1000000)}), + cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), + cudf.DataFrame({"a": range(20), "b": range(20)}), + cudf.DataFrame( + { + "a": range(20), + "b": range(20), + "c": ["abc", "def", "xyz", "def", "pqr"] * 4, + } + ), + cudf.DataFrame(index=[1, 2, 3]), + cudf.DataFrame(index=range(1000000)), + cudf.DataFrame(columns=["a", "b", "c", "d"]), + cudf.DataFrame(columns=["a"], index=range(1000000)), + cudf.DataFrame( + columns=["a", "col2", "...col n"], index=range(1000000) + ), + cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), + cudf.DataFrame( + columns=["a", "b", "c", "d"], + index=cudf.Series(range(1000000)).astype("str"), + ), + ], +) +@pytest.mark.parametrize( + "slice", + [ + slice(250000, 500000), + slice(250000, 250001), + slice(500000), + slice(1, 10), + slice(10, 20), + slice(15, 24000), + slice(6), + ], +) +def test_dataframe_sliced(gdf, slice): + pdf = gdf.to_pandas() + + actual = gdf[slice] + expected = pdf[slice] + + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "gdf", + [ + cudf.DataFrame({"a": range(10000)}), + cudf.DataFrame( + { + "a": range(10000), + "b": range(10000), + "c": range(10000), + "d": range(10000), + "e": range(10000), + "f": range(10000), + } + ), + cudf.DataFrame({"a": range(20), "b": range(20)}), + cudf.DataFrame( + { + "a": range(20), + "b": range(20), + "c": ["abc", "def", "xyz", "def", "pqr"] * 4, + } + ), + cudf.DataFrame(index=[1, 2, 3]), + cudf.DataFrame(index=range(10000)), + cudf.DataFrame(columns=["a", "b", "c", "d"]), + cudf.DataFrame(columns=["a"], index=range(10000)), + cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), + cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), + cudf.DataFrame( + columns=["a", "b", "c", "d"], + index=cudf.Series(range(10000)).astype("str"), + ), + ], +) +@pytest.mark.parametrize( + "slice", [slice(6), slice(1), 
slice(7), slice(1, 3)], +) +def test_dataframe_iloc_index(gdf, slice): + pdf = gdf.to_pandas() + + actual = gdf.iloc[:, slice] + expected = pdf.iloc[:, slice] + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d544bd9f872..5d7f9265c0e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. """ Test related to MultiIndex @@ -881,6 +881,56 @@ def test_multiindex_to_arrow(): midx.to_arrow() +@pytest.mark.parametrize( + "pdi, fill_value, expected", + [ + ( + pd.MultiIndex( + levels=[[1, 3, 4, None], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + 5, + pd.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex( + levels=[[1, 3, 4, None], [1, None, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + 100, + pd.MultiIndex( + levels=[[1, 3, 4, 100], [1, 100, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex( + levels=[["a", "b", "c", None], ["1", None, "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + "100", + pd.MultiIndex( + levels=[["a", "b", "c", "100"], ["1", "100", "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ], +) +def test_multiIndex_fillna(pdi, fill_value, expected): + gdi = cudf.from_pandas(pdi) + + assert_eq(expected, gdi.fillna(fill_value)) + + @pytest.mark.parametrize( "pdi", [ diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 3ecf601010e..dc8331965fe 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,7 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pytest +import cudf from cudf.core import DataFrame, Series from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq @@ -159,145 +161,290 @@ def test_replace_strings(): assert_eq(pdf.replace("a", "e"), gdf.replace("a", "e")) +@pytest.mark.parametrize( + "psr", + [pd.Series([0, 1, None, 2, None]), pd.Series([0, 1, np.nan, 2, np.nan])], +) @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) @pytest.mark.parametrize("fill_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("fill_type", ["scalar", "series"]) -@pytest.mark.parametrize("null_value", [None, np.nan]) +@pytest.mark.parametrize("fill_value", [10, pd.Series([10, 20, 30, 40, 50])]) @pytest.mark.parametrize("inplace", [True, False]) def test_series_fillna_numerical( - data_dtype, fill_dtype, fill_type, null_value, inplace + psr, data_dtype, fill_dtype, fill_value, inplace ): # TODO: These tests should use Pandas' nullable int type # when we support a recent enough version of Pandas # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + if np.dtype(data_dtype).kind not in ("i"): + psr = psr.astype(data_dtype) - if fill_type == "scalar": - fill_value = np.random.randint(0, 5) - expect = np.array([0, 1, fill_value, 2, fill_value], dtype=data_dtype) - elif fill_type == "series": - data = np.random.randint(0, 5, (5,)) - fill_value = pd.Series(data, dtype=data_dtype) - expect = np.array( - [0, 1, fill_value[2], 2, fill_value[4]], dtype=data_dtype - ) + gsr = cudf.from_pandas(psr) - sr = Series([0, 1, null_value, 2, null_value], dtype=data_dtype) - result = sr.fillna(fill_value, inplace=inplace) + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value - if inplace: - result = sr + expected = psr.fillna(fill_value, inplace=inplace) + actual = gsr.fillna(fill_value_cudf, inplace=inplace) - got = result.to_array() + if inplace: + expected = psr + actual = gsr - 
np.testing.assert_equal(expect, got) + assert_eq(expected, actual) -@pytest.mark.parametrize("fill_type", ["scalar", "series"]) -@pytest.mark.parametrize("null_value", [None, np.nan]) +@pytest.mark.parametrize( + "psr", + [ + pd.Series(["a", "b", "a", None, "c", None], dtype="category"), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["q", "r", "z", "a", "b", "c"], + ), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["x", "t", "p", "q", "r", "z"], + ), + pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), + pd.Series( + [None, None, None, None, None, None, "a", "b", "c"], + dtype="category", + ), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + "c", + pd.Series(["c", "c", "c", "c", "c", "a"], dtype="category"), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["x", "t", "p", "q", "r", "z"], + ), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["q", "r", "z", "a", "b", "c"], + ), + pd.Series(["a", "b", "a", None, "c", None], dtype="category"), + pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), + ], +) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_categorical(fill_type, null_value, inplace): - data = pd.Series( - ["a", "b", "a", null_value, "c", null_value], dtype="category" - ) - sr = Series.from_pandas(data) - - if fill_type == "scalar": - fill_value = "c" - expect = pd.Series(["a", "b", "a", "c", "c", "c"], dtype="category") - elif fill_type == "series": - fill_value = pd.Series( - ["c", "c", "c", "c", "c", "a"], dtype="category" - ) - expect = pd.Series(["a", "b", "a", "c", "c", "a"], dtype="category") +def test_fillna_categorical(psr, fill_value, inplace): + + gsr = Series.from_pandas(psr) + + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value - got = sr.fillna(fill_value, inplace=inplace) + expected = 
psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) if inplace: - got = sr + expected = psr + got = gsr - assert_eq(expect, got) + assert_eq(expected, got) -@pytest.mark.parametrize("fill_type", ["scalar", "series"]) +@pytest.mark.parametrize( + "psr", + [ + pd.Series(pd.date_range("2010-01-01", "2020-01-10", freq="1y")), + pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + ), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + ), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("2010-01-02"), + pd.Series(pd.date_range("2010-01-01", "2020-01-10", freq="1y")) + + pd.Timedelta("1d"), + pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + ), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + ), + ], +) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_datetime(fill_type, inplace): - psr = pd.Series(pd.date_range("2010-01-01", "2020-01-10", freq="1y")) - - if fill_type == "scalar": - fill_value = pd.Timestamp("2010-01-02") - elif fill_type == "series": - fill_value = psr + pd.Timedelta("1d") +def test_fillna_datetime(psr, fill_value, inplace): + gsr = cudf.from_pandas(psr) - psr[[5, 9]] = None - sr = Series.from_pandas(psr) + if 
isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value - expect = psr.fillna(fill_value) - got = sr.fillna(fill_value, inplace=inplace) + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) if inplace: - got = sr + got = gsr + expected = psr - assert_eq(expect, got) + assert_eq(expected, got) -@pytest.mark.parametrize("fill_type", ["scalar", "series", "dict"]) +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}), + pd.DataFrame( + {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] + ), + ], +) +@pytest.mark.parametrize( + "value", + [ + 10, + pd.Series([10, 20, 30]), + pd.Series([3, 4, 5]), + pd.Series([10, 20, 30], index=["z", "a", "p"]), + {"a": 5, "b": pd.Series([3, 4, 5])}, + {"a": 5001}, + {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])}, + {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])}, + {"c": 100}, + ], +) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_dataframe(fill_type, inplace): - pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}) +def test_fillna_dataframe(df, value, inplace): + pdf = df gdf = DataFrame.from_pandas(pdf) - if fill_type == "scalar": - fill_value_pd = 5 - fill_value_cudf = fill_value_pd - elif fill_type == "series": - fill_value_pd = pd.Series([3, 4, 5]) - fill_value_cudf = Series.from_pandas(fill_value_pd) + fill_value_pd = value + if isinstance(fill_value_pd, (pd.Series, pd.DataFrame)): + fill_value_cudf = cudf.from_pandas(fill_value_pd) + elif isinstance(fill_value_pd, dict): + fill_value_cudf = {} + for key in fill_value_pd: + temp_val = fill_value_pd[key] + if isinstance(temp_val, cudf.Series): + temp_val = cudf.from_pandas(temp_val) + fill_value_cudf[key] = temp_val else: - fill_value_pd = {"a": 5, "b": pd.Series([3, 4, 5])} - fill_value_cudf = { - "a": fill_value_pd["a"], - "b": 
Series.from_pandas(fill_value_pd["b"]), - } - - # https://github.com/pandas-dev/pandas/issues/27197 - # pandas df.fill_value with series is not working - - if isinstance(fill_value_pd, pd.Series): - expect = pd.DataFrame() - for col in pdf.columns: - expect[col] = pdf[col].fillna(fill_value_pd) - else: - expect = pdf.fillna(fill_value_pd) + fill_value_cudf = value + expect = pdf.fillna(fill_value_pd, inplace=inplace) got = gdf.fillna(fill_value_cudf, inplace=inplace) if inplace: got = gdf + expect = pdf assert_eq(expect, got) -@pytest.mark.parametrize("fill_type", ["scalar", "series"]) +@pytest.mark.parametrize( + "psr", + [ + pd.Series(["a", "b", "c", "d"]), + pd.Series([None] * 4, dtype="object"), + pd.Series(["z", None, "z", None]), + pd.Series(["x", "y", None, None, None]), + pd.Series([None, None, None, "i", "P"]), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + "a", + pd.Series(["a", "b", "c", "d"]), + pd.Series(["z", None, "z", None]), + pd.Series([None] * 4, dtype="object"), + pd.Series(["x", "y", None, None, None]), + pd.Series([None, None, None, "i", "P"]), + ], +) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_string(fill_type, inplace): - psr = pd.Series(["z", None, "z", None]) +def test_fillna_string(psr, fill_value, inplace): + gsr = cudf.from_pandas(psr) - if fill_type == "scalar": - fill_value_pd = "a" - fill_value_cudf = fill_value_pd - elif fill_type == "series": - fill_value_pd = pd.Series(["a", "b", "c", "d"]) - fill_value_cudf = Series.from_pandas(fill_value_pd) - - sr = Series.from_pandas(psr) + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value - expect = psr.fillna(fill_value_pd) - got = sr.fillna(fill_value_cudf, inplace=inplace) + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) if inplace: - got = sr + expected = psr + got = gsr - assert_eq(expect, got) + 
assert_eq(expected, got) @pytest.mark.parametrize("data_dtype", INTEGER_TYPES) @@ -720,3 +867,44 @@ def test_multiindex_clip(lower, upper, inplace): assert_eq(df, index.to_frame(index=False)) else: assert_eq(expected, got.to_frame(index=False)) + + +@pytest.mark.parametrize( + "data", [[1, 2.0, 3, 4, None, 1, None, 10, None], ["a", "b", "c"]] +) +@pytest.mark.parametrize( + "index", + [ + None, + [1, 2, 3], + ["a", "b", "z"], + ["a", "b", "c", "d", "e", "f", "g", "l", "m"], + ], +) +@pytest.mark.parametrize("value", [[1, 2, 3, 4, None, 1, None, 10, None]]) +def test_series_fillna(data, index, value): + psr = pd.Series( + data, + index=index if index is not None and len(index) == len(data) else None, + ) + gsr = Series( + data, + index=index if index is not None and len(index) == len(data) else None, + ) + + expect = psr.fillna(pd.Series(value)) + got = gsr.fillna(Series(value)) + assert_eq(expect, got) + + +def test_series_fillna_error(): + psr = pd.Series([1, 2, None, 3, None]) + gsr = cudf.from_pandas(psr) + + try: + psr.fillna(pd.DataFrame({"a": [1, 2, 3]})) + except Exception as e: + with pytest.raises(type(e), match=str(e)): + gsr.fillna(cudf.DataFrame({"a": [1, 2, 3]})) + else: + raise AssertionError("Expected psr.fillna to fail") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 66d4deda12d..c9dacab793a 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pytest @@ -21,15 +21,16 @@ def test_null_series(nrows, dtype): ps = sr.to_pandas() pd.options.display.max_rows = int(nrows) psrepr = ps.__repr__() - psrepr = psrepr.replace("NaN", "null") - psrepr = psrepr.replace("NaT", "null") + psrepr = psrepr.replace("NaN", "") + psrepr = psrepr.replace("NaT", "") + psrepr = psrepr.replace("None", "") if ( dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("long") ): psrepr = psrepr.replace( - str(sr._column.default_na_value()) + "\n", "null\n" + str(sr._column.default_na_value()) + "\n", "\n" ) print(psrepr) @@ -59,8 +60,9 @@ def test_null_dataframe(ncols): pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) pdfrepr = pdf.__repr__() - pdfrepr = pdfrepr.replace("NaN", "null") - pdfrepr = pdfrepr.replace("NaT", "null") + pdfrepr = pdfrepr.replace("NaN", "") + pdfrepr = pdfrepr.replace("NaT", "") + pdfrepr = pdfrepr.replace("None", "") print(pdf) print(gdf) assert pdfrepr.split() == gdf.__repr__().split() @@ -236,3 +238,325 @@ def test_generic_index(length, dtype): gsr = cudf.Series.from_pandas(psr) assert psr.index.__repr__() == gsr.index.__repr__() + + +@pytest.mark.parametrize( + "gdf", + [ + cudf.DataFrame({"a": range(10000)}), + cudf.DataFrame({"a": range(10000), "b": range(10000)}), + cudf.DataFrame({"a": range(20), "b": range(20)}), + cudf.DataFrame( + { + "a": range(20), + "b": range(20), + "c": ["abc", "def", "xyz", "def", "pqr"] * 4, + } + ), + cudf.DataFrame(index=[1, 2, 3]), + cudf.DataFrame(index=range(10000)), + cudf.DataFrame(columns=["a", "b", "c", "d"]), + cudf.DataFrame(columns=["a"], index=range(10000)), + cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), + cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), + cudf.DataFrame( + columns=["a", "b", "c", "d"], + index=cudf.Series(range(10000)).astype("str"), + ), + ], +) +@pytest.mark.parametrize( + "slice", + [ + slice(2500, 5000), + slice(2500, 2501), 
+ slice(5000), + slice(1, 10), + slice(10, 20), + slice(15, 2400), + ], +) +@pytest.mark.parametrize("max_seq_items", [1, 10, 60, 10000, None]) +@pytest.mark.parametrize("max_rows", [1, 10, 60, 10000, None]) +def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): + pd.options.display.max_seq_items = max_seq_items + pd.options.display.max_rows = max_rows + pdf = gdf.to_pandas() + + sliced_gdf = gdf[slice] + sliced_pdf = pdf[slice] + + expected_repr = sliced_pdf.__repr__().replace("None", "") + actual_repr = sliced_gdf.__repr__() + + assert expected_repr == actual_repr + pd.reset_option("display.max_rows") + pd.reset_option("display.max_seq_items") + + +@pytest.mark.parametrize( + "index,expected_repr", + [ + ( + cudf.Index([1, 2, 3, None]), + "Int64Index([1, 2, 3, ], dtype='int64')", + ), + ( + cudf.Index([None, 2.2, 3.324342, None]), + "Float64Index([, 2.2, 3.324342, ], dtype='float64')", + ), + ( + cudf.Index([None, None, None], name="hello"), + "Float64Index([, , ], dtype='float64', name='hello')", + ), + ( + cudf.Index([None], name="hello"), + "Float64Index([], dtype='float64', name='hello')", + ), + ( + cudf.Index([None], dtype="int8", name="hello"), + "Int8Index([], dtype='int8', name='hello')", + ), + ( + cudf.Index([None] * 50, dtype="object"), + "StringIndex([None None None None None None None None " + "None None None None None None\n None None None None None None " + "None None None None None None None None\n None None None None " + "None None None None None None None None None None\n None None " + "None None None None None None], dtype='object')", + ), + ( + cudf.Index([None] * 20, dtype="uint32"), + "UInt32Index([, , , , , , , , " + ",\n , , , , , , , , " + ",\n , ],\n dtype='uint32')", + ), + ( + cudf.Index( + [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" + ), + "Int16Index([, 111, 22, 33, , 23, 34, 2343, ], " + "dtype='int16')", + ), + ( + cudf.Index([1, 2, 3, None], dtype="category"), + "CategoricalIndex([1, 2, 3, ], 
categories=[1, 2, 3], " + "ordered=False, dtype='category')", + ), + ( + cudf.Index([None, None], dtype="category"), + "CategoricalIndex([, ], categories=[], ordered=False, " + "dtype='category')", + ), + ( + cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), + "DatetimeIndex([1970-01-01 00:00:00.000000010, " + "1970-01-01 00:00:00.000000020," + "\n 1970-01-01 00:00:00.000000030, ],\n " + "dtype='datetime64[ns]')", + ), + ( + cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), + "DatetimeIndex([1970-01-01 00:00:10, " + "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" + " ],\n dtype='datetime64[s]')", + ), + ( + cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), + "DatetimeIndex([1970-01-01 00:00:00.000010, " + "1970-01-01 00:00:00.000020,\n " + "1970-01-01 00:00:00.000030, ],\n " + "dtype='datetime64[us]')", + ), + ( + cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), + "DatetimeIndex([1970-01-01 00:00:00.010, " + "1970-01-01 00:00:00.020,\n " + "1970-01-01 00:00:00.030, ],\n " + "dtype='datetime64[ms]')", + ), + ( + cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), + "DatetimeIndex([, , , , , , , , " + ",\n ],\n dtype='datetime64[ms]')", + ), + ], +) +def test_generic_index_null(index, expected_repr): + + actual_repr = index.__repr__() + + assert expected_repr == actual_repr + + +@pytest.mark.parametrize( + "df,pandas_special_case", + [ + (pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, None]), False), + ( + pd.DataFrame( + { + "a": [1, None, 3], + "string_col": ["hello", "world", "rapids"], + }, + index=[None, "a", "b"], + ), + True, + ), + (pd.DataFrame([], index=[None, "a", "b"]), False), + (pd.DataFrame({"aa": [None, None]}, index=[None, None]), False), + (pd.DataFrame({"aa": [1, 2, 3]}, index=[None, None, None]), False), + ( + pd.DataFrame( + {"aa": [None, 2, 3]}, + index=np.array([1, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [None, 2, 3]}, + 
index=np.array([100, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [None, None, None]}, + index=np.array([None, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [1, None, 3]}, + index=np.array([10, 15, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} + ).set_index(["a", "v"]), + False, + ), + ( + pd.DataFrame( + { + "a": [1, 2, None], + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"]), + False, + ), + ( + pd.DataFrame( + { + "a": np.array([1, None, None], dtype="datetime64[ns]"), + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"]), + False, + ), + ], +) +def test_dataframe_null_index_repr(df, pandas_special_case): + pdf = df + gdf = cudf.from_pandas(pdf) + + expected_repr = ( + pdf.__repr__() + .replace("NaN", "") + .replace("NaT", "") + .replace("None", "") + ) + actual_repr = gdf.__repr__() + + if pandas_special_case: + # Pandas inconsistently print StringIndex null values + # as `None` at some places and `NaN` at few other places + # Whereas cudf is consistent with strings `null` values + # to be printed as `None` everywhere. 
+ actual_repr = gdf.__repr__().replace("None", "") + + assert expected_repr.split() == actual_repr.split() + + +@pytest.mark.parametrize( + "sr,pandas_special_case", + [ + (pd.Series([1, 2, 3], index=[10, 20, None]), False), + (pd.Series([1, None, 3], name="a", index=[None, "a", "b"]), True), + (pd.Series(None, index=[None, "a", "b"], dtype="float"), True), + (pd.Series([None, None], name="aa", index=[None, None]), False), + (pd.Series([1, 2, 3], index=[None, None, None]), False), + ( + pd.Series( + [None, 2, 3], + index=np.array([1, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.Series( + [None, None, None], + index=np.array([None, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.Series( + [1, None, 3], + index=np.array([10, 15, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} + ).set_index(["a", "v"])["p"], + False, + ), + ( + pd.DataFrame( + { + "a": [1, 2, None], + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"])["p"], + False, + ), + ( + pd.DataFrame( + { + "a": np.array([1, None, None], dtype="datetime64[ns]"), + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"])["p"], + False, + ), + ], +) +def test_series_null_index_repr(sr, pandas_special_case): + psr = sr + gsr = cudf.from_pandas(psr) + + expected_repr = ( + psr.__repr__() + .replace("NaN", "") + .replace("NaT", "") + .replace("None", "") + ) + actual_repr = gsr.__repr__() + + if pandas_special_case: + # Pandas inconsistently print StringIndex null values + # as `None` at some places and `NaN` at few other places + # Whereas cudf is consistent with strings `null` values + # to be printed as `None` everywhere. 
+ actual_repr = gsr.__repr__().replace("None", "") + assert expected_repr.split() == actual_repr.split() diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8795304155d..28eaf4c3d92 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -319,6 +319,36 @@ def test_series_column_iter_error(): iter(gs._column) +@pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, None, 4.0, 5.0], + ["a", "b", "c", "d", "e"], + ["a", "b", None, "d", "e"], + [None, None, None, None, None], + np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), + np.array(["1991-11-20", None], dtype=np.datetime64), + np.array( + ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 + ), + np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), + ], +) +def test_series_tolist(data): + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + + with pytest.raises( + TypeError, + match=re.escape( + r"cuDF does not support conversion to host memory " + r"via `tolist()` method. Consider using " + r"`.to_arrow().to_pylist()` to construct a Python list." 
+ ), + ): + gsr.tolist() + + @pytest.mark.parametrize( "data", [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index b9d0f427934..ad6f78ea3bf 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -153,8 +153,8 @@ def test_string_repr(ps_gs, item): expect = str(expect_out) got = str(got_out) - # if isinstance(expect_out, pd.Series): - # expect = expect.replace("object", "str") + if got_out is not None and len(got_out) > 1: + expect = expect.replace("None", "") assert expect == got diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 479ad1d1660..82c779dfeba 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -96,6 +96,40 @@ def test_normalize_spaces(): assert_eq(expected, actual) +def test_normalize_characters(): + strings = cudf.Series( + ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] + ) + expected = cudf.Series( + [ + " 乾 乿 ", + "accentuate", + "ascend , descend", + "", + None, + "stock ^ $ 1", + ] + ) + + actual = strings.str.normalize_characters() + assert type(expected) == type(actual) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + " 乾 乿 ", + "ĂĆCĖÑTÜATE", + "âscénd , Descend", + "", + None, + "Stock ^ $ 1", + ] + ) + actual = strings.str.normalize_characters(do_lower=False) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "n, separator, expected_values", [ diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a0cf8c89129..723c8b4a37e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -11,6 +11,7 @@ import cudf +_NA_REP = "" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), diff --git a/python/dask_cudf/dask_cudf/backends.py 
b/python/dask_cudf/dask_cudf/backends.py index 03373053e8b..a98d47c4a77 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,3 +1,4 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. import cupy as cp import numpy as np import pandas as pd @@ -5,7 +6,7 @@ from dask.dataframe.categorical import categorical_dtype_dispatch from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty -from dask.dataframe.methods import concat_dispatch +from dask.dataframe.methods import concat_dispatch, tolist_dispatch from dask.dataframe.utils import ( UNKNOWN_CATEGORIES, _nonempty_scalar, @@ -218,6 +219,11 @@ def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) +@tolist_dispatch.register((cudf.Series, cudf.Index)) +def tolist_cudf(obj): + return obj.to_arrow().to_pylist() + + try: from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch diff --git a/python/dask_cudf/dask_cudf/io/opt_parquet.py b/python/dask_cudf/dask_cudf/io/opt_parquet.py deleted file mode 100644 index ca590554130..00000000000 --- a/python/dask_cudf/dask_cudf/io/opt_parquet.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
-import warnings - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import parquet as pq - -from dask.base import tokenize -from dask.dataframe.io.parquet.core import set_index_columns -from dask.dataframe.io.parquet.utils import ( - _normalize_index_columns, - _parse_pandas_metadata, -) -from dask.dataframe.io.utils import _get_pyarrow_dtypes, _meta_from_dtypes - -from dask_cudf import DataFrame - -import cudf -from cudf.core.column import as_column - -try: - import ujson as json -except ImportError: - import json - - -def _get_dataset_and_parts(data_path, fs, row_groups_per_part): - parts = [] - dataset = pq.ParquetDataset(data_path, filesystem=fs) - if dataset.metadata: - fpath_last = None - rgi = 0 - rg_list = [] - for rg in range(dataset.metadata.num_row_groups): - - fpath = dataset.metadata.row_group(rg).column(0).file_path - - if fpath_last and fpath_last != fpath: - rgi = 0 - full_path = fs.sep.join([data_path, fpath_last]) - parts.append(tuple([full_path, rg_list])) - rg_list = [] - elif len(rg_list) >= row_groups_per_part: - full_path = fs.sep.join([data_path, fpath_last]) - parts.append(tuple([full_path, rg_list])) - rg_list = [] - - if fpath is None: - raise ValueError("_metadata file is missing file_path string.") - - fpath_last = fpath - rg_list.append(rgi) - rgi += 1 - if rg_list: - full_path = fs.sep.join([data_path, fpath_last]) - parts.append(tuple([full_path, rg_list])) - else: - warnings.warn( - "Must have metadata file to split by row group." - "Using full file for each partition." 
- ) - for piece in dataset.pieces: - parts.append(tuple([piece.path, None])) - - return dataset, parts - - -def _read_metadata(fs, path, row_groups_per_part, index=None): - dataset, parts = _get_dataset_and_parts(path, fs, row_groups_per_part) - if not dataset.metadata: - raise ValueError("_metadata file is missing.") - - schema = dataset.metadata.schema.to_arrow_schema() - columns = None - has_pandas_metadata = ( - schema.metadata is not None and b"pandas" in schema.metadata - ) - categories = None - if has_pandas_metadata: - pandas_metadata = json.loads(schema.metadata[b"pandas"].decode("utf8")) - ( - index_names, - column_names, - storage_name_mapping, - column_index_names, - ) = _parse_pandas_metadata(pandas_metadata) - categories = [] - for col in pandas_metadata["columns"]: - if (col["pandas_type"] == "categorical") and ( - col["name"] not in categories - ): - categories.append(col["name"]) - else: - index_names = [] - column_names = schema.names - storage_name_mapping = {k: k for k in column_names} - column_index_names = [None] - - if index is None and index_names: - index = index_names - - column_names, index_names = _normalize_index_columns( - columns, column_names, index, index_names - ) - all_columns = index_names + column_names - - dtypes = _get_pyarrow_dtypes(schema, categories) - dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()} - - index_cols = index or () - meta = _meta_from_dtypes( - all_columns, dtypes, index_cols, column_index_names - ) - - return meta, parts - - -def _read_partition(part, index, columns, strings_to_cats): - # Read dataset part - path, row_groups = part - if columns is not None: - columns = [c for c in columns] - if isinstance(index, list): - columns += index - - df = cudf.io.read_parquet( - path, - row_groups=row_groups, - columns=columns, - strings_to_cats=strings_to_cats, - ) - - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - return df - - -def parquet_reader( - path, - 
columns=None, - row_groups_per_part=None, - index=None, - storage_options=None, - **kwargs, -): - - name = "opt-read-parquet-" + tokenize( - path, columns, index, storage_options, row_groups_per_part - ) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - if len(paths) > 1 or not fs.isdir(paths[0]): - raise ValueError( - "Must pass in a directory path to use `row_groups_per_part`." - ) - - auto_index_allowed = False - if index is None: - # User is allowing auto-detected index - auto_index_allowed = True - if index and isinstance(index, str): - index = [index] - - dd_meta, parts = _read_metadata(fs, path, row_groups_per_part, index=index) - strings_to_cats = kwargs.get("strings_to_categorical", False) - meta = cudf.DataFrame(index=dd_meta.index) - for col in dd_meta.columns: - if dd_meta[col].dtype == "O": - meta[col] = as_column( - dd_meta[col], dtype="int32" if strings_to_cats else "object" - ) - else: - meta[col] = as_column(dd_meta[col]) - - if meta.index.name is not None: - index = meta.index.name - - # Account for index and columns arguments. 
- # Modify `meta` dataframe accordingly - index_in_columns = False - meta, index, columns = set_index_columns( - meta, index, columns, index_in_columns, auto_index_allowed - ) - - dsk = {} - for p, part in enumerate(parts): - read_key = (name, p) - dsk[read_key] = ( - _read_partition, - part, - index, - columns, - strings_to_cats, - ) - - # Set the index that was previously treated as a column - if index_in_columns: - meta = meta.set_index(index) - - divisions = [None] * (len(parts) + 1) - return DataFrame(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 0215384b51b..7349b837745 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -69,8 +69,7 @@ def read_partition( if index and (index[0] in df.columns): df = df.set_index(index[0]) - - if len(partition_keys) > 0: + if partition_keys: if partitions is None: raise ValueError("Must pass partition sets") for i, (name, index2) in enumerate(partition_keys): @@ -150,9 +149,7 @@ def write_metadata(parts, fmd, fs, path, append=False, **kwargs): def read_parquet( path, columns=None, - chunksize=None, - split_row_groups=True, - gather_statistics=None, + split_row_groups=None, row_groups_per_part=None, **kwargs, ): @@ -177,41 +174,17 @@ class to support full functionality. columns = [columns] if row_groups_per_part: - from .opt_parquet import parquet_reader - warnings.warn( - "Using optimized read_parquet engine. This option does not " - "support partitioned datsets or filtering, and will not " - "result in known divisions. Do not use `row_groups_per_part` " - "if full support is needed." - ) - if kwargs.get("filters", None): - raise ValueError( - "Cannot use `filters` with `row_groups_per_part=True`." - ) - return parquet_reader( - path, - columns=columns, - row_groups_per_part=row_groups_per_part, - **kwargs, + "row_groups_per_part is deprecated. 
" + "Pass an integer value to split_row_groups instead." ) + if split_row_groups is None: + split_row_groups = row_groups_per_part - if chunksize and gather_statistics is False: - warnings.warn( - "Setting chunksize parameter with gather_statistics=False. " - "Use gather_statistics=True to enable row-group aggregation." - ) - if chunksize and split_row_groups is False: - warnings.warn( - "Setting chunksize parameter with split_row_groups=False. " - "Use split_row_groups=True to enable row-group aggregation." - ) return dd.read_parquet( path, columns=columns, - chunksize=chunksize, split_row_groups=split_row_groups, - gather_statistics=gather_statistics, engine=CudfEngine, **kwargs, ) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index afa458b0450..16454019929 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,6 +6,7 @@ import tlz as toolz from dask.base import tokenize +from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series from dask.dataframe.shuffle import rearrange_by_column from dask.highlevelgraph import HighLevelGraph @@ -241,5 +242,5 @@ def sort_values( df4 = df3.map_partitions(M.sort_values, by) if not isinstance(divisions, gd.DataFrame) and set_divisions: # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = divisions.tolist() + df4.divisions = methods.tolist(divisions) return df4