Merge branch 'branch-0.15' into bug-disallow-sum-timestamp
karthikeyann authored Jul 28, 2020
2 parents 2026363 + 448f38a commit 53f427a
Showing 40 changed files with 2,014 additions and 639 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -48,7 +48,9 @@
- PR #5658 Add `filter_tokens` nvtext API
- PR #5666 Add `filter_characters_of_type` strings API
- PR #5673 Always build and test with per-thread default stream enabled in the GPU CI build
- PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function
- PR #5572 Add `cudf::encode` API.
- PR #5568 Add support for `Series.keys()` and `DataFrame.keys()`

## Improvements

@@ -136,7 +138,12 @@
- PR #5702 Add inherited methods to python docs and other docs fixes
- PR #5733 Add support for `size` property in `DataFrame`/ `Series` / `Index`/ `MultiIndex`
- PR #5743 Reduce number of test cases in concatenate benchmark
- PR #5748 Disable `tolist` API in `Series` & `Index` and add `tolist` dispatch in `dask-cudf`
- PR #5756 Switch JNI code to use the RMM owning wrapper
- PR #5725 Integrate Gbenchmarks into CI
- PR #5752 Add cuDF internals documentation (ColumnAccessor)
- PR #5759 Fix documentation describing JIT cache default location
- PR #5775 Update dask_cudf.read_parquet to align with upstream improvements

## Bug Fixes

@@ -202,10 +209,13 @@
- PR #5692 Fix compilation issue with gcc 7.4.0 and CUDA 10.1
- PR #5693 Add fix missing from PR 5656 to update local docker image to py3.7
- PR #5703 Small fix for dataframe constructor with cuda array interface objects that don't have `descr` field
- PR #5727 Fix `Index.__repr__` to allow representation of null values
- PR #5719 Fix Frame._concat() with categorical columns
- PR #5736 Disable unsigned type in ORC writer benchmarks
- PR #5745 Update JNI cast for inability to cast timestamp and integer types
- PR #5750 Add RMM_ROOT/include to the spdlog search path in JNI build
- PR #5763 Update Java slf4j version to match Spark 3.0
- PR #5766 Fix issue related to `iloc` and slicing a `DataFrame`
- PR #5319 Disallow SUM and specialize MEAN of timestamp types


191 changes: 191 additions & 0 deletions ci/benchmark/build.sh
@@ -0,0 +1,191 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION.
#########################################
# cuDF GPU build and test script for CI #
#########################################
set -e
NUMARGS=$#
ARGS=$*

# Logger function for build status output
function logger() {
  echo -e "\n>>>> $@\n"
}

# Arg parsing function
function hasArg {
  (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
}

# Set path and build parallel level
export PATH=/conda/bin:/usr/local/cuda/bin:$PATH
export PARALLEL_LEVEL=4
export CUDA_REL=${CUDA_VERSION%.*}
export HOME=$WORKSPACE

# Parse git describe
cd $WORKSPACE
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`

# Set Benchmark Vars
export ASVRESULTS_DIR=${WORKSPACE}/ci/artifacts/asv/results
export GBENCH_BENCHMARKS_DIR=${WORKSPACE}/cpp/build/gbenchmarks/

# Ensure ASV results directory exists
mkdir -p ${ASVRESULTS_DIR}

# Set `LIBCUDF_KERNEL_CACHE_PATH` environment variable to $HOME/.jitify-cache because
# it's local to the container's virtual file system, and not shared with other CI jobs
# like `/tmp` is.
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"

function remove_libcudf_kernel_cache_dir {
  EXITCODE=$?
  logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH"
  rm -rf "$LIBCUDF_KERNEL_CACHE_PATH" || logger "could not rm -rf $LIBCUDF_KERNEL_CACHE_PATH"
  exit $EXITCODE
}

trap remove_libcudf_kernel_cache_dir EXIT

mkdir -p "$LIBCUDF_KERNEL_CACHE_PATH" || logger "could not mkdir -p $LIBCUDF_KERNEL_CACHE_PATH"

################################################################################
# SETUP - Check environment
################################################################################

logger "Check environment..."
env

logger "Check GPU usage..."
nvidia-smi

logger "Activate conda env..."
source activate rapids

# Enter dependencies to be shown in ASV tooltips.
CUDF_DEPS=(librmm)
LIBCUDF_DEPS=(librmm)

conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
    "rapids-build-env=$MINOR_VERSION.*" \
    "rapids-notebook-env=$MINOR_VERSION.*" \
    rapids-pytest-benchmark

# https://docs.rapids.ai/maintainers/depmgmt/
# conda remove -f rapids-build-env rapids-notebook-env
# conda install "your-pkg=1.0.0"

# Install the master version of dask, distributed, and streamz
logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps"
pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps
logger "pip install git+https://github.com/python-streamz/streamz.git --upgrade --no-deps"
pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps

logger "Check versions..."
python --version
$CC --version
$CXX --version
conda list

################################################################################
# BUILD - Build libcudf, cuDF and dask_cudf from source
################################################################################

logger "Build libcudf..."
if [[ ${BUILD_MODE} == "pull-request" ]]; then
  $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests --ptds
else
  $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests -l --ptds
fi

################################################################################
# BENCHMARK - Run and parse libcudf and cuDF benchmarks
################################################################################

logger "Running benchmarks..."

# Download the GBench results parser
curl -L https://raw.githubusercontent.com/rapidsai/benchmark/main/parser/GBenchToASV.py --output GBenchToASV.py

###
# Generate Metadata for dependencies
###

# Concatenate dependency arrays, convert to JSON array,
# and remove duplicates.
X=("${CUDF_DEPS[@]}" "${LIBCUDF_DEPS[@]}")
DEPS=$(printf '%s\n' "${X[@]}" | jq -R . | jq -s 'unique')

# Build object with k/v pairs of "dependency:version"
DEP_VER_DICT=$(jq -n '{}')
for DEP in $(echo "${DEPS}" | jq -r '.[]'); do
  VER=$(conda list | grep "^${DEP}" | awk '{print $2"-"$3}')
  DEP_VER_DICT=$(echo "${DEP_VER_DICT}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. + { ($DEP): $VER }')
done

# Pass in an array of dependencies to get a dict of "dependency:version"
function getReqs() {
  local DEPS_ARR=("$@")
  local REQS="{}"
  for DEP in "${DEPS_ARR[@]}"; do
    VER=$(echo "${DEP_VER_DICT}" | jq -r --arg DEP "${DEP}" '.[$DEP]')
    REQS=$(echo "${REQS}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. + { ($DEP): $VER }')
  done

  echo "${REQS}"
}

###
# Run LIBCUDF Benchmarks
###

REQS=$(getReqs "${LIBCUDF_DEPS[@]}")

mkdir -p ${WORKSPACE}/tmp/benchmark
touch ${WORKSPACE}/tmp/benchmark/benchmarks.txt
ls ${GBENCH_BENCHMARKS_DIR} > ${WORKSPACE}/tmp/benchmark/benchmarks.txt

# Disable error aborting while tests run; failed tests will not generate data
logger "Running libcudf GBenchmarks..."
cd ${GBENCH_BENCHMARKS_DIR}
set +e
while read BENCH; do
  nvidia-smi
  ./${BENCH} --benchmark_out=${BENCH}.json --benchmark_out_format=json
  EXITCODE=$?
  if [[ ${EXITCODE} != 0 ]]; then
    rm ./${BENCH}.json
    JOBEXITCODE=1
  fi
done < ${WORKSPACE}/tmp/benchmark/benchmarks.txt
set -e

rm ${WORKSPACE}/tmp/benchmark/benchmarks.txt
cd ${WORKSPACE}
mv ${GBENCH_BENCHMARKS_DIR}/*.json ${WORKSPACE}/tmp/benchmark/
python GBenchToASV.py -d ${WORKSPACE}/tmp/benchmark/ -t ${ASVRESULTS_DIR} -n libcudf -b branch-${MINOR_VERSION} -r "${REQS}"

###
# Run Python Benchmarks
###

#REQS=$(getReqs "${CUDF_DEPS[@]}")

#BENCHMARK_META=$(jq -n \
#  --arg NODE "${NODE_NAME}" \
#  --arg BRANCH "branch-${MINOR_VERSION}" \
#  --argjson REQS "${REQS}" '
#  {
#    "machineName": $NODE,
#    "commitBranch": $BRANCH,
#    "requirements": $REQS
#  }
#')

#echo "Benchmark meta:"
#echo "${BENCHMARK_META}" | jq "."
4 changes: 1 addition & 3 deletions conda/recipes/dask-cudf/meta.yaml
@@ -28,9 +28,7 @@ requirements:
- cudf {{ version }}
- dask >=2.15.0
- distributed >=2.15.0
-test:
-  imports:
-    - dask_cudf


about:
home: http://rapids.ai/
19 changes: 19 additions & 0 deletions conda/recipes/dask-cudf/run_test.sh
@@ -0,0 +1,19 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION.

set -e

# Logger function for build status output
function logger() {
  echo -e "\n>>>> $@\n"
}

# Install the master version of dask and distributed
logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps

logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps"
pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps

logger "python -c 'import dask_cudf'"
python -c "import dask_cudf"
49 changes: 49 additions & 0 deletions cpp/include/nvtext/normalize.hpp
@@ -51,5 +51,54 @@ std::unique_ptr<cudf::column> normalize_spaces(
  cudf::strings_column_view const& strings,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Normalizes strings characters for tokenizing.
*
* This uses the normalizer that is built into the nvtext::subword_tokenize function
* which includes:
*
* - adding padding around punctuation (unicode category starts with "P")
* as well as certain ASCII symbols like "^" and "$"
* - adding padding around the [CJK Unicode block
* characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
* - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
* - removing control characters (unicode categories "Cc" and "Cf")
*
* The padding process here adds a single space before and after the character.
* Details on _unicode category_ can be found here:
* https://unicodebook.readthedocs.io/unicode.html#categories
*
* If `do_lower_case = true`, lower-casing also removes the accents. The
* accents cannot be removed from upper-case characters without lower-casing
* and lower-casing cannot be performed without also removing accents.
* However, if the accented character is already lower-case, then only the
* accent is removed.
*
* @code{.pseudo}
* s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
* s1 = normalize_characters(s,true)
* s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
* s2 = normalize_characters(s,false)
* s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
* @endcode
*
* A null input element at row `i` produces a corresponding null entry
* for row `i` in the output column.
*
* This function requires 8x the number of bytes in the input strings
* column as working memory.
*
* @param strings The input strings to normalize.
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param mr Memory resource to allocate any returned objects.
* @return Normalized strings column
*/
std::unique_ptr<cudf::column> normalize_characters(
  cudf::strings_column_view const& strings,
  bool do_lower_case,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace nvtext
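
[Editor's note: a minimal usage sketch, not part of the commit. It assumes an existing strings column `input`; the helper name `normalize_for_tokenizing` is invented for illustration, and only the `nvtext::normalize_characters` call itself comes from the header above.]

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/normalize.hpp>

#include <memory>

// Hypothetical helper, for illustration only: `input` is assumed to be a
// strings column built elsewhere (e.g. read from a file or created in a test).
std::unique_ptr<cudf::column> normalize_for_tokenizing(cudf::column_view input)
{
  cudf::strings_column_view strings(input);
  // do_lower_case = true: lower-cases and strips accents (s1 in the doc example).
  auto lowered = nvtext::normalize_characters(strings, true);
  // do_lower_case = false: keeps case and accents, still pads punctuation (s2).
  auto padded = nvtext::normalize_characters(strings, false);
  return lowered;
}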
6 changes: 6 additions & 0 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -24,6 +24,11 @@

namespace nvtext {

/**
* @addtogroup nvtext_tokenize
* @{
*/

/**
* @brief The vocabulary data for use with the subword_tokenize function.
*/
@@ -171,4 +176,5 @@ tokenizer_result subword_tokenize(
  uint32_t max_rows_tensor,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace nvtext
4 changes: 3 additions & 1 deletion cpp/src/jit/cache.h
@@ -40,7 +40,9 @@ using named_prog = std::pair<std::string, std::shared_ptr<Tv>>;
* This function returns a path to the cache directory, creating it if it
* doesn't exist.
*
- * The default cache directory `$TEMPDIR/cudf_$CUDF_VERSION`.
+ * The default cache directory is `$HOME/.cudf/$CUDF_VERSION`. If no overrides
+ * are used and if $HOME is not defined, returns an empty path and file
+ * caching is not used.
**/
boost::filesystem::path getCacheDir();

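
[Editor's note: a rough sketch of the lookup order the comment above describes, pieced together from this doc comment and the `LIBCUDF_KERNEL_CACHE_PATH` override used in ci/benchmark/build.sh; the function name `resolve_kernel_cache_dir` is invented and the real logic lives in libcudf's JIT cache implementation, which may differ in detail.]

#include <boost/filesystem.hpp>
#include <cstdlib>
#include <string>

// Sketch only: resolve the kernel cache directory in the documented order.
boost::filesystem::path resolve_kernel_cache_dir(std::string const& cudf_version)
{
  // 1. Explicit override, as set by ci/benchmark/build.sh.
  if (char const* env = std::getenv("LIBCUDF_KERNEL_CACHE_PATH")) {
    return boost::filesystem::path{env};
  }
  // 2. Default: $HOME/.cudf/$CUDF_VERSION.
  if (char const* home = std::getenv("HOME")) {
    return boost::filesystem::path{home} / ".cudf" / cudf_version;
  }
  // 3. No override and no $HOME: empty path, so file caching is not used.
  return {};
}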