
Commit

Merge pull request #4714 from rapidsai/branch-0.13
raydouglass committed Mar 30, 2020
2 parents 8d7bf34 + 6158033 commit 19f5174
Showing 770 changed files with 64,736 additions and 19,703 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -1 +1,2 @@
python/cudf/cudf/_version.py export-subst
CHANGELOG.md merge=union
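
The added `merge=union` attribute tells Git to resolve merge conflicts in `CHANGELOG.md` by keeping the lines from both sides rather than emitting conflict markers, which suits a changelog that release branches append to concurrently. A quick sanity check, as a sketch run from the repo root:

```bash
# Ask Git which merge driver applies to CHANGELOG.md;
# with the attribute above, expected output is "CHANGELOG.md: merge: union"
git check-attr merge CHANGELOG.md
```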
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -3,6 +3,7 @@ cpp/ @rapidsai/cudf-cpp-codeowners

#python code owners
python/ @rapidsai/cudf-python-codeowners
notebooks/ @rapidsai/cudf-python-codeowners
python/dask_cudf/ @rapidsai/cudf-dask-codeowners

#cmake code owners
35 changes: 35 additions & 0 deletions .github/workflows/new-issues-to-triage-projects.yml
@@ -0,0 +1,35 @@
name: Auto Assign New Issues to Triage Project

on:
issues:
types: [opened]

env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
assign_one_project:
runs-on: ubuntu-latest
name: Assign New Issues to Triage Project
steps:
- name: Process bug issues
uses: docker://takanabe/github-actions-automate-projects:v0.0.1
if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1
GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
- name: Process feature issues
uses: docker://takanabe/github-actions-automate-projects:v0.0.1
if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9
GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
- name: Process other issues
uses: docker://takanabe/github-actions-automate-projects:v0.0.1
if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request'))
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10
GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
13 changes: 13 additions & 0 deletions .gitignore
@@ -14,12 +14,15 @@ DartConfiguration.tcl
.DS_Store
*.manifest
*.spec
.nfs*

## Python build directories & artifacts
dask-worker-space/
dist/
cudf.egg-info/
python/build
python/*/build
python/cudf/cudf-coverage.xml
python/cudf/*/_lib/**/*.cpp
python/cudf/*/_lib/**/*.h
python/cudf/*/_lib/.nfs*
@@ -28,6 +31,7 @@ python/cudf/*/_libxx/**/*.h
python/cudf/*/_libxx/.nfs*
python/cudf/*.ipynb
python/cudf/.ipynb_checkpoints
python/nvstrings/nvstrings-coverage.xml
python/*/record.txt
.Python
env/
@@ -55,8 +59,11 @@ htmlcov/
.cache
nosetests.xml
coverage.xml
junit-cudf.xml
junit-nvstrings.xml
*.cover
.hypothesis/
test-results

## Patching
*.diff
@@ -142,3 +149,9 @@ ENV/

# mypy
.mypy_cache/

## VSCode IDE
.vscode

# Dask
dask-worker-space/
326 changes: 318 additions & 8 deletions CHANGELOG.md

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions CONTRIBUTING.md
@@ -159,11 +159,12 @@ git submodule update --init --remote --recursive
# create the conda environment (assuming in base `cudf` directory)
conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda10.0.yml
# activate the environment
-source activate cudf_dev
+conda activate cudf_dev
```
-- If you're using CUDA 9.2, you will need to create the environment with `conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda9.2.yml` instead.
+- If using CUDA 9.2, create the environment with `conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda9.2.yml` instead.
- For other CUDA versions, check the corresponding cudf_dev_cuda*.yml file in conda/environments

-- Build and install `libcudf`. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
+- Build and install `libcudf` after its dependencies. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
```bash
$ cd $CUDF_HOME/cpp # navigate to C/C++ CUDA source root directory
$ mkdir build # make a build directory
@@ -173,15 +174,18 @@ $ cd build # ente
# -DCMAKE_INSTALL_PREFIX set to the install path for your libraries or $CONDA_PREFIX if you're using Anaconda, e.g. -DCMAKE_INSTALL_PREFIX=/install/path or -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
# -DCMAKE_CXX11_ABI set to ON or OFF depending on the ABI version you want, defaults to ON. When turned ON, ABI compatibility for C++11 is used. When OFF, pre-C++11 ABI compatibility is used.
$ cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON # configure cmake ...

$ make -j # compile the libraries librmm.so, libcudf.so ... '-j' will start a parallel job using the number of physical cores available on your system
$ make install # install the libraries librmm.so, libcudf.so to the CMAKE_INSTALL_PREFIX
```
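
If `nvcc` is not on your path, you can point CMake at it through `$CUDACXX` before configuring. A minimal sketch, assuming the common (but not guaranteed) install location `/usr/local/cuda`; adjust for your system:

```bash
# /usr/local/cuda is an assumed CUDA install path, not something this repo guarantees.
export CUDACXX=/usr/local/cuda/bin/nvcc
cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON
```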

- As a convenience, a `build.sh` script is provided in `$CUDF_HOME`. To execute the same build commands above, run the script as shown below. Note that the libraries will be installed to the location set in `$INSTALL_PREFIX` if set (e.g. `export INSTALL_PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
```bash
$ cd $CUDF_HOME
-$ ./build.sh libcudf # compile the cuDF libraries and install them to $INSTALL_PREFIX if set, otherwise $CONDA_PREFIX
+$ ./build.sh # To build both C++ and Python cuDF versions with their dependencies
```
- To build only the C++ component with the script
```bash
$ ./build.sh libnvstrings libcudf # Build only the cuDF C++ components and install them to $INSTALL_PREFIX if set, otherwise $CONDA_PREFIX
```

- To run tests (Optional):
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# <div align="left"><img src="img/rapids_logo.png" width="90px"/>&nbsp;cuDF - GPU DataFrames</div>

-[![Build Status](https://gpuci.gpuopenanalytics.com/buildStatus/icon?job=gpuCI%2Fcudf%2Fbranches%2Fcudf-gpu-branch-0.12)](https://gpuci.gpuopenanalytics.com/job/gpuCI/job/cudf/job/branches/job/cudf-gpu-branch-0.12/)
+[![Build Status](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/branches/job/cudf-branch-pipeline/badge/icon)](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/branches/job/cudf-branch-pipeline/)

**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/master/README.md) ensure you are on the `master` branch.

71 changes: 52 additions & 19 deletions build.sh
@@ -18,21 +18,25 @@ ARGS=$*
# script, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libnvstrings nvstrings libcudf cudf dask_cudf benchmarks -v -g -n --allgpuarch -h"
HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [-v] [-g] [-n] [-h]
clean - remove all existing build artifacts and configuration (start
over)
libnvstrings - build the nvstrings C++ code only
nvstrings - build the nvstrings Python package
libcudf - build the cudf C++ code only
cudf - build the cudf Python package
dask_cudf - build the dask_cudf Python package
benchmarks - build benchmarks
-v - verbose build mode
-g - build for debug
-n - no install step
--allgpuarch - build for all supported GPU architectures
-h - print this text
VALIDARGS="clean libnvstrings nvstrings libcudf cudf dask_cudf benchmarks tests -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn -h"
HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [-v] [-g] [-n] [-h] [-l]
clean - remove all existing build artifacts and configuration (start
over)
libnvstrings - build the nvstrings C++ code only
nvstrings - build the nvstrings Python package
libcudf - build the cudf C++ code only
cudf - build the cudf Python package
dask_cudf - build the dask_cudf Python package
benchmarks - build benchmarks
tests - build tests
-v - verbose build mode
-g - build for debug
-n - no install step
-l - build legacy tests
--allgpuarch - build for all supported GPU architectures
--disable_nvtx - disable inserting NVTX profiling ranges
--show_depr_warn - show cmake deprecation warnings
-h - print this text
default action (no args) is to build and install 'libnvstrings' then
'nvstrings' then 'libcudf' then 'cudf' then 'dask_cudf' targets
@@ -49,6 +53,10 @@ BUILD_TYPE=Release
INSTALL_TARGET=install
BENCHMARKS=OFF
BUILD_ALL_GPU_ARCH=0
BUILD_NVTX=ON
BUILD_TESTS=OFF
BUILD_LEGACY_TESTS=OFF
BUILD_DISABLE_DEPRECATION_WARNING=ON

# Set defaults for vars that may not have been defined externally
# FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
@@ -88,12 +96,26 @@ if hasArg -g; then
fi
if hasArg -n; then
INSTALL_TARGET=""
LIBCUDF_BUILD_DIR=${LIB_BUILD_DIR}
LIBNVSTRINGS_BUILD_DIR=${LIB_BUILD_DIR}
fi
if hasArg -l; then
BUILD_LEGACY_TESTS=ON
fi
if hasArg --allgpuarch; then
BUILD_ALL_GPU_ARCH=1
fi
if hasArg benchmarks; then
-BENCHMARKS=ON
+BENCHMARKS="ON"
fi
if hasArg tests; then
BUILD_TESTS=ON
fi
if hasArg --disable_nvtx; then
BUILD_NVTX="OFF"
fi
if hasArg --show_depr_warn; then
BUILD_DISABLE_DEPRECATION_WARNING=OFF
fi
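
The flag handling above relies on a `hasArg` helper defined earlier in `build.sh`, outside this hunk. A minimal sketch of such a helper, assuming `NUMARGS` and `ARGS` are captured from `$#` and `$*` near the top of the script (as the `ARGS=$*` line above suggests):

```bash
# Illustrative sketch; the real definition lives outside this hunk.
NUMARGS=$#   # number of command-line arguments
ARGS=$*      # the full argument string
# Succeeds if the word $1 appears in the script's argument list.
function hasArg {
    (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
}
```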

# If clean given, run it prior to any other steps
@@ -128,7 +150,10 @@ if buildAll || hasArg libnvstrings || hasArg libcudf; then
cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
-DCMAKE_CXX11_ABI=ON \
${GPU_ARCH} \
-DUSE_NVTX=${BUILD_NVTX} \
-DBUILD_BENCHMARKS=${BENCHMARKS} \
-DBUILD_LEGACY_TESTS=${BUILD_LEGACY_TESTS} \
-DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} ..
fi

@@ -140,6 +165,10 @@ if buildAll || hasArg libnvstrings; then
else
make -j${PARALLEL_LEVEL} nvstrings VERBOSE=${VERBOSE}
fi

if [[ ${BUILD_TESTS} == "ON" ]]; then
make -j${PARALLEL_LEVEL} build_tests_nvstrings VERBOSE=${VERBOSE}
fi
fi

# Build and install the nvstrings Python package
@@ -150,7 +179,7 @@ if buildAll || hasArg nvstrings; then
python setup.py build_ext
python setup.py install --single-version-externally-managed --record=record.txt
else
-python setup.py build_ext --library-dir=${LIBNVSTRINGS_BUILD_DIR}
+python setup.py build_ext --build-lib=${PWD} --library-dir=${LIBNVSTRINGS_BUILD_DIR}
fi
fi

@@ -163,17 +192,21 @@ if buildAll || hasArg libcudf; then
else
make -j${PARALLEL_LEVEL} cudf VERBOSE=${VERBOSE}
fi

if [[ ${BUILD_TESTS} == "ON" ]]; then
make -j${PARALLEL_LEVEL} build_tests_cudf VERBOSE=${VERBOSE}
fi
fi

# Build and install the cudf Python package
if buildAll || hasArg cudf; then

cd ${REPODIR}/python/cudf
if [[ ${INSTALL_TARGET} != "" ]]; then
-python setup.py build_ext --inplace
+PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace
python setup.py install --single-version-externally-managed --record=record.txt
else
-python setup.py build_ext --inplace --library-dir=${LIBCUDF_BUILD_DIR}
+PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace --library-dir=${LIBCUDF_BUILD_DIR}
fi
fi
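
With the change above, `PARALLEL_LEVEL` is forwarded to `setup.py build_ext`, presumably so the Cython compilation can run in parallel. A hedged usage example (the value 8 is arbitrary):

```bash
# Build only the cudf Python package with up to 8 parallel compile jobs.
PARALLEL_LEVEL=8 ./build.sh cudf
```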

2 changes: 1 addition & 1 deletion ci/cpu/prebuild.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

#Upload cudf once per PYTHON
if [[ "$CUDA" == "9.2" ]]; then
if [[ "$CUDA" == "10.0" ]]; then
export UPLOAD_CUDF=1
else
export UPLOAD_CUDF=0
28 changes: 18 additions & 10 deletions ci/gpu/build.sh
@@ -59,10 +59,10 @@ logger "Activate conda env..."
source activate gdf
conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
"dask>=2.1.0" "distributed>=2.1.0" "numpy>=1.16" "double-conversion" \
"rapidjson" "flatbuffers" "boost-cpp" "fsspec>=0.3.3" "dlpack" \
"rapidjson" "flatbuffers" "boost-cpp" "fsspec>=0.3.3,<0.7.0a0" "dlpack" \
"feather-format" "cupy>=6.6.0,<8.0.0a0,!=7.1.0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \
"fastavro>=0.22.0" "pandas>=0.25,<0.26" "hypothesis" "s3fs" "gcsfs" \
"boto3" "moto" "httpretty" "streamz"
"boto3" "moto" "httpretty" "streamz" "ipython=7.3*" "jupyterlab"

# Install the master version of dask, distributed, and streamz
logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps"
@@ -83,7 +83,11 @@ conda list
################################################################################

logger "Build libcudf..."
-$WORKSPACE/build.sh clean libnvstrings nvstrings libcudf cudf dask_cudf benchmarks
+if [[ ${BUILD_MODE} == "pull-request" ]]; then
+    $WORKSPACE/build.sh clean libnvstrings nvstrings libcudf cudf dask_cudf benchmarks tests
+else
+    $WORKSPACE/build.sh clean libnvstrings nvstrings libcudf cudf dask_cudf benchmarks tests -l
+fi

################################################################################
# TEST - Run GoogleTest and py.tests for libnvstrings, nvstrings, libcudf, and
@@ -96,20 +100,22 @@ else
logger "Check GPU usage..."
nvidia-smi

logger "GoogleTest for libnvstrings..."
logger "GoogleTests..."
cd $WORKSPACE/cpp/build
GTEST_OUTPUT="xml:${WORKSPACE}/test-results/" make -j${PARALLEL_LEVEL} test_nvstrings

logger "GoogleTest for libcudf..."
cd $WORKSPACE/cpp/build
GTEST_OUTPUT="xml:${WORKSPACE}/test-results/" make -j${PARALLEL_LEVEL} test_cudf
for gt in ${WORKSPACE}/cpp/build/gtests/* ; do
test_name=$(basename ${gt})
echo "Running GoogleTest $test_name"
${gt} --gtest_output=xml:${WORKSPACE}/test-results/
done


# set environment variable for numpy 1.16
# will be enabled for later versions by default
np_ver=$(python -c "import numpy; print('.'.join(numpy.__version__.split('.')[:-1]))")
if [ "$np_ver" == "1.16" ];then
logger "export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1"
export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
logger "export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1"
export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
fi

cd $WORKSPACE/python/nvstrings
@@ -128,4 +134,6 @@ else
logger "Python py.test for cuStreamz..."
py.test --cache-clear --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term

${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log
python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log
fi
47 changes: 47 additions & 0 deletions ci/gpu/test-notebooks.sh
@@ -0,0 +1,47 @@
#!/bin/bash

NOTEBOOKS_DIR=${WORKSPACE}/notebooks
NBTEST=${WORKSPACE}/ci/utils/nbtest.sh
LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache

cd ${NOTEBOOKS_DIR}
TOPLEVEL_NB_FOLDERS=$(find . -name "*.ipynb" | cut -d'/' -f2 | sort -u)

# Add notebooks that should be skipped here
# (space-separated list of filenames without paths)

SKIPNBS=""

## Check env
env

EXITCODE=0

# Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure
# if any run fails

cd ${NOTEBOOKS_DIR}
for nb in $(find . -name "*.ipynb"); do
nbBasename=$(basename ${nb})
# Skip all NBs that use dask (in the code or even in their name)
if ((echo ${nb}|grep -qi dask) || \
(grep -q dask ${nb})); then
echo "--------------------------------------------------------------------------------"
echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)"
echo "--------------------------------------------------------------------------------"
elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then
echo "--------------------------------------------------------------------------------"
echo "SKIPPING: ${nb} (listed in skip list)"
echo "--------------------------------------------------------------------------------"
else
nvidia-smi
${NBTEST} ${nbBasename}
EXITCODE=$((EXITCODE | $?))
rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
fi
done


nvidia-smi

exit ${EXITCODE}
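
The `nbtest.sh` harness invoked above lives in `ci/utils/` and is not part of this diff. Purely as an illustration of the kind of wrapper it could be, converting a notebook to a script and running it so a failing cell surfaces as a nonzero exit status (all names below are assumptions):

```bash
#!/bin/bash
# Illustrative sketch only; the real ci/utils/nbtest.sh is not shown in this diff.
# Assumes it is invoked from the directory containing the notebook.
nb=$1
out=$(basename ${nb} .ipynb)
# Convert the notebook's code cells to a plain Python script...
jupyter nbconvert --to script ${nb} --output ${out}-converted
# ...then execute it; an uncaught exception yields a nonzero exit code.
python ${out}-converted.py
```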
