Merge branch '017-fea-pref-order' of github.com:dantegd/cuml into 017…

…-fea-pref-order
rapidsai · Nov 19, 2020 · 17d0ff6 · 17d0ff6
2 parents 5fb7ef1 + 1b02eac
commit 17d0ff6
Show file tree

Hide file tree

Showing 125 changed files with 13,399 additions and 1,153 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,8 @@
 # cuML 0.17.0 (Date TBD)
 
 ## New Features
+- PR #2659: Add initial max inner product sparse knn
+- PR #2836: Refactor UMAP to accept sparse inputs
 
 ## Improvements
 - PR #3077: Improve runtime for test_kmeans
@@ -27,11 +29,17 @@
 - PR #3112: Speed test_array
 - PR #3111: Adding Cython to Code Coverage
 - PR #3129:  Update notebooks README
+- PR #3002: Update flake8 Config To Use Per File Settings
+- PR #3135: Add QuasiNewton tests
 - PR #3040: Improved Array Conversion with CumlArrayDescriptor and Decorators
 - PR #3134: Improving the Deprecation Message Formatting in Documentation
 - PR #3113: Add tags and preferred memory order tags to estimators
+- PR #3137: Reorganize Pytest Config and Add Quick Run Option
+- PR #3144: Adding Ability to Set Arbitrary Cmake Flags in ./build.sh
+- PR #3155: Eliminate unnecessary warnings from random projection test
 
 ## Bug Fixes
+- PR #3069: Prevent conversion of DataFrames to Series in preprocessing
 - PR #3065: Refactoring prims metrics function names from camelcase to underscore format
 - PR #3033: Splitting ml metrics to individual files
 - PR #3072: Fusing metrics and score directories in src_prims
@@ -40,6 +48,7 @@
 - PR #3011: Fix unused initialize_embeddings parameter in Barnes-Hut t-SNE
 - PR #3008: Check number of columns in check_array validator
 - PR #3012: Increasing learning rate for SGD log loss and invscaling pytests
+- PR #2950: Fix includes in UMAP
 - PR #3021: Fix a hang in cuML RF experimental backend
 - PR #3039: Update RF and decision tree parameter initializations in benchmark codes
 - PR #3060: Speed up test suite `test_fil`
@@ -52,6 +61,8 @@
 - PR #3117: Fix two crashes in experimental RF backend
 - PR #3119: Fix memset args for benchmark
 - PR #3130: Return Python string from `dump_as_json()` of RF
+- PR #3136: Fix stochastic gradient descent example
+- PR #3156: Force local conda artifact install
 
 # cuML 0.16.0 (Date TBD)
 
@@ -110,6 +121,7 @@
 - PR #2928: Updating Estimators Derived from Base for Consistency
 - PR #2942: Adding `cuml.experimental` to the Docs
 - PR #3010: Improve gpuCI Scripts
+- PR #3141: Move DistanceType enum to RAFT
 
 ## Bug Fixes
 - PR #2973: Allow data imputation for nan values
@@ -155,7 +167,6 @@
 - PR #2984: Fix GPU test scripts gcov error
 - PR #2990: Reduce MNMG kneighbors regressor test threshold
 - PR #2997: Changing ARIMA `get/set_params` to `get/set_fit_params`
-
 # cuML 0.15.0 (Date TBD)
 
 ## New Features

diff --git a/build.sh b/build.sh
@@ -48,6 +48,13 @@ HELP="$0 [<target> ...] [<flag> ...]
    -h               - print this text
 
  default action (no args) is to build and install 'libcuml', 'cuml', and 'prims' targets only for the detected GPU arch
+
+ The following environment variables are also accepted to allow further customization:
+   PARALLEL_LEVEL         - Number of parallel threads to use in compilation.
+   CUML_EXTRA_CMAKE_ARGS  - Extra arguments to pass directly to cmake. Values listed in environment
+                            variable will override existing arguments. Example:
+                            CUML_EXTRA_CMAKE_ARGS=\"-DBUILD_CUML_C_LIBRARY=OFF\" ./build.sh
+   CUML_EXTRA_PYTHON_ARGS - Extra argument to pass directly to python setup.py
 "
 LIBCUML_BUILD_DIR=${LIBCUML_BUILD_DIR:=${REPODIR}/cpp/build}
 CUML_BUILD_DIR=${REPODIR}/python/build
@@ -60,7 +67,7 @@ BUILD_TYPE=Release
 INSTALL_TARGET=install
 BUILD_ALL_GPU_ARCH=0
 SINGLEGPU_CPP_FLAG=""
-BUILD_PYTHON_ARGS=${BUILD_PYTHON_ARGS:=""}
+CUML_EXTRA_PYTHON_ARGS=${CUML_EXTRA_PYTHON_ARGS:=""}
 NVTX=OFF
 CLEAN=0
 BUILD_DISABLE_DEPRECATION_WARNING=ON
@@ -74,6 +81,12 @@ BUILD_STATIC_FAISS=OFF
 INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX}}}
 PARALLEL_LEVEL=${PARALLEL_LEVEL:=""}
 
+# Allow setting arbitrary cmake args via the $CUML_EXTRA_CMAKE_ARGS variable. Any
+# values listed here will override existing arguments. For example:
+# CUML_EXTRA_CMAKE_ARGS="-DBUILD_CUML_C_LIBRARY=OFF" ./build.sh
+# Will disable building the C library even though it is hard coded to ON
+CUML_EXTRA_CMAKE_ARGS=${CUML_EXTRA_CMAKE_ARGS:=""}
+
 function hasArg {
     (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
 }
@@ -117,7 +130,7 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 if hasArg --singlegpu; then
-    BUILD_PYTHON_ARGS="${BUILD_PYTHON_ARGS} --singlegpu"
+    CUML_EXTRA_PYTHON_ARGS="${CUML_EXTRA_PYTHON_ARGS} --singlegpu"
     SINGLEGPU_CPP_FLAG=ON
 fi
 if hasArg cpp-mgtests; then
@@ -136,7 +149,7 @@ if hasArg --show_depr_warn; then
     BUILD_DISABLE_DEPRECATION_WARNING=OFF
 fi
 if hasArg --codecov; then
-    BUILD_PYTHON_ARGS="${BUILD_PYTHON_ARGS} --linetrace=1 --profile"
+    CUML_EXTRA_PYTHON_ARGS="${CUML_EXTRA_PYTHON_ARGS} --linetrace=1 --profile"
 fi
 if hasArg clean; then
     CLEAN=1
@@ -189,6 +202,7 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri
           -DNCCL_PATH=${INSTALL_PREFIX} \
           -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
           -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} \
+          ${CUML_EXTRA_CMAKE_ARGS} \
           ..
 fi
 
@@ -229,9 +243,9 @@ fi
 if completeBuild || hasArg cuml || hasArg pydocs; then
     cd ${REPODIR}/python
     if [[ ${INSTALL_TARGET} != "" ]]; then
-        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${BUILD_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR} install --single-version-externally-managed --record=record.txt
+        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${CUML_EXTRA_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR} install --single-version-externally-managed --record=record.txt
     else
-        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${BUILD_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR}
+        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${CUML_EXTRA_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR}
     fi
 
     if hasArg pydocs; then

diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
@@ -26,7 +26,6 @@
     re.compile(r"CMakeLists[.]txt$"),
     re.compile(r"CMakeLists_standalone[.]txt$"),
     re.compile(r"setup[.]cfg$"),
-    re.compile(r"[.]flake8[.]cython$"),
     re.compile(r"meta[.]yaml$")
 ]
 

diff --git a/ci/checks/style.sh b/ci/checks/style.sh
@@ -16,7 +16,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 conda install "ucx-py=${MINOR_VERSION}"
 
 # Run flake8 and get results/return code
-FLAKE=`flake8 --exclude=cpp,thirdparty,__init__.py,versioneer.py && flake8 --config=python/.flake8.cython`
+FLAKE=`flake8 --config=python/setup.cfg`
 RETVAL=$?
 
 # Output results if failure otherwise show pass

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -186,8 +186,11 @@ else
     patchelf --replace-needed `patchelf --print-needed ./test/ml | grep faiss` libfaiss.so ./test/ml
     GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml
 
-    gpuci_logger "Installing libcuml"
-    conda install -c $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ libcuml
+    CONDA_FILE=`find $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ -name "libcuml*.tar.bz2"`
+    CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
+    CONDA_FILE=${CONDA_FILE//-/=} #replace every '-' with '=' to form the spec passed to conda install
+    gpuci_logger "Installing $CONDA_FILE"
+    conda install -c $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ "$CONDA_FILE"
 
     gpuci_logger "Building cuml"
     "$WORKSPACE/build.sh" -v cuml --codecov

diff --git a/ci/local/README.md b/ci/local/README.md
@@ -32,7 +32,7 @@ Style Check:
 $ bash ci/local/build.sh -r ~/rapids/cuml -s
 $ source activate rapids    # Activate gpuCI conda environment
 $ cd rapids
-$ flake8 python
+$ flake8 --config=python/setup.cfg
 ```
 
 ## Information

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -401,6 +401,7 @@ if(BUILD_CUML_CPP_LIBRARY)
     src/holtwinters/holtwinters.cu
     src/kmeans/kmeans.cu
     src/knn/knn.cu
+    src/knn/knn_sparse.cu
     src/metrics/accuracy_score.cu
     src/metrics/adjusted_rand_index.cu
     src/metrics/completeness_score.cu

diff --git a/cpp/bench/prims/distance_common.cuh b/cpp/bench/prims/distance_common.cuh
@@ -26,7 +26,7 @@ struct Params {
   int m, n, k;
 };  // struct Params
 
-template <typename T, ML::Distance::DistanceType DType>
+template <typename T, raft::distance::DistanceType DType>
 struct Distance : public Fixture {
   Distance(const std::string& name, const Params& p)
     : Fixture(name, std::shared_ptr<deviceAllocator>(

diff --git a/cpp/bench/prims/distance_cosine.cu b/cpp/bench/prims/distance_cosine.cu
@@ -21,7 +21,7 @@ namespace Bench {
 namespace Distance {
 
 DIST_BENCH_REGISTER(DistanceCosine,
-                    ML::Distance::DistanceType::EucExpandedCosine);
+                    raft::distance::DistanceType::EucExpandedCosine);
 
 }  // namespace Distance
 }  // namespace Bench

diff --git a/cpp/bench/prims/distance_exp_l2.cu b/cpp/bench/prims/distance_exp_l2.cu
@@ -20,9 +20,9 @@ namespace MLCommon {
 namespace Bench {
 namespace Distance {
 
-DIST_BENCH_REGISTER(DistanceL2Sq, ML::Distance::DistanceType::EucExpandedL2);
+DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::EucExpandedL2);
 DIST_BENCH_REGISTER(DistanceL2Sqrt,
-                    ML::Distance::DistanceType::EucExpandedL2Sqrt);
+                    raft::distance::DistanceType::EucExpandedL2Sqrt);
 
 }  // namespace Distance
 }  // namespace Bench

diff --git a/cpp/bench/prims/distance_l1.cu b/cpp/bench/prims/distance_l1.cu
@@ -20,7 +20,7 @@ namespace MLCommon {
 namespace Bench {
 namespace Distance {
 
-DIST_BENCH_REGISTER(DistanceL1, ML::Distance::DistanceType::EucUnexpandedL1);
+DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::EucUnexpandedL1);
 
 }  // namespace Distance
 }  // namespace Bench

diff --git a/cpp/bench/prims/distance_unexp_l2.cu b/cpp/bench/prims/distance_unexp_l2.cu
@@ -21,9 +21,9 @@ namespace Bench {
 namespace Distance {
 
 DIST_BENCH_REGISTER(DistanceUnexpL2Sq,
-                    ML::Distance::DistanceType::EucUnexpandedL2);
+                    raft::distance::DistanceType::EucUnexpandedL2);
 DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt,
-                    ML::Distance::DistanceType::EucUnexpandedL2Sqrt);
+                    raft::distance::DistanceType::EucUnexpandedL2Sqrt);
 
 }  // namespace Distance
 }  // namespace Bench

diff --git a/cpp/bench/sg/umap.cu b/cpp/bench/sg/umap.cu
@@ -111,8 +111,8 @@ class UmapSupervised : public UmapBase {
 
  protected:
   void coreBenchmarkMethod() {
-    fit(*this->handle, this->data.X, yFloat, this->params.nrows,
-        this->params.ncols, nullptr, nullptr, &uParams, embeddings);
+    UMAP::fit(*this->handle, this->data.X, yFloat, this->params.nrows,
+              this->params.ncols, nullptr, nullptr, &uParams, embeddings);
   }
 };
 ML_BENCH_REGISTER(Params, UmapSupervised, "blobs", getInputs());
@@ -124,8 +124,8 @@ class UmapUnsupervised : public UmapBase {
 
  protected:
   void coreBenchmarkMethod() {
-    fit(*this->handle, this->data.X, this->params.nrows, this->params.ncols,
-        nullptr, nullptr, &uParams, embeddings);
+    UMAP::fit(*this->handle, this->data.X, nullptr, this->params.nrows,
+              this->params.ncols, nullptr, nullptr, &uParams, embeddings);
   }
 };
 ML_BENCH_REGISTER(Params, UmapUnsupervised, "blobs", getInputs());
@@ -136,17 +136,17 @@ class UmapTransform : public UmapBase {
 
  protected:
   void coreBenchmarkMethod() {
-    transform(*this->handle, this->data.X, this->params.nrows,
-              this->params.ncols, nullptr, nullptr, this->data.X,
-              this->params.nrows, embeddings, this->params.nrows, &uParams,
-              transformed);
+    UMAP::transform(*this->handle, this->data.X, this->params.nrows,
+                    this->params.ncols, nullptr, nullptr, this->data.X,
+                    this->params.nrows, embeddings, this->params.nrows,
+                    &uParams, transformed);
   }
   void allocateBuffers(const ::benchmark::State& state) {
     UmapBase::allocateBuffers(state);
     auto& handle = *this->handle;
     alloc(transformed, this->params.nrows * uParams.n_components);
-    fit(handle, this->data.X, yFloat, this->params.nrows, this->params.ncols,
-        nullptr, nullptr, &uParams, embeddings);
+    UMAP::fit(handle, this->data.X, yFloat, this->params.nrows,
+              this->params.ncols, nullptr, nullptr, &uParams, embeddings);
   }
   void deallocateBuffers(const ::benchmark::State& state) {
     dealloc(transformed, this->params.nrows * uParams.n_components);

diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake
@@ -39,7 +39,7 @@ else(DEFINED ENV{RAFT_PATH})
 
   ExternalProject_Add(raft
     GIT_REPOSITORY    https://github.com/rapidsai/raft.git
-    GIT_TAG           9b3afe67895fbea397fb2c72375157aadfc132d8
+    GIT_TAG           eebd0e306624b419168b2cd5cd7aa44ebaec51f1
     PREFIX            ${RAFT_DIR}
     CONFIGURE_COMMAND ""
     BUILD_COMMAND     ""

diff --git a/cpp/examples/dbscan/gen_dataset.py b/cpp/examples/dbscan/gen_dataset.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 import argparse
-from sklearn.datasets.samples_generator import make_blobs
+from sklearn.datasets import make_blobs
 
 parser = argparse.ArgumentParser('gen_dataset.py ')
 

diff --git a/cpp/include/cuml/cluster/kmeans.hpp b/cpp/include/cuml/cluster/kmeans.hpp
@@ -53,7 +53,7 @@ struct KMeansParams {
   int seed = 0;
 
   // Metric to use for distance computation. Any metric from
-  // ML::Distance::DistanceType can be used
+  // raft::distance::DistanceType can be used
   int metric = 0;
 
   // Number of times the k-means algorithm will be run with different seeds.
@@ -184,7 +184,7 @@ void predict(const raft::handle_t &handle, const KMeansParams &params,
  * sample in 'X' (it should be same as the dimension for each cluster centers in
  * 'centroids').
  * @param[in]     metric        Metric to use for distance computation. Any
- * metric from ML::Distance::DistanceType can be used
+ * metric from raft::distance::DistanceType can be used
  * @param[out]    X_new         X transformed in the new space.
  */
 void transform(const raft::handle_t &handle, const KMeansParams &params,

diff --git a/cpp/src/internals/internals.h → cpp/include/cuml/common/callback.hpp b/cpp/src/internals/internals.h → cpp/include/cuml/common/callback.hpp
@@ -46,4 +46,4 @@ class GraphBasedDimRedCallback : public Callback {
 };
 
 }  // namespace Internals
-}  // namespace ML
+}  // namespace ML
diff --git a/cpp/include/cuml/distance/distance_type.h b/cpp/include/cuml/distance/distance_type.h
@@ -17,6 +17,8 @@ enum DistanceType : unsigned short {
   EucUnexpandedL2 = 4,
   /** same as above, but inside the epilogue, perform square root operation */
   EucUnexpandedL2Sqrt = 5,
+  /** simple inner product */
+  InnerProduct = 6
 };
 
 };  // end namespace Distance