Merge pull request rapidsai#15 from rapidsai/branch-0.10
Merge b10
danielhanchen authored Aug 16, 2019
2 parents 57801fd + 959fc41 commit 989fa57
Showing 16 changed files with 86 additions and 40 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,12 @@
+ # cuML 0.10.0 (Date TBD)
+
+ ## New Features
+
+ ## Improvements
+
+ ## Bug Fixes
+
+
# cuML 0.9.0 (Date TBD)

## New Features
@@ -29,6 +38,7 @@
- PR #930: Dask RF
- PR #882: TSNE - T-Distributed Stochastic Neighbourhood Embedding
- PR #926: Wrapper for FIL
+ - PR #960: Enable using libcumlprims for MG algorithms/prims

## Improvements
- PR #822: build: build.sh update to club all make targets together
@@ -52,6 +62,7 @@
- PR #976: Update api.rst for new 0.9 classes
- PR #973: Use cudaDeviceGetAttribute instead of relying on cudaDeviceProp object being passed
- PR #978: Update README for 0.9
+ - PR #1009: Fix references to notebooks-contrib

## Bug Fixes

@@ -73,6 +84,7 @@
- PR #988: Switch to exact tsne
- PR #991: Cleanup python code in cuml.dask.cluster
- PR #996: ucx_initialized being properly set in CommsContext
+ - PR #1007: Throws a well-defined error when multigpu is not enabled

# cuML 0.8.0 (27 June 2019)

8 changes: 4 additions & 4 deletions README.md
@@ -10,7 +10,7 @@ programming. In most cases, cuML's Python API matches the API from

For large datasets, these GPU-based implementations can complete 10-50x faster
than their CPU equivalents. For details on performance, see the [cuML Benchmarks
- Notebook](https://github.com/rapidsai/notebooks-extended/blob/master/intermediate_notebooks/benchmarks/cuml_benchmarks.ipynb).
+ Notebook](https://github.com/rapidsai/notebooks-contrib/blob/master/intermediate_notebooks/benchmarks/cuml_benchmarks.ipynb).

As an example, the following Python snippet loads input and computes DBSCAN clusters, all on GPU:
```python
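# NOTE: the snippet body is truncated in this capture. What follows is a
# hedged reconstruction based on the cuML README of this era, not the
# verbatim file contents; the sample values and eps/min_samples settings
# are assumptions for illustration.
import cudf
from cuml.cluster import DBSCAN

# Populate a GPU DataFrame with three float columns
gdf_float = cudf.DataFrame()
gdf_float['0'] = [1.0, 2.0, 5.0]
gdf_float['1'] = [4.0, 2.0, 1.0]
gdf_float['2'] = [4.0, 2.0, 1.0]

# Fit DBSCAN entirely on the GPU and read back the cluster labels
dbscan_float = DBSCAN(eps=1.0, min_samples=1)
dbscan_float.fit(gdf_float)

print(dbscan_float.labels_)
```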
@@ -62,8 +62,8 @@ For additional examples, browse our complete [API
documentation](https://docs.rapids.ai/api/cuml/stable/), or check out our
introductory [walkthrough
notebooks](https://github.com/rapidsai/notebooks/tree/master/cuml). Finally, you
- can find complete end-to-end examples in the [notebooks-extended
- repo](https://github.com/rapidsai/notebooks-extended).
+ can find complete end-to-end examples in the [notebooks-contrib
+ repo](https://github.com/rapidsai/notebooks-contrib).


### Supported Algorithms
@@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-extended).
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) |
| | Uniform Manifold Approximation and Projection (UMAP) | |
| | Random Projection | |
- | | t-Distributed Stochastic Neighbor Embedding (TSNE) | |
+ | | t-Distributed Stochastic Neighbor Embedding (TSNE) | (Experimental) |
| **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package |
| | Linear Regression with Lasso or Ridge Regularization | |
| | ElasticNet Regression | |
5 changes: 3 additions & 2 deletions ci/gpu/build.sh
@@ -46,6 +46,7 @@ conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c rapidsai/label/x
cudf=${MINOR_VERSION} \
rmm=${MINOR_VERSION} \
nvstrings=${MINOR_VERSION} \
+ libcumlprims=${MINOR_VERSION} \
lapack \
cmake==3.14.3 \
umap-learn \
@@ -73,7 +74,7 @@ conda list
################################################################################

logger "Build libcuml..."
- $WORKSPACE/build.sh clean libcuml cuml prims -v
+ $WORKSPACE/build.sh clean libcuml cuml prims --multigpu -v

################################################################################
# TEST - Run GoogleTest and py.tests for libcuml and cuML
@@ -93,7 +94,7 @@ GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml

logger "Python pytest for cuml..."
cd $WORKSPACE/python
- pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v -m "not mg"
+ pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v

################################################################################
# TEST - Run GoogleTest for ml-prims
2 changes: 1 addition & 1 deletion conda/recipes/cuml-cuda10/build.sh
@@ -1,4 +1,4 @@
#!/usr/bin/env bash

# This assumes the script is executed from the root of the repo directory
- ./build.sh cuml #--multigpu
+ ./build.sh cuml --multigpu
5 changes: 3 additions & 2 deletions conda/recipes/cuml-cuda10/meta.yaml
@@ -30,13 +30,14 @@ requirements:
- cmake>=3.14
- cudf 0.9*
- libcuml={{ version }}
- - libcumlMG
+ - libcumlprims=0.9
- cudatoolkit {{ cuda_version }}.*
run:
- python x.x
- cudf 0.9*
- libcuml={{ version }}
- - libcumlMG
+ - libcumlprims=0.9
+ - nccl 2.4.*
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}

about:
1 change: 1 addition & 0 deletions conda/recipes/cuml/meta.yaml
@@ -36,6 +36,7 @@ requirements:
- python x.x
- cudf {{ minor_version }}
- libcuml={{ version }}
+ - nccl 2.4.*
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}

about:
7 changes: 5 additions & 2 deletions conda/recipes/libcuml/meta.yaml
@@ -1,7 +1,7 @@
# Copyright (c) 2018, NVIDIA CORPORATION.

# Usage:
- # conda build . -c defaults -c conda-forge -c numba -c rapidsai -c pytorch
+ # conda build . -c defaults -c conda-forge -c nvidia -c rapidsai -c pytorch
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %}
@@ -28,12 +28,15 @@ build:
requirements:
build:
- cmake>=3.14
+ - libclang
host:
+ - nccl 2.4.*
- cudf {{ minor_version }}
- cudatoolkit {{ cuda_version }}.*
- lapack
+ - libclang
run:
- cudf {{ minor_version }}
+ - nccl 2.4.*
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}

about:
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -19,7 +19,7 @@ set (CMAKE_FIND_NO_INSTALL_PREFIX TRUE FORCE)
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
cmake_policy(SET CMP0079 NEW)

- project(CUML VERSION 0.9.0 LANGUAGES CXX CUDA)
+ project(CUML VERSION 0.10.0 LANGUAGES CXX CUDA)


###################################################################################################
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -59,9 +59,9 @@
# built documents.
#
# The short X.Y version.
- version = '0.9'
+ version = '0.10'
# The full version, including alpha/beta/rc tags.
- release = '0.9.0a'
+ release = '0.10.0a'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
8 changes: 7 additions & 1 deletion python/cuml/dask/cluster/kmeans.py
@@ -105,7 +105,13 @@ def func_fit(sessionId, n_clusters, max_iter, tol, verbose, random_state,
:param r: Stops memoization caching
:return: The fit model
"""
- from cuml.cluster.kmeans_mg import KMeansMG as cumlKMeans
+ try:
+     from cuml.cluster.kmeans_mg import KMeansMG as cumlKMeans
+ except ImportError:
+     raise Exception("cuML has not been built with multiGPU support "
+                     "enabled. Build with the --multigpu flag to"
+                     " enable multiGPU support.")

handle = worker_state(sessionId)["handle"]

df = concat(dfs)
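For context, the guarded import added above is what surfaces to Dask users when cuML was built without libcumlprims. A minimal sketch of driving this wrapper end to end follows; the LocalCUDACluster setup, the dask_cudf partitioning, and the `cuml.dask.cluster` import path are assumptions about the surrounding tooling of this release, not part of this diff:

```python
# Sketch only: assumes a --multigpu build of cuML plus dask-cuda and dask_cudf.
import cudf
import dask_cudf
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from cuml.dask.cluster import KMeans

cluster = LocalCUDACluster()  # one Dask worker per local GPU
client = Client(cluster)

# Small distributed GPU DataFrame to cluster
gdf = cudf.DataFrame({'x': [1.0, 2.0, 5.0, 6.0], 'y': [1.0, 2.0, 5.0, 6.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

kmeans = KMeans(n_clusters=2)
kmeans.fit(ddf)  # raises the Exception above if KMeansMG is unavailable
```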
11 changes: 9 additions & 2 deletions python/cuml/dask/linear_model/linear_regression.py
@@ -366,8 +366,12 @@ def _fit_on_worker(data, params):
from cuml.linear_model.linear_regression_mg import LinearRegressionMG as cuOLS # NOQA
ols = cuOLS()
intercept = ols._fit_mg(alloc_info, params)
+ except ImportError:
+     raise Exception("cuML has not been built with multiGPU support "
+                     "enabled. Build with the --multigpu flag to enable"
+                     " multiGPU support.")
except Exception as e:
print("FAILURE in FIT: " + str(e))
print("Failure in Fit(): " + str(e))

[t.close() for t in open_ipcs]
# [t.join() for t in open_ipcs]
@@ -413,7 +417,10 @@ def _predict_on_worker(data, intercept, params):
from cuml.linear_model.linear_regression_mg import LinearRegressionMG as cuOLS # NOQA
ols = cuOLS()
ols._predict_mg(alloc_info, intercept, params)

+ except ImportError:
+     raise Exception("cuML has not been built with multiGPU support "
+                     "enabled. Build with the --multigpu flag to enable"
+                     " multiGPU support.")
except Exception as e:
print("Failure in predict(): " + str(e))

2 changes: 1 addition & 1 deletion python/cuml/decomposition/tsvd_mg.pyx
@@ -31,7 +31,7 @@ from libc.stdint cimport uintptr_t
from cuml.decomposition.utils cimport *
from cuml.utils import zeros

cdef extern from "tsvd/tsvd_spmg.h" namespace "ML":
cdef extern from "cumlprims/spmg/tsvd_spmg.hpp" namespace "ML":

cdef void tsvdFitSPMG(float *h_input,
float *h_components,
24 changes: 12 additions & 12 deletions python/cuml/linear_model/linear_regression_mg.pyx
@@ -32,7 +32,7 @@ from libc.stdlib cimport calloc, malloc, free
from cuml.utils import zeros


cdef extern from "glm/glm_spmg.h" namespace "ML::GLM":
cdef extern from "cumlprims/spmg/glm_spmg.hpp" namespace "ML::GLM":

cdef void olsFitSPMG(float *h_input,
int n_rows,
@@ -74,7 +74,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM":
int *gpu_ids,
int n_gpus)

- cdef void spmgOlsFit(float **input,
+ cdef void olsFitSPMG(float **input,
int *input_cols,
int n_rows,
int n_cols,
@@ -87,7 +87,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM":
bool normalize,
int n_gpus)

- cdef void spmgOlsFit(double **input,
+ cdef void olsFitSPMG(double **input,
int *input_cols,
int n_rows,
int n_cols,
@@ -100,7 +100,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM":
bool normalize,
int n_gpus)

- cdef void spmgOlsPredict(float **input,
+ cdef void olsPredictSPMG(float **input,
int *input_cols,
int n_rows,
int n_cols,
@@ -111,7 +111,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM":
int *pred_cols,
int n_gpus)

- cdef void spmgOlsPredict(double **input,
+ cdef void olsPredictSPMG(double **input,
int *input_cols,
int n_rows,
int n_cols,
@@ -132,7 +132,7 @@ class LinearRegressionMG:
.. code-block:: python
- from cuml import LinearRegression
+ from cuml import LinearRegressionMG
import numpy as np
@@ -161,7 +161,7 @@ class LinearRegressionMG:
61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0],
dtype=np.float32)
- lr = LinearRegression()
+ lr = LinearRegressionMG()
res = lr.fit(X, y, gpu_ids=[0,1])
@@ -322,7 +322,7 @@ class LinearRegressionMG:
cdef uintptr_t X_ptr, y_ptr, gpu_ids_ptr, coef_ptr

self.gdf_datatype = X.dtype
- self.coef_ = zeros(X.shape[1], dtype=X.dtype)
+ self.coef_ = np.zeros(X.shape[1], dtype=X.dtype)

X_ptr = X.ctypes.data
y_ptr = y.ctypes.data
@@ -476,7 +476,7 @@ class LinearRegressionMG:

idx = idx + 1

- spmgOlsFit(<float**> input32,
+ olsFitSPMG(<float**> input32,
<int*> input_cols,
<int> n_rows,
<int> n_cols,
@@ -514,7 +514,7 @@ class LinearRegressionMG:

idx = idx + 1

- spmgOlsFit(<double**> input64,
+ olsFitSPMG(<double**> input64,
<int*> input_cols,
<int> n_rows,
<int> n_cols,
@@ -586,7 +586,7 @@ class LinearRegressionMG:

idx = idx + 1

- spmgOlsPredict(<float**>input32,
+ olsPredictSPMG(<float**>input32,
<int*>input_cols,
<int> n_rows,
<int> n_cols,
@@ -620,7 +620,7 @@ class LinearRegressionMG:

idx = idx + 1

- spmgOlsPredict(<double**>input64,
+ olsPredictSPMG(<double**>input64,
<int*>input_cols,
<int> n_rows,
<int> n_cols,
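The renames above are purely at the C extern level; the Python-facing API shown in this file's own docstring is unchanged. A minimal sketch of that single-process multi-GPU usage, mirroring the docstring example (the two-GPU `gpu_ids` value is illustrative and assumes two visible devices):

```python
# Sketch only: mirrors the LinearRegressionMG docstring example from this file.
import numpy as np
from cuml.linear_model.linear_regression_mg import LinearRegressionMG

# Host-side float32 inputs, as in the docstring example
X = np.array([[1.0, 1.0], [1.0, 2.0], [2.0, 2.0], [2.0, 3.0]], dtype=np.float32)
y = np.dot(X, np.array([1.0, 2.0], dtype=np.float32)) + 3.0

lr = LinearRegressionMG()
res = lr.fit(X, y, gpu_ids=[0, 1])  # spreads the SPMG fit across both GPUs
print(lr.coef_)
```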
15 changes: 10 additions & 5 deletions python/cuml/manifold/t_sne.pyx
@@ -78,15 +78,20 @@ class TSNE(Base):
dataset you give it, and is used in many areas including cancer research,
music analysis and neural network weight visualizations.
- cuML's TSNE implementation handles any # of n_components although
- specifying n_components = 2 will use the Barnes Hut algorithm which scales
- much better for large data since it is a O(NlogN) algorithm.
+ The current cuML TSNE implementation is a first experimental release. It
+ defaults to using the 'exact' fitting algorithm, which is significantly
+ slower than the Barnes-Hut algorithm as data sizes grow. A preview
+ implementation of Barnes-Hut (derived from CannyLabs' BH open source CUDA
+ code) is also available for problems with n_components = 2, though this
+ implementation currently has outstanding issues that can lead to crashes
+ in rare scenarios. Future releases of TSNE will fix these issues (tracked
+ as cuML Issue #1002) and switch Barnes-Hut to be the default.
Parameters
----------
n_components : int (default 2)
- The output dimensionality size. Can be any number, but with
- n_components = 2 TSNE can run faster.
+ The output dimensionality size. Currently only size=2 is tested, but
+ the 'exact' algorithm will support greater dimensionality in future.
perplexity : float (default 30.0)
Larger datasets require a larger value. Consider choosing different
perplexity values from 5 to 50 and see the output differences.
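Since the docstring above describes the algorithm choice without showing a call, here is a minimal usage sketch. The `cuml.manifold` import path and the `fit_transform` call are assumptions consistent with the class shown above, not part of this diff:

```python
# Sketch only: exercises the experimental TSNE described in the docstring.
import numpy as np
from cuml.manifold import TSNE

X = np.random.rand(1000, 50).astype(np.float32)

tsne = TSNE(n_components=2, perplexity=30.0)  # 'exact' is the stated default
embedding = tsne.fit_transform(X)
print(embedding.shape)  # expect (1000, 2)
```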
1 change: 1 addition & 0 deletions python/cuml/test/dask/test_linear_regression.py
@@ -42,6 +42,7 @@ def load_data(nrows, ncols, cached='data/mortgage.npy.gz'):
else:
print('use random data')
X = np.random.rand(nrows, ncols)
+ y = np.random.rand(nrows, 1)

df_X = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])})
df_y = pd.DataFrame({'fea%d' % i: y[:, i] for i in range(y.shape[1])})