Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Moving MNMG decomp to cuml #2427

Merged
merged 21 commits into from
Jul 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- PR #2340: Import ARIMA in the root init file and fix the `test_fit_function` test
- PR #2408: Install meta packages for dependencies
- PR #2417: Move doc customization scripts to Jenkins
- PR #2427: Moving MNMG decomposition to cuml
- PR #2433: Add libcumlprims_mg to CMake
- PR #2420: Add and set convert_dtype default to True in estimator fit methods
- PR #2411: Refactor Mixin classes and use in classifier/regressor estimators
Expand Down
8 changes: 4 additions & 4 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if hasArg --singlegpu; then
SINGLEGPU_PYTHON_FLAG="--singlegpu"
SINGLEGPU_CPP_FLAG=ON
fi
if hasArg mgtests; then
if hasArg cpp-mgtests; then
BUILD_CPP_MG_TESTS=ON
fi
if hasArg --nvtx; then
Expand Down Expand Up @@ -149,7 +149,7 @@ fi

################################################################################
# Configure for building all C++ targets
if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg prims-bench || hasArg cppdocs; then
if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg prims-bench || hasArg cppdocs || hasArg cpp-mgtests; then
if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
GPU_ARCH=""
echo "Building for the architecture of the GPU in the system..."
Expand Down Expand Up @@ -185,7 +185,7 @@ MAKE_TARGETS=
if hasArg libcuml; then
MAKE_TARGETS="${MAKE_TARGETS}cuml++ cuml ml"
fi
if hasArg mgtests; then
if hasArg cpp-mgtests; then
MAKE_TARGETS="${MAKE_TARGETS} ml_mg"
fi
if hasArg prims; then
Expand All @@ -199,7 +199,7 @@ if hasArg prims-bench; then
fi

# If `./build.sh cuml` is called, don't build C/C++ components
if completeBuild || hasArg libcuml || hasArg prims || hasArg bench; then
if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg cpp-mgtests; then
# If there are no targets specified when calling build.sh, it will
# just call `make -j`. This avoids a lot of extra printing
cd ${LIBCUML_BUILD_DIR}
Expand Down
59 changes: 38 additions & 21 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ option(BUILD_CUML_CPP_LIBRARY "Build libcuml++ shared library" ON)

option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON)

option(BUILD_CUML_MG_TESTS "Build cuML multigpu algorithm tests" ON)
option(BUILD_CUML_MG_TESTS "Build cuML multigpu algorithm tests" OFF)

option(BUILD_PRIMS_TESTS "Build ml-prim tests" ON)

Expand Down Expand Up @@ -156,10 +156,10 @@ if(SINGLEGPU)
set(WITH_UCX OFF)
endif(SINGLEGPU)

if(NOT BUILD_CUML_MPI_COMMS AND NOT SINGLEGPU)
message(STATUS "Detected BUILD_CUML_MPI_COMMS set to OFF. Disabling BUILD_CUML_MG_TESTS")
set(BUILD_CUML_MG_TESTS OFF)
endif(NOT BUILD_CUML_MPI_COMMS AND NOT SINGLEGPU)
if(BUILD_CUML_MG_TESTS AND NOT SINGLEGPU)
message(STATUS "Detected BUILD_CUML_MG_TESTS set to ON. Enabling BUILD_CUML_MPI_COMMS")
set(BUILD_CUML_MPI_COMMS ON)
endif(BUILD_CUML_MG_TESTS AND NOT SINGLEGPU)

##############################################################################
# - Requirements -------------------------------------------------------------
Expand Down Expand Up @@ -295,7 +295,6 @@ include(cmake/Dependencies.cmake)

set(CUML_INCLUDE_DIRECTORIES
${CUML_INCLUDE_DIR}
${CUMLPRIMS_MG_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_SOURCE_DIR}/src_prims
${CMAKE_CURRENT_SOURCE_DIR}/test/prims
Expand All @@ -304,16 +303,33 @@ set(CUML_INCLUDE_DIRECTORIES
${CUTLASS_DIR}/src/cutlass
${CUB_DIR}/src/cub
${SPDLOG_DIR}/src/spdlog/include
${RAFT_DIR}/cpp/include)
${RAFT_DIR}/cpp/include
)

set(CUML_LINK_LIBRARIES
set(CUML_PUBLIC_LINK_LIBRARIES
${CUDA_cublas_LIBRARY}
${CUDA_curand_LIBRARY}
${CUDA_cusolver_LIBRARY}
${CUDA_CUDART_LIBRARY}
${CUDA_cusparse_LIBRARY}
${CUDA_nvgraph_LIBRARY}
${CUMLPRIMS_MG_LIBRARIES})
)

set(CUML_PRIVATE_LINK_LIBRARIES
${Protobuf_LIBRARIES}
faisslib
treelite::treelite
treelite::treelite_runtime
)

if(ENABLE_CUMLPRIMS_MG)
list(APPEND CUML_INCLUDE_DIRECTORIES
${CUMLPRIMS_MG_INCLUDE_DIRS})

list(APPEND CUML_PRIVATE_LINK_LIBRARIES
CUMLPRIMS_MG::CUMLPRIMS_MG)

endif(ENABLE_CUMLPRIMS_MG)

##############################################################################
# - build libcuml++ shared library -------------------------------------------
Expand Down Expand Up @@ -359,18 +375,21 @@ if(BUILD_CUML_CPP_LIBRARY)

# mnmg components

# if(NOT SINGLEGPU)
# target_sources(${CUML_CPP_TARGET}
# PRIVATE src/kmeans/kmeans_mg.cu
# )
# endif(NOT SINGLEGPU)
if(NOT SINGLEGPU)
target_sources(${CUML_CPP_TARGET}
PRIVATE
src/pca/pca_mg.cu
src/pca/sign_flip_mg.cu
src/tsvd/tsvd_mg.cu
)
endif(NOT SINGLEGPU)

if(OPENMP_FOUND)
set(CUML_LINK_LIBRARIES ${CUML_LINK_LIBRARIES} ${OpenMP_CXX_LIB_NAMES} Threads::Threads)
set(CUML_PUBLIC_LINK_LIBRARIES ${CUML_PUBLIC_LINK_LIBRARIES} ${OpenMP_CXX_LIB_NAMES} Threads::Threads)
endif(OPENMP_FOUND)

if(NVTX)
set(CUML_LINK_LIBRARIES ${CUML_LINK_LIBRARIES} nvToolsExt)
set(CUML_PUBLIC_LINK_LIBRARIES ${CUML_PUBLIC_LINK_LIBRARIES} nvToolsExt)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
endif(NVTX)

Expand All @@ -379,12 +398,10 @@ if(BUILD_CUML_CPP_LIBRARY)

target_link_libraries(${CUML_CPP_TARGET}
PUBLIC
${CUML_LINK_LIBRARIES}
${CUML_PUBLIC_LINK_LIBRARIES}
PRIVATE
${Protobuf_LIBRARIES}
faisslib
treelite::treelite
treelite::treelite_runtime)
${CUML_PRIVATE_LINK_LIBRARIES}
)
# If we export the libdmlc symbols, they can lead to weird crashes with other
# libraries that use libdmlc. This just hides the symbols internally.
target_link_options(${CUML_CPP_TARGET} PRIVATE "-Wl,--exclude-libs,libdmlc.a")
Expand Down
4 changes: 2 additions & 2 deletions cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ Current cmake offers the following configuration options:
| --- | --- | --- | --- |
| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` |
| BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. |
| BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. Requires MPI installed and turning BUILD_CUML_MPI_COMMS to ON. |
| BUILD_CUML_MG_TESTS | [ON, OFF] | OFF | Enable/disable building cuML algorithm test executable `ml_mg_test`. Requires MPI to be installed. When enabled, BUILD_CUML_MPI_COMMS will be automatically set to ON. |
| BUILD_PRIMS_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. |
| BUILD_CUML_STD_COMMS | [ON, OFF] | ON | Enable/disable building cuML NCCL+UCX communicator for running multi-node multi-GPU algorithms. Note that UCX support can also be enabled/disabled (see below). The standard communicator and MPI communicator are not mutually exclusive and can both be installed at the same time. |
| WITH_UCX | [ON, OFF] | OFF | Enable/disable UCX support in the standard cuML communicator. Algorithms requiring point-to-point messaging will not work when this is disabled. This flag is ignored if BUILD_CUML_STD_COMMS is set to OFF. |
| BUILD_CUML_MPI_COMMS | [ON, OFF] | OFF | Enable/disable building cuML MPI+NCCL communicator for running multi-node multi-GPU C++ tests. MPI communicator and STD communicator are not mutually exclusive and can both be installed at the same time. If OFF, it overrides BUILD_CUML_MG_TESTS to be OFF as well. |
| BUILD_CUML_MPI_COMMS | [ON, OFF] | OFF | Enable/disable building cuML MPI+NCCL communicator for running multi-node multi-GPU C++ tests. MPI communicator and STD communicator may both be installed at the same time. If OFF, it overrides BUILD_CUML_MG_TESTS to be OFF as well. |
| SINGLEGPU | [ON, OFF] | OFF | Disable all mnmg components. Disables building of all multi-GPU algorithms and all comms library components. Removes libcumlprims, UCX-py and NCCL dependencies. Overrides values of BUILD_CUML_MG_TESTS, BUILD_CUML_STD_COMMS, WITH_UCX and BUILD_CUML_MPI_COMMS. |
| BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. |
| BUILD_CUML_BENCH | [ON, OFF] | ON | Enable/disable building of the cuML C++ benchmark. |
Expand Down
150 changes: 150 additions & 0 deletions cpp/include/cuml/decomposition/pca_mg.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <opg/matrix/data.hpp>
#include <opg/matrix/part_descriptor.hpp>
#include "pca.hpp"

#include <common/cumlHandle.hpp>

namespace ML {

// Solver choices for the multi-node multi-GPU (MNMG) decomposition algorithms.
enum class mg_solver { COV_EIG_DQ, COV_EIG_JACOBI, QR };

// TSVD parameter struct specialized on the MNMG solver enum.
typedef paramsTSVDTemplate<mg_solver> paramsTSVDMG;

// PCA parameter struct specialized on the MNMG solver enum.
typedef paramsPCATemplate<mg_solver> paramsPCAMG;

namespace PCA {
namespace opg {

/**
 * @brief Performs the MNMG fit operation for PCA.
 * @param[in]  handle: the internal cuml handle object
 * @param[in]  input_data: vector of this rank's partitions of the input data
 * @param[in]  input_desc: MNMG partitioning description of the input
 * @param[out] components: principal components of the input data
 * @param[out] explained_var: explained variance
 * @param[out] explained_var_ratio: explained variance ratio
 * @param[out] singular_vals: singular values of the data
 * @param[out] mu: mean of every column in input
 * @param[out] noise_vars: variance of the noise
 * @param[in]  prms: data structure that includes all the parameters from input size to algorithm
 * @param[in]  verbose: enable verbose logging
 */
void fit(cumlHandle &handle,
         std::vector<MLCommon::Matrix::Data<float> *> &input_data,
         MLCommon::Matrix::PartDescriptor &input_desc, float *components,
         float *explained_var, float *explained_var_ratio, float *singular_vals,
         float *mu, float *noise_vars, paramsPCAMG prms, bool verbose = false);

// double-precision overload of fit() above
void fit(cumlHandle &handle,
         std::vector<MLCommon::Matrix::Data<double> *> &input_data,
         MLCommon::Matrix::PartDescriptor &input_desc, double *components,
         double *explained_var, double *explained_var_ratio,
         double *singular_vals, double *mu, double *noise_vars,
         paramsPCAMG prms, bool verbose = false);

/**
 * @brief Performs the MNMG fit-and-transform operation for PCA.
 * @param[in]  handle: the internal cuml handle object
 * @param[in]  rank_sizes: partition size information for each rank
 * @param[in]  n_parts: number of partitions
 * @param[in]  input: input data
 * @param[out] trans_input: transformed input data
 * @param[out] components: principal components of the input data
 * @param[out] explained_var: explained variance
 * @param[out] explained_var_ratio: explained variance ratio
 * @param[out] singular_vals: singular values of the data
 * @param[out] mu: mean of every column in input
 * @param[out] noise_vars: variance of the noise
 * @param[in]  prms: data structure that includes all the parameters from input size to algorithm
 * @param[in]  verbose: enable verbose logging
 */
void fit_transform(cumlHandle &handle,
                   MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts,
                   MLCommon::Matrix::floatData_t **input,
                   MLCommon::Matrix::floatData_t **trans_input,
                   float *components, float *explained_var,
                   float *explained_var_ratio, float *singular_vals, float *mu,
                   float *noise_vars, paramsPCAMG prms, bool verbose);

// double-precision overload of fit_transform() above
void fit_transform(cumlHandle &handle,
                   MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts,
                   MLCommon::Matrix::doubleData_t **input,
                   MLCommon::Matrix::doubleData_t **trans_input,
                   double *components, double *explained_var,
                   double *explained_var_ratio, double *singular_vals,
                   double *mu, double *noise_vars, paramsPCAMG prms,
                   bool verbose);

/**
 * @brief Performs the MNMG transform operation for PCA.
 * @param[in]  handle: the internal cuml handle object
 * @param[in]  rank_sizes: partition size information for each rank
 * @param[in]  n_parts: number of partitions
 * @param[in]  input: input data
 * @param[in]  components: principal components of the input data
 * @param[out] trans_input: transformed input data
 * @param[in]  singular_vals: singular values of the data
 * @param[in]  mu: mean of every column in input
 * @param[in]  prms: data structure that includes all the parameters from input size to algorithm
 * @param[in]  verbose: enable verbose logging
 */
void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes,
               size_t n_parts, MLCommon::Matrix::Data<float> **input,
               float *components, MLCommon::Matrix::Data<float> **trans_input,
               float *singular_vals, float *mu, paramsPCAMG prms, bool verbose);

// double-precision overload of transform() above
void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes,
               size_t n_parts, MLCommon::Matrix::Data<double> **input,
               double *components, MLCommon::Matrix::Data<double> **trans_input,
               double *singular_vals, double *mu, paramsPCAMG prms,
               bool verbose);

/**
 * @brief Performs the MNMG inverse-transform operation for PCA.
 * @param[in]  handle: the internal cuml handle object
 * @param[in]  rank_sizes: partition size information for each rank
 * @param[in]  n_parts: number of partitions
 * @param[in]  trans_input: transformed input data
 * @param[in]  components: principal components of the input data
 * @param[out] input: reconstructed (un-transformed) data
 * @param[in]  singular_vals: singular values of the data
 * @param[in]  mu: mean of every column in input
 * @param[in]  prms: data structure that includes all the parameters from input size to algorithm
 * @param[in]  verbose: enable verbose logging
 */
void inverse_transform(cumlHandle &handle,
                       MLCommon::Matrix::RankSizePair **rank_sizes,
                       size_t n_parts,
                       MLCommon::Matrix::Data<float> **trans_input,
                       float *components, MLCommon::Matrix::Data<float> **input,
                       float *singular_vals, float *mu, paramsPCAMG prms,
                       bool verbose);

// double-precision overload of inverse_transform() above
void inverse_transform(
  cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes,
  size_t n_parts, MLCommon::Matrix::Data<double> **trans_input,
  double *components, MLCommon::Matrix::Data<double> **input,
  double *singular_vals, double *mu, paramsPCAMG prms, bool verbose);

};  // end namespace opg
};  // end namespace PCA
};  // end namespace ML
50 changes: 50 additions & 0 deletions cpp/include/cuml/decomposition/sign_flip_mg.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <common/cumlHandle.hpp>
#include <opg/matrix/data.hpp>
#include <opg/matrix/part_descriptor.hpp>

namespace ML {
namespace PCA {
namespace opg {

/**
 * @brief Sign flip for PCA and tSVD. Used to stabilize the sign of
 *        column-major eigen vectors.
 * @param[in]     handle: the internal cuml handle object
 * @param[in,out] input_data: this rank's partitions of the input matrix, used
 *                to determine the sign
 * @param[in]     input_desc: MNMG partitioning description of the input
 * @param[in,out] components: components matrix
 * @param[in]     n_components: number of columns of the components matrix
 * @param[in]     streams: cuda streams
 * @param[in]     n_stream: number of streams
 */
void sign_flip(cumlHandle &handle,
               std::vector<MLCommon::Matrix::Data<float> *> &input_data,
               MLCommon::Matrix::PartDescriptor &input_desc, float *components,
               int n_components, cudaStream_t *streams, int n_stream);

// double-precision overload of sign_flip() above
void sign_flip(cumlHandle &handle,
               std::vector<MLCommon::Matrix::Data<double> *> &input_data,
               MLCommon::Matrix::PartDescriptor &input_desc, double *components,
               int n_components, cudaStream_t *streams, int n_stream);

};  // end namespace opg
};  // end namespace PCA
};  // end namespace ML
Loading