rapidsai · rapids-bot · Feb 9, 2022 · Dec 13, 2021 · Dec 20, 2021 · Dec 20, 2021
@@ -18,11 +18,13 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/linalg/cublas_wrappers.h>
+#include <raft/linalg/gemv.h>
 #include <cuml/solvers/params.hpp>
 #include <functions/linearReg.cuh>
 #include <functions/penalty.cuh>
 #include <functions/softThres.cuh>
 #include <glm/preprocess.cuh>
+// #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/eltwise.cuh>
@@ -39,6 +41,36 @@ namespace Solver {
 
 using namespace MLCommon;
 
+namespace {
+
+/** Epoch and iteration -related state. */
+template <typename math_t>
+struct ConvState {
+  math_t coef;
+  math_t coefMax;
+  math_t diffMax;
+};
+
+template <typename math_t>
+__global__ void __launch_bounds__(1, 1) updateCoefKernel(math_t* coefLoc,
+                                                         const math_t* squaredLoc,
+                                                         ConvState<math_t>* convStateLoc,
+                                                         const math_t l1_alpha)
+{
+  auto coef    = *coefLoc;
+  auto r       = coef > l1_alpha ? coef - l1_alpha : (coef < -l1_alpha ? coef + l1_alpha : 0);
+  auto squared = *squaredLoc;
+  r            = squared > math_t(1e-5) ? r / squared : math_t(0);
+  auto diff    = raft::myAbs(convStateLoc->coef - r);
+  if (convStateLoc->diffMax < diff) convStateLoc->diffMax = diff;
+  auto absv = raft::myAbs(r);
+  if (convStateLoc->coefMax < absv) convStateLoc->coefMax = absv;
+  convStateLoc->coef = -r;
+  *coefLoc           = r;
+}
+
+}  // namespace
+
 /**
  * Fits a linear, lasso, and elastic-net regression model using Coordinate Descent solver
  * @param handle
@@ -95,22 +127,20 @@ void cdFit(const raft::handle_t& handle,
            math_t tol,
            cudaStream_t stream)
 {
+  // RAFT_USING_NVTX_RANGE("ML::Solver::cdFit-%d-%d", n_rows, n_cols);
   ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
   ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");
   ASSERT(loss == ML::loss_funct::SQRD_LOSS,
          "Parameter loss: Only SQRT_LOSS function is supported for now");
 
   cublasHandle_t cublas_handle = handle.get_cublas_handle();
 
-  rmm::device_uvector<math_t> pred(n_rows, stream);
   rmm::device_uvector<math_t> residual(n_rows, stream);
   rmm::device_uvector<math_t> squared(n_cols, stream);
   rmm::device_uvector<math_t> mu_input(0, stream);
   rmm::device_uvector<math_t> mu_labels(0, stream);
   rmm::device_uvector<math_t> norm2_input(0, stream);
 
-  std::vector<math_t> h_coef(n_cols, math_t(0));
-
   if (fit_intercept) {
     mu_input.resize(n_cols, stream);
     mu_labels.resize(1, stream);
@@ -135,7 +165,7 @@ void cdFit(const raft::handle_t& handle,
   initShuffle(ri, g);
 
   math_t l2_alpha = (1 - l1_ratio) * alpha * n_rows;
-  alpha           = l1_ratio * alpha * n_rows;
+  math_t l1_alpha = l1_ratio * alpha * n_rows;
 
   if (normalize) {
     math_t scalar = math_t(1.0) + l2_alpha;
@@ -148,59 +178,67 @@ void cdFit(const raft::handle_t& handle,
 
   raft::copy(residual.data(), labels, n_rows, stream);
 
+  ConvState<math_t> h_convState;
+  rmm::device_uvector<ConvState<math_t>> convStateBuf(1, stream);
+  auto convStateLoc = convStateBuf.data();
+
+  CUBLAS_CHECK(
+    raft::linalg::cublassetpointermode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE, stream));
+  rmm::device_scalar<math_t> cublas_alpha(1.0, stream);
+  rmm::device_scalar<math_t> cublas_beta(0.0, stream);
+
   for (int i = 0; i < epochs; i++) {
+    // RAFT_USING_NVTX_RANGE(stream, "ML::Solver::cdFit::epoch-%d", i);
     if (i > 0 && shuffle) { Solver::shuffle(ri, g); }
 
-    math_t coef_max   = 0.0;
-    math_t d_coef_max = 0.0;
-    math_t coef_prev  = 0.0;
+    CUDA_CHECK(cudaMemsetAsync(convStateLoc, 0, sizeof(ConvState<math_t>), stream));
 
     for (int j = 0; j < n_cols; j++) {
+      // RAFT_USING_NVTX_RANGE(stream, "ML::Solver::cdFit::col-%d", j);
       int ci                = ri[j];
       math_t* coef_loc      = coef + ci;
       math_t* squared_loc   = squared.data() + ci;
       math_t* input_col_loc = input + (ci * n_rows);
 
-      raft::linalg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream);
-      raft::linalg::add(residual.data(), residual.data(), pred.data(), n_rows, stream);
-      raft::linalg::gemm(handle,
-                         input_col_loc,
-                         n_rows,
-                         1,
-                         residual.data(),
-                         coef_loc,
-                         1,
-                         1,
-                         CUBLAS_OP_T,
-                         CUBLAS_OP_N,
-                         stream);
-
-      if (l1_ratio > math_t(0.0)) Functions::softThres(coef_loc, coef_loc, alpha, 1, stream);
-
-      raft::linalg::eltwiseDivideCheckZero(coef_loc, coef_loc, squared_loc, 1, stream);
-
-      coef_prev = h_coef[ci];
-      raft::update_host(&(h_coef[ci]), coef_loc, 1, stream);
-      CUDA_CHECK(cudaStreamSynchronize(stream));
-
-      math_t diff = abs(coef_prev - h_coef[ci]);
-
-      if (diff > d_coef_max) d_coef_max = diff;
-
-      if (abs(h_coef[ci]) > coef_max) coef_max = abs(h_coef[ci]);
-
-      raft::linalg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream);
-      raft::linalg::subtract(residual.data(), residual.data(), pred.data(), n_rows, stream);
+      raft::copy(&(convStateLoc->coef), coef_loc, 1, stream);
+      CUBLAS_CHECK(raft::linalg::cublasaxpy(
+        cublas_handle, n_rows, coef_loc, input_col_loc, 1, residual.data(), 1, stream));
+
+      raft::linalg::cublasgemv(cublas_handle,
+                               CUBLAS_OP_N,
+                               1,
+                               n_rows,
+                               cublas_alpha.data(),
+                               input_col_loc,
+                               1,
+                               residual.data(),
+                               1,
+                               cublas_beta.data(),
+                               coef_loc,
+                               1,
+                               stream);
+
+      updateCoefKernel<math_t><<<dim3(1, 1, 1), dim3(1, 1, 1), 0, stream>>>(
+        coef_loc, squared_loc, convStateLoc, l1_alpha);
+      CUDA_CHECK(cudaGetLastError());
+
+      CUBLAS_CHECK(raft::linalg::cublasaxpy(cublas_handle,
+                                            n_rows,
+                                            &(convStateLoc->coef),
+                                            input_col_loc,
+                                            1,
+                                            residual.data(),
+                                            1,
+                                            stream));
     }
+    raft::update_host(&h_convState, convStateLoc, 1, stream);
+    CUDA_CHECK(cudaStreamSynchronize(stream));
 
-    bool flag_continue = true;
-    if (coef_max == math_t(0)) { flag_continue = false; }
-
-    if ((d_coef_max / coef_max) < tol) { flag_continue = false; }
-
-    if (!flag_continue) { break; }
+    if (h_convState.coefMax < tol || (h_convState.diffMax / h_convState.coefMax) < tol) break;
   }
 
+  CUBLAS_CHECK(raft::linalg::cublassetpointermode(cublas_handle, CUBLAS_POINTER_MODE_HOST, stream));
+
   if (fit_intercept) {
     GLM::postProcessData(handle,
                          input,