[FEA] Support no regularization in MNMG LogisticRegression (#5558)

This PR also adopts the code structure of the SG (single-GPU) class to prepare for future PRs. It depends on, and already includes, [PR 5567](#5567). Authors: Jinfeng Li (https://github.com/lijinf2). Approvers: Dante Gama Dessavre (https://github.com/dantegd). URL: #5558
rapidsai · Sep 21, 2023 · f10a320 · f10a320
1 parent 7813ef8
commit f10a320
Show file tree

Hide file tree

Showing 7 changed files with 184 additions and 50 deletions.
diff --git a/cpp/src/glm/qn/glm_base_mg.cuh → cpp/src/glm/qn/mg/glm_base_mg.cuh b/cpp/src/glm/qn/glm_base_mg.cuh → cpp/src/glm/qn/mg/glm_base_mg.cuh
@@ -121,10 +121,12 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData<T, GLMObjective> {
     auto lossFunc        = regularizer_obj->loss;
     auto reg             = regularizer_obj->reg;
     G.fill(0, stream);
-    reg->reg_grad(dev_scalar, G, W, lossFunc->fit_intercept, stream);
-    float reg_host;
-    raft::update_host(&reg_host, dev_scalar, 1, stream);
-    // note: avoid syncing here because there's a sync before reg_host is used.
+    float reg_host = 0;
+    if (reg->l2_penalty != 0) {
+      reg->reg_grad(dev_scalar, G, W, lossFunc->fit_intercept, stream);
+      raft::update_host(&reg_host, dev_scalar, 1, stream);
+      // note: avoid syncing here because there's a sync before reg_host is used.
+    }
 
     // apply linearFwd, getLossAndDz, linearBwd
     ML::GLM::detail::linearFwd(

diff --git a/cpp/src/glm/qn/mg/qn_mg.cuh b/cpp/src/glm/qn/mg/qn_mg.cuh
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "glm_base_mg.cuh"
+#include <glm/qn/glm_logistic.cuh>
+#include <glm/qn/glm_regularizer.cuh>
+#include <glm/qn/glm_softmax.cuh>
+#include <glm/qn/glm_svm.cuh>
+#include <glm/qn/qn_solvers.cuh>
+#include <glm/qn/qn_util.cuh>
+
+#include <cuml/linear_model/qn.h>
+#include <rmm/device_uvector.hpp>
+
+namespace ML {
+namespace GLM {
+namespace opg {
+using namespace ML::GLM::detail;
+
+/**
+ * @brief Minimize an L2-regularized GLM loss through the multi-GPU objective wrapper.
+ *
+ * Wraps `loss` in a Tikhonov (L2) regularizer, builds a `GLMWithDataMG` objective from it and
+ * hands that objective to `qn_minimize`. The L1 strength passed to the solver is always 0;
+ * `pams.penalty_l1` is not read here.
+ *
+ * @param[in]     handle    RAFT handle forwarded to the objective and to the solver
+ * @param[in]     pams      solver parameters; `penalty_l2`, `penalty_normalized` and `verbose`
+ *                          are read directly, the rest is consumed through `LBFGSParam`
+ * @param[in]     loss      loss object; `loss.n_param` gives the length of the coefficient vector
+ * @param[in]     X         input data forwarded to the objective
+ * @param[in]     y         targets forwarded to the objective
+ * @param[in,out] Z         work matrix forwarded to the objective
+ * @param[in,out] w0_data   initial value and result
+ * @param[out]    fx        forwarded to `qn_minimize` (presumably receives the final objective
+ *                          value -- TODO confirm)
+ * @param[out]    num_iters forwarded to `qn_minimize` (presumably receives the iteration count --
+ *                          TODO confirm)
+ * @param[in]     n_samples sample count used to normalize the L2 penalty and forwarded to the
+ *                          objective (presumably the total over all ranks -- TODO confirm)
+ * @param[in]     rank      forwarded to `GLMWithDataMG`
+ * @param[in]     n_ranks   forwarded to `GLMWithDataMG`
+ * @return the value returned by `qn_minimize`
+ */
+template <typename T, typename LossFunction>
+int qn_fit_mg(const raft::handle_t& handle,
+              const qn_params& pams,
+              LossFunction& loss,
+              const SimpleMat<T>& X,
+              const SimpleVec<T>& y,
+              SimpleDenseMat<T>& Z,
+              T* w0_data,  // initial value and result
+              T* fx,
+              int* num_iters,
+              size_t n_samples,
+              int rank,
+              int n_ranks)
+{
+  LBFGSParam<T> opt_param(pams);
+  SimpleVec<T> w0(w0_data, loss.n_param);
+
+  // Scale the regularization strength with the number of samples.
+  // Only an L2 penalty is configured here; the L1 strength handed to the solver stays 0.
+  T l1 = 0;
+  T l2 = pams.penalty_l2;
+  if (pams.penalty_normalized) { l2 /= n_samples; }
+
+  ML::GLM::detail::Tikhonov<T> reg(l2);
+  ML::GLM::detail::RegularizedGLM<T, LossFunction, decltype(reg)> regularizer_obj(&loss, &reg);
+
+  // The objective keeps pointers to `loss` and `reg` via `regularizer_obj`, so all three must
+  // stay alive until qn_minimize returns.
+  auto obj_function = GLMWithDataMG(handle, rank, n_ranks, n_samples, &regularizer_obj, X, y, Z);
+  return ML::GLM::detail::qn_minimize(
+    handle, w0, fx, num_iters, obj_function, l1, opt_param, pams.verbose);
+}
+
+/**
+ * @brief Select the loss from `pams.loss` and run the multi-GPU quasi-Newton fit.
+ *
+ * Allocates the work buffer behind `Z`, wraps `y_data` in a vector of length `X.m` and dispatches
+ * to `qn_fit_mg`. Only `QN_LOSS_LOGISTIC` is handled; any other loss hits the assertion in the
+ * `default` branch. The value returned by `qn_fit_mg` is discarded.
+ *
+ * @param[in]     handle        RAFT handle; its stream is used for the temporary allocation
+ * @param[in]     pams          solver parameters; `loss` and `fit_intercept` are read here
+ * @param[in]     X             input data; `X.m` is taken as the row count, `X.n` as the column
+ *                              count
+ * @param[in]     y_data        targets, wrapped as a vector of length `X.m`
+ * @param[in]     C             number of output classes; must be 2 for the logistic loss
+ * @param[in,out] w0_data       coefficient buffer forwarded to `qn_fit_mg`
+ * @param[out]    f             forwarded to `qn_fit_mg` as `fx`
+ * @param[out]    num_iters     forwarded to `qn_fit_mg`
+ * @param[in]     n_samples     sample count forwarded to `qn_fit_mg`
+ * @param[in]     rank          forwarded to `qn_fit_mg`
+ * @param[in]     n_ranks       forwarded to `qn_fit_mg`
+ * @param[in]     sample_weight currently unused by this function
+ * @param[in]     svr_eps       currently unused by this function
+ */
+template <typename T>
+inline void qn_fit_x_mg(const raft::handle_t& handle,
+                        const qn_params& pams,
+                        SimpleMat<T>& X,
+                        T* y_data,
+                        int C,
+                        T* w0_data,
+                        T* f,
+                        int* num_iters,
+                        int64_t n_samples,
+                        int rank,
+                        int n_ranks,
+                        T* sample_weight = nullptr,
+                        T svr_eps        = 0)
+{
+  /*
+   NB:
+    N - number of data rows
+    D - number of data columns (features)
+    C - number of output classes
+
+    X in R^[N, D]
+    w in R^[D, C]
+    y in {0, 1}^[N, C] or {cat}^N
+
+    Dimensionality of w0 depends on loss, so we initialize it later.
+   */
+  cudaStream_t stream = handle.get_stream();
+  const int N         = X.m;
+  const int D         = X.n;
+  // Binary classification needs a single output column; every other case uses C columns.
+  const int n_targets = (ML::GLM::detail::qn_is_classification(pams.loss) && C == 2) ? 1 : C;
+  // Widen before multiplying: `n_targets * N` evaluated in int can overflow for large inputs.
+  const size_t z_len = static_cast<size_t>(n_targets) * static_cast<size_t>(N);
+  rmm::device_uvector<T> tmp(z_len, stream);
+  SimpleDenseMat<T> Z(tmp.data(), n_targets, N);
+  SimpleVec<T> y(y_data, N);
+
+  switch (pams.loss) {
+    case QN_LOSS_LOGISTIC: {
+      ASSERT(C == 2, "qn_mg.cuh: logistic loss invalid C");
+      ML::GLM::detail::LogisticLoss<T> loss(handle, D, pams.fit_intercept);
+      // NOTE(review): the int returned by qn_fit_mg is dropped here -- confirm that is intended.
+      ML::GLM::opg::qn_fit_mg<T, decltype(loss)>(
+        handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks);
+    } break;
+    default: {
+      ASSERT(false, "qn_mg.cuh: unknown loss function type (id = %d).", pams.loss);
+    }
+  }
+}
+
+};  // namespace opg
+};  // namespace GLM
+};  // namespace ML
diff --git a/cpp/src/glm/qn_mg.cu b/cpp/src/glm/qn_mg.cu
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
-#include "qn/glm_logistic.cuh"
-#include "qn/glm_regularizer.cuh"
-#include "qn/qn_util.cuh"
+#include "qn/mg/qn_mg.cuh"
 #include "qn/simple_mat/dense.hpp"
+#include <cuda_runtime.h>
 #include <cuml/common/logger.hpp>
 #include <cuml/linear_model/qn.h>
 #include <cuml/linear_model/qn_mg.hpp>
@@ -27,10 +26,6 @@
 #include <raft/util/cudart_utils.hpp>
 using namespace MLCommon;
 
-#include "qn/glm_base_mg.cuh"
-
-#include <cuda_runtime.h>
-
 namespace ML {
 namespace GLM {
 namespace opg {
@@ -62,39 +57,20 @@ void qnFit_impl(const raft::handle_t& handle,
     }
   }
 
-  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
-  auto X_simple       = SimpleDenseMat<T>(X, N, D, X_col_major ? COL_MAJOR : ROW_MAJOR);
-  auto y_simple       = SimpleVec<T>(y, N);
-  SimpleVec<T> coef_simple(w0, D + pams.fit_intercept);
-
-  ML::GLM::detail::LBFGSParam<T> opt_param(pams);
-
-  // prepare regularizer regularizer_obj
-  ML::GLM::detail::LogisticLoss<T> loss_func(handle, D, pams.fit_intercept);
-  T l2 = pams.penalty_l2;
-  if (pams.penalty_normalized) {
-    l2 /= n_samples;  // l2 /= 1/X.m
-  }
-  ML::GLM::detail::Tikhonov<T> reg(l2);
-  ML::GLM::detail::RegularizedGLM<T, ML::GLM::detail::LogisticLoss<T>, decltype(reg)>
-    regularizer_obj(&loss_func, &reg);
-
-  // prepare GLMWithDataMG
-  int n_targets = C == 2 ? 1 : C;
-  rmm::device_uvector<T> tmp(n_targets * N, stream);
-  SimpleDenseMat<T> Z(tmp.data(), n_targets, N);
-  auto obj_function =
-    GLMWithDataMG(handle, rank, n_ranks, n_samples, &regularizer_obj, X_simple, y_simple, Z);
-
-  // prepare temporary variables fx, k, workspace
-  float fx = -1;
-  int k    = -1;
-  rmm::device_uvector<float> tmp_workspace(lbfgs_workspace_size(opt_param, coef_simple.len),
-                                           stream);
-  SimpleVec<float> workspace(tmp_workspace.data(), tmp_workspace.size());
-
-  // call min_lbfgs
-  min_lbfgs(opt_param, obj_function, coef_simple, fx, &k, workspace, stream, 5);
+  auto X_simple = SimpleDenseMat<T>(X, N, D, X_col_major ? COL_MAJOR : ROW_MAJOR);
+
+  ML::GLM::opg::qn_fit_x_mg(handle,
+                            pams,
+                            X_simple,
+                            y,
+                            C,
+                            w0,
+                            f,
+                            num_iters,
+                            n_samples,
+                            rank,
+                            n_ranks);  // ignore sample_weight, svr_eps
+  return;
 }
 
 template <typename T>

diff --git a/python/cuml/linear_model/logistic_regression_mg.pyx b/python/cuml/linear_model/logistic_regression_mg.pyx
@@ -103,7 +103,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
         self.solver_model.coef_ = value
 
     def prepare_for_fit(self, n_classes):
-        self.qnparams = QNParams(
+        self.solver_model.qnparams = QNParams(
             loss=self.loss,
             penalty_l1=self.l1_strength,
             penalty_l2=self.l2_strength,
@@ -176,10 +176,11 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
 
         # TODO: calculate _num_classes at runtime
         self._num_classes = 2
+        self.loss = "sigmoid" if self._num_classes <= 2 else "softmax"
         self.prepare_for_fit(self._num_classes)
         cdef uintptr_t mat_coef_ptr = self.coef_.ptr
 
-        cdef qn_params qnpams = self.qnparams.params
+        cdef qn_params qnpams = self.solver_model.qnparams.params
 
         if self.dtype == np.float32:
             qnFit(

diff --git a/python/cuml/solvers/qn.pyx b/python/cuml/solvers/qn.pyx
@@ -930,8 +930,15 @@ class QN(Base,
 
         if self.fit_intercept:
             self.intercept_ = self._coef_[-1]
-        else:
+            return
+
+        _num_classes_dim, _ = self.coef_.shape
+        _num_classes = self.get_num_classes(_num_classes_dim)
+
+        if _num_classes == 2:
             self.intercept_ = CumlArray.zeros(shape=1)
+        else:
+            self.intercept_ = CumlArray.zeros(shape=_num_classes)
 
     def get_param_names(self):
         return super().get_param_names() + \

diff --git a/python/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/tests/dask/test_dask_logistic_regression.py
@@ -259,7 +259,14 @@ def assert_params(
 @pytest.mark.parametrize("datatype", [np.float32])
 @pytest.mark.parametrize("delayed", [True, False])
 def test_lbfgs(
-    nrows, ncols, n_parts, fit_intercept, datatype, delayed, client
+    nrows,
+    ncols,
+    n_parts,
+    fit_intercept,
+    datatype,
+    delayed,
+    client,
+    penalty="l2",
 ):
     tolerance = 0.005
 
@@ -280,12 +287,12 @@ def imp():
 
     X_df, y_df = _prep_training_data(client, X, y, n_parts)
 
-    lr = cumlLBFGS_dask(fit_intercept=fit_intercept)
+    lr = cumlLBFGS_dask(fit_intercept=fit_intercept, penalty=penalty)
     lr.fit(X_df, y_df)
     lr_coef = lr.coef_.to_numpy()
     lr_intercept = lr.intercept_.to_numpy()
 
-    sk_model = skLR(fit_intercept=fit_intercept)
+    sk_model = skLR(fit_intercept=fit_intercept, penalty=penalty)
     sk_model.fit(X, y)
     sk_coef = sk_model.coef_
     sk_intercept = sk_model.intercept_
@@ -305,3 +312,27 @@ def imp():
     assert (accuracy_cuml >= accuracy_sk) | (
         np.abs(accuracy_cuml - accuracy_sk) < 1e-3
     )
+
+    return lr
+
+
+@pytest.mark.parametrize("fit_intercept", [False, True])
+def test_noreg(fit_intercept, client):
+    lr = test_lbfgs(
+        nrows=1e5,
+        ncols=20,
+        n_parts=23,
+        fit_intercept=fit_intercept,
+        datatype=np.float32,
+        delayed=True,
+        client=client,
+        penalty="none",
+    )
+
+    qnpams = lr.qnparams.params
+    assert qnpams["penalty_l1"] == 0.0
+    assert qnpams["penalty_l2"] == 0.0
+
+    l1_strength, l2_strength = lr._get_qn_params()
+    assert l1_strength == 0.0
+    assert l2_strength == 0.0
diff --git a/python/cuml/tests/test_linear_model.py b/python/cuml/tests/test_linear_model.py
@@ -561,6 +561,9 @@ def test_logistic_regression(
     )
     assert len(np.unique(cu_preds)) == len(np.unique(y_test))
 
+    if fit_intercept is False:
+        assert np.array_equal(culog.intercept_, sklog.intercept_)
+
 
 @given(
     dtype=floating_dtypes(sizes=(32, 64)),