Fix for Categorical Naive Bayes sparse handling (rapidsai#4277)

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4277
vimarsh6739 · Oct 12, 2021 · 9168938 · 9168938
1 parent cfb0a8d
commit 9168938
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 16 deletions.
diff --git a/python/cuml/naive_bayes/naive_bayes.py b/python/cuml/naive_bayes/naive_bayes.py
@@ -720,7 +720,7 @@ def partial_fit(self, X, y, classes=None,
 
         sample_weight : array-like of shape (n_samples)
                         Weights applied to individual samples (1. for
-                        unweighted). Currently sample weight is ignored
+                        unweighted). Currently sample weight is ignored.
 
         Returns
         -------
@@ -804,7 +804,9 @@ def fit(self, X, y, sample_weight=None) -> "_BaseDiscreteNB":
         y : array-like shape (n_samples) Target values.
         sample_weight : array-like of shape (n_samples)
             Weights applied to individual samples (1. for unweighted).
+            Currently sample weight is ignored.
         """
+        self.fit_called_ = False
         return self.partial_fit(X, y, sample_weight)
 
     def _init_counters(self, n_effective_classes, n_features, dtype):
@@ -990,6 +992,22 @@ class MultinomialNB(_BaseDiscreteNB):
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
         See :ref:`verbosity-levels` for more info.
 
+    Attributes
+    ----------
+    class_count_ : ndarray of shape (n_classes)
+        Number of samples encountered for each class during fitting.
+    class_log_prior_ : ndarray of shape (n_classes)
+        Log probability of each class (smoothed).
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier
+    feature_count_ : ndarray of shape (n_classes, n_features)
+        Number of samples encountered for each (class, feature)
+        during fitting.
+    feature_log_prob_ : ndarray of shape (n_classes, n_features)
+        Empirical log probability of features given a class, P(x_i|y).
+    n_features_ : int
+        Number of features of each sample.
+
     Examples
     --------
 
@@ -1127,16 +1145,14 @@ class BernoulliNB(_BaseDiscreteNB):
     Attributes
     ----------
     class_count_ : ndarray of shape (n_classes)
-        Number of samples encountered for each class during fitting. This
-        value is weighted by the sample weight when provided.
+        Number of samples encountered for each class during fitting.
     class_log_prior_ : ndarray of shape (n_classes)
         Log probability of each class (smoothed).
     classes_ : ndarray of shape (n_classes,)
         Class labels known to the classifier
     feature_count_ : ndarray of shape (n_classes, n_features)
         Number of samples encountered for each (class, feature)
-        during fitting. This value is weighted by the sample weight when
-        provided.
+        during fitting.
     feature_log_prob_ : ndarray of shape (n_classes, n_features)
         Empirical log probability of features given a class, P(x_i|y).
     n_features_ : int
@@ -1444,14 +1460,13 @@ def _count_sparse(self, x_coo_rows, x_coo_cols, x_coo_data, x_shape, Y,
         highest_feature = int(x_coo_data.max()) + 1
         feature_diff = highest_feature - self.category_count_.shape[1]
         # In case of a partial fit, pad the array to have the highest feature
-        if feature_diff > 0:
-            if not cp.sparse.issparse(self.category_count_):
-                self.category_count_ = cupyx.scipy.sparse.coo_matrix(
-                    (self.n_features_ * n_classes, highest_feature))
-            else:
-                self.category_count_ = cupyx.scipy.sparse.coo_matrix(
-                    self.category_count_,
-                    shape=(self.n_features_ * n_classes, highest_feature))
+        if not cp.sparse.issparse(self.category_count_):
+            self.category_count_ = cupyx.scipy.sparse.coo_matrix(
+                (self.n_features_ * n_classes, highest_feature))
+        elif feature_diff > 0:
+            self.category_count_ = cupyx.scipy.sparse.coo_matrix(
+                self.category_count_,
+                shape=(self.n_features_ * n_classes, highest_feature))
         highest_feature = self.category_count_.shape[1]
 
         count_features_coo = cp.ElementwiseKernel(

diff --git a/python/cuml/test/test_naive_bayes.py b/python/cuml/test/test_naive_bayes.py
@@ -359,10 +359,10 @@ def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse,
 
     X, y = nlp_20news
     model = GaussianNB()
-    n_rows = 1000
-
+    n_rows = 500
+    n_cols = int(2e5)
     X = sparse_scipy_to_cp(X, x_dtype)
-    X = X.tocsr()[:n_rows]
+    X = X.tocsr()[:n_rows, :n_cols]
 
     if is_sparse:
         y = y.astype(y_dtype)[:n_rows]