Skip to content

Commit

Permalink
Fix for Categorical Naive Bayes sparse handling (rapidsai#4277)
Browse files Browse the repository at this point in the history
Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4277
  • Loading branch information
lowener authored Oct 12, 2021
1 parent cfb0a8d commit 9168938
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 16 deletions.
41 changes: 28 additions & 13 deletions python/cuml/naive_bayes/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ def partial_fit(self, X, y, classes=None,
sample_weight : array-like of shape (n_samples)
Weights applied to individual samples (1. for
unweighted). Currently sample weight is ignored
unweighted). Currently sample weight is ignored.
Returns
-------
Expand Down Expand Up @@ -804,7 +804,9 @@ def fit(self, X, y, sample_weight=None) -> "_BaseDiscreteNB":
y : array-like of shape (n_samples)
Target values.
sample_weight : array-like of shape (n_samples)
Weights applied to individual samples (1. for unweighted).
Currently sample weight is ignored.
"""
self.fit_called_ = False
return self.partial_fit(X, y, sample_weight)

def _init_counters(self, n_effective_classes, n_features, dtype):
Expand Down Expand Up @@ -990,6 +992,22 @@ class MultinomialNB(_BaseDiscreteNB):
Sets logging level. It must be one of `cuml.common.logger.level_*`.
See :ref:`verbosity-levels` for more info.
Attributes
----------
class_count_ : ndarray of shape (n_classes)
Number of samples encountered for each class during fitting.
class_log_prior_ : ndarray of shape (n_classes)
Log probability of each class (smoothed).
classes_ : ndarray of shape (n_classes,)
Class labels known to the classifier
feature_count_ : ndarray of shape (n_classes, n_features)
Number of samples encountered for each (class, feature)
during fitting.
feature_log_prob_ : ndarray of shape (n_classes, n_features)
Empirical log probability of features given a class, P(x_i|y).
n_features_ : int
Number of features of each sample.
Examples
--------
Expand Down Expand Up @@ -1127,16 +1145,14 @@ class BernoulliNB(_BaseDiscreteNB):
Attributes
----------
class_count_ : ndarray of shape (n_classes)
Number of samples encountered for each class during fitting. This
value is weighted by the sample weight when provided.
Number of samples encountered for each class during fitting.
class_log_prior_ : ndarray of shape (n_classes)
Log probability of each class (smoothed).
classes_ : ndarray of shape (n_classes,)
Class labels known to the classifier
feature_count_ : ndarray of shape (n_classes, n_features)
Number of samples encountered for each (class, feature)
during fitting. This value is weighted by the sample weight when
provided.
during fitting.
feature_log_prob_ : ndarray of shape (n_classes, n_features)
Empirical log probability of features given a class, P(x_i|y).
n_features_ : int
Expand Down Expand Up @@ -1444,14 +1460,13 @@ def _count_sparse(self, x_coo_rows, x_coo_cols, x_coo_data, x_shape, Y,
highest_feature = int(x_coo_data.max()) + 1
feature_diff = highest_feature - self.category_count_.shape[1]
# In case of a partial fit, pad the array to have the highest feature
if feature_diff > 0:
if not cp.sparse.issparse(self.category_count_):
self.category_count_ = cupyx.scipy.sparse.coo_matrix(
(self.n_features_ * n_classes, highest_feature))
else:
self.category_count_ = cupyx.scipy.sparse.coo_matrix(
self.category_count_,
shape=(self.n_features_ * n_classes, highest_feature))
if not cp.sparse.issparse(self.category_count_):
self.category_count_ = cupyx.scipy.sparse.coo_matrix(
(self.n_features_ * n_classes, highest_feature))
elif feature_diff > 0:
self.category_count_ = cupyx.scipy.sparse.coo_matrix(
self.category_count_,
shape=(self.n_features_ * n_classes, highest_feature))
highest_feature = self.category_count_.shape[1]

count_features_coo = cp.ElementwiseKernel(
Expand Down
6 changes: 3 additions & 3 deletions python/cuml/test/test_naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,10 @@ def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse,

X, y = nlp_20news
model = GaussianNB()
n_rows = 1000

n_rows = 500
n_cols = int(2e5)
X = sparse_scipy_to_cp(X, x_dtype)
X = X.tocsr()[:n_rows]
X = X.tocsr()[:n_rows, :n_cols]

if is_sparse:
y = y.astype(y_dtype)[:n_rows]
Expand Down

0 comments on commit 9168938

Please sign in to comment.