Vectorizers to accept Pandas Series as input (rapidsai#4811)
Resolves rapidsai#3403

This PR adds support for using `pandas.Series` as an input to `TfidfVectorizer`, `HashingVectorizer` and `CountVectorizer`.
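
For example, after this change a host-side pandas Series of documents can be passed straight to the GPU vectorizers. A minimal usage sketch (illustrative corpus and variable names; assumes the public `cuml.feature_extraction.text` import path, which mirrors scikit-learn's API):

```python
import pandas as pd
from cuml.feature_extraction.text import TfidfVectorizer

# Illustrative corpus; any pandas Series of strings works after this PR.
docs = pd.Series([
    "gpu accelerated text vectorization",
    "pandas series go straight to the gpu vectorizer",
])

vec = TfidfVectorizer()
X = vec.fit_transform(docs)  # no manual cudf.Series conversion needed
print(X.shape)               # sparse matrix on the GPU (CuPy-backed)
```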

Authors:
  - Shaswat Anand (https://github.com/shaswat-indian)
  - Ray Douglass (https://github.com/raydouglass)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4811
shaswat-indian authored Jul 29, 2022
1 parent f1b5f40 commit e6f7bfb
Showing 3 changed files with 33 additions and 15 deletions.
6 changes: 3 additions & 3 deletions python/cuml/feature_extraction/_tfidf_vectorizer.py
@@ -206,7 +206,7 @@ def fit(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -225,7 +225,7 @@ def fit_transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -246,7 +246,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
22 changes: 10 additions & 12 deletions python/cuml/feature_extraction/_vectorizers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 from cuml.common.type_utils import CUPY_SPARSE_DTYPES
 from cudf.utils.dtypes import min_signed_type
 import cuml.common.logger as logger
+import pandas as pd
 
 
 def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
@@ -35,7 +36,7 @@ def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
 
     Parameters
     ----------
-    doc: cudf.Series[str]
+    doc: cudf.Series[str] or pd.Series[str]
        The string to preprocess
     lower: bool
        Whether to use str.lower to lowercase all of the text
@@ -49,6 +50,8 @@ def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
     doc: cudf.Series[str]
        preprocessed string
     """
+    if isinstance(doc, pd.Series):
+        doc = Series(doc)
     if lower:
         doc = doc.str.lower()
     if remove_non_alphanumeric:
@@ -513,7 +516,7 @@ def fit(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -533,7 +536,7 @@ def fit_transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -591,7 +594,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -825,14 +828,9 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : cudf.Series
+        X : cudf.Series or pd.Series
            A Series of string documents
         """
-        if not (
-            isinstance(X, cudf.Series)
-            and isinstance(X._column, cudf.core.column.StringColumn)
-        ):
-            raise ValueError(f"cudf.Series([str]) expected ,got {type(X)}")
         self._warn_for_unused_params()
         self._validate_params()
         return self
@@ -896,7 +894,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
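
For reference, the substantive change in `_vectorizers.py` is small: `_preprocess` now promotes a `pandas.Series` input to a `cudf.Series` before any string processing, and the strict `cudf.Series` type check in `fit` is removed. A minimal sketch of that conversion pattern in isolation (illustrative variable names, not the PR's code verbatim):

```python
import cudf
import pandas as pd

# Host-side pandas strings are promoted to a GPU-backed cudf Series,
# after which the existing cudf string methods (str.lower, tokenization, ...)
# work unchanged.
pd_docs = pd.Series(["one document", "another document"])

if isinstance(pd_docs, pd.Series):
    gpu_docs = cudf.Series(pd_docs)  # copies the strings to the GPU
else:
    gpu_docs = pd_docs

print(type(gpu_docs))  # cudf.core.series.Series
```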
20 changes: 20 additions & 0 deletions python/cuml/tests/test_text_feature_extraction.py
@@ -25,6 +25,7 @@
 from cudf import Series
 from numpy.testing import assert_array_equal
 import numpy as np
+import pandas as pd
 
 
 def test_count_vectorizer():
@@ -530,3 +531,22 @@ def test_hashingvectorizer_delimiter():
         preprocessor=lambda s: s,
     ).fit_transform(corpus)
     assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
+
+
+@pytest.mark.parametrize('vectorizer', ['tfidf', 'hash_vec', 'count_vec'])
+def test_vectorizer_with_pandas_series(vectorizer):
+    corpus = [
+        "This Is DoC",
+        "this DoC is the second DoC.",
+        "And this document is the third one.",
+        "and Is this the first document?",
+    ]
+    cuml_vec, sklearn_vec = {
+        'tfidf': (TfidfVectorizer, SkTfidfVect),
+        'hash_vec': (HashingVectorizer, SkHashVect),
+        'count_vec': (CountVectorizer, SkCountVect)
+    }[vectorizer]
+    raw_documents = pd.Series(corpus)
+    res = cuml_vec().fit_transform(raw_documents)
+    ref = sklearn_vec().fit_transform(raw_documents)
+    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
