Vectorizers to accept Pandas Series as input (rapidsai#4811)
Resolves rapidsai#3403

This PR adds support for using `pandas.Series` as an input to `TfidfVectorizer`, `HashingVectorizer` and `CountVectorizer`.
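
For example, after this change a host-side pandas Series of documents can be passed straight to the GPU vectorizers. A minimal usage sketch (illustrative corpus and variable names; assumes the public `cuml.feature_extraction.text` import path, which mirrors scikit-learn's API):

```python
import pandas as pd
from cuml.feature_extraction.text import TfidfVectorizer

# Illustrative corpus; any pandas Series of strings works after this PR.
docs = pd.Series([
    "gpu accelerated text vectorization",
    "pandas series go straight to the gpu vectorizer",
])

vec = TfidfVectorizer()
X = vec.fit_transform(docs)  # no manual cudf.Series conversion needed
print(X.shape)               # sparse matrix on the GPU (CuPy-backed)
```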

Authors:
  - Shaswat Anand (https://github.com/shaswat-indian)
  - Ray Douglass (https://github.com/raydouglass)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4811
shaswat-indian authored Jul 29, 2022
1 parent f1b5f40 commit e6f7bfb
Showing 3 changed files with 33 additions and 15 deletions.
6 changes: 3 additions & 3 deletions python/cuml/feature_extraction/_tfidf_vectorizer.py
@@ -206,7 +206,7 @@ def fit(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -225,7 +225,7 @@ def fit_transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -246,7 +246,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
22 changes: 10 additions & 12 deletions python/cuml/feature_extraction/_vectorizers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 from cuml.common.type_utils import CUPY_SPARSE_DTYPES
 from cudf.utils.dtypes import min_signed_type
 import cuml.common.logger as logger
+import pandas as pd
 
 
 def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
@@ -35,7 +36,7 @@ def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
 
     Parameters
     ----------
-    doc: cudf.Series[str]
+    doc: cudf.Series[str] or pd.Series[str]
        The string to preprocess
     lower: bool
        Whether to use str.lower to lowercase all of the text
@@ -49,6 +50,8 @@ def _preprocess(doc, lower=False, remove_non_alphanumeric=False, delimiter=" ",
     doc: cudf.Series[str]
        preprocessed string
     """
+    if isinstance(doc, pd.Series):
+        doc = Series(doc)
     if lower:
         doc = doc.str.lower()
     if remove_non_alphanumeric:
@@ -513,7 +516,7 @@ def fit(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -533,7 +536,7 @@ def fit_transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -591,7 +594,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
@@ -825,14 +828,9 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : cudf.Series
+        X : cudf.Series or pd.Series
            A Series of string documents
         """
-        if not (
-            isinstance(X, cudf.Series)
-            and isinstance(X._column, cudf.core.column.StringColumn)
-        ):
-            raise ValueError(f"cudf.Series([str]) expected ,got {type(X)}")
         self._warn_for_unused_params()
         self._validate_params()
         return self
@@ -896,7 +894,7 @@ def transform(self, raw_documents):
 
         Parameters
         ----------
-        raw_documents : cudf.Series
+        raw_documents : cudf.Series or pd.Series
            A Series of string documents
 
         Returns
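
For reference, the substantive change in `_vectorizers.py` is small: `_preprocess` now promotes a `pandas.Series` input to a `cudf.Series` before any string processing, and the strict `cudf.Series` type check in `fit` is removed. A minimal sketch of that conversion pattern in isolation (illustrative variable names, not the PR's code verbatim):

```python
import cudf
import pandas as pd

# Host-side pandas strings are promoted to a GPU-backed cudf Series,
# after which the existing cudf string methods (str.lower, tokenization, ...)
# work unchanged.
pd_docs = pd.Series(["one document", "another document"])

if isinstance(pd_docs, pd.Series):
    gpu_docs = cudf.Series(pd_docs)  # copies the strings to the GPU
else:
    gpu_docs = pd_docs

print(type(gpu_docs))  # cudf.core.series.Series
```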
20 changes: 20 additions & 0 deletions python/cuml/tests/test_text_feature_extraction.py
@@ -25,6 +25,7 @@
 from cudf import Series
 from numpy.testing import assert_array_equal
 import numpy as np
+import pandas as pd
 
 
 def test_count_vectorizer():
@@ -530,3 +531,22 @@ def test_hashingvectorizer_delimiter():
         preprocessor=lambda s: s,
     ).fit_transform(corpus)
     assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
+
+
+@pytest.mark.parametrize('vectorizer', ['tfidf', 'hash_vec', 'count_vec'])
+def test_vectorizer_with_pandas_series(vectorizer):
+    corpus = [
+        "This Is DoC",
+        "this DoC is the second DoC.",
+        "And this document is the third one.",
+        "and Is this the first document?",
+    ]
+    cuml_vec, sklearn_vec = {
+        'tfidf': (TfidfVectorizer, SkTfidfVect),
+        'hash_vec': (HashingVectorizer, SkHashVect),
+        'count_vec': (CountVectorizer, SkCountVect)
+    }[vectorizer]
+    raw_documents = pd.Series(corpus)
+    res = cuml_vec().fit_transform(raw_documents)
+    ref = sklearn_vec().fit_transform(raw_documents)
+    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
