From f9e599a6a194774b6e4c89924afe6d7e99928950 Mon Sep 17 00:00:00 2001
From: CMG203 <cmg203@203LM-2024-02.lan>
Date: Sat, 16 Nov 2024 20:05:27 +0100
Subject: [PATCH] feat: Add option to remove default stopwords from word
 summary

---
 requirements-dev.txt                          |  1 +
 requirements-test.txt                         |  3 +-
 .../pandas/describe_categorical_pandas.py     | 42 ++++++++++++++-----
 .../test_describe_categorical_pandas.py       | 37 +++++++++++++---
 4 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index d727c6d48..7a3152283 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,4 @@ sphinx_rtd_theme>=0.4.3
 sphinx-autodoc-typehints>=1.10.3
 sphinx-multiversion>=0.2.3
 autodoc_pydantic
+nltk
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index f190e6cd1..ad3bd8a36 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,4 +6,5 @@ pytest-spark
 nbval
 pyarrow
 twine>=3.1.1
-kaggle
\ No newline at end of file
+kaggle
+nltk
\ No newline at end of file
diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
index 31ae57417..237a77e1a 100644
--- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -5,6 +5,8 @@
 
 import numpy as np
 import pandas as pd
+from nltk.corpus import stopwords
+import nltk
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
@@ -151,41 +153,61 @@ def unicode_summary_vc(vc: pd.Series) -> dict:
     return summary
 
 
-def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
+def word_summary_vc(
+    vc: pd.Series,
+    stop_words: List[str] = [],
+    remove_default_stopwords: bool = True,
+    keep_stopwords: List[str] = [],
+) -> dict:
     """Count the number of occurrences of each individual word across
     all lines of the data Series, then sort from the word with the most
     occurrences to the word with the least occurrences. If a list of
-    stop words is given, they will be ignored.
+    stop words is given, they will be ignored, along with default
+    English stopwords if remove_default_stopwords is True.
 
     Args:
         vc: Series containing all unique categories as index and their
             frequency as value. Sorted from the most frequent down.
         stop_words: List of stop words to ignore, empty by default.
+        remove_default_stopwords: Boolean flag to decide if default
+            English stopwords should be removed, default is True.
+        keep_stopwords: List of stop words to keep, even if they are
+            part of the default or custom stop words.
 
     Returns:
         A dict containing the results as a Series with unique words as
-        index and the computed frequency as value
+        index and the computed frequency as value.
+
+    Raises:
+        LookupError: If the NLTK stop word corpus cannot be loaded.
     """
-    # TODO: configurable lowercase/punctuation etc.
-    # TODO: remove punctuation in words
+    # Words are lowercased before counting, so lowercase the stop words too.
+    ignored = {word.lower() for word in stop_words}
+
+    if remove_default_stopwords:
+        try:
+            ignored.update(stopwords.words("english"))
+        except LookupError:
+            # Fetch the corpus on first use instead of at import time.
+            nltk.download("stopwords", quiet=True)
+            ignored.update(stopwords.words("english"))
+
+    ignored -= {word.lower() for word in keep_stopwords}
 
     series = pd.Series(vc.index, index=vc)
     word_lists = series.str.lower().str.split()
     words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
     word_counts = pd.Series(words.index, index=words)
-    # fix for pandas 1.0.5
+    # Drop null words (fix for pandas 1.0.5)
     word_counts = word_counts[word_counts.index.notnull()]
     word_counts = word_counts.groupby(level=0, sort=False).sum()
     word_counts = word_counts.sort_values(ascending=False)
 
-    # Remove stop words
-    if len(stop_words) > 0:
-        stop_words = [x.lower() for x in stop_words]
-        word_counts = word_counts.loc[~word_counts.index.isin(stop_words)]
+    word_counts = word_counts.loc[~word_counts.index.isin(ignored)]
 
     return {"word_counts": word_counts} if not word_counts.empty else {}
 
 
 def length_summary_vc(vc: pd.Series) -> dict:
     series = pd.Series(vc.index, index=vc)
     length = series.str.len()
diff --git a/tests/unit/test_pandas/test_describe_categorical_pandas.py b/tests/unit/test_pandas/test_describe_categorical_pandas.py
index 4cb7b12f6..51a9593dd 100644
--- a/tests/unit/test_pandas/test_describe_categorical_pandas.py
+++ b/tests/unit/test_pandas/test_describe_categorical_pandas.py
@@ -1,23 +1,64 @@
 import pandas as pd
 import pytest
 
 from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc
 
 value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1])
 
 
+# Without stop-word filtering every word is counted.
 def test_word_summary_vc():
     assert (
-        word_summary_vc(vc=value_counts_w_words)["word_counts"].to_dict()
+        word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=False)[
+            "word_counts"
+        ].to_dict()
         == pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1]).to_dict()
     )
 
 
+# Custom stop words are matched case-insensitively ("The" removes "the").
 @pytest.mark.parametrize("stop_words", [["The"], ["the", "a"]])
 def test_word_summary_vc_with_stop_words(stop_words):
     assert (
-        word_summary_vc(vc=value_counts_w_words, stop_words=stop_words)[
-            "word_counts"
-        ].to_dict()
+        word_summary_vc(
+            vc=value_counts_w_words,
+            stop_words=stop_words,
+            remove_default_stopwords=False,
+        )["word_counts"].to_dict()
         == pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
     )
+
+
+# Default English stop words ("the", "is") are dropped when enabled.
+def test_word_summary_vc_with_default_stopwords():
+    result = word_summary_vc(vc=value_counts_w_words, remove_default_stopwords=True)
+    assert result["word_counts"].to_dict() == {"dog": 2, "hungry": 1}
+
+
+# Custom stop words are applied on top of the default English ones.
+@pytest.mark.parametrize(
+    "stop_words, expected",
+    [
+        # "dog" is a custom stop word; "the" and "is" are default ones.
+        (["dog"], {"hungry": 1}),
+        # "the" and "is" are both custom and default stop words.
+        (["the", "is"], {"dog": 2, "hungry": 1}),
+    ],
+)
+def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected):
+    result = word_summary_vc(
+        vc=value_counts_w_words,
+        stop_words=stop_words,
+        remove_default_stopwords=True,
+    )
+    assert result["word_counts"].to_dict() == expected
+
+
+# Words listed in keep_stopwords are counted even if they are stop words.
+def test_word_summary_vc_with_keep_stopwords():
+    result = word_summary_vc(
+        vc=value_counts_w_words,
+        remove_default_stopwords=True,
+        keep_stopwords=["is"],
+    )
+    assert result["word_counts"].to_dict() == {"dog": 2, "is": 1, "hungry": 1}