From 8e59d6c85509b5256363b8af9a5598f53e625a26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Tue, 20 Aug 2024 16:55:34 -0400
Subject: [PATCH] Upgrade to nltk 3.9.1 to address CVE-2024-39705

Upgrading nltk to version 3.9.1 is a BREAKING change. This change
downloads `punkt_tab` instead of `punkt`, because `punkt` has a critical
security vulnerability (CVE-2024-39705).
See e.g.:
- https://github.com/advisories/GHSA-cgvx-9447-vcch
- https://github.com/nltk/nltk/issues/3293
- https://github.com/nltk/nltk/issues/3266
---
 requirements.txt                                  | 2 +-
 valentine/algorithms/cupid/linguistic_matching.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 4c70343..d2b9e9d 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # algorithms
 numpy==1.26.0
 pandas==2.1.1
-nltk==3.8.1
+nltk==3.9.1
 anytree==2.10.0
 networkx==3.1
 chardet==5.2.0
diff --git a/valentine/algorithms/cupid/linguistic_matching.py b/valentine/algorithms/cupid/linguistic_matching.py
index b353a59..73a851d 100755
--- a/valentine/algorithms/cupid/linguistic_matching.py
+++ b/valentine/algorithms/cupid/linguistic_matching.py
@@ -26,7 +26,7 @@ def normalization(element,
     try:
         tokens = nltk.word_tokenize(element)
     except LookupError:
-        nltk.download('punkt')
+        nltk.download('punkt_tab')
         nltk.download('omw-1.4')
         nltk.download('stopwords')
         nltk.download('wordnet')
@@ -195,7 +195,7 @@ def compute_similarity_wordnet(word1,
     try:
         wn_lemmas = set(wn.all_lemma_names())
     except LookupError:
-        nltk.download('punkt')
+        nltk.download('punkt_tab')
         nltk.download('omw-1.4')
         nltk.download('stopwords')
         nltk.download('wordnet')