From 8e59d6c85509b5256363b8af9a5598f53e625a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Tue, 20 Aug 2024 16:55:34 -0400 Subject: [PATCH] Upgrade to nltk 3.9.1 to address CVE-2024-39705 Upgrading nltk to version 3.9.1 is a BREAKING change. This change downloads `punkt_tab` instead of `punkt`, which has a critical security vulnerability (CVE-2024-39705). See e.g.: - https://github.com/advisories/GHSA-cgvx-9447-vcch - https://github.com/nltk/nltk/issues/3293 - https://github.com/nltk/nltk/issues/3266 --- requirements.txt | 2 +- valentine/algorithms/cupid/linguistic_matching.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4c70343..d2b9e9d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # algorithms numpy==1.26.0 pandas==2.1.1 -nltk==3.8.1 +nltk==3.9.1 anytree==2.10.0 networkx==3.1 chardet==5.2.0 diff --git a/valentine/algorithms/cupid/linguistic_matching.py b/valentine/algorithms/cupid/linguistic_matching.py index b353a59..73a851d 100755 --- a/valentine/algorithms/cupid/linguistic_matching.py +++ b/valentine/algorithms/cupid/linguistic_matching.py @@ -26,7 +26,7 @@ def normalization(element, try: tokens = nltk.word_tokenize(element) except LookupError: - nltk.download('punkt') + nltk.download('punkt_tab') nltk.download('omw-1.4') nltk.download('stopwords') nltk.download('wordnet') @@ -195,7 +195,7 @@ def compute_similarity_wordnet(word1, try: wn_lemmas = set(wn.all_lemma_names()) except LookupError: - nltk.download('punkt') + nltk.download('punkt_tab') nltk.download('omw-1.4') nltk.download('stopwords') nltk.download('wordnet')