From 3aa14be4c6b29bc62bfa03e30c1d584ce4d4f096 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Sat, 10 Sep 2022 16:17:26 -0700 Subject: [PATCH 1/2] Update _tfidf.py I need a word-level n-gram option (use_word_grams) since I'm checking similarities between short texts. Maybe this option is useful for others! --- polyfuzz/models/_tfidf.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py index e065f11..aa8b308 100644 --- a/polyfuzz/models/_tfidf.py +++ b/polyfuzz/models/_tfidf.py @@ -51,7 +51,8 @@ def __init__(self, min_similarity: float = 0.75, top_n: int = 1, cosine_method: str = "sparse", - model_id: str = None): + model_id: str = None, + use_word_grams: bool = False): super().__init__(model_id) self.type = "TF-IDF" self.n_gram_range = n_gram_range @@ -61,6 +62,7 @@ def __init__(self, self.top_n = top_n self.vectorizer = None self.tf_idf_to = None + self.use_word_grams = use_word_grams def match(self, from_list: List[str], @@ -125,10 +127,16 @@ def _create_ngrams(self, string: str) -> List[str]: string = _clean_string(string) result = [] - for n in range(self.n_gram_range[0], self.n_gram_range[1]+1): - ngrams = zip(*[string[i:] for i in range(n)]) + if self.use_word_grams: + tokens = [token for token in string.split(" ") if token != ""] + ngrams = zip(*[tokens[i:] for i in range(n)]) ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram] result.extend(ngrams) + else: + for n in range(self.n_gram_range[0], self.n_gram_range[1]+1): + ngrams = zip(*[string[i:] for i in range(n)]) + ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram] + result.extend(ngrams) return result From 68315cdeb5da16ccbb0e380b00b026c17ae94285 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Sat, 10 Sep 2022 16:28:57 -0700 Subject: [PATCH 2/2] Update _tfidf.py Add the loop over n_gram_range that was missing from the word-gram branch. --- polyfuzz/models/_tfidf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/polyfuzz/models/_tfidf.py b/polyfuzz/models/_tfidf.py index aa8b308..243b1ba 100644 --- a/polyfuzz/models/_tfidf.py +++ b/polyfuzz/models/_tfidf.py @@ -129,9 +129,10 @@ def _create_ngrams(self, string: str) -> List[str]: result = [] if self.use_word_grams: tokens = [token for token in string.split(" ") if token != ""] - ngrams = zip(*[tokens[i:] for i in range(n)]) - ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram] - result.extend(ngrams) + for n in range(self.n_gram_range[0], self.n_gram_range[1]+1): + ngrams = zip(*[tokens[i:] for i in range(n)]) + ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram] + result.extend(ngrams) else: for n in range(self.n_gram_range[0], self.n_gram_range[1]+1): ngrams = zip(*[string[i:] for i in range(n)])