From 061ca6b05891380fccd67675ac825764eb0f90b5 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 15 Oct 2021 14:26:11 +0200 Subject: [PATCH 1/5] shorter code in core.py --- justext/core.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/justext/core.py b/justext/core.py index 7cb6895..fb4316d 100644 --- a/justext/core.py +++ b/justext/core.py @@ -309,12 +309,11 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING Context-sensitive paragraph classification. Assumes that classify_pragraphs has already been called. """ - # copy classes - for paragraph in paragraphs: - paragraph.class_type = paragraph.cf_class # good headings for i, paragraph in enumerate(paragraphs): + # copy classes + paragraph.class_type = paragraph.cf_class if not (paragraph.heading and paragraph.class_type == 'short'): continue j = i + 1 @@ -333,10 +332,9 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING continue prev_neighbour = get_prev_neighbour(i, paragraphs, ignore_neargood=True) next_neighbour = get_next_neighbour(i, paragraphs, ignore_neargood=True) - neighbours = {prev_neighbour, next_neighbour} - if neighbours == {'good'}: + if prev_neighbour == 'good' and next_neighbour == 'good': new_classes[i] = 'good' - elif neighbours == {'bad'}: + elif prev_neighbour == 'bad' and next_neighbour == 'bad': new_classes[i] = 'bad' # it must be set(['good', 'bad']) elif (prev_neighbour == 'bad' and get_prev_neighbour(i, paragraphs, ignore_neargood=False) == 'neargood') or \ From e03e8b41ef5755b697405da74758ed02a35e4dc2 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 15 Oct 2021 14:33:55 +0200 Subject: [PATCH 2/5] shorter code in paragraph.py --- justext/paragraph.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/justext/paragraph.py b/justext/paragraph.py index f246dc1..0a9ab04 100644 --- a/justext/paragraph.py +++ b/justext/paragraph.py @@ -8,6 +8,9 @@ from .utils import normalize_whitespace +HEADINGS_PATTERN = re.compile(r"\bh\d\b") + + class Paragraph(object): """Object representing one block of text in HTML.""" def __init__(self, path): @@ -19,7 +22,7 @@ def __init__(self, path): @property def is_heading(self): - return bool(re.search(r"\bh\d\b", self.dom_path)) + return bool(HEADINGS_PATTERN.search(self.dom_path)) @property def is_boilerplate(self): @@ -46,20 +49,13 @@ def append_text(self, text): return text def stopwords_count(self, stopwords): - count = 0 - - for word in self.text.split(): - if word.lower() in stopwords: - count += 1 - - return count + return sum([word.lower() in stopwords for word in self.text.split()]) def stopwords_density(self, stopwords): - words_count = self.words_count - if words_count == 0: + if self.words_count == 0: return 0 - return self.stopwords_count(stopwords) / words_count + return self.stopwords_count(stopwords) / self.words_count def links_density(self): text_length = len(self.text) From 0d981aaa273c347efef088d8f83dcee504eb9821 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 15 Oct 2021 14:48:20 +0200 Subject: [PATCH 3/5] unnecessary else after return in utils.py --- justext/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/justext/utils.py b/justext/utils.py index fef77d1..8b4af4c 100644 --- a/justext/utils.py +++ b/justext/utils.py @@ -25,8 +25,7 @@ def _replace_whitespace(match): if "\n" in text or "\r" in text: return "\n" - else: - return " " + return " " def is_blank(string): From 9477a82c0b7ff2486a8b778193b84da5866b9d4f Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 18 Oct 2021 13:34:38 +0200 Subject: [PATCH 4/5] generator expression without list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mišo Belica --- justext/paragraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justext/paragraph.py b/justext/paragraph.py index 0a9ab04..b07d613 100644 --- a/justext/paragraph.py +++ b/justext/paragraph.py @@ -49,7 +49,7 @@ def append_text(self, text): return text def stopwords_count(self, stopwords): - return sum([word.lower() in stopwords for word in self.text.split()]) + return sum(word.lower() in stopwords for word in self.text.split()) def stopwords_density(self, stopwords): if self.words_count == 0: From c4ccca2768ce9a56f2831a4b5374549a46bc7283 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 18 Oct 2021 13:39:03 +0200 Subject: [PATCH 5/5] implement suggestion: use one-liner in _replace_whitespace --- justext/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/justext/utils.py b/justext/utils.py index 8b4af4c..42e5074 100644 --- a/justext/utils.py +++ b/justext/utils.py @@ -21,11 +21,9 @@ def normalize_whitespace(text): def _replace_whitespace(match): + """Normalize all spacing characters that aren't a newline to a space.""" text = match.group() - - if "\n" in text or "\r" in text: - return "\n" - return " " + return "\n" if "\n" in text or "\r" in text else " " def is_blank(string):