From 061ca6b05891380fccd67675ac825764eb0f90b5 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Fri, 15 Oct 2021 14:26:11 +0200
Subject: [PATCH 1/5] shorter code in core.py

---
 justext/core.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/justext/core.py b/justext/core.py
index 7cb6895..fb4316d 100644
--- a/justext/core.py
+++ b/justext/core.py
@@ -309,12 +309,11 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING
     Context-sensitive paragraph classification. Assumes that classify_pragraphs
     has already been called.
     """
-    # copy classes
-    for paragraph in paragraphs:
-        paragraph.class_type = paragraph.cf_class
 
     # good headings
     for i, paragraph in enumerate(paragraphs):
+        # copy classes; NOTE(review): later paragraphs (j > i) are still uncopied here
+        paragraph.class_type = paragraph.cf_class
         if not (paragraph.heading and paragraph.class_type == 'short'):
             continue
         j = i + 1
@@ -333,10 +332,9 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING
             continue
         prev_neighbour = get_prev_neighbour(i, paragraphs, ignore_neargood=True)
         next_neighbour = get_next_neighbour(i, paragraphs, ignore_neargood=True)
-        neighbours = {prev_neighbour, next_neighbour}
-        if neighbours == {'good'}:
+        if prev_neighbour == 'good' and next_neighbour == 'good':
             new_classes[i] = 'good'
-        elif neighbours == {'bad'}:
+        elif prev_neighbour == 'bad' and next_neighbour == 'bad':
             new_classes[i] = 'bad'
         # it must be set(['good', 'bad'])
         elif (prev_neighbour == 'bad' and get_prev_neighbour(i, paragraphs, ignore_neargood=False) == 'neargood') or \

From e03e8b41ef5755b697405da74758ed02a35e4dc2 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Fri, 15 Oct 2021 14:33:55 +0200
Subject: [PATCH 2/5] shorter code in paragraph.py

---
 justext/paragraph.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/justext/paragraph.py b/justext/paragraph.py
index f246dc1..0a9ab04 100644
--- a/justext/paragraph.py
+++ b/justext/paragraph.py
@@ -8,6 +8,9 @@
 from .utils import normalize_whitespace
 
 
+HEADINGS_PATTERN = re.compile(r"\bh\d\b")
+
+
 class Paragraph(object):
     """Object representing one block of text in HTML."""
     def __init__(self, path):
@@ -19,7 +22,7 @@ def __init__(self, path):
 
     @property
     def is_heading(self):
-        return bool(re.search(r"\bh\d\b", self.dom_path))
+        return bool(HEADINGS_PATTERN.search(self.dom_path))
 
     @property
     def is_boilerplate(self):
@@ -46,20 +49,13 @@ def append_text(self, text):
         return text
 
     def stopwords_count(self, stopwords):
-        count = 0
-
-        for word in self.text.split():
-            if word.lower() in stopwords:
-                count += 1
-
-        return count
+        return sum([word.lower() in stopwords for word in self.text.split()])
 
     def stopwords_density(self, stopwords):
-        words_count = self.words_count
-        if words_count == 0:
+        if self.words_count == 0:
             return 0
 
-        return self.stopwords_count(stopwords) / words_count
+        return self.stopwords_count(stopwords) / self.words_count
 
     def links_density(self):
         text_length = len(self.text)

From 0d981aaa273c347efef088d8f83dcee504eb9821 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Fri, 15 Oct 2021 14:48:20 +0200
Subject: [PATCH 3/5] remove unnecessary else after return in utils.py

---
 justext/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/justext/utils.py b/justext/utils.py
index fef77d1..8b4af4c 100644
--- a/justext/utils.py
+++ b/justext/utils.py
@@ -25,8 +25,7 @@ def _replace_whitespace(match):
 
     if "\n" in text or "\r" in text:
         return "\n"
-    else:
-        return " "
+    return " "
 
 
 def is_blank(string):

From 9477a82c0b7ff2486a8b778193b84da5866b9d4f Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Mon, 18 Oct 2021 13:34:38 +0200
Subject: [PATCH 4/5] use generator expression instead of list in sum()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Mišo Belica <miso.belica@gmail.com>
---
 justext/paragraph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/justext/paragraph.py b/justext/paragraph.py
index 0a9ab04..b07d613 100644
--- a/justext/paragraph.py
+++ b/justext/paragraph.py
@@ -49,7 +49,7 @@ def append_text(self, text):
         return text
 
     def stopwords_count(self, stopwords):
-        return sum([word.lower() in stopwords for word in self.text.split()])
+        return sum(word.lower() in stopwords for word in self.text.split())
 
     def stopwords_density(self, stopwords):
         if self.words_count == 0:

From c4ccca2768ce9a56f2831a4b5374549a46bc7283 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Mon, 18 Oct 2021 13:39:03 +0200
Subject: [PATCH 5/5] implement suggestion: use one-liner in
 _replace_whitespace

---
 justext/utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/justext/utils.py b/justext/utils.py
index 8b4af4c..42e5074 100644
--- a/justext/utils.py
+++ b/justext/utils.py
@@ -21,11 +21,9 @@ def normalize_whitespace(text):
 
 
 def _replace_whitespace(match):
+    """Replace a whitespace match with a newline if it contains a line break, else a space."""
     text = match.group()
-
-    if "\n" in text or "\r" in text:
-        return "\n"
-    return " "
+    return "\n" if "\n" in text or "\r" in text else " "
 
 
 def is_blank(string):