diff --git a/justext/core.py b/justext/core.py index 7cb6895..fb4316d 100644 --- a/justext/core.py +++ b/justext/core.py @@ -309,12 +309,11 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING Context-sensitive paragraph classification. Assumes that classify_pragraphs has already been called. """ - # copy classes - for paragraph in paragraphs: - paragraph.class_type = paragraph.cf_class # good headings for i, paragraph in enumerate(paragraphs): + # copy classes + paragraph.class_type = paragraph.cf_class if not (paragraph.heading and paragraph.class_type == 'short'): continue j = i + 1 @@ -333,10 +332,9 @@ def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING continue prev_neighbour = get_prev_neighbour(i, paragraphs, ignore_neargood=True) next_neighbour = get_next_neighbour(i, paragraphs, ignore_neargood=True) - neighbours = {prev_neighbour, next_neighbour} - if neighbours == {'good'}: + if prev_neighbour == 'good' and next_neighbour == 'good': new_classes[i] = 'good' - elif neighbours == {'bad'}: + elif prev_neighbour == 'bad' and next_neighbour == 'bad': new_classes[i] = 'bad' # it must be set(['good', 'bad']) elif (prev_neighbour == 'bad' and get_prev_neighbour(i, paragraphs, ignore_neargood=False) == 'neargood') or \ diff --git a/justext/paragraph.py b/justext/paragraph.py index f246dc1..b07d613 100644 --- a/justext/paragraph.py +++ b/justext/paragraph.py @@ -8,6 +8,9 @@ from .utils import normalize_whitespace +HEADINGS_PATTERN = re.compile(r"\bh\d\b") + + class Paragraph(object): """Object representing one block of text in HTML.""" def __init__(self, path): @@ -19,7 +22,7 @@ def __init__(self, path): @property def is_heading(self): - return bool(re.search(r"\bh\d\b", self.dom_path)) + return bool(HEADINGS_PATTERN.search(self.dom_path)) @property def is_boilerplate(self): @@ -46,20 +49,13 @@ def append_text(self, text): return text def stopwords_count(self, stopwords): - count = 0 - - for word in self.text.split(): - if word.lower() in stopwords: - count += 1 - - return count + return sum(word.lower() in stopwords for word in self.text.split()) def stopwords_density(self, stopwords): - words_count = self.words_count - if words_count == 0: + if self.words_count == 0: return 0 - return self.stopwords_count(stopwords) / words_count + return self.stopwords_count(stopwords) / self.words_count def links_density(self): text_length = len(self.text) diff --git a/justext/utils.py b/justext/utils.py index fef77d1..42e5074 100644 --- a/justext/utils.py +++ b/justext/utils.py @@ -21,12 +21,9 @@ def normalize_whitespace(text): def _replace_whitespace(match): + """Normalize all spacing characters that aren't a newline to a space.""" text = match.group() - - if "\n" in text or "\r" in text: - return "\n" - else: - return " " + return "\n" if "\n" in text or "\r" in text else " " def is_blank(string):