Skip to content

Commit

Permalink
Adds Text class for storing different versions of a text
Browse files Browse the repository at this point in the history
Allows us not to repeat the tokenization w/m.
  • Loading branch information
LudvigOlsen committed Mar 22, 2020
1 parent 94ec805 commit 63cb161
Showing 1 changed file with 37 additions and 0 deletions.
37 changes: 37 additions & 0 deletions textdescriptives/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

import string
import re
from nltk import tokenize # evt ændr tokenizer

class Text:
    """Container holding several precomputed views of a single text.

    The views are computed once at construction so that code using the
    text does not have to repeat punctuation stripping and tokenization.

    Attributes:
        text: The original string.
        text_without_punctuation: ``text`` with every character found in
            ``string.punctuation`` removed.
        tokens: ``text`` split on whitespace.
        tokens_without_punctuation: ``text_without_punctuation`` split on
            whitespace.
        sentences: Result of ``tokenize.sent_tokenize`` applied to ``text``.
        num_tokens: Length of ``tokens``.
        num_tokens_without_punctuation: Length of
            ``tokens_without_punctuation``.
        num_sentences: Length of ``sentences``.
    """

    # Translation table deleting every character in string.punctuation.
    # Built once here instead of on every remove_punct() call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, text):
        """Store ``text`` and compute its derived views.

        Args:
            text: The string to wrap.

        Raises:
            AssertionError: If ``text`` is not a ``str``.
        """
        # Explicit raise instead of an ``assert`` statement so the check is
        # still performed under ``python -O``. The AssertionError type and
        # the message are the same as the assert produced.
        if not isinstance(text, str):
            raise AssertionError("'text' must have type str.")

        self.text = text
        self.text_without_punctuation = Text.remove_punct(self.text)

        # Whitespace tokenization of both the raw and the stripped text.
        self.tokens = self.text.split()
        self.tokens_without_punctuation = self.text_without_punctuation.split()
        self.sentences = tokenize.sent_tokenize(text)

        # Cached counts.
        self.num_tokens = len(self.tokens)
        self.num_tokens_without_punctuation = len(
            self.tokens_without_punctuation
        )
        self.num_sentences = len(self.sentences)

    @staticmethod
    def remove_punct(text):
        """Return ``text`` with all ``string.punctuation`` characters removed.

        Only the characters listed in ``string.punctuation`` are deleted;
        any other character is left in place.
        """
        return text.translate(Text._PUNCTUATION_TABLE)

    def __newline_to_period(self, text):
        """Replace newlines with periods and collapse runs of periods.

        Every newline becomes ``'.'``; afterwards any run of two or more
        consecutive periods (including runs already present in ``text``)
        is replaced by ``'. '``. Does not read or modify instance state.
        """
        text = re.sub(r"\n", '.', text)
        text = re.sub(r"\.\.+", '. ', text)
        return text

    @staticmethod
    def to_text(text):
        """Return ``text`` as a ``Text`` object.

        Args:
            text: A ``Text`` instance or a ``str``.

        Returns:
            ``text`` itself when it already is a ``Text``; otherwise a new
            ``Text`` built from the string.

        Raises:
            TypeError: If ``text`` is neither a ``Text`` nor a ``str``.
        """
        if isinstance(text, Text):
            return text
        if not isinstance(text, str):
            raise TypeError(f"'text' must have type str, not {type(text)}")
        return Text(text)

0 comments on commit 63cb161

Please sign in to comment.