Adds Text class for storing different versions of a text

Allows us not to repeat the tokenization w/m.
HLasse · Mar 22, 2020 · 63cb161 · 63cb161
1 parent 94ec805
commit 63cb161
Showing 1 changed file with 37 additions and 0 deletions.
diff --git a/textdescriptives/text.py b/textdescriptives/text.py
@@ -0,0 +1,37 @@
+
+import string
+import re
+from nltk import tokenize # possibly change tokenizer later
+
+class Text:
+    """Container for a raw text and the derived versions computed from it.
+
+    Every derived attribute is computed once in ``__init__`` so callers can
+    reuse it instead of repeating the same splitting/tokenization work.
+
+    Attributes:
+        text: The raw input string.
+        text_without_punctuation: ``text`` with every character listed in
+            ``string.punctuation`` removed.
+        tokens: ``text`` split on whitespace.
+        tokens_without_punctuation: ``text_without_punctuation`` split on
+            whitespace.
+        sentences: Result of ``tokenize.sent_tokenize`` on the raw text.
+        num_tokens: Length of ``tokens``.
+        num_tokens_without_punctuation: Length of
+            ``tokens_without_punctuation``.
+        num_sentences: Length of ``sentences``.
+    """
+
+    def __init__(self, text: str) -> None:
+        """Store ``text`` and precompute its derived versions.
+
+        Args:
+            text: The raw text to wrap.
+
+        Raises:
+            TypeError: If ``text`` is not a ``str``.
+        """
+        # Raise explicitly rather than assert: assert statements are removed
+        # under ``python -O``, and TypeError matches what ``to_text`` raises
+        # for the same condition.
+        if not isinstance(text, str):
+            raise TypeError(f"'text' must have type str, not {type(text)}")
+
+        self.text = text
+        self.text_without_punctuation = Text.remove_punct(self.text)
+
+        # Tokens are plain whitespace splits, so punctuation stays attached
+        # to the neighbouring word in ``tokens``.
+        self.tokens = self.text.split()
+        self.tokens_without_punctuation = self.text_without_punctuation.split()
+
+        # Sentence splitting runs on the raw text (punctuation included).
+        self.sentences = tokenize.sent_tokenize(text)
+
+        self.num_tokens = len(self.tokens)
+        self.num_tokens_without_punctuation = len(self.tokens_without_punctuation)
+        self.num_sentences = len(self.sentences)
+
+    @staticmethod
+    def remove_punct(text: str) -> str:
+        """Return ``text`` with all ``string.punctuation`` characters removed.
+
+        Only the characters in ``string.punctuation`` are deleted; every
+        other character, including whitespace, is left untouched.
+        """
+        deletion_table = str.maketrans('', '', string.punctuation)
+        return text.translate(deletion_table)
+
+    def __newline_to_period(self, text: str) -> str:
+        """Return ``text`` with newlines turned into sentence-ending periods.
+
+        Each newline becomes ``'.'``; afterwards every run of two or more
+        consecutive periods is collapsed into ``'. '``. Note that this also
+        collapses period runs that were already present in ``text``.
+        """
+        text = re.sub(r"\n", '.', text)
+        text = re.sub(r"\.\.+", '. ', text)
+        return text
+
+    @staticmethod
+    def to_text(text) -> "Text":
+        """Coerce ``text`` to a ``Text`` object.
+
+        Args:
+            text: Either a ``Text`` instance or a ``str``.
+
+        Returns:
+            ``text`` itself when it already is a ``Text``; otherwise a new
+            ``Text`` built from the string.
+
+        Raises:
+            TypeError: If ``text`` is neither a ``Text`` nor a ``str``.
+        """
+        if isinstance(text, Text):
+            return text
+        if not isinstance(text, str):
+            raise TypeError(f"'text' must have type str, not {type(text)}")
+        return Text(text)