-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathutils.py
38 lines (26 loc) · 1.02 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Utility functions for calculating various text descriptives"""
from spacy.tokens import Doc, Span, Token
from pyphen import Pyphen
from typing import Union
def filtered_tokens(doc: Union[Doc, Span]):
    """Collect the word-like tokens of a document or span.

    A token is dropped when it is flagged as punctuation (``is_punct``) or
    when its text contains an apostrophe (``'``) anywhere, which removes
    contraction pieces. Remaining tokens are returned as a list, in their
    original order.
    """
    kept = []
    for token in doc:
        # Skip punctuation and any apostrophe-bearing token.
        if token.is_punct or "'" in token.text:
            continue
        kept.append(token)
    return kept
def n_sentences(doc: Union[Doc, Span]):
    """Count the sentences yielded by ``doc.sents``."""
    # Tally the items one by one instead of materialising them in a list.
    return sum(1 for _ in doc.sents)
def n_tokens(doc: Union[Doc, Span]):
    """Count the entries stored in the ``_filtered_tokens`` extension.

    The ``doc._._filtered_tokens`` custom attribute is not registered in this
    block; presumably it is set up elsewhere and holds the output of
    ``filtered_tokens`` -- confirm against the caller.
    """
    words = doc._._filtered_tokens
    return len(words)
def n_syllables(doc: Doc):
    """Return a list of syllable counts, one per filtered token.

    A ``Pyphen`` hyphenator is created for the document language
    (``doc.lang_``). Each entry of ``doc._._filtered_tokens`` is lower-cased
    and hyphenated, and its syllable count is taken to be the number of
    ``-`` characters in the hyphenated form plus one, with a floor of one.
    """
    hyphenator = Pyphen(lang=doc.lang_)
    syllable_counts = []
    for token in doc._._filtered_tokens:
        hyphenated = hyphenator.inserted(token.lower_)
        # Every "-" separates two syllables, so k marks mean k + 1 syllables.
        syllable_counts.append(max(1, hyphenated.count("-") + 1))
    return syllable_counts