-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathutils.py
81 lines (57 loc) · 1.96 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""
Utility functions for calculating various text descriptives
"""
from typing import Any, Callable, Union
from pyphen import Pyphen
from spacy.tokens import Doc, Span, Token
def filtered_tokens(doc: Union[Doc, Span]):
    """Collect the word tokens of a document or span.

    A token is dropped when it is flagged as punctuation or when its text
    contains an apostrophe anywhere (this is how contraction pieces are
    excluded). All remaining tokens are kept in their original order.

    Args:
        doc (Union[Doc, Span]):
            The document or span whose tokens are filtered.

    Returns:
        A list with the tokens that passed both checks.
    """
    kept = []
    for token in doc:
        # Skip punctuation and anything carrying an apostrophe.
        if token.is_punct or "'" in token.text:
            continue
        kept.append(token)
    return kept
def n_sentences(doc: Union[Doc, Span]):
    """Count the sentences found in a document or span.

    Args:
        doc (Union[Doc, Span]):
            Object exposing an iterable ``sents`` attribute.

    Returns:
        The number of items yielded by ``doc.sents``.
    """
    # ``sents`` is consumed as an iterable, so count by walking through it.
    return sum(1 for _ in doc.sents)
def n_tokens(doc: Union[Doc, Span]):
    """Count the filtered tokens of a document or span.

    The count is taken from the ``_filtered_tokens`` extension attribute,
    which must already be available on ``doc._`` (it is registered
    elsewhere, not in this function).

    Args:
        doc (Union[Doc, Span]):
            The document or span to measure.

    Returns:
        The length of ``doc._._filtered_tokens``.
    """
    words = doc._._filtered_tokens  # pylint: disable=protected-access
    return len(words)
def n_syllables(doc: Doc):
    """Return the syllable count of every filtered token.

    A ``Pyphen`` hyphenator is created for the language reported by
    ``doc.lang_``. Each token's lowercased text is hyphenated, and the
    number of hyphen-separated pieces is used as its syllable count.

    Args:
        doc (Doc):
            Document exposing ``lang_`` and the ``_filtered_tokens``
            extension attribute.

    Returns:
        A list of integers (each at least 1), one per filtered token, in
        token order.
    """
    hyphenator = Pyphen(lang=doc.lang_)
    syllable_counts = []
    for token in doc._._filtered_tokens:  # pylint: disable=protected-access
        hyphenated = hyphenator.inserted(token.lower_)
        # Pieces = hyphens + 1; never report fewer than one syllable.
        syllable_counts.append(max(1, hyphenated.count("-") + 1))
    return syllable_counts
def span_getter_to_token_getter(
    span_getter: Callable[[Span], Any]
) -> Callable[[Token], Any]:
    """Adapt a span-level getter so it can be applied to a single token.

    Args:
        span_getter (Callable[[Span], Any]):
            Getter that expects a span.

    Returns:
        Callable[[Token], Any]: A getter that slices the token's parent
        document down to just that token and passes the slice to
        ``span_getter``.
    """

    def token_getter(token):
        position = token.i
        # One-element slice of the parent document covering only this token.
        single_token_span = token.doc[position : position + 1]
        return span_getter(single_token_span)

    return token_getter
def span_getter_to_doc_getter(
    span_getter: Callable[[Span], Any]
) -> Callable[[Doc], Any]:
    """Adapt a span-level getter so it can be applied to a whole document.

    Args:
        span_getter (Callable[[Span], Any]):
            Getter that expects a span.

    Returns:
        Callable[[Doc], Any]: A getter that takes a full slice of the
        document and passes that slice to ``span_getter``.
    """

    def doc_getter(doc):
        # A full slice covers the document from its first to its last token.
        entire_doc_span = doc[:]
        return span_getter(entire_doc_span)

    return doc_getter