-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathdataframe_extract.py
141 lines (120 loc) · 4.65 KB
/
dataframe_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Extract metrics as Pandas DataFrame"""
from spacy.tokens import Doc
from typing import Union, List
import types
import pandas as pd
class DataFrameExtractor:
    """Collect the metrics stored on a spaCy Doc into a Pandas DataFrame.

    The extracted frame is exposed as the ``df`` attribute once the instance
    has been constructed.
    """

    def __init__(
        self,
        doc: Doc,
        metrics: Union[List[str], str] = "all",
        include_text: bool = True,
    ):
        """Utility class to extract specified metrics to a Pandas DataFrame

        Args:
            doc (Doc): a spaCy doc
            metrics (Union[list[str], str], optional): Which metrics to extract.
                One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
                Defaults to "all".
            include_text (bool, optional): Whether to add a column containing the text. Defaults to True.

        Raises:
            TypeError: If ``doc`` is not a spaCy Doc, or ``metrics`` is neither
                a string nor a list.
            ValueError: If ``metrics`` contains an unknown metric name, or if
                there is nothing to extract (``include_text`` is False and no
                metric was selected or available).
        """
        if not isinstance(doc, Doc):
            raise TypeError(f"doc should be a spaCy Doc object, not {type(doc)}.")

        valid_metrics = {"descriptive_stats", "readability", "dependency_distance", "all"}
        if isinstance(metrics, str):
            metrics = [metrics]
        if not isinstance(metrics, list):
            raise TypeError(
                f"'metrics' should be string or list of strings, not {type(metrics)}"
            )
        if not set(metrics).issubset(valid_metrics):
            raise ValueError(
                "'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance']"
            )

        df_list = [pd.DataFrame([doc.text], columns=["text"])] if include_text else []

        # (metric name, Doc extension checked when "all" is requested, extractor)
        extractors = (
            ("descriptive_stats", "counts", self.__descriptive_stats),
            ("readability", "readability", self.__readability),
            ("dependency_distance", "dependency_distance", self.__dependency_distance),
        )
        extract_all = "all" in metrics
        for metric, extension, extractor in extractors:
            if extract_all:
                # "all" silently skips metrics whose extension is not registered.
                wanted = doc.has_extension(extension)
            else:
                # Explicitly requested metrics are extracted unconditionally.
                wanted = metric in metrics
            if wanted:
                df_list.append(extractor(doc))

        if not df_list:
            # pd.concat would otherwise fail with the unhelpful
            # "No objects to concatenate".
            raise ValueError(
                "Nothing to extract: 'include_text' is False and no metrics "
                "were requested or available on the doc."
            )
        self.df: pd.DataFrame = pd.concat(df_list, axis=1)

    def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
        """Merge the token length, sentence length, syllable and count
        extensions of ``doc`` into a single-row DataFrame."""
        descriptive_stats = {
            **doc._.token_length,
            **doc._.sentence_length,
            **doc._.syllables,
            **doc._.counts,
        }
        return pd.DataFrame.from_records([descriptive_stats])

    def __readability(self, doc: Doc) -> pd.DataFrame:
        """Return the ``readability`` extension of ``doc`` as a DataFrame."""
        return pd.DataFrame.from_records([doc._.readability])

    def __dependency_distance(self, doc: Doc) -> pd.DataFrame:
        """Return the ``dependency_distance`` extension of ``doc`` as a DataFrame."""
        return pd.DataFrame.from_records([doc._.dependency_distance])
def extract_df(
    doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True
) -> pd.DataFrame:
    """Extract calculated metrics from a spaCy Doc object or a collection of Docs
    (e.g. the generator returned by nlp.pipe) to a Pandas DataFrame

    Args:
        doc (Doc): a spaCy doc, or a generator, list or tuple of spaCy Docs
        metrics (Union[list[str], str], optional): Which metrics to extract.
            One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
            Defaults to "all".
        include_text (bool, optional): Whether to add a column containing the text. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame with a row for each doc and column for each metric.
            An empty DataFrame is returned when the collection contains no docs.
    """
    if isinstance(doc, (types.GeneratorType, list, tuple)):
        rows = [DataFrameExtractor(d, metrics, include_text).df for d in doc]
        if not rows:
            # pd.concat raises ValueError on an empty sequence; no docs simply
            # means no rows.
            return pd.DataFrame()
        return pd.concat(rows, axis=0, ignore_index=True)
    return DataFrameExtractor(doc, metrics, include_text).df
"""Helpers to subset an extracted dataframe"""
# Column names for the readability metrics, for subsetting an extracted DataFrame.
readability_cols = (
    ["flesch_reading_ease", "flesch_kincaid_grade"]
    + ["smog", "gunning_fog"]
    + ["automated_readability_index", "coleman_liau_index"]
    + ["lix", "rix"]
)
# Column names for the dependency distance metrics, for subsetting an
# extracted DataFrame.
dependency_cols = (
    # dependency distance
    ["dependency_distance_mean", "dependency_distance_std"]
    # proportion of adjacent dependency relations
    + [
        "prop_adjacent_dependency_relation_mean",
        "prop_adjacent_dependency_relation_std",
    ]
)
# Column names for the descriptive statistics, for subsetting an extracted
# DataFrame.
descriptive_stats_cols = (
    # token length
    ["token_length_mean", "token_length_median", "token_length_std"]
    # sentence length
    + ["sentence_length_mean", "sentence_length_median", "sentence_length_std"]
    # syllables per token
    + [
        "syllables_per_token_mean",
        "syllables_per_token_median",
        "syllables_per_token_std",
    ]
    # counts
    + ["n_tokens", "n_unique_tokens", "proportion_unique_tokens"]
    + ["n_sentences", "n_characters"]
)