-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwysiwyg.py
149 lines (113 loc) · 3.17 KB
/
wysiwyg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import streamlit as st
from lxml import etree
from standoffconverter import Standoff, View
# Page setup, introduction and the editable TEI input of the demo.
st.set_page_config(layout="wide")

st.write("# Interactive demo of the standoff converter")
st.write('''
This demo shows the steps involved to get from TEI XML to plain text, identify sentences and put the found sentences as `<s>` into the TEI. It illustrates how you can use standard NLP tools on your TEI documents. You can modify the input document to see how things change!
[The code for this demo is available in our Github Repository](https://github.com/standoff-nlp/standoffconverter/blob/master/examples/wysiwyg.py)
''')

st.write("## TEI to plain text")

# `st.beta_columns` was removed from Streamlit once `st.columns` became the
# stable API. Prefer the stable name and fall back to the beta name only on
# old releases that do not have it yet.
make_columns = st.columns if hasattr(st, "columns") else st.beta_columns
col1, col2, col3 = make_columns(3)

# Editable TEI document; the rest of the script derives everything from
# this text. The default is a minimal example with two paragraphs and one
# <lb/> line break.
input_xml = col1.text_area(
    'input_xml:',
    value="""<TEI>
<teiHeader> </teiHeader>
<text>
<body>
<p>1 2 3 4. 5 6<lb/> 7 9 10.</p>
<p> 11 12 13 14</p>
</body>
</text>
</TEI>
""",
    height=400,
    max_chars=300,
)
# Step 1: show the code for, and then build, the standoff representation
# of the XML typed into the input box.
col2.write("1. Standoff representation")
col2.code(
    """# 1. create standoff
tree = etree.fromstring(input_xml)
so = Standoff(tree)
print(so.collapsed_table)"""
)

# The input box is freely editable, so the text may be empty or malformed.
# lxml raises XMLSyntaxError in that case, and ValueError when a str carries
# an XML encoding declaration. Report the problem in the page and end this
# script run instead of surfacing a raw traceback.
try:
    tree = etree.fromstring(input_xml)
except (etree.XMLSyntaxError, ValueError) as err:
    st.error(f"The input could not be parsed as XML: {err}")
    st.stop()

so = Standoff(tree)
col2.write(so.collapsed_table)
# Step 2: derive a plain-text view from the standoff object and display it
# together with the code that produces it.
col3.write("2. Plain text view")

# Source shown to the reader; it mirrors the statements executed below.
view_snippet = """# 2. create view
view = (
View(so)
.insert_tag_text(
"lb",
"\\n"
)
.exclude_outside("p")
)
plain = view.get_plain()
print(plain)"""

# NOTE(review): presumably this renders each <lb/> as a newline and limits
# the text to the content of <p> elements -- confirm against the
# standoffconverter View documentation.
view = View(so).insert_tag_text("lb", "\n").exclude_outside("p")
plain = view.get_plain()

col3.code(view_snippet)
col3.text(plain)
# Step 3: split the plain text into sentences with spaCy's rule-based
# sentencizer and list the sentences that were found.
st.write("## Apply spacy sentencizer and add `<s>`-tags")

# `st.beta_columns` was removed from Streamlit once `st.columns` became the
# stable API. Prefer the stable name and fall back to the beta name only on
# old releases that do not have it yet.
make_columns = st.columns if hasattr(st, "columns") else st.beta_columns
col1, col2, col3 = make_columns(3)

col1.write('3. Apply spacy sentencizer')
# The loop body in the displayed snippet is indented so that the code shown
# to the reader is valid Python.
col1.code(
    """# 3. annotate with NLP
# to split sentences
from spacy.lang.en import English
nlp = English()
nlp.add_pipe('sentencizer')
sentences = []
for sent in nlp(plain).sents:
    col1.write(f"* {sent}")
    sentences.append(sent)
""")

# This import sits next to its use, mirroring the snippet displayed above.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe('sentencizer')

col1.write('spacy found the following sentences:')
# Keep the sentence spans; the annotation step reads their character
# offsets (start_char / end_char) later on.
sentences = []
for sent in nlp(plain).sents:
    col1.write(f"* {sent}")
    sentences.append(sent)
# Step 4: map every sentence back to positions in the standoff table, wrap
# it in an <s> element and show the resulting TEI.
col2.write('4. Add annotations')
# The loop body in the displayed snippet is indented so that the code shown
# to the reader is valid Python.
col2.code(
    """# 4. retrieve results from spacy,
# resolve original character positions
# and add annotations to the tree
for isent, sent in enumerate(sentences):
    start_ind = view.get_table_pos(sent.start_char)
    end_ind = view.get_table_pos(sent.end_char-1)+1
    so.add_inline(
        begin=start_ind,
        end=end_ind,
        tag="s",
        depth=None,
        attrib={'id':f'{isent}'}
    )
""")

for isent, sent in enumerate(sentences):
    start_ind = view.get_table_pos(sent.start_char)
    # Look up the position of the sentence's last character and add one,
    # which turns it into an exclusive end index.
    end_ind = view.get_table_pos(sent.end_char - 1) + 1
    try:
        so.add_inline(
            begin=start_ind,
            end=end_ind,
            tag="s",
            depth=None,
            attrib={'id': f'{isent}'},
        )
    except (ValueError, IndexError) as err:
        # Chain the original error so the underlying cause stays visible
        # in the traceback.
        raise ValueError(
            f"Unable to add sentence tag for '{sent}'. It probably violates the tree constraint of XML."
        ) from err

col3.write("final TEI")
col3.code(etree.tostring(so.tree).decode("utf-8"))