"""
Copyright 2024 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from .base import BaseLLMOperation, LLMOPERATORS
from ray.data import Dataset
from pyspark.sql import DataFrame
from pyrecdp.core.model_utils import prepare_model, get_model
from typing import Dict, List, Union, Optional
import re

# matches "copyright" anywhere, case-insensitively
CPAT = re.compile("copyright", re.IGNORECASE)
# matches a C-style block comment: /* ... */
PAT = re.compile("/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/")


class SupportedType:
    HTML = 'html'
    LATEX = 'latex'
    CODES = 'codes'


def clean_html(text):
    """Strip HTML markup, turning list items into plain-text bullets."""
    from selectolax.parser import HTMLParser
    text = text.replace('<li>', '\n*')
    text = text.replace('</li>', '')
    text = text.replace('<ol>', '\n*')
    text = text.replace('</ol>', '')
    parser = HTMLParser(text)
    return parser.text()
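
# Illustrative example (assumed input, not from the original file):
#
#     clean_html('<ul><li>alpha</li><li>beta</li></ul>')
#
# returns roughly '\n*alpha\n*beta' once selectolax drops the remaining tags.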


def clean_latex(text):
    """Clean LaTeX source: strip comments and front/back matter, and
    inline-expand user-defined macros that take no arguments."""
    non_arg_macros = {}
    non_arg_macros.update(_build_non_arg_macros_dict(text))
    # TODO: macros that take arguments are not supported yet
    arg_macros = {}
    cleaned_text = _clean_text_file(
        file_content=text,
        arg_macros=arg_macros,
        non_arg_macros=non_arg_macros
    )
    return cleaned_text
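
# Illustrative example (assumed input, not from the original file): given
#
#     \newcommand{\model}{BERT}
#     \section{Intro}
#     We use \model here. % scratch note
#
# clean_latex drops everything before \section, strips the trailing line
# comment, and rewrites "\model here" as "BERT here".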


def clean_codes(text):
    """Strip a leading copyright comment block from source code."""
    return _clean_copyright_comments(text)


def _clean_text_file(
        file_content: str, arg_macros: Dict, non_arg_macros: Dict
) -> str:
    r"""Take the content of a tex file and return a cleaned version, with
    the following modifications:
    - remove all comments (i.e. all lines starting with %)
    - remove everything before the first section-like header
    - remove everything after the first occurrence of either \appendix or
      \bibliography
    - inline-expand definitions and macros
    @param file_content: the content of the tex file as a string.
    @return: cleaned tex file as a string
    """
    # find the first occurrence of a \section-like header and replace
    # everything before it with an empty string. This matches the following
    # pattern: \<section-type>[optional-args]{name}
    pattern = r"^(.*?)("
    pattern += r"\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}|"
    pattern += r"\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}"
    pattern += r")"
    # if no section-like header is found, return the content unchanged
    if not re.search(pattern, file_content, flags=re.DOTALL):
        return file_content
    # replace everything with the second group of the match (i.e. everything
    # after and including the section header)
    file_content = re.sub(
        pattern=pattern,
        repl=r"\2",
        string=file_content,
        flags=re.DOTALL  # make sure that the dot also matches newlines
    )
    # remove all full-line comments
    file_content = re.sub(
        pattern=r"^%.*\n?",
        repl=r"",
        string=file_content,
        flags=re.MULTILINE
    )
    # remove comments that start mid-line; the capture group puts back the
    # character preceding the "%" (an unescaped "%" starts a comment)
    file_content = re.sub(
        pattern=r"([^\\])%.+$",
        repl=r"\1",
        string=file_content,
        flags=re.MULTILINE
    )
    # find the first occurrence of either \appendix or \bibliography and
    # replace everything after it with an empty string
    pattern = r"("
    pattern += r"\\appendix|"
    pattern += r"\\begin\{references\}|"
    pattern += r"\\begin\{REFERENCES\}|"
    pattern += r"\\begin\{thebibliography\}|"
    pattern += r"\\bibliography\{.*\}"
    pattern += r").*$"
    file_content = re.sub(
        pattern=pattern,
        repl=r'',
        string=file_content,
        flags=re.DOTALL  # make sure that the dot also matches newlines
    )
    # inline-expand all non-arg macros
    for macro_name, macro_value in non_arg_macros.items():
        file_content = re.sub(
            # group the macro name so that it cannot match as part of a
            # longer alphanumeric word
            pattern=r"(" + macro_name + r")" + r"([^a-zA-Z0-9])",
            # replace the macro with its value and add back the character
            # that was matched after the macro
            repl=macro_value + r"\2",
            string=file_content
        )
    # TODO: inline-expand macros that take arguments
    for macro_name, macro_value in arg_macros.items():
        pass
    return file_content
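
# Illustrative example (assumed input, not from the original file): for
#
#     \section{Results}
#     Numbers. % draft
#     \appendix
#     Extra tables.
#
# the cleaned output keeps "\section{Results}" and "Numbers.", drops the
# inline comment, and truncates at \appendix.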


def _build_non_arg_macros_dict(file_content: str) -> Dict[str, str]:
    r"""Take the content of a tex file and return a dictionary containing
    the definitions of all macros that do not take arguments. The dictionary
    is of the form {macro_name: macro_value}.
    @param file_content: the content of the tex file as a string.
    @return: dict
    """
    # regex for extracting \newcommand macros without arguments
    non_arg_nc_reg = re.compile(
        # this regex matches the following:
        # \newcommand{\macro_name}{macro_value}
        # \newcommand*{\macro_name}{macro_value}
        # where macro_name is only allowed to contain letters and numbers;
        # macro_value can contain any character.
        pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
        flags=re.MULTILINE
    )
    # regex for extracting \def macros without arguments
    non_arg_def_reg = re.compile(
        # this regex matches the following:
        # \def\macro_name{macro_value}
        # where macro_name is only allowed to contain letters and numbers;
        # macro_value can contain any character.
        pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$',
        flags=re.MULTILINE
    )
    # extract all user-defined LaTeX macros from the preamble
    macros = {}
    for reg in [non_arg_nc_reg, non_arg_def_reg]:
        for match in reg.finditer(file_content):
            # convert the macro name and value to raw strings that can be
            # used in re.sub
            macro_name = match \
                .group(1).encode("unicode-escape").decode("utf-8")
            macro_val = match \
                .group(2).encode("unicode-escape").decode("utf-8")
            macros[macro_name] = macro_val
    return macros
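
# Illustrative example (assumed input, not from the original file): a
# preamble containing
#
#     \newcommand{\ours}{RecDP}
#     \def\eps{epsilon}
#
# yields a dict mapping "\ours" to "RecDP" and "\eps" to "epsilon", with
# each name's backslash doubled by the unicode-escape step so that the
# names can be fed directly to re.sub as patterns.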


def _clean_copyright_comments(text: str):
    """Remove a leading copyright notice, either a C-style block comment
    that mentions "copyright" or an initial run of line comments."""
    r = PAT.search(text)
    if r:
        # found a block comment; if it contains "copyright", strip it
        span = r.span()
        sub = text[span[0]:span[1]]
        if CPAT.search(sub):
            # cut it
            text = text[: span[0]] + text[span[1]:]
        return text

    lines = text.split('\n')
    skip = 0
    # greedily skip any leading run of line comments; in most files that
    # begin with one, it is a copyright header
    for k in range(len(lines)):
        if (
                lines[k].startswith("//") or
                lines[k].startswith("#") or
                lines[k].startswith("--") or
                not lines[k]
        ):
            skip = skip + 1
        else:
            break
    if skip:
        # drop the skipped header lines
        text = "\n".join(lines[skip:])
    return text
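
# Illustrative example (assumed input, not from the original file):
#
#     _clean_copyright_comments("/* Copyright 2024 */\nint main() {}")
#
# strips the block comment because it mentions "copyright", while a file
# that starts with "# Copyright ..." line comments has that leading run
# of comment lines dropped instead.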


def get_fixer_by_type(text_type):
    if isinstance(text_type, str):
        text_type = getattr(SupportedType, text_type.upper())
    if SupportedType.HTML == text_type:
        return clean_html
    if SupportedType.LATEX == text_type:
        return clean_latex
    if SupportedType.CODES == text_type:
        return clean_codes
    raise ValueError(f"Unsupported text type: {text_type}")
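
# Illustrative dispatch (not from the original file): string names resolve
# through SupportedType, so both of these return the same callable:
#
#     get_fixer_by_type('latex')
#     get_fixer_by_type(SupportedType.LATEX)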


class TextFix(BaseLLMOperation):
    def __init__(self, text_key='text', inplace=True, text_type='html'):
        """
        Clean up text of the specified type.

        :param text_key: The name of the column holding the text. Default: text.
        :param inplace: Whether to overwrite the text column or write to 'fixed_text'. Default: True.
        :param text_type: Supported text type. Default: html. (html, latex, codes)
        """
        settings = {'text_key': text_key, 'inplace': inplace, 'text_type': text_type}
        requirements = []
        super().__init__(settings, requirements)
        self.text_key = text_key
        self.inplace = inplace
        self.text_type = text_type
        self.actual_func = None
        self.support_spark = True
        self.support_ray = True

    def process_rayds(self, ds: Dataset) -> Dataset:
        if self.inplace:
            new_name = self.text_key
        else:
            new_name = 'fixed_text'
        if self.actual_func is None:
            self.actual_func = get_fixer_by_type(self.text_type)
        return ds.map(lambda x: self.process_row(x, self.text_key, new_name, self.actual_func))

    def process_spark(self, spark, spark_df: DataFrame) -> DataFrame:
        import pyspark.sql.functions as F
        fix_by_type_udf = F.udf(get_fixer_by_type(self.text_type))
        if self.inplace:
            new_name = self.text_key
        else:
            new_name = 'fixed_text'
        return spark_df.withColumn(new_name, fix_by_type_udf(F.col(self.text_key)))


LLMOPERATORS.register(TextFix)
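
# Illustrative usage (assumed pipeline wiring, not from this file): like any
# registered operation, TextFix can be added to a pyrecdp TextPipeline:
#
#     from pyrecdp.LLM import TextPipeline
#     pipeline = TextPipeline()
#     pipeline.add_operations([TextFix(text_type='html')])
#     ret = pipeline.execute(dataset)  # 'dataset' is assumed to exist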


class RAGTextFix(BaseLLMOperation):
    def __init__(self, text_key='text', chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□',
                 language: str = 'en', str_to_replace: Optional[dict] = None,
                 remove_extra_whitespace: bool = False, re_sentence: bool = False):
        """
        Clean up text for LLM RAG to use.

        Step 1: Fix unicode errors in text using ftfy.
        Step 2: Normalize different kinds of whitespace to an ordinary space
            ' ' (0x20). The various whitespace characters are listed at
            https://en.wikipedia.org/wiki/Whitespace_character
        Step 3: Remove specific chars from the text.
        Step 4: Replace the specified strings.
        Step 5: Remove extra whitespace.
        Step 6: Re-segment sentences in the text to avoid sentence
            segmentation errors caused by unnecessary line breaks.

        :param language: Supported language. Default: en. (en)
        :param re_sentence: Whether to re-segment sentences. Default: False.
        :param remove_extra_whitespace: Whether to remove extra whitespace. Default: False.
        :param chars_to_remove: Chars to remove. Default: '◆●■►▼▲▴∆▻▷❖♡□'
        :param str_to_replace: Strings to replace, e.g. {'recdp': 'RecDP', 'josn': 'json'}. Default: None.
        """
        # avoid a shared mutable default; None means "no replacements"
        str_to_replace = str_to_replace if str_to_replace is not None else {}
        settings = {'chars_to_remove': chars_to_remove, 'text_key': text_key, 'language': language,
                    'str_to_replace': str_to_replace, 'remove_extra_whitespace': remove_extra_whitespace,
                    're_sentence': re_sentence}
        requirements = ["ftfy", "selectolax"]
        super().__init__(settings, requirements)
        self.support_spark = True
        self.support_ray = True
        self.text_key = text_key
        self.inplace = True
        self.chars_to_remove = chars_to_remove
        self.language = language
        self.re_sentence = re_sentence
        self.str_to_replace = str_to_replace
        self.remove_extra_whitespace = remove_extra_whitespace

    def process_rayds(self, ds: Dataset) -> Dataset:
        remover = self.get_compute_func()
        new_ds = ds.map(lambda x: self.process_row(x, self.text_key, self.text_key, remover))
        return new_ds

    def process_spark(self, spark, spark_df: DataFrame) -> DataFrame:
        import pyspark.sql.functions as F
        custom_udf = F.udf(self.get_compute_func())
        return spark_df.withColumn(self.text_key, custom_udf(F.col(self.text_key)))

    def get_compute_func(self):
        import ftfy
        from pyrecdp.primitives.operations.constant import VARIOUS_WHITESPACES
        pattern = '[' + ''.join(self.chars_to_remove) + ']'
        model_key = prepare_model(lang=self.language, model_type='nltk')
        nltk_model = get_model(model_key, lang=self.language, model_type='nltk')
        str_to_replace = self.str_to_replace
        remove_extra_whitespace = self.remove_extra_whitespace
        re_sentence = self.re_sentence

        def compute(text):
            # Step 1: fix unicode errors
            text = ftfy.fix_text(text)
            # Step 2: normalize different kinds of whitespace
            text = ''.join([
                char if char not in VARIOUS_WHITESPACES else ' ' for char in text
            ])
            # Step 3: remove specific chars; len(pattern) > 2 means the
            # character class holds more than the surrounding brackets
            if len(pattern) > 2:
                text = re.sub(pattern=pattern, repl=r'',
                              string=text, flags=re.DOTALL)
            # Step 4: replace the specified strings
            for key, value in str_to_replace.items():
                text = text.replace(str(key), str(value))
            # Step 5: remove extra whitespace
            if remove_extra_whitespace:
                text = re.sub(r"\s+", " ", text)
            # Step 6: re-segment sentences. Protect paragraph breaks with a
            # sentinel, tokenize into sentences, join the sentences with
            # single spaces, then restore the paragraph breaks.
            if re_sentence:
                paragraph_break_pattern = r"\n\s*\n"
                replace_str = '*^*^*'
                text = re.sub(pattern=paragraph_break_pattern, repl=replace_str,
                              string=text, flags=re.DOTALL)
                sentences = nltk_model.tokenize(text)
                new_sentences = []
                for sentence in sentences:
                    new_sentences.append(sentence.replace("\n", " "))
                new_text = ' '.join(new_sentences).replace(replace_str, "\n\n")
            else:
                new_text = text
            return new_text

        return compute


LLMOPERATORS.register(RAGTextFix)
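
# Illustrative usage (assumed wiring, not from this file): the compute
# function can also be exercised directly, outside Ray or Spark:
#
#     op = RAGTextFix(str_to_replace={'josn': 'json'}, re_sentence=True)
#     compute = op.get_compute_func()  # prepares the nltk sentence model
#     fixed = compute('A broken josn\nline that should read as one sentence.')
#
# ftfy first repairs any mojibake, 'josn' becomes 'json', and the stray
# line break inside the sentence is replaced by a space.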