-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcheck-language.py
executable file
·299 lines (249 loc) · 9.4 KB
/
check-language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#!/usr/bin/env python3
"""
Checks Markdown files using dockerized LanguageTool toolkit for language errors.
This script is a wrapper for dockerized LTeX language server CLI
(https://github.com/valentjn/ltex-ls), which internally uses LanguageTool server
(https://github.com/languagetool-org/languagetool). When docs are developed in Visual Studio Code,
we use LTeX VSCode addon, which also uses the LTeX language server, but not the CLI. The CLI can
utilize the `settings.json` file but unfortunately there are some bugs/differences compared to the
addon:
- the CLI does not respect "picky rules" setting,
- the CLI returns non-zero exit code when rules marked as "hint" are detected,
- the CLI always throws an exception on the successful run,
- the CLI throws an exception when the language error spans across multiple lines and stops
checking the rest of the file.
To fix these inconsistencies/bugs, this script performs some additional operations when wrapping
execution of the LTeX CLI:
- it adds list of known "picky rules" manually using `enabledRules` settings (see `PICKY_RULES`
list documentation in the code),
- it changes hint-rules into disabled rules by default and allows specifying the `--show-hints`
flag to check the hint-rules,
- it eats the "successful exception" from the output (see `RE_SUCCESS_EXCEPTION` regexp)
- it shows a reliable message when the multiple-lines exception is thrown (see
`RE_MULTILINE_ERROR_EXCEPTION` regexp).
Compatible with Python 3.5+, needs the Docker installed.
"""
__author__ = "Jakub Liput"
__copyright__ = "Copyright (C) 2024 ACK CYFRONET AGH"
__license__ = "This software is released under the MIT license cited in LICENSE.txt"
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
# Matches a three-part dotted version number (e.g. "1.2.3"), capturing each numeric part.
# NOTE(review): not referenced in the visible part of this script.
VERSION_RE = re.compile(r"(\d+)\.(\d+)\.(\d+)")
# Docker image that is started by `exec_with_settings` to perform the check.
LANGUAGETOOL_IMG = "docker.onedata.org/languagetool:v1"
# Location of the LTeX settings file, relative to the current working directory.
SETTINGS_JSON_PATH = ".vscode/settings.json"
# NOTE(review): not referenced in the visible part of this script - the temporary settings file
# is created with `tempfile.NamedTemporaryFile` instead.
TMP_SETTINGS_JSON_PATH = "/tmp/"
# Keys used to look up entries in the parsed settings JSON.
CONFIG_DIAGNOSTIC_SEVERITY = "ltex.diagnosticSeverity"
CONFIG_DISABLED_RULES = "ltex.disabledRules"
CONFIG_ENABLED_RULES = "ltex.enabledRules"
# Language key under which the per-language rule lists are read and written by this script.
CONFIG_EN_US = "en-US"
CONFIG_PICKY_RULES = "ltex.additionalRules.enablePickyRules"
# Color codes passed to `color_text`, which places them in an ANSI escape sequence.
TEXT_COLOR_RED = 31
TEXT_COLOR_GREEN = 32
# Matches the "successful run exception" - see more in module doc.
# The match starts at the line containing the "<AM|PM> org.eclipse.lsp4j...ConcurrentMessageProcessor
# run" header and, because `(.|\n)*` is greedy, extends to the last line that ends with "more".
# `exec_with_settings` removes the matched text only when the exit code is 0.
RE_SUCCESS_EXCEPTION = re.compile(
r"^.*[AP]M org\.eclipse\.lsp4j\.jsonrpc\.json\.ConcurrentMessageProcessor run(.|\n)*more\s*$",
re.MULTILINE,
)
# Matches the "multiline error exception" - see more in module doc.
# The match starts at a line beginning with "java.lang.StringIndexOutOfBoundsException" and extends
# to the last "at org.bsplines.lspcli.LspCliLauncher.main(LspCliLauncher.kt)" stack frame line.
# `exec_with_settings` replaces the matched text with a readable explanation.
RE_MULTILINE_ERROR_EXCEPTION = re.compile(
r"^java\.lang\.StringIndexOutOfBoundsException(.|\n)*at org\.bsplines\.lspcli\.LspCliLauncher"
+ r"\.main\(LspCliLauncher\.kt\)\s*$",
re.MULTILINE,
)
##
# The list of "picky"-level rules that the VSCode linter enables through the
# `"ltex.additionalRules.enablePickyRules": true` setting. That setting does not work with the
# standalone LTeX CLI, so the "picky" rules must be enabled manually - `add_picky_rules` appends
# them to the `ltex.enabledRules` list for `en-US`.
#
# The list was created by reading the source code of `languagetool`
# (https://github.com/languagetool-org/languagetool). The following files contain rules for English:
# - languagetool-language-modules/en/src/main/resources/org/languagetool/rules/en/en-US/grammar.xml
# - languagetool-language-modules/en/src/main/resources/org/languagetool/rules/en/en-US/style.xml
#
# The picky rules are tagged with `tags="picky"` in XML.
PICKY_RULES = [
"CHILDISH_LANGUAGE",
"CIRCUMSTANCES_SURROUNDING",
"COVID_19",
"DASH_RULE",
"DO_MAKE_PRP_VBG",
"DOES_XX_CAN",
"DT_JJ_NO_NOUN",
"EG_NO_COMMA",
"EITHER_NOR",
"ELLIPSIS",
"EN_QUOTES",
"FOUR_NN",
"HAPPY_CHRISTMAS",
"HONEST_TRUTH",
"HYPHEN_TO_EN",
"HYPOTHESIS_TYPOGRAPHY",
"IE_NO_COMMA",
"IN_A_X_MANNER",
"LITTLE_BIT",
"MISSING_PERIOD_AFTER_ABBREVIATION",
"MULTIPLICATION_SIGN",
"NON_STANDARD_COMMA",
"NON_STANDARD_QUESTION_MARK",
"NUMEROUS_DIFFERENT",
"OCCASION_TRANSITIVE_VERB_VERY_FORMAL",
"PASSIVE_VOICE",
"PASSIVE_VOICE_SIMPLE",
"PLUS_MINUS",
"PPL",
"PREVENT_FROM",
"PROFANITY",
"PROFANITY_TYPOS",
"REASON_WHY",
"REP_PASSIVE_VOICE",
"RUDE_SARCASTIC",
"SENT_START_CONJUNCTION",
"SENT_START_NUM",
"SENTENCE_FRAGMENT",
"SERIAL_COMMA_ON",
"SOME_OF_THE",
"TAG_QUESTIONS_SVA",
"TELL_IT",
"THE_PROOF_IS_IN_THE_PUDDING",
"THREE_NN",
"TOO_TO_EITHER",
"TRY_AND",
"TWITTER_X",
"TWO_HYPHENS",
"TYPEWRITER_APOSTROPHE",
"TYPOGRAPHICAL_APOSTROPHE",
"UNIT_SPACE",
"WHO_WHOM",
"WHOLE_OTHER",
"WILL_ALLOW",
"WORD_CONTAINS_UNDERSCORE",
]
def create_arg_parser():
    """Build the command-line interface of the language checker.

    The returned `argparse.ArgumentParser` accepts one optional positional argument,
    `input_path` (defaults to `docs`), and two on/off switches, `--show-hints` and `--quiet`,
    which are both disabled unless given.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Checks Markdown files using dockerized LanguageTool toolkit for language errors."
        ),
    )
    cli.add_argument(
        "input_path",
        nargs="?",
        default="docs",
        help=(
            "A relative path starting from the project root to directory with Markdown files "
            "(scanned recursively) or a single Markdown file path to check. Paths outside the "
            "project directory are invalid."
        ),
    )
    # Both switches are plain boolean flags, so they share the same argparse configuration.
    switches = (
        (
            "--show-hints",
            'Enables checking rules that have "hint" severity in original settings.json. '
            'By default rules with "hint" severity are not checked.',
        ),
        ("--quiet", "Disables the output."),
    )
    for flag, flag_help in switches:
        cli.add_argument(flag, default=False, action="store_true", help=flag_help)
    return cli
def disable_hint_rules(settings_data):
    """Turn every rule with the "hint" severity into a disabled rule (in place).

    Each rule whose custom severity is "hint" is appended to the `en-US` list of disabled rules
    and removed from the custom severity mapping. The special "default" entry is never touched.

    Args:
        settings_data: parsed settings dictionary; it must contain the
            `ltex.diagnosticSeverity` key. The `ltex.disabledRules` section and its `en-US`
            list are created when missing.

    Returns:
        None - `settings_data` is modified in place.
    """
    custom_severity_rules = settings_data[CONFIG_DIAGNOSTIC_SEVERITY]
    if not isinstance(custom_severity_rules, dict):
        # LTeX also accepts a single severity string here; in that case there are no per-rule
        # severities to convert.
        return
    # Create the target list on demand, so that settings without any disabled rules do not fail.
    disabled_rules = settings_data.setdefault(CONFIG_DISABLED_RULES, {}).setdefault(
        CONFIG_EN_US, []
    )
    # Collect the keys first, because the mapping is modified in the loop below.
    hint_rules = [
        rule_key
        for rule_key, severity in custom_severity_rules.items()
        if rule_key != "default" and severity == "hint"
    ]
    for rule_key in hint_rules:
        if rule_key not in disabled_rules:
            disabled_rules.append(rule_key)
        del custom_severity_rules[rule_key]
def _strip_json_comments(text):
    """Return `text` with `//` and `/* */` comments replaced by a space; strings stay intact."""
    # The string alternative goes first, so comment markers inside string literals
    # (e.g. "http://...") are consumed as a part of the string and never treated as comments.
    string_or_comment = re.compile(r'"(?:\\.|[^"\\])*"|//[^\n]*|/\*.*?\*/', re.DOTALL)

    def keep_strings(match):
        token = match.group(0)
        return token if token.startswith('"') else " "

    return string_or_comment.sub(keep_strings, text)


def parse_settings_json(settings_path=None):
    """Read the settings file, which is JSON with comments, and return the parsed data.

    Comments are removed with a string-aware scan before parsing, so a comment may contain any
    characters (digits, quotes, brackets) and string values may contain `//`.

    Args:
        settings_path: path of the settings file; when omitted, `SETTINGS_JSON_PATH` is used.

    Returns:
        The data produced by `json.loads` for the comment-free content.

    Raises:
        OSError: when the file cannot be opened.
        json.JSONDecodeError: when the content is not valid JSON after removing comments.
    """
    if settings_path is None:
        settings_path = SETTINGS_JSON_PATH
    with open(settings_path, "r", encoding="utf-8") as settings_reader:
        raw_content = settings_reader.read()
    return json.loads(_strip_json_comments(raw_content))
def create_settings_content(show_hints=False):
    """Produce the settings JSON text that is handed over to the LTeX CLI.

    The settings are read with `parse_settings_json` and then adjusted:
    - unless `show_hints` is set, the "hint" rules are disabled (only when the settings define
      custom severities),
    - when the "picky rules" setting is present and truthy, the known picky rules are added.

    Returns the adjusted settings serialized as a JSON string.
    """
    settings = parse_settings_json()

    hints_hidden = not show_hints
    if hints_hidden and CONFIG_DIAGNOSTIC_SEVERITY in settings:
        disable_hint_rules(settings)

    picky_enabled = settings[CONFIG_PICKY_RULES] if CONFIG_PICKY_RULES in settings else False
    if picky_enabled:
        add_picky_rules(settings)

    return json.dumps(settings, indent=" ")
def add_picky_rules(settings_data):
    """Append the known "picky" rules to the enabled `en-US` rules (in place).

    Rules that are listed as disabled for `en-US` are skipped, as well as rules that are already
    enabled, so no rule is listed twice.

    Args:
        settings_data: parsed settings dictionary. The `ltex.enabledRules` section and its
            `en-US` list are created when missing; entries for other languages are preserved.

    Returns:
        None - `settings_data` is modified in place.
    """
    # Look up the language list inside the enabled-rules section (not at the top level of the
    # settings) and create only the missing parts.
    enabled_rules = settings_data.setdefault(CONFIG_ENABLED_RULES, {}).setdefault(
        CONFIG_EN_US, []
    )
    # The disabled-rules section is optional.
    disabled_rules = settings_data.get(CONFIG_DISABLED_RULES, {}).get(CONFIG_EN_US, [])
    skipped_rules = set(disabled_rules) | set(enabled_rules)
    enabled_rules.extend(rule for rule in PICKY_RULES if rule not in skipped_rules)
def color_text(text, color_code):
    """Wrap `text` in an ANSI escape sequence selecting `color_code`, followed by a reset."""
    colored_template = "\033[0;{code}m{body}\033[0m"
    return colored_template.format(code=color_code, body=text)
def exec_with_settings(settings_content, input_path, quiet):
    """Run the dockerized LTeX CLI for `input_path` and report the result.

    The settings are written to a temporary file that is mounted into the container as
    `/settings.json`; the current working directory is mounted as the project root. The combined
    stdout/stderr of the process is cleaned up: the known "successful run" exception is removed
    when the exit code is 0, and the known "multiline error" exception is replaced with an
    explanatory message.

    Args:
        settings_content: text of the settings JSON to pass to the CLI.
        input_path: path of a file or directory to check, relative to the project root.
        quiet: when true, nothing is printed.

    Returns:
        The exit code of the `docker run` process (0 is reported as "no problems found").
    """
    docker_project_root = "/onedata-documentation"
    # The settings must be stored in a real file to be mountable into the container; the file
    # is removed automatically when the `with` block ends.
    with tempfile.NamedTemporaryFile(mode="a") as tmp_settings_file:
        tmp_settings_file.write(settings_content)
        # Flush the buffer, so that the container sees the complete content.
        tmp_settings_file.flush()
        process_args = [
            "docker",
            "run",
            "--rm",
            "-t",
            "-e",
            "TERM=xterm-256color",
            "-v",
            "{}:{}".format(os.getcwd(), docker_project_root),
            "-v",
            "{}:/settings.json".format(tmp_settings_file.name),
            LANGUAGETOOL_IMG,
            "--client-configuration=/settings.json",
            os.path.join(docker_project_root, input_path),
        ]
        result = subprocess.run(
            process_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            check=False,
        )
    # Undecodable bytes must not crash the wrapper before the result is reported.
    output = result.stdout.decode("utf-8", errors="replace")
    if result.returncode == 0:
        output = RE_SUCCESS_EXCEPTION.sub("", output)
    elif RE_MULTILINE_ERROR_EXCEPTION.search(output):
        output = RE_MULTILINE_ERROR_EXCEPTION.sub("", output)
        multiline_error_text = (
            "The latest language error spans accross mulitple lines. Due to "
            + "the bug in LTeX, the check stopped here. Fix the error and run check-language again."
        )
        output += "\n{}".format(color_text(multiline_error_text, TEXT_COLOR_RED))
    # Drop the trailing whitespace. This used to be `re.sub(r"\s*$", "", output, re.MULTILINE)`,
    # where the flag was passed as the positional `count` argument; the effective result was
    # the same as `rstrip()`.
    output = output.rstrip()
    if not quiet:
        if output != "":
            print(output)
        if result.returncode == 0:
            print(color_text("No language problems found.", TEXT_COLOR_GREEN))
        else:
            print(color_text("Some language problems have been found.", TEXT_COLOR_RED))
    return result.returncode
def main(args):
    """Prepare the settings for the parsed command-line `args` and run the language check.

    Returns the value returned by `exec_with_settings`, which the script uses as its exit code.
    """
    prepared_settings = create_settings_content(show_hints=args.show_hints)
    exit_code = exec_with_settings(prepared_settings, args.input_path, args.quiet)
    return exit_code
if __name__ == "__main__":
    # Parse the command line and pass the result of the check to the shell as the exit code.
    cli_args = create_arg_parser().parse_args()
    sys.exit(main(cli_args))