forked from hemanth/functional-programming-jargon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoc.py
338 lines (274 loc) · 9.7 KB
/
toc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
"""Generate generate table of contents within Markdown (.md) files."""
# Steps:
#
# 1. Open .md doc for reading; get the doc as a Python str
# 2. Build a generator of (level: int, header: str) for the found headers
# 3. Format & indent the found headers using []() Markdown style
# 4. Replace '<!---toc start-->', '<!---toc end-->', and text in between
# with the str result from (3.)
# 5. Overwrite the input file with the result
from __future__ import print_function
from __future__ import unicode_literals
import pathlib
import argparse
import collections
import os
import re
import sys
import requests
from xtermcolor import colorize
# Matches one TOC region: the start tag, everything up to the nearest end
# tag, and the end tag itself, plus any spaces/tabs directly around the tags.
# Group 1 is the text between the two tags.  re.DOTALL lets "." cross
# newlines; the non-greedy "(.*?)" stops at the first end tag it meets.
TOC_PAT = re.compile(
    r"[ \t]*<\!---toc start-->(.*?)<\!---toc end-->[ \t]*",
    flags=re.DOTALL | re.M,
)
# Pattern needs to be careful for URLs containing parentheses,
# nested within the actual Markdown link's parentheses.
# I.e.
# [Text here](https://www.cool.com/this(-is-a)-link.html)
#
# Group 1 is the link text and group 2 is the URL.  Only one level of
# nested parentheses inside the URL is accepted.
MD_LINK_PAT = re.compile(
    r"\[([^\[\]]+)\]"  # Brackets containing non-bracket characters
    r"\((([^\s)(]|\([^\s)(]*\))*)\)",  # Outer parentheses
    re.M,
)
# A Markdown "atx-style" header, GitHub-flavored.
# See https://github.github.com/gfm/#atx-heading
# Group 1 is the run of 1-6 "#" characters (its length is the header level);
# group 2 is the rest of the line, still unformatted.  Up to three leading
# whitespace characters are tolerated before the hashes.
HEADER_PAT = re.compile(r"^\s{,3}(#{1,6})\s+(.*)")
# Used in _strip() - see docstring.
# NOTE: GitHub seems to get its own logic wrong on this one.
# It will render:
# - "### foo \###" as "foo ###"
# - "## foo #\##" as "foo ###"
# Go figure. We take the rule at its word.
# The first alternative removes a trailing run of spaces/tabs/hashes unless
# the run starts right after a backslash; the second removes a leading run.
STRIP_CANDIDATE_PAT = re.compile(r"(?<!\\)[ \t#]+$|^[ \t#]+")
# Bold & italics get dropped from links
# NOTE(review): inside the square brackets, "(?<!\\)" is NOT a lookbehind;
# a character class treats those characters literally, so the class simply
# excludes "(", "?", "<", "!", "\", ")" and the marker itself.  The
# lookbehinds outside the class do guard against backslash-escaped markers.
ITAL_PAT = re.compile(r"(?<!\\)_[^(?<!\\)_]+(?<!\\)_")
BOLD_PAT = re.compile(r"(?<!\\)\*[^(?<!\\)\*]+(?<!\\)\*")
class MarkdownError(Exception):
    """Signal that a Markdown document is malformed and cannot be parsed."""
def _strip(x, _sub=STRIP_CANDIDATE_PAT.sub):
    """Return `x` without its surrounding spaces, tabs, and hash signs.

    Every match of STRIP_CANDIDATE_PAT is replaced by the empty string.
    That pattern removes a leading run of spaces/tabs/``#`` unconditionally,
    and a trailing run unless the run begins right after a backslash, so a
    backslash-escaped hash sign at the end of the string is kept.

    `_sub` is the pattern's bound ``sub`` method, captured once as a
    default argument; callers are presumably not expected to pass it.
    """
    cleaned = _sub("", x)
    return cleaned
def _replace_ital_bold(s):
    """Return `s` with the markers of italic/bold spans removed.

    Each span found by ITAL_PAT, then by BOLD_PAT, is replaced throughout
    the string by the same text with leading/trailing ``_`` and ``*``
    characters stripped.
    """
    markers = "_*"
    for pattern in (ITAL_PAT, BOLD_PAT):
        # Collect the matched spans first; each one is then substituted
        # in the progressively updated string.
        spans = [hit.group(0) for hit in pattern.finditer(s)]
        for span in spans:
            s = s.replace(span, span.strip(markers))
    return s
def as_link(x):
    """Convert a Markdown header string into a relative URL fragment.

    The header is lower-cased, stripped of surrounding spaces/tabs/hashes
    (see `_strip`), whitespace runs become a single hyphen, and every
    character that is not a hyphen, word character, or whitespace is
    dropped.  Italic/bold markers are then removed, and a trailing run of
    hyphens is collapsed to one.

    Args:
        x: Raw header text (without the leading ``#`` level markers).

    Returns:
        The fragment text, without the leading ``#``.
    """
    hyphenated = re.sub(r"\s+", "-", _strip(x.lower()))
    res = re.sub(
        r"[^-\w\s]",
        "",
        hyphenated,
        flags=re.U,  # Python 2
    )
    # Slow route: check and remove bold & italic marks.
    # TODO: find a fast route for this.
    # We also cannot simply count occurrences of _ and *, since
    # they may be escaped by \
    res = _replace_ital_bold(res)
    # One more fix: if the resulting link ends with multiple hyphens,
    # make it just one.  Only the trailing run is touched (rstrip, not
    # strip) so that hyphens at the start of the link are preserved.
    if res.endswith("--"):
        res = res.rstrip("-") + "-"
    return res
def escape(x):
    """Backslash-escape the square brackets, '[' and ']', in `x`."""
    escaped = x
    for bracket in "[]":
        escaped = escaped.replace(bracket, "\\" + bracket)
    return escaped
def toc(md_string, indent="  "):
    """Take a Markdown string, return its table of contents (in Markdown).

    Every header found by `headers` becomes one ``* [text](#link)`` bullet,
    indented by ``level - 1`` copies of `indent`.

    Args:
        md_string: The full Markdown document.
        indent: Indentation added per header level.  It defaults to two
            spaces, the width of the ``* `` marker, because a bullet
            indented by less than that is rendered as a sibling of the
            previous bullet rather than as a nested item.

    Returns:
        The bullets joined by newlines (empty string if no headers).
    """
    entries = []
    n_seen = collections.defaultdict(int)
    for level, header in headers(md_string):
        # If we see the same header multiple times (regardless of level),
        # it should have a 1..n suffix on the end. The first occurrence
        # gets no suffix; others get 1..n.
        link = as_link(header)
        n = n_seen[link]
        n_seen[link] += 1
        if n > 0:
            link += "-" + str(n)
        entries.append(
            "{spaces}* [{header}](#{link})".format(
                spaces=indent * (level - 1),
                header=escape(_strip(header)),
                link=link,
            )
        )
    return "\n".join(entries)
def headers(md_string):
    """Generate (level: int, header: str) tuples from a Markdown string.

    The headers are not yet formatted (cleaned); they are only generated
    here via a match for what loosely describes an ATX header pattern.

    Lines inside fenced code blocks are skipped.  A fence is a line that,
    after at most three leading spaces, starts with three backticks or
    three tildes; the block stays open until a fence line using the same
    character is seen.

    NOTE: fence *length* is not compared, so a shorter closing fence is
    accepted as well.
    """
    fence_char = None  # "`" or "~" while inside a fenced block, else None
    for line in md_string.split("\n"):
        stripped = line.lstrip(" ")
        indented_by = len(line) - len(stripped)
        if indented_by <= 3 and stripped.startswith(("```", "~~~")):
            if fence_char is None:
                # Opening fence: start skipping.
                fence_char = stripped[0]
                continue
            if stripped[0] == fence_char:
                # Closing fence: a fence line can never be a header itself.
                fence_char = None
                continue
            # A fence of the other kind inside an open block is content.
        if fence_char is not None:
            continue
        match = HEADER_PAT.match(line)
        if match:
            yield len(match.group(1)), match.group(2)
def modify_and_write(path, outfile=None):
    """Write a table of contents for the Markdown file at `path`.

    The file is read as UTF-8 and its TOC is generated with `toc`.

    Args:
        path: Path of the Markdown file to read.
        outfile: Where the result goes.
            - ``None`` (default): overwrite `path` in place.
            - ``sys.stdout``: write only the TOC (not the whole document)
              to stdout; the toc tags are not required in this mode.
            - anything else: treated as the path of the file to write the
              full updated document to; `path` is left untouched.

    Raises:
        MarkdownError: when not writing to stdout and the document has no
            ``<!---toc start-->`` / ``<!---toc end-->`` pair, or has more
            than one pair.  Nothing is written in that case.
        OSError: if reading or writing a file fails.
    """
    write_stdout = outfile is sys.stdout
    markdown = pathlib.Path(path).read_text(encoding="utf-8")
    table_of_contents = toc(markdown)

    if write_stdout:
        outfile.write(table_of_contents + "\n")
        return

    replacement = "<!---toc start-->\n\n{}\n\n<!---toc end-->".format(
        table_of_contents
    )
    # Pass a callable rather than the string itself: a string replacement
    # is treated as a template, so backslashes in header text would be
    # interpreted as escapes or group references instead of inserted as-is.
    new_markdown, replacements = TOC_PAT.subn(
        lambda _match: replacement,
        markdown,
    )
    # If we couldn't find tags and 0 replacements were made, let user
    # know and raise.
    if not replacements:
        raise MarkdownError(
            "Document missing toc start/end tags.\n"
            "Add these delimiters to your Markdown file:\n\n"
            "\t<!---toc start-->\n"
            "\t<!---toc end-->\n\n"
            "Then, run:\n\n"
            "\t$ mdtoc %s" % path
        )
    if replacements > 1:
        raise MarkdownError(
            "Multiple toc start/end tag pairs detected."
            " Your Markdown file should include only one pair of tags"
        )

    if outfile is None:
        # Default
        outfile = path
    # Write to the requested destination, which is also the path reported
    # in the success message below.
    pathlib.Path(outfile).write_text(new_markdown, encoding="utf-8")
    print(
        colorize("Success: wrote TOC to {path}".format(path=outfile), ansi=22)
    )
def get_links(md_string):
    """Find links in a Markdown string.

    Yields a 4-tuple for every match of MD_LINK_PAT, in document order:
    text, URL, linenum, colnum.  `linenum` and `colnum` are 1-based and
    locate the first character of the link text (just after the ``[``).

    A ``\\r\\n`` pair counts as a single line break, as do a lone ``\\r``
    and a lone ``\\n``.
    """
    line_break = re.compile(r"\r\n|\r|\n")
    line = 1
    line_start = 0  # index of the first character of the current line
    scanned = 0  # everything before this index has been checked for breaks
    for m in MD_LINK_PAT.finditer(md_string):
        position = m.start(1)
        # Matches arrive in increasing order, so only the text since the
        # previous link needs scanning; the whole pass is linear.
        for brk in line_break.finditer(md_string, scanned, position):
            line += 1
            line_start = brk.end()
        scanned = position
        yield m.group(1), m.group(2), line, position - line_start + 1
_description = """
Generates table of contents for Markdown files.
The algorithm searches for the text blocks
between the delimiters:
<!---toc start--->
... anything ...
<!---toc end--->
The contents of the block are then replaced
by a table of contents.
"""
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Options: ``--version``, ``--check-links``, and the mutually exclusive
    pair ``--outfile`` / ``--stdout``; plus the positional
    ``markdown_file``.

    Returns the namespace produced by ``ArgumentParser.parse_args()``.
    """
    from mdtoc import __version__

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=_description,
        epilog="Created by Scott Frazer (https://github.com/scottfrazer).\n",
    )
    arg_parser.add_argument("--version", version=__version__, action="version")
    arg_parser.add_argument(
        "--check-links",
        help="find all hyperlinks and ensure that\nthey point to something valid",  # noqa
        action="store_true",
    )

    # Only one output destination may be chosen.
    destination = arg_parser.add_mutually_exclusive_group()
    destination.add_argument(
        "--outfile",
        help="instead of overwriting the input file,\n"
        "write to this file instead",
    )
    destination.add_argument(
        "--stdout",
        help="don't write or overwrite any file;\n"
        "just print the TOC (not full .md file) to stdout",
        action="store_true",
    )

    arg_parser.add_argument(
        "markdown_file",
        help="relative or abs. path of the Markdown\n(.md) file to overwrite",
    )
    namespace = arg_parser.parse_args()
    return namespace
def cli():
    """Command-line entry point.

    Parses the arguments, writes the table of contents via
    `modify_and_write`, and, with ``--check-links``, reports on every
    Markdown link in the input file:

    - ``#fragment`` links are checked against the document's own headers;
    - ``http://`` / ``https://`` links are fetched and their status code
      (or the request error) is printed;
    - anything else is reported as an unrecognized link type.

    Returns:
        1 if the file could not be processed; otherwise None.
    """
    request_timeout = 10  # seconds; keeps an unresponsive host from hanging the run

    args = parse_args()
    args.markdown_file = os.path.expanduser(args.markdown_file)
    if args.stdout:
        outfile = sys.stdout
    elif args.outfile:
        outfile = args.outfile
    else:
        outfile = None
    try:
        modify_and_write(args.markdown_file, outfile=outfile)
    except OSError:
        print(
            colorize(
                "Failed: " "Not found: {path}".format(path=args.markdown_file),
                ansi=1,
            )
        )
        return 1
    except MarkdownError as e:
        print(colorize("Failed: " + str(e), ansi=1))
        return 1
    if args.check_links:
        # Read as UTF-8, the same encoding modify_and_write uses, rather
        # than the platform default.
        contents = pathlib.Path(args.markdown_file).read_text(encoding="utf-8")
        valid_http_fragments = {
            "#" + as_link(h) for (_level, h) in headers(contents)
        }
        for text, link, line, col in get_links(contents):
            valid = "unrecognized link type"
            if link.startswith("#"):
                if link not in valid_http_fragments:
                    valid = colorize("INVALID", ansi=1)
                else:
                    valid = colorize("VALID", ansi=2)
            elif link.startswith(("http://", "https://")):
                try:
                    r = requests.get(link, timeout=request_timeout)
                except requests.RequestException as exc:
                    # One unreachable link must not abort the whole report.
                    valid = colorize("ERROR: {}".format(exc), ansi=1)
                else:
                    valid = "Response: {}".format(r.status_code)
            print(
                "Checking {line}:{col} [{text}]({link}) --> {valid} ".format(
                    text=colorize(text, ansi=3),
                    link=colorize(link, ansi=4),
                    line=line,
                    col=col,
                    valid=valid,
                )
            )
if __name__ == "__main__":
    # Script mode: build the TOC of ./readme.md and print it without its
    # first line (everything after the first newline).
    readme_text = pathlib.Path("readme.md").read_text(encoding="utf-8")
    full_toc = toc(readme_text)
    _first_line, _newline, remainder = full_toc.partition("\n")
    print(remainder)