-
Notifications
You must be signed in to change notification settings - Fork 42
/
9-cleanup-markdown.coffee
118 lines (86 loc) · 4.25 KB
/
9-cleanup-markdown.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Normalise paragraph layout.
#
# The input is cut into paragraphs wherever two or more consecutive newlines
# occur. Empty paragraphs are dropped. Inside each remaining paragraph every
# line break (together with the spaces directly around it) becomes a single
# space, and the paragraph is terminated by exactly one blank line.
#
# string  - the text to normalise.
# Returns the normalised text; an empty string when no paragraph remains.
sanitizeParagraphs = (string) ->
  normalised = for block in string.split(/\n\n+/) when block.length > 0
    block.split(/[ ]*\n[ ]*/).join(' ') + '\n\n'
  normalised.join ''
# Clean-up pipeline: reads a Markdown document (with embedded HTML/MathML)
# from standard input, applies an ordered series of regex rewrites and prints
# the result to standard output. The order of the steps matters: later steps
# rely on the paragraph layout produced by sanitizeParagraphs.
text = require('fs').readFileSync('/dev/stdin').toString()

# Strip styles, classes, attributes
text = text.replace /\s*xmlns:mml="http:\/\/www.w3.org\/1998\/Math\/MathML"/gi, ''
text = text.replace /\s*align="left"/gi, ''
text = text.replace /\s*class="MsoTableGrid"/gi, ''
text = text.replace /\s*cellspacing="0"/gi, ''
text = text.replace /\s*cellpadding="0"/gi, ''
text = text.replace /\s*colspan="1"/gi, ''
text = text.replace /\s*rowspan="1"/gi, ''
# Drop "mso-*" and "text-width" declarations, then any style attribute left empty
text = text.replace /(mso-|text-width)[^;"']*(;[ ]*)?/gi, ''
text = text.replace /\s*style="[ ]*"/gi, ''

# Remove invisible unicode characters (soft hyphen, zero-width space/joiners, BOM)
text = text.replace /[\u00AD\u200B-\u200D\uFEFF]/g, ''

# Replace \< , \> and \$ by HTML entities so the escaped characters stay
# literal text instead of turning into raw markup.
text = text.replace /\\</g, '&lt;'
text = text.replace /\\>/g, '&gt;'
text = text.replace /\\\$/g, '&#36;'

# Make sure spaces in mi elements are preserved
text = text.replace /<mi[^>]*>[ \u00A0\n]+<\/mi>/g, '<mi> </mi>'
# But collapse multiple <mi> </mi> into one
text = text.replace /(<mi[^>]*> <\/mi>[ \n]*)+/g, '<mi> </mi>'

# Remove <a shape="rect">, usually found with comments
text = text.replace /<a shape="rect"[^>]*>(\s*<\/a>)?/gi, ''

# Convert escaped line breaks (backslash followed by newline) into paragraph breaks
text = text.replace /\\\n/g, '\n\n'

# Replace ellipsis by "..."
text = text.replace /\u2026/g, '...'

# Remove non-breaking spaces
# NOTE(review): the character is deleted, not turned into a regular space —
# confirm this is intended, since it joins the neighbouring words.
text = text.replace /\u00A0/g, ''

# Collapsing newlines before tags (but not if they start a paragraph)
text = text.replace /([^\n])\n</g, '$1 <'

# Collapse dotted lists: drop blank lines between consecutive "· " items
text = text.replace /^(\u00b7 (.+\n)+)\n*(?=\u00b7 )/mg, '$1'
# Convert dotted lists to Markdown lists
text = text.replace /^\u00b7 /mg, '- '
# Put list items in separate paragraphs
text = text.replace /^(?=[ ]*(-|\+|\*|\d+\.))/mg, '\n\n'

# Intermediate paragraph sanitation
text = sanitizeParagraphs text

# Replace <mo>-</mo> with unicode line when used on top of an expression
text = text.replace /<mo>-<\/mo>(\s*<\/mover>)/g, '<mo>─</mo>$1'
# Replace <mo>.</mo> <mo>.</mo> by ellipsis
text = text.replace /<mo>\.<\/mo>(\s*<mo>\.<\/mo>)+/g, '<mo>…</mo>'
# Replace <mo>.</mo> <mn> by <mn>.
text = text.replace /<mo>\.<\/mo>\s*<mn>/g, '<mn>.'
# Replace <mo>\^</mo> by <mo>^</mo>
text = text.replace /<mo>\\\^<\/mo>/g, '<mo>^</mo>'
# Remove spaces after angles ∠
text = text.replace /<mi>\u2220<\/mi>\s*<mi> <\/mi>/g, '<mi>∠</mi>'

# Make standard "(a, b)"-like string (without using OfficeMath's "braces") into an mfenced
text = text.replace /<mo>\(<\/mo>([\s\S]*?)<mo>\)<\/mo>/g, (match, content) ->
  # A match containing "math" spans more than one <math> element: leave it untouched.
  if content.indexOf('math') >= 0
    console.error 'Conversion into <mfenced> crossed <math> borders'
    return match
  # Each comma operator (with its surrounding space <mi>s) separates two <mrow>s.
  contentWithRows = content.replace /(<mi> <\/mi>\s*)*<mo>,<\/mo>(\s*<mi> <\/mi>)*/g, '</mrow> <mrow>'
  '<mfenced> <mrow>' + contentWithRows + '</mrow> </mfenced>'

# Put block math in separate paragraphs
text = text.replace /^\s*<math\s+display="block"\s*>(.*?)<\/math>/mgi, '\n\n<math display="block">$1</math>\n\n'
# Put inline math at the start of a paragraph explicitly in a paragraph tag
text = text.replace /(^|\n\n)([ ]*<math>[\s\S]*?)(?=\n\n)/gi, '\n\n<p>$2</p>'

# Strip bold images
text = text.replace /^\*+\s*(!\[\]\([^)]+\))\s*\*+/mgi, '$1'
text = text.replace /\*+(!\[\]\([^)]+\))\*+/gi, '$1'
# Images on separate paragraphs if they are followed by content
text = text.replace /(!\[\]\([^)]+\))[ ]*(?!\n\n)/g, '\n\n$1\n\n'

# Final paragraph sanitation (collapses the extra blank lines introduced above)
text = sanitizeParagraphs text

# Format HTML a bit: put every table, tbody, tr and td tag on its own line
text = text.replace /\n?[ ]*<table([^>]*)>[ ]*\n?/gi, '\n<table$1>\n'
text = text.replace /\n?[ ]*<\/table>[ ]*\n?/gi, '\n</table>\n'
text = text.replace /\n?[ ]*<tbody([^>]*)>[ ]*\n?/gi, '\n<tbody$1>\n'
text = text.replace /\n?[ ]*<\/tbody>[ ]*\n?/gi, '\n</tbody>\n'
text = text.replace /\n?[ ]*<tr([^>]*)>[ ]*\n?/gi, '\n<tr$1>\n'
text = text.replace /\n?[ ]*<\/tr>[ ]*\n?/gi, '\n</tr>\n'
text = text.replace /\n?[ ]*<td([^>]*)>[ ]*\n?/gi, '\n<td$1>\n'
text = text.replace /\n?[ ]*<\/td>[ ]*\n?/gi, '\n</td>\n'

console.log text