diff --git a/README.rst b/README.rst
index 55ea7cf..98473c6 100644
--- a/README.rst
+++ b/README.rst
@@ -128,9 +128,9 @@ escape_underscores
Defaults to ``True``.
escape_misc
- If set to ``False``, do not escape miscellaneous punctuation characters
+ If set to ``True``, escape miscellaneous punctuation characters
that sometimes have Markdown significance in text.
- Defaults to ``True``.
+ Defaults to ``False``.
keep_inline_images_in
Images are converted to their alt-text when the images are located inside
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index cd66a39..cc3d153 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -7,7 +7,8 @@
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\t ]+')
-all_whitespace_re = re.compile(r'[\s]+')
+all_whitespace_re = re.compile(r'[\t \r\n]+')
+newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')
@@ -66,6 +67,23 @@ def _todict(obj):
return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
+def should_remove_whitespace_inside(el):
+ """Return whether to remove whitespace immediately inside a block-level element."""
+ if not el or not el.name:
+ return False
+ if html_heading_re.match(el.name) is not None:
+ return True
+ return el.name in ('p', 'blockquote',
+ 'ol', 'ul', 'li',
+ 'table', 'thead', 'tbody', 'tfoot',
+ 'tr', 'td', 'th')
+
+
+def should_remove_whitespace_outside(el):
+ """Return whether to remove whitespace immediately outside a block-level element."""
+ return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
+
+
class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
@@ -76,7 +94,7 @@ class DefaultOptions:
default_title = False
escape_asterisks = True
escape_underscores = True
- escape_misc = True
+ escape_misc = False
heading_style = UNDERLINED
keep_inline_images_in = []
newline_style = SPACES
@@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
if not children_only and (isHeading or isCell):
convert_children_as_inline = True
- # Remove whitespace-only textnodes in purely nested nodes
- def is_nested_node(el):
- return el and el.name in ['ol', 'ul', 'li',
- 'table', 'thead', 'tbody', 'tfoot',
- 'tr', 'td', 'th']
-
- if is_nested_node(node):
- for el in node.children:
- # Only extract (remove) whitespace-only text node if any of the
- # conditions is true:
- # - el is the first element in its parent
- # - el is the last element in its parent
- # - el is adjacent to an nested node
- can_extract = (not el.previous_sibling
- or not el.next_sibling
- or is_nested_node(el.previous_sibling)
- or is_nested_node(el.next_sibling))
- if (isinstance(el, NavigableString)
- and six.text_type(el).strip() == ''
- and can_extract):
- el.extract()
+ # Remove whitespace-only textnodes just before, after or
+ # inside block-level elements.
+ should_remove_inside = should_remove_whitespace_inside(node)
+ for el in node.children:
+ # Only extract (remove) whitespace-only text node if any of the
+ # conditions is true:
+ # - el is the first element in its parent (block-level)
+ # - el is the last element in its parent (block-level)
+ # - el is adjacent to a block-level node
+ can_extract = (should_remove_inside and (not el.previous_sibling
+ or not el.next_sibling)
+ or should_remove_whitespace_outside(el.previous_sibling)
+ or should_remove_whitespace_outside(el.next_sibling))
+ if (isinstance(el, NavigableString)
+ and six.text_type(el).strip() == ''
+ and can_extract):
+ el.extract()
# Convert the children first
for el in node.children:
@@ -148,7 +162,13 @@ def is_nested_node(el):
elif isinstance(el, NavigableString):
text += self.process_text(el)
else:
- text += self.process_tag(el, convert_children_as_inline)
+ text_strip = text.rstrip('\n')
+ newlines_left = len(text) - len(text_strip)
+ next_text = self.process_tag(el, convert_children_as_inline)
+ next_text_strip = next_text.lstrip('\n')
+ newlines_right = len(next_text) - len(next_text_strip)
+ newlines = '\n' * max(newlines_left, newlines_right)
+ text = text_strip + newlines + next_text_strip
if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -162,18 +182,26 @@ def process_text(self, el):
# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
- text = whitespace_re.sub(' ', text)
+ if self.options['wrap']:
+ text = all_whitespace_re.sub(' ', text)
+ else:
+ text = newline_whitespace_re.sub('\n', text)
+ text = whitespace_re.sub(' ', text)
# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
text = self.escape(text)
- # remove trailing whitespaces if any of the following condition is true:
- # - current text node is the last node in li
- # - current text node is followed by an embedded list
- if (el.parent.name == 'li'
- and (not el.next_sibling
- or el.next_sibling.name in ['ul', 'ol'])):
+ # remove leading whitespace at the start or just after a
+ # block-level element; remove trailing whitespace at the end
+ # or just before a block-level element.
+ if (should_remove_whitespace_outside(el.previous_sibling)
+ or (should_remove_whitespace_inside(el.parent)
+ and not el.previous_sibling)):
+ text = text.lstrip()
+ if (should_remove_whitespace_outside(el.next_sibling)
+ or (should_remove_whitespace_inside(el.parent)
+ and not el.next_sibling)):
text = text.rstrip()
return text
@@ -208,20 +236,32 @@ def escape(self, text):
if not text:
return ''
if self.options['escape_misc']:
- text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
- text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
+ text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
+ # A sequence of one or more consecutive '-', preceded and
+ # followed by whitespace or start/end of fragment, might
+ # be confused with an underline of a header, or with a
+ # list marker.
+ text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
+ # A sequence of up to six consecutive '#', preceded and
+ # followed by whitespace or start/end of fragment, might
+ # be confused with an ATX heading.
+ text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
+ # '.' or ')' preceded by up to nine digits might be
+ # confused with a list item.
+ text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
+ text)
if self.options['escape_asterisks']:
text = text.replace('*', r'\*')
if self.options['escape_underscores']:
text = text.replace('_', r'\_')
return text
- def indent(self, text, level):
- return line_beginning_re.sub('\t' * level, text) if text else ''
+ def indent(self, text, columns):
+ return line_beginning_re.sub(' ' * columns, text) if text else ''
def underline(self, text, pad_char):
text = (text or '').rstrip()
- return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+ return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
def convert_a(self, el, text, convert_as_inline):
prefix, suffix, text = chomp(text)
@@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline):
def convert_blockquote(self, el, text, convert_as_inline):
if convert_as_inline:
- return text
+ return ' ' + text.strip() + ' '
return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
@@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
return self.underline(text, line)
+ text = all_whitespace_re.sub(' ', text)
hashes = '#' * n
if style == ATX_CLOSED:
- return '%s %s %s\n\n' % (hashes, text, hashes)
- return '%s %s\n\n' % (hashes, text)
+ return '\n%s %s %s\n\n' % (hashes, text, hashes)
+ return '\n%s %s\n\n' % (hashes, text)
def convert_hr(self, el, text, convert_as_inline):
return '\n\n---\n\n'
@@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline):
el = el.parent
if nested:
# remove trailing newline if nested
- return '\n' + self.indent(text, 1).rstrip()
- return text + ('\n' if before_paragraph else '')
+ return '\n' + text.rstrip()
+ return '\n\n' + text + ('\n' if before_paragraph else '')
convert_ul = convert_list
convert_ol = convert_list
@@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline):
el = el.parent
bullets = self.options['bullets']
bullet = bullets[depth % len(bullets)]
- return '%s %s\n' % (bullet, (text or '').strip())
+ bullet = bullet + ' '
+ text = (text or '').strip()
+ text = self.indent(text, len(bullet))
+ if text:
+ text = bullet + text[len(bullet):]
+ return '%s\n' % text
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
- return text
+ return ' ' + text.strip() + ' '
if self.options['wrap']:
- text = fill(text,
- width=self.options['wrap_width'],
- break_long_words=False,
- break_on_hyphens=False)
- return '%s\n\n' % text if text else ''
+ # Preserve newlines (and preceding whitespace) resulting
+ # from <br> tags. Newlines in the input have already been
+ # replaced by spaces.
+ lines = text.split('\n')
+ new_lines = []
+ for line in lines:
+ line = line.lstrip()
+ line_no_trailing = line.rstrip()
+ trailing = line[len(line_no_trailing):]
+ line = fill(line,
+ width=self.options['wrap_width'],
+ break_long_words=False,
+ break_on_hyphens=False)
+ new_lines.append(line + trailing)
+ text = '\n'.join(new_lines)
+ return '\n\n%s\n\n' % text if text else ''
def convert_pre(self, el, text, convert_as_inline):
if not text:
diff --git a/pyproject.toml b/pyproject.toml
index c0d1ce6..51604d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "markdownify"
-version = "0.13.1"
+version = "0.14.0"
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
description = "Convert HTML to markdown."
readme = "README.rst"
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index 14bf3cd..a3a5fda 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -14,7 +14,7 @@ def test_chomp():
def test_nested():
text = md('
This is an example link.
') - assert text == 'This is an [example link](http://example.com/).\n\n' + assert text == '\n\nThis is an [example link](http://example.com/).\n\n' def test_ignore_comments(): diff --git a/tests/test_basic.py b/tests/test_basic.py index bf25ee0..66f8b6c 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -11,3 +11,4 @@ def test_soup(): def test_whitespace(): assert md(' a b \t\t c ') == ' a b c ' + assert md(' a b \n\n c ') == ' a b\nc ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index a35b982..0be1d0c 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE def inline_tests(tag, markup): @@ -66,7 +66,7 @@ def test_blockquote_with_paragraph(): def test_blockquote_nested(): text = md('And she was like') - assert text == '\n> And she was like \n> > Hello\n\n' + assert text == '\n> And she was like\n> > Hello\n\n' def test_br(): @@ -112,36 +112,39 @@ def test_em(): def test_header_with_space(): - assert md('Hello
P
C ', heading_style=ATX_CLOSED) == '# A P C #\n\n' - assert md('P
C ', heading_style=ATX) == '# A P C\n\n' + assert md('P
C ', heading_style=ATX_CLOSED) == '\n# A P C #\n\n' + assert md('P
C ', heading_style=ATX) == '\n# A P C\n\n' def test_hn_nested_simple_tag(): @@ -157,12 +160,12 @@ def test_hn_nested_simple_tag(): ] for tag, markdown in tag_to_markdown: - assert md('Hello
\nWorld
') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' + assert md('Hello
\nWorld
') == '\n\nHello\n\n---\n\nWorld\n\n' def test_i(): @@ -210,12 +213,23 @@ def test_kbd(): def test_p(): - assert md('hello
') == 'hello\n\n' - assert md('123456789 123456789
') == '123456789 123456789\n\n' - assert md('123456789 123456789
', wrap=True, wrap_width=10) == '123456789\n123456789\n\n' - assert md('', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n' - assert md('12345
67890
12345678901
12345
hello
') == '\n\nhello\n\n' + assert md('123456789 123456789
') == '\n\n123456789 123456789\n\n' + assert md('123456789\n\n\n123456789
') == '\n\n123456789\n123456789\n\n' + assert md('123456789\n\n\n123456789
', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' + assert md('123456789 123456789
', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' + assert md('', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' + assert md('12345
67890
12345
67890
12345
67890
12345
67890
12345678901
12345
12345678901
12345
12345678901
12345
12345678901
12345
1234 5678 9012
67890
1234 5678 9012
67890
Second
Third
Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' def test_pre(): @@ -289,3 +303,13 @@ def callback(el): assert md('test\n foo\nbar', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n' assert md('
test\n foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
assert md('test\n foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
+
+
+def test_spaces():
+ assert md('a b
c d
') == '\n\na b\n\nc d\n\n' + assert md('a
') == '\n\n*a*\n\n' + assert md('testagain
') == 'test\n\nagain\n\n' + assert md('testtextafter') == 'test\n> text\n\nafter' + assert md('
foobar') == 'test\n```\n foo \n```\nbar' diff --git a/tests/test_escaping.py b/tests/test_escaping.py index eaef77d..878760a 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -1,3 +1,5 @@ +import warnings +from bs4 import MarkupResemblesLocatorWarning from markdownify import markdownify as md @@ -12,7 +14,7 @@ def test_underscore(): def test_xml_entities(): - assert md('&') == r'\&' + assert md('&', escape_misc=True) == r'\&' def test_named_entities(): @@ -25,23 +27,49 @@ def test_hexadecimal_entities(): def test_single_escaping_entities(): - assert md('&') == r'\&' - - -def text_misc(): - assert md('\\*') == r'\\\*' - assert md('
first para
second para
third para
fourth para
first para
second para
third para
fourth para
foo
bar
') == 'foo\n\n* a\n* b\n\nbar\n\n' + assert md('foo
bar
') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n' + assert md('foo