diff --git a/README.rst b/README.rst index 55ea7cf..98473c6 100644 --- a/README.rst +++ b/README.rst @@ -128,9 +128,9 @@ escape_underscores Defaults to ``True``. escape_misc - If set to ``False``, do not escape miscellaneous punctuation characters + If set to ``True``, escape miscellaneous punctuation characters that sometimes have Markdown significance in text. - Defaults to ``True``. + Defaults to ``False``. keep_inline_images_in Images are converted to their alt-text when the images are located inside diff --git a/markdownify/__init__.py b/markdownify/__init__.py index cd66a39..cc3d153 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -7,7 +7,8 @@ convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') -all_whitespace_re = re.compile(r'[\s]+') +all_whitespace_re = re.compile(r'[\t \r\n]+') +newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') html_heading_re = re.compile(r'h[1-6]') @@ -66,6 +67,23 @@ def _todict(obj): return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_')) +def should_remove_whitespace_inside(el): + """Return whether to remove whitespace immediately inside a block-level element.""" + if not el or not el.name: + return False + if html_heading_re.match(el.name) is not None: + return True + return el.name in ('p', 'blockquote', + 'ol', 'ul', 'li', + 'table', 'thead', 'tbody', 'tfoot', + 'tr', 'td', 'th') + + +def should_remove_whitespace_outside(el): + """Return whether to remove whitespace immediately outside a block-level element.""" + return should_remove_whitespace_inside(el) or (el and el.name == 'pre') + + class MarkdownConverter(object): class DefaultOptions: autolinks = True @@ -76,7 +94,7 @@ class DefaultOptions: default_title = False escape_asterisks = True escape_underscores = True - escape_misc = True + escape_misc = False heading_style = UNDERLINED keep_inline_images_in = [] newline_style = SPACES @@ -119,27 +137,23 
@@ def process_tag(self, node, convert_as_inline, children_only=False): if not children_only and (isHeading or isCell): convert_children_as_inline = True - # Remove whitespace-only textnodes in purely nested nodes - def is_nested_node(el): - return el and el.name in ['ol', 'ul', 'li', - 'table', 'thead', 'tbody', 'tfoot', - 'tr', 'td', 'th'] - - if is_nested_node(node): - for el in node.children: - # Only extract (remove) whitespace-only text node if any of the - # conditions is true: - # - el is the first element in its parent - # - el is the last element in its parent - # - el is adjacent to an nested node - can_extract = (not el.previous_sibling - or not el.next_sibling - or is_nested_node(el.previous_sibling) - or is_nested_node(el.next_sibling)) - if (isinstance(el, NavigableString) - and six.text_type(el).strip() == '' - and can_extract): - el.extract() + # Remove whitespace-only textnodes just before, after or + # inside block-level elements. + should_remove_inside = should_remove_whitespace_inside(node) + for el in node.children: + # Only extract (remove) whitespace-only text node if any of the + # conditions is true: + # - el is the first element in its parent (block-level) + # - el is the last element in its parent (block-level) + # - el is adjacent to a block-level node + can_extract = (should_remove_inside and (not el.previous_sibling + or not el.next_sibling) + or should_remove_whitespace_outside(el.previous_sibling) + or should_remove_whitespace_outside(el.next_sibling)) + if (isinstance(el, NavigableString) + and six.text_type(el).strip() == '' + and can_extract): + el.extract() # Convert the children first for el in node.children: @@ -148,7 +162,13 @@ def is_nested_node(el): elif isinstance(el, NavigableString): text += self.process_text(el) else: - text += self.process_tag(el, convert_children_as_inline) + text_strip = text.rstrip('\n') + newlines_left = len(text) - len(text_strip) + next_text = self.process_tag(el, convert_children_as_inline) + 
next_text_strip = next_text.lstrip('\n') + newlines_right = len(next_text) - len(next_text_strip) + newlines = '\n' * max(newlines_left, newlines_right) + text = text_strip + newlines + next_text_strip if not children_only: convert_fn = getattr(self, 'convert_%s' % node.name, None) @@ -162,18 +182,26 @@ def process_text(self, el): # normalize whitespace if we're not inside a preformatted element if not el.find_parent('pre'): - text = whitespace_re.sub(' ', text) + if self.options['wrap']: + text = all_whitespace_re.sub(' ', text) + else: + text = newline_whitespace_re.sub('\n', text) + text = whitespace_re.sub(' ', text) # escape special characters if we're not inside a preformatted or code element if not el.find_parent(['pre', 'code', 'kbd', 'samp']): text = self.escape(text) - # remove trailing whitespaces if any of the following condition is true: - # - current text node is the last node in li - # - current text node is followed by an embedded list - if (el.parent.name == 'li' - and (not el.next_sibling - or el.next_sibling.name in ['ul', 'ol'])): + # remove leading whitespace at the start or just after a + # block-level element; remove trailing whitespace at the end + # or just before a block-level element. 
+ if (should_remove_whitespace_outside(el.previous_sibling) + or (should_remove_whitespace_inside(el.parent) + and not el.previous_sibling)): + text = text.lstrip() + if (should_remove_whitespace_outside(el.next_sibling) + or (should_remove_whitespace_inside(el.parent) + and not el.next_sibling)): text = text.rstrip() return text @@ -208,20 +236,32 @@ def escape(self, text): if not text: return '' if self.options['escape_misc']: - text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text) - text = re.sub(r'([0-9])([.)])', r'\1\\\2', text) + text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text) + # A sequence of one or more consecutive '-', preceded and + # followed by whitespace or start/end of fragment, might + # be confused with an underline of a header, or with a + # list marker. + text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text) + # A sequence of up to six consecutive '#', preceded and + # followed by whitespace or start/end of fragment, might + # be confused with an ATX heading. + text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text) + # '.' or ')' preceded by up to nine digits might be + # confused with a list item. 
+ text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2', + text) if self.options['escape_asterisks']: text = text.replace('*', r'\*') if self.options['escape_underscores']: text = text.replace('_', r'\_') return text - def indent(self, text, level): - return line_beginning_re.sub('\t' * level, text) if text else '' + def indent(self, text, columns): + return line_beginning_re.sub(' ' * columns, text) if text else '' def underline(self, text, pad_char): text = (text or '').rstrip() - return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' + return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' def convert_a(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) @@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline): def convert_blockquote(self, el, text, convert_as_inline): if convert_as_inline: - return text + return ' ' + text.strip() + ' ' return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else '' @@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline): if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) + text = all_whitespace_re.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: - return '%s %s %s\n\n' % (hashes, text, hashes) - return '%s %s\n\n' % (hashes, text) + return '\n%s %s %s\n\n' % (hashes, text, hashes) + return '\n%s %s\n\n' % (hashes, text) def convert_hr(self, el, text, convert_as_inline): return '\n\n---\n\n' @@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline): el = el.parent if nested: # remove trailing newline if nested - return '\n' + self.indent(text, 1).rstrip() - return text + ('\n' if before_paragraph else '') + return '\n' + text.rstrip() + return '\n\n' + text + ('\n' if before_paragraph else '') convert_ul = convert_list convert_ol = convert_list @@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline): el = el.parent bullets = 
self.options['bullets'] bullet = bullets[depth % len(bullets)] - return '%s %s\n' % (bullet, (text or '').strip()) + bullet = bullet + ' ' + text = (text or '').strip() + text = self.indent(text, len(bullet)) + if text: + text = bullet + text[len(bullet):] + return '%s\n' % text def convert_p(self, el, text, convert_as_inline): if convert_as_inline: - return text + return ' ' + text.strip() + ' ' if self.options['wrap']: - text = fill(text, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) - return '%s\n\n' % text if text else '' + # Preserve newlines (and preceding whitespace) resulting + # from <br> tags. Newlines in the input have already been + # replaced by spaces. + lines = text.split('\n') + new_lines = [] + for line in lines: + line = line.lstrip() + line_no_trailing = line.rstrip() + trailing = line[len(line_no_trailing):] + line = fill(line, + width=self.options['wrap_width'], + break_long_words=False, + break_on_hyphens=False) + new_lines.append(line + trailing) + text = '\n'.join(new_lines) + return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): if not text: diff --git a/pyproject.toml b/pyproject.toml index c0d1ce6..51604d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "markdownify" -version = "0.13.1" +version = "0.14.0" authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}] description = "Convert HTML to markdown." readme = "README.rst" diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 14bf3cd..a3a5fda 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -14,7 +14,7 @@ def test_chomp(): def test_nested(): text = md('

This is an example link.

') - assert text == 'This is an [example link](http://example.com/).\n\n' + assert text == '\n\nThis is an [example link](http://example.com/).\n\n' def test_ignore_comments(): diff --git a/tests/test_basic.py b/tests/test_basic.py index bf25ee0..66f8b6c 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -11,3 +11,4 @@ def test_soup(): def test_whitespace(): assert md(' a b \t\t c ') == ' a b c ' + assert md(' a b \n\n c ') == ' a b\nc ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index a35b982..0be1d0c 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE def inline_tests(tag, markup): @@ -66,7 +66,7 @@ def test_blockquote_with_paragraph(): def test_blockquote_nested(): text = md('
And she was like
Hello
') - assert text == '\n> And she was like \n> > Hello\n\n' + assert text == '\n> And she was like\n> > Hello\n\n' def test_br(): @@ -112,36 +112,39 @@ def test_em(): def test_header_with_space(): - assert md('

\n\nHello

') == '### Hello\n\n' - assert md('

\n\nHello

') == '#### Hello\n\n' - assert md('
\n\nHello
') == '##### Hello\n\n' - assert md('
\n\nHello\n\n
') == '##### Hello\n\n' - assert md('
\n\nHello \n\n
') == '##### Hello\n\n' + assert md('

\n\nHello

') == '\n### Hello\n\n' + assert md('

Hello\n\n\nWorld

') == '\n### Hello World\n\n' + assert md('

\n\nHello

') == '\n#### Hello\n\n' + assert md('
\n\nHello
') == '\n##### Hello\n\n' + assert md('
\n\nHello\n\n
') == '\n##### Hello\n\n' + assert md('
\n\nHello \n\n
') == '\n##### Hello\n\n' def test_h1(): - assert md('

Hello

') == 'Hello\n=====\n\n' + assert md('

Hello

') == '\n\nHello\n=====\n\n' def test_h2(): - assert md('

Hello

') == 'Hello\n-----\n\n' + assert md('

Hello

') == '\n\nHello\n-----\n\n' def test_hn(): - assert md('

Hello

') == '### Hello\n\n' - assert md('

Hello

') == '#### Hello\n\n' - assert md('
Hello
') == '##### Hello\n\n' - assert md('
Hello
') == '###### Hello\n\n' + assert md('

Hello

') == '\n### Hello\n\n' + assert md('

Hello

') == '\n#### Hello\n\n' + assert md('
Hello
') == '\n##### Hello\n\n' + assert md('
Hello
') == '\n###### Hello\n\n' def test_hn_chained(): - assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n' - assert md('X

First

', heading_style=ATX) == 'X# First\n\n' + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n' + assert md('X

First

', heading_style=ATX) == 'X\n# First\n\n' + assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n# First #\n\n' + assert md('X

First

') == 'X\n\nFirst\n=====\n\n' def test_hn_nested_tag_heading_style(): - assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' - assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' + assert md('

A

P

C

', heading_style=ATX_CLOSED) == '\n# A P C #\n\n' + assert md('

A

P

C

', heading_style=ATX) == '\n# A P C\n\n' def test_hn_nested_simple_tag(): @@ -157,12 +160,12 @@ def test_hn_nested_simple_tag(): ] for tag, markdown in tag_to_markdown: - assert md('

A <' + tag + '>' + tag + ' B

') == '### A ' + markdown + ' B\n\n' + assert md('

A <' + tag + '>' + tag + ' B

') == '\n### A ' + markdown + ' B\n\n' - assert md('

A
B

', heading_style=ATX) == '### A B\n\n' + assert md('

A
B

', heading_style=ATX) == '\n### A B\n\n' # Nested lists not supported - # assert md('

A

', heading_style=ATX) == '### A li1 li2 B\n\n' + # assert md('

A

', heading_style=ATX) == '\n### A li1 li2 B\n\n' def test_hn_nested_img(): @@ -172,18 +175,18 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

A B

') == '### A ' + markdown + ' B\n\n' - assert md('

A B

', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' + assert md('

A B

') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' + assert md('

A B

', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' def test_hn_atx_headings(): - assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' - assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '\n# Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '\n## Hello\n\n' def test_hn_atx_closed_headings(): - assert md('

Hello

', heading_style=ATX_CLOSED) == '# Hello #\n\n' - assert md('

Hello

', heading_style=ATX_CLOSED) == '## Hello ##\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '\n# Hello #\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n' def test_head(): @@ -193,7 +196,7 @@ def test_head(): def test_hr(): assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('Hello
World') == 'Hello\n\n---\n\nWorld' - assert md('

Hello

\n
\n

World

') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' + assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n---\n\nWorld\n\n' def test_i(): @@ -210,12 +213,23 @@ def test_kbd(): def test_p(): - assert md('

hello

') == 'hello\n\n' - assert md('

123456789 123456789

') == '123456789 123456789\n\n' - assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '123456789\n123456789\n\n' - assert md('

Some long link

', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n' + assert md('

hello

') == '\n\nhello\n\n' + assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' + assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' + assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' + assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' def test_pre(): @@ -289,3 +303,13 @@ def callback(el): assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' + + +def test_spaces(): + assert md('

a b

c d

') == '\n\na b\n\nc d\n\n' + assert md('

a

') == '\n\n*a*\n\n' + assert md('test

again

') == 'test\n\nagain\n\n' + assert md('test
text
after') == 'test\n> text\n\nafter' + assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' + assert md(' """) == '\n\n* a\n* b\n* c\n' + assert md('') == '\n\n* first para\n \n second para\n* third para\n \n fourth para\n' def test_inline_ul(): - assert md('

foo

bar

') == 'foo\n\n* a\n* b\n\nbar\n\n' + assert md('

foo

bar

') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n' + assert md('foobaz') == 'foo\n\n* bar\n\nbaz' def test_nested_uls(): @@ -73,12 +77,12 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' + assert md(nested_uls) == '\n\n* 1\n + a\n - I\n - II\n - III\n + b\n + c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' + assert md(nested_uls, bullets='-') == '\n\n- 1\n - a\n - I\n - II\n - III\n - b\n - c\n- 2\n- 3\n' def test_li_text(): - assert md('') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' + assert md('') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' diff --git a/tests/test_tables.py b/tests/test_tables.py index 594e5bf..fc6eee6 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -242,7 +242,7 @@ def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' + assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson 
| 94 |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'