From 446a8eb0733835fa2b2e61b8e311a02f9325cf00 Mon Sep 17 00:00:00 2001 From: Scott Blackburn Date: Fri, 10 Apr 2015 13:25:39 -0700 Subject: [PATCH] Retain escaping of html except within code or pre tags. --- html2text/__init__.py | 14 +++++++++++--- test/html-escaping.html | 3 +++ test/html-escaping.md | 8 ++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 test/html-escaping.html create mode 100644 test/html-escaping.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 2ee872f..48d4454 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,6 +3,7 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re +import cgi try: from textwrap import wrap @@ -160,10 +161,16 @@ def close(self): return outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -351,6 +358,7 @@ def handle_tag(self, tag, attrs, start): if tag in ["code", "tt"] and not self.pre: self.o('`') # TODO: `` `this` `` + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None @@ -416,7 +424,7 @@ def handle_tag(self, tag, attrs, start): else: self.o("[") self.maybe_automatic_link = None - self.empty_link = False + self.empty_link = False # If we have images_to_alt, we discard the image itself, # considering only the alt text. diff --git a/test/html-escaping.html b/test/html-escaping.html new file mode 100644 index 0000000..b6f1da7 --- /dev/null +++ b/test/html-escaping.html @@ -0,0 +1,3 @@ +

Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output

+
...unless that escaped HTML is in a &lt;pre&gt; tag
+...or a &lt;code&gt; tag \ No newline at end of file diff --git a/test/html-escaping.md b/test/html-escaping.md new file mode 100644 index 0000000..19e91ee --- /dev/null +++ b/test/html-escaping.md @@ -0,0 +1,8 @@ +Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output + + + + ...unless that escaped HTML is in a <pre> tag
+
+`...or a <code> tag`
+