Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Use <meta> tags to discover the per-page encoding of html previews #4183

Merged
merged 4 commits into from
Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/4183.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
URL previews now correctly decode non-UTF-8 text if the header contains a `<meta http-equiv="Content-Type"` header.
31 changes: 22 additions & 9 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@

logger = logging.getLogger(__name__)

_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Obligatory reference to https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#1732454 here.

(yeah, I don't know how you're supposed to parse the HTML before you know what encoding it is, either)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, when I was coming up with this, I had two solutions:

  1. Decode as latin1 (which will decode any 8 bit stream) or ascii with errors set to ignore
  2. Regex looking for a <meta beginning block and then charset=<foo>, which is likely to be in a http-equiv="Content-Type".

Since I consider this best-effort (correctly configured servers ought to be serving the correct header in the Content-Type), I decided a regex was better than parsing it, then parsing it again. I chose not to check for the http equiv part since it can go before or after the content="charset=utf8" bit and make the regex far more complex.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fully agreed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's only a 2/10 on the HE COMES scale :)

_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)


class PreviewUrlResource(Resource):
isLeaf = True
Expand Down Expand Up @@ -223,15 +226,25 @@ def _do_preview(self, url, user, ts):
with open(media_info['filename'], 'rb') as file:
body = file.read()

# clobber the encoding from the content-type, or default to utf-8
# XXX: this overrides any <meta/> or XML charset headers in the body
# which may pose problems, but so far seems to work okay.
match = re.match(
r'.*; *charset="?(.*?)"?(;|$)',
media_info['media_type'],
re.I
)
encoding = match.group(1) if match else "utf-8"
encoding = None

# Let's try and figure out if it has an encoding set in a meta tag.
# Limit it to the first 1kb, since it ought to be in the meta tags
# at the top.
match = _charset_match.search(body[:1000])

# If we find a match, it should take precedence over the
# Content-Type header, so set it here.
if match:
encoding = match.group(1).decode('ascii')

# If we don't find a match, we'll look at the HTTP Content-Type, and
# if that doesn't exist, we'll fall back to UTF-8.
if not encoding:
match = _content_type_match.match(
media_info['media_type']
)
encoding = match.group(1) if match else "utf-8"

og = decode_and_calc_og(body, media_info['uri'], encoding)

Expand Down
77 changes: 77 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,80 @@ def test_cache_returns_correct_type(self):
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)

def test_non_ascii_preview_httpequiv(self):

request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# We've made one fetch
self.assertEqual(len(self.fetches), 1)

end_content = (
b'<html><head>'
b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
b'<meta property="og:title" content="\xe4\xea\xe0" />'
b'<meta property="og:description" content="hi" />'
b'</head></html>'
)

self.fetches[0][0].callback(
(
end_content,
(
len(end_content),
{
b"Content-Length": [b"%d" % (len(end_content))],
# This charset=utf-8 should be ignored, because the
# document has a meta tag overriding it.
b"Content-Type": [b'text/html; charset="utf8"'],
},
"https://example.com",
200,
),
)
)

self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")

def test_non_ascii_preview_content_type(self):

request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# We've made one fetch
self.assertEqual(len(self.fetches), 1)

end_content = (
b'<html><head>'
b'<meta property="og:title" content="\xe4\xea\xe0" />'
b'<meta property="og:description" content="hi" />'
b'</head></html>'
)

self.fetches[0][0].callback(
(
end_content,
(
len(end_content),
{
b"Content-Length": [b"%d" % (len(end_content))],
b"Content-Type": [b'text/html; charset="windows-1251"'],
},
"https://example.com",
200,
),
)
)

self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")