Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Use <meta> tags to discover the per-page encoding of html previews (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkowl authored Nov 15, 2018
1 parent a51288e commit df758e1
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 9 deletions.
1 change: 1 addition & 0 deletions changelog.d/4183.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
URL previews now correctly decode non-UTF-8 text if the header contains a `<meta http-equiv="Content-Type"` header.
31 changes: 22 additions & 9 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@

logger = logging.getLogger(__name__)

_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)


class PreviewUrlResource(Resource):
isLeaf = True
Expand Down Expand Up @@ -223,15 +226,25 @@ def _do_preview(self, url, user, ts):
with open(media_info['filename'], 'rb') as file:
body = file.read()

# clobber the encoding from the content-type, or default to utf-8
# XXX: this overrides any <meta/> or XML charset headers in the body
# which may pose problems, but so far seems to work okay.
match = re.match(
r'.*; *charset="?(.*?)"?(;|$)',
media_info['media_type'],
re.I
)
encoding = match.group(1) if match else "utf-8"
encoding = None

# Let's try and figure out if it has an encoding set in a meta tag.
# Limit it to the first 1kb, since it ought to be in the meta tags
# at the top.
match = _charset_match.search(body[:1000])

# If we find a match, it should take precedence over the
# Content-Type header, so set it here.
if match:
encoding = match.group(1).decode('ascii')

# If we don't find a match, we'll look at the HTTP Content-Type, and
# if that doesn't exist, we'll fall back to UTF-8.
if not encoding:
match = _content_type_match.match(
media_info['media_type']
)
encoding = match.group(1) if match else "utf-8"

og = decode_and_calc_og(body, media_info['uri'], encoding)

Expand Down
77 changes: 77 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,80 @@ def test_cache_returns_correct_type(self):
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)

def test_non_ascii_preview_httpequiv(self):

request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# We've made one fetch
self.assertEqual(len(self.fetches), 1)

end_content = (
b'<html><head>'
b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
b'<meta property="og:title" content="\xe4\xea\xe0" />'
b'<meta property="og:description" content="hi" />'
b'</head></html>'
)

self.fetches[0][0].callback(
(
end_content,
(
len(end_content),
{
b"Content-Length": [b"%d" % (len(end_content))],
# This charset=utf-8 should be ignored, because the
# document has a meta tag overriding it.
b"Content-Type": [b'text/html; charset="utf8"'],
},
"https://example.com",
200,
),
)
)

self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")

def test_non_ascii_preview_content_type(self):

request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# We've made one fetch
self.assertEqual(len(self.fetches), 1)

end_content = (
b'<html><head>'
b'<meta property="og:title" content="\xe4\xea\xe0" />'
b'<meta property="og:description" content="hi" />'
b'</head></html>'
)

self.fetches[0][0].callback(
(
end_content,
(
len(end_content),
{
b"Content-Length": [b"%d" % (len(end_content))],
b"Content-Type": [b'text/html; charset="windows-1251"'],
},
"https://example.com",
200,
),
)
)

self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")

0 comments on commit df758e1

Please sign in to comment.