Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Cache failures in url_preview handler #2669

Merged
merged 2 commits (branch names omitted in this capture) on
Nov 13, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 47 additions & 41 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from synapse.api.errors import (
SynapseError, Codes,
)
from synapse.util.logcontext import preserve_fn, make_deferred_yieldable
from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient
Expand Down Expand Up @@ -63,16 +64,15 @@ def __init__(self, hs, media_repo):

self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

# simple memory cache mapping urls to OG metadata
self.cache = ExpiringCache(
# memory cache mapping urls to an ObservableDeferred returning
# JSON-encoded OG metadata
self._cache = ExpiringCache(
cache_name="url_previews",
clock=self.clock,
# don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000,
)
self.cache.start()

self.downloads = {}
self._cache.start()

self._cleaner_loop = self.clock.looping_call(
self._expire_url_cache_data, 10 * 1000
Expand All @@ -94,6 +94,7 @@ def _async_render_GET(self, request):
else:
ts = self.clock.time_msec()

# XXX: we could move this into _do_preview if we wanted.
url_tuple = urlparse.urlsplit(url)
for entry in self.url_preview_url_blacklist:
match = True
Expand Down Expand Up @@ -126,47 +127,53 @@ def _async_render_GET(self, request):
Codes.UNKNOWN
)

# first check the memory cache - good to handle all the clients on this
# HS thundering away to preview the same URL at the same time.
og = self.cache.get(url)
if og:
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
return
# the in-memory cache:
# * ensures that only one request is active at a time
# * takes load off the DB for the thundering herds
# * also caches any failures (unlike the DB) so we don't keep
# requesting the same endpoint

observable = self._cache.get(url)

if not observable:
download = preserve_fn(self._do_preview)(
url, requester.user, ts,
)
observable = ObservableDeferred(
download,
consumeErrors=True
)
self._cache[url] = observable
else:
logger.info("Returning cached response")

og = yield make_deferred_yieldable(observable.observe())
respond_with_json_bytes(request, 200, og, send_cors=True)

# then check the URL cache in the DB (which will also provide us with
@defer.inlineCallbacks
def _do_preview(self, url, user, ts):
"""Check the db, and download the URL and build a preview

Args:
url (str):
user (str):
ts (int):

Returns:
Deferred[str]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
cache_result = yield self.store.get_url_cache(url, ts)
if (
cache_result and
cache_result["expires_ts"] > ts and
cache_result["response_code"] / 100 == 2
):
respond_with_json_bytes(
request, 200, cache_result["og"].encode('utf-8'),
send_cors=True
)
defer.returnValue(cache_result["og"])
return

# Ensure only one download for a given URL is active at a time
download = self.downloads.get(url)
if download is None:
download = self._download_url(url, requester.user)
download = ObservableDeferred(
download,
consumeErrors=True
)
self.downloads[url] = download

@download.addBoth
def callback(media_info):
del self.downloads[url]
return media_info
media_info = yield download.observe()

# FIXME: we should probably update our cache now anyway, so that
# even if the OG calculation raises, we don't keep hammering on the
# remote server. For now, leave it uncached to aid debugging OG
# calculation problems
media_info = yield self._download_url(url, user)

logger.debug("got media_info of '%s'" % media_info)

Expand Down Expand Up @@ -212,7 +219,7 @@ def callback(media_info):
# just rely on the caching on the master request to speed things up.
if 'og:image' in og and og['og:image']:
image_info = yield self._download_url(
_rebase_url(og['og:image'], media_info['uri']), requester.user
_rebase_url(og['og:image'], media_info['uri']), user
)

if _is_media(image_info['media_type']):
Expand All @@ -239,21 +246,20 @@ def callback(media_info):

logger.debug("Calculated OG for %s as %s" % (url, og))

# store OG in ephemeral in-memory cache
self.cache[url] = og
jsonog = json.dumps(og)

# store OG in history-aware DB cache
yield self.store.store_url_cache(
url,
media_info["response_code"],
media_info["etag"],
media_info["expires"] + media_info["created_ts"],
json.dumps(og),
jsonog,
media_info["filesystem_id"],
media_info["created_ts"],
)

respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
defer.returnValue(jsonog)

@defer.inlineCallbacks
def _download_url(self, url, user):
Expand Down