Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Avoid storing URL cache files in storage providers #10911

Merged
merged 8 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/10911.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Avoid storing URL cache files in storage providers. Server admins may safely delete the `url_cache/` and `url_cache_thumbnails/` directories from any configured storage providers to reclaim space.
7 changes: 7 additions & 0 deletions docs/upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ process, for example:
dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
```

# Upgrading to v1.44.0

## The URL preview cache is no longer mirrored to storage providers
The `url_cache/` and `url_cache_thumbnails/` directories in the media store are
no longer mirrored to storage providers. These two directories can be safely
deleted from any configured storage providers to reclaim space.

# Upgrading to v1.43.0

## The spaces summary APIs can now be handled by workers
Expand Down
11 changes: 6 additions & 5 deletions synapse/rest/media/v1/filepath.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,23 +195,24 @@ def url_cache_thumbnail_rel(

url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)

def url_cache_thumbnail_directory(self, media_id: str) -> str:
def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why we're making this change, is it just for consistency?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to be so we can use it in tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, it's for the tests

# Media id is of the form <DATE><RANDOM_STRING>
# E.g.: 2017-09-28-fsdRDt24DS234dsf

if NEW_FORMAT_ID_RE.match(media_id):
return os.path.join(
self.base_path, "url_cache_thumbnails", media_id[:10], media_id[11:]
)
return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])
else:
return os.path.join(
self.base_path,
"url_cache_thumbnails",
media_id[0:2],
media_id[2:4],
media_id[4:],
)

url_cache_thumbnail_directory = _wrap_in_base_path(
url_cache_thumbnail_directory_rel
)

def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
"The dirs to try and remove if we delete the media_id thumbnails"
# Media id is of the form <DATE><RANDOM_STRING>
Expand Down
1 change: 0 additions & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,6 @@ def _start_expire_url_cache_data(self) -> Deferred:

async def _expire_url_cache_data(self) -> None:
"""Clean up expired url cache content, media and thumbnails."""
# TODO: Delete from backup media store

assert self._worker_run_media_background_jobs

Expand Down
10 changes: 10 additions & 0 deletions synapse/rest/media/v1/storage_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ async def store_file(self, path: str, file_info: FileInfo) -> None:
if file_info.server_name and not self.store_remote:
return None

if file_info.url_cache:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious why you chose to do this here instead of where we make the URL previews? My guess is to ensure we don't break this if we add another way of having URL previews?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A little bit, but mostly for existing code. The call hierarchy gets pretty messy above store_into_file and I found it difficult to figure out which paths might touch the url cache. And I didn't want to thread an extra boolean through _generate_thumbnails.

The choice between store_into_file and here was mostly arbitrary, though it seems like this is where we do the existing filtering based on FileInfo.

# The URL preview cache is short lived and not worth offloading or
# backing up.
return None

if self.store_synchronous:
# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
Expand All @@ -110,6 +115,11 @@ async def store() -> None:
run_in_background(store)

async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
if file_info.url_cache:
# Files in the URL preview cache definitely aren't stored here,
# so avoid any potentially slow I/O or network access.
return None

# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
return await maybe_awaitable(self.backend.fetch(path, file_info))
Expand Down
130 changes: 130 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from twisted.test.proto_helpers import AccumulatingProtocol

from synapse.config.oembed import OEmbedEndpointConfig
from synapse.util.stringutils import parse_and_validate_mxc_uri

from tests import unittest
from tests.server import FakeTransport
Expand Down Expand Up @@ -721,3 +722,132 @@ def test_oembed_format(self):
"og:description": "Content Preview",
},
)

def _download_image(self):
"""Downloads an image into the URL cache.

Returns:
A (host, media_id) tuple representing the MXC URI of the image.
"""
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

channel = self.make_request(
"GET",
"preview_url?url=http://cdn.twitter.com/matrixdotorg",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
% (len(SMALL_PNG),)
+ SMALL_PNG
)

self.pump()
self.assertEqual(channel.code, 200)
body = channel.json_body
mxc_uri = body["og:image"]
host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
self.assertIsNone(_port)
return host, media_id

def test_storage_providers_exclude_files(self):
"""Test that files are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
storage_provider_path = os.path.join(self.storage_path, rel_file_path)

# Check storage
self.assertTrue(os.path.isfile(media_store_path))
self.assertFalse(
os.path.isfile(storage_provider_path),
"URL cache file was unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

# Move cached file into the storage provider
os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
os.rename(media_store_path, storage_provider_path)

channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache file was unexpectedly retrieved from a storage provider",
)

def test_storage_providers_exclude_thumbnails(self):
"""Test that thumbnails are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_thumbnail_path = (
self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
)
media_store_thumbnail_path = os.path.join(
self.media_store_path, rel_thumbnail_path
)
storage_provider_thumbnail_path = os.path.join(
self.storage_path, rel_thumbnail_path
)

# Check storage
self.assertTrue(os.path.isdir(media_store_thumbnail_path))
self.assertFalse(
os.path.isdir(storage_provider_thumbnail_path),
"URL cache thumbnails were unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

# Remove the original, otherwise thumbnails will regenerate
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
os.remove(media_store_path)

# Move cached thumbnails into the storage provider
os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)

channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache thumbnail was unexpectedly retrieved from a storage provider",
)