Skip to content

Commit

Permalink
Merge pull request #3670 from rebeccacremona/wacz-download
Browse files Browse the repository at this point in the history
Tools for working with WACZs
  • Loading branch information
rebeccacremona authored Dec 4, 2024
2 parents 59c9064 + 432782b commit c02c37a
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 48 deletions.
32 changes: 24 additions & 8 deletions perma_web/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from perma.models import LinkUser, Folder, CaptureJob, Capture, Link, Organization, LinkBatch
from perma.utils import send_to_scoop

from .utils import get_mime_type, mime_type_lookup, reverse_api_view
from .utils import get_mime_type, mime_type_lookup, get_download_url

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -183,10 +183,24 @@ class LinkSerializer(BaseSerializer):
queue_time = serializers.SerializerMethodField()
capture_time = serializers.SerializerMethodField()
warc_download_url = serializers.SerializerMethodField()
wacz_download_url = serializers.SerializerMethodField()

# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Both a single-line and a multi-line `fields` tuple are present;
# presumably the single-line one is the removed version and the multi-line one,
# which additionally lists 'wacz_size' and 'wacz_download_url', is its
# replacement. Confirm against the real file.
class Meta:
model = Link
fields = ('guid', 'creation_timestamp', 'url', 'title', 'description', 'warc_size', 'warc_download_url', 'captures', 'queue_time', 'capture_time')
fields = (
'guid',
'creation_timestamp',
'url',
'title',
'description',
'warc_size',
'warc_download_url',
'wacz_size',
'wacz_download_url',
'captures',
'queue_time',
'capture_time'
)

def get_queue_time(self, link):
try:
Expand All @@ -203,9 +217,10 @@ def get_capture_time(self, link):
return None

# Serializer method supplying the value of the `warc_download_url` field.
# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Presumably the `if link.warc_size` / `reverse_api_view` / `return None`
# lines are the removed body and the `get_download_url(..., public=True)` call is
# the replacement. Confirm against the real file.
def get_warc_download_url(self, link):
if link.warc_size:
return reverse_api_view('public_archives_download', kwargs={'guid': link.guid}, request=self.context['request'])
return None
return get_download_url(self.context['request'], link, file_format='warc', public=True)

def get_wacz_download_url(self, link):
    """Return what ``get_download_url`` yields for ``link`` in WACZ format with ``public=True``."""
    request = self.context['request']
    return get_download_url(request, link, public=True, file_format='wacz')


class AuthenticatedLinkSerializer(LinkSerializer):
Expand All @@ -220,9 +235,10 @@ class Meta(LinkSerializer.Meta):
allowed_update_fields = ['submitted_title', 'submitted_description', 'notes', 'is_private', 'private_reason', 'default_to_screenshot_view']

# Serializer method supplying the value of the `warc_download_url` field.
# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Presumably the `if link.warc_size` / `reverse_api_view` / `return None`
# lines are the removed body and the `get_download_url(..., public=False)` call is
# the replacement. Confirm against the real file.
def get_warc_download_url(self, link):
if link.warc_size:
return reverse_api_view('archives_download', kwargs={'guid': link.guid}, request=self.context['request'])
return None
return get_download_url(self.context['request'], link, file_format='warc', public=False)

def get_wacz_download_url(self, link):
    """Return what ``get_download_url`` yields for ``link`` in WACZ format with ``public=False``."""
    request = self.context['request']
    return get_download_url(request, link, public=False, file_format='wacz')

def validate_url(self, url):
# Clean up the user submitted url
Expand Down
64 changes: 53 additions & 11 deletions perma_web/api/tests/test_link_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from django.conf import settings
from django.urls import reverse
from django.http import StreamingHttpResponse
from django.test.utils import override_settings

from mock import patch
Expand Down Expand Up @@ -67,9 +66,9 @@ def setUp(self):
self.public_link_download_url_for_private_link = reverse('api:public_archives_download', args=[self.unrelated_private_link.pk])

self.replaced_link_public_download_url = reverse('api:public_archives_download', args=['ABCD-0006'])
self.replaced_link_public_download_redirect_target = reverse('api:public_archives_download', args=['3SLN-JHX9'])
self.replaced_link_public_download_redirect_target = f"{reverse('api:public_archives_download', args=['3SLN-JHX9'])}?file_format=warc"
self.replaced_link_authed_download_url = reverse('api:archives_download', args=['ABCD-0006'])
self.replaced_link_authed_download_redirect_target = reverse('api:archives_download', args=['3SLN-JHX9'])
self.replaced_link_authed_download_redirect_target = f"{reverse('api:archives_download', args=['3SLN-JHX9'])}?file_format=warc"
self.replaced_link_owner = LinkUser.objects.get(id=4)

self.logged_out_fields = [
Expand All @@ -81,6 +80,8 @@ def setUp(self):
'captures',
'warc_size',
'warc_download_url',
'wacz_size',
'wacz_download_url',
'queue_time',
'capture_time',
]
Expand Down Expand Up @@ -163,12 +164,38 @@ def test_get_list_json(self):
def test_get_detail_json(self):
    """GET the public link detail URL and check it via ``successful_get`` with the logged-out fields."""
    detail_url = self.public_link_detail_url
    expected_fields = self.logged_out_fields
    self.successful_get(detail_url, fields=expected_fields)

# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Two decorator/def pairs are present: presumably the one patching
# `api.views.stream_warc` (with its `stream` assertions) is the removed test and
# the one patching `perma.models.Link.get_warc` (with the header assertions) is
# its replacement. Confirm against the real file.
@patch('api.views.stream_warc', autospec=True)
def test_public_download(self, stream):
stream.return_value = StreamingHttpResponse(StringIO("warc placeholder"))
@patch('perma.models.Link.get_warc', autospec=True)
def test_public_download_warc(self, get_warc):
get_warc.return_value = StringIO("archive placeholder")
resp = self.api_client.get(self.public_link_download_url)
self.assertHttpOK(resp)
self.assertEqual(stream.call_count, 1)
self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.link.pk}.warc.gz"')
self.assertEqual(resp.get('Content-Type', ''), 'application/gzip')
self.assertEqual(get_warc.call_count, 1)

@patch('perma.models.Link.get_wacz', autospec=True)
def test_public_download_wacz(self, get_wacz):
    """Request the public download URL with ``file_format=wacz`` and check the WACZ response headers."""
    get_wacz.return_value = StringIO("archive placeholder")

    wacz_url = f"{self.public_link_download_url}?file_format=wacz"
    response = self.api_client.get(wacz_url)
    self.assertHttpOK(response)

    # The attachment is expected to be named after the link's primary key.
    disposition = response.get('Content-Disposition', '')
    self.assertEqual(disposition, f'attachment; filename="{self.link.pk}.wacz"')
    content_type = response.get('Content-Type', '')
    self.assertEqual(content_type, 'application/wacz')

    # The patched accessor must have been used exactly once.
    self.assertEqual(get_wacz.call_count, 1)

def test_public_download_unsupported_format(self):
    """Request the public download URL with ``file_format=asdf`` and expect a 400 rejection."""
    unsupported_url = f"{self.public_link_download_url}?file_format=asdf"
    self.rejected_get(unsupported_url, expected_status_code=400)

@patch('perma.models.Link.get_warc', autospec=True)
def test_download_nonexistent_warc(self, get_warc):
    """When ``Link.get_warc`` raises ``RuntimeError``, the public download URL is rejected with 404."""
    get_warc.side_effect = RuntimeError

    target_url = self.public_link_download_url
    self.rejected_get(target_url, expected_status_code=404)

    # The patched accessor must have been attempted exactly once.
    self.assertEqual(get_warc.call_count, 1)

@patch('perma.models.Link.get_wacz', autospec=True)
def test_download_nonexistent_wacz(self, get_wacz):
    """When ``Link.get_wacz`` raises ``RuntimeError``, the WACZ download URL is rejected with 404."""
    get_wacz.side_effect = RuntimeError

    target_url = f"{self.public_link_download_url}?file_format=wacz"
    self.rejected_get(target_url, expected_status_code=404)

    # The patched accessor must have been attempted exactly once.
    self.assertEqual(get_wacz.call_count, 1)

def test_private_download_at_public_url(self):
self.rejected_get(self.public_link_download_url_for_private_link, expected_status_code=404)
Expand All @@ -194,15 +221,30 @@ def test_replaced_link_authed_download(self):
self.assertEqual(resp.status_code, 302)
self.assertEqual(resp.url, self.replaced_link_authed_download_redirect_target)

# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Two decorator/def pairs are present: presumably the one patching
# `perma.utils.stream_warc` (with its `stream` assertions) is the removed test and
# the one patching `perma.models.Link.get_warc` (with the header assertions) is
# its replacement. Confirm against the real file.
@patch('perma.utils.stream_warc', autospec=True)
def test_private_download(self, stream):
stream.return_value = StreamingHttpResponse(StringIO("warc placeholder"))
@patch('perma.models.Link.get_warc', autospec=True)
def test_private_download_warc(self, get_warc):
get_warc.return_value = StringIO("archive placeholder")
self.api_client.force_authenticate(user=self.regular_user)
resp = self.api_client.get(
self.logged_in_private_link_download_url,
)
self.assertHttpOK(resp)
self.assertEqual(stream.call_count, 1)
self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.unrelated_private_link.pk}.warc.gz"')
self.assertEqual(resp.get('Content-Type', ''), 'application/gzip')
self.assertEqual(get_warc.call_count, 1)

@patch('perma.models.Link.get_wacz', autospec=True)
def test_private_download_wacz(self, get_wacz):
    """Authenticate as ``regular_user``, request the private WACZ download and check the response headers."""
    get_wacz.return_value = StringIO("archive placeholder")
    self.api_client.force_authenticate(user=self.regular_user)

    wacz_url = f"{self.logged_in_private_link_download_url}?file_format=wacz"
    response = self.api_client.get(wacz_url)
    self.assertHttpOK(response)

    # The attachment is expected to be named after the private link's primary key.
    disposition = response.get('Content-Disposition', '')
    self.assertEqual(disposition, f'attachment; filename="{self.unrelated_private_link.pk}.wacz"')
    content_type = response.get('Content-Type', '')
    self.assertEqual(content_type, 'application/wacz')

    # The patched accessor must have been used exactly once.
    self.assertEqual(get_wacz.call_count, 1)


############
# Updating #
Expand Down
30 changes: 29 additions & 1 deletion perma_web/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from django.urls import resolve, reverse
from django.urls.exceptions import NoReverseMatch
from rest_framework import serializers
from rest_framework.exceptions import PermissionDenied
from rest_framework.exceptions import PermissionDenied, ValidationError
from rest_framework.pagination import LimitOffsetPagination
from rest_framework.response import Response
from rest_framework.reverse import reverse as drf_reverse
Expand Down Expand Up @@ -240,3 +240,31 @@ class SpoofResponse:
'data': response.data
})
return responses


def get_download_file_format(request):
    """
    Read the archive format requested via the ``file_format`` query parameter.

    Falls back to ``'warc'`` when the parameter is absent. Raises a
    ``ValidationError`` keyed on ``file_format`` when the value is neither
    ``'warc'`` nor ``'wacz'``.
    """
    supported_formats = ['warc', 'wacz']
    requested = request.query_params.get('file_format', 'warc')
    if requested in supported_formats:
        return requested
    raise ValidationError({
        "file_format": f"The specified format is not supported. Options: {', '.join(supported_formats)}."
    })


def get_download_url(request, link, file_format='warc', public=True):
    """
    Build the API download URL for ``link`` in the given ``file_format``.

    The ``public_archives_download`` view is used when ``public`` is truthy,
    otherwise ``archives_download``. Returns ``None`` when the link has no
    recorded size that can serve the format. Raises ``NotImplementedError``
    for any format other than ``'warc'`` or ``'wacz'``.
    """
    prefix = 'public_' if public else ''
    view_name = f"{prefix}archives_download"

    if file_format == 'warc':
        # A WARC link is offered when either a WARC or a WACZ size is recorded.
        if not (link.warc_size or link.wacz_size):
            return None
        return reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request)

    if file_format == 'wacz':
        if not link.wacz_size:
            return None
        wacz_base = reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request)
        # WACZ is selected with an explicit query parameter on the same view.
        return f"{wacz_base}?file_format=wacz"

    raise NotImplementedError("Unsupported file format.")


19 changes: 13 additions & 6 deletions perma_web/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
from rest_framework.views import APIView
import surt

from perma.utils import stream_warc, stream_warc_if_permissible
from perma.utils import stream_archive, stream_archive_if_permissible
from perma.celery_tasks import run_next_capture
from perma.models import Folder, CaptureJob, Link, Capture, Organization, LinkBatch

from .utils import TastypiePagination, load_parent, raise_general_validation_error, \
raise_invalid_capture_job, dispatch_multiple_requests, reverse_api_view_relative, \
url_is_invalid_unicode
url_is_invalid_unicode, get_download_file_format
from .serializers import FolderSerializer, CaptureJobSerializer, LinkSerializer, AuthenticatedLinkSerializer, \
LinkUserSerializer, OrganizationSerializer, LinkBatchSerializer, DetailedLinkBatchSerializer
from django.conf import settings
Expand Down Expand Up @@ -353,9 +353,14 @@ def get(self, request, guid, format=None):
link = Link.objects.discoverable().get(pk=guid)
except Link.DoesNotExist:
raise Http404

file_format = get_download_file_format(request)

if link.replacement_link_id:
return HttpResponseRedirect(reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id}))
return stream_warc(link)
base_url = reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id})
return HttpResponseRedirect(f"{base_url}?file_format={file_format}")

return stream_archive(link, file_format=file_format)


# /archives
Expand Down Expand Up @@ -666,9 +671,11 @@ class AuthenticatedLinkDownloadView(BaseView):
# Handles GET for the authenticated download endpoint: looks up the link for the
# requesting user, reads the requested file format, redirects when the link has a
# replacement, and otherwise hands off to the streaming helper.
# NOTE(review): the "Download warc." docstring predates format selection; a
# `file_format` value is now read and passed through, so it should be reworded.
# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved. Presumably the single-line redirect and the
# `stream_warc_if_permissible` return are the removed lines, and the
# `base_url` / `?file_format=` redirect and `stream_archive_if_permissible`
# return are their replacements. Confirm against the real file.
def get(self, request, guid, format=None):
""" Download warc. """
link = self.get_object_for_user_by_pk(request.user, guid)
file_format = get_download_file_format(request)
if link.replacement_link_id:
return HttpResponseRedirect(reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id}))
return stream_warc_if_permissible(link, request.user)
base_url = reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id})
return HttpResponseRedirect(f"{base_url}?file_format={file_format}")
return stream_archive_if_permissible(link, request.user, file_format=file_format)


# /folders/:parent_id/archives/:guid
Expand Down
6 changes: 6 additions & 0 deletions perma_web/perma/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2037,6 +2037,12 @@ def get_warc(self, extract_from_wacz_if_present=True, force_from_wacz=False):
else:
raise RuntimeError(f'No archive present for {self.guid}')

@contextmanager
def get_wacz(self):
    """
    Context manager yielding this link's WACZ opened for binary reading.

    Raises ``RuntimeError`` on entry when ``wacz_size`` is falsy.
    """
    if self.wacz_size:
        wacz_storage = storages[settings.WACZ_STORAGE]
        # NOTE(review): the opened handle is yielded as-is and is not closed
        # here on exit; confirm the consumer is responsible for closing it.
        yield wacz_storage.open(self.wacz_storage_file(), 'rb')
    else:
        raise RuntimeError(f'No WACZ present for {self.guid}')

def accessible_to(self, user):
    """Return the result of ``user.can_edit`` applied to this object."""
    can_edit = user.can_edit
    return can_edit(self)

Expand Down
5 changes: 4 additions & 1 deletion perma_web/perma/templates/archive/single-link.html
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,10 @@
<a href="?embed=False" class="btn btn-ui-small btn-dashboard">Show playback controls</a>
{% endif %}
{% if link.can_play_back %}
<a href="{% url 'single_permalink' guid=link.guid %}?type=warc_download" role="button" class="btn btn-ui-small btn-dashboard" title="download">Download Archive</a>
<a href="{% url 'single_permalink' guid=link.guid %}?type=warc_download" role="button" class="btn btn-ui-small btn-dashboard" title="download">Download WARC</a>
{% if link.wacz_size %}
<a href="{% url 'single_permalink' guid=link.guid %}?type=wacz_download" role="button" class="btn btn-ui-small btn-dashboard" title="download">Download WACZ</a>
{% endif %}
{% endif %}
{% if not can_edit %}
<a href="{% url 'report' %}?guid={{link.guid}}" role="button" class="btn btn-ui-small btn-dashboard flag" title="Flag as inappropriate">Flag<span class="_verbose"> as inappropriate</span></a>
Expand Down
Loading

0 comments on commit c02c37a

Please sign in to comment.