From 9902f1946b1e921c60d2924de1f002fa5373bafa Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 12:02:50 -0500 Subject: [PATCH 01/14] First draft: add utilities for download WACZ. --- perma_web/perma/models.py | 6 ++++++ perma_web/perma/utils.py | 23 +++++++++++++++++++++++ perma_web/perma/views/playback.py | 9 +++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/perma_web/perma/models.py b/perma_web/perma/models.py index 97c0f3629..aa1e91108 100755 --- a/perma_web/perma/models.py +++ b/perma_web/perma/models.py @@ -2037,6 +2037,12 @@ def get_warc(self, extract_from_wacz_if_present=True, force_from_wacz=False): else: raise RuntimeError(f'No archive present for {self.guid}') + @contextmanager + def get_wacz(self): + if not self.wacz_size: + raise RuntimeError(f'No WARC present for {self.guid}') + yield storages[settings.WACZ_STORAGE].open(self.wacz_storage_file(), 'rb') + def accessible_to(self, user): return user.can_edit(self) diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py index 321a52fa7..0c7b6f64c 100644 --- a/perma_web/perma/utils.py +++ b/perma_web/perma/utils.py @@ -623,12 +623,35 @@ def stream_warc(link, stream=True): raise Http404 return get_warc_stream(link, stream) + def stream_warc_if_permissible(link, user, stream=True): if user.can_view(link): return stream_warc(link, stream) return HttpResponseForbidden('Private archive.') +def stream_wacz(link, stream=True): + # `link.user_deleted` is checked here for dev convenience: + # it's easy to forget that deleted links/waczs aren't truly deleted, + # and easy to accidentally permit the downloading of "deleted" waczs. + # Users of stream_wacz shouldn't have to worry about / remember this. 
+ if link.user_deleted or not link.can_play_back(): + raise Http404 + with link.get_wacz() as wacz_file: + if stream: + response = StreamingHttpResponse(wacz_file, content_type="application/wacz") + else: + response = HttpResponse(wacz_file, content_type="application/wacz") + response['Content-Disposition'] = f'attachment; filename="{link.guid}.wacz"' + return response + + +def stream_wacz_if_permissible(link, user, stream=True): + if user.can_view(link): + return stream_wacz(link, stream) + return HttpResponseForbidden('Private archive.') + + def calculate_s3_etag(fp, chunk_size, multipart_format=False): """ Adapted from https://stackoverflow.com/a/43819225 diff --git a/perma_web/perma/views/playback.py b/perma_web/perma/views/playback.py index 6c58e3aca..768e1c61c 100644 --- a/perma_web/perma/views/playback.py +++ b/perma_web/perma/views/playback.py @@ -14,12 +14,13 @@ from perma.models import Link from perma.utils import (if_anonymous, ratelimit_ip_key, memento_url, timemap_url, timegate_url, - protocol, remove_control_characters, stream_warc_if_permissible) + protocol, remove_control_characters, stream_warc_if_permissible, + stream_wacz_if_permissible) import logging logger = logging.getLogger(__name__) -valid_serve_types = ['image', 'warc_download', 'standard'] +valid_serve_types = ['image', 'warc_download', 'wacz_download', 'standard'] @if_anonymous(cache_control(max_age=settings.CACHE_MAX_AGES['single_permalink'])) @@ -61,6 +62,10 @@ def single_permalink(request, guid): if serve_type == 'warc_download': return stream_warc_if_permissible(link, request.user) + # serve raw WACZ + if serve_type == 'wacz_download': + return stream_wacz_if_permissible(link, request.user) + # handle requested capture type if serve_type == 'image': capture = link.screenshot_capture From 83c9cb6272643117f96969b2c6ee5fa232b7d7ba Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 12:40:38 -0500 Subject: [PATCH 02/14] Dry it up. 
--- perma_web/api/tests/test_link_resource.py | 4 +- perma_web/api/views.py | 6 +-- perma_web/perma/utils.py | 46 ++++++++++------------- perma_web/perma/views/playback.py | 7 ++-- 4 files changed, 28 insertions(+), 35 deletions(-) diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index 12d39a156..7c5cb4ef9 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -163,7 +163,7 @@ def test_get_list_json(self): def test_get_detail_json(self): self.successful_get(self.public_link_detail_url, fields=self.logged_out_fields) - @patch('api.views.stream_warc', autospec=True) + @patch('api.views.stream_archive', autospec=True) def test_public_download(self, stream): stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) resp = self.api_client.get(self.public_link_download_url) @@ -194,7 +194,7 @@ def test_replaced_link_authed_download(self): self.assertEqual(resp.status_code, 302) self.assertEqual(resp.url, self.replaced_link_authed_download_redirect_target) - @patch('perma.utils.stream_warc', autospec=True) + @patch('perma.utils.stream_archive', autospec=True) def test_private_download(self, stream): stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) self.api_client.force_authenticate(user=self.regular_user) diff --git a/perma_web/api/views.py b/perma_web/api/views.py index e7b1d87ba..4a9b981a4 100644 --- a/perma_web/api/views.py +++ b/perma_web/api/views.py @@ -14,7 +14,7 @@ from rest_framework.views import APIView import surt -from perma.utils import stream_warc, stream_warc_if_permissible +from perma.utils import stream_archive, stream_archive_if_permissible from perma.celery_tasks import run_next_capture from perma.models import Folder, CaptureJob, Link, Capture, Organization, LinkBatch @@ -355,7 +355,7 @@ def get(self, request, guid, format=None): raise Http404 if link.replacement_link_id: return 
HttpResponseRedirect(reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_warc(link) + return stream_archive(link, file_format='warc') # /archives @@ -668,7 +668,7 @@ def get(self, request, guid, format=None): link = self.get_object_for_user_by_pk(request.user, guid) if link.replacement_link_id: return HttpResponseRedirect(reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_warc_if_permissible(link, request.user) + return stream_archive_if_permissible(link, request.user, file_format='warc') # /folders/:parent_id/archives/:guid diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py index 0c7b6f64c..9b25c14ba 100644 --- a/perma_web/perma/utils.py +++ b/perma_web/perma/utils.py @@ -613,30 +613,7 @@ def get_warc_stream(link, stream=True): response['Content-Disposition'] = f'attachment; filename="{filename}"' return response - -def stream_warc(link, stream=True): - # `link.user_deleted` is checked here for dev convenience: - # it's easy to forget that deleted links/warcs aren't truly deleted, - # and easy to accidentally permit the downloading of "deleted" warcs. - # Users of stream_warc shouldn't have to worry about / remember this. - if link.user_deleted or not link.can_play_back(): - raise Http404 - return get_warc_stream(link, stream) - - -def stream_warc_if_permissible(link, user, stream=True): - if user.can_view(link): - return stream_warc(link, stream) - return HttpResponseForbidden('Private archive.') - - -def stream_wacz(link, stream=True): - # `link.user_deleted` is checked here for dev convenience: - # it's easy to forget that deleted links/waczs aren't truly deleted, - # and easy to accidentally permit the downloading of "deleted" waczs. - # Users of stream_wacz shouldn't have to worry about / remember this. 
- if link.user_deleted or not link.can_play_back(): - raise Http404 +def get_wacz_stream(link, stream=True): with link.get_wacz() as wacz_file: if stream: response = StreamingHttpResponse(wacz_file, content_type="application/wacz") @@ -646,9 +623,26 @@ def stream_wacz(link, stream=True): return response -def stream_wacz_if_permissible(link, user, stream=True): +def stream_archive(link, stream=True, file_format='warc'): + # `link.user_deleted` is checked here for dev convenience: + # it's easy to forget that deleted Perma Links' files aren't truly deleted, + # and easy to accidentally permit the downloading of "deleted" archive files. + # Users of stream_archive shouldn't have to worry about / remember this. + if link.user_deleted or not link.can_play_back(): + raise Http404 + + match file_format: + case 'warc': + return get_warc_stream(link, stream) + case 'wacz': + return get_wacz_stream(link, stream) + case _: + raise NotImplementedError("Unsupported file format.") + + +def stream_archive_if_permissible(link, user, stream=True, file_format='warc'): if user.can_view(link): - return stream_wacz(link, stream) + return stream_archive(link, stream, file_format) return HttpResponseForbidden('Private archive.') diff --git a/perma_web/perma/views/playback.py b/perma_web/perma/views/playback.py index 768e1c61c..75c41ccca 100644 --- a/perma_web/perma/views/playback.py +++ b/perma_web/perma/views/playback.py @@ -14,8 +14,7 @@ from perma.models import Link from perma.utils import (if_anonymous, ratelimit_ip_key, memento_url, timemap_url, timegate_url, - protocol, remove_control_characters, stream_warc_if_permissible, - stream_wacz_if_permissible) + protocol, remove_control_characters, stream_archive_if_permissible) import logging @@ -60,11 +59,11 @@ def single_permalink(request, guid): # serve raw WARC if serve_type == 'warc_download': - return stream_warc_if_permissible(link, request.user) + return stream_archive_if_permissible(link, request.user, file_format='warc') # 
serve raw WACZ if serve_type == 'wacz_download': - return stream_wacz_if_permissible(link, request.user) + return stream_archive_if_permissible(link, request.user, file_format='wacz') # handle requested capture type if serve_type == 'image': From 72e71cb731e2d5554cc7a98de85a2eedec48fb51 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 12:51:40 -0500 Subject: [PATCH 03/14] Add WACZ download button to tray; re-label WARC download button. --- perma_web/perma/templates/archive/single-link.html | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/perma_web/perma/templates/archive/single-link.html b/perma_web/perma/templates/archive/single-link.html index 6b9765c79..a490ac0a1 100755 --- a/perma_web/perma/templates/archive/single-link.html +++ b/perma_web/perma/templates/archive/single-link.html @@ -225,7 +225,10 @@ Show playback controls {% endif %} {% if link.can_play_back %} - Download Archive + Download WARC + {% if link.wacz_size %} + Download WACZ + {% endif %} {% endif %} {% if not can_edit %} Flag as inappropriate From 2c884e847474704d8b1c016ec8fc5f72fefb9801 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 14:07:26 -0500 Subject: [PATCH 04/14] Support WARC and WACZ formats in the API download route (default WARC). 
--- perma_web/api/tests/test_link_resource.py | 4 ++-- perma_web/api/utils.py | 10 ++++++++++ perma_web/api/views.py | 17 ++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index 7c5cb4ef9..fe50da6f4 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -67,9 +67,9 @@ def setUp(self): self.public_link_download_url_for_private_link = reverse('api:public_archives_download', args=[self.unrelated_private_link.pk]) self.replaced_link_public_download_url = reverse('api:public_archives_download', args=['ABCD-0006']) - self.replaced_link_public_download_redirect_target = reverse('api:public_archives_download', args=['3SLN-JHX9']) + self.replaced_link_public_download_redirect_target = f"{reverse('api:public_archives_download', args=['3SLN-JHX9'])}?file_format=warc" self.replaced_link_authed_download_url = reverse('api:archives_download', args=['ABCD-0006']) - self.replaced_link_authed_download_redirect_target = reverse('api:archives_download', args=['3SLN-JHX9']) + self.replaced_link_authed_download_redirect_target = f"{reverse('api:archives_download', args=['3SLN-JHX9'])}?file_format=warc" self.replaced_link_owner = LinkUser.objects.get(id=4) self.logged_out_fields = [ diff --git a/perma_web/api/utils.py b/perma_web/api/utils.py index b3dd86528..0b66d8d23 100644 --- a/perma_web/api/utils.py +++ b/perma_web/api/utils.py @@ -240,3 +240,13 @@ class SpoofResponse: 'data': response.data }) return responses + + +def get_download_file_format(request): + file_format = request.query_params.get('file_format', 'warc') + supported_formats = ['warc', 'wacz'] + if file_format not in supported_formats: + raise ValidationError({ + "file_format": f"The specified format is not supported. Options: {', '.join(supported_formats)}." 
+ }) + return file_format diff --git a/perma_web/api/views.py b/perma_web/api/views.py index 4a9b981a4..38c602bf1 100644 --- a/perma_web/api/views.py +++ b/perma_web/api/views.py @@ -20,7 +20,7 @@ from .utils import TastypiePagination, load_parent, raise_general_validation_error, \ raise_invalid_capture_job, dispatch_multiple_requests, reverse_api_view_relative, \ - url_is_invalid_unicode + url_is_invalid_unicode, get_download_file_format from .serializers import FolderSerializer, CaptureJobSerializer, LinkSerializer, AuthenticatedLinkSerializer, \ LinkUserSerializer, OrganizationSerializer, LinkBatchSerializer, DetailedLinkBatchSerializer from django.conf import settings @@ -353,9 +353,14 @@ def get(self, request, guid, format=None): link = Link.objects.discoverable().get(pk=guid) except Link.DoesNotExist: raise Http404 + + file_format = get_download_file_format(request) + if link.replacement_link_id: - return HttpResponseRedirect(reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_archive(link, file_format='warc') + base_url = reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id}) + return HttpResponseRedirect(f"{base_url}?file_format={file_format}") + + return stream_archive(link, file_format=file_format) # /archives @@ -666,9 +671,11 @@ class AuthenticatedLinkDownloadView(BaseView): def get(self, request, guid, format=None): """ Download warc. 
""" link = self.get_object_for_user_by_pk(request.user, guid) + file_format = get_download_file_format(request) if link.replacement_link_id: - return HttpResponseRedirect(reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_archive_if_permissible(link, request.user, file_format='warc') + base_url = reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id}) + return HttpResponseRedirect(f"{base_url}?file_format={file_format}") + return stream_archive_if_permissible(link, request.user, file_format=file_format) # /folders/:parent_id/archives/:guid From 45a4eac71148b33f5c4dfbcbd225878787dd4dd3 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 14:38:59 -0500 Subject: [PATCH 05/14] Add WACZ-related fields to the API. --- perma_web/api/serializers.py | 32 +++++++++++++++++------ perma_web/api/tests/test_link_resource.py | 2 ++ perma_web/api/utils.py | 18 +++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/perma_web/api/serializers.py b/perma_web/api/serializers.py index 8510de4a2..9e9235739 100644 --- a/perma_web/api/serializers.py +++ b/perma_web/api/serializers.py @@ -9,7 +9,7 @@ from perma.models import LinkUser, Folder, CaptureJob, Capture, Link, Organization, LinkBatch from perma.utils import send_to_scoop -from .utils import get_mime_type, mime_type_lookup, reverse_api_view +from .utils import get_mime_type, mime_type_lookup, reverse_api_view, get_download_url import logging logger = logging.getLogger(__name__) @@ -183,10 +183,24 @@ class LinkSerializer(BaseSerializer): queue_time = serializers.SerializerMethodField() capture_time = serializers.SerializerMethodField() warc_download_url = serializers.SerializerMethodField() + wacz_download_url = serializers.SerializerMethodField() class Meta: model = Link - fields = ('guid', 'creation_timestamp', 'url', 'title', 'description', 'warc_size', 'warc_download_url', 'captures', 'queue_time', 
'capture_time') + fields = ( + 'guid', + 'creation_timestamp', + 'url', + 'title', + 'description', + 'warc_size', + 'warc_download_url', + 'wacz_size', + 'wacz_download_url', + 'captures', + 'queue_time', + 'capture_time' + ) def get_queue_time(self, link): try: @@ -203,9 +217,10 @@ def get_capture_time(self, link): return None def get_warc_download_url(self, link): - if link.warc_size: - return reverse_api_view('public_archives_download', kwargs={'guid': link.guid}, request=self.context['request']) - return None + return get_download_url(self.context['request'], link, file_format='warc', public=True) + + def get_wacz_download_url(self, link): + return get_download_url(self.context['request'], link, file_format='wacz', public=True) class AuthenticatedLinkSerializer(LinkSerializer): @@ -220,9 +235,10 @@ class Meta(LinkSerializer.Meta): allowed_update_fields = ['submitted_title', 'submitted_description', 'notes', 'is_private', 'private_reason', 'default_to_screenshot_view'] def get_warc_download_url(self, link): - if link.warc_size: - return reverse_api_view('archives_download', kwargs={'guid': link.guid}, request=self.context['request']) - return None + return get_download_url(self.context['request'], link, file_format='warc', public=False) + + def get_wacz_download_url(self, link): + return get_download_url(self.context['request'], link, file_format='wacz', public=False) def validate_url(self, url): # Clean up the user submitted url diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index fe50da6f4..7b4fbd6e8 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -81,6 +81,8 @@ def setUp(self): 'captures', 'warc_size', 'warc_download_url', + 'wacz_size', + 'wacz_download_url', 'queue_time', 'capture_time', ] diff --git a/perma_web/api/utils.py b/perma_web/api/utils.py index 0b66d8d23..da6bb5f20 100644 --- a/perma_web/api/utils.py +++ b/perma_web/api/utils.py @@ 
-250,3 +250,21 @@ def get_download_file_format(request): "file_format": f"The specified format is not supported. Options: {', '.join(supported_formats)}." }) return file_format + + +def get_download_url(request, link, file_format='warc', public=True): + view_name = f"{'public_' if public else ''}archives_download" + match file_format: + case 'warc': + if link.warc_size or link.wacz_size: + return reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request) + return None + case 'wacz': + if link.wacz_size: + base_url = reverse_api_view('public_archives_download', kwargs={'guid': link.guid}, request=request) + return f"{base_url}?file_format=wacz" + return None + case _: + raise NotImplementedError("Unsupported file format.") + + From f8d2445606d04394d30e5a4ec4ef069ad7932552 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 15:33:56 -0500 Subject: [PATCH 06/14] Add missing import. --- perma_web/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perma_web/api/utils.py b/perma_web/api/utils.py index da6bb5f20..1b0d8cc2d 100644 --- a/perma_web/api/utils.py +++ b/perma_web/api/utils.py @@ -9,7 +9,7 @@ from django.urls import resolve, reverse from django.urls.exceptions import NoReverseMatch from rest_framework import serializers -from rest_framework.exceptions import PermissionDenied +from rest_framework.exceptions import PermissionDenied, ValidationError from rest_framework.pagination import LimitOffsetPagination from rest_framework.response import Response from rest_framework.reverse import reverse as drf_reverse From 2105286bf83d013b0cf209225c3784a62ef4b810 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 16:02:54 -0500 Subject: [PATCH 07/14] Add tests for WACZ download; make WARC download tests more thorough. 
--- perma_web/api/tests/test_link_resource.py | 45 +++++++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index 7b4fbd6e8..c9b9943f9 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -165,12 +165,26 @@ def test_get_list_json(self): def test_get_detail_json(self): self.successful_get(self.public_link_detail_url, fields=self.logged_out_fields) - @patch('api.views.stream_archive', autospec=True) - def test_public_download(self, stream): - stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) + @patch('perma.models.Link.get_warc', autospec=True) + def test_public_download_warc(self, get_warc): + get_warc.return_value = StringIO("archive placeholder") resp = self.api_client.get(self.public_link_download_url) self.assertHttpOK(resp) - self.assertEqual(stream.call_count, 1) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.link.pk}.warc.gz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/gzip') + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_public_download_wacz(self, get_wacz): + get_wacz.return_value = StringIO("archive placeholder") + resp = self.api_client.get(f"{self.public_link_download_url}?file_format=wacz") + self.assertHttpOK(resp) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.link.pk}.wacz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/wacz') + self.assertEqual(get_wacz.call_count, 1) + + def test_public_download_unsupported_format(self): + self.rejected_get(f"{self.public_link_download_url}?file_format=asdf", expected_status_code=400) def test_private_download_at_public_url(self): self.rejected_get(self.public_link_download_url_for_private_link, expected_status_code=404) @@ -196,15 +210,30 @@ def 
test_replaced_link_authed_download(self): self.assertEqual(resp.status_code, 302) self.assertEqual(resp.url, self.replaced_link_authed_download_redirect_target) - @patch('perma.utils.stream_archive', autospec=True) - def test_private_download(self, stream): - stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) + @patch('perma.models.Link.get_warc', autospec=True) + def test_private_download_warc(self, get_warc): + get_warc.return_value = StringIO("archive placeholder") self.api_client.force_authenticate(user=self.regular_user) resp = self.api_client.get( self.logged_in_private_link_download_url, ) self.assertHttpOK(resp) - self.assertEqual(stream.call_count, 1) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.unrelated_private_link.pk}.warc.gz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/gzip') + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_private_download_wacz(self, get_wacz): + get_wacz.return_value = StringIO("archive placeholder") + self.api_client.force_authenticate(user=self.regular_user) + resp = self.api_client.get( + f"{self.logged_in_private_link_download_url}?file_format=wacz", + ) + self.assertHttpOK(resp) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.unrelated_private_link.pk}.wacz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/wacz') + self.assertEqual(get_wacz.call_count, 1) + ############ # Updating # From 09f4a45b6dd10f03bce645f4fbb5815ddd331d25 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 16:21:38 -0500 Subject: [PATCH 08/14] FileWrapper prevents error about I/O on a closed file descriptor during tests. 
--- perma_web/perma/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py index 9b25c14ba..a52754dce 100644 --- a/perma_web/perma/utils.py +++ b/perma_web/perma/utils.py @@ -615,10 +615,11 @@ def get_warc_stream(link, stream=True): def get_wacz_stream(link, stream=True): with link.get_wacz() as wacz_file: + wacz_stream = FileWrapper(wacz_file) if stream: - response = StreamingHttpResponse(wacz_file, content_type="application/wacz") + response = StreamingHttpResponse(wacz_stream, content_type="application/wacz") else: - response = HttpResponse(wacz_file, content_type="application/wacz") + response = HttpResponse(wacz_stream, content_type="application/wacz") response['Content-Disposition'] = f'attachment; filename="{link.guid}.wacz"' return response From 02b299cc54abce1f40738597ce3e1b32499c3734 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 16:21:45 -0500 Subject: [PATCH 09/14] Lint. --- perma_web/api/serializers.py | 2 +- perma_web/api/tests/test_link_resource.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/perma_web/api/serializers.py b/perma_web/api/serializers.py index 9e9235739..767919f3d 100644 --- a/perma_web/api/serializers.py +++ b/perma_web/api/serializers.py @@ -9,7 +9,7 @@ from perma.models import LinkUser, Folder, CaptureJob, Capture, Link, Organization, LinkBatch from perma.utils import send_to_scoop -from .utils import get_mime_type, mime_type_lookup, reverse_api_view, get_download_url +from .utils import get_mime_type, mime_type_lookup, get_download_url import logging logger = logging.getLogger(__name__) diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index c9b9943f9..c08eb04be 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -8,7 +8,6 @@ from django.conf import settings from django.urls import reverse -from django.http 
import StreamingHttpResponse from django.test.utils import override_settings from mock import patch From 14af4bedc948de72acfe608a601b44ba27bf4c63 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 16:53:17 -0500 Subject: [PATCH 10/14] Update API docs. --- .../perma/templates/docs/developer/index.html | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/perma_web/perma/templates/docs/developer/index.html b/perma_web/perma/templates/docs/developer/index.html index 42a7537a2..a25585a78 100644 --- a/perma_web/perma/templates/docs/developer/index.html +++ b/perma_web/perma/templates/docs/developer/index.html @@ -78,20 +78,29 @@

Get all public archives

Use HTTP GET to retrieve every public archive in Perma.cc. (In order to keep this example short, we limit the number of returned items to one.)

curl {{base_url}}/public/archives/?limit=1

Response:

-
{"meta":{"limit":1,"next":"{{ base_url }}/public/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"W6PY-UZ99","creation_timestamp":"2018-05-15T18:13:52Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":20932,"warc_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W6PY-UZ99/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":4}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/public/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"W6PY-UZ99","creation_timestamp":"2018-05-15T18:13:52Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":20932,"warc_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download","wacz_size":21844,"wacz_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W6PY-UZ99/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":4}]}

Get a single public archive's details

If we have the globally unique ID (GUID) of one public archive, we can GET details about it.

curl {{ base_url }}/public/archives/W5MF-N9EV/

Response:

-
{"guid":"W5MF-N9EV","creation_timestamp":"2018-05-14T15:12:33Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":19156,"warc_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W5MF-N9EV/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":3,"capture_time":4}
+
{"guid":"W5MF-N9EV","creation_timestamp":"2018-05-14T15:12:33Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":19156,"warc_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download","wacz_size":19962,"wacz_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W5MF-N9EV/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":3,"capture_time":4}

Download a single public archive

-

Perma archives are downloadable and can be viewed using tools that can replay WARC files, like ReplayWeb.page.

+

Perma archives are downloadable and can be viewed using tools that can replay WARC or WACZ files, like ReplayWeb.page.

+ +

WARC

+
wget {{ base_url }}/public/archives/Y6JJ-TDUJ/download

or

curl -o your_favorite_filename.warc.gz {{ base_url }}/public/archives/Y6JJ-TDUJ/download
+

WACZ

+ +
wget {{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz
+

or

+
curl -o your_favorite_filename.wacz {{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz
+ @@ -184,7 +193,7 @@

Create an archive

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X POST -d '{"url": "http://example.com", "title": "This is an example site", "folder": 1}' {{ base_url }}/archives/
 curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X POST -d '{"url": "http://example.com", "title": "This is another example site", "folder": 27}' {{ base_url }}/archives/

The response includes detailed information about the newly created archive, including the globally unique ID (GUID) of the archive:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19.516152Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":null,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19.516152Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19.516152Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":null,"warc_download_url":null,"wacz_size":null,"wacz_download_url":null,"captures":[{"role":"primary","status":"pending","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19.516152Z","organization":null}

Note that finalized details about an archive may not be available immediately: fields will update until the archiving process is complete. Use the Capture Job API and additional Archives API endpoints to get up-to-date details about a GUID.

To create many archives at once, use the Archives API endpoints for Batches

@@ -192,34 +201,43 @@

View the details of one archive

Use GET to retrieve details about an archive owned by you or by one of your organizations. More details are available via this authenticated Archives endpoint than are available from the non-authenticated Public Archives endpoint.

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Download a single archive

-

Perma archives are downloadable and can be viewed using tools that can replay WARC files, like ReplayWeb.page.

+

Perma archives are downloadable and can be viewed using tools that can replay WARC or WACZ files, like ReplayWeb.page.

+ +

WARC

+
wget --header "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/Y6JJ-TDUJ/download

or

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.warc.gz {{ base_url }}/archives/Y6JJ-TDUJ/download
+

WACZ

+ +
wget --header "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz
+

or

+
curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.warc.gz {{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz
+

Make an archive private

Use PATCH to make an archive private.

Include the GUID of the archive in the URL, and set the archive's "is_private" field to true using JSON-encoded data:

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X PATCH -d '{"is_private": true}' {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Edit the title and notes fields of an archive

Use PATCH to change an archive's notes or title field.

Include the GUID of the archive in the URL, and specify your desired changes using JSON-encoded data:

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X PATCH -d '{"title": "My updated title"}' {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Move an archive

Use PUT to move an archive into a different folder.

Include the ID of the destination folder as the first variable in the URL and the GUID of the archive as the second. The below example moves 85LS-BXV7 into folder 31.

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -X PUT {{ base_url }}/folders/31/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}

Delete an archive

Use DELETE to delete an archive.

@@ -232,7 +250,7 @@

View all archives

This includes both public and private archives and includes all folders. (We're limiting ourselves to just 1 result to keep this example short.)

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/?limit=1

Response:

-
{"meta":{"limit":1,"next":"{{ base_url }}/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"F9BV-XLHU","creation_timestamp":"2018-05-05T23:03:41Z","url":"http://example.com","title":"example.com","description":"","warc_size":null,"warc_download_url":"{{ base_url }}/archives/F9BV-XLHU/download","captures":[{"role":"primary","status":"success","url":"file:///F9BV-XLHU/upload.png","record_type":"resource","content_type":"image/png","user_upload":true}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":1,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-06T23:03:41Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"F9BV-XLHU","creation_timestamp":"2018-05-05T23:03:41Z","url":"http://example.com","title":"example.com","description":"","warc_size":null,"warc_download_url":null,"wacz_size":null,"wacz_download_url":null,"captures":[{"role":"primary","status":"pending","url":"file:///F9BV-XLHU/upload.png","record_type":"resource","content_type":"image/png","user_upload":true}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":1,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-06T23:03:41Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}]}

To restrict the results to a single folder, use the Folders API endpoint.

Work with batches of archives

@@ -283,7 +301,7 @@

View a folder's subfolders

View a folder's archives

Use GET to list the archives stored in a folder. (We're limiting ourselves to just 1 result to keep this example short.)

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/folders/25/archives?limit=1
-
{"meta":{"limit":1,"next":"{{ base_url }}/folders/24/archives?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"X5BR-VEZT","creation_timestamp":"2018-01-31T14:46:49Z","url":"http://perma.cc","title":"Perma.cc","description":" Broken links are everywhere. Perma helps authors and journals create permanent links for citations in their published work.","warc_size":2700433,"warc_download_url":"{{base_url }}/v1/archives/X5BR-VEZT/download","captures":[{"role":"primary","status":"success","url":"http://perma.cc","record_type":"response","content_type":"text/html; charset=utf-8","user_upload":false},{"role":"screenshot","status":"success","url":"file:///X5BR-VEZT/cap.png","record_type":"resource","content_type":"image/png","user_upload":false},{"role":"favicon","status":"success","url":"https://perma.cc/static/img/favicon.ico","record_type":"response","content_type":"image/x-icon","user_upload":false}],"queue_time":0,"capture_time":10,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-02-01T14:46:49Z","organization":null}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/folders/25/archives?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"X5BR-VEZT","creation_timestamp":"2018-01-31T14:46:49Z","url":"http://perma.cc","title":"Perma.cc","description":" Broken links are everywhere. Perma helps authors and journals create permanent links for citations in their published work.","warc_size":2700433,"warc_download_url":"{{ base_url }}/archives/X5BR-VEZT/download","wacz_size":2701999,"wacz_download_url":"{{ base_url }}/archives/X5BR-VEZT/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://perma.cc","record_type":"response","content_type":"text/html; charset=utf-8","user_upload":false},{"role":"screenshot","status":"success","url":"file:///X5BR-VEZT/cap.png","record_type":"resource","content_type":"image/png","user_upload":false},{"role":"favicon","status":"success","url":"https://perma.cc/static/img/favicon.ico","record_type":"response","content_type":"image/x-icon","user_upload":false}],"queue_time":0,"capture_time":10,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-02-01T14:46:49Z","organization":null}]}

Note that archives stored in the folder's subfolders are not included.

Rename a folder

From 5fa621f99d4d20f734e2903968babae78a12b6f4 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 17:25:09 -0500 Subject: [PATCH 11/14] Replace mistakenly hard-coded view name. --- perma_web/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perma_web/api/utils.py b/perma_web/api/utils.py index 1b0d8cc2d..aae3d2f6c 100644 --- a/perma_web/api/utils.py +++ b/perma_web/api/utils.py @@ -261,7 +261,7 @@ def get_download_url(request, link, file_format='warc', public=True): return None case 'wacz': if link.wacz_size: - base_url = reverse_api_view('public_archives_download', kwargs={'guid': link.guid}, request=request) + base_url = reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request) return f"{base_url}?file_format=wacz" return None case _: From a661b618ec87d0d8615cfa06b085b373e9478e46 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 17:26:33 -0500 Subject: [PATCH 12/14] Tyop --- perma_web/perma/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perma_web/perma/models.py b/perma_web/perma/models.py index aa1e91108..9d7c1eb41 100755 --- a/perma_web/perma/models.py +++ b/perma_web/perma/models.py @@ -2040,7 +2040,7 @@ def get_warc(self, extract_from_wacz_if_present=True, force_from_wacz=False): @contextmanager def get_wacz(self): if not self.wacz_size: - raise RuntimeError(f'No WARC present for {self.guid}') + raise RuntimeError(f'No WACZ present for {self.guid}') yield storages[settings.WACZ_STORAGE].open(self.wacz_storage_file(), 'rb') def accessible_to(self, user): From 4c5137d4c389b3a41c4ac20e1a6edb247a0322db Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 17:29:07 -0500 Subject: [PATCH 13/14] Use correct file extension in curl examples. 
--- perma_web/perma/templates/docs/developer/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perma_web/perma/templates/docs/developer/index.html b/perma_web/perma/templates/docs/developer/index.html index a25585a78..0cb30712e 100644 --- a/perma_web/perma/templates/docs/developer/index.html +++ b/perma_web/perma/templates/docs/developer/index.html @@ -99,7 +99,7 @@

WACZ

wget {{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz

or

-
curl -o your_favorite_filename.warc.gz {{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz
+
curl -o your_favorite_filename.wacz {{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz
@@ -216,7 +216,7 @@

WACZ

wget --header "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz

or

-
curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.warc.gz {{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz
+
curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.wacz {{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz

Make an archive private

Use PATCH to make an archive private.

From 432782b33e412bad6e7722a65e3fb1db34c6b474 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 3 Dec 2024 17:56:25 -0500 Subject: [PATCH 14/14] Add handling for requests to download unavailable formats. --- perma_web/api/tests/test_link_resource.py | 12 ++++++++++++ perma_web/perma/utils.py | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index c08eb04be..e08313ec0 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -185,6 +185,18 @@ def test_public_download_wacz(self, get_wacz): def test_public_download_unsupported_format(self): self.rejected_get(f"{self.public_link_download_url}?file_format=asdf", expected_status_code=400) + @patch('perma.models.Link.get_warc', autospec=True) + def test_download_nonexistent_warc(self, get_warc): + get_warc.side_effect = RuntimeError + self.rejected_get(self.public_link_download_url, expected_status_code=404) + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_download_nonexistent_wacz(self, get_wacz): + get_wacz.side_effect = RuntimeError + self.rejected_get(f"{self.public_link_download_url}?file_format=wacz", expected_status_code=404) + self.assertEqual(get_wacz.call_count, 1) + def test_private_download_at_public_url(self): self.rejected_get(self.public_link_download_url_for_private_link, expected_status_code=404) diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py index a52754dce..ba31a4255 100644 --- a/perma_web/perma/utils.py +++ b/perma_web/perma/utils.py @@ -632,13 +632,18 @@ def stream_archive(link, stream=True, file_format='warc'): if link.user_deleted or not link.can_play_back(): raise Http404 - match file_format: - case 'warc': - return get_warc_stream(link, stream) - case 'wacz': - return get_wacz_stream(link, stream) - case _: - raise 
NotImplementedError("Unsupported file format.") + try: + match file_format: + case 'warc': + return get_warc_stream(link, stream) + case 'wacz': + return get_wacz_stream(link, stream) + case _: + raise NotImplementedError("Unsupported file format.") + except RuntimeError: + # If the requested format is not available, return 404 + # just like with deleted and failed Perma Links + raise Http404 def stream_archive_if_permissible(link, user, stream=True, file_format='warc'):