diff --git a/perma_web/api/serializers.py b/perma_web/api/serializers.py index 8510de4a2..767919f3d 100644 --- a/perma_web/api/serializers.py +++ b/perma_web/api/serializers.py @@ -9,7 +9,7 @@ from perma.models import LinkUser, Folder, CaptureJob, Capture, Link, Organization, LinkBatch from perma.utils import send_to_scoop -from .utils import get_mime_type, mime_type_lookup, reverse_api_view +from .utils import get_mime_type, mime_type_lookup, get_download_url import logging logger = logging.getLogger(__name__) @@ -183,10 +183,24 @@ class LinkSerializer(BaseSerializer): queue_time = serializers.SerializerMethodField() capture_time = serializers.SerializerMethodField() warc_download_url = serializers.SerializerMethodField() + wacz_download_url = serializers.SerializerMethodField() class Meta: model = Link - fields = ('guid', 'creation_timestamp', 'url', 'title', 'description', 'warc_size', 'warc_download_url', 'captures', 'queue_time', 'capture_time') + fields = ( + 'guid', + 'creation_timestamp', + 'url', + 'title', + 'description', + 'warc_size', + 'warc_download_url', + 'wacz_size', + 'wacz_download_url', + 'captures', + 'queue_time', + 'capture_time' + ) def get_queue_time(self, link): try: @@ -203,9 +217,10 @@ def get_capture_time(self, link): return None def get_warc_download_url(self, link): - if link.warc_size: - return reverse_api_view('public_archives_download', kwargs={'guid': link.guid}, request=self.context['request']) - return None + return get_download_url(self.context['request'], link, file_format='warc', public=True) + + def get_wacz_download_url(self, link): + return get_download_url(self.context['request'], link, file_format='wacz', public=True) class AuthenticatedLinkSerializer(LinkSerializer): @@ -220,9 +235,10 @@ class Meta(LinkSerializer.Meta): allowed_update_fields = ['submitted_title', 'submitted_description', 'notes', 'is_private', 'private_reason', 'default_to_screenshot_view'] def get_warc_download_url(self, link): - if link.warc_size: - 
return reverse_api_view('archives_download', kwargs={'guid': link.guid}, request=self.context['request']) - return None + return get_download_url(self.context['request'], link, file_format='warc', public=False) + + def get_wacz_download_url(self, link): + return get_download_url(self.context['request'], link, file_format='wacz', public=False) def validate_url(self, url): # Clean up the user submitted url diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py index 12d39a156..e08313ec0 100644 --- a/perma_web/api/tests/test_link_resource.py +++ b/perma_web/api/tests/test_link_resource.py @@ -8,7 +8,6 @@ from django.conf import settings from django.urls import reverse -from django.http import StreamingHttpResponse from django.test.utils import override_settings from mock import patch @@ -67,9 +66,9 @@ def setUp(self): self.public_link_download_url_for_private_link = reverse('api:public_archives_download', args=[self.unrelated_private_link.pk]) self.replaced_link_public_download_url = reverse('api:public_archives_download', args=['ABCD-0006']) - self.replaced_link_public_download_redirect_target = reverse('api:public_archives_download', args=['3SLN-JHX9']) + self.replaced_link_public_download_redirect_target = f"{reverse('api:public_archives_download', args=['3SLN-JHX9'])}?file_format=warc" self.replaced_link_authed_download_url = reverse('api:archives_download', args=['ABCD-0006']) - self.replaced_link_authed_download_redirect_target = reverse('api:archives_download', args=['3SLN-JHX9']) + self.replaced_link_authed_download_redirect_target = f"{reverse('api:archives_download', args=['3SLN-JHX9'])}?file_format=warc" self.replaced_link_owner = LinkUser.objects.get(id=4) self.logged_out_fields = [ @@ -81,6 +80,8 @@ def setUp(self): 'captures', 'warc_size', 'warc_download_url', + 'wacz_size', + 'wacz_download_url', 'queue_time', 'capture_time', ] @@ -163,12 +164,38 @@ def test_get_list_json(self): def test_get_detail_json(self): 
self.successful_get(self.public_link_detail_url, fields=self.logged_out_fields) - @patch('api.views.stream_warc', autospec=True) - def test_public_download(self, stream): - stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) + @patch('perma.models.Link.get_warc', autospec=True) + def test_public_download_warc(self, get_warc): + get_warc.return_value = StringIO("archive placeholder") resp = self.api_client.get(self.public_link_download_url) self.assertHttpOK(resp) - self.assertEqual(stream.call_count, 1) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.link.pk}.warc.gz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/gzip') + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_public_download_wacz(self, get_wacz): + get_wacz.return_value = StringIO("archive placeholder") + resp = self.api_client.get(f"{self.public_link_download_url}?file_format=wacz") + self.assertHttpOK(resp) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.link.pk}.wacz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/wacz') + self.assertEqual(get_wacz.call_count, 1) + + def test_public_download_unsupported_format(self): + self.rejected_get(f"{self.public_link_download_url}?file_format=asdf", expected_status_code=400) + + @patch('perma.models.Link.get_warc', autospec=True) + def test_download_nonexistent_warc(self, get_warc): + get_warc.side_effect = RuntimeError + self.rejected_get(self.public_link_download_url, expected_status_code=404) + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_download_nonexistent_wacz(self, get_wacz): + get_wacz.side_effect = RuntimeError + self.rejected_get(f"{self.public_link_download_url}?file_format=wacz", expected_status_code=404) + self.assertEqual(get_wacz.call_count, 1) def test_private_download_at_public_url(self): 
self.rejected_get(self.public_link_download_url_for_private_link, expected_status_code=404) @@ -194,15 +221,30 @@ def test_replaced_link_authed_download(self): self.assertEqual(resp.status_code, 302) self.assertEqual(resp.url, self.replaced_link_authed_download_redirect_target) - @patch('perma.utils.stream_warc', autospec=True) - def test_private_download(self, stream): - stream.return_value = StreamingHttpResponse(StringIO("warc placeholder")) + @patch('perma.models.Link.get_warc', autospec=True) + def test_private_download_warc(self, get_warc): + get_warc.return_value = StringIO("archive placeholder") self.api_client.force_authenticate(user=self.regular_user) resp = self.api_client.get( self.logged_in_private_link_download_url, ) self.assertHttpOK(resp) - self.assertEqual(stream.call_count, 1) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.unrelated_private_link.pk}.warc.gz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/gzip') + self.assertEqual(get_warc.call_count, 1) + + @patch('perma.models.Link.get_wacz', autospec=True) + def test_private_download_wacz(self, get_wacz): + get_wacz.return_value = StringIO("archive placeholder") + self.api_client.force_authenticate(user=self.regular_user) + resp = self.api_client.get( + f"{self.logged_in_private_link_download_url}?file_format=wacz", + ) + self.assertHttpOK(resp) + self.assertEqual(resp.get('Content-Disposition', ''), f'attachment; filename="{self.unrelated_private_link.pk}.wacz"') + self.assertEqual(resp.get('Content-Type', ''), 'application/wacz') + self.assertEqual(get_wacz.call_count, 1) + ############ # Updating # diff --git a/perma_web/api/utils.py b/perma_web/api/utils.py index b3dd86528..aae3d2f6c 100644 --- a/perma_web/api/utils.py +++ b/perma_web/api/utils.py @@ -9,7 +9,7 @@ from django.urls import resolve, reverse from django.urls.exceptions import NoReverseMatch from rest_framework import serializers -from rest_framework.exceptions import 
PermissionDenied +from rest_framework.exceptions import PermissionDenied, ValidationError from rest_framework.pagination import LimitOffsetPagination from rest_framework.response import Response from rest_framework.reverse import reverse as drf_reverse @@ -240,3 +240,31 @@ class SpoofResponse: 'data': response.data }) return responses + + +def get_download_file_format(request): + file_format = request.query_params.get('file_format', 'warc') + supported_formats = ['warc', 'wacz'] + if file_format not in supported_formats: + raise ValidationError({ + "file_format": f"The specified format is not supported. Options: {', '.join(supported_formats)}." + }) + return file_format + + +def get_download_url(request, link, file_format='warc', public=True): + view_name = f"{'public_' if public else ''}archives_download" + match file_format: + case 'warc': + if link.warc_size or link.wacz_size: + return reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request) + return None + case 'wacz': + if link.wacz_size: + base_url = reverse_api_view(view_name, kwargs={'guid': link.guid}, request=request) + return f"{base_url}?file_format=wacz" + return None + case _: + raise NotImplementedError("Unsupported file format.") + + diff --git a/perma_web/api/views.py b/perma_web/api/views.py index e7b1d87ba..38c602bf1 100644 --- a/perma_web/api/views.py +++ b/perma_web/api/views.py @@ -14,13 +14,13 @@ from rest_framework.views import APIView import surt -from perma.utils import stream_warc, stream_warc_if_permissible +from perma.utils import stream_archive, stream_archive_if_permissible from perma.celery_tasks import run_next_capture from perma.models import Folder, CaptureJob, Link, Capture, Organization, LinkBatch from .utils import TastypiePagination, load_parent, raise_general_validation_error, \ raise_invalid_capture_job, dispatch_multiple_requests, reverse_api_view_relative, \ - url_is_invalid_unicode + url_is_invalid_unicode, get_download_file_format from .serializers import 
FolderSerializer, CaptureJobSerializer, LinkSerializer, AuthenticatedLinkSerializer, \ LinkUserSerializer, OrganizationSerializer, LinkBatchSerializer, DetailedLinkBatchSerializer from django.conf import settings @@ -353,9 +353,14 @@ def get(self, request, guid, format=None): link = Link.objects.discoverable().get(pk=guid) except Link.DoesNotExist: raise Http404 + + file_format = get_download_file_format(request) + if link.replacement_link_id: - return HttpResponseRedirect(reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_warc(link) + base_url = reverse_api_view_relative('public_archives_download', kwargs={'guid': link.replacement_link_id}) + return HttpResponseRedirect(f"{base_url}?file_format={file_format}") + + return stream_archive(link, file_format=file_format) # /archives @@ -666,9 +671,11 @@ class AuthenticatedLinkDownloadView(BaseView): def get(self, request, guid, format=None): """ Download warc. """ link = self.get_object_for_user_by_pk(request.user, guid) + file_format = get_download_file_format(request) if link.replacement_link_id: - return HttpResponseRedirect(reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id})) - return stream_warc_if_permissible(link, request.user) + base_url = reverse_api_view_relative('archives_download', kwargs={'guid': link.replacement_link_id}) + return HttpResponseRedirect(f"{base_url}?file_format={file_format}") + return stream_archive_if_permissible(link, request.user, file_format=file_format) # /folders/:parent_id/archives/:guid diff --git a/perma_web/perma/models.py b/perma_web/perma/models.py index 97c0f3629..9d7c1eb41 100755 --- a/perma_web/perma/models.py +++ b/perma_web/perma/models.py @@ -2037,6 +2037,12 @@ def get_warc(self, extract_from_wacz_if_present=True, force_from_wacz=False): else: raise RuntimeError(f'No archive present for {self.guid}') + @contextmanager + def get_wacz(self): + if not self.wacz_size: + raise 
RuntimeError(f'No WACZ present for {self.guid}') + yield storages[settings.WACZ_STORAGE].open(self.wacz_storage_file(), 'rb') + def accessible_to(self, user): return user.can_edit(self) diff --git a/perma_web/perma/templates/archive/single-link.html b/perma_web/perma/templates/archive/single-link.html index 6b9765c79..a490ac0a1 100755 --- a/perma_web/perma/templates/archive/single-link.html +++ b/perma_web/perma/templates/archive/single-link.html @@ -225,7 +225,10 @@ Show playback controls {% endif %} {% if link.can_play_back %} - Download Archive + Download WARC + {% if link.wacz_size %} + Download WACZ + {% endif %} {% endif %} {% if not can_edit %} Flag as inappropriate diff --git a/perma_web/perma/templates/docs/developer/index.html b/perma_web/perma/templates/docs/developer/index.html index 42a7537a2..0cb30712e 100644 --- a/perma_web/perma/templates/docs/developer/index.html +++ b/perma_web/perma/templates/docs/developer/index.html @@ -78,20 +78,29 @@

Get all public archives

Use HTTP GET to retrieve every public archive in Perma.cc. (In order to keep this example short, we limit the number of returned items to one.)

curl {{base_url}}/public/archives/?limit=1

Response:

-
{"meta":{"limit":1,"next":"{{ base_url }}/public/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"W6PY-UZ99","creation_timestamp":"2018-05-15T18:13:52Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":20932,"warc_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W6PY-UZ99/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":4}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/public/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"W6PY-UZ99","creation_timestamp":"2018-05-15T18:13:52Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":20932,"warc_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download","wacz_size":21844,"wacz_download_url":"{{ base_url }}/public/archives/W6PY-UZ99/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W6PY-UZ99/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":4}]}

Get a single public archive's details

If we have the globally unique ID (GUID) of one public archive, we can GET details about it.

curl {{ base_url }}/public/archives/W5MF-N9EV/

Response:

-
{"guid":"W5MF-N9EV","creation_timestamp":"2018-05-14T15:12:33Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":19156,"warc_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W5MF-N9EV/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":3,"capture_time":4}
+
{"guid":"W5MF-N9EV","creation_timestamp":"2018-05-14T15:12:33Z","url":"http://example.com","title":"Example Domain","description":null,"warc_size":19156,"warc_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download","wacz_size":19962,"wacz_download_url":"{{ base_url }}/public/archives/W5MF-N9EV/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///W5MF-N9EV/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":3,"capture_time":4}

Download a single public archive

-

Perma archives are downloadable and can be viewed using tools that can replay WARC files, like ReplayWeb.page.

+

Perma archives are downloadable and can be viewed using tools that can replay WARC or WACZ files, like ReplayWeb.page.

+ +

WARC

+
wget {{ base_url }}/public/archives/Y6JJ-TDUJ/download

or

curl -o your_favorite_filename.warc.gz {{ base_url }}/public/archives/Y6JJ-TDUJ/download
+

WACZ

+ +
wget "{{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz"
+

or

+
curl -o your_favorite_filename.wacz "{{ base_url }}/public/archives/Y6JJ-TDUJ/download?file_format=wacz"
+ @@ -184,7 +193,7 @@

Create an archive

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X POST -d '{"url": "http://example.com", "title": "This is an example site", "folder": 1}' {{ base_url }}/archives/
 curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X POST -d '{"url": "http://example.com", "title": "This is another example site", "folder": 27}' {{ base_url }}/archives/

The response includes detailed information about the newly created archive, including the globally unique ID (GUID) of the archive:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19.516152Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":null,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19.516152Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19.516152Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":null,"warc_download_url":null,"wacz_size":null,"wacz_download_url":null,"captures":[{"role":"primary","status":"pending","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19.516152Z","organization":null}

Note that finalized details about an archive may not be available immediately: fields will update until the archiving process is complete. Use the Capture Job API and additional Archives API endpoints to get up-to-date details about a GUID.

To create many archives at once, use the Archives API endpoints for Batches

@@ -192,34 +201,43 @@

View the details of one arc

Use GET to retrieve details about an archive owned by you or by one of your organizations. More details are available via this authenticated Archives endpoint than are available from the non-authenticated Public Archives endpoint.

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":20183,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Download a single archive

-

Perma archives are downloadable and can be viewed using tools that can replay WARC files, like ReplayWeb.page.

+

Perma archives are downloadable and can be viewed using tools that can replay WARC or WACZ files, like ReplayWeb.page.

+ +

WARC

+
wget --header "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/Y6JJ-TDUJ/download

or

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.warc.gz {{ base_url }}/archives/Y6JJ-TDUJ/download
+

WACZ

+ +
wget --header "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" "{{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz"
+

or

+
curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -o your_favorite_filename.wacz "{{ base_url }}/archives/Y6JJ-TDUJ/download?file_format=wacz"
+

Make an archive private

Use PATCH to make an archive private.

Include the GUID of the archive in the URL, and set the archive's "is_private" field to true using JSON-encoded data:

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X PATCH -d '{"is_private": true}' {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"This is an example site","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Edit the title and notes fields of an archive

Use PATCH to change an archive's notes or title field.

Include the GUID of the archive in the URL, and specify your desired changes using JSON-encoded data:

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -H 'Content-Type: application/json' -X PATCH -d '{"title": "My updated title"}' {{ base_url }}/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":null}

Move an archive

Use PUT to move an archive into a different folder.

Include the ID of the destination folder as the first variable in the URL and the GUID of the archive as the second. The below example moves 85LS-BXV7 into folder 31.

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" -X PUT {{ base_url }}/folders/31/archives/85LS-BXV7/

Response:

-
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}
+
{"guid":"85LS-BXV7","creation_timestamp":"2018-05-16T16:11:19Z","url":"http://example.com","title":"My updated title","description":null,"warc_size":20924,"warc_download_url":"{{ base_url }}/archives/85LS-BXV7/download","wacz_size":21837,"wacz_download_url":"{{ base_url }}/archives/85LS-BXV7/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://example.com","record_type":"response","content_type":"text/html","user_upload":false},{"role":"screenshot","status":"success","url":"file:///85LS-BXV7/cap.png","record_type":"resource","content_type":"image/png","user_upload":false}],"queue_time":0,"capture_time":3,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-17T16:11:19Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}

Delete an archive

Use DELETE to delete an archive.

@@ -232,7 +250,7 @@

View all archives

This includes both public and private archives and includes all folders. (We're limiting ourselves to just 1 result to keep this example short.)

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/archives/?limit=1

Response:

-
{"meta":{"limit":1,"next":"{{ base_url }}/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"F9BV-XLHU","creation_timestamp":"2018-05-05T23:03:41Z","url":"http://example.com","title":"example.com","description":"","warc_size":null,"warc_download_url":"{{ base_url }}/archives/F9BV-XLHU/download","captures":[{"role":"primary","status":"success","url":"file:///F9BV-XLHU/upload.png","record_type":"resource","content_type":"image/png","user_upload":true}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":1,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-06T23:03:41Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/archives/?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"F9BV-XLHU","creation_timestamp":"2018-05-05T23:03:41Z","url":"http://example.com","title":"example.com","description":"","warc_size":null,"warc_download_url":null,"wacz_size":null,"wacz_download_url":null,"captures":[{"role":"primary","status":"pending","url":"file:///F9BV-XLHU/upload.png","record_type":"resource","content_type":"image/png","user_upload":true}],"queue_time":null,"capture_time":null,"notes":"","created_by":{"id":1,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":true,"private_reason":"user","user_deleted":false,"archive_timestamp":"2018-05-06T23:03:41Z","organization":{"id":3,"name":"A Third Journal","registrar":"Test Library","default_to_private":true,"shared_folder":{"id":31,"name":"A Third Journal","parent":null,"has_children":true,"path":"31","organization":3}}}]}

To restrict the results to a single folder, use the Folders API endpoint.

Work with batches of archives

@@ -283,7 +301,7 @@

View a folder's subfolders

View a folder's archives

Use GET to list the archives stored in a folder. (We're limiting ourselves to just 1 result to keep this example short.)

curl -H "Authorization: ApiKey {{ request.user.get_api_key | default:'your-api-key'}}" {{ base_url }}/folders/25/archives?limit=1
-
{"meta":{"limit":1,"next":"{{ base_url }}/folders/24/archives?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"X5BR-VEZT","creation_timestamp":"2018-01-31T14:46:49Z","url":"http://perma.cc","title":"Perma.cc","description":" Broken links are everywhere. Perma helps authors and journals create permanent links for citations in their published work.","warc_size":2700433,"warc_download_url":"{{base_url }}/v1/archives/X5BR-VEZT/download","captures":[{"role":"primary","status":"success","url":"http://perma.cc","record_type":"response","content_type":"text/html; charset=utf-8","user_upload":false},{"role":"screenshot","status":"success","url":"file:///X5BR-VEZT/cap.png","record_type":"resource","content_type":"image/png","user_upload":false},{"role":"favicon","status":"success","url":"https://perma.cc/static/img/favicon.ico","record_type":"response","content_type":"image/x-icon","user_upload":false}],"queue_time":0,"capture_time":10,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-02-01T14:46:49Z","organization":null}]}
+
{"meta":{"limit":1,"next":"{{ base_url }}/folders/25/archives?limit=1&offset=1","offset":0,"previous":null},"objects":[{"guid":"X5BR-VEZT","creation_timestamp":"2018-01-31T14:46:49Z","url":"http://perma.cc","title":"Perma.cc","description":" Broken links are everywhere. Perma helps authors and journals create permanent links for citations in their published work.","warc_size":2700433,"warc_download_url":"{{base_url }}/v1/archives/X5BR-VEZT/download","wacz_size":2701999,"wacz_download_url":"{{base_url }}/v1/archives/X5BR-VEZT/download?file_format=wacz","captures":[{"role":"primary","status":"success","url":"http://perma.cc","record_type":"response","content_type":"text/html; charset=utf-8","user_upload":false},{"role":"screenshot","status":"success","url":"file:///X5BR-VEZT/cap.png","record_type":"resource","content_type":"image/png","user_upload":false},{"role":"favicon","status":"success","url":"https://perma.cc/static/img/favicon.ico","record_type":"response","content_type":"image/x-icon","user_upload":false}],"queue_time":0,"capture_time":10,"notes":"","created_by":{"id":3,"first_name":"Jane","last_name":"Doe","full_name":"Jane Doe","short_name":"Jane"},"is_private":false,"private_reason":null,"user_deleted":false,"archive_timestamp":"2018-02-01T14:46:49Z","organization":null}]}

Note that archives stored in the folder's subfolders are not included.

Rename a folder

diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py index 321a52fa7..ba31a4255 100644 --- a/perma_web/perma/utils.py +++ b/perma_web/perma/utils.py @@ -613,19 +613,42 @@ def get_warc_stream(link, stream=True): response['Content-Disposition'] = f'attachment; filename="{filename}"' return response +def get_wacz_stream(link, stream=True): + with link.get_wacz() as wacz_file: + wacz_stream = FileWrapper(wacz_file) + if stream: + response = StreamingHttpResponse(wacz_stream, content_type="application/wacz") + else: + response = HttpResponse(wacz_stream, content_type="application/wacz") + response['Content-Disposition'] = f'attachment; filename="{link.guid}.wacz"' + return response + -def stream_warc(link, stream=True): +def stream_archive(link, stream=True, file_format='warc'): # `link.user_deleted` is checked here for dev convenience: - # it's easy to forget that deleted links/warcs aren't truly deleted, - # and easy to accidentally permit the downloading of "deleted" warcs. - # Users of stream_warc shouldn't have to worry about / remember this. + # it's easy to forget that deleted Perma Links' files aren't truly deleted, + # and easy to accidentally permit the downloading of "deleted" archive files. + # Users of stream_archive shouldn't have to worry about / remember this. 
if link.user_deleted or not link.can_play_back(): raise Http404 - return get_warc_stream(link, stream) -def stream_warc_if_permissible(link, user, stream=True): + try: + match file_format: + case 'warc': + return get_warc_stream(link, stream) + case 'wacz': + return get_wacz_stream(link, stream) + case _: + raise NotImplementedError("Unsupported file format.") + except RuntimeError: + # If the requested format is not available, return 404 + # just like with deleted and failed Perma Links + raise Http404 + + +def stream_archive_if_permissible(link, user, stream=True, file_format='warc'): if user.can_view(link): - return stream_warc(link, stream) + return stream_archive(link, stream, file_format) return HttpResponseForbidden('Private archive.') diff --git a/perma_web/perma/views/playback.py b/perma_web/perma/views/playback.py index 6c58e3aca..75c41ccca 100644 --- a/perma_web/perma/views/playback.py +++ b/perma_web/perma/views/playback.py @@ -14,12 +14,12 @@ from perma.models import Link from perma.utils import (if_anonymous, ratelimit_ip_key, memento_url, timemap_url, timegate_url, - protocol, remove_control_characters, stream_warc_if_permissible) + protocol, remove_control_characters, stream_archive_if_permissible) import logging logger = logging.getLogger(__name__) -valid_serve_types = ['image', 'warc_download', 'standard'] +valid_serve_types = ['image', 'warc_download', 'wacz_download', 'standard'] @if_anonymous(cache_control(max_age=settings.CACHE_MAX_AGES['single_permalink'])) @@ -59,7 +59,11 @@ def single_permalink(request, guid): # serve raw WARC if serve_type == 'warc_download': - return stream_warc_if_permissible(link, request.user) + return stream_archive_if_permissible(link, request.user, file_format='warc') + + # serve raw WACZ + if serve_type == 'wacz_download': + return stream_archive_if_permissible(link, request.user, file_format='wacz') # handle requested capture type if serve_type == 'image':