Skip to content

Commit

Permalink
Merge pull request #3419 from rebeccacremona/no-provenance
Browse files: browse the repository at this point in the history
Don't assume provenance summary is present.
  • Loading branch information
rebeccacremona authored Oct 30, 2023
2 parents (01e6892 + 10cf44e); commit ac1454d
Showing 1 changed file with 22 additions and 16 deletions.
38 changes: 22 additions & 16 deletions perma_web/perma/celery_tasks.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -835,15 +835,9 @@ def save_scoop_capture(link, capture_job, data):
description = data['scoop_capture_summary']['pageInfo'].get('description')
if description:
link.submitted_description=description[:300]
software = data['scoop_capture_summary']['provenanceInfo']['software'].lower()
version = data['scoop_capture_summary']['provenanceInfo']['version'].lower()
link.captured_by_software = f"{software}: {version}"
link.captured_by_browser = data['scoop_capture_summary']['provenanceInfo']['userAgent']
link.save(update_fields=[
'submitted_title',
'submitted_description',
'captured_by_software',
'captured_by_browser'
'submitted_description'
])

# Make this link private by policy, if the captured domain is on the list.
Expand Down Expand Up @@ -884,16 +878,28 @@ def save_scoop_capture(link, capture_job, data):
#
# OTHER ATTACHMENTS
#
provenance_filename = data['scoop_capture_summary']['attachments'].get("provenanceSummary")
if provenance_filename:
Capture(
link=link,
role='provenance_summary',
status='success',
record_type='response',
url=f"file:///{provenance_filename}",
content_type='text/html; charset=utf-8',
).save()

provenance_filename = data['scoop_capture_summary']['attachments']["provenanceSummary"]
Capture(
link=link,
role='provenance_summary',
status='success',
record_type='response',
url=f"file:///{provenance_filename}",
content_type='text/html; charset=utf-8',
).save()
software = data['scoop_capture_summary']['provenanceInfo']['software'].lower()
version = data['scoop_capture_summary']['provenanceInfo']['version'].lower()
link.captured_by_software = f"{software}: {version}"
link.captured_by_browser = data['scoop_capture_summary']['provenanceInfo']['userAgent']
link.save(update_fields=[
'captured_by_software',
'captured_by_browser'
])
else:
link.tags.add('scoop-missing-provenance')
logger.warning(f"{capture_job.link_id}: Scoop warc does not contain provenance summary ({data['id_capture']}).")

#
# WARC
Expand Down

0 comments on commit ac1454d

Please sign in to comment.