This repository has been archived by the owner on May 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
ingest_mets_job.rb
153 lines (130 loc) · 5.24 KB
/
ingest_mets_job.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
require 'new_relic/agent/method_tracer'
# Background job that ingests a METS file into the repository. Depending on the
# METS document it creates either a single resource with its files, or a
# multi-volume work whose ordered members are one resource per volume.
class IngestMETSJob < ApplicationJob
# Mixed in for the add_method_tracer macro applied to #ingest_file below.
include ::NewRelic::Agent::MethodTracer
# Jobs of this class are placed on the :ingest queue.
queue_as :ingest
# Job entry point: logs the file being processed, builds the ingest service
# and the METS document wrapper, remembers the acting user, then runs #ingest.
#
# @param mets_file [String] filename of a METS file to ingest
# @param user [String] user to ingest as
# @return the value returned by #ingest
def perform(mets_file, user)
  logger.info("Ingesting METS #{mets_file}")
  @ingest = IngestService.new(logger)
  document_factory = METSDocument::Factory.new(mets_file)
  @mets = document_factory.new
  @user = user
  ingest
end
private
# Builds the repository objects described by the loaded METS document.
#
# Multi-volume documents: the volumes are ingested first, then the parent
# resource is created, given the volumes as ordered members and the first
# volume's thumbnail, saved, and finally every volume is reindexed.
#
# Otherwise: one resource is created, its files are ingested, the logical
# order and viewing hint are applied when the METS provides them, the resource
# is saved, and the ingested file count is checked via #validate!.
def ingest
  if @mets.multi_volume?
    members = ingest_volumes
    parent = create_resource
    parent.ordered_members = members
    parent.thumbnail_id = members.first.thumbnail_id
    parent.save!
    return members.each(&:update_index)
  end

  work = create_resource
  ingest_files(resource: work, files: @mets.files)
  work.logical_order.order = map_fileids(@mets.structure) if @mets.structure.present?
  work.viewing_hint = [@mets.viewing_hint] if @mets.viewing_hint
  work.save!
  validate!(work)
end
# Creates the top-level record for the METS document.
#
# First asks the ingest service to delete duplicates matching the document's
# (Solr-escaped) ARK identifier. The record class is MultiVolumeWork for
# multi-volume METS and ScannedResource otherwise. Every non-blank collection
# slug is resolved through #find_or_create_collection, and the METS file
# itself is attached to the new record via #attach_mets.
#
# @return the record returned by the ingest service's minimal_record
def create_resource
  duplicate_query = "identifier_ssim:#{RSolr.solr_escape(@mets.ark_id)}"
  @ingest.delete_duplicates!(duplicate_query)

  work_class = @mets.multi_volume? ? MultiVolumeWork : ScannedResource
  slugs = Array(@mets.collection_slugs).select(&:present?)
  collections = slugs.map { |slug| find_or_create_collection(slug) }

  record = @ingest.minimal_record(
    work_class, @user,
    viewing_direction: [@mets.viewing_direction],
    identifier: [@mets.ark_id],
    replaces: [@mets.pudl_id],
    source_metadata_identifier: [@mets.bib_id],
    member_of_collections: collections
  )
  record.tap { |created| attach_mets(created) }
end
# Compares the number of files listed in the METS with the number of members
# on the resource and logs a message when they differ. Despite the bang in the
# name, a mismatch is only logged; nothing is raised.
#
# @param resource the ingested resource (must respond to #id and #member_ids)
# @return [nil] when the counts match; otherwise whatever logger.info returns
def validate!(resource)
  expected = @mets.files.length
  actual = resource.member_ids.length
  return if expected == actual

  logger.info "Incorrect number of files ingested for #{resource.id}: #{actual} of expected #{expected}"
end
# Looks up the Collection whose exhibit_id_ssim matches +slug+, creating and
# saving a new one (owned by the ingesting user) when none exists.
#
# @param slug [String, nil] collection slug taken from the METS document
# @return [Collection, nil] the existing or newly saved collection; nil when
#   the slug is blank
# @raise [StandardError] from #metadata_for_collection when a new collection
#   is needed but no metadata is configured for the slug
def find_or_create_collection(slug)
  return unless slug.present?

  match = Collection.where(exhibit_id_ssim: slug).first
  return match if match

  Collection.new(metadata_for_collection(slug)).tap do |collection|
    collection.apply_depositor_metadata(@user)
    collection.save!
  end
end
# Builds the attributes for a new Collection from the first configured entry
# whose 'slug' equals +slug+.
#
# @param slug [String] collection slug to look up
# @return [Hash] :exhibit_id (the slug), :title and :description (the entry's
#   'title' and 'blurb', each wrapped in a one-element array)
# @raise [StandardError] when no configured entry has the given slug
def metadata_for_collection(slug)
  entry = collection_metadata.find { |candidate| candidate['slug'] == slug }
  raise StandardError, "No collection metadata found for slug '#{slug}'" unless entry

  { exhibit_id: slug, title: [entry['title']], description: [entry['blurb']] }
end
# Parsed contents of config/pudl_collections.json under Rails.root. The file is
# read and parsed on first use and the result is cached on the instance.
#
# @return the parsed JSON document
def collection_metadata
  return @collection_metadata if @collection_metadata

  config_path = File.join(Rails.root, 'config', 'pudl_collections.json')
  @collection_metadata = JSON.parse(File.read(config_path))
end
# Attaches the METS source file to +resource+ as a file set titled 'METS XML'.
#
# The file is opened with the block form of File.open so the handle is closed
# as soon as attach_content returns (previously it was never closed).
# NOTE(review): this assumes attach_content finishes reading the IO before it
# returns - confirm against BatchFileSetActor.
#
# @param resource the record the METS file set is related to
# @return whatever BatchFileSetActor#attach_content returns
def attach_mets(resource)
  mets_file_set = FileSet.new
  mets_file_set.title = ['METS XML']
  actor = BatchFileSetActor.new(mets_file_set, @user)
  actor.attach_related_object(resource)
  File.open(@mets.source_file, 'r:UTF-8') { |mets_io| actor.attach_content(mets_io) }
end
# Ingests each METS file entry through #ingest_file and stores the resulting
# file sets, in input order, as the resource's ordered members. Falsy results
# are left out.
#
# @param resource the record that receives the files
# @param files [Array] METS file entries to ingest
# @return [Array] the file sets assigned to resource.ordered_members
def ingest_files(resource: nil, files: [])
  ingested = files.map { |file| ingest_file(resource: resource, f: file) }
  resource.ordered_members = ingested.select { |file_set| file_set }
end
# Ingests one METS file entry as a file set on +resource+, records the METS
# file id => file set id mapping, and, when the entry's path is the METS
# thumbnail path, makes the new file set the resource thumbnail and saves.
#
# Any StandardError triggers a retry by calling this method again with an
# incremented +count+. Once +count+ exceeds 4 the error is re-raised, so one
# call makes at most six attempts.
#
# @param resource the record that receives the file
# @param f [Hash] METS file entry; :id, :path and :replaces are read here
# @param count [Integer] number of failed attempts so far (used by retries)
# @return the file set returned by the ingest service
# @raise [StandardError] the last failure once the retries are exhausted
def ingest_file(resource: nil, f: nil, count: 0)
  source = @mets.decorated_file(f)
  options = @mets.file_opts(f)
  label = @mets.file_label(f[:id])
  ingested = @ingest.ingest_file(resource, source, @user, options, title: [label], replaces: [f[:replaces]])
  mets_to_repo_map[f[:id]] = ingested.id
  return ingested unless f[:path] == @mets.thumbnail_path

  resource.thumbnail_id = ingested.id
  resource.save!
  ingested
rescue StandardError => error
  raise error if count > 4

  attempt = count + 1
  logger.info "Failed ingesting #{f[:path]} #{attempt} times, retrying. Error: #{error.message}"
  ingest_file(resource: resource, f: f, count: attempt)
end
# Wrap #ingest_file with the New Relic method tracer so its calls are reported
# under the 'IngestMETSJob/ingest_file' metric name.
add_method_tracer :ingest_file, 'IngestMETSJob/ingest_file'
# Produces one saved record per volume id listed in the METS document.
#
# A volume already in the repository (looked up by "<pudl_id>/<volume_id>"
# through #find_volume) is reused; otherwise a ScannedResource is created and
# the volume's files are ingested into it. In both cases the logical order,
# title, viewing direction and (when present) viewing hint are refreshed from
# the METS before saving.
#
# A record without a thumbnail falls back to its first file set. A volume
# with no file sets is saved without a thumbnail instead of raising
# NoMethodError on nil.
#
# NOTE(review): for a reused volume no files are ingested in this run, so
# #map_fileids has no id mapping for its proxies - confirm that is intended.
#
# @return [Array] the volume records, in METS volume order
def ingest_volumes
  @mets.volume_ids.map do |volume_id|
    volume = find_volume("#{@mets.pudl_id}/#{volume_id}")
    if volume
      logger.info "Found existing volume: #{volume.id}"
    else
      volume = @ingest.minimal_record(ScannedResource, @user, title: [@mets.label_for_volume(volume_id)])
      ingest_files(resource: volume, files: @mets.files_for_volume(volume_id))
    end
    volume.logical_order.order = map_fileids(@mets.structure_for_volume(volume_id))
    unless volume.thumbnail_id
      first_file_set = volume.file_sets.first
      volume.thumbnail_id = first_file_set.id if first_file_set
    end
    volume.title = [@mets.label_for_volume(volume_id)]
    volume.viewing_direction = [@mets.viewing_direction]
    volume.viewing_hint = [@mets.viewing_hint] if @mets.viewing_hint
    volume.save!
    volume
  end
end
# Finds the repository object whose replaces_ssim Solr field matches
# +replaces+.
#
# The value is escaped with RSolr.solr_escape before being interpolated into
# the query, as #create_resource does for the ARK. Callers pass
# "<pudl_id>/<volume_id>", and an unescaped "/" is a metacharacter in Solr
# query syntax.
#
# @param replaces [String] legacy identifier to search for
# @return the object loaded for the first Solr hit, or nil when nothing matches
def find_volume(replaces)
  solr_rec = ActiveFedora::SolrService.query("replaces_ssim:#{RSolr.solr_escape(replaces)}").first
  ActiveFedora::Base.find(solr_rec['id']) if solr_rec
end
# Walks a METS structure hash and rewrites it in place: every :proxy value (a
# METS file id) is replaced with the id recorded for it in #mets_to_repo_map
# (nil when none was recorded), and every element under :nodes is processed
# recursively. Other keys are left untouched.
#
# @param hsh [Hash] structure node; it is mutated
# @return [Hash] the same hash object that was passed in
def map_fileids(hsh)
  hsh.each do |key, value|
    case key
    when :nodes
      hsh[key] = value.each { |child| map_fileids(child) }
    when :proxy
      hsh[key] = mets_to_repo_map[value]
    end
  end
end
# Lazily created hash mapping METS file ids to the ids of the file sets
# ingested for them. Filled by #ingest_file and read by #map_fileids.
#
# @return [Hash]
def mets_to_repo_map
  @mets_to_repo_map = {} unless @mets_to_repo_map
  @mets_to_repo_map
end
end