Skip to content

Commit

Permalink
index Hyrax::FileSet in the valkyrie index
Browse files Browse the repository at this point in the history
This adds a basic solr doc for Hyrax::FileSet which is a Valkyrie::Resource.
  • Loading branch information
elrayle authored and tamsin johnson committed Mar 16, 2021
1 parent 386359c commit 10f92a2
Show file tree
Hide file tree
Showing 14 changed files with 595 additions and 8 deletions.
27 changes: 26 additions & 1 deletion app/actors/hyrax/actors/create_with_remote_files_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,17 @@ def attach_files(env, remote_files)
true
end

# Creates a file set for a remote file, choosing the implementation from the
# kind of work being deposited.
#
# @param env the actor environment; only `env.curation_concern` is read here
# @param uri the location of the file to import
# @param file_name the label to give the new file set
# @param auth_header headers handed on to the chosen implementation; defaults
#   to an empty Hash so callers that pass only three arguments keep working
# @return whatever the selected implementation returns
def create_file_from_url(env, uri, file_name, auth_header = {})
  if env.curation_concern.is_a?(Valkyrie::Resource)
    create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  else
    create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
  end
end

# Generic utility for creating FileSet from a URL
# Used to import files using URLs from a file picker like browse_everything
def create_file_from_url(env, uri, file_name, auth_header = {})
def create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
import_url = URI.decode_www_form_component(uri.to_s)
::FileSet.new(import_url: import_url, label: file_name) do |fs|
actor = Hyrax::Actors::FileSetActor.new(fs, env.user)
Expand All @@ -80,6 +88,23 @@ def create_file_from_url(env, uri, file_name, auth_header = {})
end
end

# Generic utility for creating a Hyrax::FileSet from a URL.
# Used to import files using URLs from a file picker like browse_everything.
#
# Saves a new file set carrying the decoded URL and label, gives it the
# work's visibility, attaches it to the work, then queues the job that
# fetches the content.
#
# @param env the actor environment (`curation_concern` and `user` are read)
# @param uri the source location; its scheme selects the job to queue
# @param file_name the label for the new file set
# @param auth_header headers passed to ImportUrlJob for non-file URLs
# @return the result of the `perform_later` call that was made
def create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  decoded_url = URI.decode_www_form_component(uri.to_s)
  unsaved = Hyrax::FileSet.new(import_url: decoded_url, label: file_name)
  file_set = Hyrax.persister.save(resource: unsaved)

  file_set_actor = Hyrax::Actors::FileSetActor.new(file_set, env.user, use_valkyrie: true)
  file_set_actor.create_metadata(visibility: env.curation_concern.visibility)
  file_set_actor.attach_to_work(env.curation_concern)

  # Anything that is not a file: URL is fetched by the import job.
  unless uri.scheme == 'file'
    return ImportUrlJob.perform_later(file_set, operation_for(user: file_set_actor.user), auth_header)
  end

  # Turn any %20 into spaces before handing the path to the local ingest job.
  IngestLocalFileJob.perform_later(file_set, CGI.unescape(uri.path), env.user)
end

def operation_for(user:)
Hyrax::Operation.create!(user: user,
operation_type: "Attach Remote File")
Expand Down
6 changes: 4 additions & 2 deletions app/actors/hyrax/actors/file_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class FileActor
# @param [FileSet] file_set the parent FileSet
# @param [Symbol, #to_sym] relation the type/use for the file
# @param [User] user the user to record as the Agent acting upon the file
def initialize(file_set, relation, user, use_valkyrie: false)
def initialize(file_set, relation, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@relation = normalize_relation(relation)
Expand Down Expand Up @@ -75,7 +75,7 @@ def perform_ingest_file_through_active_fedora(io)
CharacterizeJob.perform_later(file_set, repository_file.id, pathhint(io))
end

def perform_ingest_file_through_valkyrie(io)
def perform_ingest_file_through_valkyrie(io) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
file =
begin
Hyrax.storage_adapter.upload(resource: file_set, file: io, original_filename: io.original_name, use: relation)
Expand All @@ -87,7 +87,9 @@ def perform_ingest_file_through_valkyrie(io)
create_version(file_metadata, user)
id = file_metadata.file_identifier
file_set.file_ids << id
file_set.original_file_id = id
Hyrax.persister.save(resource: file_set)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
CharacterizeJob.perform_later(file_set, id.to_s, pathhint(io))
file_metadata
end
Expand Down
4 changes: 2 additions & 2 deletions app/actors/hyrax/actors/file_set_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class FileSetActor # rubocop:disable Metrics/ClassLength
include Lockable
attr_reader :file_set, :user, :attributes, :use_valkyrie

def initialize(file_set, user, use_valkyrie: false)
def initialize(file_set, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@user = user
Expand Down Expand Up @@ -87,7 +87,7 @@ def attach_to_work(work, file_set_params = {})
def attach_to_valkyrie_work(work, file_set_params)
work = Hyrax.query_service.find_by(id: work.id) unless work.new_record
file_set.visibility = work.visibility unless assign_visibility?(file_set_params)
Hyrax.persister.save(resource: file_set)
@file_set = Hyrax.persister.save(resource: file_set)
work.member_ids << file_set.id
work.representative_id = file_set.id if work.representative_id.blank?
work.thumbnail_id = file_set.id if work.thumbnail_id.blank?
Expand Down
116 changes: 116 additions & 0 deletions app/indexers/hyrax/valkyrie_file_set_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# frozen_string_literal: true

module Hyrax
  ##
  # Indexes Hyrax::FileSet objects
  #
  # Starts from the solr document built by the superclass and included
  # indexers, adds the file set's own file ids, and, when an original file
  # can be found, adds that file's metadata.
  class ValkyrieFileSetIndexer < Hyrax::ValkyrieIndexer
    include Hyrax::ResourceIndexer
    include Hyrax::PermissionIndexer
    include Hyrax::VisibilityIndexer
    include Hyrax::Indexer(:core_metadata)
    include Hyrax::Indexer(:basic_metadata)

    # include Hyrax::IndexesThumbnails # TODO: Is there a Valkyrie version of a thumbnail indexer?

    # File metadata attributes written to "<attribute>_tesim", each only when
    # its value is present. The order is the order in which the fields are
    # added to the solr document.
    FILE_METADATA_TESIM_ATTRIBUTES = %i[
      type

      format_label size well_formed valid fits_version exif_version checksum

      frame_rate bit_rate duration sample_rate
      height width

      bit_depth channels data_format offset

      file_title page_count language word_count character_count line_count
      character_set markup_basis paragraph_count markup_language table_count
      graphics_count

      byte_order compression color_space profile_name profile_version
      orientation color_map image_producer capture_device scanning_software
      gps_timestamp latitude longitude

      aspect_ratio
    ].freeze
    private_constant :FILE_METADATA_TESIM_ATTRIBUTES
    # Groups above, in order: file type; attributes set by fits; attributes
    # shared by audio/video and by image/video; fits audio attributes; fits
    # document attributes; fits image attributes; fits video attributes.

    ##
    # @return [Hash] the superclass solr document extended with file set and
    #   original-file fields; only the file set id fields are added when no
    #   original file is found
    def to_solr
      super.tap do |solr_doc|
        index_file_set_ids(solr_doc)

        file_metadata = original_file
        # `tap` returns solr_doc either way, so skipping the rest is enough.
        next unless file_metadata

        index_original_file(solr_doc, file_metadata)
        index_file_metadata_attributes(solr_doc, file_metadata)
      end
    end

    private

    # Adds the ids the file set itself holds (always written, even if blank).
    def index_file_set_ids(solr_doc)
      solr_doc['file_ids_ssim'] = resource.file_ids&.map(&:to_s)
      solr_doc['original_file_id_ssi'] = resource.original_file_id.to_s
      solr_doc['thumbnail_id_ssi'] = resource.thumbnail_id.to_s
      solr_doc['extracted_text_id_ssi'] = resource.extracted_text_id.to_s
    end

    # Adds the original file's identity fields: alternate ids, filename,
    # mime type, derived format description and size.
    def index_original_file(solr_doc, file_metadata)
      alternate_ids = file_metadata.alternate_ids
      solr_doc['original_file_alternate_ids_tesim'] = alternate_ids.map(&:to_s) if alternate_ids.present?

      filename = file_metadata.original_filename
      if filename.present?
        solr_doc['original_filename_tesi'] = filename
        solr_doc['original_filename_ssi'] = filename
      end

      mime_type = file_metadata.mime_type
      if mime_type.present?
        solr_doc['mime_type_tesi'] = mime_type
        solr_doc['mime_type_ssi'] = mime_type
      end

      # The format and size fields are written unconditionally (possibly nil).
      format_description = file_format(file_metadata)
      solr_doc['file_format_tesim'] = format_description
      solr_doc['file_format_sim'] = format_description
      solr_doc['file_size_lts'] = file_metadata.size[0]
    end

    # Copies each listed attribute to its "_tesim" field when it has a value.
    def index_file_metadata_attributes(solr_doc, file_metadata)
      FILE_METADATA_TESIM_ATTRIBUTES.each do |attribute|
        value = file_metadata.public_send(attribute)
        solr_doc["#{attribute}_tesim"] = value if value.present?
      end
    end

    # Finds the file whose metadata is indexed: the file set's original file,
    # or the first of its files when the original-file query raises
    # ObjectNotFoundError.
    def original_file
      Hyrax.custom_queries.find_original_file(file_set: resource)
    rescue Valkyrie::Persistence::ObjectNotFoundError
      Hyrax.custom_queries.find_files(file_set: resource).first
    end

    # Describes the file format from the mime subtype and/or format label.
    #
    # @return "subtype (label, ...)" when both are present, the subtype alone,
    #   the format label value alone, or nil when neither is present
    def file_format(file)
      if file.mime_type.present? && file.format_label.present?
        "#{file.mime_type.split('/').last} (#{file.format_label.join(', ')})"
      elsif file.mime_type.present?
        file.mime_type.split('/').last
      elsif file.format_label.present?
        file.format_label
      end
    end
  end
end
7 changes: 6 additions & 1 deletion app/indexers/hyrax/valkyrie_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,12 @@ class << self
# @example
# ValkyrieIndexer.for(resource: Book.new) # => #<BookIndexer ...>
def for(resource:)
  # File sets always get the dedicated file set indexer; every other
  # resource goes through the indexer class lookup. Exactly one indexer is
  # built per call.
  case resource
  when Hyrax::FileSet
    Hyrax::ValkyrieFileSetIndexer.new(resource: resource)
  else
    indexer_class_for(resource).new(resource: resource)
  end
end

private
Expand Down
1 change: 1 addition & 0 deletions app/models/hyrax/file_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module Hyrax
# @see https://wiki.duraspace.org/display/samvera/Hydra%3A%3AWorks+Shared+Modeling
class FileSet < Hyrax::Resource
include Hyrax::Schema(:core_metadata)
include Hyrax::Schema(:basic_metadata)

attribute :file_ids, Valkyrie::Types::Array.of(Valkyrie::Types::ID) # id for FileMetadata resources
attribute :original_file_id, Valkyrie::Types::ID # id for FileMetadata resource
Expand Down
2 changes: 2 additions & 0 deletions app/models/hyrax/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class Work < Hyrax::Resource
attribute :on_behalf_of, Valkyrie::Types::String
attribute :proxy_depositor, Valkyrie::Types::String
attribute :state, Valkyrie::Types::URI.default(Hyrax::ResourceStatus::ACTIVE)
attribute :representative_id, Valkyrie::Types::ID
attribute :thumbnail_id, Valkyrie::Types::ID

##
# @return [Boolean] true
Expand Down
2 changes: 1 addition & 1 deletion app/models/job_io_wrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def size
nil # unable to determine
end

# Loads the file set identified by `file_set_id`.
#
# @param use_valkyrie when falsey the file set is loaded with `FileSet.find`;
#   otherwise it is looked up through `Hyrax.query_service`. Defaults to
#   `Hyrax.config.query_index_from_valkyrie`.
# @return the object returned by the selected lookup
def file_set(use_valkyrie: Hyrax.config.query_index_from_valkyrie)
  return FileSet.find(file_set_id) unless use_valkyrie

  Hyrax.query_service.find_by(id: Valkyrie::ID.new(file_set_id))
end
Expand Down
3 changes: 2 additions & 1 deletion app/services/hyrax/work_uploads_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def initialize(work:, persister: Hyrax.persister)
# @note we immediately and silently discard uploads with an existing
# file_set_uri, in a half-considered attempt at supporting idempotency
# (for job retries). this is for legacy/AttachFilesToWorkJob
# compatibility, but could stand for a roubst reimplementation.
# compatibility, but could stand for a robust reimplementation.
#
# @param [Enumerable<Hyrax::UploadedFile>] files files to add
#
Expand Down Expand Up @@ -106,6 +106,7 @@ def make_file_set_and_ingest(file)
Hyrax::AccessControlList.copy_permissions(source: target_permissions, target: file_set)
append_to_work(file_set)
IngestJob.perform_later(wrap_file(file, file_set))
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: file.user)
{ file_set: file_set, user: file.user }
end

Expand Down
2 changes: 2 additions & 0 deletions config/initializers/listeners.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@

# After a file set is created, publish 'file.set.attached' and then
# 'object.metadata.updated', each carrying the file set and the acting user.
# NOTE(review): the second event presumably lets an index listener re-index
# the file set — confirm against the listeners subscribed to it.
Hyrax.config.callback.set(:after_create_fileset, warn: false) do |file_set, user|
Hyrax.publisher.publish('file.set.attached', file_set: file_set, user: user)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

# After a file set's content is reverted, publish 'file.set.restored' (with
# the revision that was restored) and then 'object.metadata.updated' for the
# same file set and user.
# NOTE(review): the second event presumably triggers re-indexing of the file
# set — confirm against the listeners subscribed to it.
Hyrax.config.callback.set(:after_revert_content, warn: false) do |file_set, user, revision|
Hyrax.publisher.publish('file.set.restored', file_set: file_set, user: user, revision: revision)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

Hyrax.config.callback.set(:after_update_metadata, warn: false) do |curation_concern, user|
Expand Down
110 changes: 110 additions & 0 deletions spec/actors/hyrax/actors/create_with_remote_files_actor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,114 @@
expect(actor.send(:validate_remote_url, URI('https://example.com/test.txt'))).to be true
end
end

# Runs the remote-file scenarios against a work built with
# valkyrie_create(:monograph). The helpers url1, url2, file, user, actor and
# environment are not defined in this context; they come from the enclosing
# spec (url1 and file are overridden below where a scenario needs it).
context 'when work is a valkyrie resource' do
let(:work) { valkyrie_create(:monograph) }

# Two remote files without auth headers: one ImportUrlJob per file, each
# given an empty header hash.
context "with source uris that are remote" do
let(:remote_files) do
[{ url: url1,
expires: "2014-03-31T20:37:36.214Z",
file_name: "filepicker-demo.txt.txt" },
{ url: url2,
expires: "2014-03-31T20:37:36.731Z",
file_name: "Getting+Started.pdf" }]
end

it "attaches files" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, {}).twice
expect(actor.create(environment)).to be true
end
end

# url1 carries percent-encoded query parameters; the file set must be built
# with exactly that import_url.
context "with source URIs that are remote and contain encoded parameters" do
let(:url1) { "https://dl.dropbox.com/fake/file?param1=%28example%29&param2=%5Bexample2%5D" }

before do
# Spy on the constructor (while still calling it) so the arguments can be
# checked after the actor has run.
allow(Hyrax::FileSet).to receive(:new).and_call_original
end

it "preserves the encoded parameters in the URIs" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, {}).twice
expect(actor.create(environment)).to be true
expect(Hyrax::FileSet).to have_received(:new).with(import_url: "https://dl.dropbox.com/fake/file?param1=%28example%29&param2=%5Bexample2%5D", label: "filepicker-demo.txt.txt")
end
end

# Each remote file supplies an auth_header; the same header hash must reach
# ImportUrlJob for both files.
context "with source uris that are remote bearing auth headers" do
let(:remote_files) do
[{ url: url1,
expires: "2014-03-31T20:37:36.214Z",
file_name: "filepicker-demo.txt.txt",
auth_header: { 'Authorization' => 'Bearer access-token' } },
{ url: url2,
expires: "2014-03-31T20:37:36.731Z",
file_name: "Getting+Started.pdf",
auth_header: { 'Authorization' => 'Bearer access-token' } }]
end

it "attaches files" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, 'Authorization' => 'Bearer access-token').twice
expect(actor.create(environment)).to be true
end
end

# A single file: URL is ingested with IngestLocalFileJob instead of
# ImportUrlJob.
context "with source uris that are local files" do
let(:remote_files) do
[{ url: file,
expires: "2014-03-31T20:37:36.214Z",
file_name: "here.txt" }]
end

before do
# Only /local/file/ is registered as an ingest directory for these examples.
allow(Hyrax.config).to receive(:registered_ingest_dirs).and_return(["/local/file/"])
end

it "attaches files" do
expect(IngestLocalFileJob).to receive(:perform_later).with(Hyrax::FileSet, "/local/file/here.txt", user)
expect(actor.create(environment)).to be true
end

# A path outside the registered directory: no job is queued and create
# returns false.
context "with files from non-registered directories" do
let(:file) { "file:///local/otherdir/test.txt" }

it "doesn't attach files" do
expect(actor).to receive(:validate_remote_url).and_call_original
expect(IngestLocalFileJob).not_to receive(:perform_later)
expect(actor.create(environment)).to be false
end
end

# The job must receive the path with literal spaces.
context "with spaces" do
let(:file) { "file:///local/file/ pigs .txt" }

it "attaches files" do
expect(IngestLocalFileJob).to receive(:perform_later).with(Hyrax::FileSet, "/local/file/ pigs .txt", user)
expect(actor.create(environment)).to be true
end
end
end

# validate_remote_url is invoked through `send`, with /test/ and
# /local/file/ stubbed as the registered ingest directories.
describe "#validate_remote_url" do
before do
allow(Hyrax.config).to receive(:registered_ingest_dirs).and_return(['/test/', '/local/file/'])
end

it "accepts file: urls in registered directories" do
expect(actor.send(:validate_remote_url, URI('file:///local/file/test.txt'))).to be true
expect(actor.send(:validate_remote_url, URI('file:///local/file/subdirectory/test.txt'))).to be true
expect(actor.send(:validate_remote_url, URI('file:///test/test.txt'))).to be true
end

# Covers an unregistered directory, a `..` escape out of a registered
# directory, and the bare registered directory itself.
it "rejects file: urls outside registered directories" do
expect(actor.send(:validate_remote_url, URI('file:///tmp/test.txt'))).to be false
expect(actor.send(:validate_remote_url, URI('file:///test/../tmp/test.txt'))).to be false
expect(actor.send(:validate_remote_url, URI('file:///test/'))).to be false
end

it "accepts other types of urls" do
expect(actor.send(:validate_remote_url, URI('https://example.com/test.txt'))).to be true
end
end
end
end
Loading

0 comments on commit 10f92a2

Please sign in to comment.