Skip to content

Commit

Permalink
index Hyrax::FileSet in the valkyrie index
Browse files Browse the repository at this point in the history
This adds a basic solr doc for Hyrax::FileSet, which is a Valkyrie::Resource.

Known remaining issues:
* There are a few fields in the AF solr doc that are not in the FileMetadata resource.  These are marked as TODOs in the resource indexer to be evaluated to see if there is more work to be done on this.
* There are several jobs that are expecting an AF FileSet.  They will need to be addressed separately.  They include, but may not be limited to, CharacterizeJob, FileSetAttachedEventJob, ContentUpdateEventJob.
  • Loading branch information
elrayle committed Mar 12, 2021
1 parent d43b903 commit ab248f5
Show file tree
Hide file tree
Showing 13 changed files with 586 additions and 8 deletions.
27 changes: 26 additions & 1 deletion app/actors/hyrax/actors/create_with_remote_files_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,17 @@ def attach_files(env, remote_files)
true
end

# Creates a file set from a URL, dispatching on the kind of work being
# processed: Valkyrie resources go through the Valkyrie implementation,
# everything else through the ActiveFedora implementation.
#
# @param env the actor environment; only `env.curation_concern` is read here
# @param uri the source location of the file
# @param file_name the label to give the new file set
# @param auth_header [Hash] headers forwarded to the implementation; the
#   default of `{}` keeps three-argument callers working
# @return whatever the selected implementation returns
def create_file_from_url(env, uri, file_name, auth_header = {})
  if env.curation_concern.is_a?(Valkyrie::Resource)
    create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  else
    create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
  end
end

# Generic utility for creating FileSet from a URL
# Used to import files using URLs from a file picker like browse_everything
def create_file_from_url(env, uri, file_name, auth_header = {})
def create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
import_url = URI.decode_www_form_component(uri.to_s)
::FileSet.new(import_url: import_url, label: file_name) do |fs|
actor = Hyrax::Actors::FileSetActor.new(fs, env.user)
Expand All @@ -80,6 +88,23 @@ def create_file_from_url(env, uri, file_name, auth_header = {})
end
end

# Generic utility for creating a Hyrax::FileSet from a URL.
# Used to import files using URLs from a file picker like browse_everything.
#
# Persists a new Hyrax::FileSet for the decoded URL, runs it through a
# Valkyrie-enabled FileSetActor (metadata, then attachment to the work), and
# finally enqueues the job that fetches the content.
#
# @param env the actor environment (`curation_concern` and `user` are read)
# @param uri the source location; `file` scheme URIs are ingested locally
# @param file_name the label for the new file set
# @param auth_header headers passed along to ImportUrlJob
# @return the result of the `perform_later` call that was made
def create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  decoded_url = URI.decode_www_form_component(uri.to_s)
  fs = Hyrax.persister.save(resource: Hyrax::FileSet.new(import_url: decoded_url, label: file_name))
  work = env.curation_concern
  actor = Hyrax::Actors::FileSetActor.new(fs, env.user, use_valkyrie: true)
  actor.create_metadata(visibility: work.visibility)
  actor.attach_to_work(work)

  # Local files: turn any %20 into spaces before handing the path to the job.
  return IngestLocalFileJob.perform_later(fs, CGI.unescape(uri.path), env.user) if uri.scheme == 'file'

  ImportUrlJob.perform_later(fs, operation_for(user: actor.user), auth_header)
end

def operation_for(user:)
Hyrax::Operation.create!(user: user,
operation_type: "Attach Remote File")
Expand Down
6 changes: 4 additions & 2 deletions app/actors/hyrax/actors/file_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class FileActor
# @param [FileSet] file_set the parent FileSet
# @param [Symbol, #to_sym] relation the type/use for the file
# @param [User] user the user to record as the Agent acting upon the file
def initialize(file_set, relation, user, use_valkyrie: false)
def initialize(file_set, relation, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@relation = normalize_relation(relation)
Expand Down Expand Up @@ -75,7 +75,7 @@ def perform_ingest_file_through_active_fedora(io)
CharacterizeJob.perform_later(file_set, repository_file.id, pathhint(io))
end

def perform_ingest_file_through_valkyrie(io)
def perform_ingest_file_through_valkyrie(io) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
file =
begin
Hyrax.storage_adapter.upload(resource: file_set, file: io, original_filename: io.original_name, use: relation)
Expand All @@ -87,7 +87,9 @@ def perform_ingest_file_through_valkyrie(io)
create_version(file_metadata, user)
id = file_metadata.file_identifier
file_set.file_ids << id
file_set.original_file_id = id
Hyrax.persister.save(resource: file_set)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
CharacterizeJob.perform_later(file_set, id.to_s, pathhint(io))
file_metadata
end
Expand Down
4 changes: 2 additions & 2 deletions app/actors/hyrax/actors/file_set_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class FileSetActor # rubocop:disable Metrics/ClassLength
include Lockable
attr_reader :file_set, :user, :attributes, :use_valkyrie

def initialize(file_set, user, use_valkyrie: false)
def initialize(file_set, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@user = user
Expand Down Expand Up @@ -87,7 +87,7 @@ def attach_to_work(work, file_set_params = {})
def attach_to_valkyrie_work(work, file_set_params)
work = Hyrax.query_service.find_by(id: work.id) unless work.new_record
file_set.visibility = work.visibility unless assign_visibility?(file_set_params)
Hyrax.persister.save(resource: file_set)
@file_set = Hyrax.persister.save(resource: file_set)
work.member_ids << file_set.id
work.representative_id = file_set.id if work.representative_id.blank?
work.thumbnail_id = file_set.id if work.thumbnail_id.blank?
Expand Down
116 changes: 116 additions & 0 deletions app/indexers/hyrax/valkyrie_file_set_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# frozen_string_literal: true

module Hyrax
  ##
  # Indexes Hyrax::FileSet objects
  #
  # Starts from the solr document built by the superclass and the included
  # indexer modules, then adds the ids stored on the file set and, when a file
  # metadata resource can be found for it, attributes read from that resource.
  class ValkyrieFileSetIndexer < Hyrax::ValkyrieIndexer
    include Hyrax::ResourceIndexer
    include Hyrax::PermissionIndexer
    include Hyrax::VisibilityIndexer
    include Hyrax::Indexer(:core_metadata)
    include Hyrax::Indexer(:basic_metadata)

    # include Hyrax::IndexesThumbnails # TODO: Is there a Valkyrie version of a thumbnail indexer?

    # File metadata attributes that are copied, when present, to the solr
    # field named "<attribute>_tesim". The list order is the order in which
    # the fields are written. Rows, top to bottom: file type; attributes set
    # by fits; audio/video; image/video; audio; documents (two rows);
    # images (two rows); video.
    FILE_METADATA_TESIM_ATTRIBUTES = %i[
      type
      format_label size well_formed valid fits_version exif_version checksum
      frame_rate bit_rate duration sample_rate
      height width
      bit_depth channels data_format offset
      file_title page_count language word_count character_count line_count character_set
      markup_basis paragraph_count markup_language table_count graphics_count
      byte_order compression color_space profile_name profile_version orientation color_map
      image_producer capture_device scanning_software gps_timestamp latitude longitude
      aspect_ratio
    ].freeze
    private_constant :FILE_METADATA_TESIM_ATTRIBUTES

    ##
    # @return the solr document from +super+, extended with file set fields
    def to_solr
      super.tap do |solr_doc|
        index_file_set_ids(solr_doc)

        # Metadata from the original file is optional; without a file only
        # the file set's own ids are indexed.
        file_metadata = original_file
        index_original_file(solr_doc, file_metadata) if file_metadata
      end
    end

    private

    # Metadata from the FileSet itself. The single-valued ids are indexed as
    # strings unconditionally, so a missing id becomes an empty string.
    def index_file_set_ids(solr_doc)
      solr_doc['file_ids_ssim'] = resource.file_ids&.map(&:to_s)
      solr_doc['original_file_id_ssi'] = resource.original_file_id.to_s
      solr_doc['thumbnail_id_ssi'] = resource.thumbnail_id.to_s
      solr_doc['extracted_text_id_ssi'] = resource.extracted_text_id.to_s
    end

    # Adds every field derived from the original file's metadata.
    def index_original_file(solr_doc, file_metadata)
      index_file_identity(solr_doc, file_metadata)
      index_file_format(solr_doc, file_metadata)
      solr_doc['file_size_lts'] = file_metadata.size[0]
      index_tesim_attributes(solr_doc, file_metadata)
    end

    # Alternate ids, original filename and mime type; each is written only
    # when present.
    def index_file_identity(solr_doc, file_metadata)
      alternate_ids = file_metadata.alternate_ids
      solr_doc['original_file_alternate_ids_tesim'] = alternate_ids.map(&:to_s) if alternate_ids.present?

      filename = file_metadata.original_filename
      if filename.present?
        solr_doc['original_filename_tesi'] = filename
        solr_doc['original_filename_ssi'] = filename
      end

      mime_type = file_metadata.mime_type
      return unless mime_type.present?

      solr_doc['mime_type_tesi'] = mime_type
      solr_doc['mime_type_ssi'] = mime_type
    end

    # The file format label, computed once and written to both format
    # fields. Both fields are always set, possibly to nil.
    def index_file_format(solr_doc, file_metadata)
      label = file_format(file_metadata)
      solr_doc['file_format_tesim'] = label
      solr_doc['file_format_sim'] = label
    end

    # Copies each listed attribute to its "<attribute>_tesim" field when the
    # attribute has a value.
    def index_tesim_attributes(solr_doc, file_metadata)
      FILE_METADATA_TESIM_ATTRIBUTES.each do |attribute|
        value = file_metadata.public_send(attribute)
        solr_doc["#{attribute}_tesim"] = value if value.present?
      end
    end

    # Finds the file metadata to index: the file set's original file, or the
    # first of its files when the original file lookup raises
    # ObjectNotFoundError.
    def original_file
      Hyrax.custom_queries.find_original_file(file_set: resource)
    rescue Valkyrie::Persistence::ObjectNotFoundError
      Hyrax.custom_queries.find_files(file_set: resource).first
    end

    # Builds the value for the file format fields from the mime type subtype
    # and the format label, using whichever of the two is present.
    #
    # @return [String, nil] nil when neither value is present.
    #   NOTE(review): when only the format label is present it is returned
    #   as-is (it is joined like a collection in the first branch), so the
    #   return type differs between branches - confirm this is intended.
    def file_format(file)
      if file.mime_type.present? && file.format_label.present?
        "#{file.mime_type.split('/').last} (#{file.format_label.join(', ')})"
      elsif file.mime_type.present?
        file.mime_type.split('/').last
      elsif file.format_label.present?
        file.format_label
      end
    end
  end
end
7 changes: 6 additions & 1 deletion app/indexers/hyrax/valkyrie_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,12 @@ class << self
# @example
# ValkyrieIndexer.for(resource: Book.new) # => #<BookIndexer ...>
def for(resource:)
  # Callers receive an indexer *instance* in every branch. Hyrax::FileSet
  # resources always use the dedicated file set indexer; all other resources
  # use the indexer class resolved by `indexer_class_for`.
  case resource
  when Hyrax::FileSet
    Hyrax::ValkyrieFileSetIndexer.new(resource: resource)
  else
    indexer_class_for(resource).new(resource: resource)
  end
end

private
Expand Down
1 change: 1 addition & 0 deletions app/models/hyrax/file_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module Hyrax
# @see https://wiki.duraspace.org/display/samvera/Hydra%3A%3AWorks+Shared+Modeling
class FileSet < Hyrax::Resource
include Hyrax::Schema(:core_metadata)
include Hyrax::Schema(:basic_metadata)

attribute :file_ids, Valkyrie::Types::Array.of(Valkyrie::Types::ID) # id for FileMetadata resources
attribute :original_file_id, Valkyrie::Types::ID # id for FileMetadata resource
Expand Down
2 changes: 2 additions & 0 deletions app/models/hyrax/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class Work < Hyrax::Resource
attribute :on_behalf_of, Valkyrie::Types::String
attribute :proxy_depositor, Valkyrie::Types::String
attribute :state, Valkyrie::Types::URI.default(Hyrax::ResourceStatus::ACTIVE)
attribute :representative_id, Valkyrie::Types::ID
attribute :thumbnail_id, Valkyrie::Types::ID

##
# @return [Boolean] true
Expand Down
2 changes: 1 addition & 1 deletion app/models/job_io_wrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def size
nil # unable to determine
end

def file_set(use_valkyrie: false)
# Looks up the file set identified by `file_set_id`.
#
# @param use_valkyrie when truthy the lookup goes through
#   `Hyrax.query_service`; otherwise through `FileSet.find`. Defaults to
#   `Hyrax.config.query_index_from_valkyrie`.
# @return the object returned by the selected lookup
def file_set(use_valkyrie: Hyrax.config.query_index_from_valkyrie)
  if use_valkyrie
    Hyrax.query_service.find_by(id: Valkyrie::ID.new(file_set_id))
  else
    FileSet.find(file_set_id)
  end
end
Expand Down
3 changes: 2 additions & 1 deletion app/services/hyrax/work_uploads_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def initialize(work:, persister: Hyrax.persister)
# @note we immediately and silently discard uploads with an existing
# file_set_uri, in a half-considered attempt at supporting idempotency
# (for job retries). this is for legacy/AttachFilesToWorkJob
# compatibility, but could stand for a roubst reimplementation.
# compatibility, but could stand for a robust reimplementation.
#
# @param [Enumerable<Hyrax::UploadedFile>] files files to add
#
Expand Down Expand Up @@ -106,6 +106,7 @@ def make_file_set_and_ingest(file)
Hyrax::AccessControlList.copy_permissions(source: target_permissions, target: file_set)
append_to_work(file_set)
IngestJob.perform_later(wrap_file(file, file_set))
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: file.user)
{ file_set: file_set, user: file.user }
end

Expand Down
2 changes: 2 additions & 0 deletions config/initializers/listeners.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@

# When the :after_create_fileset callback runs, publish that the file set was
# attached and also publish a metadata-updated event for the same file set.
# NOTE(review): the second event is presumably what lets index listeners pick
# up the new file set - confirm against the subscribers of this topic.
Hyrax.config.callback.set(:after_create_fileset, warn: false) do |file_set, user|
Hyrax.publisher.publish('file.set.attached', file_set: file_set, user: user)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

# When the :after_revert_content callback runs, publish that the file set was
# restored to the given revision and also publish a metadata-updated event
# for the same file set.
Hyrax.config.callback.set(:after_revert_content, warn: false) do |file_set, user, revision|
Hyrax.publisher.publish('file.set.restored', file_set: file_set, user: user, revision: revision)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

Hyrax.config.callback.set(:after_update_metadata, warn: false) do |curation_concern, user|
Expand Down
110 changes: 110 additions & 0 deletions spec/actors/hyrax/actors/create_with_remote_files_actor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,114 @@
expect(actor.send(:validate_remote_url, URI('https://example.com/test.txt'))).to be true
end
end

# Runs the remote-file examples against a work created as a Valkyrie
# resource, expecting the enqueued jobs to receive a Hyrax::FileSet.
# NOTE(review): url1, url2, file, actor, environment and user are not defined
# inside this context - presumably they come from the enclosing describe
# block; confirm.
context 'when work is a valkyrie resource' do
let(:work) { valkyrie_create(:monograph) }

# Two remote URLs without auth headers: one ImportUrlJob per file.
context "with source uris that are remote" do
let(:remote_files) do
[{ url: url1,
expires: "2014-03-31T20:37:36.214Z",
file_name: "filepicker-demo.txt.txt" },
{ url: url2,
expires: "2014-03-31T20:37:36.731Z",
file_name: "Getting+Started.pdf" }]
end

it "attaches files" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, {}).twice
expect(actor.create(environment)).to be true
end
end

# The import_url stored on the new file set must keep its percent-encoding.
context "with source URIs that are remote and contain encoded parameters" do
let(:url1) { "https://dl.dropbox.com/fake/file?param1=%28example%29&param2=%5Bexample2%5D" }

before do
# Spy on the constructor while still building real file sets.
allow(Hyrax::FileSet).to receive(:new).and_call_original
end

it "preserves the encoded parameters in the URIs" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, {}).twice
expect(actor.create(environment)).to be true
expect(Hyrax::FileSet).to have_received(:new).with(import_url: "https://dl.dropbox.com/fake/file?param1=%28example%29&param2=%5Bexample2%5D", label: "filepicker-demo.txt.txt")
end
end

# Auth headers supplied per file must be handed through to ImportUrlJob.
context "with source uris that are remote bearing auth headers" do
let(:remote_files) do
[{ url: url1,
expires: "2014-03-31T20:37:36.214Z",
file_name: "filepicker-demo.txt.txt",
auth_header: { 'Authorization' => 'Bearer access-token' } },
{ url: url2,
expires: "2014-03-31T20:37:36.731Z",
file_name: "Getting+Started.pdf",
auth_header: { 'Authorization' => 'Bearer access-token' } }]
end

it "attaches files" do
expect(ImportUrlJob).to receive(:perform_later).with(Hyrax::FileSet, Hyrax::Operation, 'Authorization' => 'Bearer access-token').twice
expect(actor.create(environment)).to be true
end
end

# file: URIs are ingested with IngestLocalFileJob, and only from registered
# ingest directories.
context "with source uris that are local files" do
let(:remote_files) do
[{ url: file,
expires: "2014-03-31T20:37:36.214Z",
file_name: "here.txt" }]
end

before do
allow(Hyrax.config).to receive(:registered_ingest_dirs).and_return(["/local/file/"])
end

it "attaches files" do
expect(IngestLocalFileJob).to receive(:perform_later).with(Hyrax::FileSet, "/local/file/here.txt", user)
expect(actor.create(environment)).to be true
end

context "with files from non-registered directories" do
let(:file) { "file:///local/otherdir/test.txt" }

it "doesn't attach files" do
expect(actor).to receive(:validate_remote_url).and_call_original
expect(IngestLocalFileJob).not_to receive(:perform_later)
expect(actor.create(environment)).to be false
end
end

# Spaces in the file URI must reach the job as literal spaces in the path.
context "with spaces" do
let(:file) { "file:///local/file/ pigs .txt" }

it "attaches files" do
expect(IngestLocalFileJob).to receive(:perform_later).with(Hyrax::FileSet, "/local/file/ pigs .txt", user)
expect(actor.create(environment)).to be true
end
end
end

# NOTE(review): these examples only call validate_remote_url with URIs and do
# not use the Valkyrie work; they appear to repeat examples that exist
# outside this context - confirm whether the duplication is intentional.
describe "#validate_remote_url" do
before do
allow(Hyrax.config).to receive(:registered_ingest_dirs).and_return(['/test/', '/local/file/'])
end

it "accepts file: urls in registered directories" do
expect(actor.send(:validate_remote_url, URI('file:///local/file/test.txt'))).to be true
expect(actor.send(:validate_remote_url, URI('file:///local/file/subdirectory/test.txt'))).to be true
expect(actor.send(:validate_remote_url, URI('file:///test/test.txt'))).to be true
end

it "rejects file: urls outside registered directories" do
expect(actor.send(:validate_remote_url, URI('file:///tmp/test.txt'))).to be false
expect(actor.send(:validate_remote_url, URI('file:///test/../tmp/test.txt'))).to be false
expect(actor.send(:validate_remote_url, URI('file:///test/'))).to be false
end

it "accepts other types of urls" do
expect(actor.send(:validate_remote_url, URI('https://example.com/test.txt'))).to be true
end
end
end
end
Loading

0 comments on commit ab248f5

Please sign in to comment.