Skip to content

Commit

Permalink
index Hyrax::FileSet in the valkyrie index
Browse files Browse the repository at this point in the history
This adds a basic Solr doc for Hyrax::FileSet, which is a Valkyrie::Resource.

Known remaining issues:
* There are a few fields in the AF solr doc that are not in the FileMetadata resource.  These are marked as TODOs in the resource indexer to be evaluated to see if there is more work to be done on this.
* There are several jobs that are expecting an AF FileSet.  They will need to be addressed separately.  They include, but may not be limited to, CharacterizationJob, FileSetAttachedEventJob, ContentUpdateEventJob.
  • Loading branch information
elrayle committed Mar 12, 2021
1 parent d43b903 commit e2bb038
Show file tree
Hide file tree
Showing 15 changed files with 709 additions and 138 deletions.
27 changes: 26 additions & 1 deletion app/actors/hyrax/actors/create_with_remote_files_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,17 @@ def attach_files(env, remote_files)
true
end

# Creates a file set from a URL, choosing the persistence path from the type
# of the work being acted on: Valkyrie resources go through
# #create_file_from_url_through_valkyrie, everything else through
# #create_file_from_url_through_active_fedora.
#
# @param env the actor environment; its +curation_concern+ selects the path
# @param uri the location of the remote file
# @param file_name the label to give the new file set
# @param auth_header [Hash] headers forwarded to the import; defaults to an
#   empty hash so callers that pass only three arguments keep working, as
#   they did before this method became a dispatcher
def create_file_from_url(env, uri, file_name, auth_header = {})
  if env.curation_concern.is_a?(Valkyrie::Resource)
    create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  else
    create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
  end
end

# Generic utility for creating FileSet from a URL
# Used to import files using URLs from a file picker like browse_everything
def create_file_from_url(env, uri, file_name, auth_header = {})
def create_file_from_url_through_active_fedora(env, uri, file_name, auth_header)
import_url = URI.decode_www_form_component(uri.to_s)
::FileSet.new(import_url: import_url, label: file_name) do |fs|
actor = Hyrax::Actors::FileSetActor.new(fs, env.user)
Expand All @@ -80,6 +88,23 @@ def create_file_from_url(env, uri, file_name, auth_header = {})
end
end

# Generic utility for creating a Hyrax::FileSet from a URL.
# Used to import files using URLs from a file picker like browse_everything.
#
# Saves a new Hyrax::FileSet labelled +file_name+, gives it the work's
# visibility, attaches it to the work, and then enqueues either a local-file
# ingest (for +file:+ URIs) or a URL import (for everything else).
#
# @param env the actor environment; supplies +curation_concern+ and +user+
# @param uri the location of the file; must respond to +scheme+ and +path+
# @param file_name the label for the new file set
# @param auth_header headers handed to ImportUrlJob for remote imports
def create_file_from_url_through_valkyrie(env, uri, file_name, auth_header)
  decoded_url = URI.decode_www_form_component(uri.to_s)
  unsaved_file_set = Hyrax::FileSet.new(import_url: decoded_url, label: file_name)
  saved_file_set = Hyrax.persister.save(resource: unsaved_file_set)

  file_set_actor = Hyrax::Actors::FileSetActor.new(saved_file_set, env.user, use_valkyrie: true)
  file_set_actor.create_metadata(visibility: env.curation_concern.visibility)
  file_set_actor.attach_to_work(env.curation_concern)

  unless uri.scheme == 'file'
    return ImportUrlJob.perform_later(saved_file_set, operation_for(user: file_set_actor.user), auth_header)
  end

  # Turn any %20 into spaces.
  local_path = CGI.unescape(uri.path)
  IngestLocalFileJob.perform_later(saved_file_set, local_path, env.user)
end

def operation_for(user:)
Hyrax::Operation.create!(user: user,
operation_type: "Attach Remote File")
Expand Down
6 changes: 4 additions & 2 deletions app/actors/hyrax/actors/file_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class FileActor
# @param [FileSet] file_set the parent FileSet
# @param [Symbol, #to_sym] relation the type/use for the file
# @param [User] user the user to record as the Agent acting upon the file
def initialize(file_set, relation, user, use_valkyrie: false)
def initialize(file_set, relation, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@relation = normalize_relation(relation)
Expand Down Expand Up @@ -75,7 +75,7 @@ def perform_ingest_file_through_active_fedora(io)
CharacterizeJob.perform_later(file_set, repository_file.id, pathhint(io))
end

def perform_ingest_file_through_valkyrie(io)
def perform_ingest_file_through_valkyrie(io) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
file =
begin
Hyrax.storage_adapter.upload(resource: file_set, file: io, original_filename: io.original_name, use: relation)
Expand All @@ -87,7 +87,9 @@ def perform_ingest_file_through_valkyrie(io)
create_version(file_metadata, user)
id = file_metadata.file_identifier
file_set.file_ids << id
file_set.original_file_id = id
Hyrax.persister.save(resource: file_set)
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
CharacterizeJob.perform_later(file_set, id.to_s, pathhint(io))
file_metadata
end
Expand Down
4 changes: 2 additions & 2 deletions app/actors/hyrax/actors/file_set_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class FileSetActor # rubocop:disable Metrics/ClassLength
include Lockable
attr_reader :file_set, :user, :attributes, :use_valkyrie

def initialize(file_set, user, use_valkyrie: false)
def initialize(file_set, user, use_valkyrie: Hyrax.config.query_index_from_valkyrie)
@use_valkyrie = use_valkyrie
@file_set = file_set
@user = user
Expand Down Expand Up @@ -87,7 +87,7 @@ def attach_to_work(work, file_set_params = {})
def attach_to_valkyrie_work(work, file_set_params)
work = Hyrax.query_service.find_by(id: work.id) unless work.new_record
file_set.visibility = work.visibility unless assign_visibility?(file_set_params)
Hyrax.persister.save(resource: file_set)
@file_set = Hyrax.persister.save(resource: file_set)
work.member_ids << file_set.id
work.representative_id = file_set.id if work.representative_id.blank?
work.thumbnail_id = file_set.id if work.thumbnail_id.blank?
Expand Down
58 changes: 58 additions & 0 deletions app/indexers/hyrax/active_fedora_file_set_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# frozen_string_literal: true
module Hyrax
  ##
  # Builds the Solr document for an ActiveFedora-backed file set, adding the
  # label, format, size, dimensions, visibility and characterization values
  # read from +object+ to the document produced by the parent indexing service.
  class ActiveFedoraFileSetIndexer < ActiveFedora::IndexingService
    include Hyrax::IndexesThumbnails
    include Hyrax::IndexesBasicMetadata
    STORED_LONG = ActiveFedora::Indexing::Descriptor.new(:long, :stored)

    # @return the document returned by +super+, with the file set fields added
    def generate_solr_document # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      super.tap do |doc|
        doc['hasRelatedMediaFragment_ssim'] = object.representative_id
        doc['hasRelatedImage_ssim'] = object.thumbnail_id
        # The label holds the actual file name; users cannot edit it.
        doc['label_tesim'] = object.label
        doc['label_ssi'] = object.label
        doc['file_format_tesim'] = file_format
        doc['file_format_sim'] = file_format
        doc['file_size_lts'] = object.file_size[0]
        doc['all_text_timv'] = object.extracted_text.content if object.extracted_text.present?
        doc['height_is'] = Integer(object.height.first) if object.height.present?
        doc['width_is'] = Integer(object.width.first) if object.width.present?
        doc['visibility_ssi'] = object.visibility
        doc['mime_type_ssi'] = object.mime_type
        # Index the Fedora-generated SHA1 digest to create a linkage between
        # files on disk (in fcrepo.binary-store-path) and objects in the repository.
        doc['digest_ssim'] = [digest_from_content]
        doc['page_count_tesim'] = object.page_count
        doc['file_title_tesim'] = object.file_title
        doc['duration_tesim'] = object.duration
        doc['sample_rate_tesim'] = object.sample_rate
        doc['original_checksum_tesim'] = object.original_checksum
        doc['alpha_channels_ssi'] = object.alpha_channels
        doc['original_file_id_ssi'] = original_file_id
      end
    end

    private

    # @return [String, nil] the first digest of the original file as a string,
    #   or nil when the file set has no original file
    def digest_from_content
      file = object.original_file
      file.digest.first.to_s if file
    end

    # @return the versioned id of the original file as reported by
    #   Hyrax::VersioningService, or nil when there is no original file
    def original_file_id
      file = object.original_file
      Hyrax::VersioningService.versioned_file_id(file) if file
    end

    # Describes the file format from the MIME subtype and/or format labels.
    #
    # @return "subtype (label, label)" when both are present, the bare subtype
    #   when only the MIME type is present, the format label value itself when
    #   only labels are present, and nil when neither is present
    def file_format
      mime_type = object.mime_type
      labels = object.format_label
      labelled = labels.present?

      if mime_type.present?
        subtype = mime_type.split('/').last
        labelled ? "#{subtype} (#{labels.join(', ')})" : subtype
      elsif labelled
        labels
      end
    end
  end
end
146 changes: 102 additions & 44 deletions app/indexers/hyrax/file_set_indexer.rb
Original file line number Diff line number Diff line change
@@ -1,57 +1,115 @@
# frozen_string_literal: true

module Hyrax
class FileSetIndexer < ActiveFedora::IndexingService
include Hyrax::IndexesThumbnails
include Hyrax::IndexesBasicMetadata
STORED_LONG = ActiveFedora::Indexing::Descriptor.new(:long, :stored)

def generate_solr_document # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
super.tap do |solr_doc|
solr_doc['hasRelatedMediaFragment_ssim'] = object.representative_id
solr_doc['hasRelatedImage_ssim'] = object.thumbnail_id
##
# Indexes Hyrax::FileSet objects
class FileSetIndexer < Hyrax::ValkyrieIndexer
include Hyrax::ResourceIndexer
include Hyrax::PermissionIndexer
include Hyrax::VisibilityIndexer
include Hyrax::Indexer(:core_metadata)
include Hyrax::Indexer(:basic_metadata)

# include Hyrax::IndexesThumbnails # TODO: Is there a Valkyrie version of a thumbnail indexer?

def to_solr # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
super.tap do |solr_doc| # rubocop:disable Metrics/BlockLength
# Metadata from the FileSet
solr_doc['file_ids_ssim'] = resource.file_ids&.map(&:to_s)
solr_doc['original_file_id_ssi'] = resource.original_file_id.to_s
solr_doc['thumbnail_id_ssi'] = resource.thumbnail_id.to_s
solr_doc['extracted_text_id_ssi'] = resource.extracted_text_id.to_s

# Add in metadata from the original file.
file_metadata = original_file
return solr_doc unless file_metadata

# Label is the actual file name. It's not editable by the user.
solr_doc['label_tesim'] = object.label
solr_doc['label_ssi'] = object.label
solr_doc['file_format_tesim'] = file_format
solr_doc['file_format_sim'] = file_format
solr_doc['file_size_lts'] = object.file_size[0]
solr_doc['all_text_timv'] = object.extracted_text.content if object.extracted_text.present?
solr_doc['height_is'] = Integer(object.height.first) if object.height.present?
solr_doc['width_is'] = Integer(object.width.first) if object.width.present?
solr_doc['visibility_ssi'] = object.visibility
solr_doc['mime_type_ssi'] = object.mime_type
# Index the Fedora-generated SHA1 digest to create a linkage between
# files on disk (in fcrepo.binary-store-path) and objects in the repository.
solr_doc['digest_ssim'] = [digest_from_content]
solr_doc['page_count_tesim'] = object.page_count
solr_doc['file_title_tesim'] = object.file_title
solr_doc['duration_tesim'] = object.duration
solr_doc['sample_rate_tesim'] = object.sample_rate
solr_doc['original_checksum_tesim'] = object.original_checksum
solr_doc['alpha_channels_ssi'] = object.alpha_channels
solr_doc['original_file_id_ssi'] = original_file_id
solr_doc['original_file_alternate_ids_tesim'] = file_metadata.alternate_ids if file_metadata.alternate_ids.present?

solr_doc['original_filename_tesi'] = file_metadata.original_filename if file_metadata.original_filename.present?
solr_doc['original_filename_ssi'] = file_metadata.original_filename if file_metadata.original_filename.present?
solr_doc['mime_type_tesi'] = file_metadata.mime_type if file_metadata.mime_type.present?
solr_doc['mime_type_ssi'] = file_metadata.mime_type if file_metadata.mime_type.present?

solr_doc['file_format_tesim'] = file_format(file_metadata)
solr_doc['file_format_sim'] = file_format(file_metadata)
solr_doc['file_size_lts'] = file_metadata.size[0]
solr_doc['type_tesim'] = file_metadata.type if file_metadata.type.present?

# attributes set by fits
solr_doc['format_label_tesim'] = file_metadata.format_label if file_metadata.format_label.present?
solr_doc['size_tesim'] = file_metadata.size if file_metadata.size.present?
solr_doc['well_formed_tesim'] = file_metadata.well_formed if file_metadata.well_formed.present?
solr_doc['valid_tesim'] = file_metadata.valid if file_metadata.valid.present?
solr_doc['fits_version_tesim'] = file_metadata.fits_version if file_metadata.fits_version.present?
solr_doc['exif_version_tesim'] = file_metadata.exif_version if file_metadata.exif_version.present?
solr_doc['checksum_tesim'] = file_metadata.checksum if file_metadata.checksum.present?

# shared attributes across multiple file types
solr_doc['frame_rate_tesim'] = file_metadata.frame_rate if file_metadata.frame_rate.present? # audio, video
solr_doc['bit_rate_tesim'] = file_metadata.bit_rate if file_metadata.bit_rate.present? # audio, video
solr_doc['duration_tesim'] = file_metadata.duration if file_metadata.duration.present? # audio, video
solr_doc['sample_rate_tesim'] = file_metadata.sample_rate if file_metadata.sample_rate.present? # audio, video

solr_doc['height_tesim'] = file_metadata.height if file_metadata.height.present? # image, video
solr_doc['width_tesim'] = file_metadata.width if file_metadata.width.present? # image, video

# attributes set by fits for audio files
solr_doc['bit_depth_tesim'] = file_metadata.bit_depth if file_metadata.bit_depth.present?
solr_doc['channels_tesim'] = file_metadata.channels if file_metadata.channels.present?
solr_doc['data_format_tesim'] = file_metadata.data_format if file_metadata.data_format.present?
solr_doc['offset_tesim'] = file_metadata.offset if file_metadata.offset.present?

# attributes set by fits for documents
solr_doc['file_title_tesim'] = file_metadata.file_title if file_metadata.file_title.present?
solr_doc['page_count_tesim'] = file_metadata.page_count if file_metadata.page_count.present?
solr_doc['language_tesim'] = file_metadata.language if file_metadata.language.present?
solr_doc['word_count_tesim'] = file_metadata.word_count if file_metadata.word_count.present?
solr_doc['character_count_tesim'] = file_metadata.character_count if file_metadata.character_count.present?
solr_doc['line_count_tesim'] = file_metadata.line_count if file_metadata.line_count.present?
solr_doc['character_set_tesim'] = file_metadata.character_set if file_metadata.character_set.present?
solr_doc['markup_basis_tesim'] = file_metadata.markup_basis if file_metadata.markup_basis.present?
solr_doc['paragraph_count_tesim'] = file_metadata.paragraph_count if file_metadata.paragraph_count.present?
solr_doc['markup_language_tesim'] = file_metadata.markup_language if file_metadata.markup_language.present?
solr_doc['table_count_tesim'] = file_metadata.table_count if file_metadata.table_count.present?
solr_doc['graphics_count_tesim'] = file_metadata.graphics_count if file_metadata.graphics_count.present?

# attributes set by fits for images
solr_doc['byte_order_tesim'] = file_metadata.byte_order if file_metadata.byte_order.present?
solr_doc['compression_tesim'] = file_metadata.compression if file_metadata.compression.present?
solr_doc['color_space_tesim'] = file_metadata.color_space if file_metadata.color_space.present?
solr_doc['profile_name_tesim'] = file_metadata.profile_name if file_metadata.profile_name.present?
solr_doc['profile_version_tesim'] = file_metadata.profile_version if file_metadata.profile_version.present?
solr_doc['orientation_tesim'] = file_metadata.orientation if file_metadata.orientation.present?
solr_doc['color_map_tesim'] = file_metadata.color_map if file_metadata.color_map.present?
solr_doc['image_producer_tesim'] = file_metadata.image_producer if file_metadata.image_producer.present?
solr_doc['capture_device_tesim'] = file_metadata.capture_device if file_metadata.capture_device.present?
solr_doc['scanning_software_tesim'] = file_metadata.scanning_software if file_metadata.scanning_software.present?
solr_doc['gps_timestamp_tesim'] = file_metadata.gps_timestamp if file_metadata.gps_timestamp.present?
solr_doc['latitude_tesim'] = file_metadata.latitude if file_metadata.latitude.present?
solr_doc['longitude_tesim'] = file_metadata.longitude if file_metadata.longitude.present?

# attributes set by fits for video
solr_doc['aspect_ratio_tesim'] = file_metadata.aspect_ratio if file_metadata.aspect_ratio.present?
end
end

private

def digest_from_content
return unless object.original_file
object.original_file.digest.first.to_s
end

def original_file_id
return unless object.original_file
Hyrax::VersioningService.versioned_file_id object.original_file
# Finds the file metadata whose attributes are indexed for this file set.
#
# Asks the custom queries for the file set's original file first; when that
# lookup raises Valkyrie::Persistence::ObjectNotFoundError, falls back to the
# first result of +find_files+ for the same file set.
# NOTE(review): presumably returns nil when the file set has no files at all —
# confirm that +find_files+ returns an empty collection in that case.
def original_file
Hyrax.custom_queries.find_original_file(file_set: resource)
rescue Valkyrie::Persistence::ObjectNotFoundError
Hyrax.custom_queries.find_files(file_set: resource).first
end

def file_format
if object.mime_type.present? && object.format_label.present?
"#{object.mime_type.split('/').last} (#{object.format_label.join(', ')})"
elsif object.mime_type.present?
object.mime_type.split('/').last
elsif object.format_label.present?
object.format_label
def file_format(file)
if file.mime_type.present? && file.format_label.present?
"#{file.mime_type.split('/').last} (#{file.format_label.join(', ')})"
elsif file.mime_type.present?
file.mime_type.split('/').last
elsif file.format_label.present?
file.format_label
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion app/models/concerns/hyrax/file_set/indexing.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module Indexing

included do
# the default indexing service
self.indexer = Hyrax::FileSetIndexer
self.indexer = Hyrax::ActiveFedoraFileSetIndexer
end
end
end
Expand Down
1 change: 1 addition & 0 deletions app/models/hyrax/file_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module Hyrax
# @see https://wiki.duraspace.org/display/samvera/Hydra%3A%3AWorks+Shared+Modeling
class FileSet < Hyrax::Resource
include Hyrax::Schema(:core_metadata)
include Hyrax::Schema(:basic_metadata)

attribute :file_ids, Valkyrie::Types::Array.of(Valkyrie::Types::ID) # id for FileMetadata resources
attribute :original_file_id, Valkyrie::Types::ID # id for FileMetadata resource
Expand Down
2 changes: 1 addition & 1 deletion app/models/job_io_wrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def size
nil # unable to determine
end

def file_set(use_valkyrie: false)
# Loads the file set identified by +file_set_id+.
#
# @param use_valkyrie when truthy the file set is looked up through
#   Hyrax.query_service; otherwise through FileSet.find. Defaults to
#   Hyrax.config.query_index_from_valkyrie.
def file_set(use_valkyrie: Hyrax.config.query_index_from_valkyrie)
  if use_valkyrie
    Hyrax.query_service.find_by(id: Valkyrie::ID.new(file_set_id))
  else
    FileSet.find(file_set_id)
  end
end
Expand Down
3 changes: 2 additions & 1 deletion app/services/hyrax/work_uploads_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def initialize(work:, persister: Hyrax.persister)
# @note we immediately and silently discard uploads with an existing
# file_set_uri, in a half-considered attempt at supporting idempotency
# (for job retries). this is for legacy/AttachFilesToWorkJob
# compatibility, but could stand for a roubst reimplementation.
# compatibility, but could stand for a robust reimplementation.
#
# @param [Enumerable<Hyrax::UploadedFile>] files files to add
#
Expand Down Expand Up @@ -106,6 +106,7 @@ def make_file_set_and_ingest(file)
Hyrax::AccessControlList.copy_permissions(source: target_permissions, target: file_set)
append_to_work(file_set)
IngestJob.perform_later(wrap_file(file, file_set))
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: file.user)
{ file_set: file_set, user: file.user }
end

Expand Down
2 changes: 2 additions & 0 deletions config/initializers/listeners.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@

# Registers the :after_create_fileset callback. When it runs, it publishes
# 'file.set.attached' followed by 'object.metadata.updated' for the file set
# on the Hyrax publisher.
Hyrax.config.callback.set(:after_create_fileset, warn: false) do |file_set, user|
Hyrax.publisher.publish('file.set.attached', file_set: file_set, user: user)
# NOTE(review): presumably this second event is what triggers reindexing of
# the file set — confirm against the listeners subscribed to it.
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

# Registers the :after_revert_content callback. When it runs, it publishes
# 'file.set.restored' (including the revision) followed by
# 'object.metadata.updated' for the file set on the Hyrax publisher.
Hyrax.config.callback.set(:after_revert_content, warn: false) do |file_set, user, revision|
Hyrax.publisher.publish('file.set.restored', file_set: file_set, user: user, revision: revision)
# NOTE(review): presumably this second event is what triggers reindexing of
# the file set — confirm against the listeners subscribed to it.
Hyrax.publisher.publish('object.metadata.updated', object: file_set, user: user)
end

Hyrax.config.callback.set(:after_update_metadata, warn: false) do |curation_concern, user|
Expand Down
Loading

0 comments on commit e2bb038

Please sign in to comment.