Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copy reprocessor script from GBH into Hyku #2166

Merged
merged 3 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
inherit_from:
- .rubocop_todo.yml
- .rubocop_fixme.yml

inherit_gem:
Expand Down
129 changes: 1 addition & 128 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,134 +6,7 @@
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 2
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: EmptyLineBetweenMethodDefs, EmptyLineBetweenClassDefs, EmptyLineBetweenModuleDefs, AllowAdjacentOneLineDefs, NumberOfEmptyLines.
Layout/EmptyLineBetweenDefs:
Exclude:
- 'app/jobs/bulkrax/import_file_set_job.rb'
- 'app/models/bulkrax/entry.rb'

# Offense count: 2
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: EnforcedStyle.
# SupportedStyles: empty_lines, no_empty_lines
Layout/EmptyLinesAroundBlockBody:
Exclude:
- 'spec/rails_helper.rb'

# Offense count: 3
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
# SupportedHashRocketStyles: key, separator, table
# SupportedColonStyles: key, separator, table
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
Layout/HashAlignment:
Exclude:
- 'app/parsers/bulkrax/csv_parser.rb'
- 'spec/models/bulkrax/rdf_entry_spec.rb'
- 'spec/models/bulkrax/xml_entry_spec.rb'

# Offense count: 1
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: Width, AllowedPatterns, IgnoredPatterns.
Layout/IndentationWidth:
Exclude:
- 'spec/rails_helper.rb'

# Offense count: 8
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, AllowedPatterns, IgnoredPatterns.
# URISchemes: http, https
Layout/LineLength:
Max: 301

# Offense count: 1
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: EnforcedStyle.
# SupportedStyles: symmetrical, new_line, same_line
Layout/MultilineMethodCallBraceLayout:
Exclude:
- 'app/parsers/bulkrax/csv_parser.rb'

# Offense count: 1
# This cop supports safe auto-correction (--auto-correct).
Layout/RescueEnsureAlignment:
Exclude:
- 'spec/rails_helper.rb'

# Offense count: 7
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: AllowInHeredoc.
Layout/TrailingWhitespace:
Exclude:
- 'app/models/bulkrax/csv_entry.rb'
- 'app/parsers/bulkrax/csv_parser.rb'
- 'spec/models/bulkrax/rdf_entry_spec.rb'
- 'spec/models/bulkrax/xml_entry_spec.rb'
- 'spec/rails_helper.rb'

# Offense count: 16
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
Metrics/AbcSize:
Max: 42

# Offense count: 4
# Configuration parameters: CountComments, CountAsOne.
Metrics/ClassLength:
Max: 201

# Offense count: 13
# Configuration parameters: IgnoredMethods.
Metrics/CyclomaticComplexity:
Max: 19

# Offense count: 32
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
Metrics/MethodLength:
Max: 26

# Offense count: 2
# Configuration parameters: CountComments, CountAsOne.
Metrics/ModuleLength:
Max: 131

# Offense count: 9
# Configuration parameters: IgnoredMethods.
Metrics/PerceivedComplexity:
Max: 19

# Offense count: 1
# Configuration parameters: Include.
# Include: app/models/**/*.rb
Rails/HasManyOrHasOneDependent:
Exclude:
- 'app/models/concerns/bulkrax/status_info.rb'

# Offense count: 2
# This cop supports safe auto-correction (--auto-correct).
# Configuration parameters: Keywords, RequireColon.
# Keywords: TODO, FIXME, OPTIMIZE, HACK, REVIEW, NOTE
Style/CommentAnnotation:
Exclude:
- 'app/models/bulkrax/xml_entry.rb'
- 'spec/models/bulkrax/oai_entry_spec.rb'

# Offense count: 2
# This cop supports safe auto-correction (--auto-correct).
Style/IfUnlessModifier:
Exclude:
- 'app/models/bulkrax/csv_entry.rb'
- 'lib/generators/bulkrax/templates/config/initializers/bulkrax.rb'

# Offense count: 1
# This cop supports safe auto-correction (--auto-correct).
Style/MultilineIfModifier:
Exclude:
- 'app/models/bulkrax/csv_entry.rb'

# Offense count: 1
# This cop supports safe auto-correction (--auto-correct).
Style/RedundantBegin:
Exclude:
- 'spec/rails_helper.rb'
Max: 21
202 changes: 202 additions & 0 deletions lib/reprocessor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# frozen_string_literal: true

require 'singleton'

# Resumable, batch-oriented helper for capturing object ids into a log file and
# then replaying that file through a lambda.
#
# Flow implemented by the methods below:
#   1. One of the capture_* methods writes ids, one per line, to
#      "#{log_dir}/ids.log".
#   2. #process_ids reads that file back (skipping +header_lines+ leading lines)
#      and calls a lambda for every line; failures are written to
#      "#{log_dir}/error.log" and do not stop the run.
#   3. The values named in SETTINGS can be persisted with .save and restored
#      with .load ("#{log_dir}/work_processor.json") so a run can be resumed.
#
# +current_location+ is the number of items already handled. It is shared by
# the capture_* methods and #process_ids, so it has to be reset by the caller
# when switching from capturing to processing.
#
# The class is a Singleton. Settings and the main entry points are mirrored as
# class-level methods that delegate to the single instance.
class Reprocessor # rubocop:disable Metrics/ClassLength
  include Singleton

  # Names of the persisted settings; each one gets an accessor on the instance
  # and a delegating reader/writer on the class.
  SETTINGS = %w[header_lines batch_size current_location limit incremental_save log_dir].freeze

  # Instance methods that can also be called directly on the class.
  # (The former list named `capture_ids`, which was never defined on the
  # instance and therefore always raised NoMethodError.)
  DELEGATED_METHODS = %i[
    capture_work_ids capture_file_set_ids capture_collection_ids capture_bulkrax_entry_ids
    process_ids lambda_create_relationships lambda_save lambda_index lambda_print
  ].freeze

  attr_accessor(*SETTINGS)

  def initialize
    @header_lines = 1 # leading lines of ids.log to skip (presumably the logger's file header - confirm)
    @batch_size = 1000
    @current_location = 0
    @limit = nil # nil means "no limit"
    @incremental_save = true
    @log_dir = 'tmp/imports'
    super
  end

  DELEGATED_METHODS.each do |method|
    define_singleton_method(method) do |*args|
      instance.public_send(method, *args)
    end
  end

  SETTINGS.each do |method|
    define_singleton_method(method) do |*args|
      instance.public_send(method, *args)
    end

    define_singleton_method("#{method}=") do |*args|
      instance.public_send("#{method}=", *args)
    end
  end

  # Restores every setting from "#{log_dir}/work_processor.json".
  # When that file does not exist, only +log_dir+ is set on the instance.
  #
  # @param log_dir [String] directory holding the save file
  def self.load(log_dir = Rails.root.join('tmp', 'imports').to_s)
    state = JSON.parse(File.read("#{log_dir}/work_processor.json"))
    SETTINGS.each do |setting|
      instance.public_send("#{setting}=", state[setting])
    end
  rescue Errno::ENOENT
    puts 'no save file to load' # rubocop:disable Rails/Output
    instance.log_dir = log_dir
  end

  # Writes the current settings as JSON to "#{log_dir}/work_processor.json".
  def self.save
    state = SETTINGS.each_with_object({}) do |setting, memo|
      memo[setting] = instance.public_send(setting)
    end
    File.write("#{instance.log_dir}/work_processor.json", state.to_json)
  end

  # Captures the ids of all records whose model is one of
  # Bulkrax.curation_concerns. Sets Hyrax.config.query_index_from_valkyrie to
  # false as a side effect.
  def capture_work_ids
    Hyrax.config.query_index_from_valkyrie = false
    search = "has_model_ssim:(#{Bulkrax.curation_concerns.join(' OR ')})"
    capture_with_solr(search)
  end

  # Captures the ids of all FileSet records. Sets
  # Hyrax.config.query_index_from_valkyrie to false as a side effect.
  def capture_file_set_ids
    Hyrax.config.query_index_from_valkyrie = false
    search = "has_model_ssim:(FileSet)"
    capture_with_solr(search)
  end

  # Captures the ids of all Collection records. Sets
  # Hyrax.config.query_index_from_valkyrie to false as a side effect.
  def capture_collection_ids
    Hyrax.config.query_index_from_valkyrie = false
    search = "has_model_ssim:(Collection)"
    capture_with_solr(search)
  end

  # Pages through the Solr results for +search+ in steps of +batch_size+,
  # starting at +current_location+, and writes each document id to the id log.
  # Stops once +current_location+ reaches the result count or +limit+.
  #
  # NOTE(review): the query passes no explicit sort; confirm paging order is
  # stable for the index in use.
  #
  # @param search [String] Solr query string
  def capture_with_solr(search)
    count = Hyrax::SolrService.count(search)
    progress(count)
    while current_location < count
      break if limit && current_location >= limit
      ids = Hyrax::SolrService.query(search, fl: 'id', rows: batch_size, start: current_location)
      self.current_location += batch_size
      ids.each do |i|
        id_log.error(i['id'])
      end
      # current_location may step past count on the last page; clamp the bar.
      progress.progress = [current_location, count].min
      Reprocessor.save if incremental_save
    end
  end
  # Original (misspelled) name, kept so existing callers keep working.
  alias caputre_with_solr capture_with_solr

  # Writes the id of every record yielded by +query.find_each+ to the id log,
  # skipping the first +current_location+ records and stopping at +limit+.
  #
  # @param query [#count, #find_each] e.g. an ActiveRecord relation of entries
  def capture_bulkrax_entry_ids(query)
    count = query.count
    progress(count)
    position = 0
    query.find_each do |entry|
      index = position
      position += 1 # advance for every record, including skipped ones
      if index < current_location
        progress.increment
        next
      end
      break if limit && index >= limit
      id_log.error(entry.id)
      progress.increment
      self.current_location += 1
      Reprocessor.save if incremental_save
    end
  end

  # Calls +lamb+ with (line, progress_bar) for every line of the id file that
  # has not been handled yet. The first +current_location+ data lines are
  # skipped; processing stops once +current_location+ reaches +limit+.
  # Any StandardError raised by +lamb+ is logged via #error and the run goes on.
  #
  # @param lamb [#call] receives the raw line (with its newline) and the bar
  def process_ids(lamb)
    progress(id_line_size)
    line_counter = 0
    with_id_lines do |lines|
      lines.each do |line|
        line_counter += 1
        # current_location counts finished lines, so line N is already done
        # whenever N <= current_location.
        if line_counter <= current_location
          progress.increment
          next
        end
        break if limit && current_location >= limit
        begin
          lamb.call(line, progress)
        rescue => e
          error(line, e)
        end
        self.current_location += 1
        progress.increment
        Reprocessor.save if incremental_save
      end
      # double break to get out of the lazy loop
      break if limit && current_location >= limit
    end
  end

  # Logs "<line> - <first 201 characters of the exception message>".
  def error(line, exception)
    msg = "#{line} - #{exception.message[0..200]}"
    error_log.error(msg)
  end

  # Logger writing to "#{log_dir}/error.log" (memoized on first use).
  def error_log
    @error_log ||= ActiveSupport::Logger.new("#{log_dir}/error.log")
  end

  # Path of the id file (memoized on first use, so later changes to +log_dir+
  # do not affect it).
  def id_path
    @id_path ||= "#{log_dir}/ids.log"
  end

  # Logger writing to #id_path (memoized on first use).
  def id_log
    @id_log ||= ActiveSupport::Logger.new(id_path)
  end

  # Number of lines in the id file, header lines included; 0 when the file
  # does not exist. Memoized on first use.
  #
  # @return [Integer]
  def id_line_size
    @id_line_size ||= (File.exist?(id_path) ? File.foreach(id_path).count : 0)
  end

  # Yields the id file's lines in arrays of at most +batch_size+, after
  # dropping the first +header_lines+ lines. The file is read lazily.
  def with_id_lines
    File.open(id_path) do |file|
      file.lazy.drop(header_lines).each_slice(batch_size) do |lines|
        yield lines
      end
    end
  end

  # Lambda for #process_ids: treats the line as a Bulkrax::Entry id and calls
  # create_parent_child_relationships on the entry's parser, once per importer
  # id (tracked in the global ::SEEN array).
  def lambda_create_relationships
    @lambda_create_relationships ||= lambda { |line, _progress|
      id = line.strip
      e = Bulkrax::Entry.find(id)
      ::SEEN ||= [] # rubocop:disable Style/MutableConstant
      unless ::SEEN.include?(e.importer.id)
        ::SEEN << e.importer.id
        e.parser.create_parent_child_relationships
      end
    }
  end

  # Lambda for #process_ids: finds the resource by id and calls #save on it.
  def lambda_save
    @lambda_save ||= lambda { |line, _progress|
      id = line.strip
      w = Hyrax.query_service.find_by(id:)
      w.save
    }
  end

  # Lambda for #process_ids: finds the resource by id and hands it to
  # Hyrax.index_adapter.save.
  def lambda_index
    @lambda_index ||= lambda { |line, _progress|
      id = line.strip
      w = Hyrax.query_service.find_by(id:)
      Hyrax.index_adapter.save(resource: w)
    }
  end

  # Lambda for #process_ids: logs the stripped line through the progress bar.
  def lambda_print
    @lambda_print ||= lambda { |line, progress|
      id = line.strip
      progress.log id
    }
  end

  # With +total+: creates, stores and returns a new progress bar.
  # Without: returns the bar created by the last call (nil if none yet).
  #
  # @param total [Integer, nil]
  def progress(total = nil)
    if total
      @progress = ProgressBar.create(total:,
                                     format: "%a %b\u{15E7}%i %c/%C %p%% %t",
                                     progress_mark: ' ',
                                     remainder_mark: "\u{FF65}")
    else
      @progress
    end
  end
end
Loading