Skip to content

Commit

Permalink
add delete job, track records without source ids without creating ent… (
Browse files Browse the repository at this point in the history
#199)

* add delete job, track records without source ids without creating entries, properly show invalid records

* fixes for rubocop and feedback
  • Loading branch information
orangewolf authored Jun 5, 2020
1 parent 6e02790 commit 37b98fb
Show file tree
Hide file tree
Showing 18 changed files with 208 additions and 132 deletions.
99 changes: 7 additions & 92 deletions app/factories/bulkrax/object_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Bulkrax
class ObjectFactory
extend ActiveModel::Callbacks
include Bulkrax::FileFactory
define_model_callbacks :save, :create
class_attribute :system_identifier_field
attr_reader :attributes, :object, :unique_identifier, :klass, :replace_files
Expand All @@ -30,6 +31,12 @@ def run
object
end

def run!
self.run
# Create the error exception if the object is not validly saved for some reason
raise ActiveFedora::RecordInvalid, object if !object.persisted? || object.changed?
end

def update
raise "Object doesn't exist" unless object
destroy_existing_files if @replace_files && klass != Collection
Expand Down Expand Up @@ -214,98 +221,6 @@ def transform_attributes
.merge(file_attributes)
end

# Find existing files or upload new files. This assumes a Work will have unique file titles;
# and that those file titles will not have changed
# could filter by URIs instead (slower).
# When an uploaded_file already exists we do not want to pass its id in `file_attributes`
# otherwise it gets reuploaded by `work_actor`.
# support multiple files; ensure attributes[:file] is an Array
def upload_ids
return [] if klass == Collection
attributes[:file] = file_paths
work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files
end

def file_attributes
hash = {}
return hash if klass == Collection
hash[:uploaded_files] = upload_ids if attributes[:file].present?
hash[:remote_files] = new_remote_files if new_remote_files.present?
hash
end

# Its possible to get just an array of strings here, so we need to make sure they are all hashes
def parsed_remote_files
return @parsed_remote_files if @parsed_remote_files.present?
@parsed_remote_files = attributes[:remote_files] || []
@parsed_remote_files = @parsed_remote_files.map do |file_value|
if file_value.is_a?(Hash)
file_value
elsif file_value.is_a?(String)
{ url: file_value }
else
Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type")
nil
end
end
@parsed_remote_files.delete(nil)
@parsed_remote_files
end

def new_remote_files
@new_remote_files ||= if object.present? && object.file_sets.present?
parsed_remote_files.select do |file|
# is the url valid?
is_valid = file[:url]&.match(URI::ABS_URI)
# does the file already exist
is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] }
is_valid && !is_existing
end
else
parsed_remote_files.select do |file|
file[:url]&.match(URI::ABS_URI)
end
end
end

def file_paths
@file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) }
end

# Retrieve the orginal filenames for the files to be imported
def work_files_filenames
object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present?
end

# Retrieve the filenames for the files to be imported
def import_files_filenames
file_paths.map { |f| f.split('/').last }
end

# Called if #replace_files is true
# Destroy all file_sets for this object
# Reload the object to ensure the remaining methods have the most up to date object
def destroy_existing_files
return unless object.present? && object.file_sets.present?
object.file_sets.each do |fs|
Hyrax::Actors::FileSetActor.new(fs, @user).destroy
end
@object = object.reload
log_deleted_fs(object)
end

def import_files
file_paths.map { |path| import_file(path) }
end

def import_file(path)
u = Hyrax::UploadedFile.new
u.user_id = @user.id
u.file = CarrierWave::SanitizedFile.new(path)
u.save
u.id
end

# Regardless of what the Parser gives us, these are the properties we are prepared to accept.
def permitted_attributes
klass.properties.keys.map(&:to_sym) + %i[id edit_users edit_groups read_groups visibility work_members_attributes]
Expand Down
16 changes: 16 additions & 0 deletions app/jobs/bulkrax/delete_work_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

module Bulkrax
class DeleteWorkJob < ApplicationJob
queue_as :import

# rubocop:disable Rails/SkipsModelValidations
def perform(entry, importer_run)
work = entry.factory.find
work&.delete
importer_run.increment!(:deleted_records)
importer_run.decrement!(:enqueued_records)
end
# rubocop:enable Rails/SkipsModelValidations
end
end
5 changes: 0 additions & 5 deletions app/jobs/bulkrax/import_work_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,6 @@ def perform(*args)
entry.save!
rescue Bulkrax::CollectionsCreatedError
reschedule(args[0], args[1])
# Exceptions here are not an issue with building the work.
# Those are caught separately, these are more likely network, db or other unexpected issues.
# Note that these temporary type issues do not raise the failure count
rescue StandardError, OAIError, RSolr::Error::Http => e
raise e
end
# rubocop:enable Rails/SkipsModelValidations

Expand Down
18 changes: 0 additions & 18 deletions app/models/bulkrax/importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def replace_files
def import_works(only_updates = false)
self.only_updates = only_updates
parser.create_works
remove_unseen
status_info
rescue StandardError => e
status_info(e)
Expand All @@ -99,23 +98,6 @@ def unique_collection_identifier(id)
"#{self.parser_fields['base_url'].split('/')[2]}_#{id}"
end

def remove_unseen
# TODO
# if primary_collection
# primary_collection.member_ids.each do |id|
# w = Work.find id
# unless seen[w.source[0]]
# if w.in_collections.size > 1
# primary_collection.members.delete w # only removes from primary collection - wants the record, not the id
# primary_collection.save
# else
# w.delete # removes from all collections
# end
# end
# end
# end
end

# The format for metadata for the incoming import; corresponds to an Entry class
def import_metadata_format
[['CSV', 'Bulkrax::CsvEntry'], ['RDF (N-Triples)', 'Bulkrax::RdfEntry']]
Expand Down
8 changes: 4 additions & 4 deletions app/models/bulkrax/importer_run.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ class ImporterRun < ApplicationRecord
belongs_to :importer

def importer_status
import_runs = importer.importer_runs.last
import_run = importer.importer_runs.last

return "Failed" if import_runs&.failed_records&.positive? || import_runs&.failed_collections&.positive? || import_runs&.failed_children&.positive?
return "Processing" if import_runs&.enqueued_records&.positive?
return "Completed" if import_runs&.enqueued_records&.zero? && import_runs&.processed_records == import_runs&.total_work_entries
return "Failed" if import_run&.failed_records&.positive? || import_run&.failed_collections&.positive? || import_run&.failed_children&.positive? || import_run&.invalid_records&.present?
return "Processing" if import_run&.enqueued_records&.positive?
return "Completed" if import_run&.enqueued_records&.zero? && import_run&.processed_records == import_run&.total_work_entries
end
end
end
99 changes: 99 additions & 0 deletions app/models/concerns/bulkrax/file_factory.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# frozen_string_literal: true

module Bulkrax
module FileFactory
extend ActiveSupport::Concern

# Find existing files or upload new files. This assumes a Work will have unique file titles;
# and that those file titles will not have changed
# could filter by URIs instead (slower).
# When an uploaded_file already exists we do not want to pass its id in `file_attributes`
# otherwise it gets reuploaded by `work_actor`.
# support multiple files; ensure attributes[:file] is an Array
def upload_ids
return [] if klass == Collection
attributes[:file] = file_paths
work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files
end

def file_attributes
hash = {}
return hash if klass == Collection
hash[:uploaded_files] = upload_ids if attributes[:file].present?
hash[:remote_files] = new_remote_files if new_remote_files.present?
hash
end

# Its possible to get just an array of strings here, so we need to make sure they are all hashes
def parsed_remote_files
return @parsed_remote_files if @parsed_remote_files.present?
@parsed_remote_files = attributes[:remote_files] || []
@parsed_remote_files = @parsed_remote_files.map do |file_value|
if file_value.is_a?(Hash)
file_value
elsif file_value.is_a?(String)
{ url: file_value }
else
Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type")
nil
end
end
@parsed_remote_files.delete(nil)
@parsed_remote_files
end

def new_remote_files
@new_remote_files ||= if object.present? && object.file_sets.present?
parsed_remote_files.select do |file|
# is the url valid?
is_valid = file[:url]&.match(URI::ABS_URI)
# does the file already exist
is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] }
is_valid && !is_existing
end
else
parsed_remote_files.select do |file|
file[:url]&.match(URI::ABS_URI)
end
end
end

def file_paths
@file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) }
end

# Retrieve the orginal filenames for the files to be imported
def work_files_filenames
object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present?
end

# Retrieve the filenames for the files to be imported
def import_files_filenames
file_paths.map { |f| f.split('/').last }
end

# Called if #replace_files is true
# Destroy all file_sets for this object
# Reload the object to ensure the remaining methods have the most up to date object
def destroy_existing_files
return unless object.present? && object.file_sets.present?
object.file_sets.each do |fs|
Hyrax::Actors::FileSetActor.new(fs, @user).destroy
end
@object = object.reload
log_deleted_fs(object)
end

def import_files
file_paths.map { |path| import_file(path) }
end

def import_file(path)
u = Hyrax::UploadedFile.new
u.user_id = @user.id
u.file = CarrierWave::SanitizedFile.new(path)
u.save
u.id
end
end
end
2 changes: 1 addition & 1 deletion app/models/concerns/bulkrax/has_matchers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def field_supported?(field)
field = field.gsub('_attributes', '')

return false if excluded?(field)
return true if ['file', 'remote_files', 'model'].include?(field)
return true if ['file', 'remote_files', 'model', 'delete'].include?(field)
return factory_class.method_defined?(field) && factory_class.properties[field].present?
end

Expand Down
2 changes: 1 addition & 1 deletion app/models/concerns/bulkrax/import_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def build_for_importer
build_metadata
unless self.importerexporter.validate_only
raise CollectionsCreatedError unless collections_created?
@item = factory.run
@item = factory.run!
end
rescue RSolr::Error::Http, CollectionsCreatedError => e
raise e
Expand Down
14 changes: 12 additions & 2 deletions app/parsers/bulkrax/csv_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,22 @@ def create_collections

def create_works
records.each_with_index do |record, index|
next if record[:source_identifier].blank?
if record[:source_identifier].blank?
current_importer_run.invalid_records ||= ""
current_importer_run.invalid_records += "Missing #{Bulkrax.system_identifier_field} for #{record.to_h}\n"
current_importer_run.failed_records += 1
current_importer_run.save
next
end
break if limit_reached?(limit, index)

seen[record[:source_identifier]] = true
new_entry = find_or_create_entry(entry_class, record[:source_identifier], 'Bulkrax::Importer', record.to_h.compact)
ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id)
if record[:delete].present?
DeleteWorkJob.send(perform_method, new_entry, current_importer_run)
else
ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id)
end
increment_counters(index)
end
status_info
Expand Down
5 changes: 3 additions & 2 deletions app/parsers/bulkrax/oai_dc_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,13 @@ def create_works
return unless results.present?
results.full.each_with_index do |record, index|
break if limit_reached?(limit, index)
seen[record.identifier] = true
new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create!
if record.deleted? # TODO: record.status == "deleted"
DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_importer_run)
importerexporter.current_importer_run.deleted_records += 1
importerexporter.current_importer_run.save!
else
seen[record.identifier] = true
new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create!
ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_importer_run.id)
increment_counters(index)
end
Expand Down
4 changes: 2 additions & 2 deletions app/views/bulkrax/entries/show.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
<% elsif @entry.last_succeeded_at != nil %>
<strong>Succeeded At:</strong> <%= @entry.last_succeeded_at %>
<% else %>
<strong>Succeeded At:</strong> Item has not been <%= @importer.present? ? 'imported' : 'exported' %> successfully
<strong>Succeeded At:</strong> Item has not yet been <%= @importer.present? ? 'imported' : 'exported' %> successfully
<% end %>
</p>

Expand All @@ -79,7 +79,7 @@
<%= link_to @entry.factory_class.to_s, main_app.polymorphic_path(factory_record) %>
<% end %>
<% else %>
<strong>Item Link:</strong> Item has not been imported successfully
<strong>Item Link:</strong> Item has not yet been imported successfully
<% end %>
<% else %>
<% record = @entry&.hyrax_record %>
Expand Down
3 changes: 3 additions & 0 deletions app/views/bulkrax/importers/show.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@
<strong>Error:</strong> <%= @importer.last_error['error_class'] %> - <%= @importer.last_error['error_message'] %><br />
<strong>Error Trace:</strong> <button id='err_toggle'>Toggle</button><br>
<span id='error_trace' style='display: none'><%= @importer.last_error['error_trace'] %></span>
<% elsif @importer.importer_runs.last&.invalid_records&.present? %>
<strong>Error Trace:</strong> <button id='err_toggle'>Toggle</button><br>
<span id='error_trace' style='display: none'><%= @importer.importer_runs.last.invalid_records %></span>
<% elsif @importer.last_succeeded_at.present? %>
<strong>Succeeded At:</strong> <%= @importer.last_succeeded_at %>
<% end %>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
class AddInvalidRecordToImporterRun < ActiveRecord::Migration[5.1]
def change
add_column :bulkrax_importer_runs, :invalid_records, :text
end
end
Loading

0 comments on commit 37b98fb

Please sign in to comment.