diff --git a/app/factories/bulkrax/object_factory.rb b/app/factories/bulkrax/object_factory.rb index eec2b6a8..c95c7320 100644 --- a/app/factories/bulkrax/object_factory.rb +++ b/app/factories/bulkrax/object_factory.rb @@ -4,6 +4,7 @@ module Bulkrax class ObjectFactory extend ActiveModel::Callbacks + include Bulkrax::FileFactory define_model_callbacks :save, :create class_attribute :system_identifier_field attr_reader :attributes, :object, :unique_identifier, :klass, :replace_files @@ -30,6 +31,12 @@ def run object end + def run! + self.run + # Create the error exception if the object is not validly saved for some reason + raise ActiveFedora::RecordInvalid, object if !object.persisted? || object.changed? + end + def update raise "Object doesn't exist" unless object destroy_existing_files if @replace_files && klass != Collection @@ -214,98 +221,6 @@ def transform_attributes .merge(file_attributes) end - # Find existing files or upload new files. This assumes a Work will have unique file titles; - # and that those file titles will not have changed - # could filter by URIs instead (slower). - # When an uploaded_file already exists we do not want to pass its id in `file_attributes` - # otherwise it gets reuploaded by `work_actor`. - # support multiple files; ensure attributes[:file] is an Array - def upload_ids - return [] if klass == Collection - attributes[:file] = file_paths - work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files - end - - def file_attributes - hash = {} - return hash if klass == Collection - hash[:uploaded_files] = upload_ids if attributes[:file].present? - hash[:remote_files] = new_remote_files if new_remote_files.present? - hash - end - - # Its possible to get just an array of strings here, so we need to make sure they are all hashes - def parsed_remote_files - return @parsed_remote_files if @parsed_remote_files.present? - @parsed_remote_files = attributes[:remote_files] || [] - @parsed_remote_files = @parsed_remote_files.map do |file_value| - if file_value.is_a?(Hash) - file_value - elsif file_value.is_a?(String) - { url: file_value } - else - Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type") - nil - end - end - @parsed_remote_files.delete(nil) - @parsed_remote_files - end - - def new_remote_files - @new_remote_files ||= if object.present? && object.file_sets.present? - parsed_remote_files.select do |file| - # is the url valid? - is_valid = file[:url]&.match(URI::ABS_URI) - # does the file already exist - is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] } - is_valid && !is_existing - end - else - parsed_remote_files.select do |file| - file[:url]&.match(URI::ABS_URI) - end - end - end - - def file_paths - @file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) } - end - - # Retrieve the orginal filenames for the files to be imported - def work_files_filenames - object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present? - end - - # Retrieve the filenames for the files to be imported - def import_files_filenames - file_paths.map { |f| f.split('/').last } - end - - # Called if #replace_files is true - # Destroy all file_sets for this object - # Reload the object to ensure the remaining methods have the most up to date object - def destroy_existing_files - return unless object.present? && object.file_sets.present? - object.file_sets.each do |fs| - Hyrax::Actors::FileSetActor.new(fs, @user).destroy - end - @object = object.reload - log_deleted_fs(object) - end - - def import_files - file_paths.map { |path| import_file(path) } - end - - def import_file(path) - u = Hyrax::UploadedFile.new - u.user_id = @user.id - u.file = CarrierWave::SanitizedFile.new(path) - u.save - u.id - end - # Regardless of what the Parser gives us, these are the properties we are prepared to accept. def permitted_attributes klass.properties.keys.map(&:to_sym) + %i[id edit_users edit_groups read_groups visibility work_members_attributes] diff --git a/app/jobs/bulkrax/delete_work_job.rb b/app/jobs/bulkrax/delete_work_job.rb new file mode 100644 index 00000000..ece2c386 --- /dev/null +++ b/app/jobs/bulkrax/delete_work_job.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +module Bulkrax + class DeleteWorkJob < ApplicationJob + queue_as :import + + # rubocop:disable Rails/SkipsModelValidations + def perform(entry, importer_run) + work = entry.factory.find + work&.delete + importer_run.increment!(:deleted_records) + importer_run.decrement!(:enqueued_records) + end + # rubocop:enable Rails/SkipsModelValidations + end +end diff --git a/app/jobs/bulkrax/import_work_job.rb b/app/jobs/bulkrax/import_work_job.rb index 011865dd..0e41905e 100644 --- a/app/jobs/bulkrax/import_work_job.rb +++ b/app/jobs/bulkrax/import_work_job.rb @@ -20,11 +20,6 @@ def perform(*args) entry.save! rescue Bulkrax::CollectionsCreatedError reschedule(args[0], args[1]) - # Exceptions here are not an issue with building the work. - # Those are caught separately, these are more likely network, db or other unexpected issues. - # Note that these temporary type issues do not raise the failure count - rescue StandardError, OAIError, RSolr::Error::Http => e - raise e end # rubocop:enable Rails/SkipsModelValidations diff --git a/app/models/bulkrax/importer.rb b/app/models/bulkrax/importer.rb index ce8900c3..20ce78c2 100644 --- a/app/models/bulkrax/importer.rb +++ b/app/models/bulkrax/importer.rb @@ -80,7 +80,6 @@ def replace_files def import_works(only_updates = false) self.only_updates = only_updates parser.create_works - remove_unseen status_info rescue StandardError => e status_info(e) @@ -99,23 +98,6 @@ def unique_collection_identifier(id) "#{self.parser_fields['base_url'].split('/')[2]}_#{id}" end - def remove_unseen - # TODO - # if primary_collection - # primary_collection.member_ids.each do |id| - # w = Work.find id - # unless seen[w.source[0]] - # if w.in_collections.size > 1 - # primary_collection.members.delete w # only removes from primary collection - wants the record, not the id - # primary_collection.save - # else - # w.delete # removes from all collections - # end - # end - # end - # end - end - # The format for metadata for the incoming import; corresponds to an Entry class def import_metadata_format [['CSV', 'Bulkrax::CsvEntry'], ['RDF (N-Triples)', 'Bulkrax::RdfEntry']] diff --git a/app/models/bulkrax/importer_run.rb b/app/models/bulkrax/importer_run.rb index fec5639a..51d474be 100644 --- a/app/models/bulkrax/importer_run.rb +++ b/app/models/bulkrax/importer_run.rb @@ -5,11 +5,11 @@ class ImporterRun < ApplicationRecord belongs_to :importer def importer_status - import_runs = importer.importer_runs.last + import_run = importer.importer_runs.last - return "Failed" if import_runs&.failed_records&.positive? || import_runs&.failed_collections&.positive? || import_runs&.failed_children&.positive? - return "Processing" if import_runs&.enqueued_records&.positive? - return "Completed" if import_runs&.enqueued_records&.zero? && import_runs&.processed_records == import_runs&.total_work_entries + return "Failed" if import_run&.failed_records&.positive? || import_run&.failed_collections&.positive? || import_run&.failed_children&.positive? || import_run&.invalid_records&.present? + return "Processing" if import_run&.enqueued_records&.positive? + return "Completed" if import_run&.enqueued_records&.zero? && import_run&.processed_records == import_run&.total_work_entries end end end diff --git a/app/models/concerns/bulkrax/file_factory.rb b/app/models/concerns/bulkrax/file_factory.rb new file mode 100644 index 00000000..148d90df --- /dev/null +++ b/app/models/concerns/bulkrax/file_factory.rb @@ -0,0 +1,99 @@ +# frozen_string_literal: true + +module Bulkrax + module FileFactory + extend ActiveSupport::Concern + + # Find existing files or upload new files. This assumes a Work will have unique file titles; + # and that those file titles will not have changed + # could filter by URIs instead (slower). + # When an uploaded_file already exists we do not want to pass its id in `file_attributes` + # otherwise it gets reuploaded by `work_actor`. + # support multiple files; ensure attributes[:file] is an Array + def upload_ids + return [] if klass == Collection + attributes[:file] = file_paths + work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files + end + + def file_attributes + hash = {} + return hash if klass == Collection + hash[:uploaded_files] = upload_ids if attributes[:file].present? + hash[:remote_files] = new_remote_files if new_remote_files.present? + hash + end + + # Its possible to get just an array of strings here, so we need to make sure they are all hashes + def parsed_remote_files + return @parsed_remote_files if @parsed_remote_files.present? + @parsed_remote_files = attributes[:remote_files] || [] + @parsed_remote_files = @parsed_remote_files.map do |file_value| + if file_value.is_a?(Hash) + file_value + elsif file_value.is_a?(String) + { url: file_value } + else + Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type") + nil + end + end + @parsed_remote_files.delete(nil) + @parsed_remote_files + end + + def new_remote_files + @new_remote_files ||= if object.present? && object.file_sets.present? + parsed_remote_files.select do |file| + # is the url valid? + is_valid = file[:url]&.match(URI::ABS_URI) + # does the file already exist + is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] } + is_valid && !is_existing + end + else + parsed_remote_files.select do |file| + file[:url]&.match(URI::ABS_URI) + end + end + end + + def file_paths + @file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) } + end + + # Retrieve the orginal filenames for the files to be imported + def work_files_filenames + object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present? + end + + # Retrieve the filenames for the files to be imported + def import_files_filenames + file_paths.map { |f| f.split('/').last } + end + + # Called if #replace_files is true + # Destroy all file_sets for this object + # Reload the object to ensure the remaining methods have the most up to date object + def destroy_existing_files + return unless object.present? && object.file_sets.present? + object.file_sets.each do |fs| + Hyrax::Actors::FileSetActor.new(fs, @user).destroy + end + @object = object.reload + log_deleted_fs(object) + end + + def import_files + file_paths.map { |path| import_file(path) } + end + + def import_file(path) + u = Hyrax::UploadedFile.new + u.user_id = @user.id + u.file = CarrierWave::SanitizedFile.new(path) + u.save + u.id + end + end +end diff --git a/app/models/concerns/bulkrax/has_matchers.rb b/app/models/concerns/bulkrax/has_matchers.rb index 3e83df1b..b9a84bc0 100644 --- a/app/models/concerns/bulkrax/has_matchers.rb +++ b/app/models/concerns/bulkrax/has_matchers.rb @@ -59,7 +59,7 @@ def field_supported?(field) field = field.gsub('_attributes', '') return false if excluded?(field) - return true if ['file', 'remote_files', 'model'].include?(field) + return true if ['file', 'remote_files', 'model', 'delete'].include?(field) return factory_class.method_defined?(field) && factory_class.properties[field].present? end diff --git a/app/models/concerns/bulkrax/import_behavior.rb b/app/models/concerns/bulkrax/import_behavior.rb index 63f122dc..9f9b9658 100644 --- a/app/models/concerns/bulkrax/import_behavior.rb +++ b/app/models/concerns/bulkrax/import_behavior.rb @@ -10,7 +10,7 @@ def build_for_importer build_metadata unless self.importerexporter.validate_only raise CollectionsCreatedError unless collections_created? - @item = factory.run + @item = factory.run! end rescue RSolr::Error::Http, CollectionsCreatedError => e raise e diff --git a/app/parsers/bulkrax/csv_parser.rb b/app/parsers/bulkrax/csv_parser.rb index d0491497..ec373e0b 100644 --- a/app/parsers/bulkrax/csv_parser.rb +++ b/app/parsers/bulkrax/csv_parser.rb @@ -65,12 +65,22 @@ def create_collections def create_works records.each_with_index do |record, index| - next if record[:source_identifier].blank? + if record[:source_identifier].blank? + current_importer_run.invalid_records ||= "" + current_importer_run.invalid_records += "Missing #{Bulkrax.system_identifier_field} for #{record.to_h}\n" + current_importer_run.failed_records += 1 + current_importer_run.save + next + end break if limit_reached?(limit, index) seen[record[:source_identifier]] = true new_entry = find_or_create_entry(entry_class, record[:source_identifier], 'Bulkrax::Importer', record.to_h.compact) - ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id) + if record[:delete].present? + DeleteWorkJob.send(perform_method, new_entry, current_importer_run) + else + ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id) + end increment_counters(index) end status_info diff --git a/app/parsers/bulkrax/oai_dc_parser.rb b/app/parsers/bulkrax/oai_dc_parser.rb index ef773e53..bd771a0b 100644 --- a/app/parsers/bulkrax/oai_dc_parser.rb +++ b/app/parsers/bulkrax/oai_dc_parser.rb @@ -85,12 +85,13 @@ def create_works return unless results.present? results.full.each_with_index do |record, index| break if limit_reached?(limit, index) + seen[record.identifier] = true + new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create! if record.deleted? # TODO: record.status == "deleted" + DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_importer_run) importerexporter.current_importer_run.deleted_records += 1 importerexporter.current_importer_run.save! else - seen[record.identifier] = true - new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create! ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_importer_run.id) increment_counters(index) end diff --git a/app/views/bulkrax/entries/show.html.erb b/app/views/bulkrax/entries/show.html.erb index c8119de9..2426cf15 100644 --- a/app/views/bulkrax/entries/show.html.erb +++ b/app/views/bulkrax/entries/show.html.erb @@ -54,7 +54,7 @@ <% elsif @entry.last_succeeded_at != nil %> Succeeded At: <%= @entry.last_succeeded_at %> <% else %> - Succeeded At: Item has not been <%= @importer.present? ? 'imported' : 'exported' %> successfully + Succeeded At: Item has not yet been <%= @importer.present? ? 'imported' : 'exported' %> successfully <% end %>

@@ -79,7 +79,7 @@ <%= link_to @entry.factory_class.to_s, main_app.polymorphic_path(factory_record) %> <% end %> <% else %> - Item Link: Item has not been imported successfully + Item Link: Item has not yet been imported successfully <% end %> <% else %> <% record = @entry&.hyrax_record %> diff --git a/app/views/bulkrax/importers/show.html.erb b/app/views/bulkrax/importers/show.html.erb index d92d1d22..6d007938 100644 --- a/app/views/bulkrax/importers/show.html.erb +++ b/app/views/bulkrax/importers/show.html.erb @@ -47,6 +47,9 @@ Error: <%= @importer.last_error['error_class'] %> - <%= @importer.last_error['error_message'] %>
Error Trace:
+ <% elsif @importer.importer_runs.last&.invalid_records&.present? %> + Error Trace:
+ <% elsif @importer.last_succeeded_at.present? %> Succeeded At: <%= @importer.last_succeeded_at %> <% end %> diff --git a/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb b/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb new file mode 100644 index 00000000..de2de733 --- /dev/null +++ b/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb @@ -0,0 +1,5 @@ +class AddInvalidRecordToImporterRun < ActiveRecord::Migration[5.1] + def change + add_column :bulkrax_importer_runs, :invalid_records, :text + end +end diff --git a/spec/jobs/bulkrax/delete_work_job_spec.rb b/spec/jobs/bulkrax/delete_work_job_spec.rb new file mode 100644 index 00000000..785ff06d --- /dev/null +++ b/spec/jobs/bulkrax/delete_work_job_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +require 'rails_helper' + +module Bulkrax + RSpec.describe DeleteWorkJob, type: :job do + subject(:delete_work_job) { described_class.new } + let(:entry) { FactoryBot.build(:bulkrax_entry) } + let(:importer_run) { FactoryBot.build(:bulkrax_importer_run) } + + before do + Bulkrax.default_work_type = 'Work' + end + + after do + Bulkrax.default_work_type = nil + end + + describe 'successful job object removed' do + before do + work = instance_double("Work") + factory = instance_double("Bulkrax::ObjectFactory") + expect(work).to receive(:delete).and_return true + expect(factory).to receive(:find).and_return(work) + expect(entry).to receive(:factory).and_return(factory) + end + + it 'increments :deleted_records' do + expect(importer_run).to receive(:increment!).with(:deleted_records) + expect(importer_run).to receive(:decrement!).with(:enqueued_records) + delete_work_job.perform(entry, importer_run) + end + end + + describe 'successful job object not found' do + before do + factory = instance_double("Bulkrax::ObjectFactory") + expect(factory).to receive(:find).and_return(nil) + expect(entry).to receive(:factory).and_return(factory) + end + + it 'increments :deleted_records' do + expect(importer_run).to receive(:increment!).with(:deleted_records) + expect(importer_run).to receive(:decrement!).with(:enqueued_records) + delete_work_job.perform(entry, importer_run) + end + end + end +end diff --git a/spec/models/bulkrax/csv_entry_spec.rb b/spec/models/bulkrax/csv_entry_spec.rb index 0ef0b018..87a4b8d4 100644 --- a/spec/models/bulkrax/csv_entry_spec.rb +++ b/spec/models/bulkrax/csv_entry_spec.rb @@ -24,7 +24,7 @@ module Bulkrax context 'with required metadata' do before do - allow_any_instance_of(ObjectFactory).to receive(:run) + allow_any_instance_of(ObjectFactory).to receive(:run!) allow(subject).to receive(:record).and_return('source_identifier' => '2', 'title' => 'some title') end @@ -36,7 +36,7 @@ module Bulkrax context 'with files containing spaces' do before do - allow_any_instance_of(ObjectFactory).to receive(:run) + allow_any_instance_of(ObjectFactory).to receive(:run!) allow(subject).to receive(:record).and_return('source_identifier' => '3', 'title' => 'some title') allow(File).to receive(:exist?).with('./spec/fixtures/csv/test_file.csv').and_return(true) diff --git a/spec/models/bulkrax/rdf_entry_spec.rb b/spec/models/bulkrax/rdf_entry_spec.rb index 0e97433e..9c0a62e0 100644 --- a/spec/models/bulkrax/rdf_entry_spec.rb +++ b/spec/models/bulkrax/rdf_entry_spec.rb @@ -50,7 +50,7 @@ module Bulkrax before do subject.raw_metadata = raw_metadata subject.identifier = "http://example.org/ns/19158" - allow_any_instance_of(ObjectFactory).to receive(:run).and_return(instance_of(Work)) + allow_any_instance_of(ObjectFactory).to receive(:run!).and_return(instance_of(Work)) allow(User).to receive(:batch_user) end diff --git a/spec/models/bulkrax/xml_entry_spec.rb b/spec/models/bulkrax/xml_entry_spec.rb index e0470095..8e430e06 100644 --- a/spec/models/bulkrax/xml_entry_spec.rb +++ b/spec/models/bulkrax/xml_entry_spec.rb @@ -50,7 +50,7 @@ module Bulkrax before do xml_entry.raw_metadata = raw_metadata allow(ObjectFactory).to receive(:new).and_return(object_factory) - allow(object_factory).to receive(:run).and_return(instance_of(Work)) + allow(object_factory).to receive(:run!).and_return(instance_of(Work)) allow(User).to receive(:batch_user) end diff --git a/spec/test_app/db/schema.rb b/spec/test_app/db/schema.rb index e86e449e..fc4ce5db 100644 --- a/spec/test_app/db/schema.rb +++ b/spec/test_app/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 20200326235838) do +ActiveRecord::Schema.define(version: 20200601204556) do create_table "bookmarks", force: :cascade do |t| t.integer "user_id", null: false @@ -82,6 +82,7 @@ t.integer "total_collection_entries", default: 0 t.integer "processed_children", default: 0 t.integer "failed_children", default: 0 + t.text "invalid_records" t.index ["importer_id"], name: "index_bulkrax_importer_runs_on_importer_id" end