add delete job, track records without source ids without creating ent… (

#199) * add delete job, track records without source ids without creating entries, properly show invalid records * fixes for rubocop and feedback
samvera · Jun 5, 2020 · 37b98fb · 37b98fb
1 parent 6e02790
commit 37b98fb
Show file tree

Hide file tree

Showing 18 changed files with 208 additions and 132 deletions.
diff --git a/app/factories/bulkrax/object_factory.rb b/app/factories/bulkrax/object_factory.rb
@@ -4,6 +4,7 @@
 module Bulkrax
   class ObjectFactory
     extend ActiveModel::Callbacks
+    include Bulkrax::FileFactory
     define_model_callbacks :save, :create
     class_attribute :system_identifier_field
     attr_reader :attributes, :object, :unique_identifier, :klass, :replace_files
@@ -30,6 +31,12 @@ def run
       object
     end
 
+    def run!
+      self.run
+      # Create the error exception if the object is not validly saved for some reason
+      raise ActiveFedora::RecordInvalid, object if !object.persisted? || object.changed?
+    end
+
     def update
       raise "Object doesn't exist" unless object
       destroy_existing_files if @replace_files && klass != Collection
@@ -214,98 +221,6 @@ def transform_attributes
                   .merge(file_attributes)
       end
 
-      # Find existing files or upload new files. This assumes a Work will have unique file titles;
-      #   and that those file titles will not have changed
-      # could filter by URIs instead (slower).
-      # When an uploaded_file already exists we do not want to pass its id in `file_attributes`
-      # otherwise it gets reuploaded by `work_actor`.
-      # support multiple files; ensure attributes[:file] is an Array
-      def upload_ids
-        return [] if klass == Collection
-        attributes[:file] = file_paths
-        work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files
-      end
-
-      def file_attributes
-        hash = {}
-        return hash if klass == Collection
-        hash[:uploaded_files] = upload_ids if attributes[:file].present?
-        hash[:remote_files] = new_remote_files if new_remote_files.present?
-        hash
-      end
-
-      # Its possible to get just an array of strings here, so we need to make sure they are all hashes
-      def parsed_remote_files
-        return @parsed_remote_files if @parsed_remote_files.present?
-        @parsed_remote_files = attributes[:remote_files] || []
-        @parsed_remote_files = @parsed_remote_files.map do |file_value|
-          if file_value.is_a?(Hash)
-            file_value
-          elsif file_value.is_a?(String)
-            { url: file_value }
-          else
-            Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type")
-            nil
-          end
-        end
-        @parsed_remote_files.delete(nil)
-        @parsed_remote_files
-      end
-
-      def new_remote_files
-        @new_remote_files ||= if object.present? && object.file_sets.present?
-                                parsed_remote_files.select do |file|
-                                  # is the url valid?
-                                  is_valid = file[:url]&.match(URI::ABS_URI)
-                                  # does the file already exist
-                                  is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] }
-                                  is_valid && !is_existing
-                                end
-                              else
-                                parsed_remote_files.select do |file|
-                                  file[:url]&.match(URI::ABS_URI)
-                                end
-                              end
-      end
-
-      def file_paths
-        @file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) }
-      end
-
-      # Retrieve the orginal filenames for the files to be imported
-      def work_files_filenames
-        object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present?
-      end
-
-      # Retrieve the filenames for the files to be imported
-      def import_files_filenames
-        file_paths.map { |f| f.split('/').last }
-      end
-
-      # Called if #replace_files is true
-      # Destroy all file_sets for this object
-      # Reload the object to ensure the remaining methods have the most up to date object
-      def destroy_existing_files
-        return unless object.present? && object.file_sets.present?
-        object.file_sets.each do |fs|
-          Hyrax::Actors::FileSetActor.new(fs, @user).destroy
-        end
-        @object = object.reload
-        log_deleted_fs(object)
-      end
-
-      def import_files
-        file_paths.map { |path| import_file(path) }
-      end
-
-      def import_file(path)
-        u = Hyrax::UploadedFile.new
-        u.user_id = @user.id
-        u.file = CarrierWave::SanitizedFile.new(path)
-        u.save
-        u.id
-      end
-
       # Regardless of what the Parser gives us, these are the properties we are prepared to accept.
       def permitted_attributes
         klass.properties.keys.map(&:to_sym) + %i[id edit_users edit_groups read_groups visibility work_members_attributes]

diff --git a/app/jobs/bulkrax/delete_work_job.rb b/app/jobs/bulkrax/delete_work_job.rb
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+module Bulkrax
+  class DeleteWorkJob < ApplicationJob
+    queue_as :import
+
+    # rubocop:disable Rails/SkipsModelValidations
+    def perform(entry, importer_run)
+      work = entry.factory.find
+      work&.delete
+      importer_run.increment!(:deleted_records)
+      importer_run.decrement!(:enqueued_records)
+    end
+    # rubocop:enable Rails/SkipsModelValidations
+  end
+end
diff --git a/app/jobs/bulkrax/import_work_job.rb b/app/jobs/bulkrax/import_work_job.rb
@@ -20,11 +20,6 @@ def perform(*args)
       entry.save!
     rescue Bulkrax::CollectionsCreatedError
       reschedule(args[0], args[1])
-      # Exceptions here are not an issue with building the work.
-      # Those are caught separately, these are more likely network, db or other unexpected issues.
-      # Note that these temporary type issues do not raise the failure count
-    rescue StandardError, OAIError, RSolr::Error::Http => e
-      raise e
     end
     # rubocop:enable Rails/SkipsModelValidations
 

diff --git a/app/models/bulkrax/importer.rb b/app/models/bulkrax/importer.rb
@@ -80,7 +80,6 @@ def replace_files
     def import_works(only_updates = false)
       self.only_updates = only_updates
       parser.create_works
-      remove_unseen
       status_info
     rescue StandardError => e
       status_info(e)
@@ -99,23 +98,6 @@ def unique_collection_identifier(id)
       "#{self.parser_fields['base_url'].split('/')[2]}_#{id}"
     end
 
-    def remove_unseen
-      # TODO
-      # if primary_collection
-      #   primary_collection.member_ids.each do |id|
-      #     w = Work.find id
-      #     unless seen[w.source[0]]
-      #       if w.in_collections.size > 1
-      #         primary_collection.members.delete w # only removes from primary collection - wants the record, not the id
-      #         primary_collection.save
-      #       else
-      #         w.delete # removes from all collections
-      #       end
-      #     end
-      #   end
-      # end
-    end
-
     # The format for metadata for the incoming import; corresponds to an Entry class
     def import_metadata_format
       [['CSV', 'Bulkrax::CsvEntry'], ['RDF (N-Triples)', 'Bulkrax::RdfEntry']]

diff --git a/app/models/bulkrax/importer_run.rb b/app/models/bulkrax/importer_run.rb
@@ -5,11 +5,11 @@ class ImporterRun < ApplicationRecord
     belongs_to :importer
 
     def importer_status
-      import_runs = importer.importer_runs.last
+      import_run = importer.importer_runs.last
 
-      return "Failed" if import_runs&.failed_records&.positive? || import_runs&.failed_collections&.positive? || import_runs&.failed_children&.positive?
-      return "Processing" if import_runs&.enqueued_records&.positive?
-      return "Completed" if import_runs&.enqueued_records&.zero? && import_runs&.processed_records == import_runs&.total_work_entries
+      return "Failed" if import_run&.failed_records&.positive? || import_run&.failed_collections&.positive? || import_run&.failed_children&.positive? || import_run&.invalid_records&.present?
+      return "Processing" if import_run&.enqueued_records&.positive?
+      return "Completed" if import_run&.enqueued_records&.zero? && import_run&.processed_records == import_run&.total_work_entries
     end
   end
 end
diff --git a/app/models/concerns/bulkrax/file_factory.rb b/app/models/concerns/bulkrax/file_factory.rb
@@ -0,0 +1,99 @@
+# frozen_string_literal: true
+
+module Bulkrax
+  module FileFactory
+    extend ActiveSupport::Concern
+
+    # Find existing files or upload new files. This assumes a Work will have unique file titles;
+    #   and that those file titles will not have changed
+    # could filter by URIs instead (slower).
+    # When an uploaded_file already exists we do not want to pass its id in `file_attributes`
+    # otherwise it gets reuploaded by `work_actor`.
+    # support multiple files; ensure attributes[:file] is an Array
+    def upload_ids
+      return [] if klass == Collection
+      attributes[:file] = file_paths
+      work_files_filenames && (work_files_filenames & import_files_filenames).present? ? [] : import_files
+    end
+
+    def file_attributes
+      hash = {}
+      return hash if klass == Collection
+      hash[:uploaded_files] = upload_ids if attributes[:file].present?
+      hash[:remote_files] = new_remote_files if new_remote_files.present?
+      hash
+    end
+
+    # Its possible to get just an array of strings here, so we need to make sure they are all hashes
+    def parsed_remote_files
+      return @parsed_remote_files if @parsed_remote_files.present?
+      @parsed_remote_files = attributes[:remote_files] || []
+      @parsed_remote_files = @parsed_remote_files.map do |file_value|
+        if file_value.is_a?(Hash)
+          file_value
+        elsif file_value.is_a?(String)
+          { url: file_value }
+        else
+          Rails.logger.error("skipped remote file #{file_value} because we do not recognize the type")
+          nil
+        end
+      end
+      @parsed_remote_files.delete(nil)
+      @parsed_remote_files
+    end
+
+    def new_remote_files
+      @new_remote_files ||= if object.present? && object.file_sets.present?
+                              parsed_remote_files.select do |file|
+                                # is the url valid?
+                                is_valid = file[:url]&.match(URI::ABS_URI)
+                                # does the file already exist
+                                is_existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] }
+                                is_valid && !is_existing
+                              end
+                            else
+                              parsed_remote_files.select do |file|
+                                file[:url]&.match(URI::ABS_URI)
+                              end
+                            end
+    end
+
+    def file_paths
+      @file_paths ||= Array.wrap(attributes[:file])&.select { |file| File.exist?(file) }
+    end
+
+    # Retrieve the orginal filenames for the files to be imported
+    def work_files_filenames
+      object.file_sets.map { |fn| fn.original_file.file_name.to_a }.flatten if object.present? && object.file_sets.present?
+    end
+
+    # Retrieve the filenames for the files to be imported
+    def import_files_filenames
+      file_paths.map { |f| f.split('/').last }
+    end
+
+    # Called if #replace_files is true
+    # Destroy all file_sets for this object
+    # Reload the object to ensure the remaining methods have the most up to date object
+    def destroy_existing_files
+      return unless object.present? && object.file_sets.present?
+      object.file_sets.each do |fs|
+        Hyrax::Actors::FileSetActor.new(fs, @user).destroy
+      end
+      @object = object.reload
+      log_deleted_fs(object)
+    end
+
+    def import_files
+      file_paths.map { |path| import_file(path) }
+    end
+
+    def import_file(path)
+      u = Hyrax::UploadedFile.new
+      u.user_id = @user.id
+      u.file = CarrierWave::SanitizedFile.new(path)
+      u.save
+      u.id
+    end
+  end
+end
diff --git a/app/models/concerns/bulkrax/has_matchers.rb b/app/models/concerns/bulkrax/has_matchers.rb
@@ -59,7 +59,7 @@ def field_supported?(field)
       field = field.gsub('_attributes', '')
 
       return false if excluded?(field)
-      return true if ['file', 'remote_files', 'model'].include?(field)
+      return true if ['file', 'remote_files', 'model', 'delete'].include?(field)
       return factory_class.method_defined?(field) && factory_class.properties[field].present?
     end
 

diff --git a/app/models/concerns/bulkrax/import_behavior.rb b/app/models/concerns/bulkrax/import_behavior.rb
@@ -10,7 +10,7 @@ def build_for_importer
         build_metadata
         unless self.importerexporter.validate_only
           raise CollectionsCreatedError unless collections_created?
-          @item = factory.run
+          @item = factory.run!
         end
       rescue RSolr::Error::Http, CollectionsCreatedError => e
         raise e

diff --git a/app/parsers/bulkrax/csv_parser.rb b/app/parsers/bulkrax/csv_parser.rb
@@ -65,12 +65,22 @@ def create_collections
 
     def create_works
       records.each_with_index do |record, index|
-        next if record[:source_identifier].blank?
+        if record[:source_identifier].blank?
+          current_importer_run.invalid_records ||= ""
+          current_importer_run.invalid_records += "Missing #{Bulkrax.system_identifier_field} for #{record.to_h}\n"
+          current_importer_run.failed_records += 1
+          current_importer_run.save
+          next
+        end
         break if limit_reached?(limit, index)
 
         seen[record[:source_identifier]] = true
         new_entry = find_or_create_entry(entry_class, record[:source_identifier], 'Bulkrax::Importer', record.to_h.compact)
-        ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id)
+        if record[:delete].present?
+          DeleteWorkJob.send(perform_method, new_entry, current_importer_run)
+        else
+          ImportWorkJob.send(perform_method, new_entry.id, current_importer_run.id)
+        end
         increment_counters(index)
       end
       status_info

diff --git a/app/parsers/bulkrax/oai_dc_parser.rb b/app/parsers/bulkrax/oai_dc_parser.rb
@@ -85,12 +85,13 @@ def create_works
       return unless results.present?
       results.full.each_with_index do |record, index|
         break if limit_reached?(limit, index)
+        seen[record.identifier] = true
+        new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create!
         if record.deleted? # TODO: record.status == "deleted"
+          DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_importer_run)
           importerexporter.current_importer_run.deleted_records += 1
           importerexporter.current_importer_run.save!
         else
-          seen[record.identifier] = true
-          new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: record.identifier).first_or_create!
           ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_importer_run.id)
           increment_counters(index)
         end

diff --git a/app/views/bulkrax/entries/show.html.erb b/app/views/bulkrax/entries/show.html.erb
@@ -54,7 +54,7 @@
       <% elsif @entry.last_succeeded_at != nil %>
         <strong>Succeeded At:</strong> <%= @entry.last_succeeded_at %>
       <% else %>
-        <strong>Succeeded At:</strong> Item has not been <%= @importer.present? ? 'imported' : 'exported' %> successfully
+        <strong>Succeeded At:</strong> Item has not yet been <%= @importer.present? ? 'imported' : 'exported' %> successfully
       <% end %>
     </p>
 
@@ -79,7 +79,7 @@
             <%= link_to @entry.factory_class.to_s, main_app.polymorphic_path(factory_record) %>
           <% end %>
         <% else %>
-          <strong>Item Link:</strong> Item has not been imported successfully
+          <strong>Item Link:</strong> Item has not yet been imported successfully
         <% end %>
       <% else %>
         <% record = @entry&.hyrax_record %>

diff --git a/app/views/bulkrax/importers/show.html.erb b/app/views/bulkrax/importers/show.html.erb
@@ -47,6 +47,9 @@
         <strong>Error:</strong> <%= @importer.last_error['error_class'] %> - <%= @importer.last_error['error_message'] %><br />
         <strong>Error Trace:</strong> <button id='err_toggle'>Toggle</button><br>
         <span id='error_trace' style='display: none'><%= @importer.last_error['error_trace'] %></span>
+      <% elsif @importer.importer_runs.last&.invalid_records&.present? %>
+        <strong>Error Trace:</strong> <button id='err_toggle'>Toggle</button><br>
+        <span id='error_trace' style='display: none'><%= @importer.importer_runs.last.invalid_records %></span>
       <% elsif @importer.last_succeeded_at.present?  %>
         <strong>Succeeded At:</strong> <%= @importer.last_succeeded_at %>
       <% end %>

diff --git a/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb b/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb
@@ -0,0 +1,5 @@
+class AddInvalidRecordToImporterRun < ActiveRecord::Migration[5.1]
+  def change
+    add_column :bulkrax_importer_runs, :invalid_records, :text
+  end
+end