diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..c99d2e73 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/Gemfile b/Gemfile index 2a534f54..e9db0eca 100644 --- a/Gemfile +++ b/Gemfile @@ -11,4 +11,7 @@ gemspec # your gem to rubygems.org. # To use a debugger -# gem 'byebug', group: [:development, :test] +gem 'byebug', group: [:development, :test] +gem "rspec-rails" +gem "factory_bot_rails" +gem "oai" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..e6b39818 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,167 @@ +PATH + remote: . + specs: + bulkrax (0.1.0) + iso8601 (~> 0.9.0) + language_list (~> 1.2, >= 1.2.1) + libxml-ruby (~> 3.1.0) + oai (~> 0.4) + rails (~> 5.1.6) + simple_form (~> 3.2, <= 3.5.0) + +GEM + remote: https://rubygems.org/ + specs: + actioncable (5.1.6) + actionpack (= 5.1.6) + nio4r (~> 2.0) + websocket-driver (~> 0.6.1) + actionmailer (5.1.6) + actionpack (= 5.1.6) + actionview (= 5.1.6) + activejob (= 5.1.6) + mail (~> 2.5, >= 2.5.4) + rails-dom-testing (~> 2.0) + actionpack (5.1.6) + actionview (= 5.1.6) + activesupport (= 5.1.6) + rack (~> 2.0) + rack-test (>= 0.6.3) + rails-dom-testing (~> 2.0) + rails-html-sanitizer (~> 1.0, >= 1.0.2) + actionview (5.1.6) + activesupport (= 5.1.6) + builder (~> 3.1) + erubi (~> 1.4) + rails-dom-testing (~> 2.0) + rails-html-sanitizer (~> 1.0, >= 1.0.3) + activejob (5.1.6) + activesupport (= 5.1.6) + globalid (>= 0.3.6) + activemodel (5.1.6) + activesupport (= 5.1.6) + activerecord (5.1.6) + activemodel (= 5.1.6) + activesupport (= 5.1.6) + arel (~> 8.0) + activesupport (5.1.6) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) + minitest (~> 5.1) + tzinfo (~> 1.1) + arel (8.0.0) + builder (3.2.3) + byebug (10.0.2) + concurrent-ruby (1.0.5) + crass (1.0.4) + diff-lcs (1.3) + erubi (1.7.1) + factory_bot (4.11.1) + activesupport (>= 3.0.0) + factory_bot_rails (4.11.1) + factory_bot (~> 4.11.1) + railties (>= 3.0.0) + faraday (0.15.3) + 
multipart-post (>= 1.2, < 3) + faraday_middleware (0.12.2) + faraday (>= 0.7.4, < 1.0) + globalid (0.4.1) + activesupport (>= 4.2.0) + i18n (1.1.0) + concurrent-ruby (~> 1.0) + iso8601 (0.9.1) + language_list (1.2.1) + libxml-ruby (3.1.0) + loofah (2.2.2) + crass (~> 1.0.2) + nokogiri (>= 1.5.9) + mail (2.7.1) + mini_mime (>= 0.1.1) + method_source (0.9.0) + mini_mime (1.0.1) + mini_portile2 (2.3.0) + minitest (5.11.3) + multipart-post (2.0.0) + nio4r (2.3.1) + nokogiri (1.8.5) + mini_portile2 (~> 2.3.0) + oai (0.4.0) + builder (>= 3.1.0) + faraday + faraday_middleware + rack (2.0.5) + rack-test (1.1.0) + rack (>= 1.0, < 3) + rails (5.1.6) + actioncable (= 5.1.6) + actionmailer (= 5.1.6) + actionpack (= 5.1.6) + actionview (= 5.1.6) + activejob (= 5.1.6) + activemodel (= 5.1.6) + activerecord (= 5.1.6) + activesupport (= 5.1.6) + bundler (>= 1.3.0) + railties (= 5.1.6) + sprockets-rails (>= 2.0.0) + rails-dom-testing (2.0.3) + activesupport (>= 4.2.0) + nokogiri (>= 1.6) + rails-html-sanitizer (1.0.4) + loofah (~> 2.2, >= 2.2.2) + railties (5.1.6) + actionpack (= 5.1.6) + activesupport (= 5.1.6) + method_source + rake (>= 0.8.7) + thor (>= 0.18.1, < 2.0) + rake (12.3.1) + rspec-core (3.8.0) + rspec-support (~> 3.8.0) + rspec-expectations (3.8.2) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.8.0) + rspec-mocks (3.8.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.8.0) + rspec-rails (3.8.0) + actionpack (>= 3.0) + activesupport (>= 3.0) + railties (>= 3.0) + rspec-core (~> 3.8.0) + rspec-expectations (~> 3.8.0) + rspec-mocks (~> 3.8.0) + rspec-support (~> 3.8.0) + rspec-support (3.8.0) + simple_form (3.5.0) + actionpack (> 4, < 5.2) + activemodel (> 4, < 5.2) + sprockets (3.7.2) + concurrent-ruby (~> 1.0) + rack (> 1, < 3) + sprockets-rails (3.2.1) + actionpack (>= 4.0) + activesupport (>= 4.0) + sprockets (>= 3.0.0) + sqlite3 (1.3.13) + thor (0.20.0) + thread_safe (0.3.6) + tzinfo (1.2.5) + thread_safe (~> 0.1) + websocket-driver (0.6.5) + 
+ websocket-extensions (>= 0.1.0) + websocket-extensions (0.1.3) + +PLATFORMS + ruby + +DEPENDENCIES + bulkrax! + byebug + factory_bot_rails + oai + rspec-rails + sqlite3 + +BUNDLED WITH + 1.17.2 diff --git a/app/assets/javascripts/bulkrax/application.js b/app/assets/javascripts/bulkrax/application.js index e54c6461..dd80ecef 100644 --- a/app/assets/javascripts/bulkrax/application.js +++ b/app/assets/javascripts/bulkrax/application.js @@ -10,4 +10,5 @@ // Read Sprockets README (https://github.com/rails/sprockets#sprockets-directives) for details // about supported directives. // + //= require_tree . diff --git a/app/assets/javascripts/bulkrax/importers.js b/app/assets/javascripts/bulkrax/importers.js new file mode 100644 index 00000000..e9838e1b --- /dev/null +++ b/app/assets/javascripts/bulkrax/importers.js @@ -0,0 +1,66 @@ +// Place all the behaviors and hooks related to the matching controller here. +// All this logic will automatically be available in application.js. +$(document).ready(function() { + var refresh_button = $('.refresh-set-source') + var base_url = $('#importer_parser_fields_base_url') + var external_set_select = $("#importer_parser_fields_set") + var initial_base_url = base_url.val() + + // handle refreshing/loading of external sets via button click + $('body').on('click', '.refresh-set-source', function(e) { + e.preventDefault() + + handleSourceLoad(refresh_button, base_url, external_set_select) + }) + + // handle refreshing/loading of external sets via blur event for the base_url field + $('body').on('blur', '#importer_parser_fields_base_url', function(e) { + e.preventDefault() + + // ensure we don't make another query if the value is the same -- this can be forced by clicking the refresh button + if (initial_base_url != base_url.val()) { + handleSourceLoad(refresh_button, base_url, external_set_select) + initial_base_url = base_url.val() + } + }) +}); + +function handleSourceLoad(refresh_button, base_url, external_set_select) { + if 
(base_url.val() == "") { // ignore empty base_url value + return + } + + var initial_button_text = refresh_button.html() + + refresh_button.html('Refreshing...') + refresh_button.attr('disabled', true) + + $.post('/importers/external_sets', { + base_url: base_url.val(), + }, function(res) { + if (!res.error) { + genExternalSetOptions(external_set_select, res.sets) // sets is [[name, spec]...] + } else { + setError(external_set_select, res.error) + } + + refresh_button.html(initial_button_text) + refresh_button.attr('disabled', false) + }) +} + +function genExternalSetOptions(selector, sets) { + out = '' + + out += sets.map(function(set) { + return '' + }) + + selector.html(out) + selector.attr('disabled', false) +} + +function setError(selector, error) { + selector.html('') + selector.attr('disabled', true) +} diff --git a/app/controllers/bulkrax/importers_controller.rb b/app/controllers/bulkrax/importers_controller.rb new file mode 100644 index 00000000..61702431 --- /dev/null +++ b/app/controllers/bulkrax/importers_controller.rb @@ -0,0 +1,108 @@ +require_dependency "bulkrax/application_controller" +require_dependency "oai" + +module Bulkrax + class ImportersController < ApplicationController + include Hyrax::ThemedLayoutController + + before_action :set_importer, only: [:show, :edit, :update, :destroy] + with_themed_layout 'dashboard' + + # GET /importers + def index + add_breadcrumb t(:'hyrax.controls.home'), main_app.root_path + add_breadcrumb t(:'hyrax.dashboard.breadcrumbs.admin'), hyrax.dashboard_path + add_breadcrumb 'Importers', bulkrax.importers_path + @importers = Importer.all + end + + # GET /importers/1 + def show + end + + # GET /importers/new + def new + add_breadcrumb t(:'hyrax.controls.home'), main_app.root_path + add_breadcrumb t(:'hyrax.dashboard.breadcrumbs.admin'), hyrax.dashboard_path + add_breadcrumb 'Importers', bulkrax.importers_path + @importer = Importer.new + end + + # GET /importers/1/edit + def edit + add_breadcrumb 
t(:'hyrax.controls.home'), main_app.root_path + add_breadcrumb t(:'hyrax.dashboard.breadcrumbs.admin'), hyrax.dashboard_path + add_breadcrumb 'Importers', bulkrax.importers_path + end + + # POST /importers + def create + @importer = Importer.new(importer_params) + + if @importer.save + redirect_to @importer, notice: 'Importer was successfully created.' + else + render :new + end + end + + # PATCH/PUT /importers/1 + def update + if @importer.update(importer_params) + redirect_to @importer, notice: 'Importer was successfully updated.' + else + render :edit + end + end + + # DELETE /importers/1 + def destroy + @importer.destroy + redirect_to importers_url, notice: 'Importer was successfully destroyed.' + end + + def external_sets + if list_external_sets + render json: { base_url: params[:base_url], sets: @sets } + else + render json: { base_url: params[:base_url], error: "unable to pull data from #{params[:base_url]}" } + end + end + + private + # Use callbacks to share common setup or constraints between actions. + def set_importer + @importer = Importer.find(params[:id]) + end + + # Only allow a trusted parameter "white list" through. + def importer_params + params.require(:importer).permit(:name, :admin_set_id, :user_id, :frequency, :parser_klass, :limit, parser_fields: {}, field_mapping: {}) + end + + def list_external_sets + url = params[:base_url] || (@harvester ? @harvester.base_url : nil) + setup_client(url) if url.present? + + @sets = [['All', 'all']] + + begin + @client.list_sets.each do |s| + @sets << [s.name, s.spec] + end + rescue + return false + end + + @sets + end + + def setup_client(url) + return false if url.nil? 
+ + headers = { from: 'server@atla.com' } + + @client ||= OAI::Client.new(url, headers: headers, parser: 'libxml', metadata_prefix: 'oai_dc') + end + end +end diff --git a/app/helpers/bulkrax/application_helper.rb b/app/helpers/bulkrax/application_helper.rb index 1a53cddf..2a798d27 100644 --- a/app/helpers/bulkrax/application_helper.rb +++ b/app/helpers/bulkrax/application_helper.rb @@ -1,4 +1,6 @@ module Bulkrax module ApplicationHelper + include Hyrax::HyraxHelperBehavior + end end diff --git a/app/helpers/bulkrax/importers_helper.rb b/app/helpers/bulkrax/importers_helper.rb new file mode 100644 index 00000000..521da68a --- /dev/null +++ b/app/helpers/bulkrax/importers_helper.rb @@ -0,0 +1,11 @@ +module Bulkrax + module ImportersHelper + # borrowed from batch-importer https://github.com/samvera-labs/hyrax-batch_ingest/blob/master/app/controllers/hyrax/batch_ingest/batches_controller.rb + def available_admin_sets + # Restrict available_admin_sets to only those current user can deposit to. 
+ @available_admin_sets ||= Hyrax::Collections::PermissionsService.source_ids_for_deposit(ability: current_ability, source_type: 'admin_set').map do |admin_set_id| + [AdminSet.find(admin_set_id).title.first, admin_set_id] + end + end + end +end diff --git a/app/jobs/bulkrax/import_work_job.rb b/app/jobs/bulkrax/import_work_job.rb new file mode 100644 index 00000000..204e1f21 --- /dev/null +++ b/app/jobs/bulkrax/import_work_job.rb @@ -0,0 +1,10 @@ +module Bulkrax + class ImportWorkJob < ApplicationJob + queue_as :import + + def perform(*args) + @importer = Importer.find(args[0]) + @importer.import_work(args[1]) + end + end +end diff --git a/app/jobs/bulkrax/importer_job.rb b/app/jobs/bulkrax/importer_job.rb new file mode 100644 index 00000000..d2695674 --- /dev/null +++ b/app/jobs/bulkrax/importer_job.rb @@ -0,0 +1,17 @@ +module Bulkrax + class ImporterJob < ApplicationJob + queue_as :import + + def perform(importer_id, only_updates_since_last_import=false) + start = Time.current + importer = Importer.find(id) + + importer.import_works + importer.last_imported_at = start + if importer.schedulable? 
+ ImporterJob.set(wait_until: importer.next_import_at).perform_later(importer.id, true) + end + + end + end +end diff --git a/app/models/bulkrax/entries/application_entry.rb b/app/models/bulkrax/entries/application_entry.rb new file mode 100644 index 00000000..4884b696 --- /dev/null +++ b/app/models/bulkrax/entries/application_entry.rb @@ -0,0 +1,12 @@ +module Bulkrax + module Entries + class ApplicationEntry + + def build + # attributes, files_dir = nil, files = [], user = nil + Bulkrax::Factories::ApplicationFactory.for(entry_class).new(all_attrs, nil, [], user).run + end + + end + end +end diff --git a/app/models/bulkrax/entries/oai_entry.rb b/app/models/bulkrax/entries/oai_entry.rb new file mode 100644 index 00000000..9bcd0178 --- /dev/null +++ b/app/models/bulkrax/entries/oai_entry.rb @@ -0,0 +1,46 @@ +module Bulkrax + module Entries + class OaiEntry < ApplicationEntry + attr_accessor :parser, :importer, :raw_record, :parsed_record, :all_attrs, :identifier + + delegate :client, + :mapping_class, + :collection_name, + :user, + to: :parser + + def initialize(parser, identifier) + @parser= parser + @identifier = identifier + end + + def entry_class + 'ETD' + end + + def raw_record + @raw_record ||= client.get_record({identifier: identifier}) + end + + def mapping + @mapping ||= mapping_class.new( + raw_record, + parser.parser_fields['rights_statement'], + parser.parser_fields['institution_name'], + parser.parser_fields['thumbnail_url'], + collection_name == "all" + ) + end + + def all_attrs + return @all_attrs if @all_attrs + @all_attrs ||= mapping.all_attrs + unless collection_name == "all" + @all_attrs['collection'] = {identifier: [collection_name]} + end + return @all_attrs + end + + end + end +end diff --git a/app/models/bulkrax/factories/application_factory.rb b/app/models/bulkrax/factories/application_factory.rb new file mode 100644 index 00000000..e381f8e8 --- /dev/null +++ b/app/models/bulkrax/factories/application_factory.rb @@ -0,0 +1,20 @@ +module 
Bulkrax + module Factories + class ApplicationFactory + extend ActiveSupport::Autoload + + eager_autoload do + autoload :CollectionFactory + autoload :ETDFactory + autoload :ImageFactory + autoload :ObjectFactory + autoload :WithAssociatedCollection + end + + # @param [#to_s] First (Xxx) portion of an "XxxFactory" constant + def self.for(model_name) + const_get "Bulkrax::Factories::#{model_name}Factory" + end + end + end +end diff --git a/app/models/bulkrax/factories/collection_factory.rb b/app/models/bulkrax/factories/collection_factory.rb new file mode 100644 index 00000000..bc4a23df --- /dev/null +++ b/app/models/bulkrax/factories/collection_factory.rb @@ -0,0 +1,23 @@ +module Bulkrax + module Factories + class CollectionFactory < ObjectFactory + self.klass = Collection + self.system_identifier_field = :identifier + + def find_or_create + collection = find + return collection if collection + run(&:save!) + end + + def update + raise "Collection doesn't exist" unless object + object.attributes = update_attributes + run_callbacks(:save) do + object.save! + end + log_updated(object) + end + end + end +end diff --git a/app/models/bulkrax/factories/etd_factory.rb b/app/models/bulkrax/factories/etd_factory.rb new file mode 100644 index 00000000..79675d86 --- /dev/null +++ b/app/models/bulkrax/factories/etd_factory.rb @@ -0,0 +1,16 @@ +module Bulkrax + module Factories + class ETDFactory < ObjectFactory + include WithAssociatedCollection + + self.klass = Work + # A way to identify objects that are not Hydra minted identifiers + self.system_identifier_field = 'identifier' + + # TODO: add resource type? 
+ # def create_attributes + # #super.merge(resource_type: 'ETD') + # end + end + end +end diff --git a/app/models/bulkrax/factories/image_factory.rb b/app/models/bulkrax/factories/image_factory.rb new file mode 100644 index 00000000..88208e35 --- /dev/null +++ b/app/models/bulkrax/factories/image_factory.rb @@ -0,0 +1,16 @@ +module Bulkrax + module Factories + class ImageFactory < ObjectFactory + include WithAssociatedCollection + + self.klass = Image + # A way to identify objects that are not Hydra minted identifiers + self.system_identifier_field = :identifier + + # TODO: add resource type? + # def create_attributes + # #super.merge(resource_type: 'Image') + # end + end + end +end diff --git a/app/models/bulkrax/factories/object_factory.rb b/app/models/bulkrax/factories/object_factory.rb new file mode 100644 index 00000000..1fd9ff50 --- /dev/null +++ b/app/models/bulkrax/factories/object_factory.rb @@ -0,0 +1,156 @@ +# TODO require 'importer/log_subscriber' +module Bulkrax + module Factories + class ObjectFactory + extend ActiveModel::Callbacks + define_model_callbacks :save, :create + class_attribute :klass, :system_identifier_field + attr_reader :attributes, :files_directory, :object, :files + + def initialize(attributes, files_dir = nil, files = [], user = nil) + @attributes = ActiveSupport::HashWithIndifferentAccess.new(attributes) + @files_directory = files_dir + @files = files + @user = user || User.batch_user + end + + def run + arg_hash = { id: attributes[:id], name: 'UPDATE', klass: klass } + @object = find + if @object + ActiveSupport::Notifications.instrument('import.importer', arg_hash) { update } + else + ActiveSupport::Notifications.instrument('import.importer', arg_hash.merge(name: 'CREATE')) { create } + end + yield(object) if block_given? 
+ object + end + + def update + raise "Object doesn't exist" unless object + run_callbacks(:save) do + work_actor.update(environment(update_attributes)) + end + log_updated(object) + end + + def create_attributes + transform_attributes + end + + def update_attributes + transform_attributes.except(:id) + end + + def find + return find_by_id if attributes[:id] + return search_by_identifier if attributes[system_identifier_field].present? + + raise "Missing identifier: Unable to search for existing object without " \ + "either fedora ID or #{system_identifier_field}" + end + + def find_by_id + klass.find(attributes[:id]) if klass.exists?(attributes[:id]) + end + + def search_by_identifier + query = { system_identifier_field => + attributes[system_identifier_field] } + klass.where(query).first + end + + # An ActiveFedora bug when there are many habtm <-> has_many associations means they won't all get saved. + # https://github.com/projecthydra/active_fedora/issues/874 + # 2+ years later, still open! + def create + attrs = create_attributes + @object = klass.new + run_callbacks :save do + run_callbacks :create do + Rails.logger.debug("============= 6") + klass == Collection ? 
create_collection(attrs) : work_actor.create(environment(attrs)) + Rails.logger.debug("============= 7") + + end + end + log_created(object) + end + + def log_created(obj) + msg = "Created #{klass.model_name.human} #{obj.id}" + Rails.logger.info("#{msg} (#{Array(attributes[system_identifier_field]).first})") + end + + def log_updated(obj) + msg = "Updated #{klass.model_name.human} #{obj.id}" + Rails.logger.info("#{msg} (#{Array(attributes[system_identifier_field]).first})") + end + + private + + # @param [Hash] attrs the attributes to put in the environment + # @return [Hyrax::Actors::Environment] + def environment(attrs) + Hyrax::Actors::Environment.new(@object, Ability.new(@user), attrs) + end + + def work_actor + Hyrax::CurationConcern.actor + end + + def create_collection(attrs) + Rails.logger.debug("============= 8") + + @object.attributes = attrs + @object.apply_depositor_metadata(@user) + Rails.logger.debug("============= 9") + + @object.save! + end + + # Override if we need to map the attributes from the parser in + # a way that is compatible with how the factory needs them. + def transform_attributes + attributes.slice(*permitted_attributes) + .merge(file_attributes) + end + + # Find existing file or upload new file. This assumes a Work will have unique file titles; + # could filter by URIs instead (slower). + # When an uploaded_file already exists we do not want to pass its id in `file_attributes` + # otherwise it gets reuploaded by `work_actor`. + def upload_ids + work_files_titles = object.file_sets.map(&:title) if object.present? && object.file_sets.present? + work_files_titles && work_files_titles.include?(attributes[:file]) ? [] : [import_file(file_paths.first)] + end + + def file_attributes + hash = {} + hash[:uploaded_files] = upload_ids if files_directory.present? && attributes[:file].present? + hash[:remote_files] = attributes[:remote_files] if attributes[:remote_files].present? 
+ hash + end + + def file_paths + attributes[:file].map { |file_name| File.join(files_directory, file_name) } if attributes[:file] + end + + def import_file(path) + u = Hyrax::UploadedFile.new + u.user_id = @user.id + u.file = CarrierWave::SanitizedFile.new(path) + u.save + u.id + end + + ## TO DO: handle invalid file in CSV + ## currently the importer stops if no file corresponding to a given file_name is found + + # Regardless of what the MODS Parser gives us, these are the properties we are prepared to accept. + def permitted_attributes + klass.properties.keys.map(&:to_sym) + %i[id edit_users edit_groups read_groups visibility] + end + end + end +end diff --git a/app/models/bulkrax/factories/with_associated_collection.rb b/app/models/bulkrax/factories/with_associated_collection.rb new file mode 100644 index 00000000..49ea8c24 --- /dev/null +++ b/app/models/bulkrax/factories/with_associated_collection.rb @@ -0,0 +1,29 @@ +module Bulkrax + module Factories + module WithAssociatedCollection + extend ActiveSupport::Concern + + # Strip out the :collection key, and add the member_of_collection_ids, + # which is used by Hyrax::Actors::AddAsMemberOfCollectionsActor + def create_attributes + return super if attributes[:collection].nil? + super.except(:collection).merge(member_of_collection_ids: [collection.id]) + end + + # Strip out the :collection key, and add the member_of_collection_ids, + # which is used by Hyrax::Actors::AddAsMemberOfCollectionsActor + def update_attributes + return super if attributes[:collection].nil? 
+ super.except(:collection).merge(member_of_collection_ids: [collection.id]) + end + + private + + def collection + @collection ||= CollectionFactory.new(attributes.fetch(:collection)).find_or_create + @collection.reindex_extent = Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX + return @collection + end + end + end +end diff --git a/app/models/bulkrax/importer.rb b/app/models/bulkrax/importer.rb new file mode 100644 index 00000000..cad8d8a6 --- /dev/null +++ b/app/models/bulkrax/importer.rb @@ -0,0 +1,115 @@ +require 'iso8601' + +module Bulkrax + class Importer < ApplicationRecord + serialize :parser_fields, JSON + serialize :field_mapping, JSON + + belongs_to :user + has_many :importer_runs, dependent: :destroy, foreign_key: 'bulkrax_importer_id' + + validates :name, presence: true + validates :admin_set_id, presence: true + # TODO validates :metadata_prefix, presence: true + # TODO validates :base_url, presence: true + + def parser + # create an parser based on importer + @parser ||= self.parser_klass.constantize.new( + self + # self.file_url, + # # self.right_statement, + # # self.institution_name, + # self.user, + # self.admin_set_id, + # self.parser_fields, + # self.field_mapping + # # self.external_set_id, + ) + end + + def frequency_enums + # these duration values use ISO 8601 Durations (https://en.wikipedia.org/wiki/ISO_8601#Durations) + # TLDR; all durations are prefixed with 'P' and the parts are a number with the type of duration. + # i.e. P1Y2M3W4DT5H6M7S == 1 Year, 2 Months, 3 Weeks, 4 Days, 5 Hours, 6 Minutes, 7 Seconds + [['Daily', 'P1D'], ['Monthly', 'P1M'], ['Yearly', 'P1Y'], ['Once (on save)', 'PT0S']] + end + + def frequency=(frequency) + write_attribute(:frequency, ISO8601::Duration.new(frequency).to_s) + end + + def frequency + ISO8601::Duration.new read_attribute(:frequency) if read_attribute(:frequency) + end + + def schedulable? 
+ frequency.to_seconds != 0 + end + + def next_import_at + (last_imported_at || Time.current) + frequency.to_seconds if schedulable? and last_imported_at.present? + end + + def current_importer_run + @current_importer_run ||= self.importer_runs.create!(total_records: self.limit) + end + + def seen + @seen ||= {} + end + + def import_works + parser.create_collections + parser.records(quick: true).each_with_index do |record, index| + if !limit.nil? && index >= limit + break + elsif record.deleted? # TODO record.status == "deleted" + self.current_importer_run.deleted_records += 1 + else + seen[record.identifier] = true + ImportWorkJob.perform_later(self.id, record.identifier) + self.increment_counters(index) + end + current_importer_run.save + end + + remove_unseen + end + + def remove_unseen + # TODO + # if primary_collection + # primary_collection.member_ids.each do |id| + # w = Work.find id + # unless seen[w.source[0]] + # if w.in_collections.size > 1 + # primary_collection.members.delete w # only removes from primary collection - wants the record, not the id + # primary_collection.save + # else + # w.delete # removes from all collections + # end + # end + # end + # end + end + + def import_work(identifier) + entry = parser.entry(identifier) + entry.build + end + + def increment_counters(index) + if limit.to_i > 0 + current_importer_run.total_records = limit + elsif parser.total > 0 + current_importer_run.total_records = parser.total + else + current_importer_run.total_records = index + 1 + end + current_importer_run.enqueued_records = index + 1 + current_importer_run.save! 
+ end + + end +end diff --git a/app/models/bulkrax/importer_run.rb b/app/models/bulkrax/importer_run.rb new file mode 100644 index 00000000..bbd3626f --- /dev/null +++ b/app/models/bulkrax/importer_run.rb @@ -0,0 +1,5 @@ +module Bulkrax + class ImporterRun < ApplicationRecord + belongs_to :importer, foreign_key: 'bulkrax_importer_id' + end +end diff --git a/app/models/bulkrax/mappings/application_mapping.rb b/app/models/bulkrax/mappings/application_mapping.rb new file mode 100644 index 00000000..8cea050d --- /dev/null +++ b/app/models/bulkrax/mappings/application_mapping.rb @@ -0,0 +1,110 @@ +require 'language_list' +require 'erb' +require 'ostruct' + +module Bulkrax + module Mappings + class ApplicationMapping + attr_accessor :record, :rights_statement, :contributing_institution, :thumbnail_url, :all + class_attribute :matchers + + def initialize(record, rights_statement, contributing_institution, thumbnail_url, all = false) + @record = record.record + @rights_statement = rights_statement + @contributing_institution = contributing_institution + @thumbnail_url = thumbnail_url + @all = all + end + + def self.matcher(name, args={}) + self.matchers ||= {} + from = args[:from] || [name] + + matcher = matcher_class.new( + to: name, + from: from, + parsed: args[:parsed], + split: args[:split], + if: args[:if] + ) + + from.each do |lookup| + self.matchers[lookup] = matcher + end + end + + def metadata + return @metadata if @metadata + + @metadata = {} + record.metadata.children.each do |child| + child.children.each do |node| + add_metadata(node.name, node.content) + end + end + +# TODO go through all parer_fields and add them? 
+ add_metadata('thumbnail_url', thumbnail_url) + + @metadata['contributing_institution'] = [contributing_institution] + @metadata['rights_statement'] = [rights_statement] + @metadata['visibility'] = 'open' + + @metadata + end + + def add_metadata(node_name, node_content) + matcher = self.class.matchers[node_name] + + if matcher + result = matcher.result(self, node_content) + if result + key = matcher.to + @metadata[key] ||= [] + + if result.is_a?(Array) + @metadata[key] += result + else + @metadata[key] << result + end + end + end + end + + def all_attrs + merge_attrs(header, metadata) + end + + def context + @context ||= OpenStruct.new(record: record, identifier: record.header.identifier) + end + + def thumbnail_url + ERB.new(@thumbnail_url).result(context.instance_eval { binding }) + end + + def header + { + 'source' => [record.header.identifier] + } + end + + def merge_attrs(first, second) + return first if second.blank? + + first = {} if first.blank? + + first.merge(second) do |key, old, new| + if key =~ /identifier/ + merged_value = old if old.first =~ /^http/ + merged_value = new if new.first =~ /^http/ + else + merged_value = old + new + end + merged_value + end + end + + end + end +end diff --git a/app/models/bulkrax/mappings/oai_mapping.rb b/app/models/bulkrax/mappings/oai_mapping.rb new file mode 100644 index 00000000..19638011 --- /dev/null +++ b/app/models/bulkrax/mappings/oai_mapping.rb @@ -0,0 +1,24 @@ +module Bulkrax + module Mappings + class OaiMapping < ApplicationMapping + def self.matcher_class + Matchers::OaiMatcher + end + + matcher 'contributor', split: true + matcher 'creator', split: true + matcher 'date', from: ['date'], split: true + matcher 'description' + matcher 'format_digital', from: ['format_digital', 'format'], parsed: true + matcher 'identifier', from: ['identifier'], if: ->(parser, content) { content.match(/http(s{0,1}):\/\//) } + matcher 'language', parsed: true, split: true + matcher 'place', from: ['coverage'] + matcher 
'publisher', split: /\s*[;]\s*/ + matcher 'relation', split: true + matcher 'subject', split: true + matcher 'title' + matcher 'types', from: ['types', 'type'], split: true, parsed: true + matcher 'remote_files', from: ['thumbnail_url'], parsed: true + end + end +end diff --git a/app/models/bulkrax/matchers/application_matcher.rb b/app/models/bulkrax/matchers/application_matcher.rb new file mode 100644 index 00000000..39400fd2 --- /dev/null +++ b/app/models/bulkrax/matchers/application_matcher.rb @@ -0,0 +1,39 @@ +module Bulkrax + module Matchers + class ApplicationMatcher + attr_accessor :to, :from, :parsed, :if, :split + + def initialize(args) + args.each do |k, v| + send("#{k}=", v) + end + end + + def result(parser, content) + return nil if self.if && !self.if.call(parser, content) + + @result = content.gsub(/\s/, ' ') # remove any line feeds and tabs + + if self.split.is_a?(Regexp) + @result = @result.split(self.split) + elsif self.split + @result = @result.split(/\s*[:;|]\s*/) # default split by : ; | + end + + if @result.is_a?(Array) && @result.size == 1 + @result = @result[0] + end + + if @result.is_a?(Array) && self.parsed + @result.each_with_index do |res, index| + @result[index] = send("parse_#{to}", res) + end + elsif self.parsed + @result = send("parse_#{to}", @result) + end + + return @result + end + end + end +end diff --git a/app/models/bulkrax/matchers/oai_matcher.rb b/app/models/bulkrax/matchers/oai_matcher.rb new file mode 100644 index 00000000..71dea4a4 --- /dev/null +++ b/app/models/bulkrax/matchers/oai_matcher.rb @@ -0,0 +1,67 @@ +module Bulkrax + module Matchers + class OaiMatcher < ApplicationMatcher + def parse_remote_files(src) + {url: src} + end + + def parse_language(src) + l = LanguageList::LanguageInfo.find(src) + return l ? 
l.name : src + end + + def parse_types(src) + src.to_s.titleize + end + + def parse_format_original(src) + src.to_s.titleize + end + + def parse_format_digital(src) + case src + when 'application/pdf','pdf', 'PDF' + 'PDF' + when 'image/jpeg', 'image/jpg', 'jpeg', 'jpg', 'JPEG', 'JPG' + 'JPEG' + when 'image/tiff', 'image/tif', 'tiff', 'tif', 'TIFF', 'TIF' + 'TIFF' + when 'image/jp2', 'jp2', 'JP2' + 'JP2' + when 'image/png', 'png', 'PNG' + 'PNG' + when 'image/gif', 'gif', 'GIF' + 'GIF' + when 'video/mp4', 'mp4', 'MP4' + 'MP4' + when 'video/ogg', 'ogg', 'OGG' + 'OGG' + when 'video/vnd.avi', 'video/avi', 'avi', 'AVI' + 'AVI' + when 'audio/aac', 'aac', 'AAC' + 'AAC' + when 'audio/mp4', 'mp4', 'MP4' + 'MP4' + when 'audio/mpeg', 'audio/mp3', 'audio/mpeg3', 'mpeg', 'MPEG', 'mp3', 'MP3', 'mpeg3', 'MPEG3' + 'MPEG' + when 'audio/ogg', 'ogg', 'OGG' + 'OGG' + when 'audio/aiff', 'aiff', 'AIFF' + 'AIFF' + when 'audio/webm', 'webm', 'WEBM' + 'WEBM' + when 'audio/wav', 'wav', 'WAV' + 'WAV' + when 'text/csv', 'csv', 'CSV' + 'CSV' + when 'text/html', 'html', 'HTML' + 'HTML' + when 'text/rtf', 'rtf', 'RTF' + 'RTF' + else + src.to_s.titleize + end + end + end + end +end diff --git a/app/models/bulkrax/parsers/application_parser.rb b/app/models/bulkrax/parsers/application_parser.rb new file mode 100644 index 00000000..65df0ccd --- /dev/null +++ b/app/models/bulkrax/parsers/application_parser.rb @@ -0,0 +1,45 @@ +module Bulkrax + module Parsers + class ApplicationParser + + #attr_accessor :url, :headers, :file_url, :user, :admin_set_id, :rights, :institution, :total, :client, :collection_name, :metadata_prefix + attr_accessor :importer, :total + + def self.parser_fields + {} + end + + def initialize(importer) + @importer = importer + end + + # @api + def entry_class + raise 'must be defined' + end + + # @api + def mapping_class + raise 'must be defined' + end + + # @api + def records(opts = {}) + raise 'must be defined' + end + + def record(identifier, opts = {}) + return @record if 
@record + + @record = entry_class.new(self, identifier) + @record.build + return @record + end + + def total + 0 + end + + end + end +end diff --git a/app/models/bulkrax/parsers/oai_parser.rb b/app/models/bulkrax/parsers/oai_parser.rb new file mode 100644 index 00000000..21536af7 --- /dev/null +++ b/app/models/bulkrax/parsers/oai_parser.rb @@ -0,0 +1,84 @@ +module Bulkrax + module Parsers + class OaiParser < ApplicationParser + attr_accessor :client, :headers + delegate :list_sets, to: :client + delegate :parser_fields, :user, to: :importer + + def self.parser_fields + { + base_url: :string, + metadata_prefix: :string, + set: :string, + institution_name: :string, + rights_statements: :string, + thumbnail_url: :string + } + end + + def initialize(importer) + super + @headers = { from: importer.user.email } + end + + def client + @client ||= OAI::Client.new(parser_fields['base_url'], + headers: headers, + parser: 'libxml', + metadata_prefix: importer.parser_fields['metadata_prefix']) + end + + def collection_name + @collection_name ||= parser_fields['set'] || 'all' + end + + def entry_class + Entries::OaiEntry + end + + def mapping_class + Mappings::OaiMapping + end + + def entry(identifier) + entry_class.new(self, identifier) + end + + def records(opts = {}) + if opts[:quick] + opts.delete(:quick) + @short_records = client.list_identifiers(opts) + else + @records ||= client.list_records(opts) + end + end + + def list_sets + client.list_sets + end + + def create_collections + list_sets.each do |set| + if collection_name == 'all' || collection_name == set.spec + attrs = { + title: [set.name], + identifier: [set.spec], + institution: [parser_fields['institution_name']], + collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid + } + #Bulkrax::Factories::CollectionFactory.new(attrs).find_or_create + collection = Collection.where(identifier: [set.spec]).first + collection ||= Collection.create!(attrs) + end + end + end + + def total + @total 
||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i + rescue + @total = 0 + end + + end + end +end diff --git a/app/views/bulkrax/importers/_form.html.erb b/app/views/bulkrax/importers/_form.html.erb new file mode 100644 index 00000000..dba30658 --- /dev/null +++ b/app/views/bulkrax/importers/_form.html.erb @@ -0,0 +1,55 @@ +
+ The Thumbnail URL allows for basic templating and substitution on any identified information in to the url. For example: +
+ ++ http://commons.ptsem.edu/?cover=<%= identifier.split(':').last %>&size=L +
+ ++ http://commons.ptsem.edu/?cover=<%= record.header.identifier.split(':').last %>&size=L +
+Name | +Last Run | +Next Run | +Records Enqueued | +Records Processed | +Records Failed | +Records Deleted Upstream | +Total Records | +||
---|---|---|---|---|---|---|---|---|---|
<%= importer.name %> | +<%= importer.last_imported_at.strftime("%b %d, %Y") if importer.last_imported_at %> | +<%= importer.next_import_at.strftime("%b %d, %Y") if importer.next_import_at %> | +<%= importer.import_runs.last&.enqueued %> | +<%= importer.import_runs.last&.processed %> | +<%= importer.import_runs.last&.failures %> | +<%= importer.import_runs.last&.deleted %> | +<%= importer.import_runs.last&.total %> | +<%= link_to raw(''), edit_importer_path(importer) %> | +<%= link_to raw(''), importer, method: :delete, data: { confirm: 'Are you sure?' } %> | +
No importers have been created.
+ <% end %> +<%= notice %>
+ ++ Name: + <%= @importer.name %> +
+ ++ Admin set: + <%= @importer.admin_set_id %> +
+ ++ User: + <%= @importer.user %> +
+ ++ Frequency: + <%= @importer.frequency %> +
+ ++ Parser klass: + <%= @importer.parser_klass %> +
+ ++ Limit: + <%= @importer.limit %> +
+ ++ Parser fields: + <%= @importer.parser_fields %> +
+ ++ Field mapping: + <%= @importer.field_mapping %> +
+ +<%= link_to 'Edit', edit_importer_path(@importer) %> | +<%= link_to 'Back', importers_path %> diff --git a/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb b/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb new file mode 100644 index 00000000..9641191f --- /dev/null +++ b/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb @@ -0,0 +1,15 @@ +" do + render + expect(rendered).to match(/Name/) + expect(rendered).to match(/Admin Set/) + expect(rendered).to match(//) + expect(rendered).to match(/Frequency/) + expect(rendered).to match(/Parser Klass/) + expect(rendered).to match(/2/) + expect(rendered).to match(//) + expect(rendered).to match(//) + end +end