Skip to content

Commit

Permalink
Account for commons mistakes when importing controlled Questioning Au…
Browse files Browse the repository at this point in the history
…thority fields (#515)

* for default Hyrax QA fields, replace https with http and add missing trailing slashes

* validate presence of active term in config files

* add qa_controlled_properties config option to Bulkrax engine

* replace constant with config values

* support valid URIs that use https

* add method explanation documentation, better named methods

* ignore Metrics/ModuleLength lint (rework file in future)

* fix / expand CSV specs

* remove duplicate attr from Bulkrax

* only add necessary properties to test Work model
  • Loading branch information
bkiahstroud authored May 19, 2022
1 parent 823ed5a commit d7ea5d5
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 5 deletions.
1 change: 1 addition & 0 deletions app/models/bulkrax/csv_entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def build_metadata
add_visibility
add_metadata_for_model
add_rights_statement
sanitize_controlled_uri_values!
add_local

self.parsed_metadata
Expand Down
61 changes: 60 additions & 1 deletion app/models/concerns/bulkrax/import_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module Bulkrax
# Import Behavior for Entry classes
module ImportBehavior
module ImportBehavior # rubocop:disable Metrics/ModuleLength
extend ActiveSupport::Concern

def build_for_importer
Expand Down Expand Up @@ -105,6 +105,65 @@ def add_collections
end
end

# Attempt to sanitize Questioning Authority URI values for configured controlled fields of common
# data entry mistakes. Controlled URI values are only valid if they are an exact match.
# Example:
# Valid value: http://rightsstatements.org/vocab/InC/1.0/
# Provided value: https://rightsstatements.org/vocab/InC/1.0
# Sanitized value: http://rightsstatements.org/vocab/InC/1.0/ ("s" from "https" removed, trailing "/" added)
#
# @return [Boolean] true if all controlled URI values are sanitized successfully
def sanitize_controlled_uri_values!
Bulkrax.qa_controlled_properties.each do |field|
next if parsed_metadata[field].blank?

parsed_metadata[field].each_with_index do |value, i|
next if value.blank?

if (validated_uri_value = validate_value(value, field))
parsed_metadata[field][i] = validated_uri_value
else
debug_msg = %(Unable to locate active authority ID "#{value}" in config/authorities/#{field.pluralize}.yml)
Rails.logger.debug(debug_msg)
error_msg = %("#{value}" is not a valid and/or active authority ID for the :#{field} field)
raise ::StandardError, error_msg
end
end
end

true
end

# @param value [String] value to validate
# @param field [String] name of the controlled property
# @return [String, nil] validated URI value or nil
def validate_value(value, field)
if value.match?(::URI::DEFAULT_PARSER.make_regexp)
value = value.strip.chomp
# add trailing forward slash unless one is already present
value << '/' unless value.match?(%r{/$})
end

valid = if active_id_for_authority?(value, field)
true
else
value.include?('https') ? value.sub!('https', 'http') : value.sub!('http', 'https')
active_id_for_authority?(value, field)
end

valid ? value : nil
end

# @param value [String] value to check
# @param field [String] name of the controlled property
# @return [Boolean] provided value is a present, active authority ID for the provided field
def active_id_for_authority?(value, field)
field_service = ('Hyrax::' + "#{field}_service".camelcase).constantize
active_authority_ids = field_service.new.active_elements.map { |ae| ae['id'] }

active_authority_ids.include?(value)
end

def factory
@factory ||= Bulkrax::ObjectFactory.new(attributes: self.parsed_metadata,
source_identifier_value: identifier,
Expand Down
11 changes: 7 additions & 4 deletions lib/bulkrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ class << self
:related_children_field_mapping,
:related_parents_field_mapping,
:reserved_properties,
:qa_controlled_properties,
:field_mappings,
:import_path,
:export_path,
:removed_image_path,
:server_name,
:api_definition,
:removed_image_path
:api_definition

self.parsers = [
{ name: "OAI - Dublin Core", class_name: "Bulkrax::OaiDcParser", partial: "oai_fields" },
Expand Down Expand Up @@ -119,6 +119,11 @@ class << self
original_url
relative_path
]

# List of Questioning Authority properties that are controlled via YAML files in
# the config/authorities/ directory. For example, the :rights_statement property
# is controlled by the active terms in config/authorities/rights_statements.yml
self.qa_controlled_properties = %w[rights_statement license]
end

def self.api_definition
Expand All @@ -131,8 +136,6 @@ def self.api_definition
)
end

self.removed_image_path = 'app/assets/images/bulkrax/removed.png'

# this function maps the vars from your app into your engine
def self.setup
yield self
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@

# Properties that should not be used in imports/exports. They are reserved for use by Hyrax.
# config.reserved_properties += ['my_field']

# List of Questioning Authority properties that are controlled via YAML files in
# the config/authorities/ directory. For example, the :rights_statement property
# is controlled by the active terms in config/authorities/rights_statements.yml
# Defaults: 'rights_statement' and 'license'
# config.qa_controlled_properties += ['my_field']
end

# Sidebar for hyrax 3+ support
Expand Down
122 changes: 122 additions & 0 deletions spec/models/bulkrax/csv_entry_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,128 @@ module Bulkrax
end
end

describe 'importing controlled questioning authority fields' do
before do
allow(subject).to receive(:raw_metadata).and_return('source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'calls #sanitize_controlled_uri_values!' do
expect(subject).to receive(:sanitize_controlled_uri_values!).once

subject.build_metadata
end

describe 'importing :rights_statement' do
context 'when the http/https does not match' do
context 'when the authority ID has https' do
let(:auth_id) { 'https://rightsstatements.org/vocab/fake/1.0/' }

before do
allow(subject).to receive(:raw_metadata).and_return('rights_statement' => 'http://rightsstatements.org/vocab/fake/1.0/', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'replaces http with https' do
subject.build_metadata

expect(subject.parsed_metadata['rights_statement']).to eq([auth_id])
end
end

context 'when the authority ID has http' do
let(:auth_id) { 'http://rightsstatements.org/vocab/InC/1.0/' }

before do
allow(subject).to receive(:raw_metadata).and_return('rights_statement' => 'https://rightsstatements.org/vocab/InC/1.0/', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'replaces https with http' do
subject.build_metadata

expect(subject.parsed_metadata['rights_statement']).to eq([auth_id])
end
end
end

context 'when the value does not have a forward slash at the end' do
before do
allow(subject).to receive(:raw_metadata).and_return('rights_statement' => 'http://rightsstatements.org/vocab/InC/1.0', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'adds a trailing forward slash' do
subject.build_metadata

expect(subject.parsed_metadata['rights_statement']).to eq(['http://rightsstatements.org/vocab/InC/1.0/'])
end
end

context 'when the value does not match an existing, active authority ID' do
before do
allow(subject).to receive(:raw_metadata).and_return('rights_statement' => 'hello world', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'raises a StandardError' do
expect { subject.build_metadata }
.to raise_error(StandardError, %("hello world" is not a valid and/or active authority ID for the :rights_statement field))
end
end
end

describe 'importing :license' do
context 'when the http/https does not match' do
context 'when the authority ID has https' do
let(:auth_id) { 'https://creativecommons.org/licenses/by/4.0/' }

before do
allow(subject).to receive(:raw_metadata).and_return('license' => 'http://creativecommons.org/licenses/by/4.0/', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'replaces http with https' do
subject.build_metadata

expect(subject.parsed_metadata['license']).to eq([auth_id])
end
end

context 'when the authority ID has http' do
let(:auth_id) { 'http://creativecommons.org/publicdomain/mark/1.0/' }

before do
allow(subject).to receive(:raw_metadata).and_return('license' => 'https://creativecommons.org/publicdomain/mark/1.0/', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'replaces https with http' do
subject.build_metadata

expect(subject.parsed_metadata['license']).to eq([auth_id])
end
end
end

context 'when the value does not have a forward slash at the end' do
before do
allow(subject).to receive(:raw_metadata).and_return('license' => 'https://creativecommons.org/licenses/by/4.0', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'adds a trailing forward slash' do
subject.build_metadata

expect(subject.parsed_metadata['license']).to eq(['https://creativecommons.org/licenses/by/4.0/'])
end
end

context 'when the value does not match an existing, active authority ID' do
before do
allow(subject).to receive(:raw_metadata).and_return('license' => 'hello world', 'source_identifier' => 'qa_1', 'title' => 'some title')
end

it 'raises a StandardError' do
expect { subject.build_metadata }
.to raise_error(StandardError, %("hello world" is not a valid and/or active authority ID for the :license field))
end
end
end
end

context 'with parent-child relationships' do
let(:importer) { FactoryBot.create(:bulkrax_importer_csv, :with_relationships_mappings) }
let(:required_data) do
Expand Down
3 changes: 3 additions & 0 deletions spec/test_app/app/models/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ class Work < ActiveFedora::Base
include ::Hyrax::WorkBehavior
property :single_object, predicate: ::RDF::Vocab::DC.creator, multiple: false
property :multiple_objects, predicate: ::RDF::Vocab::DC.creator

property :license, predicate: ::RDF::Vocab::DC.rights
property :rights_statement, predicate: ::RDF::Vocab::EDM.rights
end
3 changes: 3 additions & 0 deletions spec/test_app/config/authorities/rights_statements.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ terms:
- id: http://rightsstatements.org/vocab/NKC/1.0/
term: "No Known Copyright"
active: true
- id: https://rightsstatements.org/vocab/fake/1.0/
term: "Fake Term for Testing"
active: true

0 comments on commit d7ea5d5

Please sign in to comment.