diff --git a/app/jobs/file_sets_reprocess_job.rb b/app/jobs/file_sets_reprocess_job.rb
new file mode 100644
index 00000000..f0610930
--- /dev/null
+++ b/app/jobs/file_sets_reprocess_job.rb
@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+
+##
+# This job is responsible for finding file sets that may need re-processing and then dispatching new
+# jobs to perform that processing.
+#
+# The reasons are twofold, and addressed by the two jobs:
+#
+# 1. We did not successfully split a PDF; handled by ConditionallyResplitFileSetJob
+# 2. We did not successfully attach a PDF; handled by ConditionallyReingestFileSetJob
+class FileSetsReprocessJob < ApplicationJob
+  ##
+  # @param cname [String, Symbol] when given :all, submit one {FileSetsReprocessJob} per tenant.
+  #        Otherwise, switch to the given tenant and submit a {FileSetsReprocessJob}
+  def self.for_tenant(cname = :all)
+    if cname == :all
+      Account.all.each do |account|
+        account.switch!
+        FileSetsReprocessJob.perform_later
+      end
+    else
+      Account.switch!(cname)
+      FileSetsReprocessJob.perform_later
+    end
+  end
+
+  class_attribute :solr_page_size, default: 1000
+  class_attribute :solr_q_parameter,
+                  default: "(mime_type_ssi:application/pdf OR label_ssi:*.pdf) AND has_model_ssim:FileSet"
+  class_attribute :solr_fl_parameter, default: 'id,label_ssi,mime_type_ssi'
+  class_attribute :desired_mime_type, default: "application/pdf"
+
+  ##
+  # Page through every Solr document matching {.solr_q_parameter} and enqueue one follow-up job per
+  # document: a re-split when the mime type is {.desired_mime_type}, otherwise a re-ingest.
+  def perform
+    count = ActiveFedora::SolrService.count(solr_q_parameter)
+
+    # Only request pages that can hold results; a partial final page is rounded up.
+    (count.to_f / solr_page_size).ceil.times do |page|
+      ActiveFedora::SolrService.query(solr_q_parameter,
+                                      fl: solr_fl_parameter,
+                                      rows: solr_page_size,
+                                      start: page * solr_page_size).each do |document|
+        if document[:mime_type_ssi] == desired_mime_type
+          # Given that we have a mime_type we can assume that we've successfully attached the file.
+          ConditionallyResplitFileSetJob.perform_later(file_set_id: document[:id])
+        else
+          # We have failed to attach the file to the work.
+          ConditionallyReingestFileSetJob.perform_later(file_set_id: document[:id])
+        end
+      end
+    end
+  end
+
+  ##
+  # A helper module for conditionally finding a file set.
+  #
+  # @see .find
+  module FileSetFinder
+    ##
+    # @param file_set_id [String]
+    # @return [FileSet] when the given :file_set_id is found.
+    # @return [FalseClass] when the given :file_set_id is not found.
+    def self.find(file_set_id:)
+      FileSet.find(file_set_id)
+    rescue ActiveFedora::ObjectNotFoundError
+      # `self` is this module (not an instance), so interpolate it rather than `self.class`.
+      message = "#{self}.#{__method__} unable to find FileSet with ID=#{file_set_id}. " \
+                "It may have been deleted between the enqueuing of this job and running this job."
+      Rails.logger.warn(message)
+      false
+    end
+  end
+
+  ##
+  # This job conditionally re-splits a file_set's PDF. How do we know if we need to re-split
+  # it? See the {#perform} method for details.
+  #
+  # 1. The file_set is a PDF.
+  # 2. The file_set's PDF is one that we would normally split.
+  # 3. The file_set's parent is configured with a PDF splitter service.
+  # 4. The file_set's parent does not have child works; the assumption being that if it doesn't
+  #    have child works, then we have not yet successfully split this PDF.
+  class ConditionallyResplitFileSetJob < ApplicationJob
+    ##
+    # @param file_set_id [String]
+    #
+    # @return [Symbol] A terse explanation of what was done with this job.
+    #
+    # @raise [ActiveFedora::ObjectNotFoundError] when the given FileSet's parent could not be found.
+    # rubocop:disable Metrics/LineLength
+    def perform(file_set_id:)
+      file_set = FileSetFinder.find(file_set_id: file_set_id)
+
+      # We've logged this (see FileSetFinder.find) so we'll move along.
+      return :file_set_not_found unless file_set
+
+      # When we aren't working with a PDF, let's not proceed.
+      return :not_a_pdf unless file_set.pdf?
+
+      # When the PDF we are working with isn't something we split, let's bail.
+      return :non_splitting_pdf unless IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter.split_this?(path: file_set.label)
+
+      parent = IiifPrint.parent_for(file_set)
+
+      raise ActiveFedora::ObjectNotFoundError, "Expected #{file_set.class} ID=#{file_set.id} to have a parent record." unless parent
+
+      return :parent_does_not_split unless parent.try(:iiif_print_config).try(:pdf_splitter_service)
+
+      # When the parent has children, assume that we've already previously succeeded on splitting
+      # this PDF.
+      return :has_children if parent.child_work_ids.any?
+
+      IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: file_set, user: User.batch_user)
+      :requesting_split
+    end
+    # rubocop:enable Metrics/LineLength
+  end
+
+  ##
+  # This job is a placeholder for re-ingesting a file set whose file does not appear to be properly
+  # attached; the re-ingest itself is not yet implemented (see the TODO in {#perform}).
+  class ConditionallyReingestFileSetJob < ApplicationJob
+    ##
+    # @param file_set_id [String]
+    # @return [Symbol, nil] :file_set_not_found when the file set could not be found; otherwise nil
+    #         because the re-ingest is not yet implemented.
+    def perform(file_set_id:)
+      file_set = FileSetFinder.find(file_set_id: file_set_id)
+
+      # We've logged this (see FileSetFinder.find) so we'll move along.
+      return :file_set_not_found unless file_set
+
+      # TODO: The file set does not appear to have a properly attached file.
+    end
+  end
+end
diff --git a/lib/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter.rb b/lib/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter.rb
index f9fc8700..1b5115e1 100644
--- a/lib/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter.rb
+++ b/lib/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter.rb
@@ -3,13 +3,24 @@
 module IiifPrint
   module SplitPdfs
     module AdventistPagesToJpgsSplitter
+      ##
+      # @param path [String] the path, in particular filename (that hopefully ends with an
+      #        extension).
+      #
+      # @param suffixes [Array<String>] the list of suffixes that we want to ignore for splitting.
+      # @return [TrueClass] when we should be splitting this path.
+      # @return [FalseClass] when we should not be splitting this path.
+      def self.split_this?(path:, suffixes: CreateDerivativesJobDecorator::NON_ARCHIVAL_PDF_SUFFIXES)
+        !path.downcase.end_with?(*suffixes)
+      end
+
       ##
       # We do not always want to split a PDF; this provides a decision point.
       #
       # @param path [String] the path of the file we're attempting to run derivatives against.
       # @param args [Array] pass through args
       # @param splitter [IiifPrint::SplitPdfs::BaseSplitter] (for dependency injection)
-      # @param suffix [String] (for dependency injection)
+      # @param suffixes [Array<String>] (for dependency injection)
       #
       # @return [Enumerable] when we are going to skip splitting, return an empty array; otherwise return
       #         an instance of {IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter}.
@@ -20,7 +31,7 @@ def self.call(path,
                     splitter: DerivativeRodeoSplitter,
                     suffixes: CreateDerivativesJobDecorator::NON_ARCHIVAL_PDF_SUFFIXES,
                     **args)
-        return [] if suffixes.any? { |suffix| path.downcase.end_with?(suffix) }
+        return [] unless AdventistPagesToJpgsSplitter.split_this?(path: path, suffixes: suffixes)
 
         splitter.call(path, **args)
       end
diff --git a/spec/factories/file_sets.rb b/spec/factories/file_sets.rb
index 606c6d2a..fc68671e 100644
--- a/spec/factories/file_sets.rb
+++ b/spec/factories/file_sets.rb
@@ -4,9 +4,21 @@
   factory :file_set do
     transient do
       user { FactoryBot.create(:user) }
+      content { nil }
     end
+
     after(:build) do |fs, evaluator|
       fs.apply_depositor_metadata evaluator.user
     end
+
+    factory :file_with_work do
+      after(:build) do |file, _evaluator|
+        file.title = ['testfile']
+      end
+      after(:create) do |file, evaluator|
+        Hydra::Works::UploadFileToFileSet.call(file, evaluator.content) if evaluator.content
+        create(:generic_work, user: evaluator.user).members << file
+      end
+    end
   end
 end
diff --git a/spec/fixtures/latex.pdf b/spec/fixtures/latex.pdf
new file mode 100644
index 00000000..bdbe71eb
Binary files /dev/null and b/spec/fixtures/latex.pdf differ
diff --git a/spec/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter_spec.rb b/spec/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter_spec.rb
index 352b7989..f948eda4 100644
--- a/spec/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter_spec.rb
+++ b/spec/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter_spec.rb
@@ -3,6 +3,23 @@
 require 'spec_helper'
 
 RSpec.describe IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter do
+  describe '.split_this?' do
+    subject { described_class.split_this?(path: path) }
+
+    [
+      ["hello.jpg", true],
+      ["hello.reader.pdf", false],
+      ["hello.reader.jpg", true],
+      ["hello.reader.pdf.pdf", true]
+    ].each do |given_path, expected_value|
+      context "given #{given_path.inspect}" do
+        let(:path) { given_path }
+
+        it { is_expected.to eq(expected_value) }
+      end
+    end
+  end
+
   describe '.call' do
     subject { described_class.call(path, suffixes: ["spec.rb"], file_set: create(:file_set)) }
 
diff --git a/spec/jobs/file_sets_reprocess_job_spec.rb b/spec/jobs/file_sets_reprocess_job_spec.rb
new file mode 100644
index 00000000..b40a76e5
--- /dev/null
+++ b/spec/jobs/file_sets_reprocess_job_spec.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe FileSetsReprocessJob, clean: true do
+  describe '#perform' do
+    let(:user) { FactoryBot.create(:user) }
+    let(:file_set) { FactoryBot.create(:file_with_work, content: file_content, user: user) }
+    let(:file_content) { File.open(File.join(fixture_path, 'latex.pdf')) }
+
+    it 'submits jobs' do
+      # Verifying that we found one record to consider resplitting.
+      expect(described_class::ConditionallyResplitFileSetJob).to receive(:perform_later).with(file_set_id: file_set.id)
+      file_set
+
+      described_class.perform_now
+    end
+  end
+end