From c6be86fb3e5d6c737317336f168607048c35d1ce Mon Sep 17 00:00:00 2001 From: LaRita Robinson Date: Thu, 1 Jun 2023 17:01:24 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Add=20derivative=5Frodeo=5Fsplit?= =?UTF-8?q?ter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new PDF splitter option that wraps the DerivateRodeo's PdfSplitGenerator. It handles, in theory, PDF splitting and the derivative's generated in the DerivativeRodeo. Related to: - https://github.com/scientist-softserv/iiif_print/issues/220 Co-authored-by: LaRita Robinson Co-authored-by: Shana Moore --- .../iiif_print/derivative_rodeo_service.rb | 80 ++++++++++++------- .../pluggable_derivative_service.rb | 2 +- lib/iiif_print.rb | 1 + lib/iiif_print/data/fileset_helper.rb | 2 +- .../jobs/child_works_from_pdf_job.rb | 19 +++-- .../split_pdfs/derivative_rodeo_splitter.rb | 71 ++++++++++++++++ .../derivative_rodeo_splitter_spec.rb | 33 ++++++++ .../derivative_rodeo_service_spec.rb | 6 +- 8 files changed, 173 insertions(+), 41 deletions(-) create mode 100644 lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb create mode 100644 spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb index 7afe3d85..c4b10c85 100644 --- a/app/services/iiif_print/derivative_rodeo_service.rb +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -25,9 +25,9 @@ class DerivativeRodeoService class_attribute :parent_work_identifier_property_name, default: 'aark_id' ## - # @attr input_location_adapter_name [String] The name of a derivative rodeo storage location; + # @attr pre_processed_location_adapter_name [String] The name of a derivative rodeo storage location; # this will must be a registered with the DerivativeRodeo::StorageLocations::BaseLocation. - class_attribute :input_location_adapter_name, default: 's3' + class_attribute :pre_processed_location_adapter_name, default: 's3' ## # @attr named_derivatives_and_generators_by_type [Hash] the named derivative and it's @@ -43,22 +43,24 @@ class DerivativeRodeoService ## ## - # This method "hard-codes" some existing assumptions about the input_uri based on - # implementations for Adventist. Those are reasonable assumptions but time will tell how - # reasonable. + # This method encodes some existing assumptions about the URI based on implementations for + # Adventist. Those are reasonable assumptions but time will tell how reasonable. + # + # By convention, this method is returning output_location of the SpaceStone::Serverless + # processing. We might know the original location that SpaceStone::Serverless processed, but + # that seems to be a tenuous assumption. + # + # In other words, where would SpaceStone, by convention, have written the original file and by + # convention written that original file's derivatives. + # + # TODO: We also need to account for PDF splitting # # @param file_set [FileSet] + # @param filename [String] + # @param extension [String] + # # @return [String] - def self.derivative_rodeo_input_uri(file_set:) - return @derivative_rodeo_input_uri if defined?(@derivative_rodeo_input_uri) - - # TODO: URGENT For a child work (e.g. an image split off of a PDF) we will know that the file_set's - # parent is a child, and the rules of the URI for those derivatives are different from the - # original ingested PDF or the original ingested Image. - - # TODO: This logic will work for an attached PDF; but not for each of the split pages of that - # PDF. How to do that? - + def self.input_uri(file_set:, filename: nil, extension: nil) # TODO: This is duplicated logic for another service, consider extracting a helper module; # better yet wouldn't it be nice if Hyrax did this right and proper. parent = file_set.parent || file_set.member_of.find(&:work?) @@ -70,20 +72,28 @@ def self.derivative_rodeo_input_uri(file_set:) # expendiency, I'm using it. See # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 # TODO: Could we get away with filename that is passed in the create_derivatives process? - filename = Hydra::Works::DetermineOriginalName.call(file_set.original_file) + filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file) + extension ||= File.extname(filename) + extension = ".#{extension}" unless extension.start_with?(".") + basename = File.basename(filename, extension) # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need # to "validate" it in another step. - location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(input_location_adapter_name) - - # TODO: This is based on the provided output template in - # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 - # and is very much hard-coded. We likely want to "store" the template in a common place for - # the application. - # - # s3://s3-antics/:aark_id/:file_name_with_extension - # s3://s3-antics/12345/hello-world.pdf - @derivative_rodeo_input_uri = File.join(location.adapter_prefix, dirname, File.basename(filename)) + location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(pre_processed_location_adapter_name) + + if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) + # In this case the basename will include the "filename--page-\d.extension" type format. + File.join(location.adapter_prefix, dirname, basename, "pages", "#{basename}#{extension}") + else + # TODO: This is based on the provided output template in + # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 + # and is very much hard-coded. We likely want to "store" the template in a common place for + # the application. + # + # s3://s3-antics/:aark_id/:file_name_with_extension + # s3://s3-antics/12345/hello-world.pdf + File.join(location.adapter_prefix, dirname, "#{basename}#{extension}") + end end def initialize(file_set) @@ -131,8 +141,18 @@ def lasso_up_some_derivatives(type:, **) named_derivatives_and_generators_by_type.fetch(type).flat_map do |named_derivative, generator_name| # This is the location that Hyrax expects us to put files that will be added to Fedora. output_location_template = "file://#{Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative)}" + + # The generator knows the output extensions. generator = generator_name.constantize - generator.new(input_uris: [derivative_rodeo_input_uri], output_location_template: output_location_template).generate_uris + + # This is the location where we hope the derivative rodeo will have generated the file. + pre_processed_location_template = self.class.derivative_rodeo_uri(file_set: file_set, extension: generator.output_extension) + + generator.new( + input_uris: [input_uri], + pre_processed_location_template: pre_processed_location_template, + output_location_template: output_location_template + ).generate_uris end end @@ -141,8 +161,8 @@ def supported_mime_types named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") } end - def derivative_rodeo_input_uri - @derivative_rodeo_input_uri ||= self.class.derivative_rodeo_input_uri(file_set: file_set) + def input_uri + @input_uri ||= self.class.derivative_rodeo_uri(file_set: file_set) end def in_the_rodeo? @@ -150,7 +170,7 @@ def in_the_rodeo? # for looking in the rodeo. return false unless supported_mime_types.include?(mime_type) - location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(derivative_rodeo_input_uri) + location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(input_uri) location.exist? end end diff --git a/app/services/iiif_print/pluggable_derivative_service.rb b/app/services/iiif_print/pluggable_derivative_service.rb index fce11316..a909f677 100644 --- a/app/services/iiif_print/pluggable_derivative_service.rb +++ b/app/services/iiif_print/pluggable_derivative_service.rb @@ -39,7 +39,7 @@ def initialize(file_set, plugins: plugins_for(file_set)) # multiple plugins, some of which may or may not be valid, so # validity checks happen within as well. def valid? - !valid_plugins.size.zero? + !valid_plugins.empty? end # get derivative services relevant to method name and file_set context diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb index 38e6f9d6..a9d164ea 100644 --- a/lib/iiif_print.rb +++ b/lib/iiif_print.rb @@ -19,6 +19,7 @@ require "iiif_print/jobs/child_works_from_pdf_job" require "iiif_print/split_pdfs/base_splitter" require "iiif_print/split_pdfs/child_work_creation_from_pdf_service" +require "iiif_print/split_pdfs/derivative_rodeo_splitter" module IiifPrint extend ActiveSupport::Autoload diff --git a/lib/iiif_print/data/fileset_helper.rb b/lib/iiif_print/data/fileset_helper.rb index d7d013aa..dc8c603e 100644 --- a/lib/iiif_print/data/fileset_helper.rb +++ b/lib/iiif_print/data/fileset_helper.rb @@ -7,7 +7,7 @@ def fileset_id # if context is itself a string, presume it is a file set id return @work if @work.is_a? String # if context is not a String, presume a work or fileset context: - fileset.nil? ? nil : fileset.id + fileset&.id end def first_fileset diff --git a/lib/iiif_print/jobs/child_works_from_pdf_job.rb b/lib/iiif_print/jobs/child_works_from_pdf_job.rb index fef883ce..e61b17b2 100644 --- a/lib/iiif_print/jobs/child_works_from_pdf_job.rb +++ b/lib/iiif_print/jobs/child_works_from_pdf_job.rb @@ -15,17 +15,19 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) # We know that we have cases where parent_work is nil, this will definitely raise an # exception; which is fine because we were going to do it later anyway. @parent_work = if candidate_for_parency.work? + pdf_file_set = nil candidate_for_parency else # We likely have a file set + pdf_file_set = candidate_for_parency IiifPrint.parent_for(candidate_for_parency) end @child_admin_set_id = admin_set_id child_model = @parent_work.iiif_print_config.pdf_split_child_model - # handle each input pdf + # handle each input pdf (when input is a file set, we will only have one). pdf_paths.each do |original_pdf_path| - split_pdf(original_pdf_path, user, child_model) + split_pdf(original_pdf_path, user, child_model, pdf_file_set) end # Link newly created child works to the parent @@ -47,10 +49,8 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) private # rubocop:disable Metrics/ParameterLists - def split_pdf(original_pdf_path, user, child_model) - # TODO: This is the place to change out the existing service and instead use the derivative - # rodeo; we will likely need to look at method signatures to tighten this interface. - image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path) + def split_pdf(original_pdf_path, user, child_model, pdf_file_set) + image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path, file_set: pdf_file_set) return if image_files.blank? prepare_import_data(original_pdf_path, image_files, user) @@ -97,6 +97,13 @@ def prepare_import_data(original_pdf_path, image_files, user) PendingRelationship.create!(child_title: child_title, parent_id: @parent_work.id, child_order: child_title) + + begin + # Clean up the temporary image path. + File.rm_f(image_path) if File.exist?(image_path) + rescue + # If we can't delete, let's move on. Maybe it was already cleaned-up. + end end end # rubocop:enable Metrics/MethodLength diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb new file mode 100644 index 00000000..8e37e52d --- /dev/null +++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb @@ -0,0 +1,71 @@ +module IiifPrint + module SplitPdfs + ## + # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed + # images, or split a PDF if there are no preprocessed images. + # + # We have already attached the original file to the file_set. We want to convert that original + # file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have + # written to Fedora as the PDF) + # + # @see .call + class DerivativeRodeoSplitter + ## + # @param path [String] the local file location + # @param file_set [FileSet] file set containing a PDF file to split + # + # @return [Array] paths to images split from each page of PDF file + def self.call(path, file_set:) + new(path, file_set: file_set).split_files + end + + def initialize(path, file_set:, output_tmp_dir: Dir.tmpdir) + @input_uri = "file://#{path}" + + # We are writing the images to a location that CarrierWave can upload. + # + # https://github.com/scientist-softserv/iiif_print/blob/b969541de1a0526305b54de37bf7cf100289f088/lib/iiif_print/jobs/child_works_from_pdf_job.rb#L108 + output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}') + + @output_location_template = "file://#{output_template_path}" + @preprocessed_location_template = IiifPrint::DerivativeRodeoService.derivative_rodeo_input_uri(file_set: file_set, filename: filename) + end + + ## + # This is where, in "Fedora" we have the original file. This is not the original file in the + # pre-processing location but instead the long-term location of the file in the application + # that mounts IIIF Print. + # + # @return [String] + attr_reader :input_uri + + ## + # This is the location where we're going to write the derivatives that will "go into Fedora". + # + # @return [String] + attr_reader :output_location_template + + ## + # Where can we find, in the DerivativeRodeo's storage, what has already been done regarding + # derivative generation. + # + # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3 + # bucket that we then use for IIIF Print. + # + # @return [String] + # + # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63 + attr_reader :preprocessed_location_template + + ## + # @return [Array] the paths to each of the images split off from the PDF. + def split_files + DerivativeRodeo::Generators::PdfSplitGenerator.new( + input_uris: [@input_uri], + output_location_template: output_location_template, + preprocessed_location_template: preprocessed_location_template + ).generated_files.map(&:file_path) + end + end + end +end diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb new file mode 100644 index 00000000..e872f39d --- /dev/null +++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IiifPrint::SplitPdfs::DerivativeRodeoSplitter do + let(:path) { nil } + let(:work) { double(MyWork, aark_id: '12345') } + let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } } + + describe 'class' do + subject { described_class } + + it { is_expected.to respond_to(:call) } + end + + describe "instance" do + subject { described_class.new(file_set: file_set) } + + before do + allow(file_set).to receive(:parent).and_return(work) + # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but + # this part is only one piece of the over all integration. + allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) + end + + it { is_expected.to respond_to :split_files } + + it 'uses the rodeo to split' do + expect(DerivativeRodeo::Generators::PdfSplitGenerator).to receive(:new) + described_class.call(path, file_set: file_set) + end + end +end diff --git a/spec/services/iiif_print/derivative_rodeo_service_spec.rb b/spec/services/iiif_print/derivative_rodeo_service_spec.rb index 303127e3..03fe925d 100644 --- a/spec/services/iiif_print/derivative_rodeo_service_spec.rb +++ b/spec/services/iiif_print/derivative_rodeo_service_spec.rb @@ -28,7 +28,7 @@ end describe '.derivative_rodeo_input_uri' do - subject { described_class.derivative_rodeo_input_uri(file_set: file_set) } + subject { described_class.derivative_rodeo_pre_processed_uri(file_set: file_set) } context 'when the file_set does not have a parent' do xit 'is expected to raise an error' do @@ -44,7 +44,7 @@ allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) end - it { is_expected.to start_with("#{described_class.input_location_adapter_name}://") } + it { is_expected.to start_with("#{described_class.pre_processed_location_adapter_name}://") } it { is_expected.to end_with(File.basename(__FILE__)) } end end @@ -64,7 +64,7 @@ end context 'when derivative rodeo has not pre-processed the file set' do - before { instance.input_location_adapter_name = "file" } + before { instance.pre_processed_location_adapter_name = "file" } let(:mime_type) { "application/pdf" } it { is_expected.to be_falsey }