From 50ab0e756299a78ae4ead9371be63f9cd734ccf2 Mon Sep 17 00:00:00 2001
From: Jeremy Friesen
Date: Wed, 10 May 2023 16:38:22 -0400
Subject: [PATCH] Adding the word coordinates generator

With this commit, we're introducing the creation of word coordinates
files based on the likely previously generated HOCR file.

Closes: #5

- https://github.com/scientist-softserv/derivative_rodeo/issues/5
---
 .../generators/word_coordinates_generator.rb  |  39 ++++
 ...word_coordinates_from_hocr_sgml_service.rb | 218 ++++++++++++++++++
 .../word_coordinates_generator_spec.rb        |  22 ++
 ...coordinates_from_hocr_sgml_service_spec.rb |  35 +++
 spec/fixtures.rb                              |  11 +-
 5 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 lib/derivative_rodeo/generators/word_coordinates_generator.rb
 create mode 100644 lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb
 create mode 100644 spec/derivative_rodeo/generators/word_coordinates_generator_spec.rb
 create mode 100644 spec/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service_spec.rb

diff --git a/lib/derivative_rodeo/generators/word_coordinates_generator.rb b/lib/derivative_rodeo/generators/word_coordinates_generator.rb
new file mode 100644
index 0000000..761a219
--- /dev/null
+++ b/lib/derivative_rodeo/generators/word_coordinates_generator.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module DerivativeRodeo
+  module Generators
+    ##
+    # Generate the word coordinates (as JSON) from the given input_uris.
+    #
+    # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
+    class WordCoordinatesGenerator < BaseGenerator
+      self.output_extension = "coordinates.json"
+
+      ##
+      # @param out_file [StorageAdapters::BaseAdapter]
+      # @param in_tmp_path [String] the location of the file that we can use for processing.
+      #
+      # @return [StorageAdapters::BaseAdapter]
+      #
+      # @see #requisite_files
+      def build_step(out_file:, in_tmp_path:, **)
+        out_file.with_new_tmp_path do |out_tmp_path|
+          convert_to_coordinates(path_to_hocr: in_tmp_path, path_to_coordinate: out_tmp_path)
+        end
+      end
+
+      private
+
+      ##
+      # @param path_to_hocr [String]
+      # @param path_to_coordinate [String]
+      # @param service [#call, Services::ExtractWordCoordinatesFromHocrSgmlService]
+      def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
+        hocr_html = File.read(path_to_hocr)
+        File.open(path_to_coordinate, "w+") do |file|
+          file.puts service.call(hocr_html)
+        end
+      end
+    end
+  end
+end
diff --git a/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb b/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb
new file mode 100644
index 0000000..0169ce2
--- /dev/null
+++ b/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb
@@ -0,0 +1,218 @@
+# frozen_string_literal: true
+
+require 'forwardable'
+require 'json'
+require 'nokogiri'
+
+module DerivativeRodeo
+  module Services
+    ##
+    # Responsible for converting an SGML string into JSON coordinates
+    class ExtractWordCoordinatesFromHocrSgmlService
+      ##
+      # @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
+      # @return [String] A JSON document
+      def self.call(sgml)
+        new(sgml).to_json
+      end
+
+      ##
+      # Construct with either path or HTML [String]
+      #
+      # @param html [String] either an XML string or a path to a file.
+      def initialize(html)
+        @source = xml?(html) ? html : File.read(html)
+        @doc_stream = DocStream.new
+        parser = Nokogiri::HTML::SAX::Parser.new(@doc_stream)
+        parser.parse(@source)
+      end
+      attr_reader :doc_stream, :source
+
+      delegate :text, :width, :height, :words, to: :doc_stream
+
+      # Output JSON flattened word coordinates
+      #
+      # @return [String] JSON serialization of flattened word coordinates
+      def to_json
+        @to_json ||= WordCoordinates.to_json(
+          words: doc_stream.words,
+          width: doc_stream.width,
+          height: doc_stream.height
+        )
+      end
+      alias json to_json
+
+      private
+
+      def xml?(xml)
+        xml.lstrip.start_with?('<')
+      end
+
+      # SAX Document Stream class to gather text and word tokens from hOCR
+      class DocStream < Nokogiri::XML::SAX::Document
+        attr_accessor :text, :words, :width, :height
+
+        def initialize
+          super()
+          # plain text buffer:
+          @text = ''
+          # list of word hash, containing word+coord:
+          @words = []
+          # page width and height to be found in hOCR for `div.ocr_page`
+          @width = nil
+          @height = nil
+          # to hold current word data state across #start_element, #characters,
+          # and #end_element methods (to associate word with coordinates).
+          @current = nil
+          # to preserve element classname from start to use by #end_element
+          @element_class_name = nil
+        end
+
+        # Return coordinates from `span.ocrx_word` element attribute hash
+        #
+        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
+        # @return [Array<Integer>] Array of position x, y, width, height in px.
+        def s_coords(attrs)
+          element_title = attrs['title']
+          bbox = element_title.split(';')[0].split('bbox ')[-1]
+          x1, y1, x2, y2 = bbox.split(' ').map(&:to_i)
+          height = y2 - y1
+          width = x2 - x1
+          hpos = x1
+          vpos = y1
+          [hpos, vpos, width, height]
+        end
+
+        # Consider element for processing?
+        #   - `div.ocr_page` — to get page width/height
+        #   - `span.ocr_line` — to help make plain text readable
+        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
+        # @param name [String] Element name
+        # @param class_name [String] HTML class name
+        # @return [Boolean] true if element should be processed; otherwise false
+        def consider?(name, class_name)
+          selector = "#{name}.#{class_name}"
+          ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
+        end
+
+        def start_word(attrs)
+          @current = {}
+          # will be replaced during #characters method call:
+          @current[:word] = nil
+          @current[:coordinates] = s_coords(attrs)
+        end
+
+        def start_page(attrs)
+          title = attrs['title']
+          fields = title.split(';')
+          bbox = fields[1].split('bbox ')[-1].split(' ').map(&:to_i)
+          # width and height:
+          @width = bbox[2]
+          @height = bbox[3]
+        end
+
+        def word_complete?
+          return false if @current.nil?
+          coords = @current[:coordinates]
+          @current[:word].present? && coords.size == 4
+        end
+
+        def end_word
+          # add trailing space to plaintext buffer for between words:
+          @text += ' '
+          @words.push(@current) if word_complete?
+        end
+
+        def end_line
+          # strip surrounding whitespace; non-bang because @text may still be the frozen '' literal
+          @text = @text.strip
+          # then insert a line break
+          @text += "\n"
+        end
+
+        # Callback for element start, ignores elements except for:
+        #   - `div.ocr_page` — to get page width/height
+        #   - `span.ocr_line` — to help make plain text readable
+        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
+        #
+        # @param name [String] element name.
+        # @param attrs [Array<Array>] Array of key, value pair Arrays.
+        def start_element(name, attrs = [])
+          attributes = attrs.to_h
+          @element_class_name = attributes['class']
+          return unless consider?(name, @element_class_name)
+          start_word(attributes) if @element_class_name == 'ocrx_word'
+          start_page(attributes) if @element_class_name == 'ocr_page'
+        end
+
+        def characters(value)
+          return if @current.nil?
+          return if @current[:coordinates].nil?
+          @current[:word] ||= ''
+          @current[:word] += value
+          @text += value
+        end
+
+        # Callback for element end; at this time, flush word coordinate state
+        # for current word, and append line endings to plain text:
+        #
+        # @param _name [String] element name.
+        def end_element(_name)
+          end_line if @element_class_name == 'ocr_line'
+          end_word if @element_class_name == 'ocrx_word'
+        end
+
+        # Callback for completion of parsing hOCR, used to normalize generated
+        # text content (strip unneeded whitespace incidental to output).
+        def end_document
+          # postprocess @text to remove trailing spaces on lines
+          @text = @text.split("\n").map(&:strip).join("\n")
+          # remove excess line break
+          @text.gsub!(/\n+/, "\n")
+          @text.delete!("\r") # bang form: plain #delete returns a copy and would be discarded
+          # remove trailing whitespace at end of buffer
+          @text.strip!
+        end
+      end
+
+      class WordCoordinates
+        ##
+        # @api public
+        #
+        # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
+        # @param width [Integer] the width of the "canvas" on which the words appear.
+        # @param height [Integer] the height of the "canvas" on which the words appear.
+        #
+        # @return [String] a JSON encoded string.
+        def self.to_json(words:, width: nil, height: nil)
+          new(words: words, width: width, height: height).to_json
+        end
+
+        def initialize(words:, width:, height:)
+          @words = words
+          @width = width
+          @height = height
+        end
+        attr_reader :words, :width, :height
+
+        # Output JSON flattened word coordinates
+        #
+        # @return [String] JSON serialization of flattened word coordinates
+        def to_json
+          coordinates = {}
+          words.each do |word|
+            word_chars = word[:word]
+            word_coords = word[:coordinates]
+            if coordinates[word_chars]
+              coordinates[word_chars] << word_coords
+            else
+              coordinates[word_chars] = [word_coords]
+            end
+          end
+          payload = { width: width, height: height, coords: coordinates }
+          JSON.generate(payload)
+        end
+      end
+    end
+  end
+end
diff --git a/spec/derivative_rodeo/generators/word_coordinates_generator_spec.rb b/spec/derivative_rodeo/generators/word_coordinates_generator_spec.rb
new file mode 100644
index 0000000..0bfbf30
--- /dev/null
+++ b/spec/derivative_rodeo/generators/word_coordinates_generator_spec.rb
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe DerivativeRodeo::Generators::WordCoordinatesGenerator do
+  describe "#generated_files" do
+    it "derives the word coordinates from the given hocr file" do
+      generated_file = nil
+      Fixtures.with_file_uris_for("ocr_mono_text_hocr.html") do |hocr_paths, out_tmp_dir|
+        template = "file://#{out_tmp_dir}/{{ basename }}.coordinates.json"
+        input_uri = "file://#{hocr_paths.first}"
+        instance = described_class.new(input_uris: [input_uri], output_target_template: template)
+        generated_file = instance.generated_files.first
+        json = JSON.parse(File.read(generated_file.file_path))
+        expect(json.keys).to match_array(["width", "height", "coords"])
+        expect(generated_file.exist?).to be_truthy
+      end
+
+      expect(generated_file.exist?).to be_falsey
+    end
+  end
+end
diff --git a/spec/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service_spec.rb b/spec/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service_spec.rb
new file mode 100644
index 0000000..5c3f4c5
--- /dev/null
+++ b/spec/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service_spec.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe DerivativeRodeo::Services::ExtractWordCoordinatesFromHocrSgmlService do
+  let(:minimal) { File.read(minimal_path) }
+
+  let(:reader_minimal) { described_class.new(minimal) }
+  let(:reader_minimal_path) { described_class.new(minimal_path) }
+
+  let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) }
+  let(:hocr) { described_class.new(xml) }
+  subject { hocr }
+
+  # I want to deprecate this, but for now, it's here.
+  it { is_expected.to respond_to(:json) }
+
+  describe '#text' do
+    let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) }
+    subject { hocr.text }
+
+    it 'outputs plain text' do
+      expect(subject.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
+      expect(subject.size).to eq 831
+    end
+  end
+
+  describe '#to_json' do
+    subject { hocr.to_json }
+    it 'outputs JSON that includes coords key' do
+      parsed = JSON.parse(subject)
+      expect(parsed['coords'].length).to be > 1
+    end
+  end
+end
diff --git a/spec/fixtures.rb b/spec/fixtures.rb
index 7a4e2e6..f12c29a 100644
--- a/spec/fixtures.rb
+++ b/spec/fixtures.rb
@@ -5,6 +5,8 @@ module Fixtures
   #
   # @yieldparam [String]
   def self.with_temporary_directory
+    raise "You must pass a block" unless block_given?
+
     Dir.mktmpdir do |dir|
       yield(dir)
     end
@@ -21,14 +23,19 @@ def self.path_for(filename)
   # This function copies the given :filenames to a new temporary location.
   #
   # @yieldparam filenames [Array] path to the temporary fixture files.
-  def self.with_file_uris_for(*filenames)
+  # @yieldparam output_tmp_dir [String] (Optional) path to the temporary directory where we copied
+  #   the files.
+  def self.with_file_uris_for(*filenames, &block)
+    raise "You must pass a block" unless block_given?
+
     with_temporary_directory do |dir|
       targets = filenames.map do |filename|
         target = File.join(dir, filename)
         FileUtils.cp(path_for(filename), target)
         "file://#{target}"
       end
-      yield(targets)
+      yield_args = block.arity == 2 ? [targets, dir] : [targets]
+      yield(*yield_args)
     end
   end
 end