Skip to content

Commit

Permalink
Adding the word coordinates generator
Browse files Browse the repository at this point in the history
With this commit, we introduce a generator that creates word-coordinates
files from an hOCR file, which will most likely have been generated by an earlier step.

Closes: #5

- #5
  • Loading branch information
jeremyf committed May 10, 2023
1 parent 9bb74c0 commit 50ab0e7
Show file tree
Hide file tree
Showing 5 changed files with 323 additions and 2 deletions.
39 changes: 39 additions & 0 deletions lib/derivative_rodeo/generators/word_coordinates_generator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# frozen_string_literal: true

module DerivativeRodeo
module Generators
##
# Generate the word coordinates (as JSON) from the given input_uris.
#
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
class WordCoordinatesGenerator < BaseGenerator
self.output_extension = "coordinates.json"

##
# Produce the coordinates file for a single input by asking the output adapter for a
# fresh temporary path and converting the hOCR file into it.
#
# @param out_file [StorageAdapters::BaseAdapter] must respond to #with_new_tmp_path and
#        yield the path that the coordinates should be written to.
# @param in_tmp_path [String] the location of the file that we can use for processing.
#
# @return [StorageAdapters::BaseAdapter] the value returned by out_file.with_new_tmp_path
#         (presumably the adapter itself — TODO confirm against the adapter).
#
# @see #requisite_files
def build_step(out_file:, in_tmp_path:, **)
  out_file.with_new_tmp_path do |coordinates_tmp_path|
    convert_to_coordinates(
      path_to_hocr: in_tmp_path,
      path_to_coordinate: coordinates_tmp_path
    )
  end
end

private

##
# Read the hOCR file, pass its contents to the service, and write whatever the service
# returns to the coordinates path using IO#puts.
#
# @param path_to_hocr [String] path of the hOCR file to read.
# @param path_to_coordinate [String] path of the file to create or truncate and write.
# @param service [#call] receives the hOCR text and returns the content to write;
#        defaults to Services::ExtractWordCoordinatesFromHocrSgmlService.
# @return [nil]
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
  hocr_markup = File.read(path_to_hocr)
  # The service is invoked only after the output file has been opened.
  File.open(path_to_coordinate, "w+") { |io| io.puts(service.call(hocr_markup)) }
end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
# frozen_string_literal: true

require 'forwardable'
require 'json'
require 'nokogiri'

module DerivativeRodeo
module Services
##
# Responsible for converting an SGML string into JSON coordinates
class ExtractWordCoordinatesFromHocrSgmlService
##
# Convenience entry point: build an instance for the given markup and return its JSON.
#
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
# @return [String] A JSON document
def self.call(sgml)
  instance = new(sgml)
  instance.to_json
end

##
# Parse the given hOCR markup, or the file it names, and collect the results in a DocStream.
#
# @param html [String] either the markup itself or a path to a file. A value whose first
#        non-whitespace character is `<` is used as-is; anything else is read with File.read.
def initialize(html)
  @source =
    if xml?(html)
      html
    else
      File.read(html)
    end
  @doc_stream = DocStream.new
  # The SAX parser drives the DocStream callbacks while it walks the markup.
  Nokogiri::HTML::SAX::Parser.new(@doc_stream).parse(@source)
end
attr_reader :doc_stream, :source

# Expose the parsed results directly on the service. This uses the standard library's
# Forwardable (which this file requires) instead of ActiveSupport's `delegate ... to:`
# macro, so the class can be loaded without ActiveSupport.
extend Forwardable
def_delegators :doc_stream, :text, :width, :height, :words

# Output JSON flattened word coordinates
#
# Accepts (and ignores) any arguments so that the JSON generator, which calls
# `#to_json(state)` on objects nested inside a structure it is serializing, can invoke
# it without raising ArgumentError. The result is memoized.
#
# @param _args [Array] ignored; present for compatibility with the JSON generator.
# @return [String] JSON serialization of flattened word coordinates
def to_json(*_args)
  @to_json ||= WordCoordinates.to_json(
    words: doc_stream.words,
    width: doc_stream.width,
    height: doc_stream.height
  )
end
alias json to_json

private

# Does the given string look like markup rather than a file path?
#
# @param xml [String] candidate markup or path.
# @return [Boolean] true when the first character after leading whitespace is `<`.
def xml?(xml)
  xml.lstrip[0] == '<'
end

# SAX Document Stream class to gather text and word tokens from hOCR
class DocStream < Nokogiri::XML::SAX::Document
attr_accessor :text, :words, :width, :height

# Start with empty accumulators; the SAX callbacks fill them in while parsing.
def initialize
  super()

  # Results exposed through the accessors:
  @text = ''      # plain text buffer
  @words = []     # one hash per word, holding the word and its coordinates
  @width = nil    # page width, to be found in hOCR for `div.ocr_page`
  @height = nil   # page height, to be found in hOCR for `div.ocr_page`

  # Parser state shared between #start_element, #characters, and #end_element:
  @current = nil            # data for the word currently being read
  @element_class_name = nil # class attribute remembered from #start_element
end

# Return coordinates from `span.ocrx_word` element attribute hash
#
# The `bbox x1 y1 x2 y2` property is looked up among the `;`-separated fields of the
# `title` attribute instead of being assumed to be the first field, and a missing or
# incomplete bbox yields an empty Array rather than raising.
#
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
# @return [Array<Integer>] position x, y, width, height in px; empty when the element
#   has no `title` or the title carries no complete `bbox`.
def s_coords(attrs)
  bbox_field = attrs['title'].to_s.split(';').map(&:strip).find { |field| field.start_with?('bbox ') }
  return [] if bbox_field.nil?

  x1, y1, x2, y2 = bbox_field.split.drop(1).first(4).map(&:to_i)
  # Fewer than four numbers leaves y2 unset.
  return [] if y2.nil?

  [x1, y1, x2 - x1, y2 - y1]
end

# Should this element be processed? Only these element/class pairs are:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] Element name
# @param class_name [String] HTML class name
# @return [Boolean] true if element should be processed; otherwise false
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word' then true
  else false
  end
end

# Begin tracking a new word element.
#
# @param attrs [Hash] attributes of the word element, passed on to #s_coords.
def start_word(attrs)
  # :word stays nil until #characters supplies text for this element.
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end

# Record the page dimensions from a `div.ocr_page` element.
#
# The `bbox` property is looked up among the `;`-separated fields of the `title`
# attribute instead of being assumed to be the second field. When there is no `title`
# or no `bbox`, the dimensions are left untouched.
#
# @param attrs [Hash] hash with hOCR `div.ocr_page` element attributes
def start_page(attrs)
  bbox_field = attrs['title'].to_s.split(';').map(&:strip).find { |field| field.start_with?('bbox ') }
  return if bbox_field.nil?

  bbox = bbox_field.split.drop(1).map(&:to_i)
  # width and height are read from the third and fourth bbox values:
  @width = bbox[2]
  @height = bbox[3]
end

# Is the word currently being tracked ready to be recorded?
#
# @return [Boolean] true when there is a current word with some non-whitespace text and
#   a four-element coordinates Array.
def word_complete?
  return false if @current.nil?

  word = @current[:word]
  coords = @current[:coordinates]
  # Plain-Ruby stand-in for ActiveSupport's `present?` on a String-or-nil value, so this
  # method works without ActiveSupport loaded.
  !word.nil? && word.match?(/[^[:space:]]/) && coords.size == 4
end

# Finish a word: pad the plain-text buffer and record the word if it is complete.
def end_word
  # a space separates this word from whatever follows in the plain text
  @text = @text + ' '
  @words << @current if word_complete?
end

# Finish a line of plain text: drop surrounding whitespace, then add a line break.
def end_line
  # Build a new String rather than calling `strip!`: @text can still be the string
  # literal assigned in #initialize, which is frozen under this file's
  # `frozen_string_literal: true`, and mutating it would raise FrozenError.
  @text = @text.strip + "\n"
end

# Callback for element start. Remembers the element's class attribute, then acts only on:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  attr_hash = attrs.to_h
  # Stored for every element, including those that are not considered below.
  @element_class_name = attr_hash['class']
  return unless consider?(name, @element_class_name)

  case @element_class_name
  when 'ocrx_word' then start_word(attr_hash)
  when 'ocr_page' then start_page(attr_hash)
  end
end

# Callback for character data: once a word with coordinates is being tracked, append
# the data both to that word and to the plain-text buffer.
#
# @param value [String] the character data.
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?

  @current[:word] = (@current[:word] || '') + value
  @text += value
end

# Callback for element end; depending on the remembered class name, either appends a
# line ending to the plain text or flushes the current word.
#
# NOTE(review): @element_class_name is assigned in #start_element, so it holds the class
# of the most recently opened element, which is not necessarily the element being closed
# here (e.g. the closing tag of a line that follows its last word). Confirm that this is
# intended before changing the dispatch.
#
# @param _name [String] element name.
def end_element(_name)
  case @element_class_name
  when 'ocr_line' then end_line
  when 'ocrx_word' then end_word
  end
end

# Callback for completion of parsing hOCR, used to normalize generated
# text content (strip unneeded whitespace incidental to output).
def end_document
  # remove leading/trailing whitespace on each line
  lines = @text.split("\n").map(&:strip)
  # collapse runs of line breaks into one
  collapsed = lines.join("\n").gsub(/\n+/, "\n")
  # `String#delete` returns a new string, so its result has to be kept for the
  # carriage returns to actually be removed; then trim the ends of the buffer.
  @text = collapsed.delete("\r").strip
end
end

##
# Serializes a list of words (with their coordinates) and the page dimensions as JSON,
# gathering the coordinates of repeated words under a single key.
class WordCoordinates
  ##
  # @api public
  #
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  #
  # @return [String] a JSON encoded string.
  def self.to_json(words:, width: nil, height: nil)
    new(words: words, width: width, height: height).to_json
  end

  # @param words [Array<Hash>] hashes with the keys `:word` and `:coordinates`.
  # @param width [Integer, nil] the width of the "canvas" on which the words appear.
  # @param height [Integer, nil] the height of the "canvas" on which the words appear.
  def initialize(words:, width:, height:)
    @words = words
    @width = width
    @height = height
  end
  attr_reader :words, :width, :height

  # Output JSON flattened word coordinates
  #
  # Accepts (and ignores) any arguments so that the JSON generator, which calls
  # `#to_json(state)` on nested objects, can invoke it without raising ArgumentError.
  #
  # @param _args [Array] ignored; present for compatibility with the JSON generator.
  # @return [String] JSON with the keys `width`, `height`, and `coords`, where `coords`
  #   maps each word to the list of its coordinates in order of appearance.
  def to_json(*_args)
    coordinates = words.each_with_object({}) do |word, grouped|
      (grouped[word[:word]] ||= []) << word[:coordinates]
    end
    payload = { width: width, height: height, coords: coordinates }
    JSON.generate(payload)
  end
end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

require 'spec_helper'

# End-to-end check that the generator turns the hOCR fixture into a coordinates JSON file.
RSpec.describe DerivativeRodeo::Generators::WordCoordinatesGenerator do
describe "#generated_files" do
it "derives the word coordinates from the given hocr file" do
# Assigned inside the fixture block, then checked again once that block has returned.
generated_file = nil
Fixtures.with_file_uris_for("ocr_mono_text_hocr.html") do |hocr_paths, out_tmp_dir|
template = "file://#{out_tmp_dir}/{{ basename }}.coordinates.json"
# NOTE(review): Fixtures.with_file_uris_for appears to yield strings that already start
# with "file://", so this may produce a doubled scheme — confirm the adapter tolerates it.
input_uri = "file://#{hocr_paths.first}"
instance = described_class.new(input_uris: [input_uri], output_target_template: template)
generated_file = instance.generated_files.first
json = JSON.parse(File.read(generated_file.file_path))
expect(json.keys).to match_array(["width", "height", "coords"])
expect(generated_file.exist?).to be_truthy
end

# Presumably the temporary directory is removed when the block exits, taking the
# generated file with it — verify against Fixtures.with_temporary_directory.
expect(generated_file.exist?).to be_falsey
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# frozen_string_literal: true

require 'spec_helper'

# Specs for turning the hOCR fixture into plain text and word-coordinate JSON.
RSpec.describe DerivativeRodeo::Services::ExtractWordCoordinatesFromHocrSgmlService do
# NOTE(review): `minimal_path` is not defined in this spec, and no example here uses the
# three `let`s that depend on it (they are lazy, so they never run) — confirm whether
# they are still needed.
let(:minimal) { File.read(minimal_path) }

let(:reader_minimal) { described_class.new(minimal) }
let(:reader_minimal_path) { described_class.new(minimal_path) }

let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) }
let(:hocr) { described_class.new(xml) }
subject { hocr }

# I want to deprecate this, but for now, it's here.
it { is_expected.to respond_to(:json) }

describe '#text' do
# NOTE(review): same definition as the outer `let(:xml)`; looks redundant.
let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) }
subject { hocr.text }

# Pins the start and total length of the plain text extracted from the fixture.
it 'outputs plain text' do
expect(subject.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
expect(subject.size).to eq 831
end
end

describe '#to_json' do
subject { hocr.to_json }
it 'outputs JSON that includes coords key' do
parsed = JSON.parse(subject)
expect(parsed['coords'].length).to be > 1
end
end
end
11 changes: 9 additions & 2 deletions spec/fixtures.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ module Fixtures
#
# @yieldparam [String]
def self.with_temporary_directory
raise "You must pass a block" unless block_given?

Dir.mktmpdir do |dir|
yield(dir)
end
Expand All @@ -21,14 +23,19 @@ def self.path_for(filename)
# This function copies the given :filenames to a new temporary location.
#
# The block always receives the file URIs. It also receives the temporary directory,
# unless it declares exactly one parameter.
#
# @param filenames [Array<String>] names of the fixture files to copy; each is resolved
#   with path_for.
# @yieldparam uris [Array<String>] `file://` URIs of the temporary fixture files.
# @yieldparam output_tmp_dir [String] (Optional) path to the temporary directory where we copied
#   the files.
# @raise [RuntimeError] when no block is given.
def self.with_file_uris_for(*filenames, &block)
  raise "You must pass a block" unless block_given?

  with_temporary_directory do |dir|
    targets = filenames.map do |filename|
      target = File.join(dir, filename)
      FileUtils.cp(path_for(filename), target)
      "file://#{target}"
    end
    # Checking for an arity of exactly 1 or exactly 2 silently skipped blocks with no,
    # optional, or splat parameters (arity 0 or negative); now every block is yielded to.
    block.arity == 1 ? yield(targets) : yield(targets, dir)
  end
end
end

0 comments on commit 50ab0e7

Please sign in to comment.