-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding the word coordinates generator
With this commit, we're introducing the generation of word coordinates files from a HOCR file (most likely one generated in an earlier step). Closes: #5
- Loading branch information
Showing
5 changed files
with
323 additions
and
2 deletions.
There are no files selected for viewing
39 changes: 39 additions & 0 deletions
39
lib/derivative_rodeo/generators/word_coordinates_generator.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# frozen_string_literal: true | ||
|
||
module DerivativeRodeo | ||
module Generators | ||
## | ||
# Generate the word coordinates (as JSON) from the given input_uris. | ||
# | ||
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}). | ||
class WordCoordinatesGenerator < BaseGenerator | ||
self.output_extension = "coordinates.json" | ||
|
||
##
# Produce the word coordinates file for one HOCR input.
#
# @param out_file [StorageAdapters::BaseAdapter] the target that supplies a temporary path to write to.
# @param in_tmp_path [String] local path of the HOCR file used as input.
#
# @return [StorageAdapters::BaseAdapter] presumably the value of `with_new_tmp_path` — confirm against the adapter.
#
# @see #requisite_files
def build_step(out_file:, in_tmp_path:, **)
  out_file.with_new_tmp_path do |tmp_path_for_output|
    convert_to_coordinates(
      path_to_hocr: in_tmp_path,
      path_to_coordinate: tmp_path_for_output
    )
  end
end
|
||
private | ||
|
||
##
# Read the HOCR document at `path_to_hocr`, convert it with `service`, and write the
# result (newline-terminated, because it is written with `puts`) to `path_to_coordinate`.
#
# @param path_to_hocr [String] path of the HOCR file to read.
# @param path_to_coordinate [String] path of the file to create or overwrite.
# @param service [#call] receives the HOCR text and returns the content to write.
#
# @return [nil]
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
  hocr_html = File.read(path_to_hocr)
  # Write-only mode is sufficient: nothing is read back through this handle.
  File.open(path_to_coordinate, "w") do |file|
    file.puts service.call(hocr_html)
  end
end
end | ||
end | ||
end |
218 changes: 218 additions & 0 deletions
218
lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'forwardable' | ||
require 'json' | ||
require 'nokogiri' | ||
|
||
module DerivativeRodeo | ||
module Services | ||
## | ||
# Responsible for converting an SGML string into JSON coordinates | ||
class ExtractWordCoordinatesFromHocrSgmlService | ||
##
# Convenience entry point: build a service for the given document and serialize it.
#
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
# @return [String] A JSON document
def self.call(sgml)
  service = new(sgml)
  service.to_json
end
|
||
##
# Parse the given hOCR document right away, feeding it through a SAX document stream.
#
# @param html [String] either the markup itself or the path of a file containing it;
#   the choice between the two is made by #xml?.
def initialize(html)
  @source = if xml?(html)
              html
            else
              File.read(html)
            end
  @doc_stream = DocStream.new
  Nokogiri::HTML::SAX::Parser.new(@doc_stream).parse(@source)
end
attr_reader :doc_stream, :source
|
||
# Expose the parsed stream's results directly on the service. This uses the standard
# library's Forwardable (already required at the top of this file) instead of
# ActiveSupport's `delegate` macro, which this file does not load.
extend Forwardable
def_delegators :doc_stream, :text, :width, :height, :words
|
||
# Serialize the parsed words and page dimensions as JSON. The result is memoized.
#
# Extra arguments are accepted and ignored so that the object can also be handed to
# the `json` library's generators, which call `to_json` with a generator state.
#
# @return [String] JSON serialization of flattened word coordinates
def to_json(*_args)
  @to_json ||= WordCoordinates.to_json(
    words: doc_stream.words,
    width: doc_stream.width,
    height: doc_stream.height
  )
end
alias json to_json
|
||
private | ||
|
||
# @param xml [String] candidate markup (or a file path).
# @return [Boolean] true when, ignoring leading whitespace, the string begins with "<".
def xml?(xml)
  xml.lstrip[0] == '<'
end
|
||
# SAX Document Stream class to gather text and word tokens from hOCR | ||
class DocStream < Nokogiri::XML::SAX::Document | ||
attr_accessor :text, :words, :width, :height | ||
|
||
def initialize
  super()
  # Plain text accumulated while parsing.
  @text = ''
  # One Hash per recognized word, holding the word and its coordinates.
  @words = []
  # Page dimensions; filled in when the hOCR `div.ocr_page` element is seen.
  @width = @height = nil
  # State of the word currently being read; shared by #start_element,
  # #characters and #end_element so a word can be paired with its coordinates.
  @current = nil
  # Class name recorded by #start_element for later use by #end_element.
  @element_class_name = nil
end
|
||
# Extract a word's position and size from its hOCR `title` attribute.
#
# The `bbox x1 y1 x2 y2` property is looked up by name among the semicolon-separated
# title properties, so it does not have to be the first one.
#
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
# @return [Array<Integer>] `[x, y, width, height]` in px, or an empty Array when the
#   title is missing or carries no complete `bbox` property.
def s_coords(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox_property = properties.find { |property| property.split.first == 'bbox' }
  values = bbox_property.to_s.split.drop(1).first(4).map(&:to_i)
  # An incomplete box cannot be turned into coordinates.
  return [] unless values.size == 4

  x1, y1, x2, y2 = values
  [x1, y1, x2 - x1, y2 - y1]
end
|
||
# Decide whether an element is one of the three this stream acts upon:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] Element name
# @param class_name [String] HTML class name
# @return [Boolean] true if element should be processed; otherwise false
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word'
    true
  else
    false
  end
end
|
||
# Begin tracking a new word: reset the in-progress state and store its coordinates.
#
# @param attrs [Hash] attributes of the word element.
def start_word(attrs)
  # :word stays nil until #characters supplies the text.
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
|
||
# Record the page dimensions from the `title` attribute of a page element.
#
# The `bbox` property is looked up by name among the semicolon-separated title
# properties instead of being assumed to sit in second position. When the title is
# missing or has no `bbox`, the dimensions are left untouched.
#
# @param attrs [Hash] attributes of the page element.
def start_page(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox_property = properties.find { |property| property.split.first == 'bbox' }
  return if bbox_property.nil?

  bbox = bbox_property.split.drop(1).map(&:to_i)
  # width and height are taken from the third and fourth bbox values:
  @width = bbox[2]
  @height = bbox[3]
end
|
||
# Whether the in-progress word has both non-blank text and a full set of four coordinates.
#
# Written with the standard library only (no ActiveSupport `present?`).
#
# @return [Boolean]
def word_complete?
  return false if @current.nil?

  word = @current[:word]
  coords = @current[:coordinates]
  # A word made only of whitespace counts as blank.
  has_text = !word.nil? && word.match?(/[^[:space:]]/)
  has_text && !coords.nil? && coords.size == 4
end
|
||
# Finish the current word: separate it from what follows and keep it if complete.
def end_word
  # A single space goes between words in the plain text buffer.
  @text = @text + ' '
  return unless word_complete?

  @words.push(@current)
end
|
||
# Finish a line of plain text: trim surrounding whitespace from the buffer and
# terminate it with a line break.
#
# A new String is built instead of calling `strip!`, because the buffer may still be
# the frozen `''` literal it was initialized with (the file enables frozen string
# literals), and mutating that in place raises FrozenError.
def end_line
  @text = "#{@text.strip}\n"
end
|
||
# SAX callback fired when an element opens. The element's class name is always
# recorded; beyond that, only these elements trigger any work:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  attributes = attrs.to_h
  html_class = attributes['class']
  @element_class_name = html_class
  return unless consider?(name, html_class)

  start_word(attributes) if html_class == 'ocrx_word'
  start_page(attributes) if html_class == 'ocr_page'
end
|
||
# SAX callback for character data. Text is ignored until a word with coordinates
# has been started; after that it is appended to both the current word and the
# plain text buffer.
#
# @param value [String] the character data.
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?

  @current[:word] = (@current[:word] || '') + value
  @text = @text + value
end
|
||
# SAX callback fired when an element closes; flushes the current word or ends the
# current plain-text line, depending on the recorded class name.
#
# NOTE(review): @element_class_name is assigned on every #start_element call, so it
# names the most recently *opened* element, which is not necessarily the element
# being closed here — confirm this is the intended behaviour.
#
# @param _name [String] element name.
def end_element(_name)
  last_opened = @element_class_name
  end_line if last_opened == 'ocr_line'
  end_word if last_opened == 'ocrx_word'
end
|
||
# Callback for completion of parsing hOCR, used to normalize the generated text:
# carriage returns are removed, every line is stripped of surrounding whitespace,
# and empty lines (including leading and trailing ones) are dropped.
def end_document
  # `String#delete` returns a new String, so its result must be used for the
  # carriage returns to actually disappear.
  lines = @text.delete("\r").split("\n").map(&:strip).reject(&:empty?)
  @text = lines.join("\n")
end
end | ||
|
||
##
# Serializes recognized words, grouped by their text, together with the dimensions
# of the "canvas" on which they appear.
class WordCoordinates
  ##
  # @api public
  #
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  #
  # @return [String] a JSON encoded string.
  def self.to_json(words:, width: nil, height: nil)
    new(words: words, width: width, height: height).to_json
  end

  def initialize(words:, width:, height:)
    @words = words
    @width = width
    @height = height
  end
  attr_reader :words, :width, :height

  # Output JSON flattened word coordinates: a document with the keys `width`,
  # `height` and `coords`, where `coords` maps each distinct word to the list of
  # coordinates at which it occurs (in order of appearance).
  #
  # Extra arguments are accepted and ignored so that instances can be passed to the
  # `json` library's generators, which call `to_json` with a generator state.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json(*_args)
    coordinates = words.each_with_object({}) do |word, grouped|
      (grouped[word[:word]] ||= []) << word[:coordinates]
    end
    payload = { width: width, height: height, coords: coordinates }
    JSON.generate(payload)
  end
end
end | ||
end | ||
end |
22 changes: 22 additions & 0 deletions
22
spec/derivative_rodeo/generators/word_coordinates_generator_spec.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'spec_helper' | ||
|
||
# Exercises the generator end to end against a HOCR fixture file. | ||
RSpec.describe DerivativeRodeo::Generators::WordCoordinatesGenerator do | ||
describe "#generated_files" do | ||
it "derives the word coordinates from the given hocr file" do | ||
# Declared outside the block so it can still be inspected after the block returns. | ||
generated_file = nil | ||
Fixtures.with_file_uris_for("ocr_mono_text_hocr.html") do |hocr_paths, out_tmp_dir| | ||
template = "file://#{out_tmp_dir}/{{ basename }}.coordinates.json" | ||
input_uri = "file://#{hocr_paths.first}" | ||
instance = described_class.new(input_uris: [input_uri], output_target_template: template) | ||
generated_file = instance.generated_files.first | ||
json = JSON.parse(File.read(generated_file.file_path)) | ||
expect(json.keys).to match_array(["width", "height", "coords"]) | ||
expect(generated_file.exist?).to be_truthy | ||
end | ||
|
||
# NOTE(review): presumably Fixtures.with_file_uris_for removes its temporary output once the block returns — confirm. | ||
expect(generated_file.exist?).to be_falsey | ||
end | ||
end | ||
end |
35 changes: 35 additions & 0 deletions
35
spec/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service_spec.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'spec_helper' | ||
|
||
# Checks the plain text and JSON produced by the service for a HOCR fixture file. | ||
RSpec.describe DerivativeRodeo::Services::ExtractWordCoordinatesFromHocrSgmlService do | ||
# NOTE(review): `minimal_path` is not defined in this spec, and none of the examples below reference | ||
# `minimal`, `reader_minimal` or `reader_minimal_path`; these look like leftovers — confirm and remove. | ||
let(:minimal) { File.read(minimal_path) } | ||
|
||
let(:reader_minimal) { described_class.new(minimal) } | ||
let(:reader_minimal_path) { described_class.new(minimal_path) } | ||
|
||
let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) } | ||
let(:hocr) { described_class.new(xml) } | ||
subject { hocr } | ||
|
||
# I want to deprecate this, but for now, it's here. | ||
it { is_expected.to respond_to(:json) } | ||
|
||
describe '#text' do | ||
# NOTE(review): identical to the outer `let(:xml)`; this redefinition appears redundant. | ||
let(:xml) { File.read(Fixtures.path_for('ocr_mono_text_hocr.html')) } | ||
subject { hocr.text } | ||
|
||
it 'outputs plain text' do | ||
# Pins the first 40 characters and the total length of the text extracted from the fixture. | ||
expect(subject.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. " | ||
expect(subject.size).to eq 831 | ||
end | ||
end | ||
|
||
describe '#to_json' do | ||
subject { hocr.to_json } | ||
it 'outputs JSON that includes coords key' do | ||
parsed = JSON.parse(subject) | ||
# More than one distinct word is expected under "coords". | ||
expect(parsed['coords'].length).to be > 1 | ||
end | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters