Skip to content

Commit

Permalink
Merge pull request #113 from swisstopo/LGD-564/improve-OCR-mistake-ha…
Browse files Browse the repository at this point in the history
…ndling

Lgd 564/improve ocr mistake handling
  • Loading branch information
lhaibach authored Feb 3, 2025
2 parents 763cd05 + 5b60643 commit 6066499
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 84 deletions.
43 changes: 34 additions & 9 deletions src/stratigraphy/sidebar/a_above_b_sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import statistics
from dataclasses import dataclass
from itertools import product

import fitz
import numpy as np
Expand Down Expand Up @@ -116,19 +117,31 @@ def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None:

def make_ascending(self):
# Rewrite entry values that look like OCR misreads so that the values ascend; returns this sidebar itself.
# NOTE(review): this text comes from a rendered diff without +/- markers or indentation. The first loop
# below looks like the removed pre-commit body and the second loop like its replacement, so presumably
# only one of them exists in the real file — confirm against the repository.
# Median of all entry values; integer-valued entries above it are the candidates for rescaling.
median_value = np.median(np.array([entry.value for entry in self.entries]))
# Loop 1: for a suspicious value, only a division by 100 is attempted.
for i, entry in enumerate(self.entries):
if entry.value.is_integer() and entry.value > median_value:
factor100_value = entry.value / 100
# The rescaled value has to exceed every earlier entry (the generator's `entry` is scoped to the
# generator expression and does not rebind the loop variable) ...
previous_ok = i == 0 or all(entry.value < factor100_value for entry in self.entries[:i])
# ... and has to stay below the entry that follows, when there is one.
next_ok = i + 1 == len(self.entries) or factor100_value < self.entries[i + 1].value

if previous_ok and next_ok:
# Create a new entry instead of modifying the value of the current one, as this entry might be
# used in different sidebars as well.
self.entries[i] = DepthColumnEntry(rect=entry.rect, value=factor100_value)
# Loop 2: collect several candidate corrections per entry and keep the first one that fits.
for i, entry in enumerate(self.entries):
new_values = []

# Rescaling candidates, tried in this order: value / 100, then value / 10.
if entry.value.is_integer() and entry.value > median_value:
new_values.extend([entry.value / 100, entry.value / 10])

# Correct common OCR mistakes where "4" is recognized instead of "1"
# We don't control for OCR mistakes recognizing "9" as "3" (example zurich/680244005-bp.pdf)
# Digit alternatives are only added when the current value does not already fit at position i.
if "4" in str(entry.value) and not self._valid_value(i, entry.value):
new_values.extend(generate_alternatives(entry.value))

# Assign the first valid correction
for new_value in new_values:
if self._valid_value(i, new_value):
# The list slot receives a fresh DepthColumnEntry; the existing entry object is left untouched.
self.entries[i] = DepthColumnEntry(rect=entry.rect, value=new_value)
break
return self

def _valid_value(self, index: int, new_value: float) -> bool:
    """Tell whether ``new_value`` placed at ``index`` keeps the entries in ascending order.

    The entry currently stored at ``index`` is not inspected; only its neighbours matter.

    Args:
        index (int): Position in ``self.entries`` at which the value would be stored.
        new_value (float): The candidate value for that position.

    Returns:
        bool: True if the candidate is strictly greater than every entry before ``index`` and strictly smaller
            than the entry directly after ``index`` (when such an entry exists).
    """
    # An empty slice (index == 0) makes all() true, so the first position needs no special case.
    above_all_previous = all(earlier.value < new_value for earlier in self.entries[:index])

    if index + 1 == len(self.entries):
        # Last position: there is no following entry to compare against.
        below_next = True
    else:
        below_next = new_value < self.entries[index + 1].value

    return above_all_previous and below_next

def break_on_double_descending(self) -> list[AAboveBSidebar]:
segments = []
segment_start = 0
Expand Down Expand Up @@ -227,3 +240,15 @@ def identify_groups(
groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks))

return groups


def generate_alternatives(value: float) -> list[float]:
    """Generate every reading of ``value`` in which any subset of its '4' characters is replaced by '1'.

    OCR sometimes recognizes a "1" as a "4". Each '4' in the string form of the value is therefore treated as
    either a '4' or a '1', and all combinations are produced.

    Args:
        value (float): The value as read by OCR.

    Returns:
        list[float]: All combinations as floats. The unchanged value always comes first and the value with
            every '4' replaced comes last; a value without any '4' yields a single-element list.
    """
    # One tuple of possible characters per position; "4" is listed before "1" so that the original reading
    # is the first combination produced by product().
    choices_per_char = [("4", "1") if char == "4" else (char,) for char in str(value)]
    return [float("".join(combination)) for combination in product(*choices_per_char)]
1 change: 0 additions & 1 deletion src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def find_in_words(
clusters = Cluster[DepthColumnEntry].create_clusters(entries)

numeric_columns = [AAboveBSidebar(cluster.entries) for cluster in clusters if len(cluster.entries) >= 3]
sidebar_validator = AAboveBSidebarValidator(**sidebar_params)

filtered_columns = [
column
Expand Down
70 changes: 0 additions & 70 deletions src/stratigraphy/sidebar/a_above_b_sidebar_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from .a_above_b_sidebar import AAboveBSidebar
from .sidebar import SidebarNoise, noise_count
from .sidebarentry import DepthColumnEntry


@dataclasses.dataclass
Expand Down Expand Up @@ -82,10 +81,6 @@ def reduce_until_valid(
if self.is_valid(sidebar_noise):
return sidebar_noise

corrected_sidebar_noise = self.correct_OCR_mistakes(sidebar_noise, word_rtree)
if corrected_sidebar_noise:
return corrected_sidebar_noise

new_sidebar = sidebar_noise.sidebar.remove_entry_by_correlation_gradient()
if not new_sidebar:
return None
Expand All @@ -94,68 +89,3 @@ def reduce_until_valid(
sidebar_noise = SidebarNoise(sidebar=new_sidebar, noise_count=new_noise_count)

return None

def correct_OCR_mistakes(self, sidebar_noise: SidebarNoise, word_rtree: rtree.index.Index) -> SidebarNoise | None:
    """Try to repair OCR mistakes in the sidebar entries so that the sidebar becomes valid.

    For every entry, the plausible alternative values (including the original one) are considered. Among all
    strictly increasing combinations, the one with the highest Pearson correlation coefficient is picked and then
    checked for validity. This helps when a column is only rejected because one or more entries were misread.

    There is currently no limit on the number of corrected entries per column, because real depth columns with
    several OCR errors exist. The downside is a higher risk that an unrelated column of numbers is accepted as a
    depth column after correction, especially when it has few entries. Restricting corrections to, for example,
    fewer than 50% of the entries might be more robust, but there are not yet enough examples to tune this.

    Note: Common mistakes should be extended as needed.

    Args:
        sidebar_noise (SidebarNoise): The SidebarNoise wrapping the sidebar to validate.
        word_rtree (index.Index): R-tree of all words on page for efficient spatial queries.

    Returns:
        SidebarNoise | None: The corrected SidebarNoise, or None if no correction was possible.
    """
    candidates = [AAboveBSidebar(entries=[])]

    for entry in sidebar_noise.sidebar.entries:
        extended = []
        for column in candidates:
            for new_value in _value_alternatives(entry.value):
                longer_column = AAboveBSidebar([*column.entries, DepthColumnEntry(entry.rect, new_value)])
                # Immediately require strictly increasing values, to avoid exponential complexity when many
                # implausible alternative values are suggested
                if longer_column.is_strictly_increasing():
                    extended.append(longer_column)
        candidates = extended

    if not candidates:
        return None

    best_column = max(candidates, key=lambda column: column.pearson_correlation_coef())
    corrected = SidebarNoise(sidebar=best_column, noise_count=noise_count(best_column, word_rtree))

    # We require a higher correlation coefficient when corrections are made
    if self.is_valid(corrected, corr_coef_threshold=0.999):
        return corrected
    return None


def _value_alternatives(value: float) -> set[float]:
    """Return the plausible readings of a depth value, accounting for frequent OCR errors.

    Args:
        value (float): The depth value to find plausible alternatives for.

    Returns:
        set[float]: All plausible values (including the original one).
    """
    # In older documents, OCR sometimes mistakes 1 for 4
    all_fours_as_ones = float(str(value).replace("4", "1"))
    return {value, all_fours_as_ones}
37 changes: 33 additions & 4 deletions tests/test_aabovebsidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import fitz
from stratigraphy.sidebar import AAboveBSidebar
from stratigraphy.sidebar.a_above_b_sidebar import generate_alternatives
from stratigraphy.sidebar.sidebarentry import DepthColumnEntry


Expand Down Expand Up @@ -47,20 +48,48 @@ def test_aabovebsidebar_makeascending(): # noqa: D103

def test(in_values, out_values):
sidebar = AAboveBSidebar([DepthColumnEntry(fitz.Rect(), value=value) for value in in_values])
assert [
entry.value for entry in sidebar.make_ascending().entries
] == out_values, "The depth values from the sidebar are not converted correctly"
result = [entry.value for entry in sidebar.make_ascending().entries]
assert result == out_values, f"Expected {out_values}, but got {result}"

# Basic transformation for values greater than the median, correct by factor 100
test([1.0, 200.0, 3.0], [1.0, 2.0, 3.0])
test([100.0, 2.0, 3.0], [1.0, 2.0, 3.0])
test([1.0, 2.0, 300.0], [1.0, 2.0, 3.0])
test([1.0, 200.0, 300.0, 4.0, 5.0, 6.0, 100.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0])

# Basic transformation for values greater than the median, correct by factor 10
test([1.0, 20.0, 300.0], [1.0, 20.0, 30.0])
test([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0])
test([100.0, 200.0, 300.0], [100.0, 200.0, 300.0])

## Transforming OCR mistakes
test([0.5, 4.0, 2.0, 5.0], [0.5, 1.0, 2.0, 5.0])
test([4.0, 4.4, 4.4, 5.0], [4.0, 4.1, 4.4, 5.0])

# ensure a "noise" value "0.0" does not influence the result
test([1.0, 2.0, 3.0, 0.0, 4.0], [1.0, 2.0, 3.0, 0.0, 4.0])


def test_generate_alternatives():
    """Check which candidate values generate_alternatives proposes for '4'/'1' OCR confusions."""
    cases = [
        (4, [4, 1]),
        (14, [14, 11]),
        (441, [441, 411, 141, 111]),
        (123, [123]),  # no "4" present: only the original value is returned
        (4.4, [4.4, 4.1, 1.4, 1.1]),
    ]
    for value, expected in cases:
        assert generate_alternatives(value) == expected


def test_valid_value():
    """Check the _valid_value helper that make_ascending relies on, using a sidebar with values 1, 2, 3."""
    sidebar = AAboveBSidebar([DepthColumnEntry(None, depth) for depth in (1, 2, 3)])

    # (index, candidate value, expected verdict)
    cases = [
        (1, 2, True),
        (1, 3, False),  # not strictly below the following entry
        (1, 1.5, True),
        (0, 2, False),  # not strictly below the following entry
        (2, 3.5, True),  # last position: only the preceding entries matter
    ]
    for index, candidate, expected in cases:
        assert sidebar._valid_value(index, candidate) is expected


def test_aabovebsidebar_isstrictlyincreasing(): # noqa: D103
"""Test the is_strictly_increasing method of the AAboveBSidebar class."""
# Case 1: Strictly increasing values
Expand Down

1 comment on commit 6066499

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/stratigraphy
   __init__.py8188%11
   extract.py1721720%3–458
   get_files.py19190%3–47
   main.py1261260%3–333
src/stratigraphy/benchmark
   metrics.py594229%22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
   data_extractor.py76495%32, 45, 120, 164
   utility.py6350%28–36
src/stratigraphy/depth
   a_to_b_interval_extractor.py371559%41–60, 79, 92
   depthcolumnentry_extractor.py23291%45–46
   interval.py1035349%25–28, 33–36, 42, 48, 52, 91–137, 147, 161, 167–183
src/stratigraphy/depths_materials_column_pairs
   bounding_boxes.py301067%23, 32, 50, 60, 72–78
   material_description_rect_with_sidebar.py18761%25–37
src/stratigraphy/evaluation
   evaluation_dataclasses.py491178%52, 71–74, 90, 104, 125–131, 147
   groundwater_evaluator.py48198%77
   layer_evaluator.py664630%29–30, 35–39, 47, 69–95, 105–113, 128–149
   metadata_evaluator.py371462%46–65, 86–93
   utility.py16756%43–52
src/stratigraphy/groundwater
   groundwater_extraction.py1469038%52, 94, 137–148, 180–184, 199–215, 226–314, 335–363
   utility.py423614%10–17, 30–50, 62–76, 91–105
src/stratigraphy/layer
   layer.py371365%26, 29, 37, 52–72
src/stratigraphy/lines
   geometric_line_utilities.py86298%81, 131
   line.py51492%25, 50, 60, 110
   linesquadtree.py46198%75
src/stratigraphy/metadata
   coordinate_extraction.py106496%29, 93–94, 106
   elevation_extraction.py906033%34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 234–238
   language_detection.py181328%17–23, 37–45
   metadata.py662464%27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
   a_above_b_sidebar.py1143966%41, 44, 59–67, 72, 79, 101, 109–116, 158–159, 201–242
   a_above_b_sidebar_validator.py311068%50, 53, 56, 58, 84–91
   a_to_b_sidebar.py431467%36, 49–50, 67, 95–108
   layer_identifier_sidebar.py462937%46–65, 81–97, 109, 122
   layer_identifier_sidebar_extractor.py292031%31–41, 55–75
   sidebar.py53591%42, 47, 68, 80, 83
   sidebarentry.py20385%27, 31, 43
src/stratigraphy/text
   description_block_splitter.py70297%24, 139
   extract_text.py29390%19, 53–54
   find_description.py41880%26–34, 111–114
   textblock.py901188%22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
   dataclasses.py32391%37–39
   predictions.py723453%72, 95–115, 143–187
   util.py341265%69–76, 90–92, 116–117
TOTAL242297360% 

Tests Skipped Failures Errors Time
104 0 💤 0 ❌ 0 🔥 7.628s ⏱️

Please sign in to comment.