Merge pull request #113 from swisstopo/LGD-564/improve-OCR-mistake-handling

Lgd 564/improve ocr mistake handling
swisstopo · Feb 3, 2025 · 6066499 · 6066499 · github-actions · Feb 3, 2025
2 parents 763cd05 + 5b60643
commit 6066499
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 84 deletions.
diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py
@@ -4,6 +4,7 @@
 
 import statistics
 from dataclasses import dataclass
+from itertools import product
 
 import fitz
 import numpy as np
@@ -116,19 +117,31 @@ def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None:
 
     def make_ascending(self):
         median_value = np.median(np.array([entry.value for entry in self.entries]))
-        for i, entry in enumerate(self.entries):
-            if entry.value.is_integer() and entry.value > median_value:
-                factor100_value = entry.value / 100
-                previous_ok = i == 0 or all(entry.value < factor100_value for entry in self.entries[:i])
-                next_ok = i + 1 == len(self.entries) or factor100_value < self.entries[i + 1].value
 
-                if previous_ok and next_ok:
-                    # Create a new entry instead of modifying the value of the current one, as this entry might be
-                    # used in different sidebars as well.
-                    self.entries[i] = DepthColumnEntry(rect=entry.rect, value=factor100_value)
+        for i, entry in enumerate(self.entries):
+            new_values = []
 
+            if entry.value.is_integer() and entry.value > median_value:
+                new_values.extend([entry.value / 100, entry.value / 10])
+
+            # Correct common OCR mistakes where "4" is recognized instead of "1"
+            # We don't control for OCR mistakes recognizing "9" as "3" (example zurich/680244005-bp.pdf)
+            if "4" in str(entry.value) and not self._valid_value(i, entry.value):
+                new_values.extend(generate_alternatives(entry.value))
+
+            # Assign the first valid correction
+            for new_value in new_values:
+                if self._valid_value(i, new_value):
+                    self.entries[i] = DepthColumnEntry(rect=entry.rect, value=new_value)
+                    break
         return self
 
+    def _valid_value(self, index: int, new_value: float) -> bool:
+        """Check if new value at given index is maintaining ascending order."""
+        previous_ok = index == 0 or all(other_entry.value < new_value for other_entry in self.entries[:index])
+        next_ok = index + 1 == len(self.entries) or new_value < self.entries[index + 1].value
+        return previous_ok and next_ok
+
     def break_on_double_descending(self) -> list[AAboveBSidebar]:
         segments = []
         segment_start = 0
@@ -227,3 +240,15 @@ def identify_groups(
             groups.append(IntervalBlockGroup(depth_intervals=current_intervals, blocks=current_blocks))
 
         return groups
+
+
+def generate_alternatives(value: float) -> list[float]:
+    """Generate a list of all possible alternatives by replacing each '4' with '1'."""
+    value_str = str(value)
+    alternatives = []
+    options = [(char if char != "4" else ["4", "1"]) for char in value_str]
+
+    for combo in product(*options):
+        alternatives.append(float("".join(combo)))
+
+    return alternatives
diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py
@@ -42,7 +42,6 @@ def find_in_words(
         clusters = Cluster[DepthColumnEntry].create_clusters(entries)
 
         numeric_columns = [AAboveBSidebar(cluster.entries) for cluster in clusters if len(cluster.entries) >= 3]
-        sidebar_validator = AAboveBSidebarValidator(**sidebar_params)
 
         filtered_columns = [
             column

diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py
@@ -6,7 +6,6 @@
 
 from .a_above_b_sidebar import AAboveBSidebar
 from .sidebar import SidebarNoise, noise_count
-from .sidebarentry import DepthColumnEntry
 
 
 @dataclasses.dataclass
@@ -82,10 +81,6 @@ def reduce_until_valid(
             if self.is_valid(sidebar_noise):
                 return sidebar_noise
 
-            corrected_sidebar_noise = self.correct_OCR_mistakes(sidebar_noise, word_rtree)
-            if corrected_sidebar_noise:
-                return corrected_sidebar_noise
-
             new_sidebar = sidebar_noise.sidebar.remove_entry_by_correlation_gradient()
             if not new_sidebar:
                 return None
@@ -94,68 +89,3 @@ def reduce_until_valid(
             sidebar_noise = SidebarNoise(sidebar=new_sidebar, noise_count=new_noise_count)
 
         return None
-
-    def correct_OCR_mistakes(self, sidebar_noise: SidebarNoise, word_rtree: rtree.index.Index) -> SidebarNoise | None:
-        """Corrects OCR mistakes in the Sidebar entries.
-
-        Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
-        highest pearson correlation coefficient is selected and checked for validity.
-
-        This is useful if one or more entries have an OCR mistake, and the column is not valid because of it.
-
-        Currently, there is no limit on the number of corrections per depth column. Indeed, there are examples of depth
-        columns with multiple OCR errors on different depth values. On the other hand, allowing an unlimited number of
-        corrections increases the risk, that a random column of different values is incorrectly accepted as a depth
-        column after making the corrections, especially if the column has a low number of entries. A more robust
-        solution might be to allow corrections on less than 50% of all entries, or something similar. However, we
-        currently don't have enough examples to properly tune this parameter.
-
-        Note: Common mistakes should be extended as needed.
-
-        Args:
-            sidebar_noise (SidebarNoise): The SidebarNoise wrapping the sidebar to validate.
-            word_rtree (index.Index): R-tree of all words on page for efficient spatial queries.
-
-        Returns:
-            SidebarNoise | None: The corrected SidebarNoise, or None if no correction was possible.
-        """
-        sidebar = sidebar_noise.sidebar
-        new_columns = [AAboveBSidebar(entries=[])]
-
-        for entry in sidebar.entries:
-            new_columns = [
-                AAboveBSidebar([*column.entries, DepthColumnEntry(entry.rect, new_value)])
-                for column in new_columns
-                for new_value in _value_alternatives(entry.value)
-            ]
-            # Immediately require strictly increasing values, to avoid exponential complexity when many implausible
-            # alternative values are suggested
-            new_columns = [column for column in new_columns if column.is_strictly_increasing()]
-
-        if new_columns:
-            best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
-            new_noise_count = noise_count(best_column, word_rtree)
-
-            # We require a higher correlation coefficient when corrections are made
-            if self.is_valid(
-                SidebarNoise(sidebar=best_column, noise_count=new_noise_count), corr_coef_threshold=0.999
-            ):
-                return SidebarNoise(sidebar=best_column, noise_count=new_noise_count)
-
-        return None
-
-
-def _value_alternatives(value: float) -> set[float]:
-    """Corrects frequent OCR errors in depth column entries.
-
-    Args:
-        value (float): The depth values to find plausible alternatives for
-
-    Returns:
-        set(float): all plausible values (including the original one)
-    """
-    alternatives = {value}
-    # In older documents, OCR sometimes mistakes 1 for 4
-    alternatives.add(float(str(value).replace("4", "1")))
-
-    return alternatives
diff --git a/tests/test_aabovebsidebar.py b/tests/test_aabovebsidebar.py
@@ -2,6 +2,7 @@
 
 import fitz
 from stratigraphy.sidebar import AAboveBSidebar
+from stratigraphy.sidebar.a_above_b_sidebar import generate_alternatives
 from stratigraphy.sidebar.sidebarentry import DepthColumnEntry
 
 
@@ -47,20 +48,48 @@ def test_aabovebsidebar_makeascending():  # noqa: D103
 
     def test(in_values, out_values):
         sidebar = AAboveBSidebar([DepthColumnEntry(fitz.Rect(), value=value) for value in in_values])
-        assert [
-            entry.value for entry in sidebar.make_ascending().entries
-        ] == out_values, "The depth values from the sidebar are not converted correctly"
+        result = [entry.value for entry in sidebar.make_ascending().entries]
+        assert result == out_values, f"Expected {out_values}, but got {result}"
 
+    # Basic transformation for values greater than the median, correct by factor 100
     test([1.0, 200.0, 3.0], [1.0, 2.0, 3.0])
     test([100.0, 2.0, 3.0], [1.0, 2.0, 3.0])
     test([1.0, 2.0, 300.0], [1.0, 2.0, 3.0])
-    test([1.0, 200.0, 300.0, 4.0, 5.0, 6.0, 100.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0])
+
+    # Basic transformation for values greater than the median, correct by factor 10
+    test([1.0, 20.0, 300.0], [1.0, 20.0, 30.0])
+    test([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0])
     test([100.0, 200.0, 300.0], [100.0, 200.0, 300.0])
 
+    ## Transforming OCR mistakes
+    test([0.5, 4.0, 2.0, 5.0], [0.5, 1.0, 2.0, 5.0])
+    test([4.0, 4.4, 4.4, 5.0], [4.0, 4.1, 4.4, 5.0])
+
     # ensure a "noise" value "0.0" does not influence the result
     test([1.0, 2.0, 3.0, 0.0, 4.0], [1.0, 2.0, 3.0, 0.0, 4.0])
 
 
+def test_generate_alternatives():
+    """Test generate_alternatives function for alternative options to OCR mistakes."""
+    assert generate_alternatives(4) == [4, 1]
+    assert generate_alternatives(14) == [14, 11]
+    assert generate_alternatives(441) == [441, 411, 141, 111]
+    assert generate_alternatives(123) == [123]
+    assert generate_alternatives(4.4) == [4.4, 4.1, 1.4, 1.1]
+
+
+def test_valid_value():
+    """Test _valid_value helper function for make_ascending method of the AAboveBSidebar class."""
+    entries = [DepthColumnEntry(None, 1), DepthColumnEntry(None, 2), DepthColumnEntry(None, 3)]
+    sidebar = AAboveBSidebar(entries)
+
+    assert sidebar._valid_value(1, 2) is True
+    assert sidebar._valid_value(1, 3) is False
+    assert sidebar._valid_value(1, 1.5) is True
+    assert sidebar._valid_value(0, 2) is False
+    assert sidebar._valid_value(2, 3.5) is True
+
+
 def test_aabovebsidebar_isstrictlyincreasing():  # noqa: D103
     """Test the is_strictly_increasing method of the AAboveBSidebar class."""
     # Case 1: Strictly increasing values
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	172	172	0%	3–458
get_files.py	19	19	0%	3–47
main.py	126	126	0%	3–333
src/stratigraphy/benchmark
metrics.py	59	42	29%	22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
data_extractor.py	76	4	95%	32, 45, 120, 164
utility.py	6	3	50%	28–36
src/stratigraphy/depth
a_to_b_interval_extractor.py	37	15	59%	41–60, 79, 92
depthcolumnentry_extractor.py	23	2	91%	45–46
interval.py	103	53	49%	25–28, 33–36, 42, 48, 52, 91–137, 147, 161, 167–183
src/stratigraphy/depths_materials_column_pairs
bounding_boxes.py	30	10	67%	23, 32, 50, 60, 72–78
material_description_rect_with_sidebar.py	18	7	61%	25–37
src/stratigraphy/evaluation
evaluation_dataclasses.py	49	11	78%	52, 71–74, 90, 104, 125–131, 147
groundwater_evaluator.py	48	1	98%	77
layer_evaluator.py	66	46	30%	29–30, 35–39, 47, 69–95, 105–113, 128–149
metadata_evaluator.py	37	14	62%	46–65, 86–93
utility.py	16	7	56%	43–52
src/stratigraphy/groundwater
groundwater_extraction.py	146	90	38%	52, 94, 137–148, 180–184, 199–215, 226–314, 335–363
utility.py	42	36	14%	10–17, 30–50, 62–76, 91–105
src/stratigraphy/layer
layer.py	37	13	65%	26, 29, 37, 52–72
src/stratigraphy/lines
geometric_line_utilities.py	86	2	98%	81, 131
line.py	51	4	92%	25, 50, 60, 110
linesquadtree.py	46	1	98%	75
src/stratigraphy/metadata
coordinate_extraction.py	106	4	96%	29, 93–94, 106
elevation_extraction.py	90	60	33%	34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 234–238
language_detection.py	18	13	28%	17–23, 37–45
metadata.py	66	24	64%	27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
a_above_b_sidebar.py	114	39	66%	41, 44, 59–67, 72, 79, 101, 109–116, 158–159, 201–242
a_above_b_sidebar_validator.py	31	10	68%	50, 53, 56, 58, 84–91
a_to_b_sidebar.py	43	14	67%	36, 49–50, 67, 95–108
layer_identifier_sidebar.py	46	29	37%	46–65, 81–97, 109, 122
layer_identifier_sidebar_extractor.py	29	20	31%	31–41, 55–75
sidebar.py	53	5	91%	42, 47, 68, 80, 83
sidebarentry.py	20	3	85%	27, 31, 43
src/stratigraphy/text
description_block_splitter.py	70	2	97%	24, 139
extract_text.py	29	3	90%	19, 53–54
find_description.py	41	8	80%	26–34, 111–114
textblock.py	90	11	88%	22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
dataclasses.py	32	3	91%	37–39
predictions.py	72	34	53%	72, 95–115, 143–187
util.py	34	12	65%	69–76, 90–92, 116–117
TOTAL	2422	973	60%