Merge pull request #111 from swisstopo/LGD-564/optimize_a_above_b_sidebar_extractor

Lgd 564/optimize a above b sidebar extractor
swisstopo · Jan 27, 2025 · 763cd05 · 763cd05 · github-actions · Jan 27, 2025
2 parents a31eafa + abc93fd
commit 763cd05
Show file tree

Hide file tree

Showing 12 changed files with 230 additions and 92 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,8 @@ dependencies = [
     "PyMuPDF>=1.23.26",
     "opencv-python-headless",
     "quads>=1.1.0",
-    "numpy<2"
+    "numpy<2",
+    "rtree"
 ]
 
 [project.optional-dependencies]

diff --git a/src/stratigraphy/depth/interval.py b/src/stratigraphy/depth/interval.py
@@ -143,6 +143,9 @@ class AToBInterval(Interval):
     def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry):
         super().__init__(start, end)
 
+    def __repr__(self):
+        return f"({self.start}, {self.end})"
+
     @property
     def rect(self) -> fitz.Rect:
         """Get the rectangle surrounding the interval."""

diff --git a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py
@@ -4,7 +4,6 @@
 from dataclasses import dataclass
 
 import fitz
-from stratigraphy.lines.line import TextWord
 from stratigraphy.sidebar import Sidebar
 
 
@@ -14,13 +13,12 @@ class MaterialDescriptionRectWithSidebar:
 
     sidebar: Sidebar | None
     material_description_rect: fitz.Rect
+    noise_count: int = 0
 
-    def score_match(self, all_words: list[TextWord] | None = None) -> float:
+    @property
+    def score_match(self) -> float:
         """Scores the match between a sidebar and a material description.
 
-        Args:
-            all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None.
-
         Returns:
             float: The score of the match. Better matches have a higher score value.
         """
@@ -36,5 +34,4 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float:
 
         height = bottom - top
 
-        noise_count = self.sidebar.noise_count(all_words) if all_words else 0
-        return (height - distance) * math.pow(0.8, 10 * noise_count / len(self.sidebar.entries))
+        return (height - distance) * math.pow(0.8, 10 * self.noise_count / len(self.sidebar.entries))
diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
@@ -4,6 +4,7 @@
 from dataclasses import dataclass
 
 import fitz
+import rtree
 
 from stratigraphy.data_extractor.data_extractor import FeatureOnPage
 from stratigraphy.depth import AAboveBInterval, Interval
@@ -19,6 +20,7 @@
     LayerIdentifierSidebarExtractor,
     Sidebar,
 )
+from stratigraphy.sidebar.sidebar import SidebarNoise, noise_count
 from stratigraphy.text.find_description import (
     get_description_blocks,
     get_description_lines,
@@ -75,37 +77,47 @@ def process_page(
             )
 
     if material_descriptions_sidebar_pairs:
-        material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match())
+        material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match)
 
     # If there is a layer identifier sidebar, then we use this directly.
     # Else, we search for sidebars with depths.
     # We could also think of some scoring mechanism to decide which one to use.
     if not material_descriptions_sidebar_pairs:
         words = sorted([word for line in lines for word in line.words], key=lambda word: word.rect.y0)
-        a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words)
+        word_rtree = rtree.index.Index()
+        for word in words:
+            word_rtree.insert(id(word), (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1), obj=word)
 
+        a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words)
         used_entry_rects = []
         for column in a_to_b_sidebars:
             for entry in column.entries:
                 used_entry_rects.extend([entry.start.rect, entry.end.rect])
 
-        sidebars: list[Sidebar] = a_to_b_sidebars
-        sidebars.extend(
+        # create sidebars with noise count
+        sidebars_noise: list[SidebarNoise] = [
+            SidebarNoise(sidebar=sidebar, noise_count=noise_count(sidebar, word_rtree)) for sidebar in a_to_b_sidebars
+        ]
+        sidebars_noise.extend(
             AAboveBSidebarExtractor.find_in_words(
-                words, used_entry_rects, sidebar_params=params["depth_column_params"]
+                words, word_rtree, used_entry_rects, sidebar_params=params["depth_column_params"]
             )
         )
 
-        for sidebar in sidebars:
+        for sidebar_noise in sidebars_noise:
             material_description_rect = find_material_description_column(
-                lines, sidebar, language, **params["material_description"]
+                lines, sidebar_noise.sidebar, language, **params["material_description"]
             )
             if material_description_rect:
                 material_descriptions_sidebar_pairs.append(
-                    MaterialDescriptionRectWithSidebar(sidebar, material_description_rect)
+                    MaterialDescriptionRectWithSidebar(
+                        sidebar=sidebar_noise.sidebar,
+                        material_description_rect=material_description_rect,
+                        noise_count=sidebar_noise.noise_count,
+                    )
                 )
         # lowest score first
-        material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match(words))
+        material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match)
 
     to_delete = []
     for i, pair in enumerate(material_descriptions_sidebar_pairs):
@@ -440,7 +452,7 @@ def is_below(best_x0, best_y1, line):
     if sidebar:
         return max(
             candidate_rects,
-            key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match(),
+            key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match,
         )
     else:
         return candidate_rects[0]
diff --git a/src/stratigraphy/lines/geometric_line_utilities.py b/src/stratigraphy/lines/geometric_line_utilities.py
@@ -195,7 +195,7 @@ def _are_parallel(line1: Line, line2: Line, angle_threshold: float) -> bool:
     Args:
         line1 (Line): The first line.
         line2 (Line): The second line.
-        angle_threshold (float, optional): The acceptable difference between the slopes of the lines.
+        angle_threshold (float, optional): The acceptable difference between the angles of the lines in degrees.
 
     Returns:
         bool: True if the lines are parallel, False otherwise.

diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py
@@ -11,6 +11,7 @@
 
 __all__ = [
     "Sidebar",
+    "SidebarNoise",
     "AAboveBSidebar",
     "AAboveBSidebarExtractor",
     "AAboveBSidebarValidator",

diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py
@@ -45,7 +45,7 @@ def strictly_contains(self, other: AAboveBSidebar) -> bool:
         )
 
     def is_strictly_increasing(self) -> bool:
-        return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
+        return all(self.entries[i].value < self.entries[i + 1].value for i in range(len(self.entries) - 1))
 
     def depth_intervals(self) -> list[AAboveBInterval]:
         """Creates a list of depth intervals from the depth column entries.
@@ -94,11 +94,15 @@ def pearson_correlation_coef(self) -> float:
         positions = np.array([entry.rect.y1 for entry in self.entries])
         entries = np.array([entry.value for entry in self.entries])
 
-        # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0.
-        if np.std(entries) == 0 or np.std(positions) == 0:
+        std_positions = np.std(positions)
+        std_entries = np.std(entries)
+        if std_positions == 0 or std_entries == 0:
             return 0
 
-        return np.corrcoef(positions, entries)[0, 1].item()
+        # We calculate the Pearson correlation coefficient manually
+        # to avoid redundant standard deviation calculations that would occur with np.corrcoef.
+        covariance = np.mean((positions - np.mean(positions)) * (entries - np.mean(entries)))
+        return covariance / (std_positions * std_entries)
 
     def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None:
         if len(self.entries) < 3:

diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py
@@ -1,13 +1,15 @@
 """Module for finding AAboveBSidebar instances in a borehole profile."""
 
 import fitz
+import rtree
 
 from stratigraphy.depth import DepthColumnEntryExtractor
 from stratigraphy.lines.line import TextWord
 
 from .a_above_b_sidebar import AAboveBSidebar
 from .a_above_b_sidebar_validator import AAboveBSidebarValidator
 from .cluster import Cluster
+from .sidebar import SidebarNoise, noise_count
 from .sidebarentry import DepthColumnEntry
 
 
@@ -16,17 +18,21 @@ class AAboveBSidebarExtractor:
 
     @staticmethod
     def find_in_words(
-        all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict
-    ) -> list[AAboveBSidebar]:
+        all_words: list[TextWord],
+        word_rtree: rtree.index.Index,
+        used_entry_rects: list[fitz.Rect],
+        sidebar_params: dict,
+    ) -> list[SidebarNoise]:
         """Construct all possible AAboveBSidebar objects from the given words.
 
         Args:
             all_words (list[TextWord]): All words in the page.
+            word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries.
             used_entry_rects (list[fitz.Rect]): Part of the document to ignore.
             sidebar_params (dict): Parameters for the AAboveBSidebar objects.
 
         Returns:
-            list[AAboveBSidebar]: Found AAboveBSidebar objects.
+            list[SidebarNoise]: Validated AAboveBSidebar objects wrapped with noise count.
         """
         entries = [
             entry
@@ -36,7 +42,7 @@ def find_in_words(
         clusters = Cluster[DepthColumnEntry].create_clusters(entries)
 
         numeric_columns = [AAboveBSidebar(cluster.entries) for cluster in clusters if len(cluster.entries) >= 3]
-        sidebar_validator = AAboveBSidebarValidator(all_words, **sidebar_params)
+        sidebar_validator = AAboveBSidebarValidator(**sidebar_params)
 
         filtered_columns = [
             column
@@ -45,18 +51,27 @@ def find_in_words(
             if not column.close_to_arithmetic_progression()
         ]
 
-        validated_sidebars = [sidebar_validator.reduce_until_valid(column) for column in filtered_columns]
+        sidebar_validator = AAboveBSidebarValidator(**sidebar_params)
+
+        def process_column(column):
+            noise = noise_count(column, word_rtree)
+            sidebar_noise = SidebarNoise(sidebar=column, noise_count=noise)
+            return sidebar_validator.reduce_until_valid(sidebar_noise, word_rtree)
+
+        validated_sidebars = list(filter(None, map(process_column, filtered_columns)))
 
         sidebars_by_length = sorted(
-            [sidebar for sidebar in validated_sidebars if sidebar],
-            key=lambda sidebar: len(sidebar.entries),
+            [sidebar_noise for sidebar_noise in validated_sidebars if sidebar_noise.sidebar],
+            key=lambda sidebar_noise: len(sidebar_noise.sidebar.entries),
             reverse=True,
         )
 
         result = []
-        # Remove columns that are fully contained in a longer column
-        for sidebar in sidebars_by_length:
-            if not any(result_sidebar.rect().contains(sidebar.rect()) for result_sidebar in result):
-                result.append(sidebar)
+        # Remove sidebar_noise that are fully contained in a longer sidebar
+        for sidebar_noise in sidebars_by_length:
+            if not any(
+                result_sidebar.sidebar.rect().contains(sidebar_noise.sidebar.rect()) for result_sidebar in result
+            ):
+                result.append(sidebar_noise)
 
         return result
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	172	172	0%	3–458
get_files.py	19	19	0%	3–47
main.py	126	126	0%	3–333
src/stratigraphy/benchmark
metrics.py	59	42	29%	22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
data_extractor.py	76	4	95%	32, 45, 120, 164
utility.py	6	3	50%	28–36
src/stratigraphy/depth
a_to_b_interval_extractor.py	37	15	59%	41–60, 79, 92
depthcolumnentry_extractor.py	23	2	91%	45–46
interval.py	103	53	49%	25–28, 33–36, 42, 48, 52, 91–137, 147, 161, 167–183
src/stratigraphy/depths_materials_column_pairs
bounding_boxes.py	30	10	67%	23, 32, 50, 60, 72–78
material_description_rect_with_sidebar.py	18	7	61%	25–37
src/stratigraphy/evaluation
evaluation_dataclasses.py	49	11	78%	52, 71–74, 90, 104, 125–131, 147
groundwater_evaluator.py	48	1	98%	77
layer_evaluator.py	66	46	30%	29–30, 35–39, 47, 69–95, 105–113, 128–149
metadata_evaluator.py	37	14	62%	46–65, 86–93
utility.py	16	7	56%	43–52
src/stratigraphy/groundwater
groundwater_extraction.py	146	90	38%	52, 94, 137–148, 180–184, 199–215, 226–314, 335–363
utility.py	42	36	14%	10–17, 30–50, 62–76, 91–105
src/stratigraphy/layer
layer.py	37	13	65%	26, 29, 37, 52–72
src/stratigraphy/lines
geometric_line_utilities.py	86	2	98%	81, 131
line.py	51	4	92%	25, 50, 60, 110
linesquadtree.py	46	1	98%	75
src/stratigraphy/metadata
coordinate_extraction.py	106	4	96%	29, 93–94, 106
elevation_extraction.py	90	60	33%	34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 234–238
language_detection.py	18	13	28%	17–23, 37–45
metadata.py	66	24	64%	27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
a_above_b_sidebar.py	99	39	61%	40, 43, 58–66, 71, 78, 100, 108–115, 145–146, 188–229
a_above_b_sidebar_validator.py	51	27	47%	51, 54, 57, 59, 85–96, 122–145, 157–161
a_to_b_sidebar.py	43	14	67%	36, 49–50, 67, 95–108
layer_identifier_sidebar.py	46	29	37%	46–65, 81–97, 109, 122
layer_identifier_sidebar_extractor.py	29	20	31%	31–41, 55–75
sidebar.py	53	5	91%	42, 47, 68, 80, 83
sidebarentry.py	20	3	85%	27, 31, 43
src/stratigraphy/text
description_block_splitter.py	70	2	97%	24, 139
extract_text.py	29	3	90%	19, 53–54
find_description.py	41	8	80%	26–34, 111–114
textblock.py	90	11	88%	22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
dataclasses.py	32	3	91%	37–39
predictions.py	72	34	53%	72, 95–115, 143–187
util.py	34	12	65%	69–76, 90–92, 116–117
TOTAL	2428	990	59%