From c1188cce0fbf329f3c2afeddee5520f7fa84800d Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:22:08 +0100 Subject: [PATCH 01/16] create SidebarNoise class, function noise_count outside of class: calculates noise count for intersecting words --- src/stratigraphy/sidebar/sidebar.py | 63 ++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/stratigraphy/sidebar/sidebar.py b/src/stratigraphy/sidebar/sidebar.py index fcedaf8..2e244b4 100644 --- a/src/stratigraphy/sidebar/sidebar.py +++ b/src/stratigraphy/sidebar/sidebar.py @@ -7,6 +7,7 @@ from typing import Generic, TypeVar import fitz +import rtree from stratigraphy.depth import DepthColumnEntry from stratigraphy.lines.line import TextLine, TextWord @@ -44,24 +45,6 @@ def min_x1(self) -> float: """Get the minimum x1 value of the depth column entries.""" return min([rect.x1 for rect in self.rects()]) - def noise_count(self, all_words: list[TextWord]) -> int: - """Counts the number of words that intersect with the depth column entries. - - Returns the number of words that intersect with the depth column entries, but are not part of the depth column. - - Args: - all_words (list[TextWord]): A list of all text lines on the page. - - Returns: - int: The number of words that intersect with the depth column entries but are not part of it. - """ - - def significant_intersection(other_rect): - intersection = fitz.Rect(other_rect).intersect(self.rect()) - return intersection.is_valid and intersection.width > 0.25 * self.rect().width - - return len([word for word in all_words if significant_intersection(word.rect)]) - len(self.entries) - @abc.abstractmethod def identify_groups( self, @@ -106,3 +89,47 @@ def can_be_appended(self, rect: fitz.Rect) -> bool: ): return True return False + + +@dataclass +class SidebarNoise(Generic[EntryT]): + """Wrapper class for Sidebar to calculate noise count using intersecting words.""" + + sidebar: Sidebar[EntryT] + noise_count: int + + def __post_init__(self): + if not isinstance(self.sidebar, Sidebar): + raise TypeError(f"Expected a Sidebar instance, got {type(self.sidebar).__name__}") + + def __repr__(self): + return f"SidebarNoise(sidebar={repr(self.sidebar)}, noise_count={self.noise_count})" + + +def noise_count(sidebar: Sidebar, all_words: list[TextWord], word_rtree: rtree.index.Index) -> int: + """Counts the number of words that intersect with the Sidebar entries. + + Args: + sidebar (Sidebar): Sidebar object for which the noise count is calculated. + all_words (list[TextWord]): A list of words contained on a page. + word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries. + + Returns: + int: The number of words that intersect with the Sidebar entries but are not part of it. + """ + sidebar_rect = sidebar.rect() + intersecting_words = _get_intersecting_words(all_words, word_rtree, sidebar_rect) + + def significant_intersection(word): + intersection = fitz.Rect(word.rect).intersect(sidebar_rect) + return intersection.is_valid and intersection.width > 0.25 * sidebar_rect.width + + return sum(1 for word in filter(significant_intersection, intersecting_words)) - len(sidebar.entries) + + +def _get_intersecting_words( + all_words: list[TextWord], word_rtree: rtree.index.Index, rect: fitz.Rect() +) -> list[TextWord]: + """Retrieve all words from page intersecting with Sidebar bounding box.""" + intersecting_ids = list(word_rtree.intersection((rect.x0, rect.y0, rect.x1, rect.y1))) + return [all_words[i] for i in intersecting_ids] From f30e146b1548531a1bb70b40dc12cc85f8bb8c45 Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:22:49 +0100 Subject: [PATCH 02/16] repr for AtoInterval entries --- src/stratigraphy/depth/interval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stratigraphy/depth/interval.py b/src/stratigraphy/depth/interval.py index 7c7e7e9..7ef04f5 100644 --- a/src/stratigraphy/depth/interval.py +++ b/src/stratigraphy/depth/interval.py @@ -144,6 +144,9 @@ class AToBInterval(Interval): def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): super().__init__(start, end) + def __repr__(self): + return f"({self.start}, {self.end})" + @property def rect(self) -> fitz.Rect: """Get the rectangle surrounding the interval.""" From 83e63a4b95ae3006a08ff6a315b23202c4fa3883 Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:45:56 +0100 Subject: [PATCH 03/16] AAboveBSidebar extraction returning SidebarNoise, Validator expecting SidebarNoise and recalculating noise_count if necessary. --- src/stratigraphy/sidebar/__init__.py | 1 + .../sidebar/a_above_b_sidebar_extractor.py | 26 ++++-- .../sidebar/a_above_b_sidebar_validator.py | 88 +++++++++++-------- 3 files changed, 73 insertions(+), 42 deletions(-) diff --git a/src/stratigraphy/sidebar/__init__.py b/src/stratigraphy/sidebar/__init__.py index f4a9cf2..8c0281e 100644 --- a/src/stratigraphy/sidebar/__init__.py +++ b/src/stratigraphy/sidebar/__init__.py @@ -11,6 +11,7 @@ __all__ = [ "Sidebar", + "SidebarNoise", "AAboveBSidebar", "AAboveBSidebarExtractor", "AAboveBSidebarValidator", diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py index 96104b6..0a50cc8 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -5,12 +5,14 @@ from collections import defaultdict import fitz +import rtree from stratigraphy.depth import DepthColumnEntryExtractor from stratigraphy.lines.line import TextWord from .a_above_b_sidebar import AAboveBSidebar from .a_above_b_sidebar_validator import AAboveBSidebarValidator +from .sidebar import SidebarNoise, noise_count logger = logging.getLogger(__name__) @@ -20,17 +22,21 @@ class AAboveBSidebarExtractor: @staticmethod def find_in_words( - all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict - ) -> list[AAboveBSidebar]: + all_words: list[TextWord], + word_rtree: rtree.index.Index, + used_entry_rects: list[fitz.Rect], + sidebar_params: dict, + ) -> list[SidebarNoise]: """Construct all possible AAboveBSidebar objects from the given words. Args: all_words (list[TextWord]): All words in the page. + word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries. used_entry_rects (list[fitz.Rect]): Part of the document to ignore. sidebar_params (dict): Parameters for the AAboveBSidebar objects. Returns: - list[AAboveBSidebar]: Found AAboveBSidebar objects. + list[SidebarNoise]: Validated AAboveBSidebar objects wrapped with noise count. """ entries = [ entry @@ -60,7 +66,6 @@ def find_in_words( cluster_dict[x0].entries.append(entry) numeric_columns = [cluster for cluster in cluster_dict.values() if len(cluster.entries) > 3] - sidebar_validator = AAboveBSidebarValidator(all_words, **sidebar_params) filtered_columns = [ column @@ -69,9 +74,16 @@ def find_in_words( if not column.significant_arithmetic_progression() ] - validated_columns = [sidebar_validator.reduce_until_valid(column) for column in filtered_columns] + sidebar_validator = AAboveBSidebarValidator(**sidebar_params) + + def process_column(column): + noise = noise_count(column, all_words, word_rtree) + sidebar_noise = SidebarNoise(sidebar=column, noise_count=noise) + return sidebar_validator.reduce_until_valid(sidebar_noise, all_words, word_rtree) + + validated_sidebars = list(filter(None, map(process_column, filtered_columns))) return sorted( - [column for column in validated_columns if column], - key=lambda column: len(column.entries), + [sidebar_noise for sidebar_noise in validated_sidebars if sidebar_noise.sidebar], + key=lambda sidebar_noise: len(sidebar_noise.sidebar.entries), ) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index 7ca4ca3..3551ba4 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -2,10 +2,13 @@ import dataclasses +import rtree + from stratigraphy.depth import DepthColumnEntry from stratigraphy.lines.line import TextWord from .a_above_b_sidebar import AAboveBSidebar +from .sidebar import SidebarNoise, noise_count @dataclasses.dataclass @@ -13,78 +16,86 @@ class AAboveBSidebarValidator: """Validation logic for instances of the AAboveBSidebar class. Args: - all_words (list[TextLine]): A list of all text lines on the page. noise_count_threshold (float): Noise count threshold deciding how much noise is allowed in a sidebar to be valid. noise_count_offset (int): Offset for the noise count threshold. Affects the noise count criterion. Effective specifically for sidebars with very few entries. """ - all_words: list[TextWord] noise_count_threshold: float noise_count_offset: int - def is_valid(self, sidebar: AAboveBSidebar, corr_coef_threshold: float = 0.99) -> bool: + def is_valid(self, sidebar_noise: SidebarNoise[AAboveBSidebar], corr_coef_threshold: float = 0.99) -> bool: """Checks whether the sidebar is valid. The sidebar is considered valid if: - - The number of entries is at least 3. - - The number of words that intersect with the depth column entries is less than the noise count threshold + - Its noise_count is less than the noise count threshold time the number of entries minus the noise count offset. - The entries are strictly increasing. - The entries are linearly correlated with their vertical position. - Note: The noise count criteria may require a rehaul. Some depth columns are not recognized as valid - even though they are. - Args: - sidebar (AAboveBSidebar): The AAboveBSidebar to validate. + sidebar_noise (SidebarNoise): The SidebarNoise wrapping the sidebar to validate. corr_coef_threshold (float): The minimal correlation coefficient for the column to be deemed valid. Returns: - bool: True if the depth column is valid, False otherwise. + bool: True if the sidebar is valid, False otherwise. """ - if len(sidebar.entries) < 3: - return False - # When too much other text is in the column, then it is probably not valid. # The quadratic behavior of the noise count check makes the check stricter for columns with few entries # than columns with more entries. The more entries we have, the less likely it is that we found them by chance. # TODO: Once evaluation data is of good enough qualities, we should optimize for the parameter below. - if ( - sidebar.noise_count(self.all_words) - > self.noise_count_threshold * (len(sidebar.entries) - self.noise_count_offset) ** 2 - ): + + sidebar = sidebar_noise.sidebar + noise = sidebar_noise.noise_count + + if noise > self.noise_count_threshold * (len(sidebar.entries) - self.noise_count_offset) ** 2: return False # Check if the entries are strictly increasing. if not sidebar.is_strictly_increasing(): return False corr_coef = sidebar.pearson_correlation_coef() - return corr_coef and corr_coef > corr_coef_threshold - def reduce_until_valid(self, column: AAboveBSidebar) -> AAboveBSidebar: + def reduce_until_valid( + self, sidebar_noise: SidebarNoise[AAboveBSidebar], all_words: list[TextWord], word_rtree: rtree.index.Index + ) -> SidebarNoise | None: """Removes entries from the depth column until it fulfills the is_valid condition. is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are linearly correlated with their vertical position. Args: - column (AAboveBSidebar): The depth column to validate + sidebar_noise (SidebarNoise): The SidebarNoise wrapping the AAboveBSidebar to validate. + all_words (list[TextWord]): A list of all words contained on a page. + word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries. + Returns: - AAboveBSidebar: The current depth column with entries removed until it is valid. + sidebar_noise | None : The current SidebarNoise with entries removed from Sidebar until it is valid + and the recalculated noise_count or None. """ - while column: - if self.is_valid(column): - return column - elif self.correct_OCR_mistakes(column) is not None: - return self.correct_OCR_mistakes(column) - else: - column = column.remove_entry_by_correlation_gradient() + while sidebar_noise.sidebar.entries: + if self.is_valid(sidebar_noise): + return sidebar_noise + + corrected_sidebar_noise = self.correct_OCR_mistakes(sidebar_noise, all_words, word_rtree) + if corrected_sidebar_noise: + return corrected_sidebar_noise - def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None: - """Corrects OCR mistakes in the depth column entries. + new_sidebar = sidebar_noise.sidebar.remove_entry_by_correlation_gradient() + if not new_sidebar: + return None + + new_noise_count = noise_count(new_sidebar, all_words, word_rtree) + sidebar_noise = SidebarNoise(sidebar=new_sidebar, noise_count=new_noise_count) + + return None + + def correct_OCR_mistakes( + self, sidebar_noise: SidebarNoise, all_words: list[TextWord], word_rtree: rtree.index.Index + ) -> SidebarNoise | None: + """Corrects OCR mistakes in the Sidebar entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the highest pearson correlation coefficient is selected and checked for validity. @@ -101,12 +112,16 @@ def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None Note: Common mistakes should be extended as needed. Args: - sidebar (AAboveBSidebar): The AAboveBSidebar to validate + sidebar_noise (SidebarNoise): The SidebarNoise wrapping the sidebar to validate. + all_words (list[TextWord]): All words on the page for recalculating noise count. + word_rtree (index.Index): R-tree for efficient spatial queries. Returns: - AAboveBSidebar | None: The corrected sidebar, or None if no correction was possible. + SidebarNoise | None: The corrected SidebarNoise, or None if no correction was possible. """ + sidebar = sidebar_noise.sidebar new_columns = [AAboveBSidebar(entries=[])] + for entry in sidebar.entries: new_columns = [ AAboveBSidebar([*column.entries, DepthColumnEntry(entry.rect, new_value)]) @@ -119,10 +134,13 @@ def correct_OCR_mistakes(self, sidebar: AAboveBSidebar) -> AAboveBSidebar | None if new_columns: best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef()) + new_noise_count = noise_count(best_column, all_words, word_rtree) - # We require a higher correlation coefficient when we've already corrected a mistake. - if self.is_valid(best_column, corr_coef_threshold=0.999): - return best_column + # We require a higher correlation coefficient when corrections are made + if self.is_valid( + SidebarNoise(sidebar=best_column, noise_count=new_noise_count), corr_coef_threshold=0.999 + ): + return SidebarNoise(sidebar=best_column, noise_count=new_noise_count) return None From 9154205fc9f7344cdcbb37d7b96aff1b844c6e9f Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:47:41 +0100 Subject: [PATCH 04/16] including noise count in MaterialDescriptionRectWithSidebar, score_match as property --- .../material_description_rect_with_sidebar.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py index 8d4aa39..9dc7555 100644 --- a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py +++ b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py @@ -4,7 +4,6 @@ from dataclasses import dataclass import fitz -from stratigraphy.lines.line import TextWord from stratigraphy.sidebar import Sidebar @@ -14,13 +13,12 @@ class MaterialDescriptionRectWithSidebar: sidebar: Sidebar | None material_description_rect: fitz.Rect + noise_count: int = 0 - def score_match(self, all_words: list[TextWord] | None = None) -> float: + @property + def score_match(self) -> float: """Scores the match between a sidebar and a material description. - Args: - all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. - Returns: float: The score of the match. """ @@ -36,6 +34,4 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float: height = bottom - top - noise_count = self.sidebar.noise_count(all_words) if all_words else 0 - - return (height - distance) * math.pow(0.8, noise_count) + return (height - distance) * math.pow(0.8, self.noise_count) From 440685c40a59415c0961e94235f7854a4cf9330c Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:48:47 +0100 Subject: [PATCH 05/16] create word rtree for all words on a page --- src/stratigraphy/extract.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 0b6a904..be775e3 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -4,6 +4,7 @@ from dataclasses import dataclass import fitz +import rtree from stratigraphy.data_extractor.data_extractor import FeatureOnPage from stratigraphy.depth import AAboveBInterval, Interval @@ -19,6 +20,7 @@ LayerIdentifierSidebarExtractor, Sidebar, ) +from stratigraphy.sidebar.sidebar import SidebarNoise, noise_count from stratigraphy.text.find_description import ( get_description_blocks, get_description_lines, @@ -75,37 +77,48 @@ def process_page( ) if material_descriptions_sidebar_pairs: - material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match()) + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match) # If there is a layer identifier sidebar, then we use this directly. # Else, we search for sidebars with depths. # We could also think of some scoring mechanism to decide which one to use. if not material_descriptions_sidebar_pairs: words = sorted([word for line in lines for word in line.words], key=lambda word: word.rect.y0) - a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words) + word_rtree = rtree.index.Index() + for i, word in enumerate(words): + word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words) used_entry_rects = [] for column in a_to_b_sidebars: for entry in column.entries: used_entry_rects.extend([entry.start.rect, entry.end.rect]) - sidebars: list[Sidebar] = a_to_b_sidebars - sidebars.extend( + # create sidebars with noise count + sidebars_noise: list[SidebarNoise] = [ + SidebarNoise(sidebar=sidebar, noise_count=noise_count(sidebar, words, word_rtree)) + for sidebar in a_to_b_sidebars + ] + sidebars_noise.extend( AAboveBSidebarExtractor.find_in_words( - words, used_entry_rects, sidebar_params=params["depth_column_params"] + words, word_rtree, used_entry_rects, sidebar_params=params["depth_column_params"] ) ) - for sidebar in sidebars: + for sidebar_noise in sidebars_noise: material_description_rect = find_material_description_column( - lines, sidebar, language, **params["material_description"] + lines, sidebar_noise.sidebar, language, **params["material_description"] ) if material_description_rect: material_descriptions_sidebar_pairs.append( - MaterialDescriptionRectWithSidebar(sidebar, material_description_rect) + MaterialDescriptionRectWithSidebar( + sidebar=sidebar_noise.sidebar, + material_description_rect=material_description_rect, + noise_count=sidebar_noise.noise_count, + ) ) # lowest score first - material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match(words)) + material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match) to_delete = [] for i, pair in enumerate(material_descriptions_sidebar_pairs): @@ -440,7 +453,7 @@ def is_below(best_x0, best_y1, line): if sidebar: return max( candidate_rects, - key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match(), + key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match, ) else: return candidate_rects[0] From d24c3d8a238b62d44bc426cde5767c999d223faf Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:49:19 +0100 Subject: [PATCH 06/16] fix AAboveBSidebar extraction tests --- tests/test_find_sidebar.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_find_sidebar.py b/tests/test_find_sidebar.py index 1b76469..0d1a6c0 100644 --- a/tests/test_find_sidebar.py +++ b/tests/test_find_sidebar.py @@ -2,6 +2,7 @@ import fitz import pytest +import rtree from stratigraphy.depth import DepthColumnEntryExtractor from stratigraphy.lines.line import TextWord from stratigraphy.sidebar import AAboveBSidebarExtractor, AToBSidebarExtractor @@ -63,12 +64,17 @@ def test_aabovebsidebarextractor_arithmetic_progression(): # noqa: D103 TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), ] + word_rtree = rtree.index.Index() + for i, word in enumerate(all_words): + word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) """Test the AAboveBSidebarExtractor with an arithmetic progression.""" - columns = AAboveBSidebarExtractor.find_in_words( + sidebars_noise = AAboveBSidebarExtractor.find_in_words( all_words, + word_rtree, used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) + columns = [sidebar_noise.sidebar for sidebar_noise in sidebars_noise] assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" @@ -81,11 +87,16 @@ def test_aabovebsidebarextractor(): # noqa: D103 TextWord(fitz.Rect(0, 6, 5, 7), "40.0", PAGE_NUMBER), TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), ] - columns = AAboveBSidebarExtractor.find_in_words( + word_rtree = rtree.index.Index() + for i, word in enumerate(all_words): + word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + sidebars_noise = AAboveBSidebarExtractor.find_in_words( all_words, + word_rtree, used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) + columns = [sidebar_noise.sidebar for sidebar_noise in sidebars_noise] assert len(columns) == 1, "There should be 1 column" assert len(columns[0].entries) == 5, "The column should have 5 entries" assert pytest.approx(columns[0].entries[0].value) == 12.0, "The first entry should have a value of 12.0" @@ -110,12 +121,17 @@ def test_aabovebsidebarextractor_two_column(): # noqa: D103 TextWord(fitz.Rect(20, 8, 25, 9), "50.0", PAGE_NUMBER), TextWord(fitz.Rect(20, 10, 25, 11), "61.0", PAGE_NUMBER), ] - - columns = AAboveBSidebarExtractor.find_in_words( + word_rtree = rtree.index.Index() + for i, word in enumerate(all_words): + word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + sidebars_noise = AAboveBSidebarExtractor.find_in_words( all_words, + word_rtree, used_entry_rects=[], sidebar_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) + columns = [sidebar_noise.sidebar for sidebar_noise in sidebars_noise] + assert len(columns) == 2, "There should be 2 columns" assert len(columns[0].entries) == 5, "The first column should have 5 entries" assert len(columns[1].entries) == 6, "The second column should have 6 entries" From c88345ae6e605ea93463ee8635b9c1d7264dc76e Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:50:47 +0100 Subject: [PATCH 07/16] improve computation time --- src/stratigraphy/sidebar/a_above_b_sidebar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index d808275..6822280 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -50,7 +50,11 @@ def strictly_contains(self, other: AAboveBSidebar) -> bool: ) def is_strictly_increasing(self) -> bool: - return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False)) + length = len(self.entries) + for i in range(length - 1): + if self.entries[i].value >= self.entries[i + 1].value: + return False + return True def depth_intervals(self) -> list[AAboveBInterval]: """Creates a list of depth intervals from the depth column entries. From 22adc25b6815529fb69266fabec0ea501a45b842 Mon Sep 17 00:00:00 2001 From: lillemor Date: Tue, 21 Jan 2025 11:51:11 +0100 Subject: [PATCH 08/16] correct description --- src/stratigraphy/lines/geometric_line_utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stratigraphy/lines/geometric_line_utilities.py b/src/stratigraphy/lines/geometric_line_utilities.py index b52aae4..3a5bfe9 100644 --- a/src/stratigraphy/lines/geometric_line_utilities.py +++ b/src/stratigraphy/lines/geometric_line_utilities.py @@ -195,7 +195,7 @@ def _are_parallel(line1: Line, line2: Line, angle_threshold: float) -> bool: Args: line1 (Line): The first line. line2 (Line): The second line. - angle_threshold (float, optional): The acceptable difference between the slopes of the lines. + angle_threshold (float, optional): The acceptable difference between the angles of the lines in degrees. Returns: bool: True if the lines are parallel, False otherwise. From 6c4202e5540e84e2fe4d18afef3855f1e97846b0 Mon Sep 17 00:00:00 2001 From: lillemor Date: Thu, 23 Jan 2025 16:25:04 +0100 Subject: [PATCH 09/16] remove logging --- src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py index 2c5127b..3e60b29 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -1,7 +1,5 @@ """Module for finding AAboveBSidebar instances in a borehole profile.""" -import logging - import fitz import rtree @@ -14,8 +12,6 @@ from .sidebar import SidebarNoise, noise_count from .sidebarentry import DepthColumnEntry -logger = logging.getLogger(__name__) - class AAboveBSidebarExtractor: """Class that finds AAboveBSidebar instances in a borehole profile.""" From 0456e4485c72527bd3ef499ba0c53280589f5e9a Mon Sep 17 00:00:00 2001 From: lillemor Date: Thu, 23 Jan 2025 16:26:22 +0100 Subject: [PATCH 10/16] improve runtime pearson_correlation_coef --- src/stratigraphy/sidebar/a_above_b_sidebar.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 992ba12..963793d 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -99,10 +99,13 @@ def pearson_correlation_coef(self) -> float: entries = np.array([entry.value for entry in self.entries]) # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. - if np.std(entries) == 0 or np.std(positions) == 0: + std_positions = np.std(positions) + std_entries = np.std(entries) + if std_positions == 0 or std_entries == 0: return 0 - return np.corrcoef(positions, entries)[0, 1].item() + covariance = np.mean((positions - np.mean(positions)) * (entries - np.mean(entries))) + return covariance / (std_positions * std_entries) def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None: if len(self.entries) < 3: From 3cfc944503b9d7da2db4013ef995ba43bd3cad98 Mon Sep 17 00:00:00 2001 From: lillemor Date: Thu, 23 Jan 2025 16:27:10 +0100 Subject: [PATCH 11/16] include length condition for valid sidebar again --- src/stratigraphy/sidebar/a_above_b_sidebar_validator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index eff524f..6994132 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -29,6 +29,7 @@ def is_valid(self, sidebar_noise: SidebarNoise[AAboveBSidebar], corr_coef_thresh """Checks whether the sidebar is valid. The sidebar is considered valid if: + - The number of entries is at least 3. - Its noise_count is less than the noise count threshold time the number of entries minus the noise count offset. - The entries are strictly increasing. @@ -48,6 +49,8 @@ def is_valid(self, sidebar_noise: SidebarNoise[AAboveBSidebar], corr_coef_thresh sidebar = sidebar_noise.sidebar noise = sidebar_noise.noise_count + if len(sidebar.entries) < 3: + return False if noise > self.noise_count_threshold * (len(sidebar.entries) - self.noise_count_offset) ** 2: return False From 870bcfe4c2f2b9428f31b63099d8e336629185a2 Mon Sep 17 00:00:00 2001 From: lillemor Date: Fri, 24 Jan 2025 09:12:08 +0100 Subject: [PATCH 12/16] fix requirements --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f5c60bc..57ba813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,8 @@ dependencies = [ "PyMuPDF>=1.23.26", "opencv-python-headless", "quads>=1.1.0", - "numpy<2" + "numpy<2", + "rtree" ] [project.optional-dependencies] From 048bb6a112b846017e3692a6402cc9513a0654fe Mon Sep 17 00:00:00 2001 From: Lillemor Haibach <161145400+lhaibach@users.noreply.github.com> Date: Mon, 27 Jan 2025 12:05:11 +0100 Subject: [PATCH 13/16] Update src/stratigraphy/sidebar/a_above_b_sidebar.py correct is_strictly_increasing Co-authored-by: Stijn Vermeeren <144008419+stijnvermeeren-swisstopo@users.noreply.github.com> --- src/stratigraphy/sidebar/a_above_b_sidebar.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 963793d..669f70e 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -45,11 +45,10 @@ def strictly_contains(self, other: AAboveBSidebar) -> bool: ) def is_strictly_increasing(self) -> bool: - length = len(self.entries) - for i in range(length - 1): - if self.entries[i].value >= self.entries[i + 1].value: - return False - return True + return all( + self.entries[i].value < self.entries[i + 1].value + for i in range(len(self.entries) - 1) + ) def depth_intervals(self) -> list[AAboveBInterval]: """Creates a list of depth intervals from the depth column entries. From 55ed1c1d5c7db18dcb831c93332467454627ef40 Mon Sep 17 00:00:00 2001 From: lillemor Date: Mon, 27 Jan 2025 12:52:34 +0100 Subject: [PATCH 14/16] update comments --- src/stratigraphy/sidebar/a_above_b_sidebar.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 669f70e..1fa7d99 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -45,10 +45,7 @@ def strictly_contains(self, other: AAboveBSidebar) -> bool: ) def is_strictly_increasing(self) -> bool: - return all( - self.entries[i].value < self.entries[i + 1].value - for i in range(len(self.entries) - 1) - ) + return all(self.entries[i].value < self.entries[i + 1].value for i in range(len(self.entries) - 1)) def depth_intervals(self) -> list[AAboveBInterval]: """Creates a list of depth intervals from the depth column entries. @@ -97,12 +94,13 @@ def pearson_correlation_coef(self) -> float: positions = np.array([entry.rect.y1 for entry in self.entries]) entries = np.array([entry.value for entry in self.entries]) - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. std_positions = np.std(positions) std_entries = np.std(entries) if std_positions == 0 or std_entries == 0: return 0 + # We calculate the Pearson correlation coefficient manually + # to avoid redundant standard deviation calculations that would occur with np.corrcoef. covariance = np.mean((positions - np.mean(positions)) * (entries - np.mean(entries))) return covariance / (std_positions * std_entries) From f7a0d33d390696ddbdb28cddb0ef7732aa04444d Mon Sep 17 00:00:00 2001 From: lillemor Date: Mon, 27 Jan 2025 12:55:12 +0100 Subject: [PATCH 15/16] rtree contains TextWord obj not only ids --- src/stratigraphy/extract.py | 7 +++---- .../sidebar/a_above_b_sidebar_extractor.py | 4 ++-- .../sidebar/a_above_b_sidebar_validator.py | 20 +++++++------------ src/stratigraphy/sidebar/sidebar.py | 15 ++++++-------- tests/test_find_sidebar.py | 8 ++++---- 5 files changed, 22 insertions(+), 32 deletions(-) diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index be775e3..28413c1 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -85,8 +85,8 @@ def process_page( if not material_descriptions_sidebar_pairs: words = sorted([word for line in lines for word in line.words], key=lambda word: word.rect.y0) word_rtree = rtree.index.Index() - for i, word in enumerate(words): - word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + for word in words: + word_rtree.insert(id(word), (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1), obj=word) a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words) used_entry_rects = [] @@ -96,8 +96,7 @@ def process_page( # create sidebars with noise count sidebars_noise: list[SidebarNoise] = [ - SidebarNoise(sidebar=sidebar, noise_count=noise_count(sidebar, words, word_rtree)) - for sidebar in a_to_b_sidebars + SidebarNoise(sidebar=sidebar, noise_count=noise_count(sidebar, word_rtree)) for sidebar in a_to_b_sidebars ] sidebars_noise.extend( AAboveBSidebarExtractor.find_in_words( diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py index 3e60b29..d35988a 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -54,9 +54,9 @@ def find_in_words( sidebar_validator = AAboveBSidebarValidator(**sidebar_params) def process_column(column): - noise = noise_count(column, all_words, word_rtree) + noise = noise_count(column, word_rtree) sidebar_noise = SidebarNoise(sidebar=column, noise_count=noise) - return sidebar_validator.reduce_until_valid(sidebar_noise, all_words, word_rtree) + return sidebar_validator.reduce_until_valid(sidebar_noise, word_rtree) validated_sidebars = list(filter(None, map(process_column, filtered_columns))) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index 6994132..fa15f9e 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -4,8 +4,6 @@ import rtree -from stratigraphy.lines.line import TextWord - from .a_above_b_sidebar import AAboveBSidebar from .sidebar import SidebarNoise, noise_count from .sidebarentry import DepthColumnEntry @@ -65,7 +63,7 @@ def is_valid(self, sidebar_noise: SidebarNoise[AAboveBSidebar], corr_coef_thresh return corr_coef and corr_coef > corr_coef_threshold def reduce_until_valid( - self, sidebar_noise: SidebarNoise[AAboveBSidebar], all_words: list[TextWord], word_rtree: rtree.index.Index + self, sidebar_noise: SidebarNoise[AAboveBSidebar], word_rtree: rtree.index.Index ) -> SidebarNoise | None: """Removes entries from the depth column until it fulfills the is_valid condition. @@ -74,8 +72,7 @@ def reduce_until_valid( Args: sidebar_noise (SidebarNoise): The SidebarNoise wrapping the AAboveBSidebar to validate. - all_words (list[TextWord]): A list of all words contained on a page. - word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries. + word_rtree (rtree.index.Index): Pre-built R-tree of all words on page for spatial queries. Returns: sidebar_noise | None : The current SidebarNoise with entries removed from Sidebar until it is valid @@ -85,7 +82,7 @@ def reduce_until_valid( if self.is_valid(sidebar_noise): return sidebar_noise - corrected_sidebar_noise = self.correct_OCR_mistakes(sidebar_noise, all_words, word_rtree) + corrected_sidebar_noise = self.correct_OCR_mistakes(sidebar_noise, word_rtree) if corrected_sidebar_noise: return corrected_sidebar_noise @@ -93,14 +90,12 @@ def reduce_until_valid( if not new_sidebar: return None - new_noise_count = noise_count(new_sidebar, all_words, word_rtree) + new_noise_count = noise_count(new_sidebar, word_rtree) sidebar_noise = SidebarNoise(sidebar=new_sidebar, noise_count=new_noise_count) return None - def correct_OCR_mistakes( - self, sidebar_noise: SidebarNoise, all_words: list[TextWord], word_rtree: rtree.index.Index - ) -> SidebarNoise | None: + def correct_OCR_mistakes(self, sidebar_noise: SidebarNoise, word_rtree: rtree.index.Index) -> SidebarNoise | None: """Corrects OCR mistakes in the Sidebar entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the @@ -119,8 +114,7 @@ def correct_OCR_mistakes( Args: sidebar_noise (SidebarNoise): The SidebarNoise wrapping the sidebar to validate. - all_words (list[TextWord]): All words on the page for recalculating noise count. - word_rtree (index.Index): R-tree for efficient spatial queries. + word_rtree (index.Index): R-tree of all words on page for efficient spatial queries. Returns: SidebarNoise | None: The corrected SidebarNoise, or None if no correction was possible. @@ -140,7 +134,7 @@ def correct_OCR_mistakes( if new_columns: best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef()) - new_noise_count = noise_count(best_column, all_words, word_rtree) + new_noise_count = noise_count(best_column, word_rtree) # We require a higher correlation coefficient when corrections are made if self.is_valid( diff --git a/src/stratigraphy/sidebar/sidebar.py b/src/stratigraphy/sidebar/sidebar.py index 69f956f..2f5661f 100644 --- a/src/stratigraphy/sidebar/sidebar.py +++ b/src/stratigraphy/sidebar/sidebar.py @@ -83,19 +83,18 @@ def __repr__(self): return f"SidebarNoise(sidebar={repr(self.sidebar)}, noise_count={self.noise_count})" -def noise_count(sidebar: Sidebar, all_words: list[TextWord], word_rtree: rtree.index.Index) -> int: +def noise_count(sidebar: Sidebar, word_rtree: rtree.index.Index) -> int: """Counts the number of words that intersect with the Sidebar entries. Args: sidebar (Sidebar): Sidebar object for which the noise count is calculated. - all_words (list[TextWord]): A list of words contained on a page. - word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries. + word_rtree (rtree.index.Index): Pre-built R-tree of all words on page for spatial queries. Returns: int: The number of words that intersect with the Sidebar entries but are not part of it. """ sidebar_rect = sidebar.rect() - intersecting_words = _get_intersecting_words(all_words, word_rtree, sidebar_rect) + intersecting_words = _get_intersecting_words(word_rtree, sidebar_rect) def significant_intersection(word: TextWord) -> bool: word_rect = word.rect @@ -106,9 +105,7 @@ def significant_intersection(word: TextWord) -> bool: return sum(1 for word in intersecting_words if significant_intersection(word)) - len(sidebar.entries) -def _get_intersecting_words( - all_words: list[TextWord], word_rtree: rtree.index.Index, rect: fitz.Rect -) -> list[TextWord]: +def _get_intersecting_words(word_rtree: rtree.index.Index, rect: fitz.Rect) -> list[TextWord]: """Retrieve all words from the page intersecting with Sidebar bounding box.""" - intersecting_ids = list(word_rtree.intersection((rect.x0, rect.y0, rect.x1, rect.y1))) - return [all_words[i] for i in intersecting_ids if any(char.isalnum() for char in all_words[i].text)] + intersecting_words = list(word_rtree.intersection((rect.x0, rect.y0, rect.x1, rect.y1), objects="raw")) + return [word for word in intersecting_words if any(char.isalnum() for char in word.text)] diff --git a/tests/test_find_sidebar.py b/tests/test_find_sidebar.py index 5817817..63d5365 100644 --- a/tests/test_find_sidebar.py +++ b/tests/test_find_sidebar.py @@ -88,8 +88,8 @@ def test_aabovebsidebarextractor(): # noqa: D103 TextWord(fitz.Rect(0, 8, 5, 9), "50.0", PAGE_NUMBER), ] word_rtree = rtree.index.Index() - for i, word in enumerate(all_words): - word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + for word in all_words: + word_rtree.insert(id(word), (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1), obj=word) sidebars_noise = AAboveBSidebarExtractor.find_in_words( all_words, word_rtree, @@ -122,8 +122,8 @@ def test_aabovebsidebarextractor_two_column(): # noqa: D103 TextWord(fitz.Rect(20, 10, 25, 11), "61.0", PAGE_NUMBER), ] word_rtree = rtree.index.Index() - for i, word in enumerate(all_words): - word_rtree.insert(i, (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1)) + for word in all_words: + word_rtree.insert(id(word), (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1), obj=word) sidebars_noise = AAboveBSidebarExtractor.find_in_words( all_words, word_rtree, From c6ab8b617c02a0659ad4848b6c82fcca97d938c3 Mon Sep 17 00:00:00 2001 From: lillemor Date: Mon, 27 Jan 2025 12:56:07 +0100 Subject: [PATCH 16/16] unit test for AAboveBSidebar method is_strictly_increasing --- tests/test_aabovebsidebar.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/test_aabovebsidebar.py b/tests/test_aabovebsidebar.py index af3123d..f90f9a2 100644 --- a/tests/test_aabovebsidebar.py +++ b/tests/test_aabovebsidebar.py @@ -59,3 +59,52 @@ def test(in_values, out_values): # ensure a "noise" value "0.0" does not influence the result test([1.0, 2.0, 3.0, 0.0, 4.0], [1.0, 2.0, 3.0, 0.0, 4.0]) + + +def test_aabovebsidebar_isstrictlyincreasing(): # noqa: D103 + """Test the is_strictly_increasing method of the AAboveBSidebar class.""" + # Case 1: Strictly increasing values + sidebar = AAboveBSidebar( + [ + DepthColumnEntry(fitz.Rect(), value=1), + DepthColumnEntry(fitz.Rect(), value=2), + DepthColumnEntry(fitz.Rect(), value=3), + DepthColumnEntry(fitz.Rect(), value=4), + DepthColumnEntry(fitz.Rect(), value=5), + ] + ) + assert sidebar.is_strictly_increasing(), "The sidebar should be strictly increasing" + + # Case 2: Not strictly increasing (equal values) + sidebar = AAboveBSidebar( + [ + DepthColumnEntry(fitz.Rect(), value=1), + DepthColumnEntry(fitz.Rect(), value=2), + DepthColumnEntry(fitz.Rect(), value=2), + DepthColumnEntry(fitz.Rect(), value=4), + ] + ) + assert not sidebar.is_strictly_increasing(), "The sidebar should not be strictly increasing" + + # Case 3: Not strictly increasing (decreasing) + sidebar = AAboveBSidebar( + [ + DepthColumnEntry(fitz.Rect(), value=5), + DepthColumnEntry(fitz.Rect(), value=4), + DepthColumnEntry(fitz.Rect(), value=3), + DepthColumnEntry(fitz.Rect(), value=2), + ] + ) + assert not sidebar.is_strictly_increasing(), "The sidebar should not be strictly increasing" + + # Case 4: Single entry (trivial) + sidebar = AAboveBSidebar( + [ + DepthColumnEntry(fitz.Rect(), value=1), + ] + ) + assert sidebar.is_strictly_increasing(), "A single entry should be considered strictly increasing" + + # Case 5: Empty + sidebar = AAboveBSidebar([]) + assert sidebar.is_strictly_increasing(), "An empty list should be considered strictly increasing"