diff --git a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py index 30fc2cf..97b3dee 100644 --- a/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py +++ b/src/stratigraphy/depths_materials_column_pairs/material_description_rect_with_sidebar.py @@ -22,7 +22,7 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float: all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None. Returns: - float: The score of the match. + float: The score of the match. Better matches have a higher score value. """ rect = self.sidebar.rect() top = rect.y0 @@ -37,4 +37,4 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float: height = bottom - top noise_count = self.sidebar.noise_count(all_words) if all_words else 0 - return (height - distance) * math.pow(0.8, noise_count) + return (height - distance) * math.pow(0.8, 10 * noise_count / len(self.sidebar.entries)) diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar.py b/src/stratigraphy/sidebar/a_above_b_sidebar.py index 8a1bb4f..3f1504b 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar.py @@ -2,6 +2,7 @@ from __future__ import annotations +import statistics from dataclasses import dataclass import fitz @@ -64,31 +65,28 @@ def depth_intervals(self) -> list[AAboveBInterval]: # (and includes additional lines below the actual material descriptions). 
return depth_intervals - def significant_arithmetic_progression(self) -> bool: - # to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an - # arithmetic progression - segment_length = 6 - if len(self.entries) < segment_length: - return self.is_arithmetic_progression() - else: - for i in range(len(self.entries) - segment_length + 1): - if AAboveBSidebar(self.entries[i : i + segment_length]).is_arithmetic_progression(): - return True + def close_to_arithmetic_progression(self) -> bool: + """Check if the depth values of the entries of this sidebar are very close to an arithmetic progression.""" + if len(self.entries) < 2: return False - def is_arithmetic_progression(self) -> bool: - if len(self.entries) <= 2: - return True + values = [entry.value for entry in self.entries] - progression = np.array(range(len(self.entries))) - entries = np.array([entry.value for entry in self.entries]) - - # Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0. 
- if np.std(entries) == 0: + differences = [values[i + 1] - values[i] for i in range(len(values) - 1)] + step = round(statistics.median(differences), 2) + if step <= 0: return False - scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item() - return abs(scale_pearson_correlation_coef) >= 0.9999 + first = values[0] + last = values[-1] + arithmethic_progression = { + # ensure we have nicely rounded numbers, without inaccuracies from floating point arithmetic + round(value * step, 2) + for value in range(int(first / step), int(last / step) + 1) + } + score = [value in arithmethic_progression for value in values].count(True) + # more than 80% of the values must lie in the closest arithmetic progression (allowing for <20% OCR errors) + return score > 0.8 * len(values) def pearson_correlation_coef(self) -> float: # We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py index 91b67ff..f03d061 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py @@ -42,7 +42,7 @@ def find_in_words( column for numeric_column in numeric_columns for column in numeric_column.make_ascending().break_on_double_descending() - if not column.significant_arithmetic_progression() + if not column.close_to_arithmetic_progression() ] validated_sidebars = [sidebar_validator.reduce_until_valid(column) for column in filtered_columns] diff --git a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py index f4852f2..874c946 100644 --- a/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py +++ b/src/stratigraphy/sidebar/a_above_b_sidebar_validator.py @@ -59,6 +59,8 @@ def is_valid(self, sidebar: AAboveBSidebar, corr_coef_threshold: float = 0.99) - # Check if the entries are 
strictly increasing. if not sidebar.is_strictly_increasing(): return False + if sidebar.close_to_arithmetic_progression(): + return False corr_coef = sidebar.pearson_correlation_coef() diff --git a/tests/test_aabovebsidebar.py b/tests/test_aabovebsidebar.py index d19af40..af3123d 100644 --- a/tests/test_aabovebsidebar.py +++ b/tests/test_aabovebsidebar.py @@ -5,8 +5,8 @@ from stratigraphy.sidebar.sidebarentry import DepthColumnEntry -def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103 - """Test the is_arithmetic_progression method of the AAboveBSidebar class.""" +def test_aabovebsidebar_closetoarithmeticprogression(): # noqa: D103 + """Test the close_to_arithmetic_progression method of the AAboveBSidebar class.""" sidebar = AAboveBSidebar( [ DepthColumnEntry(fitz.Rect(), value=1), @@ -16,7 +16,16 @@ def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=5), ] ) - assert sidebar.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" + assert sidebar.close_to_arithmetic_progression(), "The sidebar should be recognized as arithmetic progression" + + sidebar = AAboveBSidebar( + [ + DepthColumnEntry(fitz.Rect(), value=0.2), + DepthColumnEntry(fitz.Rect(), value=0.3), + DepthColumnEntry(fitz.Rect(), value=0.4), + ] + ) + assert sidebar.close_to_arithmetic_progression(), "The sidebar should be recognized as arithmetic progression" sidebar = AAboveBSidebar( [ @@ -28,7 +37,9 @@ def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103 DepthColumnEntry(fitz.Rect(), value=20.5), ] ) - assert not sidebar.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" + assert ( + not sidebar.close_to_arithmetic_progression() + ), "The sidebar should not be recognized as arithmetic progression" def test_aabovebsidebar_makeascending(): # noqa: D103