Skip to content

Commit

Permalink
Merge pull request #109 from swisstopo/LGVISIUM-96/more-robust-depth-…
Browse files Browse the repository at this point in the history
…values

LGVISIUM-96: more robust depth values
  • Loading branch information
stijnvermeeren-swisstopo authored Jan 22, 2025
2 parents 2c431a7 + 8c56efd commit e7f76aa
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float:
all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None.
Returns:
float: The score of the match.
float: The score of the match. Better matches have a higher score value.
"""
rect = self.sidebar.rect()
top = rect.y0
Expand All @@ -37,4 +37,4 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float:
height = bottom - top

noise_count = self.sidebar.noise_count(all_words) if all_words else 0
return (height - distance) * math.pow(0.8, noise_count)
return (height - distance) * math.pow(0.8, 10 * noise_count / len(self.sidebar.entries))
38 changes: 18 additions & 20 deletions src/stratigraphy/sidebar/a_above_b_sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import statistics
from dataclasses import dataclass

import fitz
Expand Down Expand Up @@ -64,31 +65,28 @@ def depth_intervals(self) -> list[AAboveBInterval]:
# (and includes additional lines below the actual material descriptions).
return depth_intervals

def significant_arithmetic_progression(self) -> bool:
# to allow for OCR errors or gaps in the progression, we only require a segment of length 6 that is an
# arithmetic progression
segment_length = 6
if len(self.entries) < segment_length:
return self.is_arithmetic_progression()
else:
for i in range(len(self.entries) - segment_length + 1):
if AAboveBSidebar(self.entries[i : i + segment_length]).is_arithmetic_progression():
return True
def close_to_arithmetic_progression(self) -> bool:
"""Check if the depth values of the entries of this sidebar are very close to an arithmetic progression."""
if len(self.entries) < 2:
return False

def is_arithmetic_progression(self) -> bool:
if len(self.entries) <= 2:
return True
values = [entry.value for entry in self.entries]

progression = np.array(range(len(self.entries)))
entries = np.array([entry.value for entry in self.entries])

# Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0.
if np.std(entries) == 0:
differences = [values[i + 1] - values[i] for i in range(len(values) - 1)]
step = round(statistics.median(differences), 2)
if step <= 0:
return False

scale_pearson_correlation_coef = np.corrcoef(entries, progression)[0, 1].item()
return abs(scale_pearson_correlation_coef) >= 0.9999
first = values[0]
last = values[-1]
arithmethic_progression = {
# ensure we have nicely rounded numbers, without inaccuracies from floating point arithmetic
round(value * step, 2)
for value in range(int(first / step), int(last / step) + 1)
}
score = [value in arithmethic_progression for value in values].count(True)
# more than 80% of the values must be contained in the closest arithmetic progression (allowing for fewer than 20% OCR errors)
return score > 0.8 * len(values)

def pearson_correlation_coef(self) -> float:
# We look at the lower y coordinate, because most often the baseline of the depth value text is aligned with
Expand Down
2 changes: 1 addition & 1 deletion src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def find_in_words(
column
for numeric_column in numeric_columns
for column in numeric_column.make_ascending().break_on_double_descending()
if not column.significant_arithmetic_progression()
if not column.close_to_arithmetic_progression()
]

validated_sidebars = [sidebar_validator.reduce_until_valid(column) for column in filtered_columns]
Expand Down
2 changes: 2 additions & 0 deletions src/stratigraphy/sidebar/a_above_b_sidebar_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def is_valid(self, sidebar: AAboveBSidebar, corr_coef_threshold: float = 0.99) -
# Check if the entries are strictly increasing.
if not sidebar.is_strictly_increasing():
return False
if sidebar.close_to_arithmetic_progression():
return False

corr_coef = sidebar.pearson_correlation_coef()

Expand Down
19 changes: 15 additions & 4 deletions tests/test_aabovebsidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from stratigraphy.sidebar.sidebarentry import DepthColumnEntry


def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103
"""Test the is_arithmetic_progression method of the AAboveBSidebar class."""
def test_aabovebsidebar_closetoarithmeticprogression(): # noqa: D103
"""Test the close_to_arithmetic_progression method of the AAboveBSidebar class."""
sidebar = AAboveBSidebar(
[
DepthColumnEntry(fitz.Rect(), value=1),
Expand All @@ -16,7 +16,16 @@ def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103
DepthColumnEntry(fitz.Rect(), value=5),
]
)
assert sidebar.is_arithmetic_progression(), "The column should be recognized as arithmetic progression"
assert sidebar.close_to_arithmetic_progression(), "The sidebar should be recognized as arithmetic progression"

sidebar = AAboveBSidebar(
[
DepthColumnEntry(fitz.Rect(), value=0.2),
DepthColumnEntry(fitz.Rect(), value=0.3),
DepthColumnEntry(fitz.Rect(), value=0.4),
]
)
assert sidebar.close_to_arithmetic_progression(), "The sidebar should be recognized as arithmetic progression"

sidebar = AAboveBSidebar(
[
Expand All @@ -28,7 +37,9 @@ def test_aabovebsidebar_isarithmeticprogression(): # noqa: D103
DepthColumnEntry(fitz.Rect(), value=20.5),
]
)
assert not sidebar.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression"
assert (
not sidebar.close_to_arithmetic_progression()
), "The sidebar should not be recognized as arithmetic progression"


def test_aabovebsidebar_makeascending(): # noqa: D103
Expand Down

1 comment on commit e7f76aa

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File Stmts Miss Cover Missing
src/stratigraphy
   __init__.py8188%11
   extract.py1671670%3–446
   get_files.py19190%3–47
   main.py1201200%3–326
src/stratigraphy/benchmark
   metrics.py594229%22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
   data_extractor.py76495%32, 45, 120, 164
   utility.py6350%28–36
src/stratigraphy/depth
   a_to_b_interval_extractor.py371559%41–60, 79, 92
   depthcolumnentry_extractor.py23291%45–46
   interval.py1015249%25–28, 33–36, 42, 48, 52, 91–137, 158, 164–180
src/stratigraphy/depths_materials_column_pairs
   bounding_boxes.py301067%23, 32, 50, 60, 72–78
   material_description_rect_with_sidebar.py18856%27–40
src/stratigraphy/evaluation
   evaluation_dataclasses.py491178%52, 71–74, 90, 104, 125–131, 147
   groundwater_evaluator.py48198%77
   layer_evaluator.py664630%29–30, 35–39, 47, 69–95, 105–113, 128–149
   metadata_evaluator.py371462%46–65, 86–93
   utility.py16756%43–52
src/stratigraphy/groundwater
   groundwater_extraction.py1469038%52, 94, 137–148, 180–184, 199–215, 226–314, 335–363
   utility.py423614%10–17, 30–50, 62–76, 91–105
src/stratigraphy/layer
   layer.py371365%26, 29, 37, 52–72
src/stratigraphy/lines
   geometric_line_utilities.py86298%81, 131
   line.py51492%25, 50, 60, 110
   linesquadtree.py46198%75
src/stratigraphy/metadata
   coordinate_extraction.py106496%29, 93–94, 106
   elevation_extraction.py906033%34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 234–238
   language_detection.py181328%17–23, 37–45
   metadata.py662464%27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
   a_above_b_sidebar.py963959%40, 43, 58–66, 71, 78, 99, 104–111, 141–142, 184–225
   a_above_b_sidebar_validator.py411954%48, 58, 61, 63, 83–86, 111–129, 141–145
   a_to_b_sidebar.py431467%36, 49–50, 67, 95–108
   layer_identifier_sidebar.py462937%46–65, 81–97, 109, 122
   layer_identifier_sidebar_extractor.py292031%31–41, 55–75
   sidebar.py38392%41, 46, 87
   sidebarentry.py20385%27, 31, 43
src/stratigraphy/text
   description_block_splitter.py70297%24, 139
   extract_text.py29390%19, 53–54
   find_description.py41880%26–34, 111–114
   textblock.py901188%22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
   dataclasses.py32391%37–39
   predictions.py723453%72, 95–115, 143–187
   util.py341265%69–76, 90–92, 116–117
TOTAL 2380 969 59%

Tests Skipped Failures Errors Time
101 0 💤 0 ❌ 0 🔥 7.561s ⏱️

Please sign in to comment.