Skip to content

Commit

Permalink
Merge pull request #111 from swisstopo/LGD-564/optimize_a_above_b_sidebar_extractor
Browse files Browse the repository at this point in the history

Lgd 564/optimize a above b sidebar extractor
  • Loading branch information
lhaibach authored Jan 27, 2025
2 parents a31eafa + abc93fd commit 763cd05
Show file tree
Hide file tree
Showing 12 changed files with 230 additions and 92 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ dependencies = [
"PyMuPDF>=1.23.26",
"opencv-python-headless",
"quads>=1.1.0",
"numpy<2"
"numpy<2",
"rtree"
]

[project.optional-dependencies]
Expand Down
3 changes: 3 additions & 0 deletions src/stratigraphy/depth/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ class AToBInterval(Interval):
def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry):
super().__init__(start, end)

def __repr__(self):
return f"({self.start}, {self.end})"

@property
def rect(self) -> fitz.Rect:
"""Get the rectangle surrounding the interval."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from dataclasses import dataclass

import fitz
from stratigraphy.lines.line import TextWord
from stratigraphy.sidebar import Sidebar


Expand All @@ -14,13 +13,12 @@ class MaterialDescriptionRectWithSidebar:

sidebar: Sidebar | None
material_description_rect: fitz.Rect
noise_count: int = 0

def score_match(self, all_words: list[TextWord] | None = None) -> float:
@property
def score_match(self) -> float:
"""Scores the match between a sidebar and a material description.
Args:
all_words (list[TextWord] | None, optional): List of the available text words. Defaults to None.
Returns:
float: The score of the match. Better matches have a higher score value.
"""
Expand All @@ -36,5 +34,4 @@ def score_match(self, all_words: list[TextWord] | None = None) -> float:

height = bottom - top

noise_count = self.sidebar.noise_count(all_words) if all_words else 0
return (height - distance) * math.pow(0.8, 10 * noise_count / len(self.sidebar.entries))
return (height - distance) * math.pow(0.8, 10 * self.noise_count / len(self.sidebar.entries))
32 changes: 22 additions & 10 deletions src/stratigraphy/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dataclasses import dataclass

import fitz
import rtree

from stratigraphy.data_extractor.data_extractor import FeatureOnPage
from stratigraphy.depth import AAboveBInterval, Interval
Expand All @@ -19,6 +20,7 @@
LayerIdentifierSidebarExtractor,
Sidebar,
)
from stratigraphy.sidebar.sidebar import SidebarNoise, noise_count
from stratigraphy.text.find_description import (
get_description_blocks,
get_description_lines,
Expand Down Expand Up @@ -75,37 +77,47 @@ def process_page(
)

if material_descriptions_sidebar_pairs:
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match())
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match)

# If there is a layer identifier sidebar, then we use this directly.
# Else, we search for sidebars with depths.
# We could also think of some scoring mechanism to decide which one to use.
if not material_descriptions_sidebar_pairs:
words = sorted([word for line in lines for word in line.words], key=lambda word: word.rect.y0)
a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words)
word_rtree = rtree.index.Index()
for word in words:
word_rtree.insert(id(word), (word.rect.x0, word.rect.y0, word.rect.x1, word.rect.y1), obj=word)

a_to_b_sidebars = AToBSidebarExtractor.find_in_words(words)
used_entry_rects = []
for column in a_to_b_sidebars:
for entry in column.entries:
used_entry_rects.extend([entry.start.rect, entry.end.rect])

sidebars: list[Sidebar] = a_to_b_sidebars
sidebars.extend(
# create sidebars with noise count
sidebars_noise: list[SidebarNoise] = [
SidebarNoise(sidebar=sidebar, noise_count=noise_count(sidebar, word_rtree)) for sidebar in a_to_b_sidebars
]
sidebars_noise.extend(
AAboveBSidebarExtractor.find_in_words(
words, used_entry_rects, sidebar_params=params["depth_column_params"]
words, word_rtree, used_entry_rects, sidebar_params=params["depth_column_params"]
)
)

for sidebar in sidebars:
for sidebar_noise in sidebars_noise:
material_description_rect = find_material_description_column(
lines, sidebar, language, **params["material_description"]
lines, sidebar_noise.sidebar, language, **params["material_description"]
)
if material_description_rect:
material_descriptions_sidebar_pairs.append(
MaterialDescriptionRectWithSidebar(sidebar, material_description_rect)
MaterialDescriptionRectWithSidebar(
sidebar=sidebar_noise.sidebar,
material_description_rect=material_description_rect,
noise_count=sidebar_noise.noise_count,
)
)
# lowest score first
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match(words))
material_descriptions_sidebar_pairs.sort(key=lambda pair: pair.score_match)

to_delete = []
for i, pair in enumerate(material_descriptions_sidebar_pairs):
Expand Down Expand Up @@ -440,7 +452,7 @@ def is_below(best_x0, best_y1, line):
if sidebar:
return max(
candidate_rects,
key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match(),
key=lambda rect: MaterialDescriptionRectWithSidebar(sidebar, rect).score_match,
)
else:
return candidate_rects[0]
2 changes: 1 addition & 1 deletion src/stratigraphy/lines/geometric_line_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def _are_parallel(line1: Line, line2: Line, angle_threshold: float) -> bool:
Args:
line1 (Line): The first line.
line2 (Line): The second line.
angle_threshold (float, optional): The acceptable difference between the slopes of the lines.
angle_threshold (float, optional): The acceptable difference between the angles of the lines in degrees.
Returns:
bool: True if the lines are parallel, False otherwise.
Expand Down
1 change: 1 addition & 0 deletions src/stratigraphy/sidebar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

__all__ = [
"Sidebar",
"SidebarNoise",
"AAboveBSidebar",
"AAboveBSidebarExtractor",
"AAboveBSidebarValidator",
Expand Down
12 changes: 8 additions & 4 deletions src/stratigraphy/sidebar/a_above_b_sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def strictly_contains(self, other: AAboveBSidebar) -> bool:
)

def is_strictly_increasing(self) -> bool:
return all(i.value < j.value for i, j in zip(self.entries, self.entries[1:], strict=False))
return all(self.entries[i].value < self.entries[i + 1].value for i in range(len(self.entries) - 1))

def depth_intervals(self) -> list[AAboveBInterval]:
"""Creates a list of depth intervals from the depth column entries.
Expand Down Expand Up @@ -94,11 +94,15 @@ def pearson_correlation_coef(self) -> float:
positions = np.array([entry.rect.y1 for entry in self.entries])
entries = np.array([entry.value for entry in self.entries])

# Avoid warnings in the np.corrcoef call, as the correlation coef is undefined if the standard deviation is 0.
if np.std(entries) == 0 or np.std(positions) == 0:
std_positions = np.std(positions)
std_entries = np.std(entries)
if std_positions == 0 or std_entries == 0:
return 0

return np.corrcoef(positions, entries)[0, 1].item()
# We calculate the Pearson correlation coefficient manually
# to avoid redundant standard deviation calculations that would occur with np.corrcoef.
covariance = np.mean((positions - np.mean(positions)) * (entries - np.mean(entries)))
return covariance / (std_positions * std_entries)

def remove_entry_by_correlation_gradient(self) -> AAboveBSidebar | None:
if len(self.entries) < 3:
Expand Down
37 changes: 26 additions & 11 deletions src/stratigraphy/sidebar/a_above_b_sidebar_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Module for finding AAboveBSidebar instances in a borehole profile."""

import fitz
import rtree

from stratigraphy.depth import DepthColumnEntryExtractor
from stratigraphy.lines.line import TextWord

from .a_above_b_sidebar import AAboveBSidebar
from .a_above_b_sidebar_validator import AAboveBSidebarValidator
from .cluster import Cluster
from .sidebar import SidebarNoise, noise_count
from .sidebarentry import DepthColumnEntry


Expand All @@ -16,17 +18,21 @@ class AAboveBSidebarExtractor:

@staticmethod
def find_in_words(
all_words: list[TextWord], used_entry_rects: list[fitz.Rect], sidebar_params: dict
) -> list[AAboveBSidebar]:
all_words: list[TextWord],
word_rtree: rtree.index.Index,
used_entry_rects: list[fitz.Rect],
sidebar_params: dict,
) -> list[SidebarNoise]:
"""Construct all possible AAboveBSidebar objects from the given words.
Args:
all_words (list[TextWord]): All words in the page.
word_rtree (rtree.index.Index): Pre-built R-tree for spatial queries.
used_entry_rects (list[fitz.Rect]): Part of the document to ignore.
sidebar_params (dict): Parameters for the AAboveBSidebar objects.
Returns:
list[AAboveBSidebar]: Found AAboveBSidebar objects.
list[SidebarNoise]: Validated AAboveBSidebar objects wrapped with noise count.
"""
entries = [
entry
Expand All @@ -36,7 +42,7 @@ def find_in_words(
clusters = Cluster[DepthColumnEntry].create_clusters(entries)

numeric_columns = [AAboveBSidebar(cluster.entries) for cluster in clusters if len(cluster.entries) >= 3]
sidebar_validator = AAboveBSidebarValidator(all_words, **sidebar_params)
sidebar_validator = AAboveBSidebarValidator(**sidebar_params)

filtered_columns = [
column
Expand All @@ -45,18 +51,27 @@ def find_in_words(
if not column.close_to_arithmetic_progression()
]

validated_sidebars = [sidebar_validator.reduce_until_valid(column) for column in filtered_columns]
sidebar_validator = AAboveBSidebarValidator(**sidebar_params)

def process_column(column):
noise = noise_count(column, word_rtree)
sidebar_noise = SidebarNoise(sidebar=column, noise_count=noise)
return sidebar_validator.reduce_until_valid(sidebar_noise, word_rtree)

validated_sidebars = list(filter(None, map(process_column, filtered_columns)))

sidebars_by_length = sorted(
[sidebar for sidebar in validated_sidebars if sidebar],
key=lambda sidebar: len(sidebar.entries),
[sidebar_noise for sidebar_noise in validated_sidebars if sidebar_noise.sidebar],
key=lambda sidebar_noise: len(sidebar_noise.sidebar.entries),
reverse=True,
)

result = []
# Remove columns that are fully contained in a longer column
for sidebar in sidebars_by_length:
if not any(result_sidebar.rect().contains(sidebar.rect()) for result_sidebar in result):
result.append(sidebar)
# Remove sidebar_noise that are fully contained in a longer sidebar
for sidebar_noise in sidebars_by_length:
if not any(
result_sidebar.sidebar.rect().contains(sidebar_noise.sidebar.rect()) for result_sidebar in result
):
result.append(sidebar_noise)

return result
Loading

1 comment on commit 763cd05

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
src/stratigraphy
   __init__.py | 8 | 1 | 88% | 11
   extract.py | 172 | 172 | 0% | 3–458
   get_files.py | 19 | 19 | 0% | 3–47
   main.py | 126 | 126 | 0% | 3–333
src/stratigraphy/benchmark
   metrics.py | 59 | 42 | 29% | 22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
   data_extractor.py | 76 | 4 | 95% | 32, 45, 120, 164
   utility.py | 6 | 3 | 50% | 28–36
src/stratigraphy/depth
   a_to_b_interval_extractor.py | 37 | 15 | 59% | 41–60, 79, 92
   depthcolumnentry_extractor.py | 23 | 2 | 91% | 45–46
   interval.py | 103 | 53 | 49% | 25–28, 33–36, 42, 48, 52, 91–137, 147, 161, 167–183
src/stratigraphy/depths_materials_column_pairs
   bounding_boxes.py | 30 | 10 | 67% | 23, 32, 50, 60, 72–78
   material_description_rect_with_sidebar.py | 18 | 7 | 61% | 25–37
src/stratigraphy/evaluation
   evaluation_dataclasses.py | 49 | 11 | 78% | 52, 71–74, 90, 104, 125–131, 147
   groundwater_evaluator.py | 48 | 1 | 98% | 77
   layer_evaluator.py | 66 | 46 | 30% | 29–30, 35–39, 47, 69–95, 105–113, 128–149
   metadata_evaluator.py | 37 | 14 | 62% | 46–65, 86–93
   utility.py | 16 | 7 | 56% | 43–52
src/stratigraphy/groundwater
   groundwater_extraction.py | 146 | 90 | 38% | 52, 94, 137–148, 180–184, 199–215, 226–314, 335–363
   utility.py | 42 | 36 | 14% | 10–17, 30–50, 62–76, 91–105
src/stratigraphy/layer
   layer.py | 37 | 13 | 65% | 26, 29, 37, 52–72
src/stratigraphy/lines
   geometric_line_utilities.py | 86 | 2 | 98% | 81, 131
   line.py | 51 | 4 | 92% | 25, 50, 60, 110
   linesquadtree.py | 46 | 1 | 98% | 75
src/stratigraphy/metadata
   coordinate_extraction.py | 106 | 4 | 96% | 29, 93–94, 106
   elevation_extraction.py | 90 | 60 | 33% | 34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 234–238
   language_detection.py | 18 | 13 | 28% | 17–23, 37–45
   metadata.py | 66 | 24 | 64% | 27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
   a_above_b_sidebar.py | 99 | 39 | 61% | 40, 43, 58–66, 71, 78, 100, 108–115, 145–146, 188–229
   a_above_b_sidebar_validator.py | 51 | 27 | 47% | 51, 54, 57, 59, 85–96, 122–145, 157–161
   a_to_b_sidebar.py | 43 | 14 | 67% | 36, 49–50, 67, 95–108
   layer_identifier_sidebar.py | 46 | 29 | 37% | 46–65, 81–97, 109, 122
   layer_identifier_sidebar_extractor.py | 29 | 20 | 31% | 31–41, 55–75
   sidebar.py | 53 | 5 | 91% | 42, 47, 68, 80, 83
   sidebarentry.py | 20 | 3 | 85% | 27, 31, 43
src/stratigraphy/text
   description_block_splitter.py | 70 | 2 | 97% | 24, 139
   extract_text.py | 29 | 3 | 90% | 19, 53–54
   find_description.py | 41 | 8 | 80% | 26–34, 111–114
   textblock.py | 90 | 11 | 88% | 22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
   dataclasses.py | 32 | 3 | 91% | 37–39
   predictions.py | 72 | 34 | 53% | 72, 95–115, 143–187
   util.py | 34 | 12 | 65% | 69–76, 90–92, 116–117
TOTAL | 2428 | 990 | 59% |

Tests Skipped Failures Errors Time
102 0 💤 0 ❌ 0 🔥 7.858s ⏱️

Please sign in to comment.