# Patch summary (commentary only: this text precedes the first "diff --git"
# header and belongs to no hunk).
#
# Routes Annif's Simplemma usage through a new wrapper module,
# annif/simplemma_util.py, and raises the simplemma requirement.
#
# * annif/simplemma_util.py (new): builds one DefaultDictionaryFactory with
#   cache_max_size=LANG_CACHE_SIZE (5), wraps it in a DefaultStrategy, and
#   exposes a module-level `lemmatizer` plus `get_language_detector(lang)`.
#   The Lemmatizer and every LanguageDetector returned by the helper are
#   constructed with that same strategy object (presumably so they share one
#   bounded dictionary cache; confirm against the simplemma documentation).
# * annif/analyzer/simplemma.py: `_normalize_word` now calls
#   `annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)` instead
#   of the module-level `simplemma.lemmatize(...)`.
# * annif/transform/langfilter.py: `__init__` creates the detector once from
#   `self.project.language`; `transform_fn` calls
#   `proportion_in_target_languages(sent)` instead of
#   `in_target_language(sent, lang=(self.project.language,))`.
# * pyproject.toml: simplemma requirement changes from "0.9.*" to "~1.1.1".
# * tests/test_simplemma_util.py (new): two tests on the sentence
#   "She said 'au revoir' and left", expecting a proportion of 0.75 for "en"
#   and 1.0 for ("en", "fr").
#
# NOTE(review): line breaks, blank context lines and indentation were restored
# from the hunk header line counts and standard Python layout; verify against
# the target tree before applying.
diff --git a/annif/analyzer/simplemma.py b/annif/analyzer/simplemma.py
index e549a2585..3e1536882 100644
--- a/annif/analyzer/simplemma.py
+++ b/annif/analyzer/simplemma.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-import simplemma
+import annif.simplemma_util
 
 from . import analyzer
 
@@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
         super().__init__(**kwargs)
 
     def _normalize_word(self, word: str) -> str:
-        return simplemma.lemmatize(word, lang=self.lang)
+        return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
diff --git a/annif/simplemma_util.py b/annif/simplemma_util.py
new file mode 100644
index 000000000..4a8b8a1e6
--- /dev/null
+++ b/annif/simplemma_util.py
@@ -0,0 +1,17 @@
+"""Wrapper code for using Simplemma functionality in Annif"""
+
+from typing import Tuple, Union
+
+from simplemma import LanguageDetector, Lemmatizer
+from simplemma.strategies import DefaultStrategy
+from simplemma.strategies.dictionaries import DefaultDictionaryFactory
+
+LANG_CACHE_SIZE = 5  # How many language dictionaries to keep in memory at once (max)
+
+_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
+_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
+lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)
+
+
+def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
+    return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py
index f7c985485..e5cf8fdfe 100644
--- a/annif/transform/langfilter.py
+++ b/annif/transform/langfilter.py
@@ -5,9 +5,8 @@
 
 from typing import TYPE_CHECKING
 
-from simplemma.langdetect import in_target_language
-
 import annif
+import annif.simplemma_util
 
 from . import transform
 
@@ -31,6 +30,9 @@ def __init__(
         self.text_min_length = int(text_min_length)
         self.sentence_min_length = int(sentence_min_length)
         self.min_ratio = float(min_ratio)
+        self.language_detector = annif.simplemma_util.get_language_detector(
+            self.project.language
+        )
 
     def transform_fn(self, text: str) -> str:
         if len(text) < self.text_min_length:
@@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
             if len(sent) < self.sentence_min_length:
                 retained_sentences.append(sent)
                 continue
-            proportion = in_target_language(sent, lang=(self.project.language,))
+            proportion = self.language_detector.proportion_in_target_languages(sent)
             if proportion >= self.min_ratio:
                 retained_sentences.append(sent)
         return " ".join(retained_sentences)
diff --git a/pyproject.toml b/pyproject.toml
index 970fd2503..487bb8649 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ numpy = "1.26.*"
 optuna = "3.6.*"
 python-dateutil = "2.9.*"
 tomli = { version = "2.0.*", python = "<3.11" }
-simplemma = "0.9.*"
+simplemma = "~1.1.1"
 jsonschema = "4.21.*"
 huggingface-hub = "0.22.*"
 
diff --git a/tests/test_simplemma_util.py b/tests/test_simplemma_util.py
new file mode 100644
index 000000000..57ea8b83e
--- /dev/null
+++ b/tests/test_simplemma_util.py
@@ -0,0 +1,19 @@
+"""Unit tests for Simplemma utility functions"""
+
+import pytest
+
+from annif.simplemma_util import get_language_detector
+
+
+def test_get_language_detector():
+    detector = get_language_detector("en")
+    text = "She said 'au revoir' and left"
+    proportion = detector.proportion_in_target_languages(text)
+    assert proportion == pytest.approx(0.75)
+
+
+def test_get_language_detector_many():
+    detector = get_language_detector(("en", "fr"))
+    text = "She said 'au revoir' and left"
+    proportion = detector.proportion_in_target_languages(text)
+    assert proportion == pytest.approx(1.0)