Skip to content

Commit

Permalink
Merge pull request #724 from NatLibFi/upgrade-simplemma
Browse files Browse the repository at this point in the history
Upgrade Simplemma & limit its memory usage
  • Loading branch information
juhoinkinen authored Sep 17, 2024
2 parents 337ee70 + 61c9409 commit e1edc53
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 6 deletions.
4 changes: 2 additions & 2 deletions annif/analyzer/simplemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import simplemma
import annif.simplemma_util

from . import analyzer

Expand All @@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
super().__init__(**kwargs)

# Lemmatize a single word using the analyzer's language (self.lang).
# NOTE(review): this diff view lists two return statements, one calling the
# module-level simplemma.lemmatize and one going through the shared
# annif.simplemma_util.lemmatizer. Presumably the first is the removed line
# and the second its replacement; confirm against the committed file.
def _normalize_word(self, word: str) -> str:
return simplemma.lemmatize(word, lang=self.lang)
return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
17 changes: 17 additions & 0 deletions annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

# A single dictionary factory, created with cache_max_size=LANG_CACHE_SIZE,
# backs a single lemmatization strategy. That one strategy object is passed
# both to the module-level lemmatizer below and to every detector returned by
# get_language_detector(), so they all go through the same factory.
_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
# Public, module-level Lemmatizer instance; callers use lemmatizer.lemmatize().
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Create a language detector for the given target language(s).

    The detector is constructed with the module-level lemmatization strategy,
    i.e. the same strategy object that the module-level ``lemmatizer`` uses.

    :param lang: a single language code, or a tuple of language codes
    :return: a new ``LanguageDetector`` targeting ``lang``
    """
    detector = LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
    return detector
8 changes: 5 additions & 3 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

from typing import TYPE_CHECKING

from simplemma.langdetect import in_target_language

import annif
import annif.simplemma_util

from . import transform

Expand All @@ -31,6 +30,9 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
Expand All @@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
proportion = in_target_language(sent, lang=(self.project.language,))
proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ numpy = "1.26.*"
optuna = "3.6.*"
python-dateutil = "2.9.*"
tomli = { version = "2.0.*", python = "<3.11" }
simplemma = "0.9.*"
simplemma = "~1.1.1"
jsonschema = "4.21.*"
huggingface-hub = "0.22.*"

Expand Down
19 changes: 19 additions & 0 deletions tests/test_simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Unit tests for Simplemma utility functions"""

import pytest

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
    """A detector targeting only English scores the sample sentence at 0.75."""
    sample = "She said 'au revoir' and left"
    ratio = get_language_detector("en").proportion_in_target_languages(sample)
    assert ratio == pytest.approx(0.75)


def test_get_language_detector_many():
    """A detector targeting English and French scores the sample at 1.0."""
    targets = ("en", "fr")
    sample = "She said 'au revoir' and left"
    ratio = get_language_detector(targets).proportion_in_target_languages(sample)
    assert ratio == pytest.approx(1.0)

0 comments on commit e1edc53

Please sign in to comment.