From 30b2aeca7cf3c428a973317332073465d8e00a04 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Tue, 21 Jan 2025 14:28:17 +0100 Subject: [PATCH 1/8] chore(Documentation): fix typo in usage sample --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f276ef..5c8122f 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ pip install dwdsmor The library can be used for lemmatisation: ``` python-console ->>> import dwsdmor +>>> import dwdsmor >>> lemmatizer = dwdsmor.lemmatizer() >>> assert lemmatizer("getestet", pos={"+V"}) == "testen" >>> assert lemmatizer("getestet", pos={"+ADJ"}) == "getestet" From d549219dc8b929b503cc79e8be2a0a2a12cc7625 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 16:28:54 +0100 Subject: [PATCH 2/8] feat: Add spaCy integration as single-threaded pipeline component --- dwdsmor/automaton.py | 2 +- dwdsmor/spacy.py | 76 ++ dwdsmor/{tag.py => tag/__init__.py} | 0 dwdsmor/tag/hdt.py | 105 +++ pyproject.toml | 4 +- test/__snapshots__/test_spacy.ambr | 1067 +++++++++++++++++++++++++++ test/test_spacy.py | 36 + 7 files changed, 1287 insertions(+), 3 deletions(-) create mode 100644 dwdsmor/spacy.py rename dwdsmor/{tag.py => tag/__init__.py} (100%) create mode 100644 dwdsmor/tag/hdt.py create mode 100644 test/__snapshots__/test_spacy.ambr create mode 100644 test/test_spacy.py diff --git a/dwdsmor/automaton.py b/dwdsmor/automaton.py index 8c89a02..5720b8e 100644 --- a/dwdsmor/automaton.py +++ b/dwdsmor/automaton.py @@ -209,7 +209,7 @@ def __init__(self, automata, automaton_type="lemma"): def __call__(self, word, **criteria): traversals = tuple(self.analyzer.analyze(word)) - criteria_stack = list(criteria.items()) + criteria_stack = list((k, v) for k, v in criteria.items() if v) criteria_stack.reverse() while criteria_stack: if len(traversals) == 1: diff --git a/dwdsmor/spacy.py b/dwdsmor/spacy.py new file mode 100644 index 0000000..08d18cf --- /dev/null +++ b/dwdsmor/spacy.py @@ -0,0 +1,76 @@ +from collections import OrderedDict +from functools import cache +from typing import Iterable + +from spacy.language import Language +from spacy.tokens.token import Token + +import dwdsmor.tag.hdt as hdt + +from . import lemmatizer +from .automaton import Lemmatizer + +Token.set_extension("dwdsmor_lemma", default=None) + + +def criterion(k, v, mapping): + return (k, mapping.get(v, {v}) if v else None) + + +@cache +def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite): + return OrderedDict( + ( + criterion("pos", pos, hdt.pos_map), + criterion("number", number, hdt.number_map), + criterion("gender", gender, hdt.gender_map), + criterion("case", case, hdt.case_map), + criterion("person", person, hdt.person_map), + criterion("tense", tense, hdt.tense_map), + criterion("degree", degree, hdt.degree_map), + criterion("mood", mood, hdt.mood_map), + criterion("nonfinite", nonfinite, hdt.nonfinite_map), + ) + ) + + +def morph(token_morph, k): + v = ",".join(token_morph.get(k)) + return v if v else None + + +def lemmatize_token(lemmatizer: Lemmatizer, token: Token): + token_morph = token.morph + token_criteria = criteria( + token.tag_, + morph(token_morph, "Number"), + morph(token_morph, "Gender"), + morph(token_morph, "Case"), + morph(token_morph, "Person"), + morph(token_morph, "Tense"), + morph(token_morph, "Degree"), + morph(token_morph, "Mood"), + morph(token_morph, "VerbForm"), + ) + token._.dwdsmor_lemma = lemmatizer(token.text, **token_criteria) + return token + + +def lemmatize(lemmatizer: Lemmatizer, tokens: Iterable[Token]): + for token in tokens: + lemmatize_token(lemmatizer, token) + return tokens + + +class Component: + def __init__(self, automata_location=None): + self.lemmatizer = lemmatizer(automata_location) + + def __call__(self, doc): + lemmatize(self.lemmatizer, doc) + return doc + + +@Language.factory("dwdsmor", default_config={"automata_location": None}) +def create_component(nlp: Language, name: str, automata_location: str | None): + return Component(automata_location) diff --git a/dwdsmor/tag.py b/dwdsmor/tag/__init__.py similarity index 100% rename from dwdsmor/tag.py rename to dwdsmor/tag/__init__.py diff --git a/dwdsmor/tag/hdt.py b/dwdsmor/tag/hdt.py new file mode 100644 index 0000000..6103223 --- /dev/null +++ b/dwdsmor/tag/hdt.py @@ -0,0 +1,105 @@ +pos_map = { + "$(": {"+PUNCT"}, + "$,": {"+PUNCT"}, + "$.": {"+PUNCT"}, + "ADJA": {"+ADJ", "+CARD", "+INDEF", "+ORD"}, + "ADJD": {"+ADJ"}, + "ADV": {"+ADV"}, + "APPO": {"+POSTP"}, + "APPR": {"+PREP"}, + "APPR_ART": {"+PREPART"}, + "APZR": {"+POSTP", "+PREP"}, + "ART": {"+ART"}, + "CARD": {"+CARD"}, + "FM": {"+FM"}, # ? + "ITJ": {"+INTJ"}, + "KOKOM": {"+CONJ"}, + "KON": {"+CONJ"}, + "KOUI": {"+CONJ"}, + "KOUS": {"+CONJ"}, + "NE": {"+NN", "+NPROP"}, + "NN": {"+NN", "+NPROP"}, + "PDAT": {"+DEM"}, + "PDS": {"+DEM"}, + "PIAT": {"+INDEF"}, + "PIDAT": {"+INDEF"}, + "PIS": {"+INDEF"}, + "PPER": {"+PPRO"}, + "PPOSAT": {"+POSS"}, + "PPOSS": {"+POSS"}, + "PRELAT": {"+REL"}, + "PRELS": {"+REL"}, + "PRF": {"+PPRO"}, + "PROAV": {"+ADV", "+PROADV"}, + "PTKA": {"+PTCL"}, + "PTKANT": {"+INTJ", "+PTCL"}, + "PTKNEG": {"+PTCL"}, + "PTKVZ": {"+ADV", "+PREP", "+VPART"}, + "PTKZU": {"+PTCL"}, + "PWAT": {"+WPRO"}, + "PWAV": {"+ADV"}, + "PWS": {"+WPRO"}, + "TRUNC": {"+TRUNC"}, # ? + "VAFIN": {"+V"}, + "VAIMP": {"+V"}, + "VAINF": {"+V"}, + "VAPP": {"+V"}, + "VMFIN": {"+V"}, + "VMINF": {"+V"}, + "VMPP": {"+V"}, + "VVFIN": {"+V"}, + "VVIMP": {"+V"}, + "VVINF": {"+V"}, + "VVIZU": {"+V"}, + "VVPP": {"+V"}, + "XY": {"+XY"}, # ? +} + +number_map = { + "Sing": {"Sg"}, + "Plur": {"Pl"}, +} + + +gender_map = { + "Masc,Neut": {"Masc", "Neut"}, + "Neut": {"Neut"}, + "Fem": {"Fem"}, + "Masc": {"Masc"}, +} + +case_map = { + "Nom": {"Nom"}, + "Gen": {"Gen"}, + "Dat": {"Dat"}, + "Acc": {"Acc"}, +} + +person_map = { + "1": {"1"}, + "2": {"2"}, + "3": {"3"}, +} + +tense_map = { + "Past": {"Past"}, + "Pres": {"Pres"}, +} + + +degree_map = { + "Cmp": {"Comp"}, + "Sup": {"Sup"}, + "Pos": {"Pos"}, +} + +mood_map = { + "Ind": {"Ind"}, + "Imp": {"Imp"}, +} + +# VerbForm +nonfinite_map = { + "Part": {"Part"}, + "Inf": {"Inf"}, +} diff --git a/pyproject.toml b/pyproject.toml index 1789476..ac7335f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,8 +39,8 @@ dev = [ "pytest", "syrupy", "tqdm", - "Jinja2" -] + "Jinja2", + "de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"] [project.scripts] dwdsmor = "dwdsmor.cli:main" diff --git a/test/__snapshots__/test_spacy.ambr b/test/__snapshots__/test_spacy.ambr new file mode 100644 index 0000000..5b39374 --- /dev/null +++ b/test/__snapshots__/test_spacy.ambr @@ -0,0 +1,1067 @@ +# serializer version: 1 +# name: test_lemmatisation + tuple( + tuple( + 'Sehr', + 'ADV', + 'Sehr', + 'sehr', + ), + tuple( + 'gute', + 'ADJA', + 'gut', + 'gut', + ), + tuple( + 'Beratung', + 'NN', + 'Beratung', + 'Beratung', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'schnelle', + 'ADJA', + 'schnell', + 'schnell', + ), + tuple( + 'Behebung', + 'NN', + 'Behebung', + 'Behebung', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Probleme', + 'NN', + 'Problem', + 'Problem', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'so', + 'ADV', + 'so', + 'so', + ), + tuple( + 'stelle', + 'VVFIN', + 'stellen', + 'stellen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'mir', + 'PRF', + 'ich', + 'ich', + ), + tuple( + 'Kundenservice', + 'NN', + 'Service', + 'Kundenservice', + ), + tuple( + 'vor', + 'PTKVZ', + 'vor', + 'vor', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Die', + 'ART', + 'der', + 'die', + ), + tuple( + 'Kosten', + 'NN', + 'Kosten', + 'Kosten', + ), + tuple( + 'sind', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'definitiv', + 'ADJD', + 'definitiv', + 'definitiv', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'im', + 'APPR_ART', + 'in der', + 'in', + ), + tuple( + 'Rahmen', + 'NN', + 'Rahmen', + 'Rahmen', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Nette', + 'ADJA', + 'Nette', + 'nett', + ), + tuple( + 'Gespräche', + 'NN', + 'Gespräch', + 'Gespräch', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'klasse', + 'ADJA', + 'klasse', + 'klasse', + ), + tuple( + 'Ergebnis', + 'NN', + 'Ergebnis', + 'Ergebnis', + ), + tuple( + 'Ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'bin', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'seit', + 'APPR', + 'seit', + 'seit', + ), + tuple( + 'längerer', + 'ADJA', + 'lang', + 'lang', + ), + tuple( + 'Zeit', + 'NN', + 'Zeit', + 'Zeit', + ), + tuple( + 'zur', + 'APPR_ART', + 'zu der', + 'zu', + ), + tuple( + 'Behandlung', + 'NN', + 'Behandlung', + 'Behandlung', + ), + tuple( + 'verschiedenster', + 'ADJA', + 'verschieden', + 'verschieden', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'Leiden', + 'NN', + 'Leiden', + 'Leiden', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'in', + 'APPR', + 'in', + 'in', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Physiotherapieraxis', + 'NN', + 'Physiotherapieraxis', + None, + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'Gaby', + 'NE', + 'Gaby', + None, + ), + tuple( + 'Montag', + 'NE', + 'Montag', + 'Montag', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'im', + 'APPR_ART', + 'in der', + 'in', + ), + tuple( + 'Vital', + 'NN', + 'Vital', + 'vital', + ), + tuple( + 'Center', + 'NN', + 'Center', + 'Center', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'kann', + 'VMFIN', + 'können', + 'können', + ), + tuple( + 'ausschließlich', + 'ADV', + 'ausschließlich', + 'ausschließlich', + ), + tuple( + 'Positives', + 'NN', + 'Positiv', + 'positiv', + ), + tuple( + 'berichten', + 'VVINF', + 'berichten', + 'berichten', + ), + tuple( + '!', + '$.', + '!', + '!', + ), + tuple( + 'Ob', + 'KOUS', + 'Ob', + 'ob', + ), + tuple( + 'bei', + 'APPR', + 'bei', + 'bei', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Terminvergabe', + 'NN', + 'Terminvergabe', + 'Terminvergabe', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'den', + 'ART', + 'der', + 'die', + ), + tuple( + 'Behandlungsräumen', + 'NN', + 'Behandlungsräumen', + 'Behandlungsraum', + ), + tuple( + 'oder', + 'KON', + 'oder', + 'oder', + ), + tuple( + 'den', + 'ART', + 'der', + 'die', + ), + tuple( + 'individuell', + 'ADJD', + 'individuell', + 'individuell', + ), + tuple( + 'zugeschnittenen', + 'ADJA', + 'zuschneiden', + 'zugeschnitten', + ), + tuple( + 'Trainingsplänen', + 'NN', + 'Trainingsplan', + 'Trainingsplan', + ), + tuple( + 'sind', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'alle', + 'PIDAT', + 'alle', + 'alle', + ), + tuple( + 'Mitarbeiter', + 'NN', + 'Mitarbeiter', + 'Mitarbeiter', + ), + tuple( + 'äußerst', + 'ADV', + 'äußerst', + 'äußern', + ), + tuple( + 'kompetent', + 'ADJD', + 'kompetent', + 'kompetent', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'flexibel', + 'ADJD', + 'flexibel', + 'flexibel', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Sauberkeit', + 'NN', + 'Sauberkeit', + 'Sauberkeit', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'Ordnung', + 'NN', + 'Ordnung', + 'Ordnung', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'Freundlichkeit', + 'NN', + 'Freundlichkeit', + 'Freundlichkeit', + ), + tuple( + 'brauche', + 'VVFIN', + 'brauchen', + 'brauchen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'hier', + 'ADV', + 'hier', + 'hier', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'zu', + 'PTKZU', + 'zu', + 'zu', + ), + tuple( + 'erwähnen', + 'VVINF', + 'erwähnen', + 'erwähnen', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'denn', + 'KON', + 'denn', + 'denn', + ), + tuple( + 'das', + 'PDS', + 'der', + 'die', + ), + tuple( + 'gehört', + 'VVFIN', + 'gehören', + 'gehören', + ), + tuple( + 'für', + 'APPR', + 'für', + 'für', + ), + tuple( + 'mich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'zum', + 'APPR_ART', + 'zu der', + 'zu', + ), + tuple( + 'Standard', + 'NN', + 'Standard', + 'Standard', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'der', + 'PRELS', + 'der', + 'die', + ), + tuple( + 'aber', + 'ADV', + 'aber', + 'aber', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'noch', + 'ADV', + 'noch', + 'noch', + ), + tuple( + 'übertroffen', + 'VVPP', + 'übertreffen', + 'übertreffen', + ), + tuple( + 'wird', + 'VAFIN', + 'werden', + 'werden', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Physiotherapie', + 'NE', + 'Physiotherapie', + 'Physiotherapie', + ), + tuple( + 'ist', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'zwar', + 'ADV', + 'zwar', + 'zwar', + ), + tuple( + 'oftmals', + 'ADV', + 'oftmals', + 'oftmals', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'anstrengend', + 'ADJD', + 'anstrengen', + 'anstrengend', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'aber', + 'KON', + 'aber', + 'aber', + ), + tuple( + 'in', + 'APPR', + 'in', + 'in', + ), + tuple( + 'dieser', + 'PDAT', + 'dieser', + 'diese', + ), + tuple( + 'Umgebeung', + 'NN', + 'Umgebeung', + None, + ), + tuple( + 'freut', + 'VVFIN', + 'freuen', + 'freuen', + ), + tuple( + 'man', + 'PIS', + 'man', + 'man', + ), + tuple( + 'sich', + 'PRF', + 'sich', + 'sich', + ), + tuple( + 'auf', + 'APPR', + 'auf', + 'auf', + ), + tuple( + 'jede', + 'PIDAT', + 'jeder', + 'jede', + ), + tuple( + 'Minute', + 'NN', + 'Minute', + 'Minute', + ), + tuple( + 'Behandlung', + 'NN', + 'Behandlung', + 'Behandlung', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Das', + 'ART', + 'der', + 'die', + ), + tuple( + 'nächste', + 'ADJA', + 'NULL', + 'nahe', + ), + tuple( + 'mal', + 'ADV', + 'mal', + 'mal', + ), + tuple( + 'rief', + 'VVFIN', + 'rufen', + 'rufen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'extra', + 'ADV', + 'extra', + 'extra', + ), + tuple( + 'vorher', + 'ADV', + 'vorher', + 'vorher', + ), + tuple( + 'an', + 'PTKVZ', + 'an', + 'an', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'um', + 'KOUI', + 'um', + 'um', + ), + tuple( + 'einen', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Termin', + 'NN', + 'Termin', + 'Termin', + ), + tuple( + 'zu', + 'PTKZU', + 'zu', + 'zu', + ), + tuple( + 'vereinbaren', + 'VVINF', + 'vereinbaren', + 'vereinbaren', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'damit', + 'KOUS', + 'damit', + 'damit', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Konditor', + 'NN', + 'Konditor', + 'Konditor', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'Zeit', + 'NN', + 'Zeit', + 'Zeit', + ), + tuple( + 'für', + 'APPR', + 'für', + 'für', + ), + tuple( + 'uns', + 'PPER', + 'wir', + 'uns', + ), + tuple( + 'hätte', + 'VAFIN', + 'haben', + 'haben', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Eine', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Stunde', + 'NN', + 'Stunde', + 'Stunde', + ), + tuple( + 'später', + 'ADJD', + 'spät', + 'spät', + ), + tuple( + 'gab', + 'VVFIN', + 'geben', + 'geben', + ), + tuple( + 'man', + 'PIS', + 'man', + 'man', + ), + tuple( + 'uns', + 'PPER', + 'wir', + 'uns', + ), + tuple( + 'dann', + 'ADV', + 'dann', + 'dann', + ), + tuple( + 'endlich', + 'ADJD', + 'endlich', + 'endlich', + ), + tuple( + 'einen', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Tisch', + 'NN', + 'Tisch', + 'Tisch', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'der', + 'PRELS', + 'der', + 'die', + ), + tuple( + 'allerdings', + 'ADV', + 'allerdings', + 'allerdings', + ), + tuple( + 'noch', + 'ADV', + 'noch', + 'noch', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'einmal', + 'ADV', + 'einmal', + 'einmal', + ), + tuple( + 'abgeräumt', + 'VVPP', + 'abräumen', + 'abräumen', + ), + tuple( + 'war', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Die', + 'ART', + 'der', + 'die', + ), + tuple( + 'Bedienung', + 'NN', + 'Bedienung', + 'Bedienung', + ), + tuple( + 'verschwand', + 'VVFIN', + 'verschwinden', + 'verschwinden', + ), + tuple( + 'sofort', + 'ADV', + 'sofort', + 'sofort', + ), + tuple( + 'wieder', + 'ADV', + 'wieder', + 'wieder', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'kam', + 'VVFIN', + 'kommen', + 'kommen', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'erstmal', + 'ADV', + 'erstmal', + 'erstmal', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'mehr', + 'ADV', + 'mehr', + 'sehr', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + ) +# --- diff --git a/test/test_spacy.py b/test/test_spacy.py new file mode 100644 index 0000000..19c7b47 --- /dev/null +++ b/test/test_spacy.py @@ -0,0 +1,36 @@ +import spacy +from datasets import load_dataset +from pytest import fixture + +import dwdsmor +import dwdsmor.spacy + + +@fixture(scope="module") +def nlp(): + nlp = spacy.load("de_hdt_lg") + nlp.add_pipe("dwdsmor") + return nlp + + +@fixture(scope="module") +def lemmatizer(): + return dwdsmor.lemmatizer() + + +@fixture(scope="module") +def sentences(): + ds = load_dataset( + "universal_dependencies", + "de_gsd", + split="train", + trust_remote_code=True, + ) + return tuple(s["text"] for s in ds.select(range(100))) + + +def test_lemmatisation(nlp, lemmatizer, sentences, snapshot): + sentences = sentences[:10] + docs = nlp.pipe(sentences) + tokens = ((t.text, t.tag_, t.lemma_, t._.dwdsmor_lemma) for d in docs for t in d) + assert tuple(tokens) == snapshot From 4189d0e7092b3e072755914ffce6a79b35efde83 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 16:43:40 +0100 Subject: [PATCH 3/8] chore(Project): Add Python version classifiers --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index ac7335f..5af86ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,12 @@ classifiers = [ "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Education", "Topic :: Scientific/Engineering", "Topic :: Text Processing :: Linguistic" From e679773bfb6f89983206c82c0bba30085b7f2f18 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 16:49:19 +0100 Subject: [PATCH 4/8] chore: Snapshot spaCy test with Open Edition --- test/__snapshots__/test_spacy.ambr | 42 +++++++++++++++--------------- test/test_spacy.py | 10 ++----- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/test/__snapshots__/test_spacy.ambr b/test/__snapshots__/test_spacy.ambr index 5b39374..6d1b9c4 100644 --- a/test/__snapshots__/test_spacy.ambr +++ b/test/__snapshots__/test_spacy.ambr @@ -35,7 +35,7 @@ 'Behebung', 'NN', 'Behebung', - 'Behebung', + None, ), tuple( 'der', @@ -83,7 +83,7 @@ 'Kundenservice', 'NN', 'Service', - 'Kundenservice', + None, ), tuple( 'vor', @@ -155,7 +155,7 @@ 'Gespräche', 'NN', 'Gespräch', - 'Gespräch', + None, ), tuple( ',', @@ -221,7 +221,7 @@ 'verschiedenster', 'ADJA', 'verschieden', - 'verschieden', + None, ), tuple( '"', @@ -293,13 +293,13 @@ 'Vital', 'NN', 'Vital', - 'vital', + None, ), tuple( 'Center', 'NN', 'Center', - 'Center', + None, ), tuple( 'und', @@ -359,7 +359,7 @@ 'Terminvergabe', 'NN', 'Terminvergabe', - 'Terminvergabe', + None, ), tuple( ',', @@ -377,7 +377,7 @@ 'Behandlungsräumen', 'NN', 'Behandlungsräumen', - 'Behandlungsraum', + None, ), tuple( 'oder', @@ -407,7 +407,7 @@ 'Trainingsplänen', 'NN', 'Trainingsplan', - 'Trainingsplan', + None, ), tuple( 'sind', @@ -461,7 +461,7 @@ 'Sauberkeit', 'NN', 'Sauberkeit', - 'Sauberkeit', + None, ), tuple( ',', @@ -485,7 +485,7 @@ 'Freundlichkeit', 'NN', 'Freundlichkeit', - 'Freundlichkeit', + None, ), tuple( 'brauche', @@ -515,13 +515,13 @@ 'zu', 'PTKZU', 'zu', - 'zu', + None, ), tuple( 'erwähnen', 'VVINF', 'erwähnen', - 'erwähnen', + None, ), tuple( ',', @@ -569,7 +569,7 @@ 'Standard', 'NN', 'Standard', - 'Standard', + None, ), tuple( ',', @@ -605,7 +605,7 @@ 'übertroffen', 'VVPP', 'übertreffen', - 'übertreffen', + None, ), tuple( 'wird', @@ -623,7 +623,7 @@ 'Physiotherapie', 'NE', 'Physiotherapie', - 'Physiotherapie', + None, ), tuple( 'ist', @@ -779,7 +779,7 @@ 'an', 'PTKVZ', 'an', - 'an', + None, ), tuple( ',', @@ -809,13 +809,13 @@ 'zu', 'PTKZU', 'zu', - 'zu', + None, ), tuple( 'vereinbaren', 'VVINF', 'vereinbaren', - 'vereinbaren', + None, ), tuple( ',', @@ -839,7 +839,7 @@ 'Konditor', 'NN', 'Konditor', - 'Konditor', + None, ), tuple( 'auch', @@ -1001,7 +1001,7 @@ 'Bedienung', 'NN', 'Bedienung', - 'Bedienung', + None, ), tuple( 'verschwand', diff --git a/test/test_spacy.py b/test/test_spacy.py index 19c7b47..96597fa 100644 --- a/test/test_spacy.py +++ b/test/test_spacy.py @@ -2,8 +2,7 @@ from datasets import load_dataset from pytest import fixture -import dwdsmor -import dwdsmor.spacy +import dwdsmor.spacy # noqa @fixture(scope="module") @@ -13,11 +12,6 @@ def nlp(): return nlp -@fixture(scope="module") -def lemmatizer(): - return dwdsmor.lemmatizer() - - @fixture(scope="module") def sentences(): ds = load_dataset( @@ -29,7 +23,7 @@ def sentences(): return tuple(s["text"] for s in ds.select(range(100))) -def test_lemmatisation(nlp, lemmatizer, sentences, snapshot): +def test_lemmatisation(nlp, sentences, snapshot): sentences = sentences[:10] docs = nlp.pipe(sentences) tokens = ((t.text, t.tag_, t.lemma_, t._.dwdsmor_lemma) for d in docs for t in d) From aad736fe7f137bfd55e12c5aeb863abea67350ef Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 16:57:16 +0100 Subject: [PATCH 5/8] chore(Documentation): show sample usage of spaCy integration --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 5c8122f..4a95194 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,18 @@ The library can be used for lemmatisation: >>> assert lemmatizer("getestet", pos={"+ADJ"}) == "getestet" ``` +There is also integration with spacy: + +``` python-console +>>> import spacy +>>> import dwdsmor.spacy +>>> nlp = spacy.load("de_hdt_lg") +>>> nlp.add_pipe("dwdsmor") + +>>> tuple((t.lemma_, t._.dwdsmor_lemma) for t in nlp("Das ist ein Test.")) +(('der', 'die'), ('sein', 'sein'), ('ein', 'eine'), ('Test', 'Test'), ('.', '.')) +``` + Next to the Python API, the package provides a simple command line interface named `dwdsmor`. To analyze a word form, pass it as an argument: From dec02176706eb2f0c46c07024b3dd2b79a909628 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 17:02:54 +0100 Subject: [PATCH 6/8] chore(Test): Only warn on unused regression snapshots --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0b0ff6e..966c3f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,4 +27,4 @@ jobs: - name: Build run: ./build-dwdsmor - name: Run tests - run: pytest + run: pytest --snapshot-warn-unused From 19d41b3be846e2ff7b4b2a116f10617599e38655 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Mon, 27 Jan 2025 08:35:45 +0100 Subject: [PATCH 7/8] chore(API): Decouple DWDSmor/HDT tag translation from spaCy integration --- README.md | 6 +- dwdsmor/automaton.py | 6 +- dwdsmor/spacy.py | 29 +- dwdsmor/tag/hdt.py | 24 + test/__snapshots__/test_spacy.ambr | 858 +---------------------------- test/test_lemmatizer.py | 4 +- test/test_spacy.py | 10 +- 7 files changed, 47 insertions(+), 890 deletions(-) diff --git a/README.md b/README.md index 4a95194..dbbee12 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,8 @@ The library can be used for lemmatisation: ``` python-console >>> import dwdsmor >>> lemmatizer = dwdsmor.lemmatizer() ->>> assert lemmatizer("getestet", pos={"+V"}) == "testen" ->>> assert lemmatizer("getestet", pos={"+ADJ"}) == "getestet" +>>> assert lemmatizer("getestet", pos={"+V"}).analysis == "testen" +>>> assert lemmatizer("getestet", pos={"+ADJ"}).analysis == "getestet" ``` There is also integration with spacy: @@ -73,7 +73,7 @@ There is also integration with spacy: >>> nlp = spacy.load("de_hdt_lg") >>> nlp.add_pipe("dwdsmor") ->>> tuple((t.lemma_, t._.dwdsmor_lemma) for t in nlp("Das ist ein Test.")) +>>> tuple((t.lemma_, t._.dwdsmor.analysis) for t in nlp("Das ist ein Test.")) (('der', 'die'), ('sein', 'sein'), ('ein', 'eine'), ('Test', 'Test'), ('.', '.')) ``` diff --git a/dwdsmor/automaton.py b/dwdsmor/automaton.py index 5720b8e..74662fd 100644 --- a/dwdsmor/automaton.py +++ b/dwdsmor/automaton.py @@ -213,13 +213,13 @@ def __call__(self, word, **criteria): criteria_stack.reverse() while criteria_stack: if len(traversals) == 1: - (traversal,) = traversals - return traversal.analysis + break attr, attr_vals = criteria_stack.pop() filtered = tuple((t for t in traversals if getattr(t, attr) in attr_vals)) traversals = filtered or traversals + traversals = sorted(traversals, key=lambda t: len(t.spec)) for traversal in traversals: - return traversal.analysis + return traversal def lemmatizer(*args, automaton_type="lemma", **kwargs): diff --git a/dwdsmor/spacy.py b/dwdsmor/spacy.py index 08d18cf..f25eb12 100644 --- a/dwdsmor/spacy.py +++ b/dwdsmor/spacy.py @@ -1,5 +1,3 @@ -from collections import OrderedDict -from functools import cache from typing import Iterable from spacy.language import Language @@ -10,28 +8,7 @@ from . import lemmatizer from .automaton import Lemmatizer -Token.set_extension("dwdsmor_lemma", default=None) - - -def criterion(k, v, mapping): - return (k, mapping.get(v, {v}) if v else None) - - -@cache -def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite): - return OrderedDict( - ( - criterion("pos", pos, hdt.pos_map), - criterion("number", number, hdt.number_map), - criterion("gender", gender, hdt.gender_map), - criterion("case", case, hdt.case_map), - criterion("person", person, hdt.person_map), - criterion("tense", tense, hdt.tense_map), - criterion("degree", degree, hdt.degree_map), - criterion("mood", mood, hdt.mood_map), - criterion("nonfinite", nonfinite, hdt.nonfinite_map), - ) - ) +Token.set_extension("dwdsmor", default=None) def morph(token_morph, k): @@ -41,7 +18,7 @@ def morph(token_morph, k): def lemmatize_token(lemmatizer: Lemmatizer, token: Token): token_morph = token.morph - token_criteria = criteria( + token_criteria = hdt.criteria( token.tag_, morph(token_morph, "Number"), morph(token_morph, "Gender"), @@ -52,7 +29,7 @@ def lemmatize_token(lemmatizer: Lemmatizer, token: Token): morph(token_morph, "Mood"), morph(token_morph, "VerbForm"), ) - token._.dwdsmor_lemma = lemmatizer(token.text, **token_criteria) + token._.dwdsmor = lemmatizer(token.text, **token_criteria) return token diff --git a/dwdsmor/tag/hdt.py b/dwdsmor/tag/hdt.py index 6103223..b121d91 100644 --- a/dwdsmor/tag/hdt.py +++ b/dwdsmor/tag/hdt.py @@ -1,3 +1,6 @@ +from collections import OrderedDict +from functools import cache + pos_map = { "$(": {"+PUNCT"}, "$,": {"+PUNCT"}, @@ -103,3 +106,24 @@ "Part": {"Part"}, "Inf": {"Inf"}, } + + +def criterion(k, v, mapping): + return (k, mapping.get(v, {v}) if v else None) + + +@cache +def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite): + return OrderedDict( + ( + criterion("pos", pos, pos_map), + criterion("number", number, number_map), + criterion("gender", gender, gender_map), + criterion("case", case, case_map), + criterion("person", person, person_map), + criterion("tense", tense, tense_map), + criterion("degree", degree, degree_map), + criterion("mood", mood, mood_map), + criterion("nonfinite", nonfinite, nonfinite_map), + ) + ) diff --git a/test/__snapshots__/test_spacy.ambr b/test/__snapshots__/test_spacy.ambr index 6d1b9c4..5d8204f 100644 --- a/test/__snapshots__/test_spacy.ambr +++ b/test/__snapshots__/test_spacy.ambr @@ -7,95 +7,17 @@ 'Sehr', 'sehr', ), - tuple( - 'gute', - 'ADJA', - 'gut', - 'gut', - ), - tuple( - 'Beratung', - 'NN', - 'Beratung', - 'Beratung', - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'schnelle', - 'ADJA', - 'schnell', - 'schnell', - ), - tuple( - 'Behebung', - 'NN', - 'Behebung', - None, - ), tuple( 'der', 'ART', 'der', 'die', ), - tuple( - 'Probleme', - 'NN', - 'Problem', - 'Problem', - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'so', - 'ADV', - 'so', - 'so', - ), - tuple( - 'stelle', - 'VVFIN', - 'stellen', - 'stellen', - ), - tuple( - 'ich', - 'PPER', - 'ich', - 'ich', - ), - tuple( - 'mir', - 'PRF', - 'ich', - 'ich', - ), tuple( 'Kundenservice', 'NN', 'Service', - None, - ), - tuple( - 'vor', - 'PTKVZ', - 'vor', - 'vor', - ), - tuple( - '.', - '$.', - '.', - '.', + 'Kundenservice', ), tuple( 'Die', @@ -103,186 +25,30 @@ 'der', 'die', ), - tuple( - 'Kosten', - 'NN', - 'Kosten', - 'Kosten', - ), - tuple( - 'sind', - 'VAFIN', - 'sein', - 'sein', - ), - tuple( - 'definitiv', - 'ADJD', - 'definitiv', - 'definitiv', - ), - tuple( - 'auch', - 'ADV', - 'auch', - 'auch', - ), tuple( 'im', 'APPR_ART', 'in der', 'in', ), - tuple( - 'Rahmen', - 'NN', - 'Rahmen', - 'Rahmen', - ), - tuple( - '.', - '$.', - '.', - '.', - ), tuple( 'Nette', 'ADJA', 'Nette', 'nett', ), - tuple( - 'Gespräche', - 'NN', - 'Gespräch', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'klasse', - 'ADJA', - 'klasse', - 'klasse', - ), - tuple( - 'Ergebnis', - 'NN', - 'Ergebnis', - 'Ergebnis', - ), - tuple( - 'Ich', - 'PPER', - 'ich', - 'ich', - ), - tuple( - 'bin', - 'VAFIN', - 'sein', - 'sein', - ), - tuple( - 'seit', - 'APPR', - 'seit', - 'seit', - ), - tuple( - 'längerer', - 'ADJA', - 'lang', - 'lang', - ), - tuple( - 'Zeit', - 'NN', - 'Zeit', - 'Zeit', - ), tuple( 'zur', 'APPR_ART', 'zu der', 'zu', ), - tuple( - 'Behandlung', - 'NN', - 'Behandlung', - 'Behandlung', - ), - tuple( - 'verschiedenster', - 'ADJA', - 'verschieden', - None, - ), - tuple( - '"', - '$(', - '"', - '"', - ), - tuple( - 'Leiden', - 'NN', - 'Leiden', - 'Leiden', - ), - tuple( - '"', - '$(', - '"', - '"', - ), - tuple( - 'in', - 'APPR', - 'in', - 'in', - ), tuple( 'der', 'ART', 'der', 'die', ), - tuple( - 'Physiotherapieraxis', - 'NN', - 'Physiotherapieraxis', - None, - ), - tuple( - '"', - '$(', - '"', - '"', - ), - tuple( - 'Gaby', - 'NE', - 'Gaby', - None, - ), - tuple( - 'Montag', - 'NE', - 'Montag', - 'Montag', - ), - tuple( - '"', - '$(', - '"', - '"', - ), tuple( 'im', 'APPR_ART', @@ -293,31 +59,7 @@ 'Vital', 'NN', 'Vital', - None, - ), - tuple( - 'Center', - 'NN', - 'Center', - None, - ), - tuple( - 'und', - 'KON', - 'und', - 'und', - ), - tuple( - 'kann', - 'VMFIN', - 'können', - 'können', - ), - tuple( - 'ausschließlich', - 'ADV', - 'ausschließlich', - 'ausschließlich', + 'vital', ), tuple( 'Positives', @@ -325,48 +67,18 @@ 'Positiv', 'positiv', ), - tuple( - 'berichten', - 'VVINF', - 'berichten', - 'berichten', - ), - tuple( - '!', - '$.', - '!', - '!', - ), tuple( 'Ob', 'KOUS', 'Ob', 'ob', ), - tuple( - 'bei', - 'APPR', - 'bei', - 'bei', - ), tuple( 'der', 'ART', 'der', 'die', ), - tuple( - 'Terminvergabe', - 'NN', - 'Terminvergabe', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), tuple( 'den', 'ART', @@ -377,13 +89,7 @@ 'Behandlungsräumen', 'NN', 'Behandlungsräumen', - None, - ), - tuple( - 'oder', - 'KON', - 'oder', - 'oder', + 'Behandlungsraum', ), tuple( 'den', @@ -391,348 +97,54 @@ 'der', 'die', ), - tuple( - 'individuell', - 'ADJD', - 'individuell', - 'individuell', - ), tuple( 'zugeschnittenen', 'ADJA', 'zuschneiden', 'zugeschnitten', ), - tuple( - 'Trainingsplänen', - 'NN', - 'Trainingsplan', - None, - ), - tuple( - 'sind', - 'VAFIN', - 'sein', - 'sein', - ), - tuple( - 'alle', - 'PIDAT', - 'alle', - 'alle', - ), - tuple( - 'Mitarbeiter', - 'NN', - 'Mitarbeiter', - 'Mitarbeiter', - ), tuple( 'äußerst', 'ADV', 'äußerst', 'äußern', ), - tuple( - 'kompetent', - 'ADJD', - 'kompetent', - 'kompetent', - ), - tuple( - 'und', - 'KON', - 'und', - 'und', - ), - tuple( - 'flexibel', - 'ADJD', - 'flexibel', - 'flexibel', - ), - tuple( - '.', - '$.', - '.', - '.', - ), - tuple( - 'Sauberkeit', - 'NN', - 'Sauberkeit', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'Ordnung', - 'NN', - 'Ordnung', - 'Ordnung', - ), - tuple( - 'und', - 'KON', - 'und', - 'und', - ), - tuple( - 'Freundlichkeit', - 'NN', - 'Freundlichkeit', - None, - ), - tuple( - 'brauche', - 'VVFIN', - 'brauchen', - 'brauchen', - ), - tuple( - 'ich', - 'PPER', - 'ich', - 'ich', - ), - tuple( - 'hier', - 'ADV', - 'hier', - 'hier', - ), - tuple( - 'nicht', - 'PTKNEG', - 'nicht', - 'nicht', - ), - tuple( - 'zu', - 'PTKZU', - 'zu', - None, - ), - tuple( - 'erwähnen', - 'VVINF', - 'erwähnen', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'denn', - 'KON', - 'denn', - 'denn', - ), tuple( 'das', 'PDS', 'der', 'die', ), - tuple( - 'gehört', - 'VVFIN', - 'gehören', - 'gehören', - ), - tuple( - 'für', - 'APPR', - 'für', - 'für', - ), - tuple( - 'mich', - 'PPER', - 'ich', - 'ich', - ), tuple( 'zum', 'APPR_ART', 'zu der', 'zu', ), - tuple( - 'Standard', - 'NN', - 'Standard', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), tuple( 'der', 'PRELS', 'der', 'die', ), - tuple( - 'aber', - 'ADV', - 'aber', - 'aber', - ), - tuple( - 'auch', - 'ADV', - 'auch', - 'auch', - ), - tuple( - 'noch', - 'ADV', - 'noch', - 'noch', - ), - tuple( - 'übertroffen', - 'VVPP', - 'übertreffen', - None, - ), - tuple( - 'wird', - 'VAFIN', - 'werden', - 'werden', - ), - tuple( - '.', - '$.', - '.', - '.', - ), - tuple( - 'Physiotherapie', - 'NE', - 'Physiotherapie', - None, - ), - tuple( - 'ist', - 'VAFIN', - 'sein', - 'sein', - ), - tuple( - 'zwar', - 'ADV', - 'zwar', - 'zwar', - ), - tuple( - 'oftmals', - 'ADV', - 'oftmals', - 'oftmals', - ), - tuple( - 'auch', - 'ADV', - 'auch', - 'auch', - ), tuple( 'anstrengend', 'ADJD', 'anstrengen', 'anstrengend', ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'aber', - 'KON', - 'aber', - 'aber', - ), - tuple( - 'in', - 'APPR', - 'in', - 'in', - ), tuple( 'dieser', 'PDAT', 'dieser', 'diese', ), - tuple( - 'Umgebeung', - 'NN', - 'Umgebeung', - None, - ), - tuple( - 'freut', - 'VVFIN', - 'freuen', - 'freuen', - ), - tuple( - 'man', - 'PIS', - 'man', - 'man', - ), - tuple( - 'sich', - 'PRF', - 'sich', - 'sich', - ), - tuple( - 'auf', - 'APPR', - 'auf', - 'auf', - ), tuple( 'jede', 'PIDAT', 'jeder', 'jede', ), - tuple( - 'Minute', - 'NN', - 'Minute', - 'Minute', - ), - tuple( - 'Behandlung', - 'NN', - 'Behandlung', - 'Behandlung', - ), - tuple( - '.', - '$.', - '.', - '.', - ), tuple( 'Das', 'ART', @@ -745,323 +157,59 @@ 'NULL', 'nahe', ), - tuple( - 'mal', - 'ADV', - 'mal', - 'mal', - ), - tuple( - 'rief', - 'VVFIN', - 'rufen', - 'rufen', - ), - tuple( - 'ich', - 'PPER', - 'ich', - 'ich', - ), - tuple( - 'extra', - 'ADV', - 'extra', - 'extra', - ), - tuple( - 'vorher', - 'ADV', - 'vorher', - 'vorher', - ), - tuple( - 'an', - 'PTKVZ', - 'an', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'um', - 'KOUI', - 'um', - 'um', - ), tuple( 'einen', 'ART', 'ein', 'eine', ), - tuple( - 'Termin', - 'NN', - 'Termin', - 'Termin', - ), - tuple( - 'zu', - 'PTKZU', - 'zu', - None, - ), - tuple( - 'vereinbaren', - 'VVINF', - 'vereinbaren', - None, - ), - tuple( - ',', - '$,', - ',', - ',', - ), - tuple( - 'damit', - 'KOUS', - 'damit', - 'damit', - ), tuple( 'der', 'ART', 'der', 'die', ), - tuple( - 'Konditor', - 'NN', - 'Konditor', - None, - ), - tuple( - 'auch', - 'ADV', - 'auch', - 'auch', - ), - tuple( - 'Zeit', - 'NN', - 'Zeit', - 'Zeit', - ), - tuple( - 'für', - 'APPR', - 'für', - 'für', - ), tuple( 'uns', 'PPER', 'wir', 'uns', ), - tuple( - 'hätte', - 'VAFIN', - 'haben', - 'haben', - ), - tuple( - '.', - '$.', - '.', - '.', - ), tuple( 'Eine', 'ART', 'ein', 'eine', ), - tuple( - 'Stunde', - 'NN', - 'Stunde', - 'Stunde', - ), - tuple( - 'später', - 'ADJD', - 'spät', - 'spät', - ), - tuple( - 'gab', - 'VVFIN', - 'geben', - 'geben', - ), - tuple( - 'man', - 'PIS', - 'man', - 'man', - ), tuple( 'uns', 'PPER', 'wir', 'uns', ), - tuple( - 'dann', - 'ADV', - 'dann', - 'dann', - ), - tuple( - 'endlich', - 'ADJD', - 'endlich', - 'endlich', - ), tuple( 'einen', 'ART', 'ein', 'eine', ), - tuple( - 'Tisch', - 'NN', - 'Tisch', - 'Tisch', - ), - tuple( - ',', - '$,', - ',', - ',', - ), tuple( 'der', 'PRELS', 'der', 'die', ), - tuple( - 'allerdings', - 'ADV', - 'allerdings', - 'allerdings', - ), - tuple( - 'noch', - 'ADV', - 'noch', - 'noch', - ), - tuple( - 'nicht', - 'PTKNEG', - 'nicht', - 'nicht', - ), - tuple( - 'einmal', - 'ADV', - 'einmal', - 'einmal', - ), - tuple( - 'abgeräumt', - 'VVPP', - 'abräumen', - 'abräumen', - ), - tuple( - 'war', - 'VAFIN', - 'sein', - 'sein', - ), - tuple( - '.', - '$.', - '.', - '.', - ), tuple( 'Die', 'ART', 'der', 'die', ), - tuple( - 'Bedienung', - 'NN', - 'Bedienung', - None, - ), - tuple( - 'verschwand', - 'VVFIN', - 'verschwinden', - 'verschwinden', - ), - tuple( - 'sofort', - 'ADV', - 'sofort', - 'sofort', - ), - tuple( - 'wieder', - 'ADV', - 'wieder', - 'wieder', - ), - tuple( - 'und', - 'KON', - 'und', - 'und', - ), - tuple( - 'kam', - 'VVFIN', - 'kommen', - 'kommen', - ), - tuple( - 'auch', - 'ADV', - 'auch', - 'auch', - ), - tuple( - 'erstmal', - 'ADV', - 'erstmal', - 'erstmal', - ), - tuple( - 'nicht', - 'PTKNEG', - 'nicht', - 'nicht', - ), tuple( 'mehr', 'ADV', 'mehr', 'sehr', ), - tuple( - '.', - '$.', - '.', - '.', - ), ) # --- diff --git a/test/test_lemmatizer.py b/test/test_lemmatizer.py index 2ba7db6..c039f84 100644 --- a/test/test_lemmatizer.py +++ b/test/test_lemmatizer.py @@ -3,5 +3,5 @@ def test_lemmatizer(): lemmatizer = dwdsmor.lemmatizer() - assert lemmatizer("getestet", pos={"+V"}) == "testen" - assert lemmatizer("getestet", pos={"+ADJ"}) == "getestet" + assert lemmatizer("getestet", pos={"+V"}).analysis == "testen" + assert lemmatizer("getestet", pos={"+ADJ"}).analysis == "getestet" diff --git a/test/test_spacy.py b/test/test_spacy.py index 96597fa..07a4815 100644 --- a/test/test_spacy.py +++ b/test/test_spacy.py @@ -4,6 +4,8 @@ import dwdsmor.spacy # noqa +from .conftest import if_dwds_available + @fixture(scope="module") def nlp(): @@ -23,8 +25,14 @@ def sentences(): return tuple(s["text"] for s in ds.select(range(100))) +@if_dwds_available def test_lemmatisation(nlp, sentences, snapshot): sentences = sentences[:10] docs = nlp.pipe(sentences) - tokens = ((t.text, t.tag_, t.lemma_, t._.dwdsmor_lemma) for d in docs for t in d) + tokens = ( + (t.text, t.tag_, t.lemma_, t._.dwdsmor.analysis) + for d in docs + for t in d + if t._.dwdsmor and t.lemma_ != t._.dwdsmor.analysis + ) assert tuple(tokens) == snapshot From 2e59838dd9d9fa8eabf1228915b7088e9d3d122b Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Mon, 27 Jan 2025 08:36:14 +0100 Subject: [PATCH 8/8] chore(Test): Update regression test --- test/__snapshots__/test_regression.ambr | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/__snapshots__/test_regression.ambr b/test/__snapshots__/test_regression.ambr index 5f58dea..3724d54 100644 --- a/test/__snapshots__/test_regression.ambr +++ b/test/__snapshots__/test_regression.ambr @@ -6431,9 +6431,13 @@ 'Gravi:<>s:<><+NN>:<>:<><>:e<>:s:<>:<>', 'Gravi:<>s:<><+NN>:<>:<><>:e<>:s:<>:<>', 'Gravi:<>s:<><+NN>:<>:<><>:e<>:s:<>:<>', + 'Gravis<+NN>:<>:<>:<>:<>', 'Gravis<+NN>:<>:<>:<>:<>', + 'Gravis<+NN>:<>:<>:<>:<>', 'Gravis<+NN>:<>:<>:<>:<>', + 'Gravis<+NN>:<>:<>:<>:<>', 'Gravis<+NN>:<>:<>:<>:<>', + 'Gravis<+NN>:<>:<>:<>:<>', 'Gravis<+NN>:<>:<>:<>:<>', ), tuple(