Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add spaCy integration and DWDSmor/HDT tag translation #9

Merged
merged 8 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ jobs:
- name: Build
run: ./build-dwdsmor
- name: Run tests
run: pytest
run: pytest --snapshot-warn-unused
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,22 @@ pip install dwdsmor
The library can be used for lemmatisation:

``` python-console
>>> import dwsdmor
>>> import dwdsmor
>>> lemmatizer = dwdsmor.lemmatizer()
>>> assert lemmatizer("getestet", pos={"+V"}) == "testen"
>>> assert lemmatizer("getestet", pos={"+ADJ"}) == "getestet"
>>> assert lemmatizer("getestet", pos={"+V"}).analysis == "testen"
>>> assert lemmatizer("getestet", pos={"+ADJ"}).analysis == "getestet"
```

There is also integration with spaCy:

``` python-console
>>> import spacy
>>> import dwdsmor.spacy
>>> nlp = spacy.load("de_hdt_lg")
>>> nlp.add_pipe("dwdsmor")
<dwdsmor.spacy.Component object at 0x7f99e634f220>
>>> tuple((t.lemma_, t._.dwdsmor.analysis) for t in nlp("Das ist ein Test."))
(('der', 'die'), ('sein', 'sein'), ('ein', 'eine'), ('Test', 'Test'), ('.', '.'))
```

Next to the Python API, the package provides a simple command line
Expand Down
8 changes: 4 additions & 4 deletions dwdsmor/automaton.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,17 +209,17 @@ def __init__(self, automata, automaton_type="lemma"):

def __call__(self, word, **criteria):
traversals = tuple(self.analyzer.analyze(word))
criteria_stack = list(criteria.items())
criteria_stack = list((k, v) for k, v in criteria.items() if v)
criteria_stack.reverse()
while criteria_stack:
if len(traversals) == 1:
(traversal,) = traversals
return traversal.analysis
break
attr, attr_vals = criteria_stack.pop()
filtered = tuple((t for t in traversals if getattr(t, attr) in attr_vals))
traversals = filtered or traversals
traversals = sorted(traversals, key=lambda t: len(t.spec))
for traversal in traversals:
return traversal.analysis
return traversal


def lemmatizer(*args, automaton_type="lemma", **kwargs):
Expand Down
53 changes: 53 additions & 0 deletions dwdsmor/spacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Iterable, Optional

from spacy.language import Language
from spacy.tokens.token import Token

import dwdsmor.tag.hdt as hdt

from . import lemmatizer
from .automaton import Lemmatizer

Token.set_extension("dwdsmor", default=None)


def morph(token_morph, k):
    """Return the values of morphological feature *k* as a comma-joined
    string, or ``None`` when the feature is absent or empty."""
    joined = ",".join(token_morph.get(k))
    return joined or None


def lemmatize_token(lemmatizer: Lemmatizer, token: Token):
    """Annotate a single spaCy token with its DWDSmor analysis.

    HDT tag and UD morphological features of the token are translated
    into DWDSmor filter criteria via ``hdt.criteria``; the resulting
    traversal is stored on the ``token._.dwdsmor`` extension attribute.
    Returns the (mutated) token.
    """
    feats = token.morph
    # Feature order must match the positional signature of hdt.criteria.
    feature_values = (
        morph(feats, key)
        for key in (
            "Number", "Gender", "Case", "Person",
            "Tense", "Degree", "Mood", "VerbForm",
        )
    )
    token._.dwdsmor = lemmatizer(
        token.text, **hdt.criteria(token.tag_, *feature_values)
    )
    return token


def lemmatize(lemmatizer: Lemmatizer, tokens: Iterable[Token]):
    """Annotate every token in *tokens* in place and return *tokens*."""
    for current in tokens:
        lemmatize_token(lemmatizer, current)
    return tokens


class Component:
    """spaCy pipeline component that attaches DWDSmor analyses to tokens.

    Registered under the name ``"dwdsmor"`` (see ``create_component``).
    """

    def __init__(self, automata_location=None):
        # Build a lemmatizer from the given automata location
        # (None selects the package default).
        self.lemmatizer = lemmatizer(automata_location)

    def __call__(self, doc):
        """Annotate all tokens of *doc*; returns the doc unchanged
        (``lemmatize`` passes its token iterable back through)."""
        return lemmatize(self.lemmatizer, doc)


@Language.factory("dwdsmor", default_config={"automata_location": None})
def create_component(nlp: Language, name: str, automata_location: Optional[str]):
    """spaCy factory for the "dwdsmor" pipeline component.

    Uses ``typing.Optional[str]`` instead of ``str | None``: without
    ``from __future__ import annotations`` the PEP-604 union is evaluated
    when this module is imported, which raises TypeError on Python 3.9 —
    a version this package declares support for.
    """
    # ``nlp`` and ``name`` are required by the factory protocol but unused.
    return Component(automata_location)
File renamed without changes.
129 changes: 129 additions & 0 deletions dwdsmor/tag/hdt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from collections import OrderedDict
from functools import cache

# Translation tables from HDT/STTS tags and UD morphological feature
# values (as produced by the de_hdt_lg spaCy model) to the corresponding
# DWDSmor analysis tag values. Each entry maps one source value to the
# set of acceptable target values.

# HDT/STTS part-of-speech tag -> candidate DWDSmor POS tags.
pos_map = {
"$(": {"+PUNCT"},
"$,": {"+PUNCT"},
"$.": {"+PUNCT"},
"ADJA": {"+ADJ", "+CARD", "+INDEF", "+ORD"},
"ADJD": {"+ADJ"},
"ADV": {"+ADV"},
"APPO": {"+POSTP"},
"APPR": {"+PREP"},
"APPR_ART": {"+PREPART"},
"APZR": {"+POSTP", "+PREP"},
"ART": {"+ART"},
"CARD": {"+CARD"},
"FM": {"+FM"},  # NOTE(review): mapping marked uncertain by the author
"ITJ": {"+INTJ"},
"KOKOM": {"+CONJ"},
"KON": {"+CONJ"},
"KOUI": {"+CONJ"},
"KOUS": {"+CONJ"},
"NE": {"+NN", "+NPROP"},
"NN": {"+NN", "+NPROP"},
"PDAT": {"+DEM"},
"PDS": {"+DEM"},
"PIAT": {"+INDEF"},
"PIDAT": {"+INDEF"},
"PIS": {"+INDEF"},
"PPER": {"+PPRO"},
"PPOSAT": {"+POSS"},
"PPOSS": {"+POSS"},
"PRELAT": {"+REL"},
"PRELS": {"+REL"},
"PRF": {"+PPRO"},
"PROAV": {"+ADV", "+PROADV"},
"PTKA": {"+PTCL"},
"PTKANT": {"+INTJ", "+PTCL"},
"PTKNEG": {"+PTCL"},
"PTKVZ": {"+ADV", "+PREP", "+VPART"},
"PTKZU": {"+PTCL"},
"PWAT": {"+WPRO"},
"PWAV": {"+ADV"},
"PWS": {"+WPRO"},
"TRUNC": {"+TRUNC"},  # NOTE(review): mapping marked uncertain by the author
"VAFIN": {"+V"},
"VAIMP": {"+V"},
"VAINF": {"+V"},
"VAPP": {"+V"},
"VMFIN": {"+V"},
"VMINF": {"+V"},
"VMPP": {"+V"},
"VVFIN": {"+V"},
"VVIMP": {"+V"},
"VVINF": {"+V"},
"VVIZU": {"+V"},
"VVPP": {"+V"},
"XY": {"+XY"},  # NOTE(review): mapping marked uncertain by the author
}

# UD Number -> DWDSmor number tags.
number_map = {
"Sing": {"Sg"},
"Plur": {"Pl"},
}


# UD Gender -> DWDSmor gender tags ("Masc,Neut" is the underspecified
# combined value emitted by the model).
gender_map = {
"Masc,Neut": {"Masc", "Neut"},
"Neut": {"Neut"},
"Fem": {"Fem"},
"Masc": {"Masc"},
}

# UD Case -> DWDSmor case tags (identical value sets).
case_map = {
"Nom": {"Nom"},
"Gen": {"Gen"},
"Dat": {"Dat"},
"Acc": {"Acc"},
}

# UD Person -> DWDSmor person tags (identical value sets).
person_map = {
"1": {"1"},
"2": {"2"},
"3": {"3"},
}

# UD Tense -> DWDSmor tense tags (identical value sets).
tense_map = {
"Past": {"Past"},
"Pres": {"Pres"},
}


# UD Degree -> DWDSmor degree tags ("Cmp" is renamed to "Comp").
degree_map = {
"Cmp": {"Comp"},
"Sup": {"Sup"},
"Pos": {"Pos"},
}

# UD Mood -> DWDSmor mood tags (identical value sets).
mood_map = {
"Ind": {"Ind"},
"Imp": {"Imp"},
}

# UD VerbForm -> DWDSmor non-finite verb-form tags.
nonfinite_map = {
"Part": {"Part"},
"Inf": {"Inf"},
}


def criterion(k, v, mapping):
    """Build one ``(criterion-name, value-set)`` pair.

    A missing/empty feature value yields ``None``; a known value is
    translated via *mapping*; an unknown value passes through unchanged
    as a singleton set.
    """
    if not v:
        return k, None
    return k, mapping.get(v, {v})


@cache
def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite):
    """Translate HDT/UD feature values into DWDSmor lemmatizer criteria.

    Returns an ``OrderedDict`` mapping criterion names to sets of
    acceptable DWDSmor tag values (``None`` where the feature is absent).
    Results are memoized with ``functools.cache``; the returned mapping is
    shared between calls with equal arguments, so callers must not mutate it.
    """
    spec = (
        ("pos", pos, pos_map),
        ("number", number, number_map),
        ("gender", gender, gender_map),
        ("case", case, case_map),
        ("person", person, person_map),
        ("tense", tense, tense_map),
        ("degree", degree, degree_map),
        ("mood", mood, mood_map),
        ("nonfinite", nonfinite, nonfinite_map),
    )
    return OrderedDict(criterion(name, value, table) for name, value, table in spec)
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ classifiers = [
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Topic :: Education",
"Topic :: Scientific/Engineering",
"Topic :: Text Processing :: Linguistic"
Expand All @@ -39,8 +45,8 @@ dev = [
"pytest",
"syrupy",
"tqdm",
"Jinja2"
]
"Jinja2",
"de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"]

[project.scripts]
dwdsmor = "dwdsmor.cli:main"
Expand Down
4 changes: 4 additions & 0 deletions test/__snapshots__/test_regression.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -6431,9 +6431,13 @@
'Gravi:<>s:<><+NN>:<><Masc>:<><>:e<>:s<Dat>:<><Pl>:<>',
'Gravi:<>s:<><+NN>:<><Masc>:<><>:e<>:s<Gen>:<><Pl>:<>',
'Gravi:<>s:<><+NN>:<><Masc>:<><>:e<>:s<Nom>:<><Pl>:<>',
'Gravis<+NN>:<><Masc>:<><Acc>:<><Pl>:<>',
'Gravis<+NN>:<><Masc>:<><Acc>:<><Sg>:<>',
'Gravis<+NN>:<><Masc>:<><Dat>:<><Pl>:<>',
'Gravis<+NN>:<><Masc>:<><Dat>:<><Sg>:<>',
'Gravis<+NN>:<><Masc>:<><Gen>:<><Pl>:<>',
'Gravis<+NN>:<><Masc>:<><Gen>:<><Sg>:<>',
'Gravis<+NN>:<><Masc>:<><Nom>:<><Pl>:<>',
'Gravis<+NN>:<><Masc>:<><Nom>:<><Sg>:<>',
),
tuple(
Expand Down
Loading
Loading