Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend exception list for German nouns with umlaut in plural #21796

Merged
merged 8 commits into from
Jan 17, 2025
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,19 @@ const morphologyDataDE = getMorphologyData( "de" ).de;
const wordsToStem = [
// Default stemmer
[ "studenten", "student" ],
// Nouns: exceptionStems
// Nouns that get an umlaut in the plural
[ "vögel", "vogel" ],
// Nouns: exceptionStems compound
[ "läden", "laden" ],
// A noun that gets umlaut and -e in plural
[ "häuse", "haus" ],
[ "ängste", "angst" ],
// A noun that gets umlaut and -er in plural
[ "männer", "mann" ],
[ "wörter", "wort" ],
// compound noun that gets umlaut in plural
[ "raubvögel", "raubvogel" ],
// compound noun that gets umlaut and -e in plural
[ "landflüchte", "landflucht" ],
// Nouns: exceptionStems with one plural matching multiple singulars
[ "stadium", "stadi" ],
[ "stadion", "stadi" ],
Expand Down Expand Up @@ -55,8 +64,40 @@ const wordsToStem = [
[ "Kraftwerke", "Kraftwerk" ],
];

describe( "Test for determining stems for German words", () => {
it( "creates stems for German words", () => {
wordsToStem.forEach( wordToStem => expect( determineStem( wordToStem[ 0 ], morphologyDataDE ) ).toBe( wordToStem[ 1 ] ) );
// Registers one Jest test per [ word, stem ] pair in wordsToStem.
describe.each( wordsToStem )( "Test for determining stems for German words", ( inputWord, expectedStem ) => {
	const testTitle = `stems for German word ${ inputWord } to ${ expectedStem }`;

	it( testTitle, () => {
		const actualStem = determineStem( inputWord, morphologyDataDE );
		expect( actualStem ).toBe( expectedStem );
	} );
} );

/*
 * Test table of [ word, expected stem ] pairs for word forms that contain an umlaut.
 * Each pair is passed to determineStem by the describe.each block that consumes this table.
 * Every word is listed exactly once so that no test case is generated twice.
 */
const umlautExceptions = [
	[ "geschwülst", "geschwulst" ],
	[ "schwäger", "schwager" ],
	[ "schäden", "schaden" ],
	[ "töchter", "tochter" ],
	[ "brünst", "brunst" ],
	[ "brüder", "bruder" ],
	[ "gärten", "garten" ],
	[ "gräben", "graben" ],
	[ "kästen", "kasten" ],
	[ "mütter", "mutter" ],
	[ "läden", "laden" ],
	[ "väter", "vater" ],
	[ "füchs", "fuchs" ],
	[ "ärzte", "arzt" ],
	[ "gäns", "gans" ],
	// NOTE(review): the expected stem here is "hal", not "hals" — presumably a later stemming step strips the final -s; confirm.
	[ "häls", "hal" ],
	[ "äxte", "axt" ],
	[ "äste", "ast" ],
	// Compound forms ending in one of the entries above.
	[ "feuersbrünst", "feuersbrunst" ],
	[ "hirschbrünft", "hirschbrunft" ],
	[ "lebensbrünst", "lebensbrunst" ],
	[ "liebesbrünst", "liebesbrunst" ],
];

// Registers one Jest test per [ word, stem ] pair in umlautExceptions.
describe.each( umlautExceptions )( "Test for determining stems for German words with umlauts", ( inflectedForm, expectedStem ) => {
	const testTitle = `stems for German word with umlaut ${ inflectedForm } to ${ expectedStem }`;

	it( testTitle, () => {
		const actualStem = determineStem( inflectedForm, morphologyDataDE );
		expect( actualStem ).toBe( expectedStem );
	} );
} );
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
import { flatten } from "lodash";
import { languageProcessing } from "yoastseo";
const { flattenSortLength } = languageProcessing;

import { detectAndStemRegularParticiple } from "./detectAndStemRegularParticiple";

import stem from "./stem";

const { flattenSortLength } = languageProcessing;

/**
* Returns a stem for a word that appears on the noun exception lists.
*
* @param {Object} morphologyDataNouns The German morphology data for nouns.
* @param {string} stemmedWord The stem to check.
* @param {array[]} exceptionList The exception list to check.
* @param {string} stemmedWord The stem to check.
*
* @returns {string|null} The stemmed word or null if none was found.
*/
const findStemOnNounExceptionList = function( morphologyDataNouns, stemmedWord ) {
const exceptionStems = morphologyDataNouns.exceptionStems;

for ( const exceptionStemSet of exceptionStems ) {
const findStemOnNounExceptionList = function( exceptionList, stemmedWord ) {
for ( const exceptionStemSet of exceptionList ) {
const matchedStem = exceptionStemSet.find( exceptionStem => stemmedWord.endsWith( exceptionStem ) );

if ( matchedStem ) {
Expand Down Expand Up @@ -108,14 +106,21 @@ const findStemOnVerbExceptionList = function( morphologyDataVerbs, stemmedWord )
* @returns {string} Stemmed form of the word.
*/
export default function determineStem( word, morphologyDataGerman ) {
// Return early if the word is on the umlaut exception list, i.e. it contains an umlaut and ends in something that looks like a valid suffix, e.g. "läden" is stemmed to "laden".
const umlautException = morphologyDataGerman.nouns.umlautException;
hannaw93 marked this conversation as resolved.
Show resolved Hide resolved
const findUmlautException = findStemOnNounExceptionList( umlautException, word );
if ( findUmlautException ) {
return findUmlautException;
}

const verbData = morphologyDataGerman.verbs;
const stemmedWord = stem( verbData, word );

/*
* Goes through the stem exception functions from left to right, returns the first stem it finds.
* If no stem has been found, return the original, programmatically created, stem.
*/
return findStemOnNounExceptionList( morphologyDataGerman.nouns, stemmedWord ) ||
return findStemOnNounExceptionList( morphologyDataGerman.nouns.exceptionStems, stemmedWord ) ||
findStemOnAdjectiveExceptionList( morphologyDataGerman.adjectives, stemmedWord ) ||
findStemOnVerbExceptionList( verbData, stemmedWord ) ||
detectAndStemRegularParticiple( verbData, word ) ||
Expand Down
Loading