Skip to content

Commit

Permalink
Merge pull request #21796 from Yoast/390-fix-plurals-with-umlaut
Browse files Browse the repository at this point in the history
Extend exception list for German nouns with umlaut in plural
  • Loading branch information
FAMarfuaty authored Jan 17, 2025
2 parents 4864684 + e181aa3 commit f7f95dc
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ const morphologyDataDE = getMorphologyData( "de" ).de;
const wordsToStem = [
// Default stemmer
[ "studenten", "student" ],
// Nouns: exceptionStems
[ "vögel", "vogel" ],
// Nouns: exceptionStems compound
[ "raubvögel", "raubvogel" ],
// Nouns: exceptionStems with one plural matching multiple singulars
[ "stadium", "stadi" ],
[ "stadion", "stadi" ],
Expand Down Expand Up @@ -55,8 +51,61 @@ const wordsToStem = [
[ "Kraftwerke", "Kraftwerk" ],
];

describe( "Test for determining stems for German words", () => {
it( "creates stems for German words", () => {
wordsToStem.forEach( wordToStem => expect( determineStem( wordToStem[ 0 ], morphologyDataDE ) ).toBe( wordToStem[ 1 ] ) );
// Registers one suite per [ word, expected stem ] pair listed in wordsToStem.
describe.each( wordsToStem )( "Test for determining stems for German words", ( inputWord, expectedStem ) => {
	const testTitle = `stems for German word ${ inputWord } to ${ expectedStem }`;

	it( testTitle, () => {
		const actualStem = determineStem( inputWord, morphologyDataDE );

		expect( actualStem ).toBe( expectedStem );
	} );
} );

/*
 * Pairs of [ word form, expected stem ] for German nouns that take an umlaut in the plural.
 * Every word form is listed exactly once, so that each generated test case has a unique title.
 */
const umlautExceptions = [
	// A noun that gets umlaut in plural
	[ "vögel", "vogel" ],
	[ "läden", "laden" ],
	// A noun that gets umlaut and an irregular plural dative suffix
	[ "müttern", "mutter" ],
	[ "schwägern", "schwager" ],
	// A noun that gets umlaut and a regular case suffix
	[ "bädern", "bad" ],
	[ "ängsten", "angst" ],
	[ "hände", "hand" ],
	// A noun that gets umlaut and -e in plural
	[ "häuse", "haus" ],
	[ "ängste", "angst" ],
	// A noun that gets umlaut and -er in plural
	[ "männer", "mann" ],
	[ "wörter", "wort" ],
	// A compound noun that gets umlaut in plural
	[ "raubvögel", "raubvogel" ],
	// A compound noun that gets umlaut and -e in plural
	[ "landflüchte", "landflucht" ],
	[ "geschwülst", "geschwulst" ],
	[ "feuersbrünst", "feuersbrunst" ],
	[ "hirschbrünft", "hirschbrunft" ],
	[ "lebensbrünst", "lebensbrunst" ],
	[ "liebesbrünst", "liebesbrunst" ],
	// More umlaut nouns from all groups
	[ "schwäger", "schwager" ],
	[ "schäden", "schaden" ],
	[ "töchter", "tochter" ],
	[ "brünst", "brunst" ],
	[ "brüder", "bruder" ],
	[ "gärten", "garten" ],
	[ "gräben", "graben" ],
	[ "kästen", "kasten" ],
	[ "mütter", "mutter" ],
	[ "väter", "vater" ],
	[ "füchs", "fuchs" ],
	[ "ärzte", "arzt" ],
	[ "gäns", "gans" ],
	[ "häls", "hal" ],
	[ "äxte", "axt" ],
	[ "äste", "ast" ],
];

// Registers one suite per [ word, expected stem ] pair listed in umlautExceptions.
describe.each( umlautExceptions )( "Test for determining stems for German words with umlauts", ( inputWord, expectedStem ) => {
	const testTitle = `stems for German word with umlaut ${ inputWord } to ${ expectedStem }`;

	it( testTitle, () => {
		const actualStem = determineStem( inputWord, morphologyDataDE );

		expect( actualStem ).toBe( expectedStem );
	} );
} );
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
import { flatten } from "lodash";
import { languageProcessing } from "yoastseo";
const { flattenSortLength } = languageProcessing;

import { detectAndStemRegularParticiple } from "./detectAndStemRegularParticiple";

import stem from "./stem";

const { flattenSortLength } = languageProcessing;

/**
* Returns a stem for a word that appears on the noun exception lists.
*
* @param {Object} morphologyDataNouns The German morphology data for nouns.
* @param {string} stemmedWord The stem to check.
* @param {array[]} exceptionList The exception list to check.
* @param {string} stemmedWord The stem to check.
*
* @returns {string|null} The stemmed word or null if none was found.
*/
const findStemOnNounExceptionList = function( morphologyDataNouns, stemmedWord ) {
const exceptionStems = morphologyDataNouns.exceptionStems;

for ( const exceptionStemSet of exceptionStems ) {
const findStemOnNounExceptionList = function( exceptionList, stemmedWord ) {
for ( const exceptionStemSet of exceptionList ) {
const matchedStem = exceptionStemSet.find( exceptionStem => stemmedWord.endsWith( exceptionStem ) );

if ( matchedStem ) {
Expand Down Expand Up @@ -108,14 +106,21 @@ const findStemOnVerbExceptionList = function( morphologyDataVerbs, stemmedWord )
* @returns {string} Stemmed form of the word.
*/
export default function determineStem( word, morphologyDataGerman ) {
// Return the stem early if the word contains an umlaut and ends in what looks like a valid suffix, e.g. "läden" is stemmed to "laden".
const umlautException = morphologyDataGerman.nouns.umlautException || [];
const findUmlautException = findStemOnNounExceptionList( umlautException, word );
if ( findUmlautException ) {
return findUmlautException;
}

const verbData = morphologyDataGerman.verbs;
const stemmedWord = stem( verbData, word );

/*
* Goes through the stem exception functions from left to right, returns the first stem it finds.
* If no stem has been found, return the original, programmatically created, stem.
*/
return findStemOnNounExceptionList( morphologyDataGerman.nouns, stemmedWord ) ||
return findStemOnNounExceptionList( morphologyDataGerman.nouns.exceptionStems, stemmedWord ) ||
findStemOnAdjectiveExceptionList( morphologyDataGerman.adjectives, stemmedWord ) ||
findStemOnVerbExceptionList( verbData, stemmedWord ) ||
detectAndStemRegularParticiple( verbData, word ) ||
Expand Down

0 comments on commit f7f95dc

Please sign in to comment.