Skip to content

Commit

Permalink
[MergeDups] With similar-vernacular-form words, compare glosses & definitions
Browse files Browse the repository at this point in the history
  • Loading branch information
imnasnainaec committed Jan 28, 2025
1 parent 976c1f8 commit c2eb225
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 124 deletions.
95 changes: 27 additions & 68 deletions Backend.Tests/Helper/DuplicateFinderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,90 +99,49 @@ public void GetSimilarWordsBlacklistOrGraylistTest()
}

[Test]
public void HaveIdenticalDefinitionTest()
public void HaveSameDefinitionOrGloss()
{
const string text = "YesDef";
const string lang = "YesLang";
// strings that match with .Trim().ToLowerInvariant()
const string defiText = "YesPlease ";
const string glossDef = " yesPLEASE";

var defYY = new Definition { Text = text, Language = lang };
var defYN = new Definition { Text = text, Language = "NoLang" };
var defNY = new Definition { Text = "NoDef", Language = lang };
var senseEmpty = new Sense { Definitions = [new()], Glosses = [new(), new()] };
var senseDY = new Sense { Definitions = [new(), new() { Text = "other" }, new() { Text = defiText }] };
var senseGY = new Sense { Glosses = [new(), new() { Def = glossDef }] };

var senseEmpty = new Sense { Definitions = new List<Definition> { new() } };
var senseEmptyDYY = new Sense { Definitions = new List<Definition> { new(), defYY } };
var senseEmptyDNYDYY = new Sense { Definitions = new List<Definition> { new(), defNY, defYY } };
var senseDYNDNY = new Sense { Definitions = new List<Definition> { defYN, defNY } };

var wordWithOnlyDYY = new Word
{
Senses = new List<Sense> { new(), senseEmpty, senseEmptyDYY }
};
var wordAlsoWithDYY = new Word
{
Senses = new List<Sense> { senseDYNDNY, new(), senseEmptyDNYDYY, senseEmpty }
};
var wordWithoutDYY = new Word
{
Senses = new List<Sense> { senseEmpty, senseDYNDNY, new() }
};

Assert.That(DuplicateFinder.HaveIdenticalDefinition(new Word(), new Word()), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalDefinition(new Word(), wordWithOnlyDYY), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalDefinition(wordWithoutDYY, new Word()), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalDefinition(wordWithOnlyDYY, wordWithoutDYY), Is.False);

Assert.That(DuplicateFinder.HaveIdenticalDefinition(wordWithOnlyDYY, wordAlsoWithDYY), Is.True);
Assert.That(DuplicateFinder.HaveIdenticalDefinition(wordAlsoWithDYY, wordWithOnlyDYY), Is.True);
}

[Test]
public void HaveIdenticalGlossTest()
{
const string def = "YesGloss";
const string lang = "YesLang";

var glossYY = new Gloss { Def = def, Language = lang };
var glossYN = new Gloss { Def = def, Language = "NoLang" };
var glossNY = new Gloss { Def = "NoGloss", Language = lang };

var senseEmpty = new Sense { Glosses = new List<Gloss> { new() } };
var senseEmptyGYY = new Sense { Glosses = new List<Gloss> { new(), glossYY } };
var senseEmptyGNYGYY = new Sense { Glosses = new List<Gloss> { new(), glossNY, glossYY } };
var senseGYNGNY = new Sense { Glosses = new List<Gloss> { glossYN, glossNY } };

var wordWithOnlyGYY = new Word
var wordNo = new Word
{
Senses = new List<Sense> { new(), senseEmpty, senseEmptyGYY }
Senses = [new(), new() { Definitions = [new() { Text = "different" }, new()] }, senseEmpty]
};
var wordAlsoWithGYY = new Word
var wordDYes = new Word
{
Senses = new List<Sense> { senseGYNGNY, new(), senseEmptyGNYGYY, senseEmpty }
Senses = [senseEmpty, new(), senseDY]
};
var wordWithoutGYY = new Word
var wordGYes = new Word
{
Senses = new List<Sense> { senseEmpty, senseGYNGNY, new() }
Senses = [new(), senseGY]
};

Assert.That(DuplicateFinder.HaveIdenticalGloss(new Word(), new Word()), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalGloss(new Word(), wordWithOnlyGYY), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalGloss(wordWithoutGYY, new Word()), Is.False);
Assert.That(DuplicateFinder.HaveIdenticalGloss(wordWithOnlyGYY, wordWithoutGYY), Is.False);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(new Word(), new Word()), Is.False);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(new Word(), wordNo), Is.False);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(wordNo, wordDYes), Is.False);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(wordGYes, new Word()), Is.False);

Assert.That(DuplicateFinder.HaveIdenticalGloss(wordWithOnlyGYY, wordAlsoWithGYY), Is.True);
Assert.That(DuplicateFinder.HaveIdenticalGloss(wordAlsoWithGYY, wordWithOnlyGYY), Is.True);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(wordDYes, wordDYes), Is.True);
Assert.That(DuplicateFinder.HaveSameDefinitionOrGloss(wordDYes, wordGYes), Is.True);
}

[Test]
public void MightShareGramCatGroupsTest()
{
var nounSense = new Sense { GrammaticalInfo = new GrammaticalInfo { CatGroup = GramCatGroup.Noun } };
var unspecifiedSense = new Sense { GrammaticalInfo = new GrammaticalInfo { CatGroup = GramCatGroup.Unspecified } };
var verbSense = new Sense { GrammaticalInfo = new GrammaticalInfo { CatGroup = GramCatGroup.Verb } };

var nnWord = new Word { Senses = new List<Sense> { nounSense.Clone(), nounSense.Clone() } };
var uuWord = new Word { Senses = new List<Sense> { unspecifiedSense.Clone(), unspecifiedSense.Clone() } };
var vnWord = new Word { Senses = new List<Sense> { verbSense.Clone(), nounSense.Clone() } };
var vuWord = new Word { Senses = new List<Sense> { verbSense.Clone(), unspecifiedSense.Clone() } };
var nounSense = new Sense { GrammaticalInfo = new() { CatGroup = GramCatGroup.Noun } };
var unspecifiedSense = new Sense { GrammaticalInfo = new() { CatGroup = GramCatGroup.Unspecified } };
var verbSense = new Sense { GrammaticalInfo = new() { CatGroup = GramCatGroup.Verb } };

var nnWord = new Word { Senses = [nounSense.Clone(), nounSense.Clone()] };
var uuWord = new Word { Senses = [unspecifiedSense.Clone(), unspecifiedSense.Clone()] };
var vnWord = new Word { Senses = [verbSense.Clone(), nounSense.Clone()] };
var vuWord = new Word { Senses = [verbSense.Clone(), unspecifiedSense.Clone()] };

Assert.That(DuplicateFinder.HaveCommonGramCatGroup(nnWord, vnWord), Is.True);
Assert.That(DuplicateFinder.HaveCommonGramCatGroup(nnWord, vuWord), Is.False);
Expand Down
70 changes: 14 additions & 56 deletions Backend/Helper/DuplicateFinder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ public double GetWordScore(
return vernScore;
}

if (HaveIdenticalDefinition(wordA, wordB) || HaveIdenticalGloss(wordA, wordB))
if (HaveSameDefinitionOrGloss(wordA, wordB))
{
return (double)checkGlossThreshold;
}
Expand All @@ -225,69 +225,27 @@ public double GetWordScore(
}

/// <summary>
/// Check if two <see cref="Word"/>s have <see cref="Definition"/>s with identical Language and nonempty Text.
/// Check if two <see cref="Word"/>s have
/// <see cref="Definition"/>s and/or <see cref="Gloss"/>es with the same nonempty Text/Def.
/// </summary>
public static bool HaveIdenticalDefinition(Word wordA, Word wordB)
public static bool HaveSameDefinitionOrGloss(Word wordA, Word wordB)
{
var definitionsA = wordA.Senses.SelectMany(s => s.Definitions).ToList();
if (definitionsA.Count == 0)
var textsA = GetAllDefinitionAndGlossText(wordA);
if (textsA.Count == 0)
{
return false;
}
var definitionsB = wordB.Senses.SelectMany(s => s.Definitions).ToList();
if (definitionsB.Count == 0)
{
return false;
}

foreach (var a in definitionsA)
{
if (a.Text.Length == 0)
{
continue;
}
foreach (var b in definitionsB)
{
if (a.Language == b.Language && a.Text == b.Text)
{
return true;
}
}
}
return false;
var textsB = GetAllDefinitionAndGlossText(wordB);
return textsB.Any(tB => textsA.Any(tA => tA.Equals(tB, StringComparison.Ordinal)));
}

/// <summary>
/// Check if two <see cref="Word"/>s have <see cref="Gloss"/>es with identical Language and nonempty Def.
/// </summary>
public static bool HaveIdenticalGloss(Word wordA, Word wordB)
/// <summary> Get a List of all nonempty Definition Texts and Gloss Defs. </summary>
private static List<string> GetAllDefinitionAndGlossText(Word wordA)
{
var glossesA = wordA.Senses.SelectMany(s => s.Glosses).ToList();
if (glossesA.Count == 0)
{
return false;
}
var glossesB = wordB.Senses.SelectMany(s => s.Glosses).ToList();
if (glossesB.Count == 0)
{
return false;
}

foreach (var gA in glossesA)
{
if (gA.Def.Length == 0)
{
continue;
}
foreach (var gB in glossesB)
{
if (gA.Language == gB.Language && gA.Def == gB.Def)
{
return true;
}
}
}
return false;
var texts = wordA.Senses.SelectMany(s => s.Definitions.Select(d => d.Text.Trim().ToLowerInvariant()))
.ToList();
texts.AddRange(wordA.Senses.SelectMany(s => s.Glosses.Select(g => g.Def.Trim().ToLowerInvariant())));
return texts.Where(t => !string.IsNullOrEmpty(t)).ToList();
}

/// <summary>
Expand Down

0 comments on commit c2eb225

Please sign in to comment.