Skip to content

Commit

Permalink
Add more error pattern corrections
Browse files Browse the repository at this point in the history
  • Loading branch information
kavyamanohar committed Aug 22, 2024
1 parent 13a5e30 commit 940e46e
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
31 changes: 27 additions & 4 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ normalize_vowelsigns:
"ഇൗ": ""
"ഉൗ": ""
"ഒൗ": ""
"ഒാ": ""

normalize_alternateforms:
"ൻ്റ": "ന്റ"
Expand All @@ -27,16 +28,38 @@ normalize_alternateforms:
"": ""
"ു്": ""

common_mistakes: # For common mistakes
'([^\s]+)([{PUNCTUATION}\s])': '\1\2' # Remove ZWNJ at the end of words followed by ASCII punctuation or space.
common_mistakes: # Regex patterns for common mistakes
'([^\s]+)\u200c([{PUNCTUATION}\s])': '\1\2' # Remove ZWNJ at the end of words followed by ASCII punctuation or space.
# The {PUNCTUATION} placeholder is defined in core.py, where the regex is compiled.
'([^\s]+)$': '\1' # Remove ZWNJ at the end of the string
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)': '\1' # Remove ZWNJ after any of the chillu characters
'([^\s]+)\u200c$': '\1' # Remove ZWNJ at the end of the string
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)\u200c': '\1' # Remove ZWNJ after any of the chillu characters
'\u200D': '' # Remove all ZWJ characters
'\u200B': '' # Remove all Zero Width space characters
'\u00AD': '' # Remove all soft hyphen characters
'ര്(?![\s{PUNCTUATION}]|യ|$)': '' # Replace ര് with ർ when not at word end, string end and not followed by യ
'റ്(?![\s{PUNCTUATION}\u200c]|യ|വ|ല|ര|റ|$)': '' # Replace റ് with ർ when not at word end, string end and not followed by റ, ര, വ, ല, യ
'ള്(?![\s{PUNCTUATION}]|ള|$)': '' # Replace ള് with ൾ when not at word end, string end and not followed by ള
'ദു:ഖ': 'ദുഃഖ' # Common Mistake
'നമ:': 'നമഃ' # Common mistake: ASCII colon typed in place of the visarga sign (ഃ)
'ററ': 'റ്റ' # To correct പൂമ്പാററ. Fails for കണ്ടംപററി
'(^|\s)ാ': '\1ആ' # Map vowel sign "ാ" to "ആ" at the beginning of a word
'(^|\s)ി': '\1ഇ' # Map vowel sign "ി" to "ഇ" at the beginning of a word
'(^|\s)ീ': '\1ഈ' # Map vowel sign "ീ" to "ഈ" at the beginning of a word
'(^|\s)ു': '\1ഉ' # Map vowel sign "ു" to "ഉ" at the beginning of a word
'(^|\s)ൂ': '\1ഊ' # Map vowel sign "ൂ" to "ഊ" at the beginning of a word
'(^|\s)ൃ': '\1ഋ' # Map vowel sign "ൃ" to "ഋ" at the beginning of a word
'(^|\s)െ': '\1എ' # Map vowel sign "െ" to "എ" at the beginning of a word
'(^|\s)േ': '\1ഏ' # Map vowel sign "േ" to "ഏ" at the beginning of a word
'(^|\s)ൈ': '\1ഐ' # Map vowel sign "ൈ" to "ഐ" at the beginning of a word
'(^|\s)ൊ': '\1ഒ' # Map vowel sign "ൊ" to "ഒ" at the beginning of a word
'(^|\s)ോ': '\1ഓ' # Map vowel sign "ോ" to "ഓ" at the beginning of a word
'(^|\s)ൗ': '\1ഔ' # Map vowel sign "ൗ" to "ഔ" at the beginning of a word
# Common Error patterns described in https://github.com/smc/corpus/blob/master/tools/corpora-cleanup.sed
'ൻറെ' : 'ന്റെ'
'പക്ഷെ': 'പക്ഷേ'
'ൻറും' : 'ന്റും'
'ൻറ്': 'ന്റ്'
'ൻറിൽ' : 'ന്റിൽ'
'ുൻപോൾ' : 'ുമ്പോൾ'


7 changes: 6 additions & 1 deletion libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_normalize(self):
self.assertEqual(normalize('അവൻ‌'), 'അവൻ')
self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി')
self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ')
self.assertEqual(normalize('സോഫ്റ്റ്‍വെയർ'), 'സോഫ്റ്റ്വെയർ') #soft_ware written with an zwj, before ware gets removed.
self.assertEqual(normalize('സോഫ്റ്റ്‍വെയർ'), 'സോഫ്റ്റ്വെയർ') #soft_ware written with an zwj, before ‌_ware gets removed.
self.assertEqual(normalize('ആറ്റ്‌ലി'), 'ആറ്റ്‌ലി')
self.assertEqual(normalize('ഇൻസ്റ്റിറ്റ്യൂട്ട്'), 'ഇൻസ്റ്റിറ്റ്യൂട്ട്')
self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം')
Expand All @@ -63,6 +63,11 @@ def test_normalize(self):
self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True),
'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize(' ൊന്നിലോ'), ' ഒന്നിലോ')
self.assertEqual(normalize('ൌന്നത്യം'), 'ഔന്നത്യം')
self.assertEqual(normalize('പാൻറ്'), 'പാന്റ്')





Expand Down

0 comments on commit 940e46e

Please sign in to comment.