Skip to content

Commit

Permalink
Move language attribute to PhonologicalSymbol.Inventory and update the language of Graphemes from their inventory.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 730459942
  • Loading branch information
isingoo authored and copybara-github committed Feb 24, 2025
1 parent 0b6d007 commit 46ed0e6
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 43 deletions.
5 changes: 4 additions & 1 deletion nisaba/scripts/natural_translit/language_params/en.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@
from nisaba.scripts.natural_translit.script.inventories import latn as l


LANGUAGE = g.Grapheme.LANGUAGE.en


def _latn_inventory() -> g.Grapheme.Inventory:
"""Builds a grapheme inventory for English."""
latn = l.LATN
ph = x_uni.PHONEMES
gr = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.latn, 'en')
gr = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.latn, LANGUAGE)
lowercase = [
latn.a,
latn.b,
Expand Down
11 changes: 8 additions & 3 deletions nisaba/scripts/natural_translit/phonology/phonological_symbol.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,14 @@ def has_feature(self, value: ft.Feature.Aspect.VALUES) -> bool:
class Inventory(sym.Symbol.Inventory):
"""Phonological symbol inventory."""

def __init__(self, alias: str, typed: ty.TypeOrNothing = ty.UNSPECIFIED):
def __init__(
self,
alias,
language: lang.Language.OR_NOTHING = ty.UNSPECIFIED,
typed: ty.TypeOrNothing = ty.UNSPECIFIED,
):
super().__init__(alias, typed=ty.type_check(typed, PhonologicalSymbol))
self.language = ty.type_check(language, Phon.LANGUAGE.x_uni)
self.atomics = i.Inventory()

def _add_symbol_and_atomic(self, symbol: PhonologicalSymbol) -> bool:
Expand Down Expand Up @@ -166,8 +172,7 @@ class Inventory(PhonologicalSymbol.Inventory):

def __init__(self, language: lang.Language.OR_NOTHING = ty.UNSPECIFIED):
language = ty.type_check(language, Phon.LANGUAGE.x_uni)
super().__init__(language.alias, typed=Phon)
self.language = language
super().__init__(language.alias, language, Phon)

def _add_phoneme(self, phoneme: Phon) -> bool:
"""Adds a phoneme to the inventory."""
Expand Down
2 changes: 1 addition & 1 deletion nisaba/scripts/natural_translit/script/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ py_library(
srcs = ["grapheme.py"],
deps = [
"//nisaba/scripts/natural_translit/phonology:phonological_symbol",
"//nisaba/scripts/natural_translit/phonology/features:language",
"//nisaba/scripts/natural_translit/utils:feature",
"//nisaba/scripts/natural_translit/utils:inventory",
"//nisaba/scripts/natural_translit/utils:type_op",
requirement("pycountry"),
],
Expand Down
15 changes: 9 additions & 6 deletions nisaba/scripts/natural_translit/script/grapheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
import pycountry

from nisaba.scripts.natural_translit.phonology import phonological_symbol as ps
from nisaba.scripts.natural_translit.phonology.features import language as lang
from nisaba.scripts.natural_translit.utils import feature as ft
from nisaba.scripts.natural_translit.utils import inventory as i
from nisaba.scripts.natural_translit.utils import type_op as ty


Expand Down Expand Up @@ -168,13 +168,15 @@ def copy(
class Inventory(ps.PhonologicalSymbol.Inventory):
"""Grapheme inventory."""

def __init__(self, script: Script, language: str = ''):
if language:
language += '_'
super().__init__(alias=language + script.alias, typed=Grapheme)
def __init__(
self,
script: Script,
language: lang.Language.OR_NOTHING = ty.UNSPECIFIED,
):
alias_prefix = language.alias + '_' if language else ''
super().__init__(alias_prefix + script.alias, language, Grapheme)
self.script = script
self.prefix = self._prefix()
self.atomics = i.Inventory()

def _prefix(self) -> int:
return (
Expand All @@ -188,6 +190,7 @@ def add_graphemes(
grs = []
for gr in graphemes:
if self._add_symbol_and_atomic(gr):
gr.language = self.language
grs.append(gr)
gr.inventory = self
if list_alias:
Expand Down
72 changes: 40 additions & 32 deletions nisaba/scripts/natural_translit/script/grapheme_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,21 @@
from nisaba.scripts.natural_translit.script import grapheme as g
from nisaba.scripts.natural_translit.utils import test_op

_UND_GRAPHEMES = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.und)
_G = g.Grapheme
_UND_GRAPHEMES = _G.Inventory(_G.GR_FEATURES.script.und)


def _test_inventory() -> g.Grapheme.Inventory:
def _test_inventory() -> _G.Inventory:
"""Test grapheme inventory."""
gr_inv = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.latn)
gr_inv = _G.Inventory(_G.GR_FEATURES.script.latn)
gr_inv.add_graphemes(
# Raw
g.Grapheme.from_char('a', 'a'),
_G.from_char('a', 'a'),
# Abstract with custom text
g.Grapheme('nasal', '~'),
_G('nasal', '~'),
# Abstract with no text
# Eg. Graphemes dynamically generated by aligner for partial match.
g.Grapheme('ch_1'),
_G('ch_1'),
)
return gr_inv

Expand All @@ -40,102 +41,103 @@ class GraphemeTest(test_op.TestCase):

def test_script_iso(self):
self.AssertStrEqual(
g.Grapheme.GR_FEATURES.script.latn,
_G.GR_FEATURES.script.latn,
'alias: latn text: Latin numeric: 215',
)

def test_script_custom(self):
self.AssertStrEqual(
g.Grapheme.GR_FEATURES.script.br,
_G.GR_FEATURES.script.br,
'alias: br text: Brahmic Parent numeric: 801',
)

def test_from_char_index(self):
self.assertEqual(
g.Grapheme.from_char('a').index,
g.Grapheme.ReservedIndex.GRAPHEME_PREFIX + 97,
_G.from_char('a').index,
_G.ReservedIndex.GRAPHEME_PREFIX + 97,
)

def test_from_char_name(self):
self.assertEqual(g.Grapheme.from_char('🐱').name, 'CAT FACE U+1F431')
self.assertEqual(_G.from_char('🐱').name, 'CAT FACE U+1F431')

def test_from_char_name_error(self):
self.assertEqual(
g.Grapheme.from_char(chr(0xE027)).name, # Unassignable PUA character
_G.from_char(chr(0xE027)).name, # Unassignable PUA character
'GRAPHEME U+E027'
)

def test_from_char_alias_explicit(self):
self.assertEqual(g.Grapheme.from_char('ç', 'c_ced').alias, 'c_ced')
self.assertEqual(_G.from_char('ç', 'c_ced').alias, 'c_ced')

def test_from_char_alias_default(self):
self.assertEqual(g.Grapheme.from_char('ç').alias, 'u_0xe7')
self.assertEqual(_G.from_char('ç').alias, 'u_0xe7')

def test_from_char_has_feature(self):
self.AssertHasFeature(
g.Grapheme.from_char('a'), g.Grapheme.SYM_FEATURES.type.raw
_G.from_char('a'), _G.SYM_FEATURES.type.raw
)

def test_from_char_add_feature(self):
self.AssertHasFeature(
g.Grapheme.from_char(
'n', features=g.Grapheme.DESCRIPTIVE_FEATURES.ph_class.consonant
_G.from_char(
'n', features=_G.DESCRIPTIVE_FEATURES.ph_class.consonant
),
g.Grapheme.DESCRIPTIVE_FEATURES.ph_class.consonant,
_G.DESCRIPTIVE_FEATURES.ph_class.consonant,
)

def test_control_index(self):
self.assertEqual(
_UND_GRAPHEMES.CTRL.eps.index, g.Grapheme.ReservedIndex.CONTROL_PREFIX
_UND_GRAPHEMES.CTRL.eps.index, _G.ReservedIndex.CONTROL_PREFIX
)

def test_control_in_text_dict(self):
self.assertIn(g.Grapheme.CTRL.unk.text, _UND_GRAPHEMES.text_dict)
self.assertIn(_G.CTRL.unk.text, _UND_GRAPHEMES.text_dict)

def test_control_not_in_raw_dict(self):
self.assertNotIn(g.Grapheme.CTRL.unk.text, _UND_GRAPHEMES.raw_dict)
self.assertNotIn(_G.CTRL.unk.text, _UND_GRAPHEMES.raw_dict)

def test_control_in_index_dict(self):
self.assertIn(g.Grapheme.CTRL.oos.index, _UND_GRAPHEMES.index_dict)
self.assertIn(_G.CTRL.oos.index, _UND_GRAPHEMES.index_dict)

def test_inventory(self):
self.assertEqual(_UND_GRAPHEMES.alias, 'und')
self.assertEqual(_UND_GRAPHEMES.prefix, 2_800_000)
self.assertEqual(_UND_GRAPHEMES.language, _G.LANGUAGE.x_uni)

def test_add_grapheme_in_dicts(self):
char = 'ß'
_UND_GRAPHEMES.add_graphemes(g.Grapheme.from_char(char, alias='ss'))
_UND_GRAPHEMES.add_graphemes(_G.from_char(char, alias='ss'))
self.assertEqual(_UND_GRAPHEMES.raw_lookup(char), _UND_GRAPHEMES.ss)
self.assertEqual(_UND_GRAPHEMES.text_lookup(char), _UND_GRAPHEMES.ss)
self.assertEqual(
_UND_GRAPHEMES.index_lookup(
g.Grapheme.ReservedIndex.GRAPHEME_PREFIX + ord(char)
_G.ReservedIndex.GRAPHEME_PREFIX + ord(char)
),
_UND_GRAPHEMES.ss,
)
self.assertNotEqual(_UND_GRAPHEMES.atomics.ss, _UND_GRAPHEMES.ss)
self.AssertEquivalent(_UND_GRAPHEMES.atomics.ss, _UND_GRAPHEMES.ss)

def test_add_grapheme_recurring_alias(self):
_UND_GRAPHEMES.add_graphemes(g.Grapheme.from_char('œ', alias='oe'))
_UND_GRAPHEMES.add_graphemes(_G.from_char('œ', alias='oe'))
self.assertFalse(
_UND_GRAPHEMES.add_graphemes(g.Grapheme.from_char('Œ', alias='oe'))
_UND_GRAPHEMES.add_graphemes(_G.from_char('Œ', alias='oe'))
)

def test_add_grapheme_wrong_type(self):
self.assertFalse(_UND_GRAPHEMES._add_symbol(g.Grapheme.CTRL.eps))
self.assertFalse(_UND_GRAPHEMES._add_symbol(_G.CTRL.eps))

def test_add_graphemes(self):
_UND_GRAPHEMES.add_graphemes(
g.Grapheme.from_char('(', 'prl'),
g.Grapheme.from_char(')', 'prr'),
_G.from_char('(', 'prl'),
_G.from_char(')', 'prr'),
list_alias='parentheses',
)
self.assertIn(_UND_GRAPHEMES.prl, _UND_GRAPHEMES)
self.assertIn(_UND_GRAPHEMES.prl, _UND_GRAPHEMES.parentheses)

def test_get_grapheme_in_inventory(self):
_UND_GRAPHEMES.add_graphemes(g.Grapheme.from_char('æ', alias='ae'))
_UND_GRAPHEMES.add_graphemes(_G.from_char('æ', alias='ae'))
self.assertEqual(_UND_GRAPHEMES.text_lookup('æ'), _UND_GRAPHEMES.ae)

def test_get_grapheme_out_of_inventory_char(self):
Expand All @@ -149,7 +151,7 @@ def test_parse(self):
_UND_GRAPHEMES.parse('ş'), [_UND_GRAPHEMES.get('u_' + hex(ord('ş')))]
)
self.assertIn(
g.Grapheme.ReservedIndex.GRAPHEME_PREFIX + ord('ş'),
_G.ReservedIndex.GRAPHEME_PREFIX + ord('ş'),
_UND_GRAPHEMES.index_dict,
)

Expand All @@ -168,7 +170,9 @@ def test_grapheme_description(self):
)

def test_import_graphemes(self):
new_inv = g.Grapheme.Inventory(g.Grapheme.GR_FEATURES.script.latn)
new_inv = _G.Inventory(
_G.GR_FEATURES.script.latn, _G.LANGUAGE.en
)
new_inv.import_graphemes(
_TEST_INVENTORY.a, _TEST_INVENTORY.nasal, list_alias='from_test'
)
Expand All @@ -177,6 +181,10 @@ def test_import_graphemes(self):
new_inv.a.description(show_features=True),
_TEST_INVENTORY.a.description(show_features=True),
)
self.assertEqual(new_inv.language, _G.LANGUAGE.en)
self.assertEqual(new_inv.a.language, _G.LANGUAGE.en)
self.AssertHasFeature(new_inv.a, _G.LANGUAGE.en)
self.AssertHasFeature(new_inv.a, _G.LANGUAGE.indo_european)


if __name__ == '__main__':
Expand Down

0 comments on commit 46ed0e6

Please sign in to comment.