diff --git a/nisaba/scripts/natural_translit/utils/expression.py b/nisaba/scripts/natural_translit/utils/expression.py index 8c07cfac..9c1a1b65 100644 --- a/nisaba/scripts/natural_translit/utils/expression.py +++ b/nisaba/scripts/natural_translit/utils/expression.py @@ -112,6 +112,19 @@ def _symbols_of( else: return [[other]] + def is_any(self) -> bool: + if isinstance(self, sym.Symbol): + return False + if len(self) == 1: + return self.item(0).is_any() + return self is Expression.ANY + + def is_eps(self) -> bool: + return isinstance(self, sym.Symbol) and self.symbol.is_eps() + + def is_nor(self) -> bool: + return isinstance(self, sym.Symbol) and self.symbol.is_nor() + def accepts( self, other: 'Expression.OR_SYMBOL', equivalent: bool = False ) -> bool: @@ -124,6 +137,10 @@ def accepts( Returns: bool """ + if self.is_any() or other.is_any(): + if equivalent: + return self.is_any() and other.is_any() + return True self_symbols, other_symbols = self.symbols(), self._symbols_of(other) self_len, other_len = len(self_symbols), len(other_symbols) if ( @@ -198,6 +215,8 @@ def contains_symbol_list( return True if search_for == [sym.Symbol.CTRL.nor]: return False + if self.is_any(): + return True # Loop over symbol lists, eg: [[a, b, c, d], [e, f, g]] for symbol_list in self.symbols(): while symbol_list: @@ -251,15 +270,19 @@ def contains( a.contains(b, head=True): [a, b, c, d] starts with [a, b] a.contains(b, tail=True): False """ + if self.is_any() or other.is_any(): + return not self.is_nor() and not other.is_nor() for sym_list in self._symbols_of(other): if self.contains_symbol_list(sym_list, head, tail): return True return False def _symbol_contains(self, other: sym.Symbol) -> bool: + if other.is_any(): + return True self_symbols = self.symbols() return [sym.Symbol.CTRL.eps] in self_symbols or ( - other != sym.Symbol.CTRL.nor and [other] in self_symbols + not other.is_nor() and [other] in self_symbols ) def is_contained( @@ -278,13 +301,13 @@ def matches(self, other: 
'Expression.OR_SYMBOL') -> bool: return self.contains(other, head=True, tail=True) # head_matches and tail_matches require at least one symbol match unless - # both expressions are empty Cats. For example, if a rule requires a vowel as - # following context but there is no following context, the rule shouldn't - # apply. + # both expressions are empty Cats or one of the expressions is Expression.ANY. + # For example, if a rule requires a vowel as following context but there is no + # following context, the rule shouldn't apply. def head_matches(self, other: 'Expression.OR_SYMBOL') -> bool: if self and not other: - return False + return other.is_any() return self.contains(other, head=True) def is_prefix(self, other: 'Expression.OR_SYMBOL') -> bool: @@ -294,7 +317,7 @@ def is_prefix(self, other: 'Expression.OR_SYMBOL') -> bool: def tail_matches(self, other: 'Expression.OR_SYMBOL') -> bool: if self and not other: - return False + return other.is_any() return self.contains(other, tail=True) def is_suffix(self, other: 'Expression.OR_SYMBOL') -> bool: @@ -316,6 +339,9 @@ def repeat(self, n: int = 2) -> 'Cat': return Cat(*([self] * n)) +Expression.ANY = Expression('any_expression') + + class Atomic(Expression, sym.Symbol): """An instance of a single symbol.""" @@ -392,7 +418,10 @@ def __str__(self): def add(self, *items: Expression) -> 'Cat': for item in items: - self._add_item(item) + if item.is_any(): + self._items.append(item) + else: + self._add_item(item) return self def symbols(self) -> list[list[sym.Symbol]]: @@ -462,6 +491,13 @@ def add(self, *items: Expression) -> 'Or': self """ for item in items: + # If Expression.ANY is in Or, don't add any items. + if Expression.ANY in self: + break + # If the item is any, the other items are irrelevant. 
+ if item.is_any(): + self._items = [item] + break if self.accepts(item): if item.accepts(self) and item.state_count() < self.state_count(): self._update(item) diff --git a/nisaba/scripts/natural_translit/utils/expression_test.py b/nisaba/scripts/natural_translit/utils/expression_test.py index 0c9dd15d..28331e1e 100644 --- a/nisaba/scripts/natural_translit/utils/expression_test.py +++ b/nisaba/scripts/natural_translit/utils/expression_test.py @@ -51,6 +51,8 @@ def test_atomic_read(self): def test_control(self): self.assertTrue(exp.Atomic.CTRL.unk.is_control()) + self.assertTrue(exp.Atomic.CTRL.eps.is_eps()) + self.assertTrue(exp.Atomic.CTRL.nor.is_nor()) def test_symbol_inventory_lookup(self): self.assertEqual(_ATM.lookup(_ATM.a, 'atm_sym'), _SYM.a) @@ -98,6 +100,7 @@ def test_cat_items(self): self.AssertStrEqual(cat, '(a b a)') self.assertIsNot(cat.item(0), cat.item(2)) self.AssertEquivalent(cat.item(0), (cat.item(2))) + self.assertTrue(exp.Cat(exp.Expression.ANY).is_any()) def test_cat_nested(self): cat1 = exp.Cat(_ATM.a, _ATM.b) @@ -132,10 +135,13 @@ def test_or_items(self): or2 = exp.Or(_ATM.b, _ATM.c) or3 = exp.Or(or1, or2) or4 = exp.Or(_ATM.a, _ATM.b, _ATM.a) + or5 = or1.copy().add(exp.Expression.ANY) self.AssertStrEqual(or2, '(b | c)') self.AssertStrEqual(or3, '(a | b | c)') self.AssertStrEqual(or4, '(a | b)') self.assertLen(or4, 2) + self.assertNotIn(_ATM.a, or5) + self.AssertAccepts(or5, _ATM.a) def test_or_nested(self): cat1 = exp.Cat(_ATM.a) @@ -214,6 +220,8 @@ def test_state_count(self): def test_equivalent(self): or0 = exp.Or() self.AssertEquivalent(exp.Atomic.CTRL.eps, sym.Symbol.CTRL.eps) + self.AssertEquivalent(exp.Expression.ANY, exp.Expression.ANY) + self.AssertNotEquivalent(exp.Expression.ANY, exp.Atomic.CTRL.eps) self.AssertEquivalent(exp.Cat(), exp.Atomic.CTRL.eps) self.AssertEquivalent(exp.Cat(), exp.Cat()) self.AssertNotEquivalent(or0, exp.Atomic.CTRL.nor) @@ -229,13 +237,21 @@ def test_equivalent(self): def test_contains_controls(self): eps 
= exp.Atomic.CTRL.eps nor = exp.Atomic.CTRL.nor + any_exp = exp.Expression.ANY self.AssertContains(eps, eps) self.AssertContains(nor, eps) + self.AssertNotContains(nor, any_exp) self.AssertNotContains(eps, nor) self.AssertNotContains(nor, nor) + self.AssertNotContains(any_exp, nor) def test_contains_expressions(self): cat_abc = _ATM.a + _ATM.b + _ATM.c + any_exp = exp.Expression.ANY + self.AssertContains(any_exp, _ATM.a) + self.AssertContains(_ATM.a, any_exp) + self.AssertContains(any_exp, cat_abc) + self.AssertContains(cat_abc, any_exp) self.AssertContains(cat_abc, exp.Cat()) self.AssertNotContains(cat_abc, exp.Or()) self.AssertContains(cat_abc, exp.Or(exp.Cat())) @@ -250,8 +266,15 @@ def test_contains_expressions(self): def test_matches(self): abc_or_cd = (_ATM.a + _ATM.b + _ATM.c) | (_ATM.c + _ATM.d) a_or_c_b_or_d = (_ATM.a | _ATM.c) + (_ATM.b | _ATM.d) + any_exp = exp.Expression.ANY + self.AssertMatches(any_exp, a_or_c_b_or_d) + self.AssertMatches(a_or_c_b_or_d, any_exp) self.AssertMatches(abc_or_cd, a_or_c_b_or_d) self.AssertNotMatches(abc_or_cd, _ATM.a + _ATM.b + _ATM.d) + self.assertTrue(any_exp.is_prefix(abc_or_cd)) + self.assertTrue(any_exp.is_suffix(abc_or_cd)) + self.assertTrue(abc_or_cd.is_prefix(any_exp)) + self.assertTrue(abc_or_cd.is_suffix(any_exp)) self.assertTrue(exp.Cat().is_prefix(exp.Cat())) self.assertTrue(exp.Cat().is_suffix(exp.Cat())) self.assertFalse(exp.Cat().is_prefix(abc_or_cd)) diff --git a/nisaba/scripts/natural_translit/utils/symbol.py b/nisaba/scripts/natural_translit/utils/symbol.py index 38d972c9..f53137ac 100644 --- a/nisaba/scripts/natural_translit/utils/symbol.py +++ b/nisaba/scripts/natural_translit/utils/symbol.py @@ -106,6 +106,15 @@ def __str__(self) -> str: def is_control(self) -> bool: return self in Symbol.CTRL + def is_any(self) -> bool: + return False + + def is_eps(self) -> bool: + return self is Symbol.CTRL.eps + + def is_nor(self) -> bool: + return self is Symbol.CTRL.nor + def description(self, show_features: bool 
= False) -> str: """A string that describes the symbol.""" text = 'alias: %s index: %s' % (self.alias, self.index)