aider: Fix the argument type in the is_within_loan_span function by…

… updating the type hint for the `loan_spans` parameter. 1. 優化性能 1. 增加「定係」「一係」作為判別標準 1. 改正 README 命令
CanCLID · Jun 14, 2024 · f3b415e · f3b415e
1 parent efc099a
commit f3b415e
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -89,7 +89,7 @@ cantofilter --input input.txt --mode cantonese > output.txt
 你亦都可以剩係輸出啲句子嘅分類結果，用 `--mode label` 就得：
 
 ```bash
-cantofilter --input input.txt --model label > output.txt
+cantofilter --input input.txt --mode label > output.txt
 ```
 
 噉樣嘅 `output.txt` 剩得一列，全部都係分類標籤。

diff --git a/cantofilter/judge.py b/cantofilter/judge.py
@@ -9,15 +9,16 @@
 """
 from enum import StrEnum, auto
 import re
-from typing import List, Tuple
+from typing import Set, Tuple
 
 CANTO_UNIQUE = re.compile(
     r'[嘅嗰啲咗佢喺咁噉冇啩哋畀嚟諗惗乜嘢閪撚𨳍𨳊瞓睇㗎餸𨋢摷喎嚿噃嚡嘥嗮啱揾搵喐逳噏𢳂岋糴揈捹撳㩒𥄫攰癐冚孻冧𡃁嚫跣𨃩瀡氹嬲掟孭黐唞㪗埞忟𢛴]|' +
-    r'唔[係得會好識使洗駛通知到去走掂該錯差]|點[樣會做得解]|[琴尋噚聽第]日|[而依]家|家[下陣]|[真就實梗又話都但淨剩只]係|邊[度個位科]|' +
+    r'唔[係得會好識使洗駛通知到去走掂該錯差]|點[樣會做得解]|[琴尋噚聽第]日|[而依]家|家[下陣]|[真就實梗又話都但淨剩只定一]係|邊[度個位科]|' +
     r'[嚇凍攝整揩逢淥浸激][親嚫]|[橫搞傾諗得唔]掂|仲[有係話要得好衰唔]|返[學工去歸]|執[好生實返輸]|' +
     r'屋企|收皮|慳錢|傾[偈計]|幫襯|求其|是[但旦]|[濕溼]碎|零舍|肉[赤緊酸]|核突|同埋|勁[秋抽]')
 MANDO_UNIQUE = re.compile(r'[這哪您們唄咱啥甭她]|還[是好有]')
-# “在不” 因為太多融入粵語所以唔喺判別標準內
+# “在不把” 因為太多融入粵語所以唔喺判別標準內
+# 在, 不 and 把 are too deeply absorbed into Cantonese, so they are not included in the judgment criteria
 MANDO_FEATURE = re.compile(r'[那是的他它看吧沒麼么些了卻説說吃弄也]|而已')
 MANDO_LOAN = re.compile(r'亞利桑那|剎那|巴塞羅那|薩那|沙那|哈瓦那|印第安那|那不勒斯|支那|' +
                         r'是[否日次非但旦]|[利於]是|唯命是從|頭頭是道|似是而非|自以為是|俯拾皆是|撩是鬥非|莫衷一是|唯才是用|' +
@@ -28,29 +29,33 @@
                         r'他[信人國日殺鄉]|[其利無排維結]他|馬耳他|他加祿|他山之石|' +
                         r'其[它]|' +
                         r'[收查窺觀]看|看[守住好護]|刮目相看|' +
-                        r'[酒網水貼]吧|吧[台臺枱檯]|' +
+                        r'[酒網水貼]吧|吧[務台臺枱檯]|' +
                         r'[退忘阻]卻|卻步|' +
                         r'[遊游小傳解學假淺眾衆訴論][説說]|[說説][話服明]|自圓其[説說]|長話短[說説]|不由分[說説]|' +
                         r'吃[虧苦力]|' +
                         r'弄[堂]|[賣擺嘲]弄|' +
-                        r'可怒也|可惱也|可惱也|如也|也門|之乎者也|天助我也')
+                        r'可[怒惱]也|如也|也門|之乎者也|天助我也')
 
 
 class LanguageType(StrEnum):
+    '''
+    總共有四個分類：粵語、官話、官話溝粵語、中性
+    There are four categories: Cantonese, Mandarin, mixed-Mandarin-Cantonese, and neutral
+    '''
     CANTONESE = auto()
     MANDARIN = auto()
     MIXED = auto()
     NEUTRAL = auto()
 
 
-def is_within_loan_span(feature_span: Tuple[int, int], loan_spans: List[Tuple[int, int]]) -> bool:
+def is_within_loan_span(feature_span: Tuple[int, int], loan_spans: Set[Tuple[int, int]]) -> bool:
     '''
     判斷一個官話特徵係唔係借詞。如果佢嘅位置喺某個借詞區間，就係借詞
     Judge whether a Mandarin feature is a loan word. If its position is within a loan span, it is a loan.
 
     Args:
         feature_span (Tuple[int, int]): 官話特徵嘅位置  Mandarin feature position
-        loan_spans (List[Tuple[int, int]]): 借詞嘅位置  Loan word positions
+        loan_spans (Set[Tuple[int, int]]): 借詞嘅位置  Loan word positions
     Returns:
         bool: 係唔係官話借詞 Whether the input feature is a Mandarin loan word
     '''
@@ -69,7 +74,7 @@ def is_all_loan(s: str) -> bool:
     mando_features = MANDO_FEATURE.finditer(s)
     mando_loans = MANDO_LOAN.finditer(s)
     feature_spans = [m.span() for m in mando_features]
-    loan_spans = [m.span() for m in mando_loans]
+    loan_spans = set(m.span() for m in mando_loans)
 
     # 如果所有官話特徵都喺借詞區間，噉就全部都係借詞
     # If all Mandarin features are within loan word spans, then all are loan words.
@@ -89,9 +94,9 @@ def judge(s: str) -> LanguageType:
     Returns:
         LanguageType: 粵語、官話、官話溝粵語定係中性 LanguageType.CANTONESE, LanguageType.MANDARIN, LanguageType.MIXED, or LanguageType.NEUTRAL.
     '''
-    has_canto_unique = bool(re.search(CANTO_UNIQUE, s))
-    has_mando_unique = bool(re.search(MANDO_UNIQUE, s))
-    has_mando_feature = bool(re.search(MANDO_FEATURE, s))
+    has_canto_unique = bool(CANTO_UNIQUE.search(s))
+    has_mando_unique = bool(MANDO_UNIQUE.search(s))
+    has_mando_feature = bool(MANDO_FEATURE.search(s))
 
     if has_canto_unique:
         # 含有粵語成分
@@ -105,32 +110,17 @@ def judge(s: str) -> LanguageType:
             # Contain Mandarin features, has Mandarin unique words, so it is Mandarin-Cantonese mixed
             return LanguageType.MIXED
         else:
-            # 含有官話成分，冇官話專屬詞，有可能官話借詞，亦都算粵語
-            # Contain Mandarin features, no Mandarin unique words,
-            # which may be Mandarin loan words that also count as Cantonese
-            if is_all_loan(s):
-                # 所有官話特色都係借詞，所以仲係算粵語
-                # All Mandarin features are loan words, so still count as Cantonese
-                return LanguageType.CANTONESE
-            else:
-                # 有官話特色字唔係借詞，所以係官話溝粵語
-                # Some Mandarin features are not loan words, so it is Mandarin-Cantonese mixed
-                return LanguageType.MIXED
+            # 既有粵語特徵亦有官話特徵。如果全部官話特徵都係借詞，則算粵語。如果有官話特徵唔係借詞，則為官話溝粵語。
+            # Contains both Cantonese and Mandarin features. If all Mandarin features are loan words, it is Cantonese; otherwise, it is Mandarin-Cantonese mixed.
+            return LanguageType.CANTONESE if is_all_loan(s) else LanguageType.MIXED
     elif has_mando_unique:
-        # 冇粵語成分
-        # No Cantonese features
+        # 冇粵語成分，且包含官話獨有詞，則判斷為官話
+        # No Cantonese features, and it contains Mandarin unique words, so it is Mandarin
         return LanguageType.MANDARIN
     elif has_mando_feature:
-        # 有官話特徵但係要判斷係唔係全部都係借詞
-        # Has Mandarin features but need to judge whether all are loan words
-        if is_all_loan(s):
-            # 全部都係借詞，唔算官話
-            # All are loan words, not count as Mandarin
-            return LanguageType.NEUTRAL
-        else:
-            # 有特徵唔係借詞，所以算官話
-            # Some features are not Mandarin loan words, so count as Mandarin
-            return LanguageType.MANDARIN
+        # 冇粵語特徵且有官話特徵，但如果全部都係借詞就唔算官話，否則係官話
+        # No Cantonese features, but there are Mandarin features. If all of these Mandarin features are loan words, it does not count as Mandarin; otherwise, it is Mandarin.
+        return LanguageType.NEUTRAL if is_all_loan(s) else LanguageType.MANDARIN
     else:
         # 冇任何特徵，既可以當粵語亦可以當官話
         # No features, can be either Cantonese or Mandarin

diff --git a/tests/test_judge.py b/tests/test_judge.py
@@ -1,24 +1,28 @@
-import unittest
 from cantofilter.judge import LanguageType, judge
+import unittest
+
 
 def load_test_sentences(file_path):
     with open(file_path, 'r', encoding='utf-8') as file:
-        lines = [line.strip() for line in file if line.strip() and not line.startswith('#')]
+        lines = [line.strip() for line in file if line.strip()
+                 and not line.startswith('#')]
     test_cases = []
     for line in lines:
         if '|' in line:
             sentence, expected = line.split('|')
             test_cases.append((sentence, LanguageType[expected.upper()]))
     return test_cases
 
+
 test_cases = load_test_sentences('tests/test_sentences.txt')
 
 
 class TestJudgeFunction(unittest.TestCase):
     def test_judge(self):
         for sentence, expected in test_cases:
             result = judge(sentence)
-            self.assertEqual(result, expected, f"Failed for input: {sentence}. Expected: {expected.name}, but got: {result.name}")
+            self.assertEqual(
+                result, expected, f"Failed for input: {sentence}. Expected: {expected.name}, but got: {result.name}")
 
 
 if __name__ == "__main__":