From cfc2777729d180293b2e55295f87cdd09fb1c509 Mon Sep 17 00:00:00 2001
From: axif <muhamadasif570@gmail.com>
Date: Thu, 16 Jan 2025 00:51:01 +0600
Subject: [PATCH] Fix total with Wikidata dump: forward data types to the dump parser

---
 src/scribe_data/cli/total.py               |  7 +-
 src/scribe_data/wikidata/wikidata_utils.py |  8 +-
 src/scribe_data/wiktionary/parse_dump.py   | 27 +++++++
 tests/cli/test_total.py                    | 90 ++++++++++++++++++++++
 4 files changed, 124 insertions(+), 8 deletions(-)

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 89396f72..b867a48f 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -367,11 +367,15 @@ def total_wrapper(
     """
     # Handle --all flag
     if all_bool and wikidata_dump:
-        language = "all"
+        if data_type is None:
+            data_type = "all"
+        if language is None:
+            language = "all"
 
     if wikidata_dump is True:  # flag without a wikidata lexeme dump path
         parse_wd_lexeme_dump(
             language=language,
+            data_types=[data_type],
             wikidata_dump_type=["total"],
             wikidata_dump_path=None,
         )
@@ -380,6 +384,7 @@ def total_wrapper(
     if isinstance(wikidata_dump, str):  # if user provided a wikidata lexeme dump path
         parse_wd_lexeme_dump(
             language=language,
+            data_types=[data_type],
             wikidata_dump_type=["total"],
             wikidata_dump_path=wikidata_dump,
         )
diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py
index 18d01895..cf6fb872 100644
--- a/src/scribe_data/wikidata/wikidata_utils.py
+++ b/src/scribe_data/wikidata/wikidata_utils.py
@@ -93,9 +93,6 @@ def parse_wd_lexeme_dump(
     if isinstance(language, str) and language.lower() == "all":
         language = list(language_metadata.keys())
 
-    # For printing: include all data types including translations
-    display_data_types = list(data_type_metadata.keys())
-
     # For processing: exclude translations and emoji-keywords
     if isinstance(data_types, str) and data_types.lower() == "all":
         data_types = [
@@ -103,12 +100,9 @@ def parse_wd_lexeme_dump(
             for dt in data_type_metadata.keys()
             if dt != "translations" and dt != "emoji-keywords"
         ]
-        display_data_types += ["translations"]
-    else:
-        display_data_types = data_types
 
     print(f"Languages to process: {language}")
-    print(f"Data types to process: {display_data_types}")
+    print(f"Data types to process: {data_types}")
 
     file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path)
 
diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py
index f9fcb158..fa6bd0f6 100644
--- a/src/scribe_data/wiktionary/parse_dump.py
+++ b/src/scribe_data/wiktionary/parse_dump.py
@@ -249,6 +249,32 @@ def _get_form_name(self, features):
 
         return "".join(form_parts)
 
+    def _process_totals(self, lexeme, lang_code, category_name):
+        """
+        Count one lexeme, and its senses with glosses, toward the running totals.
+
+        Nothing is counted when ``self.data_types`` names at least one data type
+        and ``category_name`` is not among them (compared case-insensitively).
+        ``None`` entries (e.g. the CLI's unset data type) never act as a filter.
+
+        Both counters are updated in place; nothing is returned.
+        """
+        wanted = {dt.lower() for dt in self.data_types or () if dt is not None}
+        if wanted and category_name.lower() not in wanted:
+            return
+
+        # Increment lexeme count for this language and category.
+        self.lexical_category_counts[lang_code][category_name] += 1
+
+        # A sense counts when any of its gloss keys is in self.valid_iso_codes.
+        translation_count = sum(
+            1
+            for sense in lexeme.get("senses") or ()
+            if any(iso in self.valid_iso_codes for iso in sense.get("glosses") or ())
+        )
+        if translation_count > 0:
+            self.translation_counts[lang_code][category_name] += translation_count
+
     # MARK: process file
     def process_file(self, file_path: str, batch_size: int = 50000):
         """
@@ -611,3 +637,4 @@ def parse_dump(
     #                 print(f"    {i}. {readable_features}")
 
     # print_unique_forms(processor.unique_forms)
+    # print(processor.unique_forms)
diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py
index 7ede34b4..a9640142 100644
--- a/tests/cli/test_total.py
+++ b/tests/cli/test_total.py
@@ -274,3 +274,93 @@ def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes):
     def test_total_wrapper_invalid_input(self):
         with self.assertRaises(ValueError):
             total_wrapper()
+
+    # MARK: Using wikidata_dump
+    @patch("scribe_data.cli.total.parse_wd_lexeme_dump")
+    def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump):
+        """Bare wikidata_dump flag: unset filters are forwarded as None and [None]."""
+        total_wrapper(wikidata_dump=True)
+        mock_parse_dump.assert_called_once_with(
+            language=None,
+            data_types=[None],
+            wikidata_dump_type=["total"],
+            wikidata_dump_path=None,
+        )
+
+    @patch("scribe_data.cli.total.parse_wd_lexeme_dump")
+    def test_total_wrapper_wikidata_dump_path(self, mock_parse_dump):
+        """A string wikidata_dump is forwarded unchanged as wikidata_dump_path."""
+        dump_path = "/path/to/dump.json"
+        total_wrapper(wikidata_dump=dump_path)
+        mock_parse_dump.assert_called_once_with(
+            language=None,
+            data_types=[None],
+            wikidata_dump_type=["total"],
+            wikidata_dump_path=dump_path,
+        )
+
+    @patch("scribe_data.cli.total.parse_wd_lexeme_dump")
+    def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump):
+        """With all_bool and a bare dump flag, language and data type become 'all'."""
+        total_wrapper(wikidata_dump=True, all_bool=True)
+        mock_parse_dump.assert_called_once_with(
+            language="all",
+            data_types=["all"],
+            wikidata_dump_type=["total"],
+            wikidata_dump_path=None,
+        )
+
+    @patch("scribe_data.cli.total.parse_wd_lexeme_dump")
+    def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dump):
+        """Explicit language and data type reach the dump parser unchanged."""
+        total_wrapper(
+            language="English", data_type="nouns", wikidata_dump="/path/to/dump.json"
+        )
+        mock_parse_dump.assert_called_once_with(
+            language="English",
+            data_types=["nouns"],
+            wikidata_dump_type=["total"],
+            wikidata_dump_path="/path/to/dump.json",
+        )
+
+    # MARK: Using QID
+    @patch("scribe_data.cli.total.check_qid_is_language")
+    @patch("scribe_data.cli.total.print_total_lexemes")
+    def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid):
+        """
+        A language QID alone is passed to print_total_lexemes unchanged.
+        """
+        mock_check_qid.return_value = "Thai"
+        total_wrapper(language="Q9217")
+        mock_print_total.assert_called_once_with(language="Q9217")
+
+    @patch("scribe_data.cli.total.check_qid_is_language")
+    @patch("scribe_data.cli.total.get_total_lexemes")
+    def test_total_wrapper_with_qid_and_datatype(self, mock_get_total, mock_check_qid):
+        """
+        A language QID plus a data type is passed to get_total_lexemes unchanged.
+        """
+        mock_check_qid.return_value = "Thai"
+        total_wrapper(language="Q9217", data_type="nouns")
+        mock_get_total.assert_called_once_with(language="Q9217", data_type="nouns")
+
+    @patch("scribe_data.cli.total.parse_wd_lexeme_dump")
+    def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump):
+        """
+        With all_bool and a dump, a language QID is kept; data type becomes 'all'.
+        """
+        total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True)
+        mock_parse_dump.assert_called_once_with(
+            language="Q9217",
+            data_types=["all"],
+            wikidata_dump_type=["total"],
+            wikidata_dump_path=None,
+        )
+
+    @patch("scribe_data.cli.total.get_total_lexemes")
+    def test_get_total_lexemes_with_qid(self, mock_get_total):
+        """
+        QIDs for both language and data type are forwarded to get_total_lexemes.
+        """
+        total_wrapper(language="Q9217", data_type="Q1084")  # Q1084 is noun QID
+        mock_get_total.assert_called_once_with(language="Q9217", data_type="Q1084")