From cfc2777729d180293b2e55295f87cdd09fb1c509 Mon Sep 17 00:00:00 2001 From: axif Date: Thu, 16 Jan 2025 00:51:01 +0600 Subject: [PATCH] fix total --- src/scribe_data/cli/total.py | 7 +- src/scribe_data/wikidata/wikidata_utils.py | 8 +- src/scribe_data/wiktionary/parse_dump.py | 27 +++++++ tests/cli/test_total.py | 90 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 89396f72..b867a48f 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -367,11 +367,15 @@ def total_wrapper( """ # Handle --all flag if all_bool and wikidata_dump: - language = "all" + if data_type is None: + data_type = "all" + if language is None: + language = "all" if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -380,6 +384,7 @@ def total_wrapper( if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=wikidata_dump, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 18d01895..cf6fb872 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -93,9 +93,6 @@ def parse_wd_lexeme_dump( if isinstance(language, str) and language.lower() == "all": language = list(language_metadata.keys()) - # For printing: include all data types including translations - display_data_types = list(data_type_metadata.keys()) - # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": data_types = [ @@ -103,12 +100,9 @@ def parse_wd_lexeme_dump( for dt in data_type_metadata.keys() if dt != "translations" and dt != "emoji-keywords" ] - display_data_types += ["translations"] - else: - display_data_types = data_types print(f"Languages to process: {language}") - print(f"Data types to process: {display_data_types}") + print(f"Data types to process: {data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index f9fcb158..fa6bd0f6 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -249,6 +249,32 @@ def _get_form_name(self, features): return "".join(form_parts) + def _process_totals(self, lexeme, lang_code, category_name): + """ + Process totals for statistical counting. + """ + # Skip if we have specific data types and this category isn't in them + if self.data_types and category_name.lower() not in [ + dt.lower() for dt in self.data_types + ]: + return + + # Increment lexeme count for this language and category + self.lexical_category_counts[lang_code][category_name] += 1 + + # Count translations if they exist + if lexeme.get("senses"): + translation_count = sum( + 1 + for sense in lexeme["senses"] + if sense.get("glosses") + and any( + lang in self.valid_iso_codes for lang in sense["glosses"].keys() + ) + ) + if translation_count > 0: + self.translation_counts[lang_code][category_name] += translation_count + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): """ @@ -611,3 +637,4 @@ def parse_dump( # print(f" {i}. {readable_features}") # print_unique_forms(processor.unique_forms) + # print(processor.unique_forms) diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 7ede34b4..a9640142 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -274,3 +274,93 @@ def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes): def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): total_wrapper() + + # MARK: Using wikidata_dump + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): + """Test when wikidata_dump is True (flag without path)""" + total_wrapper(wikidata_dump=True) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_path(self, mock_parse_dump): + """Test when wikidata_dump is a file path""" + dump_path = "/path/to/dump.json" + total_wrapper(wikidata_dump=dump_path) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=dump_path, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump): + """Test when both wikidata_dump and all_bool are True""" + total_wrapper(wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="all", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dump): + """Test wikidata_dump with specific language and data type""" + total_wrapper( + language="English", data_type="nouns", wikidata_dump="/path/to/dump.json" + ) + mock_parse_dump.assert_called_once_with( + language="English", + data_types=["nouns"], + wikidata_dump_type=["total"], + wikidata_dump_path="/path/to/dump.json", + ) + + # MARK: Using QID + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.print_total_lexemes") + def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid): + """ + Test when language is provided as a QID + """ + mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217") + mock_print_total.assert_called_once_with(language="Q9217") + + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.get_total_lexemes") + def test_total_wrapper_with_qid_and_datatype(self, mock_get_total, mock_check_qid): + """ + Test when language QID and data type are provided + """ + mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217", data_type="nouns") + mock_get_total.assert_called_once_with(language="Q9217", data_type="nouns") + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump): + """ + Test QID with wikidata dump + """ + total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="Q9217", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.get_total_lexemes") + def test_get_total_lexemes_with_qid(self, mock_get_total): + """ + Test get_total_lexemes with QID input + """ + total_wrapper(language="Q9217", data_type="Q1084") # Q1084 is noun QID + mock_get_total.assert_called_once_with(language="Q9217", data_type="Q1084")