Skip to content

Commit

Permalink
fix total
Browse files Browse the repository at this point in the history
  • Loading branch information
axif0 committed Jan 15, 2025
1 parent 192b09c commit cfc2777
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 8 deletions.
7 changes: 6 additions & 1 deletion src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,11 +367,15 @@ def total_wrapper(
"""
# Handle --all flag
if all_bool and wikidata_dump:
language = "all"
if data_type is None:
data_type = "all"
if language is None:
language = "all"

if wikidata_dump is True: # flag without a wikidata lexeme dump path
parse_wd_lexeme_dump(
language=language,
data_types=[data_type],
wikidata_dump_type=["total"],
wikidata_dump_path=None,
)
Expand All @@ -380,6 +384,7 @@ def total_wrapper(
if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path
parse_wd_lexeme_dump(
language=language,
data_types=[data_type],
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump,
)
Expand Down
8 changes: 1 addition & 7 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,22 +93,16 @@ def parse_wd_lexeme_dump(
if isinstance(language, str) and language.lower() == "all":
language = list(language_metadata.keys())

# For printing: include all data types including translations
display_data_types = list(data_type_metadata.keys())

# For processing: exclude translations and emoji-keywords
if isinstance(data_types, str) and data_types.lower() == "all":
data_types = [
dt
for dt in data_type_metadata.keys()
if dt != "translations" and dt != "emoji-keywords"
]
display_data_types += ["translations"]
else:
display_data_types = data_types

print(f"Languages to process: {language}")
print(f"Data types to process: {display_data_types}")
print(f"Data types to process: {data_types}")

file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path)

Expand Down
27 changes: 27 additions & 0 deletions src/scribe_data/wiktionary/parse_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,32 @@ def _get_form_name(self, features):

return "".join(form_parts)

def _process_totals(self, lexeme, lang_code, category_name):
    """
    Update the running totals for a single lexeme.

    Parameters
    ----------
    lexeme : dict
        The lexeme entry. Only its optional "senses" list is read here.

    lang_code : hashable
        Outer key under which the counts for this lexeme are stored.

    category_name : str
        Lexical category of the lexeme; used as the inner count key and
        compared case-insensitively against ``self.data_types``.

    Returns
    -------
    None
        ``self.lexical_category_counts`` and ``self.translation_counts``
        are updated in place.
    """
    # Only string entries can name a data type. Dropping anything else
    # (e.g. a ``None`` placeholder for "no data type given") avoids an
    # AttributeError on ``.lower()``; an empty result means "no filter".
    requested_types = {
        dt.lower() for dt in (self.data_types or ()) if isinstance(dt, str)
    }
    if requested_types and category_name.lower() not in requested_types:
        return

    # Increment lexeme count for this language and category.
    self.lexical_category_counts[lang_code][category_name] += 1

    senses = lexeme.get("senses")
    if not senses:
        return

    # A sense is counted once if at least one of its gloss languages is
    # among the valid ISO codes.
    translation_count = sum(
        1
        for sense in senses
        if sense.get("glosses")
        and any(lang in self.valid_iso_codes for lang in sense["glosses"])
    )
    if translation_count > 0:
        self.translation_counts[lang_code][category_name] += translation_count

# MARK: process file
def process_file(self, file_path: str, batch_size: int = 50000):
"""
Expand Down Expand Up @@ -611,3 +637,4 @@ def parse_dump(
# print(f" {i}. {readable_features}")

# print_unique_forms(processor.unique_forms)
# print(processor.unique_forms)
90 changes: 90 additions & 0 deletions tests/cli/test_total.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,3 +274,93 @@ def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes):
def test_total_wrapper_invalid_input(self):
    """
    Calling total_wrapper with no arguments must raise a ValueError.
    """
    self.assertRaises(ValueError, total_wrapper)

# MARK: Using wikidata_dump
@patch("scribe_data.cli.total.parse_wd_lexeme_dump")
def test_total_wrapper_wikidata_dump_flag(self, parse_dump_mock):
    """
    The bare wikidata_dump flag (True, no path) forwards an unset language,
    an unset data type and no dump path to the dump parser.
    """
    expected_kwargs = {
        "language": None,
        "data_types": [None],
        "wikidata_dump_type": ["total"],
        "wikidata_dump_path": None,
    }

    total_wrapper(wikidata_dump=True)

    parse_dump_mock.assert_called_once_with(**expected_kwargs)

@patch("scribe_data.cli.total.parse_wd_lexeme_dump")
def test_total_wrapper_wikidata_dump_path(self, parse_dump_mock):
    """
    A wikidata_dump given as a file path is forwarded as the dump path.
    """
    path_to_dump = "/path/to/dump.json"
    expected_kwargs = {
        "language": None,
        "data_types": [None],
        "wikidata_dump_type": ["total"],
        "wikidata_dump_path": path_to_dump,
    }

    total_wrapper(wikidata_dump=path_to_dump)

    parse_dump_mock.assert_called_once_with(**expected_kwargs)

@patch("scribe_data.cli.total.parse_wd_lexeme_dump")
def test_total_wrapper_wikidata_dump_with_all(self, parse_dump_mock):
    """
    With both wikidata_dump and all_bool set, language and data type
    are forwarded as "all".
    """
    expected_kwargs = {
        "language": "all",
        "data_types": ["all"],
        "wikidata_dump_type": ["total"],
        "wikidata_dump_path": None,
    }

    total_wrapper(wikidata_dump=True, all_bool=True)

    parse_dump_mock.assert_called_once_with(**expected_kwargs)

@patch("scribe_data.cli.total.parse_wd_lexeme_dump")
def test_total_wrapper_wikidata_dump_with_language_and_type(self, parse_dump_mock):
    """
    An explicit language and data type are passed through unchanged
    alongside the dump path.
    """
    path_to_dump = "/path/to/dump.json"
    expected_kwargs = {
        "language": "English",
        "data_types": ["nouns"],
        "wikidata_dump_type": ["total"],
        "wikidata_dump_path": "/path/to/dump.json",
    }

    total_wrapper(language="English", data_type="nouns", wikidata_dump=path_to_dump)

    parse_dump_mock.assert_called_once_with(**expected_kwargs)

# MARK: Using QID
@patch("scribe_data.cli.total.check_qid_is_language")
@patch("scribe_data.cli.total.print_total_lexemes")
def test_total_wrapper_with_qid(self, print_total_mock, check_qid_mock):
    """
    A language supplied as a QID is handed to print_total_lexemes as given.
    """
    language_qid = "Q9217"
    check_qid_mock.return_value = "Thai"

    total_wrapper(language=language_qid)

    print_total_mock.assert_called_once_with(language="Q9217")

@patch("scribe_data.cli.total.check_qid_is_language")
@patch("scribe_data.cli.total.get_total_lexemes")
def test_total_wrapper_with_qid_and_datatype(self, get_total_mock, check_qid_mock):
    """
    A language QID together with a data type is routed to get_total_lexemes.
    """
    check_qid_mock.return_value = "Thai"
    call_kwargs = {"language": "Q9217", "data_type": "nouns"}

    total_wrapper(**call_kwargs)

    get_total_mock.assert_called_once_with(language="Q9217", data_type="nouns")

@patch("scribe_data.cli.total.parse_wd_lexeme_dump")
def test_total_wrapper_qid_with_wikidata_dump(self, parse_dump_mock):
    """
    With a language QID plus wikidata_dump and all_bool, the QID is kept
    as the language while the data type is forwarded as "all".
    """
    expected_kwargs = {
        "language": "Q9217",
        "data_types": ["all"],
        "wikidata_dump_type": ["total"],
        "wikidata_dump_path": None,
    }

    total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True)

    parse_dump_mock.assert_called_once_with(**expected_kwargs)

@patch("scribe_data.cli.total.get_total_lexemes")
def test_get_total_lexemes_with_qid(self, get_total_mock):
    """
    QIDs given for both language and data type reach get_total_lexemes
    unchanged.
    """
    language_qid = "Q9217"
    data_type_qid = "Q1084"  # Q1084 is noun QID

    total_wrapper(language=language_qid, data_type=data_type_qid)

    get_total_mock.assert_called_once_with(language="Q9217", data_type="Q1084")

0 comments on commit cfc2777

Please sign in to comment.