diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5f9a9063..a8f11b6e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -154,6 +154,13 @@
 pre-commit install # install pre-commit hooks
 # pre-commit run --all-files # lint and fix common problems in the codebase
 ```
+> [!NOTE]
+> If you're having issues with pre-commit and want to submit your changes regardless, you can bypass the pre-commit hooks with the following:
+>
+> ```bash
+> git commit --no-verify -m "COMMIT_MESSAGE"
+> ```
+
 If you face any issues, consider reinstalling Scribe-data by running the following:
 
 ```bash
diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
index cbf94f0c..c3e98e6d 100644
--- a/src/scribe_data/cli/get.py
+++ b/src/scribe_data/cli/get.py
@@ -120,7 +120,7 @@ def prompt_user_download_all():
         parse_wd_lexeme_dump(
             language=language,
             wikidata_dump_type=["form"],
-            data_types=data_types,
+            data_types="all",
             type_output_dir=output_dir,
         )
     else:
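For context on the `get.py` change: when the user confirms downloading everything for a language without passing `-dt`, the local `data_types` variable is `None`, so forwarding it meant the dump parser received no data types at all. Passing the literal string `"all"` makes the intent explicit. A minimal sketch of the resulting call (example values only; in `get.py` these come from the surrounding CLI flow):

```python
from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump

# Example values; in get.py these are derived from the CLI arguments.
language = "English"
output_dir = "scribe_data_json_export"

# The dump parser now receives the explicit marker "all" rather than the
# data_types variable, which is None when the user passes no -dt flag.
parse_wd_lexeme_dump(
    language=language,
    wikidata_dump_type=["form"],
    data_types="all",
    type_output_dir=output_dir,
)
```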
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 1359c2e6..89396f72 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -24,7 +24,6 @@
 from typing import List, Union
 from urllib.error import HTTPError
 
-import requests
 from SPARQLWrapper import JSON
 
 from scribe_data.utils import (
@@ -34,6 +33,7 @@
     language_metadata,
     language_to_qid,
     list_all_languages,
+    check_qid_is_language,
 )
 from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql
 
@@ -124,39 +124,6 @@ def get_datatype_list(language):
     return data_type_metadata
 
 
-def check_qid_is_language(qid: str):
-    """
-    Parameters
-    ----------
-    qid : str
-        The QID to check Wikidata to see if it's a language and return its English label.
-
-    Outputs
-    -------
-    str
-        The English label of the Wikidata language entity.
-
-    Raises
-    ------
-    ValueError
-        An invalid QID that's not a language has been passed.
-    """
-    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
-    request_string = f"{api_endpoint}/entities/items/{qid}"
-
-    request = requests.get(request_string, timeout=5)
-    request_result = request.json()
-
-    if request_result["statements"]["P31"]:
-        instance_of_values = request_result["statements"]["P31"]
-        for val in instance_of_values:
-            if val["value"]["content"] == "Q34770":
-                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
-                return request_result["labels"]["en"]
-
-    raise ValueError("The passed Wikidata QID is not a language.")
-
-
 # MARK: Print
diff --git a/src/scribe_data/resources/wikidata_qids_pids.json b/src/scribe_data/resources/wikidata_qids_pids.json
new file mode 100644
index 00000000..3201e75e
--- /dev/null
+++ b/src/scribe_data/resources/wikidata_qids_pids.json
@@ -0,0 +1,4 @@
+{
+  "instance_of": "P31",
+  "ietf_language_tag": "P305"
+}
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index b1934a6d..153fc293 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -32,6 +32,7 @@
 from typing import Any, Optional
 
 import questionary
+import requests
 from rich import print as rprint
 
 # MARK: Utils Variables
@@ -54,6 +55,9 @@
 LEXEME_FORM_METADATA_FILE = (
     Path(__file__).parent / "resources" / "lexeme_form_metadata.json"
 )
+WIKIDATA_QIDS_PIDS_FILE = (
+    Path(__file__).parent / "resources" / "wikidata_qids_pids.json"
+)
 
 DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)
 
 try:
@@ -78,6 +82,13 @@
 except (IOError, json.JSONDecodeError) as e:
     print(f"Error reading lexeme form metadata: {e}")
 
+try:
+    with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file:
+        wikidata_qids_pids = json.load(file)
+
+except (IOError, json.JSONDecodeError) as e:
+    print(f"Error reading Wikidata QID/PID metadata: {e}")
+
 language_map = {}
 language_to_qid = {}
 
@@ -736,3 +747,72 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool:
         return choice == "Skip process"
 
     return False
+
+
+def check_qid_is_language(qid: str):
+    """
+    Parameters
+    ----------
+    qid : str
+        The QID to check against Wikidata to see if it references a language.
+
+    Outputs
+    -------
+    str
+        The English label of the Wikidata language entity.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    """
+    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
+    request_string = f"{api_endpoint}/entities/items/{qid}"
+
+    request = requests.get(request_string, timeout=5)
+    request_result = request.json()
+
+    if request_result["statements"][wikidata_qids_pids["instance_of"]]:
+        instance_of_values = request_result["statements"][
+            wikidata_qids_pids["instance_of"]
+        ]
+        for val in instance_of_values:
+            if val["value"]["content"] == "Q34770":
+                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
+                return request_result["labels"]["en"]
+
+    raise ValueError("The passed Wikidata QID is not a language.")
+
+
+def get_language_iso_code(qid: str):
+    """
+    Parameters
+    ----------
+    qid : str
+        The Wikidata QID of the language.
+
+    Outputs
+    -------
+    str
+        The ISO code of the language.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    KeyError
+        The ISO code for the language is not available.
+    """
+    api_endpoint = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={qid}&props=claims&format=json"
+    response = requests.get(api_endpoint, timeout=5)
+    data = response.json()
+
+    try:
+        return data["entities"][qid]["claims"][wikidata_qids_pids["ietf_language_tag"]][
+            0
+        ]["mainsnak"]["datavalue"]["value"]
+
+    except ValueError:
+        raise ValueError("The passed Wikidata QID is not a language.")
+    except KeyError:
+        raise KeyError("The ISO code for the language is not available.")
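For reviewers, a quick sketch of how the two helpers now in `scribe_data.utils` behave. Both make live requests to Wikidata, so these calls need network access; Q1860 is the QID for English (whose P305 value is "en"), and Q5 is human:

```python
from scribe_data.utils import check_qid_is_language, get_language_iso_code

# Prints "English (Q1860) is a language." and returns the English label.
label = check_qid_is_language("Q1860")  # -> "English"

# Reads the first P305 (IETF language tag) claim via wbgetentities.
iso_code = get_language_iso_code("Q1860")  # -> "en"

# A QID that exists but is not a language raises ValueError.
check_qid_is_language("Q5")  # ValueError: The passed Wikidata QID is not a language.
```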
diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py
index 45f00d19..cea8de12 100644
--- a/src/scribe_data/wiktionary/parse_dump.py
+++ b/src/scribe_data/wiktionary/parse_dump.py
@@ -33,6 +33,8 @@
     check_index_exists,
     data_type_metadata,
     language_metadata,
+    get_language_iso_code,
+    check_qid_is_language,
 )
 from tqdm import tqdm
 
@@ -81,7 +83,6 @@ def __init__(
 
         # Build map from ISO to full language name.
         self.iso_to_name = self._build_iso_mapping()
-
         # For "total" usage.
         self.lexical_category_counts = defaultdict(Counter)
         self.translation_counts = defaultdict(Counter)
@@ -101,120 +102,18 @@
             if iso_code := data.get("iso"):
                 iso_mapping[iso_code] = lang_name
 
-        return iso_mapping
-
-    # MARK: process total
-    def _process_lexeme_total(self, lexeme: dict) -> None:
-        """
-        Gather stats if 'total' is in parse_type: how many entries per language & category,
-        how many translations, etc.
-        """
-        lexicalCategory = lexeme.get("lexicalCategory")
-        if not lexicalCategory or lexicalCategory not in data_type_metadata.values():
-            return
-
-        category_name = self._category_lookup.get(lexicalCategory)
-        if not category_name:
-            return
-
-        # Update counters.
-        lemmas = lexeme.get("lemmas", {})
-        for lemma in lemmas.values():
-            lang = lemma.get("language")
-
-            if lang in self.iso_to_name:
-                self.lexical_category_counts[lang][category_name] += 1
-                translation_count = sum(
-                    len(sense.get("glosses", {})) for sense in lexeme.get("senses", [])
-                )
-                self.translation_counts[lang][category_name] += translation_count
-
-                break
-
-    # MARK: process translations
-    def _process_lexeme_translations(self, lexeme: dict) -> None:
-        """
-        Process gloss-based translations if 'translations' is in parse_type.
-        Store them in self.translations_index.
-        """
-        lemmas = lexeme.get("lemmas", {})
-        qid = lexeme.get("lexicalCategory")
-
-        if not (lemmas and qid):
-            return
-
-        category_name = self._category_lookup.get(qid)
-        if not category_name:
-            return
-
-        # Only store first valid lemma for translations.
-        for lang_code, lemma_data in lemmas.items():
-            if lang_code not in self.iso_to_name:
-                continue
-
-            word = lemma_data.get("value", "").lower()
-            if not word:
-                continue
-
-            # Build translations from sense glosses.
-            translations = {}
-            for sense in lexeme.get("senses", []):
-                for sense_lang_code, gloss in sense.get("glosses", {}).items():
-                    if sense_lang_code in self.iso_to_name:
-                        translations[sense_lang_code] = gloss["value"]
+        for language in self.target_iso:
+            if language.lower().startswith("q") and language[1:].isdigit():
+                qid_to_lang = check_qid_is_language(language.upper())
+                if qid_to_lang:
+                    iso_code = get_language_iso_code(language.upper())
+                    iso_mapping[iso_code] = qid_to_lang
+                    print(f"ISO code for {language} is {iso_code}")
 
-            if translations:
-                self.translations_index[word][lang_code][category_name] = translations
-
-            break  # only handle the first lemma
-
-    # MARK: process forms
-    def _process_lexeme_forms(self, lexeme: dict) -> None:
-        """
-        Process forms for categories in self.data_types if 'form' is in parse_type.
-        Store them in self.forms_index.
-        """
-        lemmas = lexeme.get("lemmas", {})
-        lexical_category = lexeme.get("lexicalCategory")
-
-        # Skip if category missing or not recognized.
-        if not lexical_category or lexical_category not in data_type_metadata.values():
-            return
-
-        # Convert Q1084 -> "nouns", etc.
-        category_name = self._category_lookup.get(lexical_category)
-        if not category_name:
-            return
-
-        # If the category_name is NOT in our data_types list, skip
-        # e.g., category_name = "nouns", but user didn't request "nouns" in data_types.
-        if category_name not in self.data_types:
-            return
-
-        # Process forms.
-        for lang_code, lemma_data in lemmas.items():
-            if lang_code not in self.iso_to_name:
-                continue
-
-            word = lemma_data.get("value", "").lower()
-            if not word:
-                continue
-
-            forms_data = defaultdict(list)
-            for form in lexeme.get("forms", []):
-                representations = form.get("representations", {})
-                grammatical_features = form.get("grammaticalFeatures", [])
-
-                for rep_lang, rep_data in representations.items():
-                    if rep_lang == lang_code:
-                        if form_value := rep_data.get("value"):
-                            forms_data[form_value].extend(grammatical_features)
-
-            if forms_data:
-                self.forms_index[word][lang_code][category_name] = dict(forms_data)
-                self.forms_counts[lang_code][category_name] += len(forms_data)
-
-            break  # only first valid lemma
+        return iso_mapping
 
     # MARK: process lines
     def process_lines(self, line: str) -> None:
@@ -385,6 +284,12 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N
             for word, lang_data in self.translations_index.items()
             if language_iso in lang_data
         }
+
+        # Check if filtered data is empty before saving.
+        if not filtered:
+            print(f"No translations found for {language_iso}, skipping export...")
+            return
+
         self._save_by_language(filtered, filepath, language_iso, "translations")
 
     # MARK: export forms
@@ -418,6 +323,11 @@
         else:
             filtered[word] = {language_iso: lang_data[language_iso]}
 
+        # Check if filtered data is empty before saving.
+        if not filtered:
+            print(f"No forms found for {language_iso}, skipping export...")
+            return
+
         self._save_by_language(
             filtered, filepath, language_iso, data_type or "forms"
         )
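To make the new branch in `_build_iso_mapping` concrete, here is a hedged walkthrough assuming `target_iso` contains a QID string rather than an ISO code or language name (both helper calls hit the Wikidata API):

```python
iso_mapping = {}
language = "Q1860"  # assumed user input: a QID instead of an ISO code

if language.lower().startswith("q") and language[1:].isdigit():
    # Returns the English label, or raises ValueError for non-language QIDs.
    qid_to_lang = check_qid_is_language(language.upper())  # -> "English"

    if qid_to_lang:
        # Resolve the QID's P305 (IETF language tag) claim.
        iso_code = get_language_iso_code(language.upper())  # -> "en"
        iso_mapping[iso_code] = qid_to_lang  # {"en": "English"}
```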
diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py
index 9e1fdbfb..54cf389d 100644
--- a/tests/cli/test_get.py
+++ b/tests/cli/test_get.py
@@ -81,7 +81,7 @@ def test_get_all_data_types_for_language_user_says_yes(
         mock_parse.assert_called_once_with(
             language="English",
             wikidata_dump_type=["form"],
-            data_types=None,  # because data_types = [data_type] if provided else None
+            data_types="all",  # when only a language is given, all data types are requested
             type_output_dir="scribe_data_json_export",  # default for JSON
         )
         mock_query_data.assert_not_called()
diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py
index a8145f04..7ede34b4 100644
--- a/tests/cli/test_total.py
+++ b/tests/cli/test_total.py
@@ -20,16 +20,24 @@
 -->
 """
 
+import json
 import unittest
 from unittest.mock import MagicMock, call, patch
 
 from scribe_data.cli.total import (
-    check_qid_is_language,
     get_datatype_list,
     get_qid_by_input,
     get_total_lexemes,
     total_wrapper,
 )
+from scribe_data.utils import WIKIDATA_QIDS_PIDS_FILE, check_qid_is_language
+
+try:
+    with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file:
+        wikidata_qids_pids = json.load(file)
+
+except (IOError, json.JSONDecodeError) as e:
+    print(f"Error reading Wikidata QID/PID metadata: {e}")
 
 
 class TestTotalLexemes(unittest.TestCase):
@@ -213,11 +221,13 @@ def test_get_datatype_list_no_data_types(self, mock_dir):
 
 
 class TestCheckQidIsLanguage(unittest.TestCase):
-    @patch("scribe_data.cli.total.requests.get")
+    @patch("scribe_data.utils.requests.get")
     def test_check_qid_is_language_valid(self, mock_get):
         mock_response = MagicMock()
         mock_response.json.return_value = {
-            "statements": {"P31": [{"value": {"content": "Q34770"}}]},
+            "statements": {
+                wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q34770"}}]
+            },
             "labels": {"en": "English"},
         }
         mock_get.return_value = mock_response
@@ -228,11 +238,13 @@ def test_check_qid_is_language_valid(self, mock_get):
 
         self.assertEqual(result, "English")
         mock_print.assert_called_once_with("English (Q1860) is a language.\n")
 
-    @patch("scribe_data.cli.total.requests.get")
+    @patch("scribe_data.utils.requests.get")
     def test_check_qid_is_language_invalid(self, mock_get):
         mock_response = MagicMock()
         mock_response.json.return_value = {
-            "statements": {"P31": [{"value": {"content": "Q5"}}]},
+            "statements": {
+                wikidata_qids_pids["instance_of"]: [{"value": {"content": "Q5"}}]
+            },
             "labels": {"en": "Human"},
        }
         mock_get.return_value = mock_response
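A final note on the test changes: mocks must patch `requests.get` where it is now looked up, which is `scribe_data.utils`, not `scribe_data.cli.total`. The pattern these tests rely on, reduced to its core:

```python
from unittest.mock import patch

# requests is imported by scribe_data.utils, so the attribute lookup happens
# there; patching the old scribe_data.cli.total path would not intercept calls.
with patch("scribe_data.utils.requests.get") as mock_get:
    mock_get.return_value.json.return_value = {
        "statements": {"P31": [{"value": {"content": "Q34770"}}]},
        "labels": {"en": "English"},
    }
    # check_qid_is_language("Q1860") would now return "English" without network access.
```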