-
Notifications
You must be signed in to change notification settings - Fork 75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Check language metadata #385
Changes from 9 commits
dd179c7
eb79e73
056d796
7149172
562f96a
885dc94
399dd37
e47bf55
e3f8d5f
6808796
208fba7
3ec3e74
3969f30
392977e
d238018
122969d
0c87b59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
import difflib | ||
import json | ||
from pathlib import Path | ||
import sys | ||
|
||
# Locations of the extraction scripts and of the two metadata files, all
# resolved relative to the directory two levels above this file.
LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"

LANGUAGE_METADATA_FILE = (
    Path(__file__).parent.parent / "resources" / "language_metadata.json"
)

DATA_TYPE_METADATA_FILE = (
    Path(__file__).parent.parent / "resources" / "data_type_metadata.json"
)

# NOTE(review): a PR reviewer states that "these" can be obtained from
# src/scribe_data/cli/cli_utils.py, where they already exist -- presumably the
# metadata loaded below; confirm which names before de-duplicating.

# Safe defaults: if a metadata file cannot be read, the error is printed and
# the names below still exist, so later checks do not fail with a NameError.
languages_in_metadata = {}
all_data_types = ()

try:
    with LANGUAGE_METADATA_FILE.open("r", encoding="utf-8") as file:
        language_metadata = json.load(file)

    # Current metadata layout: a list of entries under the "languages" key.
    # (The PR also sketched a flat {language: details} layout with lower-cased
    # keys; it is not in use.)
    # Only properties that are actually present are copied, so an entry that
    # lacks "iso" or "qid" reaches the validation step instead of raising a
    # KeyError here.
    languages_in_metadata = {
        lang["language"]: {
            prop: lang[prop] for prop in ("iso", "qid") if prop in lang
        }
        for lang in language_metadata["languages"]
    }

except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading language metadata: {e}")

try:
    with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file:
        data_type_metadata = json.load(file)

    # The top-level keys of the data type metadata are the data type names.
    all_data_types = tuple(data_type_metadata.keys())

except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading data type metadata: {e}")
|
||
|
||
def get_available_languages(
    extraction_dir: "Path | str | None" = None,
    data_types: "list[str] | tuple[str, ...] | None" = None,
) -> dict[str, dict[str, list[str]]]:
    """
    Get available languages from the data extraction folder.

    Every directory directly inside the extraction folder is a language. A
    directory nested inside a language folder counts as a sub-language unless
    its name closely matches a known data type name, in which case it is
    skipped.

    Parameters
    ----------
    extraction_dir : Path or str, optional
        Folder to scan. Defaults to LANGUAGE_DATA_EXTRACTION_DIR.
    data_types : sequence of str, optional
        Data type names that must not be mistaken for sub-languages.
        Defaults to the module-level all_data_types.

    Returns
    -------
    dict[str, dict[str, list[str]]]
        Lower-cased language names mapped to ``{"sub_languages": [...]}`` when
        sub-languages were found, or to an empty dict otherwise.
    """
    # Resolve defaults lazily so explicit arguments never touch the globals.
    if extraction_dir is None:
        extraction_dir = LANGUAGE_DATA_EXTRACTION_DIR
    if data_types is None:
        data_types = all_data_types
    known_data_types = list(data_types)

    available_languages = {}

    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # (and everything printed from it) stable between runs and platforms.
    for lang_folder in sorted(Path(extraction_dir).iterdir()):
        if not lang_folder.is_dir():
            continue

        # Names are lower-cased for case-insensitive comparison. The fuzzy
        # match (cutoff 0.8) also filters near-miss spellings of data types.
        sub_languages = [
            sub_folder.name.lower()
            for sub_folder in sorted(lang_folder.iterdir())
            if sub_folder.is_dir()
            and not difflib.get_close_matches(
                sub_folder.name.lower(), known_data_types, n=1, cutoff=0.8
            )
        ]

        available_languages[lang_folder.name.lower()] = (
            {"sub_languages": sub_languages} if sub_languages else {}
        )

    return available_languages
|
||
|
||
def get_missing_languages(
    reference_languages: dict, target_languages: dict
) -> list[str]:
    """
    Compare two language dictionaries and return a list of languages and sub-languages
    that exist in target_languages but not in reference_languages.

    Parameters
    ----------
    reference_languages : dict
        A dictionary of languages from the reference source.
    target_languages : dict
        A dictionary of languages from the target source to check for missing entries.
        A language's details may hold a "sub_languages" entry; it is only
        iterated, so a list of names or a mapping keyed by name both work.

    Returns
    -------
    list[str]
        Entries that are in target_languages but not in reference_languages:
        "language - sub_language" for sub-languages, or the bare language name
        for a language without sub-languages.
    """
    missing_languages = []

    for lang, details in target_languages.items():
        # An absent, empty or null "sub_languages" entry all mean the same
        # thing: the language has no sub-languages.
        target_subs = details.get("sub_languages") or ()

        if lang not in reference_languages:
            if target_subs:
                # Parent unknown: report every one of its sub-languages.
                missing_languages.extend(
                    f"{lang} - {sub_lang}" for sub_lang in target_subs
                )
            else:
                # No sub-languages to report, so report the language itself
                # (previously an empty "sub_languages" entry dropped it).
                missing_languages.append(lang)
            continue

        # Parent exists: only sub-languages unknown to the reference are missing.
        known_subs = set(reference_languages[lang].get("sub_languages") or ())
        missing_languages.extend(
            f"{lang} - {sub_lang}"
            for sub_lang in target_subs
            if sub_lang not in known_subs
        )

    return missing_languages
|
||
|
||
def validate_language_properties(languages_dict: dict) -> dict:
    """
    Validate that each language, or each of its sub-languages, has a usable
    'qid' and 'iso' property.

    A property counts as missing when its key is absent or when its value is
    empty or None, since a blank identifier is no more usable than none.

    Parameters
    ----------
    languages_dict : dict
        A dictionary where each key is a language and the value is another
        dictionary containing details about the language. If the language has
        sub-languages, they are stored under the 'sub_languages' key as a
        mapping of sub-language name to its details; in that case only the
        sub-languages are validated, not the parent entry.

    Returns
    -------
    dict
        A dictionary with two lists:
        - "missing_qids": Languages or sub-languages missing the 'qid' property.
        - "missing_isos": Languages or sub-languages missing the 'iso' property.

        Each entry is "parent_language - sub_language" for sub-languages, or
        simply "parent_language" for languages without sub-languages.
    """
    missing_qids = []
    missing_isos = []

    def _record_gaps(label: str, properties: dict) -> None:
        # One shared check for languages and sub-languages alike.
        if not properties.get("qid"):
            missing_qids.append(label)
        if not properties.get("iso"):
            missing_isos.append(label)

    for lang, details in languages_dict.items():
        if "sub_languages" in details:
            for sub_lang, sub_details in details["sub_languages"].items():
                _record_gaps(f"{lang} - {sub_lang}", sub_details)
        else:
            _record_gaps(lang, details)

    return {"missing_qids": missing_qids, "missing_isos": missing_isos}
|
||
|
||
def check_language_metadata():
    """
    Cross-check the language metadata file against the extraction folder.

    Two things are verified:

    1. Every language found in `language_data_extraction` is listed in
       `language_metadata.json`, and vice versa.
    2. Every language in `language_metadata.json` carries the required
       properties: 'qid' (a unique identifier) and 'iso' (ISO language code).

    When everything is consistent a confirmation line is printed. Otherwise
    each group of problems is printed as a bulleted list and the process
    exits with status 1.
    """
    languages_in_directory = get_available_languages()

    absent_from_metadata = get_missing_languages(
        languages_in_metadata, languages_in_directory
    )
    absent_from_extraction = get_missing_languages(
        languages_in_directory, languages_in_metadata
    )
    property_gaps = validate_language_properties(languages_in_metadata)
    without_qid = property_gaps["missing_qids"]
    without_iso = property_gaps["missing_isos"]

    # Happy path first: nothing to report.
    if not (
        absent_from_metadata or absent_from_extraction or without_qid or without_iso
    ):
        print(
            "All languages match between language_metadata.json and language_data_extraction; languages in language_metadata.json have the correct properties."
        )
        return

    if absent_from_metadata or absent_from_extraction:
        print(
            "There are missing languages or inconsistencies between language_metadata.json and language_data_extraction.\n"
        )

    # Each report section is a heading plus the entries listed beneath it;
    # empty sections are skipped entirely.
    report_sections = (
        ("Languages missing from language_metadata.json:", absent_from_metadata),
        ("\nLanguages missing from language_data_extraction:", absent_from_extraction),
        ("\nLanguages missing the `qid` property:", without_qid),
        ("\nLanguages missing the `iso` property:", without_iso),
    )
    for heading, entries in report_sections:
        if not entries:
            continue
        print(heading)
        for entry in entries:
            print(f" • {entry.title()}")

    # A non-zero exit status signals the failure to the calling process.
    sys.exit(1)


if __name__ == "__main__":
    check_language_metadata()
Comment on lines
+212
to
+213
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
If this is for a test, then it's fine, but we will be calling this in [inline reference missing from the page capture].
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Thank you! Good to know 😊
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@catreedle you can get these from
src/scribe_data/cli/cli_utils.py
- they already exist there