Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check language metadata #385

Merged
merged 17 commits into from
Oct 24, 2024
Merged
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 231 additions & 0 deletions src/scribe_data/check/check_language_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import difflib
import json
from pathlib import Path
import sys

# Directory containing one sub-folder per language (and possibly per sub-language).
LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"

# JSON metadata listing each supported language with its `iso` and `qid` properties.
LANGUAGE_METADATA_FILE = (
    Path(__file__).parent.parent / "resources" / "language_metadata.json"
)

# JSON metadata whose keys are the supported data-type names.
DATA_TYPE_METADATA_FILE = (
    Path(__file__).parent.parent / "resources" / "data_type_metadata.json"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@catreedle you can get these from src/scribe_data/cli/cli_utils.py - they already exist there


# Load the language metadata and index it by language name so the checks
# below can look up each language's properties.
try:
    with LANGUAGE_METADATA_FILE.open("r", encoding="utf-8") as file:
        language_metadata = json.load(file)

    # Keep every property except the name itself rather than indexing
    # lang["iso"]/lang["qid"] directly: a language missing one of those keys
    # must survive loading so validate_language_properties can report it
    # (direct indexing would raise KeyError here, before any check ran).
    languages_in_metadata = {
        lang["language"]: {k: v for k, v in lang.items() if k != "language"}
        for lang in language_metadata["languages"]
    }

except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading language metadata: {e}")
    # Nothing below can run without this metadata; a plain `print` would let
    # execution continue and fail later with a NameError.
    sys.exit(1)

# Load the data-type metadata; its keys name the data-type folders (used to
# tell data-type sub-folders apart from sub-language folders).
try:
    with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file:
        data_type_metadata = json.load(file)

    all_data_types = tuple(data_type_metadata.keys())

except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading data type metadata: {e}")
    # `all_data_types` is required by get_available_languages; exit instead of
    # continuing and failing later with a NameError.
    sys.exit(1)


def get_available_languages() -> dict[str, dict]:
    """
    Get available languages from the data extraction folder.

    Returns
    -------
    dict[str, dict]
        A dictionary keyed by lowercased language name. The value is an empty
        dict when the language has no sub-languages, or
        ``{"sub_languages": [<lowercased sub-language names>]}`` otherwise.
        (The previous annotation ``dict[str, list[str]]`` did not match the
        actual return shape.)
    """
    available_languages = {}

    for lang_folder in LANGUAGE_DATA_EXTRACTION_DIR.iterdir():
        if not lang_folder.is_dir():
            continue

        # Normalize to lowercase for case-insensitive comparison with metadata.
        lang_name = lang_folder.name.lower()

        sub_languages = []
        for sub_folder in lang_folder.iterdir():
            if not sub_folder.is_dir():
                continue

            sub_lang_name = sub_folder.name.lower()

            # Data-type folders also live inside language folders; use fuzzy
            # matching against the known data types so close spellings are
            # excluded and only genuine sub-languages remain.
            close_matches = difflib.get_close_matches(
                sub_lang_name, all_data_types, n=1, cutoff=0.8
            )
            if not close_matches:
                sub_languages.append(sub_lang_name)

        available_languages[lang_name] = (
            {"sub_languages": sub_languages} if sub_languages else {}
        )

    return available_languages


def get_missing_languages(
    reference_languages: dict, target_languages: dict
) -> list[str]:
    """
    Find entries present in target_languages but absent from reference_languages.

    Parameters
    ----------
    reference_languages : dict
        A dictionary of languages from the reference source.
    target_languages : dict
        A dictionary of languages from the target source to check for missing entries.

    Returns
    -------
    list[str]
        Missing entries, formatted as "<language>" for top-level languages and
        "<language> - <sub-language>" for sub-languages.
    """
    missing = []

    for lang, details in target_languages.items():
        has_sub_languages = "sub_languages" in details

        if lang not in reference_languages:
            # Whole language absent: report each sub-language when there are
            # any, otherwise report the language itself.
            if has_sub_languages:
                missing.extend(
                    f"{lang} - {sub}" for sub in details["sub_languages"]
                )
            else:
                missing.append(lang)
        elif has_sub_languages:
            # Parent exists on both sides; report only the sub-languages the
            # reference lacks.
            known_subs = set(reference_languages[lang].get("sub_languages", {}))
            missing.extend(
                f"{lang} - {sub}"
                for sub in details["sub_languages"]
                if sub not in known_subs
            )

    return missing


def validate_language_properties(languages_dict: dict) -> dict:
    """
    Validates the presence of 'qid' and 'iso' properties for each language and
    its sub-languages.

    Parameters
    ----------
    languages_dict : dict
        Maps each language name to a dict of its details. Languages with
        sub-languages store them under the 'sub_languages' key.

    Returns
    -------
    dict
        Two lists under the keys "missing_qids" and "missing_isos". Entries are
        "parent_language - sub_language" for sub-languages, or simply
        "parent_language" for parent languages.
    """
    missing_qids = []
    missing_isos = []

    def _record(label: str, properties: dict) -> None:
        # Note which of the required identifiers this entry lacks.
        if "qid" not in properties:
            missing_qids.append(label)
        if "iso" not in properties:
            missing_isos.append(label)

    for lang, details in languages_dict.items():
        if "sub_languages" in details:
            # Only the sub-languages carry qid/iso; validate each one.
            for sub_lang, sub_details in details["sub_languages"].items():
                _record(f"{lang} - {sub_lang}", sub_details)
        else:
            _record(lang, details)

    return {"missing_qids": missing_qids, "missing_isos": missing_isos}


def check_language_metadata():
    """
    Validates language metadata by performing the following checks:

    1. Ensures that all languages listed in `language_data_extraction` are
       present in `language_metadata.json`, and vice versa.
    2. Checks that each language in `language_metadata.json` has the required
       properties: 'qid' (a unique identifier) and 'iso' (ISO language code).

    Prints a report of any inconsistencies and exits with status 1 when any
    are found, so CI can fail on metadata drift.
    """
    languages_in_directory = get_available_languages()
    missing_in_metadata = get_missing_languages(
        languages_in_metadata, languages_in_directory
    )
    missing_in_extraction = get_missing_languages(
        languages_in_directory, languages_in_metadata
    )
    property_report = validate_language_properties(languages_in_metadata)
    missing_qids = property_report["missing_qids"]
    missing_isos = property_report["missing_isos"]

    # Success path: nothing missing anywhere.
    if not (missing_in_metadata or missing_in_extraction or missing_qids or missing_isos):
        print(
            "All languages match between language_metadata.json and language_data_extraction; languages in language_metadata.json have the correct properties."
        )
        return

    if missing_in_metadata or missing_in_extraction:
        print(
            "There are missing languages or inconsistencies between language_metadata.json and language_data_extraction.\n"
        )

    if missing_in_metadata:
        print("Languages missing from language_metadata.json:")
        for lang in missing_in_metadata:
            print(f"  • {lang.title()}")

    if missing_in_extraction:
        print("\nLanguages missing from language_data_extraction:")
        for lang in missing_in_extraction:
            print(f"  • {lang.title()}")

    if missing_qids:
        print("\nLanguages missing the `qid` property:")
        for lang in missing_qids:
            print(f"  • {lang.title()}")

    if missing_isos:
        print("\nLanguages missing the `iso` property:")
        for lang in missing_isos:
            print(f"  • {lang.title()}")

    # Non-zero exit status signals failure to the calling workflow.
    sys.exit(1)


# Run the full metadata consistency check when executed as a script.
if __name__ == "__main__":
    check_language_metadata()
Comment on lines +212 to +213
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is for a test, then it's fine, but we will be calling this in the check_project_metadata.yaml file, so there is no need for this :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you! good to know 😊

Loading