From b6af7e4f3daaa3e07d0b47c4bd882de8a2be191c Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:49:03 +0530 Subject: [PATCH 1/5] Create check_data_type_metadata.py --- .../check/check_data_type_metadata.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 src/scribe_data/check/check_data_type_metadata.py diff --git a/src/scribe_data/check/check_data_type_metadata.py b/src/scribe_data/check/check_data_type_metadata.py new file mode 100644 index 000000000..95bb28bab --- /dev/null +++ b/src/scribe_data/check/check_data_type_metadata.py @@ -0,0 +1,56 @@ +import os +import sys +from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, + data_type_metadata, +) + + +def check_data_type_metadata(output_file): + """ + Check that subdirectories named for data types in language directories + are also reflected in the data_type_metadata.json file, accounting for meta-languages. + """ + # Extract valid data types from data_type_metadata + valid_data_types = set(data_type_metadata.keys()) + + def check_language_subdirs(lang_dir, meta_lang=None): + discrepancies = [] + + for language in lang_dir.iterdir(): + if language.is_dir(): + meta_language = meta_lang or language.name.lower() + data_types_in_dir = [] + + for data_type in language.iterdir(): + if data_type.is_dir(): + data_types_in_dir.append(data_type.name.lower()) + + # Compare with valid data types + missing_data_types = set(data_types_in_dir) - valid_data_types + extra_data_types = valid_data_types - set(data_types_in_dir) + + if missing_data_types: + discrepancies.append(f"Missing in metadata for '{meta_language}': {missing_data_types}") + if extra_data_types: + discrepancies.append(f"Extra in directory for '{meta_language}': {extra_data_types}") + + # Recursively check sub-languages (if applicable) + sub_lang_dir = language / 'sub-languages' + if sub_lang_dir.exists(): + discrepancies.extend(check_language_subdirs(sub_lang_dir, meta_language)) + + return discrepancies + + # Start checking from the base language directory + discrepancies = check_language_subdirs(LANGUAGE_DATA_EXTRACTION_DIR) + + # Store discrepancies in the output file + with open(output_file, 'w', encoding='utf-8') as f: + if discrepancies: + for discrepancy in discrepancies: + f.write(discrepancy + '\n') + else: + f.write("All data type metadata is up-to-date!\n") + + print(f"Discrepancies stored in: {output_file}") From ee1afb67a9cdcd7727b456cee4fd22325b5d80f6 Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:57:24 +0530 Subject: [PATCH 2/5] Update check_data_type_metadata.py --- src/scribe_data/check/check_data_type_metadata.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scribe_data/check/check_data_type_metadata.py b/src/scribe_data/check/check_data_type_metadata.py index 95bb28bab..962268c4f 100644 --- a/src/scribe_data/check/check_data_type_metadata.py +++ b/src/scribe_data/check/check_data_type_metadata.py @@ -1,5 +1,3 @@ -import os -import sys from scribe_data.cli.cli_utils import ( LANGUAGE_DATA_EXTRACTION_DIR, data_type_metadata, From fd25c2acf0412ffa00a07982376f4d8a39906f22 Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Thu, 17 Oct 2024 20:16:31 +0530 Subject: [PATCH 3/5] Update src/scribe_data/check/check_data_type_metadata.py Co-authored-by: Akindele Michael <49593618+DeleMike@users.noreply.github.com> --- src/scribe_data/check/check_data_type_metadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/check/check_data_type_metadata.py b/src/scribe_data/check/check_data_type_metadata.py index 962268c4f..8063562dc 100644 --- a/src/scribe_data/check/check_data_type_metadata.py +++ b/src/scribe_data/check/check_data_type_metadata.py @@ -19,7 +19,9 @@ def check_language_subdirs(lang_dir, meta_lang=None): if language.is_dir(): meta_language = meta_lang or language.name.lower() data_types_in_dir = [] - +# Skip sub-languages if they are not explicitly listed in the metadata +if is_sub_language and meta_language not in data_type_metadata: + continue for data_type in language.iterdir(): if data_type.is_dir(): data_types_in_dir.append(data_type.name.lower()) From ec399ae1ca18da3b2f04c69997f4f2d8aa82c150 Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:24:38 +0530 Subject: [PATCH 4/5] Update check_data_type_metadata.py Corrected indentation --- src/scribe_data/check/check_data_type_metadata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/check/check_data_type_metadata.py b/src/scribe_data/check/check_data_type_metadata.py index 8063562dc..261acfc5f 100644 --- a/src/scribe_data/check/check_data_type_metadata.py +++ b/src/scribe_data/check/check_data_type_metadata.py @@ -19,9 +19,9 @@ def check_language_subdirs(lang_dir, meta_lang=None): if language.is_dir(): meta_language = meta_lang or language.name.lower() data_types_in_dir = [] -# Skip sub-languages if they are not explicitly listed in the metadata -if is_sub_language and meta_language not in data_type_metadata: - continue + + if is_sub_language and meta_language not in data_type_metadata: + continue for data_type in language.iterdir(): if data_type.is_dir(): data_types_in_dir.append(data_type.name.lower()) @@ -42,6 +42,7 @@ def check_language_subdirs(lang_dir, meta_lang=None): return discrepancies + # Start checking from the base language directory discrepancies = check_language_subdirs(LANGUAGE_DATA_EXTRACTION_DIR) From d3a070875064af7774a8f419558d32ea37c8edbf Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:09:14 +0530 Subject: [PATCH 5/5] Update check_data_type_metadata.py --- src/scribe_data/check/check_data_type_metadata.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scribe_data/check/check_data_type_metadata.py b/src/scribe_data/check/check_data_type_metadata.py index 261acfc5f..cfca54e2e 100644 --- a/src/scribe_data/check/check_data_type_metadata.py +++ b/src/scribe_data/check/check_data_type_metadata.py @@ -19,9 +19,7 @@ def check_language_subdirs(lang_dir, meta_lang=None): if language.is_dir(): meta_language = meta_lang or language.name.lower() data_types_in_dir = [] - - if is_sub_language and meta_language not in data_type_metadata: - continue + for data_type in language.iterdir(): if data_type.is_dir(): data_types_in_dir.append(data_type.name.lower())