diff --git a/CHANGELOG.md b/CHANGELOG.md index 43da0cc3..65b39acd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,11 +17,19 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). ### ✨ Features - Queries for noun genders and other properties that require the Wikidata label service now return their English label rather than auto label that was returning just the Wikidata QID. +- SPARQL queries for English and Portuguese prepositions were added to allow the CLI to query these types of data. +- The convert functionality once again works for lists of languages and all data types for them. ### 🐞 Bug Fixes - SQLite conversion was fixed for all queries ([#527](https://github.com/scribe-org/Scribe-Data/issues/527)). - The data conversion process outputs were improved including capitalizing language names and repeat notices to the user were removed. +- The CLI's `get` command now returns all data types if none is passed. +- The Portuguese verbs query was fixed as it wasn't formatted correctly. +- The emoji keyword functionality was fixed given the new lexeme ID based form of the data. + - Arguments were fixed that were breaking the functionality. + - Languages for the user were capitalized. +- `case` has been renamed `grammaticalCase` in preposition queries to ensure that SQLite reserved keywords are not used. ## Scribe-Data 4.0.0 diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 89000214..a2e2f777 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -384,7 +384,7 @@ def convert_to_sqlite( input_file: str = None, output_dir: str = None, overwrite: bool = False, - identifier_case: str = "snake", + identifier_case: str = "camel", ) -> None: """ Converts a Scribe-Data output file to an SQLite file. @@ -416,16 +416,13 @@ def convert_to_sqlite( ------- A SQLite file saved in the given location. 
""" - if not language: - raise ValueError("Language must be specified for SQLite conversion.") - if input_file: input_file = Path(input_file) - if not input_file.exists(): + if input_file is not None and not input_file.exists(): raise ValueError(f"Input file does not exist: {input_file}") - languages = [language] + languages = [language] if data_type else None specific_tables = [data_type] if data_type else None if output_dir is None: @@ -450,21 +447,22 @@ def convert_to_sqlite( def convert_wrapper( - language: str, - data_type: Union[str, List[str]], + languages: Union[str, List[str]], + data_types: Union[str, List[str]], output_type: str, - input_file: str, + input_files: Union[str, List[str]], output_dir: str = None, overwrite: bool = False, - identifier_case: str = "snake", + identifier_case: str = "camel", + all: bool = False, ): """ Convert data to the specified output type: JSON, CSV/TSV, or SQLite. Parameters ---------- - language : str - The language of the data to convert. + language : Union[str, List[str]] + The language(s) of the data to convert. data_type : Union[str, List[str]] The data type(s) of the data to convert. @@ -472,8 +470,8 @@ def convert_wrapper( output_type : str The desired output format. It can be 'json', 'csv', 'tsv', or 'sqlite'. - input_file : str - The path to the input file. + input_file : Union[str, List[str]] + The path(s) to the input file(s). output_dir : str, optional The output directory where converted files will be stored. Defaults to None. @@ -484,22 +482,27 @@ def convert_wrapper( identifier_case : str The case format for identifiers. Default is "camel". + all : bool + Convert all languages and data types. + Returns ------- - None + None """ output_type = output_type.lower() - print( - f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..." 
- ) + + if languages is not None and data_types is not None: + print( + f"Converting data for {languages.capitalize()} {data_types} to {output_type}..." + ) # Route the function call to the correct conversion function. if output_type == "json": convert_to_json( - language=language, - data_type=data_type, + language=languages, + data_type=data_types, output_type=output_type, - input_file=input_file, + input_file=input_files, output_dir=output_dir, overwrite=overwrite, identifier_case=identifier_case, @@ -507,10 +510,10 @@ def convert_wrapper( elif output_type in {"csv", "tsv"}: convert_to_csv_or_tsv( - language=language, - data_type=data_type, + language=languages, + data_type=data_types, output_type=output_type, - input_file=input_file, + input_file=input_files, output_dir=output_dir, overwrite=overwrite, identifier_case=identifier_case, @@ -518,10 +521,10 @@ def convert_wrapper( elif output_type == "sqlite": convert_to_sqlite( - language=language, - data_type=data_type, + language=languages, + data_type=data_types, output_type=output_type, - input_file=input_file, + input_file=input_files, output_dir=output_dir, overwrite=overwrite, identifier_case=identifier_case, diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 38615200..9cc718b5 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -100,7 +100,7 @@ def main() -> None: help="List all languages and data types.", ) - # MARK: GET + # MARK: Get get_parser = subparsers.add_parser( "get", @@ -206,21 +206,21 @@ def main() -> None: "-lang", "--language", type=str, - required=True, + required=False, help="The language of the file to convert.", ) convert_parser.add_argument( "-dt", "--data-type", type=str, - required=True, + required=False, help="The data type(s) of the file to convert (e.g., nouns, verbs).", ) convert_parser.add_argument( "-if", "--input-file", type=Path, - required=True, + required=False, help="The path to the input file to convert.", ) 
convert_parser.add_argument( @@ -258,6 +258,12 @@ def main() -> None: default="camel", help="The case format for identifiers in the output data (default: camel).", ) + convert_parser.add_argument( + "-a", + "--all", + action=argparse.BooleanOptionalAction, + help="Convert all languages and data types.", + ) # MARK: Setup CLI @@ -296,8 +302,12 @@ def main() -> None: else: get_data( - language=args.language.lower(), - data_type=args.data_type.lower(), + language=args.language.lower() + if args.language is not None + else None, + data_type=args.data_type.lower() + if args.data_type is not None + else None, output_type=args.output_type, output_dir=args.output_dir, outputs_per_entry=args.outputs_per_entry, @@ -323,13 +333,16 @@ def main() -> None: elif args.command in ["convert", "c"]: convert_wrapper( - language=args.language.lower(), - data_type=args.data_type, + languages=args.language.lower() if args.language is not None else None, + data_types=args.data_type.lower() + if args.data_type is not None + else None, output_type=args.output_type, - input_file=args.input_file, + input_files=args.input_file, output_dir=args.output_dir, overwrite=args.overwrite, identifier_case=args.identifier_case, + all=args.all, ) else: diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 1d551f6a..359cef32 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -123,7 +123,7 @@ def create_table(data_type, cols): ] cursor.execute( - f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))" + f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, unique({cols[0]}))" ) def table_insert(data_type, keys): @@ -238,7 +238,13 @@ def table_insert(data_type, keys): if dt in ["nouns", "verbs", "prepositions"]: cols = ["wdLexemeId"] - cols += json_data[list(json_data.keys())[0]].keys() + + all_elem_keys = [ + json_data[k].keys() for k in list(json_data.keys()) + 
] + all_keys_flat = list({k for ks in all_elem_keys for k in ks}) + + cols += all_keys_flat create_table(data_type=dt, cols=cols) cursor.execute(f"DELETE FROM {dt}") # clear existing data diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index a2f17d18..2661f48d 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -81,7 +81,7 @@ def generate_emoji(language, output_dir: str = None): emojis_per_keyword=EMOJI_KEYWORDS_DICT, ): export_formatted_data( - file_path=output_dir, + dir_path=output_dir, formatted_data=emoji_keywords_dict, query_data_in_use=True, language=language, diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index a3f39625..abdf2363 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -117,7 +117,7 @@ def gen_emoji_lexicon( for cldr_char in tqdm( iterable=cldr_dict, - desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language}", + desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language.capitalize()}", unit="cldr characters", ): # Filter CLDR data for emoji characters while not including certain emojis. 
@@ -187,11 +187,24 @@ def gen_emoji_lexicon( ) as f: noun_data = json.load(f) - plurals_to_singulars_dict = { - noun_data[row]["plural"].lower(): row.lower() - for row in noun_data - if noun_data[row]["plural"] != "isPlural" - } + if language not in ["german", "russian"]: + plurals_to_singulars_dict = { + noun_data[row]["plural"].lower(): row.lower() + for row in noun_data + if "singular" in noun_data[row] + and "plural" in noun_data[row] + and noun_data[row]["singular"] != noun_data[row]["plural"] + } + + else: + plurals_to_singulars_dict = { + noun_data[row]["nominativePlural"].lower(): row.lower() + for row in noun_data + if "nominativeSingular" in noun_data[row] + and "nominativePlural" in noun_data[row] + and noun_data[row]["nominativeSingular"] + != noun_data[row]["nominativePlural"] + } for plural, singular in plurals_to_singulars_dict.items(): if plural not in keyword_dict and singular in keyword_dict: diff --git a/src/scribe_data/wikidata/language_data_extraction/bengali/postpositions/query_postpositions.sparql b/src/scribe_data/wikidata/language_data_extraction/bengali/postpositions/query_postpositions.sparql index 4a7c5d59..12e79b1c 100644 --- a/src/scribe_data/wikidata/language_data_extraction/bengali/postpositions/query_postpositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/bengali/postpositions/query_postpositions.sparql @@ -6,7 +6,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q9610 ; @@ -21,6 +21,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . 
} } diff --git a/src/scribe_data/wikidata/language_data_extraction/bengali/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/bengali/prepositions/query_prepositions.sparql index 117202fc..8c9664a6 100644 --- a/src/scribe_data/wikidata/language_data_extraction/bengali/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/bengali/prepositions/query_prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q9610 ; @@ -20,6 +20,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . } } diff --git a/src/scribe_data/wikidata/language_data_extraction/czech/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/czech/prepositions/query_prepositions.sparql index 5df5f199..8440e78f 100644 --- a/src/scribe_data/wikidata/language_data_extraction/czech/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/czech/prepositions/query_prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q9056 ; @@ -21,6 +21,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . ?lemma rdfs:label ?preposition . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . 
} } diff --git a/src/scribe_data/wikidata/language_data_extraction/danish/prepositions/prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/danish/prepositions/prepositions.sparql index 0b06b531..37636c9f 100644 --- a/src/scribe_data/wikidata/language_data_extraction/danish/prepositions/prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/danish/prepositions/prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q9035 ; diff --git a/src/scribe_data/wikidata/language_data_extraction/english/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/english/prepositions/query_prepositions.sparql new file mode 100644 index 00000000..c7db57df --- /dev/null +++ b/src/scribe_data/wikidata/language_data_extraction/english/prepositions/query_prepositions.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All English (Q1860) prepositions (Q4833830) and the given forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?preposition + +WHERE { + ?lexeme dct:language wd:Q1860 ; + wikibase:lexicalCategory wd:Q4833830 ; + wikibase:lemma ?preposition . 
+} diff --git a/src/scribe_data/wikidata/language_data_extraction/estonian/postpositions/query_postpositions.sparql b/src/scribe_data/wikidata/language_data_extraction/estonian/postpositions/query_postpositions.sparql index dc9b16a7..a4d9ae01 100644 --- a/src/scribe_data/wikidata/language_data_extraction/estonian/postpositions/query_postpositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/estonian/postpositions/query_postpositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q9072 ; @@ -20,6 +20,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . } } diff --git a/src/scribe_data/wikidata/language_data_extraction/german/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/german/prepositions/query_prepositions.sparql index be71eedf..7562b702 100644 --- a/src/scribe_data/wikidata/language_data_extraction/german/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/german/prepositions/query_prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q188 ; @@ -21,6 +21,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . ?lemma rdfs:label ?preposition . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . 
} } diff --git a/src/scribe_data/wikidata/language_data_extraction/portuguese/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/portuguese/prepositions/query_prepositions.sparql new file mode 100644 index 00000000..0f146aa2 --- /dev/null +++ b/src/scribe_data/wikidata/language_data_extraction/portuguese/prepositions/query_prepositions.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Portuguese (Q5146) prepositions (Q4833830) and the given forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?preposition + +WHERE { + ?lexeme dct:language wd:Q5146 ; + wikibase:lexicalCategory wd:Q4833830 ; + wikibase:lemma ?preposition . +} diff --git a/src/scribe_data/wikidata/language_data_extraction/portuguese/verbs/query_verbs.sparql b/src/scribe_data/wikidata/language_data_extraction/portuguese/verbs/query_verbs.sparql index 0aa7a00f..4cda267b 100644 --- a/src/scribe_data/wikidata/language_data_extraction/portuguese/verbs/query_verbs.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/portuguese/verbs/query_verbs.sparql @@ -199,3 +199,4 @@ WHERE { ?indicativePluperfectThirdPersonPluralForm ontolex:representation ?indicativePluperfectThirdPersonPlural ; wikibase:grammaticalFeature wd:Q682111, wd:Q623742, wd:Q51929074, wd:Q146786 . 
} +} diff --git a/src/scribe_data/wikidata/language_data_extraction/russian/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/russian/prepositions/query_prepositions.sparql index 617e00a6..7bba9804 100644 --- a/src/scribe_data/wikidata/language_data_extraction/russian/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/russian/prepositions/query_prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q7737 ; @@ -21,6 +21,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . ?lemma rdfs:label ?preposition . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . } } diff --git a/src/scribe_data/wikidata/language_data_extraction/ukrainian/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/ukrainian/prepositions/query_prepositions.sparql index 9264bea7..c6ba7d70 100644 --- a/src/scribe_data/wikidata/language_data_extraction/ukrainian/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/ukrainian/prepositions/query_prepositions.sparql @@ -5,7 +5,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition - ?case + ?grammaticalCase WHERE { ?lexeme dct:language wd:Q8798 ; @@ -21,6 +21,6 @@ WHERE { SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . ?lemma rdfs:label ?preposition . - ?caseForm rdfs:label ?case . + ?caseForm rdfs:label ?grammaticalCase . 
} } diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index bf38c235..92eb540c 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -648,7 +648,7 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat overwrite=True, ) - mock_data_to_sqlite.assert_called_with(["english"], ["nouns"], "snake") + mock_data_to_sqlite.assert_called_with(["english"], ["nouns"], "camel") @patch("scribe_data.cli.convert.Path", autospec=True) @patch("scribe_data.cli.convert.data_to_sqlite", autospec=True) @@ -671,25 +671,15 @@ def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path): overwrite=True, ) - mock_data_to_sqlite.assert_called_with(["english"], ["nouns"], "snake") - - def test_convert_to_sqlite_no_language(self): - with self.assertRaises(ValueError): - convert_to_sqlite( - language=None, - data_type="data_type", - output_type="sqlite", - output_dir="/output", - overwrite=True, - ) + mock_data_to_sqlite.assert_called_with(["english"], ["nouns"], "camel") def test_convert(self): with self.assertRaises(ValueError) as context: convert_wrapper( - language="English", - data_type="nouns", + languages="English", + data_types="nouns", output_type="parquet", - input_file="Data/ecode.csv", + input_files="Data/ecode.csv", output_dir="/output_dir", overwrite=True, ) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 4f6f0058..4b0fa44d 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -98,6 +98,7 @@ def test_list_data_types_specific_language(self, mock_print): call("adverbs"), call("emoji-keywords"), call("nouns"), + call("prepositions"), call("proper-nouns"), call("verbs"), call(),