Skip to content

Commit

Permalink
Add preposition queries and fix sqlite conversion for all data
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Dec 9, 2024
1 parent 1f9f418 commit 6355649
Show file tree
Hide file tree
Showing 19 changed files with 136 additions and 75 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
### ✨ Features

- Queries for noun genders and other properties that require the Wikidata label service now return their English label rather than the auto label, which was returning just the Wikidata QID.
- SPARQL queries for English and Portuguese prepositions were added to allow the CLI to query these types of data.
- The convert functionality once again works for lists of languages and all of their data types.

### 🐞 Bug Fixes

- SQLite conversion was fixed for all queries ([#527](https://github.com/scribe-org/Scribe-Data/issues/527)).
- The data conversion process outputs were improved, including capitalizing language names, and repeated notices to the user were removed.
- The CLI's `get` command now returns all data types if none is passed.
- The Portuguese verbs query was fixed as it wasn't formatted correctly.
- The emoji keyword functionality was fixed given the new lexeme ID based form of the data.
- CLI arguments that were breaking the functionality were fixed.
- Languages for the user were capitalized.
- `case` has been renamed `grammaticalCase` in preposition queries to ensure that SQLite reserved keywords are not used.

## Scribe-Data 4.0.0

Expand Down
57 changes: 30 additions & 27 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def convert_to_sqlite(
input_file: str = None,
output_dir: str = None,
overwrite: bool = False,
identifier_case: str = "snake",
identifier_case: str = "camel",
) -> None:
"""
Converts a Scribe-Data output file to an SQLite file.
Expand Down Expand Up @@ -416,16 +416,13 @@ def convert_to_sqlite(
-------
A SQLite file saved in the given location.
"""
if not language:
raise ValueError("Language must be specified for SQLite conversion.")

if input_file:
input_file = Path(input_file)

if not input_file.exists():
if input_file is not None and not input_file.exists():
raise ValueError(f"Input file does not exist: {input_file}")

languages = [language]
languages = [language] if data_type else None
specific_tables = [data_type] if data_type else None

if output_dir is None:
Expand All @@ -450,30 +447,31 @@ def convert_to_sqlite(


def convert_wrapper(
language: str,
data_type: Union[str, List[str]],
languages: Union[str, List[str]],
data_types: Union[str, List[str]],
output_type: str,
input_file: str,
input_files: Union[str, List[str]],
output_dir: str = None,
overwrite: bool = False,
identifier_case: str = "snake",
identifier_case: str = "camel",
all: bool = False,
):
"""
Convert data to the specified output type: JSON, CSV/TSV, or SQLite.
Parameters
----------
language : str
The language of the data to convert.
language : Union[str, List[str]]
The language(s) of the data to convert.
data_type : Union[str, List[str]]
The data type(s) of the data to convert.
output_type : str
The desired output format. It can be 'json', 'csv', 'tsv', or 'sqlite'.
input_file : str
The path to the input file.
input_file : Union[str, List[str]]
The path(s) to the input file(s).
output_dir : str, optional
The output directory where converted files will be stored. Defaults to None.
Expand All @@ -484,44 +482,49 @@ def convert_wrapper(
identifier_case : str
The case format for identifiers. Default is "camel".
all : bool
Convert all languages and data types.
Returns
-------
None
None
"""
output_type = output_type.lower()
print(
f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..."
)

if languages is not None and data_types is not None:
print(
f"Converting data for {languages.capitalize()} {data_types} to {output_type}..."
)

# Route the function call to the correct conversion function.
if output_type == "json":
convert_to_json(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
)

elif output_type in {"csv", "tsv"}:
convert_to_csv_or_tsv(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
)

elif output_type == "sqlite":
convert_to_sqlite(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
Expand Down
31 changes: 22 additions & 9 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def main() -> None:
help="List all languages and data types.",
)

# MARK: GET
# MARK: Get

get_parser = subparsers.add_parser(
"get",
Expand Down Expand Up @@ -206,21 +206,21 @@ def main() -> None:
"-lang",
"--language",
type=str,
required=True,
required=False,
help="The language of the file to convert.",
)
convert_parser.add_argument(
"-dt",
"--data-type",
type=str,
required=True,
required=False,
help="The data type(s) of the file to convert (e.g., nouns, verbs).",
)
convert_parser.add_argument(
"-if",
"--input-file",
type=Path,
required=True,
required=False,
help="The path to the input file to convert.",
)
convert_parser.add_argument(
Expand Down Expand Up @@ -258,6 +258,12 @@ def main() -> None:
default="camel",
help="The case format for identifiers in the output data (default: camel).",
)
convert_parser.add_argument(
"-a",
"--all",
action=argparse.BooleanOptionalAction,
help="Convert all languages and data types.",
)

# MARK: Setup CLI

Expand Down Expand Up @@ -296,8 +302,12 @@ def main() -> None:

else:
get_data(
language=args.language.lower(),
data_type=args.data_type.lower(),
language=args.language.lower()
if args.language is not None
else None,
data_type=args.data_type.lower()
if args.data_type is not None
else None,
output_type=args.output_type,
output_dir=args.output_dir,
outputs_per_entry=args.outputs_per_entry,
Expand All @@ -323,13 +333,16 @@ def main() -> None:

elif args.command in ["convert", "c"]:
convert_wrapper(
language=args.language.lower(),
data_type=args.data_type,
languages=args.language.lower() if args.language is not None else None,
data_types=args.data_type.lower()
if args.data_type is not None
else None,
output_type=args.output_type,
input_file=args.input_file,
input_files=args.input_file,
output_dir=args.output_dir,
overwrite=args.overwrite,
identifier_case=args.identifier_case,
all=args.all,
)

else:
Expand Down
10 changes: 8 additions & 2 deletions src/scribe_data/load/data_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def create_table(data_type, cols):
]

cursor.execute(
f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))"
f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, unique({cols[0]}))"
)

def table_insert(data_type, keys):
Expand Down Expand Up @@ -238,7 +238,13 @@ def table_insert(data_type, keys):

if dt in ["nouns", "verbs", "prepositions"]:
cols = ["wdLexemeId"]
cols += json_data[list(json_data.keys())[0]].keys()

all_elem_keys = [
json_data[k].keys() for k in list(json_data.keys())
]
all_keys_flat = list({k for ks in all_elem_keys for k in ks})

cols += all_keys_flat
create_table(data_type=dt, cols=cols)
cursor.execute(f"DELETE FROM {dt}") # clear existing data

Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def generate_emoji(language, output_dir: str = None):
emojis_per_keyword=EMOJI_KEYWORDS_DICT,
):
export_formatted_data(
file_path=output_dir,
dir_path=output_dir,
formatted_data=emoji_keywords_dict,
query_data_in_use=True,
language=language,
Expand Down
25 changes: 19 additions & 6 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def gen_emoji_lexicon(

for cldr_char in tqdm(
iterable=cldr_dict,
desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language}",
desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language.capitalize()}",
unit="cldr characters",
):
# Filter CLDR data for emoji characters while not including certain emojis.
Expand Down Expand Up @@ -187,11 +187,24 @@ def gen_emoji_lexicon(
) as f:
noun_data = json.load(f)

plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if noun_data[row]["plural"] != "isPlural"
}
if language not in ["german", "russian"]:
plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if "singular" in noun_data[row]
and "plural" in noun_data[row]
and noun_data[row]["singular"] != noun_data[row]["plural"]
}

else:
plurals_to_singulars_dict = {
noun_data[row]["nominativePlural"].lower(): row.lower()
for row in noun_data
if "nominativeSingular" in noun_data[row]
and "nominativePlural" in noun_data[row]
and noun_data[row]["nominativeSingular"]
!= noun_data[row]["nominativePlural"]
}

for plural, singular in plurals_to_singulars_dict.items():
if plural not in keyword_dict and singular in keyword_dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9610 ;
Expand All @@ -21,6 +21,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9610 ;
Expand All @@ -20,6 +20,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9056 ;
Expand All @@ -21,6 +21,6 @@ WHERE {
SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?lemma rdfs:label ?preposition .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9035 ;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# tool: scribe-data
# All English (Q1860) prepositions (Q4833830) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition

WHERE {
?lexeme dct:language wd:Q1860 ;
wikibase:lexicalCategory wd:Q4833830 ;
wikibase:lemma ?preposition .
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9072 ;
Expand All @@ -20,6 +20,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Loading

0 comments on commit 6355649

Please sign in to comment.