Add orjson dependency and add forms & boost interactive mode

scribe-org · Dec 29, 2024 · 612ebe5
1 parent 69f4bc7
commit 612ebe5
Show file tree

Hide file tree

Showing 9 changed files with 611 additions and 295 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -18,3 +18,4 @@ ruff>=0.3.3
 SPARQLWrapper>=2.0.0
 sphinx-rtd-theme>=3.0.0
 tqdm==4.66.4
+orjson>=3.10.12
diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
@@ -117,8 +117,12 @@ def prompt_user_download_all():
     if all_bool:
         if language:
             if prompt_user_download_all():
-                parse_wd_lexeme_dump()
-
+                parse_wd_lexeme_dump(
+                    language=language,
+                    wikidata_dump_type=["form"],
+                    data_types=data_types,
+                    type_output_dir=output_dir,
+                )
             else:
                 language_or_sub_language = language.split(" ")[0]
                 print(f"Updating all data types for language: {language.title()}")
@@ -134,8 +138,12 @@ def prompt_user_download_all():
 
         elif data_type:
             if prompt_user_download_all():
-                parse_wd_lexeme_dump()
-
+                parse_wd_lexeme_dump(
+                    language=None,
+                    wikidata_dump_type=["form"],
+                    data_types=[data_type],
+                    type_output_dir=output_dir,
+                )
             else:
                 print(f"Updating all languages for data type: {data_type.capitalize()}")
                 query_data(
@@ -153,7 +161,13 @@ def prompt_user_download_all():
             rprint(
                 "[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]"
             )
-            parse_wd_lexeme_dump()
+            parse_wd_lexeme_dump(
+                language="all",
+                wikidata_dump_type=["form", "translations"],
+                data_types="all",
+                type_output_dir=output_dir,
+                wikidata_dump_path=wikidata_dump,
+            )
 
     # MARK: Emojis
 
@@ -165,7 +179,19 @@ def prompt_user_download_all():
     elif data_type == "translations":
         parse_wd_lexeme_dump(
             language=language,
-            wikidata_dump_type="translations",
+            wikidata_dump_type=["translations"],
+            type_output_dir=output_dir,
+            wikidata_dump_path=wikidata_dump,
+        )
+        return
+
+    # MARK: Query Data using Wikidata Dump
+
+    elif wikidata_dump:
+        parse_wd_lexeme_dump(
+            language=language,
+            wikidata_dump_type=["form"],
+            data_types=data_types,
             type_output_dir=output_dir,
             wikidata_dump_path=wikidata_dump,
         )

diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py
@@ -38,8 +38,10 @@
 from scribe_data.cli.get import get_data
 from scribe_data.cli.total import total_wrapper
 from scribe_data.cli.version import get_version_message
+from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump
 from scribe_data.utils import (
     DEFAULT_JSON_EXPORT_DIR,
+    DEFAULT_DUMP_EXPORT_DIR,
     data_type_metadata,
     language_metadata,
     list_all_languages,
@@ -262,6 +264,7 @@ def request_total_lexeme_loop():
             choices=[
                 Choice("Configure total lexemes request", "total"),
                 Choice("Run total lexemes request", "run"),
+                Choice("Run total lexemes request with lexeme dumps", "run_all"),
                 Choice("Exit", "exit"),
             ],
         ).ask()
@@ -275,6 +278,18 @@ def request_total_lexeme_loop():
             config.selected_languages, config.selected_data_types = [], []
             rprint(THANK_YOU_MESSAGE)
             break
+        elif choice == "run_all":
+            if wikidata_dump_path := prompt(
+                f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
+            ):
+                wikidata_dump_path = Path(wikidata_dump_path)
+
+            parse_wd_lexeme_dump(
+                language=config.selected_languages,
+                wikidata_dump_type=["total"],
+                wikidata_dump_path=wikidata_dump_path,
+            )
+            break
         elif choice == "exit":
             return
         else:
@@ -335,6 +350,12 @@ def start_interactive_mode(operation: str = None):
                     # Choice("See list of languages", "languages"),
                     Choice("Exit", "exit"),
                 ]
+            elif operation == "translations":
+                choices = [
+                    Choice("Configure translations request", "translations"),
+                    # Choice("See list of languages", "languages"),
+                    Choice("Exit", "exit"),
+                ]
 
         else:
             choices = [
@@ -358,6 +379,29 @@ def start_interactive_mode(operation: str = None):
             request_total_lexeme_loop()
             break
 
+        elif choice == "translations":
+            prompt_for_languages()
+
+            if wikidata_dump_path := prompt(
+                f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
+            ):
+                wikidata_dump_path = Path(wikidata_dump_path)
+
+            if output_dir := prompt(
+                f"Enter output directory (default: {config.output_dir}): "
+            ):
+                config.output_dir = Path(output_dir)
+
+            parse_wd_lexeme_dump(
+                language=config.selected_languages,
+                wikidata_dump_type=["translations"],
+                data_types=None,
+                type_output_dir=config.output_dir,
+                wikidata_dump_path=wikidata_dump_path,
+            )
+
+            break
+
         # elif choice == "languages":
         #     see_list_languages()
         #     break

diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
@@ -436,7 +436,7 @@ def main() -> None:
             elif action == "Get data":
                 start_interactive_mode(operation="get")
             elif action == "Get translations":
-                print("Coming soon!")
+                start_interactive_mode(operation="translations")
             else:
                 print("Skipping action")
         else:

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
@@ -392,25 +392,22 @@ def total_wrapper(
             The local Wikidata dump path that can be used to process data.
             If True, indicates the flag was used without a path.
     """
+    # Handle --all flag
+    if all_bool and wikidata_dump:
+        language = "all"
 
     if wikidata_dump is True:  # flag without a wikidata dump path
-        if all_bool:
-            language = "all"
         parse_wd_lexeme_dump(
             language=language,
-            wikidata_dump_type="total",
-            type_output_dir=None,
+            wikidata_dump_type=["total"],
             wikidata_dump_path=None,
         )
         return
 
     if isinstance(wikidata_dump, str):  # if user provided a wikidata dump path
-        if all_bool:
-            language = "all"
         parse_wd_lexeme_dump(
             language=language,
-            wikidata_dump_type="total",
-            type_output_dir=None,
+            wikidata_dump_type=["total"],
             wikidata_dump_path=wikidata_dump,
         )
         return

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
@@ -706,16 +706,27 @@ def check_lexeme_dump_prompt_download(output_dir: str):
             return True
 
 
-def check_index_exists(index_path: Path) -> bool:
+def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool:
     """
     Check if JSON wiktionary dump file exists and prompt user for action if it does.
+    Returns True if user chooses to skip (i.e., we do NOT proceed).
+    Returns False if the file doesn't exist or user chooses to overwrite (i.e., we DO proceed).
+
+    Parameters:
+        index_path: Path to check
+        overwrite_all: If True, automatically overwrite without prompting
     """
     if index_path.exists():
+        if overwrite_all:
+            return False
+
         print(f"\nIndex file already exists at: {index_path}")
         choice = questionary.select(
             "Choose an action:",
             choices=["Overwrite existing data", "Skip process"],
             default="Skip process",
         ).ask()
+
+        # If user selects "Skip process", return True meaning "don't proceed"
         return choice == "Skip process"
     return False
diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py
@@ -23,18 +23,21 @@
 from pathlib import Path
 from rich import print as rprint
 from SPARQLWrapper import JSON, POST, SPARQLWrapper
+from typing import List, Union
 
 from scribe_data.cli.download import wd_lexeme_dump_download_wrapper
 from scribe_data.wiktionary.parse_dump import parse_dump
+from scribe_data.utils import language_metadata, data_type_metadata
 
 sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
 sparql.setReturnFormat(JSON)
 sparql.setMethod(POST)
 
 
 def parse_wd_lexeme_dump(
-    language: str = None,
-    wikidata_dump_type: str = None,
+    language: Union[str, List[str]] = None,
+    wikidata_dump_type: List[str] = None,
+    data_types: List[str] = None,
     type_output_dir: str = None,
     wikidata_dump_path: str = None,
 ):
@@ -43,18 +46,28 @@ def parse_wd_lexeme_dump(
 
     Parameters
     ----------
-    language : str
-        The language to parse the data for.
-    wikidata_dump_type : str
-        The type of Wikidata dump to parse (e.g. "total", "translations").
-    type_output_dir : str
-        The directory to save the parsed JSON data.
-    wikidata_dump_path : str
+    language : Union[str, List[str]]
+        The language(s) to parse the data for. Use "all" for all languages.
+    wikidata_dump_type : List[str]
+        The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]).
+    data_types : List[str]
+        The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]).
+    type_output_dir : str, optional
+        The directory to save the parsed JSON data. If None, uses default directory.
+    wikidata_dump_path : str, optional
         The local Wikidata dump directory that should be used to get data.
-    Returns
-    -------
-        The requested data saved locally given file type and location arguments.
     """
+    # Convert "all" to list of all languages
+    if isinstance(language, str) and language.lower() == "all":
+        language = list(language_metadata.keys())
+    if isinstance(data_types, str) and data_types.lower() == "all":
+        # Exclude translations (a separate section) and emoji-keywords
+        data_types = [
+            dt
+            for dt in data_type_metadata.keys()
+            if dt != "translations" and dt != "emoji-keywords"
+        ]
+
     file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path)
 
     if isinstance(file_path, (str, Path)):
@@ -67,10 +80,10 @@ def parse_wd_lexeme_dump(
             parse_dump(
                 language=language,
                 parse_type=wikidata_dump_type,
-                type_output_dir=type_output_dir,
+                data_types=data_types,
                 file_path=file_path,
+                output_dir=type_output_dir,
             )
-
             return
 
     rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]")