Skip to content

Commit

Permalink
Add orjson dependency and add forms & boost interactive mode
Browse files Browse the repository at this point in the history
  • Loading branch information
axif0 committed Dec 29, 2024
1 parent 69f4bc7 commit 612ebe5
Show file tree
Hide file tree
Showing 9 changed files with 611 additions and 295 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ ruff>=0.3.3
SPARQLWrapper>=2.0.0
sphinx-rtd-theme>=3.0.0
tqdm==4.66.4
orjson>=3.10.12
38 changes: 32 additions & 6 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,12 @@ def prompt_user_download_all():
if all_bool:
if language:
if prompt_user_download_all():
parse_wd_lexeme_dump()

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types=data_types,
type_output_dir=output_dir,
)
else:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language: {language.title()}")
Expand All @@ -134,8 +138,12 @@ def prompt_user_download_all():

elif data_type:
if prompt_user_download_all():
parse_wd_lexeme_dump()

parse_wd_lexeme_dump(
language=None,
wikidata_dump_type=["form"],
data_types=[data_type],
type_output_dir=output_dir,
)
else:
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
Expand All @@ -153,7 +161,13 @@ def prompt_user_download_all():
rprint(
"[bold red]Note that the download all functionality must use Wikidata dumps to observe responsible Wikidata Query Service usage practices.[/bold red]"
)
parse_wd_lexeme_dump()
parse_wd_lexeme_dump(
language="all",
wikidata_dump_type=["form", "translations"],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)

# MARK: Emojis

Expand All @@ -165,7 +179,19 @@ def prompt_user_download_all():
elif data_type == "translations":
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type="translations",
wikidata_dump_type=["translations"],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)
return

# MARK: Query Data using Wikidata Dump

elif wikidata_dump:
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types=data_types,
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
)
Expand Down
44 changes: 44 additions & 0 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@
from scribe_data.cli.get import get_data
from scribe_data.cli.total import total_wrapper
from scribe_data.cli.version import get_version_message
from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump
from scribe_data.utils import (
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_DUMP_EXPORT_DIR,
data_type_metadata,
language_metadata,
list_all_languages,
Expand Down Expand Up @@ -262,6 +264,7 @@ def request_total_lexeme_loop():
choices=[
Choice("Configure total lexemes request", "total"),
Choice("Run total lexemes request", "run"),
Choice("Run total lexemes request with lexeme dumps", "run_all"),
Choice("Exit", "exit"),
],
).ask()
Expand All @@ -275,6 +278,18 @@ def request_total_lexeme_loop():
config.selected_languages, config.selected_data_types = [], []
rprint(THANK_YOU_MESSAGE)
break
elif choice == "run_all":
if wikidata_dump_path := prompt(
f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
):
wikidata_dump_path = Path(wikidata_dump_path)

parse_wd_lexeme_dump(
language=config.selected_languages,
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump_path,
)
break
elif choice == "exit":
return
else:
Expand Down Expand Up @@ -335,6 +350,12 @@ def start_interactive_mode(operation: str = None):
# Choice("See list of languages", "languages"),
Choice("Exit", "exit"),
]
elif operation == "translations":
choices = [
Choice("Configure translations request", "translations"),
# Choice("See list of languages", "languages"),
Choice("Exit", "exit"),
]

else:
choices = [
Expand All @@ -358,6 +379,29 @@ def start_interactive_mode(operation: str = None):
request_total_lexeme_loop()
break

elif choice == "translations":
prompt_for_languages()

if wikidata_dump_path := prompt(
f"Enter Wikidata lexeme dump path (default: {DEFAULT_DUMP_EXPORT_DIR}): "
):
wikidata_dump_path = Path(wikidata_dump_path)

if output_dir := prompt(
f"Enter output directory (default: {config.output_dir}): "
):
config.output_dir = Path(output_dir)

parse_wd_lexeme_dump(
language=config.selected_languages,
wikidata_dump_type=["translations"],
data_types=None,
type_output_dir=config.output_dir,
wikidata_dump_path=wikidata_dump_path,
)

break

# elif choice == "languages":
# see_list_languages()
# break
Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ def main() -> None:
elif action == "Get data":
start_interactive_mode(operation="get")
elif action == "Get translations":
print("Coming soon!")
start_interactive_mode(operation="translations")
else:
print("Skipping action")
else:
Expand Down
13 changes: 5 additions & 8 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,25 +392,22 @@ def total_wrapper(
The local Wikidata dump path that can be used to process data.
If True, indicates the flag was used without a path.
"""
# Handle --all flag
if all_bool and wikidata_dump:
language = "all"

if wikidata_dump is True: # flag without a wikidata dump path
if all_bool:
language = "all"
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type="total",
type_output_dir=None,
wikidata_dump_type=["total"],
wikidata_dump_path=None,
)
return

if isinstance(wikidata_dump, str): # if user provided a wikidata dump path
if all_bool:
language = "all"
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type="total",
type_output_dir=None,
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump,
)
return
Expand Down
13 changes: 12 additions & 1 deletion src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,16 +706,27 @@ def check_lexeme_dump_prompt_download(output_dir: str):
return True


def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool:
    """
    Check if an index file already exists and decide whether to skip processing.

    If the file exists and ``overwrite_all`` is not set, the user is prompted
    to either overwrite the existing data or skip the process.

    Parameters
    ----------
    index_path : Path
        The path of the index file to check.

    overwrite_all : bool
        If True, an existing file is overwritten without prompting.

    Returns
    -------
    bool
        True if the process should be skipped (i.e. we do NOT proceed).
        False if the file doesn't exist, ``overwrite_all`` is True, or the
        user explicitly chooses to overwrite (i.e. we DO proceed).
    """
    if not index_path.exists():
        return False

    if overwrite_all:
        return False

    print(f"\nIndex file already exists at: {index_path}")
    choice = questionary.select(
        "Choose an action:",
        choices=["Overwrite existing data", "Skip process"],
        default="Skip process",
    ).ask()

    # ask() returns None when the prompt is cancelled (e.g. Ctrl+C). Only an
    # explicit overwrite choice may proceed; anything else, including a
    # cancelled prompt, falls back to the safe default of skipping so that
    # existing data is never overwritten by accident.
    return choice != "Overwrite existing data"
41 changes: 27 additions & 14 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,21 @@
from pathlib import Path
from rich import print as rprint
from SPARQLWrapper import JSON, POST, SPARQLWrapper
from typing import List, Union

from scribe_data.cli.download import wd_lexeme_dump_download_wrapper
from scribe_data.wiktionary.parse_dump import parse_dump
from scribe_data.utils import language_metadata, data_type_metadata

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)


def parse_wd_lexeme_dump(
language: str = None,
wikidata_dump_type: str = None,
language: Union[str, List[str]] = None,
wikidata_dump_type: List[str] = None,
data_types: List[str] = None,
type_output_dir: str = None,
wikidata_dump_path: str = None,
):
Expand All @@ -43,18 +46,28 @@ def parse_wd_lexeme_dump(
Parameters
----------
language : str
The language to parse the data for.
wikidata_dump_type : str
The type of Wikidata dump to parse (e.g. "total", "translations").
type_output_dir : str
The directory to save the parsed JSON data.
wikidata_dump_path : str
language : Union[str, List[str]]
The language(s) to parse the data for. Use "all" for all languages.
wikidata_dump_type : List[str]
The type(s) of Wikidata dump to parse (e.g. ["total", "translations", "form"]).
data_types : List[str]
The categories to parse when using "form" type (e.g. ["nouns", "adverbs"]).
type_output_dir : str, optional
The directory to save the parsed JSON data. If None, uses default directory.
wikidata_dump_path : str, optional
The local Wikidata dump directory that should be used to get data.
Returns
-------
The requested data saved locally given file type and location arguments.
"""
# Convert "all" to list of all languages
if isinstance(language, str) and language.lower() == "all":
language = list(language_metadata.keys())
if isinstance(data_types, str) and data_types.lower() == "all":
# Exclude translations as it's a separate section
data_types = [
dt
for dt in data_type_metadata.keys()
if dt != "translations" and dt != "emoji-keywords"
]

file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path)

if isinstance(file_path, (str, Path)):
Expand All @@ -67,10 +80,10 @@ def parse_wd_lexeme_dump(
parse_dump(
language=language,
parse_type=wikidata_dump_type,
type_output_dir=type_output_dir,
data_types=data_types,
file_path=file_path,
output_dir=type_output_dir,
)

return

rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]")
Loading

0 comments on commit 612ebe5

Please sign in to comment.