From 27378b5e34a0e842709b1c4b0d6b1ad190087fb1 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Mon, 16 Dec 2024 00:43:36 +0100
Subject: [PATCH] Move files from Wiktionary utils to WD utils - delete
 Wiktionary dir

---
 docs/source/scribe_data/index.rst             |   1 -
 docs/source/scribe_data/wiktionary/index.rst  |   6 -
 src/scribe_data/cli/download.py               |  26 +--
 src/scribe_data/cli/get.py                    |   6 +-
 src/scribe_data/utils.py                      |  39 +++-
 src/scribe_data/wikidata/wikidata_utils.py    | 155 +++++++++++++++
 src/scribe_data/wiktionary/__init__.py        |   0
 .../wiktionary/synonum_antonym/__init__.py    |   0
 .../wiktionary/translation/__init__.py        |   0
 .../wiktionary/wikitionary_utils.py           | 178 ------------------
 10 files changed, 201 insertions(+), 210 deletions(-)
 delete mode 100644 docs/source/scribe_data/wiktionary/index.rst
 delete mode 100644 src/scribe_data/wiktionary/__init__.py
 delete mode 100644 src/scribe_data/wiktionary/synonum_antonym/__init__.py
 delete mode 100644 src/scribe_data/wiktionary/translation/__init__.py
 delete mode 100644 src/scribe_data/wiktionary/wikitionary_utils.py

diff --git a/docs/source/scribe_data/index.rst b/docs/source/scribe_data/index.rst
index f870bcb8f..40d1f4094 100644
--- a/docs/source/scribe_data/index.rst
+++ b/docs/source/scribe_data/index.rst
@@ -10,7 +10,6 @@ Scribe-Data
     unicode/index
     wikidata/index
     wikipedia/index
-    wiktionary/index
 
 .. toctree::
     :maxdepth: 1
diff --git a/docs/source/scribe_data/wiktionary/index.rst b/docs/source/scribe_data/wiktionary/index.rst
deleted file mode 100644
index 9d6248867..000000000
--- a/docs/source/scribe_data/wiktionary/index.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-wiktionary/
-===========
-
-`View code on Github <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/wiktionary>`_
-
-Scribe-Data will eventually be using data from Wiktionaries for features such as deriving translations of words as well as synonyms and antonyms.
diff --git a/src/scribe_data/cli/download.py b/src/scribe_data/cli/download.py
index a48633268..ecb82deab 100644
--- a/src/scribe_data/cli/download.py
+++ b/src/scribe_data/cli/download.py
@@ -1,5 +1,5 @@
 """
-Functions for downloading Wikidata dumps.
+Functions for downloading Wikidata lexeme dumps.
 
 .. raw:: html
     <!--
@@ -29,7 +29,7 @@
 from tqdm import tqdm
 
 from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR, check_lexeme_dump_prompt_download
-from scribe_data.wiktionary.wikitionary_utils import download_wiki_lexeme_dump
+from scribe_data.wikidata.wikidata_utils import download_wiki_lexeme_dump
 
 
 def download_wrapper(
@@ -41,25 +41,20 @@ def download_wrapper(
         wikidata_dump: Optional date string in YYYYMMDD format for specific dumps
         output_dir: Optional directory path for the downloaded file. Defaults to 'scribe_data_wikidumps' directory
     """
-    dump_url = download_wiki_lexeme_dump(
-        "latest-lexemes" if not wikidata_dump else wikidata_dump
-    )
+    dump_url = download_wiki_lexeme_dump(wikidata_dump or "latest-lexemes")
 
     if not dump_url:
         rprint("[bold red]No dump URL found.[/bold red]")
         return False
 
     try:
-        output_dir = output_dir if output_dir else DEFAULT_DUMP_EXPORT_DIR
+        output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR
 
         os.makedirs(output_dir, exist_ok=True)
 
-        # Don't check for lexeme if date given
+        # Don't check for lexeme if date given.
         if not wikidata_dump:
-            useable_file_dir = check_lexeme_dump_prompt_download(output_dir)
-
-            # Check for existing .json.bz2 files
-            if useable_file_dir:
+            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                 return useable_file_dir
 
         filename = dump_url.split("/")[-1]
@@ -68,13 +63,13 @@ def download_wrapper(
         user_response = (
             input(
                 "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities."
-                "Do you want to proceed? (Yes/Cancel): "
+                "Do you want to proceed? (y/n): "
             )
             .strip()
             .lower()
         )
 
-        if user_response == "yes" or user_response == "":
+        if user_response == "y":
             rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")
 
             response = requests.get(dump_url, stream=True)
@@ -90,9 +85,14 @@ def download_wrapper(
                             pbar.update(len(chunk))
 
             rprint("[bold green]Download completed successfully![/bold green]")
+
             return output_path
 
+        else:
+            return
+
     except requests.exceptions.RequestException as e:
         rprint(f"[bold red]Error downloading dump: {e}[/bold red]")
+
     except Exception as e:
         rprint(f"[bold red]An error occurred: {e}[/bold red]")
diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
index 593129ca8..550cc97fe 100644
--- a/src/scribe_data/cli/get.py
+++ b/src/scribe_data/cli/get.py
@@ -106,9 +106,9 @@ def get_data(
     subprocess_result = False
 
     # MARK: Get All
-    if all:
-        # Using wikimedia lexeme based dump
 
+    if all:
+        # Using Wikidata lexeme based dumps.
         if wikidata_dump:
             print("wikidata_dump", wikidata_dump)
             download_wrapper(None, wikidata_dump)
@@ -125,7 +125,7 @@ def get_data(
                     "[bold red]Parsing lexeme dump feature will be available soon...[/bold red]"
                 )
 
-        # Using sparql based data extract
+        # Using Wikidata Query Service based data extraction.
 
         # if language:
         #     language_or_sub_language = language.split(" ")[0]
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index a5b6f6acf..af5ad0430 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -25,11 +25,12 @@
 import json
 import os
 import re
+from datetime import datetime
 from importlib import resources
 from pathlib import Path
 from typing import Any, Optional
+
 from rich import print as rprint
-from datetime import datetime
 
 # MARK: Utils Variables
 
@@ -620,14 +621,28 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages):
 
 
 def camel_to_snake(name: str) -> str:
-    """Convert camelCase to snake_case."""
+    """
+    Convert camelCase to snake_case.
+    """
     return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()
 
 
-# MARK : Check Dump
+# MARK: Check Dump
 
 
-def check_lexeme_dump_prompt_download(output_dir):
+def check_lexeme_dump_prompt_download(output_dir: str):
+    """
+    Checks to see if a Wikidata lexeme dump exists and prompts the user to download one if not.
+
+    Parameters
+    ----------
+        output_dir : str
+            The directory to check for the existence of a Wikidata lexeme dump.
+
+    Returns
+    -------
+        None : The user is prompted to download a new Wikidata dump after the existence of one is checked.
+    """
     existing_dumps = list(Path(output_dir).glob("*.json.bz2"))
     if existing_dumps:
         rprint("[bold yellow]Existing dump files found:[/bold yellow]")
@@ -635,22 +650,25 @@ def check_lexeme_dump_prompt_download(output_dir):
             rprint(f"  - {Path(output_dir)}/{dump.name}")
 
         user_input = input(
-            "\nDo you want to\n - Delete existing dumps,\n - Skip download,\n - Use existing latest dump\n -Download (n)ew version?\n [d/s/u/n]: "
+            "\nDo you want to:\n - Delete existing dumps (d)?\n - Skip download (s)?\n - Use existing latest dump (u)?\n -Download new version(n)?\n[d/s/u/n]: "
         ).lower()
+
         if user_input == "d":
             for dump in existing_dumps:
                 dump.unlink()
+
             rprint("[bold green]Existing dumps deleted.[/bold green]")
             user_input = input("Do you want to download latest lexeme dump? (y/N): ")
             return user_input != "y"
 
         elif user_input == "u":
-            # Check for the latest dump file
+            # Check for the latest dump file.
             latest_dump = None
             if any(dump.name == "latest-lexemes.json.bz2" for dump in existing_dumps):
                 latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"
+
             else:
-                # Extract dates from filenames using datetime validation
+                # Extract dates from filenames using datetime validation.
                 dated_dumps = []
                 for dump in existing_dumps:
                     parts = dump.stem.split("-")
@@ -658,19 +676,22 @@ def check_lexeme_dump_prompt_download(output_dir):
                         try:
                             date = datetime.strptime(parts[1], "%Y%m%d")
                             dated_dumps.append((dump, date))
+
                         except ValueError:
-                            continue  # Skip files without a valid date
+                            continue  # skip files without a valid date
 
                 if dated_dumps:
-                    # Find the dump with the most recent date
+                    # Find the dump with the most recent date.
                     latest_dump = max(dated_dumps, key=lambda x: x[1])[0]
 
             if latest_dump:
                 rprint(f"[bold green]Using latest dump:[/bold green] {latest_dump}")
                 return latest_dump
+
             else:
                 rprint("[bold red]No valid dumps found.[/bold red]")
                 return None
+
         else:
             rprint("[bold blue]Skipping download.[/bold blue]")
             return True
diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py
index 950dc0755..0fbd0d8a9 100644
--- a/src/scribe_data/wikidata/wikidata_utils.py
+++ b/src/scribe_data/wikidata/wikidata_utils.py
@@ -20,8 +20,163 @@
     -->
 """
 
+import re
+from datetime import datetime
+
+import requests
 from SPARQLWrapper import JSON, POST, SPARQLWrapper
 
 sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
 sparql.setReturnFormat(JSON)
 sparql.setMethod(POST)
+
+
+def parse_date(date_string):
+    """
+    Parses a date string into a `datetime.date` object.
+
+    Supported formats:
+        - YYYYMMDD
+        - YYYY/MM/DD
+        - YYYY-MM-DD
+
+    Args:
+        date_string (str): The date string to be parsed (non-string input is treated as invalid).
+
+    Returns:
+        datetime.date: Parsed date object if the format is valid.
+        None: If the input is not a string in a supported format (a notice is printed).
+    """
+    formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"]
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_string, fmt).date()
+        except (ValueError, TypeError):  # TypeError: non-string input such as None
+            continue
+    print(
+        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
+    )
+    return None
+
+
+def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump):
+    """
+    Finds the closest available dump file based on the target date.
+
+    Args:
+        target_entity (str): The target date for which the dump is requested (format: YYYY/MM/DD or similar).
+        other_old_dumps (list): List of available dump folders as strings.
+        try_old_dump (function): A function to validate if the dump file exists.
+
+    Returns:
+        str: The closest available dump file date (as a string).
+        None: If the target date is invalid or no suitable dump is found.
+    """
+    target_date = parse_date(target_entity)
+    if not target_date:
+        return None
+
+    closest_date = None
+    closest_diff = None
+    for folder in other_old_dumps:
+        if folder == "..":
+            continue
+        try:
+            if not try_old_dump(folder):
+                continue
+        except requests.exceptions.HTTPError:
+            continue  # the check failed for this folder, so skip it
+        current_date = parse_date(folder)
+        if current_date is None:
+            continue  # a folder name that is not a date cannot be compared
+
+        diff = abs((current_date - target_date).days)
+        if closest_diff is None or diff < closest_diff:
+            closest_date, closest_diff = folder, diff
+        if current_date >= target_date:  # stop at the first dump on or after the target
+            break
+    return closest_date
+
+
+def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
+    """
+    Finds the download URL of a Wikidata lexeme dump for the given target entity or date.
+
+    Args:
+        target_entity (str, optional): The target dump to look up. Defaults to "latest-lexemes".
+            - If "latest-lexemes", the latest dump is looked up.
+            - If a valid date (e.g., YYYYMMDD), the dump for that date is looked up.
+            - Any other value is reported as an invalid date and the latest dump is looked up.
+
+    Returns:
+        str: The URL of the requested or closest available dump.
+        None: If no suitable dump is found, the user declines or the request fails.
+    """
+    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"
+
+    def try_old_dump(target_entity):
+        """
+        Checks if the specified dump file exists for a target entity.
+
+        Args:
+            target_entity (str): The target entity or date folder to check.
+
+        Returns:
+            str: The URL of the dump file if it exists.
+            None: If the dump file does not exist.
+        """
+        entity_url = f"{base_url}/{target_entity}/"
+        entity_response = requests.get(entity_url)
+        entity_response.raise_for_status()
+        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)
+
+        fileurl = f"wikidata-{target_entity}-lexemes.json.bz2"
+        if fileurl in dump_filenames:
+            return f"{base_url}/{target_entity}/{fileurl}"
+
+    if target_entity != "latest-lexemes":
+        try:
+            if parse_date(target_entity):
+                target_entity = target_entity.replace("/", "").replace("-", "")
+                return try_old_dump(target_entity)
+
+        except requests.exceptions.HTTPError as http_err:
+            print(
+                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
+            )
+            print("We could not find your requested Wikidata lexeme dump.")
+
+            response = requests.get(base_url)
+            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)
+
+            user_input = input(
+                "Do you want to see the closest available older dumps? [Y/n]"
+            ).lower()
+
+            # Only an explicit or default "yes" with a non-empty listing continues the search.
+            if user_input not in ("y", "") or not other_old_dumps:
+                return None
+
+            closest_date = available_closest_lexeme_dumpfile(
+                target_entity, other_old_dumps, try_old_dump
+            )
+            # Check before printing: there may be no usable dump folder at all.
+            if not closest_date:
+                return None
+            print(
+                f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
+            )
+            return f"{base_url}/{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
+
+    try:
+        response = requests.get(base_url)
+        response.raise_for_status()
+        latest_dump = re.findall(r'href="([^"]+)"', response.text)
+        # Look for the file that is actually returned rather than the full entity dump.
+        if "latest-lexemes.json.bz2" in latest_dump:
+            return f"{base_url}/latest-lexemes.json.bz2"
+
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+
+    return None
diff --git a/src/scribe_data/wiktionary/__init__.py b/src/scribe_data/wiktionary/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/scribe_data/wiktionary/synonum_antonym/__init__.py b/src/scribe_data/wiktionary/synonum_antonym/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/scribe_data/wiktionary/translation/__init__.py b/src/scribe_data/wiktionary/translation/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/scribe_data/wiktionary/wikitionary_utils.py b/src/scribe_data/wiktionary/wikitionary_utils.py
deleted file mode 100644
index 1e5c7ae27..000000000
--- a/src/scribe_data/wiktionary/wikitionary_utils.py
+++ /dev/null
@@ -1,178 +0,0 @@
-"""
-Module for downloading Wikipedia based lexeme JSON dump.
-
-.. raw:: html
-
-    <!--
-    * Copyright (C) 2024 Scribe
-    *
-    * This program is free software: you can redistribute it and/or modify
-    * it under the terms of the GNU General Public License as published by
-    * the Free Software Foundation, either version 3 of the License, or
-    * (at your option) any later version.
-    *
-    * This program is distributed in the hope that it will be useful,
-    * but WITHOUT ANY WARRANTY; without even the implied warranty of
-    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    * GNU General Public License for more details.
-    *
-    * You should have received a copy of the GNU General Public License
-    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
-    -->
-"""
-
-import re
-from datetime import datetime
-
-import requests
-
-
-def parse_date(date_string):
-    """
-    Parses a date string into a `datetime.date` object.
-
-    Supported formats:
-        - YYYYMMDD
-        - YYYY/MM/DD
-        - YYYY-MM-DD
-
-    Args:
-        date_string (str): The date string to be parsed.
-
-    Returns:
-        datetime.date: Parsed date object if the format is valid.
-        None: If the date format is invalid.
-    """
-    formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"]
-    for fmt in formats:
-        try:
-            return datetime.strptime(date_string, fmt).date()
-        except ValueError:
-            continue
-    print(
-        f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD."
-    )
-    return None
-
-
-def available_closest_lexeme_dumpfile(target_entity, other_old_dumps, try_old_dump):
-    """
-    Finds the closest available dump file based on the target date.
-
-    Args:
-        target_entity (str): The target date for which the dump is requested (format: YYYY/MM/DD or similar).
-        other_old_dumps (list): List of available dump folders as strings.
-        try_old_dump (function): A function to validate if the dump file exists.
-
-    Returns:
-        str: The closest available dump file date (as a string).
-        None: If no suitable dump is found.
-    """
-    available_dates = []
-    target_date = parse_date(target_entity)
-    closest_date = None
-    closest_diff = None
-
-    if target_date:
-        for i in other_old_dumps:
-            if i == "..":
-                continue
-            try:
-                if try_old_dump(i):
-                    available_dates.append(i)
-                    current_date = parse_date(i)
-                    diff = abs((current_date - target_date).days)
-
-                    if closest_diff is None or diff < closest_diff:
-                        closest_date = i
-                        closest_diff = diff
-
-                    if current_date >= target_date:
-                        break
-            except requests.exceptions.HTTPError:
-                pass
-        return closest_date
-
-
-def download_wiki_lexeme_dump(target_entity="latest-lexemes"):
-    """
-    Downloads a Wikimedia lexeme dump based on the specified target entity or date.
-
-    Args:
-        target_entity (str, optional): The target dump to download. Defaults to "latest-lexemes".
-            - If "latest-lexemes", downloads the latest dump.
-            - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date.
-
-    Returns:
-        str: The URL of the requested or closest available dump.
-        None: If no suitable dump is found or the request fails.
-    """
-    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"
-
-    def try_old_dump(target_entity):
-        """
-        Checks if the specified dump file exists for a target entity.
-
-        Args:
-            target_entity (str): The target entity or date folder to check.
-
-        Returns:
-            str: The URL of the dump file if it exists.
-            None: If the dump file does not exist.
-        """
-        entity_url = f"{base_url}/{target_entity}/"
-        entity_response = requests.get(entity_url)
-        entity_response.raise_for_status()
-        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)
-
-        fileurl = f"wikidata-{target_entity}-lexemes.json.bz2"
-        if fileurl in dump_filenames:
-            return f"{base_url}/{target_entity}/{fileurl}"
-
-    if target_entity != "latest-lexemes":
-        try:
-            if parse_date(target_entity):
-                target_entity = target_entity.replace("/", "").replace("-", "")
-                return try_old_dump(target_entity)
-
-        except requests.exceptions.HTTPError as http_err:
-            print(
-                f"HTTP error occurred: {http_err} Status code: {http_err.response.status_code}"
-            )
-            print("We could not find your requested Wikidata lexeme dump.")
-
-            response = requests.get(base_url)
-            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)
-
-            user_input = input(
-                "Do you want to see the closest available older dumps? [Y/n]"
-            ).lower()
-
-            if user_input == "n":
-                return
-
-            if user_input == "y" or user_input == "":
-                if other_old_dumps:
-                    closest_date = available_closest_lexeme_dumpfile(
-                        target_entity, other_old_dumps, try_old_dump
-                    )
-                    print(
-                        f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
-                    )
-                    fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
-                    if closest_date:
-                        return f"{base_url}/{fileurl}"
-                    else:
-                        return
-            return other_old_dumps
-
-    try:
-        response = requests.get(base_url)
-        response.raise_for_status()
-        latest_dump = re.findall(r'href="([^"]+)"', response.text)
-        if "latest-all.json.bz2" in latest_dump:
-            latest_dump_link = f"{base_url}/latest-lexemes.json.bz2"
-            return latest_dump_link
-
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")