Merge pull request #35 from Living-with-machines/fix-dataprovider

Add abstractions to ease exporting `DataProvider`
Living-with-machines · Jul 31, 2023 · 8055041 · 8055041
2 parents 343a3ce + c4de0a6
commit 8055041
Show file tree

Hide file tree

Showing 16 changed files with 716 additions and 467 deletions.
diff --git a/alto2txt2fixture/__main__.py b/alto2txt2fixture/__main__.py
@@ -23,7 +23,7 @@
 __main__), the ``run`` function is executed.
 """
 
-from argparse import ArgumentParser
+from argparse import ArgumentParser, BooleanOptionalAction
 
 from alto2txt2fixture.parser import parse
 from alto2txt2fixture.router import route
@@ -54,10 +54,17 @@ def parse_args(argv=None):
         help="<Optional> Set an output directory",
         required=False,
     )
+    parser.add_argument(
+        "-t",
+        "--test-config",
+        default=False,
+        help="Only print the configuration",
+        action=BooleanOptionalAction,
+    )
     return parser.parse_args(argv)
 
 
-def run():
+def run(test_config: bool = False) -> None:
     """
     The run function is the main function that starts the alto2txt2fixture
     process.
@@ -77,8 +84,6 @@ def run():
 
     Finally, the ``clear_cache`` function is called to clear the cache
     (pending the user's confirmation).
-
-    :return: None
     """
 
     args = parse_args()
@@ -107,25 +112,25 @@ def run():
         REPORT_DIR=settings.REPORT_DIR,
         MAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,
     )
-
-    # Routing alto2txt into subdirectories with structured files
-    route(
-        COLLECTIONS,
-        settings.CACHE_HOME,
-        MOUNTPOINT,
-        settings.JISC_PAPERS_CSV,
-        settings.REPORT_DIR,
-    )
-
-    # Parsing the resulting JSON files
-    parse(
-        COLLECTIONS,
-        settings.CACHE_HOME,
-        OUTPUT,
-        settings.MAX_ELEMENTS_PER_FILE,
-    )
-
-    clear_cache(settings.CACHE_HOME)
+    if not args.test_config and not test_config:
+        # Routing alto2txt into subdirectories with structured files
+        route(
+            COLLECTIONS,
+            settings.CACHE_HOME,
+            MOUNTPOINT,
+            settings.JISC_PAPERS_CSV,
+            settings.REPORT_DIR,
+        )
+
+        # Parsing the resulting JSON files
+        parse(
+            COLLECTIONS,
+            settings.CACHE_HOME,
+            OUTPUT,
+            settings.MAX_ELEMENTS_PER_FILE,
+        )
+
+        clear_cache(settings.CACHE_HOME)
 
 
 if __name__ == "__main__":

diff --git a/alto2txt2fixture/create_adjacent_tables.py b/alto2txt2fixture/create_adjacent_tables.py
@@ -61,11 +61,15 @@ def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConf
 
     The `csv` and `json` paths
 
-    :param names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names
-    :param module_name: name of module each name is part of, that is added as a prefix
+    Args:
+        names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names
+        module_name: name of module each name is part of, that is added as a prefix
 
-    :Examples:
+    Returns:
+        A ``TableOutputConfigType``: a `dict` of table ``names`` and output
+            `csv` and `json` filenames.
 
+    Examples:
         >>> from pprint import pprint
         >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
         {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
@@ -146,7 +150,7 @@ def csv2json_list(
     saved: list[Path] | None = None,
     indent: int = 2,
 ) -> list:
-    """Save `csv_path` as a `json` file and return as a `dict`."""
+    """Save `csv_path` as a `json` file and return as a `list`."""
     json_data = []
     # See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486
     df = (
@@ -220,15 +224,14 @@ def download_data(
     overwrite: bool = OVERWRITE,
     exclude: list[str] = [],
 ) -> None:
-    """Download files in `files_dict`, overwrite if specified.
+    """Download files in ``files_dict``, overwrite if specified.
 
-    :param files_dict: dict of related files to download
-    :param overwrite: bool on whether to overwrite `LOCAL_CACHE` files
-    :param exclude: list[str] of files to exclude from `files_dict`
+    Args:
+        files_dict: `dict` of related files to download
+        overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not
+        exclude: `list` of files to exclude from ``files_dict``
 
-    :Examples:
-
-        >>> from pathlib import Path
+    Examples:
         >>> tmp: Path = getfixture('tmpdir')
         >>> set_path: Path = tmp.chdir()
         >>> download_data(exclude=[
@@ -293,7 +296,7 @@ def run(
     time_stamp: str = "",
     output_path: Path = OUTPUT,
 ) -> None:
-    """Download, process and link `files_dict` to `json` and `csv`."""
+    """Download, process and link ``files_dict`` to `json` and `csv`."""
 
     # Ensure time_stamp from the point of calling `run`
     if not time_stamp:

diff --git a/alto2txt2fixture/jisc.py b/alto2txt2fixture/jisc.py
@@ -9,9 +9,10 @@
 
 def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:
     """
-    Creates a DataFrame with correct informations based on the JISC_PAPERS_CSV from the settings.
+    Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.
 
-    Returns: DataFrame with all JISC titles.
+    Returns:
+        `DataFrame` with all JISC titles.
     """
 
     if not Path(path).exists():
@@ -92,14 +93,30 @@ def get_jisc_title(
     jisc_papers: pd.DataFrame,
     input_sub_path: str,
     publication_code: str,
-    abbr: str = None,
+    abbr: str | None = None,
 ) -> str:
     """
-    Takes an input_sub_path, a publication_code, and an (optional) abbreviation for any newspaper, and tries to
-    locate the title in the jisc_papers DataFrame provided (usually loaded with the setup_jisc_papers function
-    above).
+    Match a newspaper ``title`` with ``jisc_papers`` records.
 
-    Returns a string (or crashes).
+    Takes an ``input_sub_path``, a ``publication_code``, and an (optional)
+    abbreviation for any newspaper to locate the ``title`` in the
+    ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the
+    ``setup_jisc_papers`` function.
+
+    Args:
+        title: target newspaper title
+        issue_date: target newspaper issue_date
+        jisc_papers: `DataFrame` of `jisc_papers` to match
+        input_sub_path: path of files to narrow down query input_sub_path
+        publication_code: unique codes to match newspaper records
+        abbr: an optional abbreviation of the newspaper title
+
+    Returns:
+        Matched ``title`` `str` or ``abbr``: a string estimating the JISC
+        equivalent newspaper title.
     """
 
     # First option, search the input_sub_path for a valid-looking publication_code

diff --git a/alto2txt2fixture/log.py b/alto2txt2fixture/log.py
@@ -2,21 +2,27 @@
 
 
 def success(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Fore.GREEN` colour."""
     print(f"{Fore.GREEN}{msg}{Style.RESET_ALL}")
     return
 
 
 def info(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Fore.CYAN` colour."""
     print(f"{Fore.CYAN}{msg}{Style.RESET_ALL}")
     return
 
 
 def warning(msg: str) -> None:
+    """Print ``msg`` in `colorama` `Fore.YELLOW` colour."""
     print(f"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}")
     return
 
 
 def error(msg: str, crash: bool = True, silent: bool = True) -> None:
+    """Print ``msg`` in `colorama` `Fore.RED` and `exit()`
+
+    If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``."""
     if crash and silent:
         print(f"{Fore.RED}{msg}{Style.RESET_ALL}")
         exit()