Skip to content

Commit

Permalink
Merge pull request #35 from Living-with-machines/fix-dataprovider
Browse files Browse the repository at this point in the history
Add abstractions to ease exporting `DataProvider`
  • Loading branch information
griff-rees authored Jul 31, 2023
2 parents 343a3ce + c4de0a6 commit 8055041
Show file tree
Hide file tree
Showing 16 changed files with 716 additions and 467 deletions.
51 changes: 28 additions & 23 deletions alto2txt2fixture/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
__main__), the ``run`` function is executed.
"""

from argparse import ArgumentParser
from argparse import ArgumentParser, BooleanOptionalAction

from alto2txt2fixture.parser import parse
from alto2txt2fixture.router import route
Expand Down Expand Up @@ -54,10 +54,17 @@ def parse_args(argv=None):
help="<Optional> Set an output directory",
required=False,
)
parser.add_argument(
"-t",
"--test-config",
default=False,
help="Only print the configuration",
action=BooleanOptionalAction,
)
return parser.parse_args(argv)


def run():
def run(test_config: bool = False) -> None:
"""
The run function is the main function that starts the alto2txt2fixture
process.
Expand All @@ -77,8 +84,6 @@ def run():
Finally, the ``clear_cache`` function is called to clear the cache
(pending the user's confirmation).
:return: None
"""

args = parse_args()
Expand Down Expand Up @@ -107,25 +112,25 @@ def run():
REPORT_DIR=settings.REPORT_DIR,
MAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,
)

# Routing alto2txt into subdirectories with structured files
route(
COLLECTIONS,
settings.CACHE_HOME,
MOUNTPOINT,
settings.JISC_PAPERS_CSV,
settings.REPORT_DIR,
)

# Parsing the resulting JSON files
parse(
COLLECTIONS,
settings.CACHE_HOME,
OUTPUT,
settings.MAX_ELEMENTS_PER_FILE,
)

clear_cache(settings.CACHE_HOME)
if not args.test_config and not test_config:
# Routing alto2txt into subdirectories with structured files
route(
COLLECTIONS,
settings.CACHE_HOME,
MOUNTPOINT,
settings.JISC_PAPERS_CSV,
settings.REPORT_DIR,
)

# Parsing the resulting JSON files
parse(
COLLECTIONS,
settings.CACHE_HOME,
OUTPUT,
settings.MAX_ELEMENTS_PER_FILE,
)

clear_cache(settings.CACHE_HOME)


if __name__ == "__main__":
Expand Down
27 changes: 15 additions & 12 deletions alto2txt2fixture/create_adjacent_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,15 @@ def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConf
The `csv` and `json` paths
:param names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names
:param module_name: name of module each name is part of, that is added as a prefix
Args:
names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names
module_name: name of module each name is part of, that is added as a prefix
:Examples:
Returns:
A ``TableOutputConfigType``: a `dict` of table ``names`` and output
`csv` and `json` filenames.
Examples:
>>> from pprint import pprint
>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
Expand Down Expand Up @@ -146,7 +150,7 @@ def csv2json_list(
saved: list[Path] | None = None,
indent: int = 2,
) -> list:
"""Save `csv_path` as a `json` file and return as a `dict`."""
"""Save `csv_path` as a `json` file and return as a `list`."""
json_data = []
# See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486
df = (
Expand Down Expand Up @@ -220,15 +224,14 @@ def download_data(
overwrite: bool = OVERWRITE,
exclude: list[str] = [],
) -> None:
"""Download files in `files_dict`, overwrite if specified.
"""Download files in ``files_dict``, overwrite if specified.
:param files_dict: dict of related files to download
:param overwrite: bool on whether to overwrite `LOCAL_CACHE` files
:param exclude: list[str] of files to exclude from `files_dict`
Args:
files_dict: `dict` of related files to download
overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not
exclude: `list` of files to exclude from ``files_dict``
:Examples:
>>> from pathlib import Path
Examples:
>>> tmp: Path = getfixture('tmpdir')
>>> set_path: Path = tmp.chdir()
>>> download_data(exclude=[
Expand Down Expand Up @@ -293,7 +296,7 @@ def run(
time_stamp: str = "",
output_path: Path = OUTPUT,
) -> None:
"""Download, process and link `files_dict` to `json` and `csv`."""
"""Download, process and link ``files_dict`` to `json` and `csv`."""

# Ensure time_stamp from the point of calling `run`
if not time_stamp:
Expand Down
31 changes: 24 additions & 7 deletions alto2txt2fixture/jisc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@

def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:
"""
Creates a DataFrame with correct informations based on the JISC_PAPERS_CSV from the settings.
Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.
Returns: DataFrame with all JISC titles.
Returns:
`DataFrame` with all JISC titles.
"""

if not Path(path).exists():
Expand Down Expand Up @@ -92,14 +93,30 @@ def get_jisc_title(
jisc_papers: pd.DataFrame,
input_sub_path: str,
publication_code: str,
abbr: str = None,
abbr: str | None = None,
) -> str:
"""
Takes an input_sub_path, a publication_code, and an (optional) abbreviation for any newspaper, and tries to
locate the title in the jisc_papers DataFrame provided (usually loaded with the setup_jisc_papers function
above).
Match a newspaper ``title`` with ``jisc_papers`` records.
Returns a string (or crashes).
Takes an ``input_sub_path``, a ``publication_code``, and an (optional)
abbreviation for any newspaper to locate the ``title`` in the
``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the
``setup_jisc_papers`` function.
Args:
title: target newspaper title
issue_date: target newspaper issue_date
jisc_papers: `DataFrame` of `jisc_papers` to match
input_sub_path: path of files to narrow down query input_sub_path
publication_code: unique codes to match newspaper records
abbr: an optional abbreviation of the newspaper title
Returns:
Matched ``title`` `str` or ``abbr``: a string estimating the JISC
equivalent newspaper title.
"""

# First option, search the input_sub_path for a valid-looking publication_code
Expand Down
6 changes: 6 additions & 0 deletions alto2txt2fixture/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,27 @@


def success(msg: str) -> None:
    """Print ``msg`` to stdout in the `colorama` ``Fore.GREEN`` colour.

    Args:
        msg: text to print; it is prefixed with ``Fore.GREEN`` and followed
            by ``Style.RESET_ALL``.
    """
    coloured = f"{Fore.GREEN}{msg}{Style.RESET_ALL}"
    print(coloured)


def info(msg: str) -> None:
    """Print ``msg`` to stdout in the `colorama` ``Fore.CYAN`` colour.

    Args:
        msg: text to print; it is prefixed with ``Fore.CYAN`` and followed
            by ``Style.RESET_ALL``.
    """
    coloured = f"{Fore.CYAN}{msg}{Style.RESET_ALL}"
    print(coloured)


def warning(msg: str) -> None:
    """Print ``msg`` to stdout as a warning in the `colorama` ``Fore.YELLOW`` colour.

    Args:
        msg: text to print; it is prefixed with ``Fore.YELLOW`` and the
            literal ``Warning: `` label, and followed by ``Style.RESET_ALL``.
    """
    coloured = f"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}"
    print(coloured)


def error(msg: str, crash: bool = True, silent: bool = True) -> None:
"""Print ``msg`` in `colorama` `Fore.RED` and `exit()`
If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``."""
if crash and silent:
print(f"{Fore.RED}{msg}{Style.RESET_ALL}")
exit()
Expand Down
Loading

0 comments on commit 8055041

Please sign in to comment.