Skip to content

Commit

Permalink
Merge pull request #51 from Living-with-machines/plaintext
Browse files Browse the repository at this point in the history
Add `Plaintext` fixture creation
  • Loading branch information
griff-rees authored Sep 5, 2023
2 parents 8684610 + 5d9396a commit 97cd1c4
Show file tree
Hide file tree
Showing 24 changed files with 3,560 additions and 125 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
os: [ubuntu-latest, windows-latest, macos-latest]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
Expand All @@ -45,7 +45,7 @@ jobs:
runs-on: ubuntu-latest
needs: build
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: 3.x
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ debugger.py
.vscode
.DS_Store
tmp*
data
6 changes: 4 additions & 2 deletions alto2txt2fixture/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ def run(local_args: list[str] | None = None) -> None:
(pending the user's confirmation).
Arguments:
local_args:
Options passed to `parse_args()`
local_args: Options passed to `parse_args()`
Returns:
None
"""
args: Namespace = parse_args(argv=local_args)

Expand Down
34 changes: 29 additions & 5 deletions alto2txt2fixture/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,38 @@
import os
from pathlib import Path

from rich.console import Console
import typer
from rich.table import Table
from typing_extensions import Annotated

from .plaintext import (
DEFAULT_EXTRACTED_SUBDIR,
DEFAULT_PLAINTEXT_FIXTURE_OUTPUT,
PlainTextFixture,
)
from .settings import DATA_PROVIDER_INDEX, SETUP_TITLE, settings
from .types import dotdict
from .utils import check_newspaper_collection_configuration, gen_fixture_tables
from .utils import check_newspaper_collection_configuration, console, gen_fixture_tables

console = Console()
cli = typer.Typer(pretty_exceptions_show_locals=False)


@cli.command()
def plaintext(
    path: Annotated[Path, typer.Argument()],
    save_path: Annotated[Path, typer.Option()] = Path(DEFAULT_PLAINTEXT_FIXTURE_OUTPUT),
    data_provider_code: Annotated[str, typer.Option()] = "",
    extract_path: Annotated[Path, typer.Argument()] = Path(DEFAULT_EXTRACTED_SUBDIR),
) -> None:
    """Create a PlainTextFixture and save to `save_path`."""
    # The docstring above is left untouched on purpose: Typer shows a command's
    # docstring as its help text, so parameter notes are kept in comments.
    #
    # path:               required positional argument handed to the fixture.
    # save_path:          option; forwarded as the fixture's `export_directory`.
    # data_provider_code: option; forwarded unchanged (defaults to "").
    # extract_path:       positional argument with a default; forwarded as the
    #                     fixture's `extract_subdir`.
    fixture = PlainTextFixture(
        export_directory=save_path,
        extract_subdir=extract_path,
        data_provider_code=data_provider_code,
        path=path,
    )

    # Extraction is invoked before the export, matching the original order.
    fixture.extract_compressed()
    fixture.export_to_json_fixtures()


def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:
Expand Down Expand Up @@ -56,9 +81,8 @@ def show_fixture_tables(
>>> [column.header for column in fixture_tables[0].columns]
['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
>>> fixture_tables = show_fixture_tables(settings)
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
<BLANKLINE>
...dataprovider...Heritage...│ bl-hmd...│ hmd...
...dataprovider...Heritage...│ bl_hmd...│ hmd...
```
Expand Down
17 changes: 9 additions & 8 deletions alto2txt2fixture/create_adjacent_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConf
Example:
```pycon
>>> from pprint import pprint
>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},
Expand Down Expand Up @@ -237,11 +236,10 @@ def download_data(
Example:
```pycon
>>> tmp: Path = getfixture('tmpdir')
>>> set_path: Path = tmp.chdir()
>>> download_data(exclude=[
... "mitchells", "Newspaper-1", "linking"
... ]) # doctest: +ELLIPSIS
>>> from os import chdir
>>> tmp_path: Path = getfixture('tmp_path')
>>> set_path: Path = chdir(tmp_path)
>>> download_data(exclude=["mitchells", "Newspaper-1", "linking"])
Excluding mitchells...
Excluding Newspaper-1...
Excluding linking...
Expand Down Expand Up @@ -302,7 +300,7 @@ def run(
saved: list[PathLike] = SAVED,
time_stamp: str = "",
output_path: Path = OUTPUT,
) -> None:
) -> list[PathLike]:
"""Download, process and link ``files_dict`` to `json` and `csv`.
Note:
Expand All @@ -324,6 +322,7 @@ def run(
output_path.mkdir(exist_ok=True, parents=True)

# Read all the Wikidata Q values from Mitchells
assert "local" in files_dict["mitchells"]
mitchells_df = pd.read_csv(files_dict["mitchells"]["local"], index_col=0)
mitchell_wikidata_mentions = sorted(
list(mitchells_df.PLACE_PUB_WIKI.unique()),
Expand All @@ -332,6 +331,7 @@ def run(

# Set up wikidata_gazetteer
gaz_cols = ["wikidata_id", "english_label", "latitude", "longitude", "geonamesIDs"]
assert "local" in files_dict["wikidata_gazetteer_selected_columns"]
wikidata_gazetteer = pd.read_csv(
files_dict["wikidata_gazetteer_selected_columns"]["local"], usecols=gaz_cols
)
Expand Down Expand Up @@ -760,10 +760,11 @@ def run(

# ###### NOW WE CAN EASILY CREATE JSON files_dict
for csv_file_path in output_path.glob("*.csv"):
csv2json_list(csv_file_path)
csv2json_list(csv_file_path, output_path=output_path)

print("Finished - saved files:")
print("- " + "\n- ".join([str(x) for x in saved]))
return saved


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 97cd1c4

Please sign in to comment.