Skip to content

Commit

Permalink
feat: add NEWSPAPER_COLLECTION_METADATA to settings.py for #12
Browse files Browse the repository at this point in the history
  • Loading branch information
spool committed Jul 31, 2023
1 parent cd9ecf9 commit 777bf3f
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 108 deletions.
4 changes: 1 addition & 3 deletions alto2txt2fixture/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def parse_args(argv=None):
return parser.parse_args(argv)


def run():
def run() -> None:
"""
The run function is the main function that starts the alto2txt2fixture
process.
Expand All @@ -77,8 +77,6 @@ def run():
Finally, the ``clear_cache`` function is called to clear the cache
(pending the user's confirmation).
:return: None
"""

args = parse_args()
Expand Down
56 changes: 12 additions & 44 deletions alto2txt2fixture/parser.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,14 @@
import gc
import json
from pathlib import Path
from typing import Any, Generator, NamedTuple, TypedDict, Union
from typing import Any, Generator, Union

from tqdm import tqdm

from .types import FixtureDict, TranslatorTuple
from .utils import NOW_str


class FixtureDict(TypedDict):
"""A `dict` structure to ease use as a `json` database fixture.
Attributes:
pk: an id to uniquely define and query each entry
model: what model a given record is for
fields: a `dict` of record information conforming to ``model`` table
"""

pk: int
model: str
fields: dict[str, Any]


def get_key_from(item: Path, x: str) -> str:
"""
Retrieves a specific key from a file and returns its value.
Expand Down Expand Up @@ -60,7 +47,7 @@ def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:
items.
Yields:
A unique item from the filelist as a FixtureDict`` based on the specified keys.
A unique item from `filelist`.
"""

seen = set()
Expand Down Expand Up @@ -212,29 +199,6 @@ def reset_fixture_dir(output: str | Path) -> None:
return


class TranslatorTuple(NamedTuple):
"""A named tuple of fields for translation.
Attributes:
start: A string representing the starting field name.
finish: A string or list specifying the field(s) to be translated.
If it is a string, the translated field
will be a direct mapping of the specified field in
each item of the input list.
If it is a list, the translated field will be a
hyphen-separated concatenation of the specified fields
in each item of the input list.
lst: A list of dictionaries representing the items to be
translated. Each dictionary should contain the necessary
fields for translation, with the field names specified in
the `start` parameter.
"""

start: str
finish: str | list
lst: list[dict]


def get_translator(
fields: list[TranslatorTuple] = [TranslatorTuple("", "", [])]
) -> dict:
Expand Down Expand Up @@ -569,7 +533,11 @@ def digitisation_in_x(x):

# Process issue
translate = get_translator(
[("publication__publication_code", "publication_code", newspaper_json)]
[
TranslatorTuple(
"publication__publication_code", "publication_code", newspaper_json
)
]
)
rename = {"publication": {"publication_code": "newspaper_id"}}

Expand All @@ -586,10 +554,10 @@ def digitisation_in_x(x):
# Create translator/clear up memory before processing items
translate = get_translator(
[
TranslatorTuple("issue__issue_identifier", "issue_code", issue_json),
TranslatorTuple("digitisation__software", "software", digitisation_json),
TranslatorTuple("data_provider__name", "name", data_provider_json),
TranslatorTuple(
("issue__issue_identifier", "issue_code", issue_json),
("digitisation__software", "software", digitisation_json),
("data_provider__name", "name", data_provider_json),
(
"ingest__lwm_tool_identifier",
["lwm_tool_name", "lwm_tool_version"],
ingest_json,
Expand Down
103 changes: 43 additions & 60 deletions alto2txt2fixture/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from .jisc import get_jisc_title, setup_jisc_papers
from .log import error, warning
from .patterns import PUBLICATION_CODE
from .types import dotdict
from .settings import NEWSPAPER_COLLECTION_METADATA
from .types import FixtureDict, dotdict
from .utils import get_now, get_size_from_path, write_json


Expand Down Expand Up @@ -79,21 +80,16 @@ def write_to_cache(self) -> Optional[bool]:


class Newspaper(Cache):
"""
The Newspaper class extends the Cache class and represents a newspaper.
"""The Newspaper class extends the Cache class and represents a newspaper.
The class has several properties and methods that allow the creation of a
newspaper object and the manipulation of its data.
Args:
root (xml.etree.ElementTree.Element):
An xml element that represents the root of the publication.
collection (str):
A string that represents the collection of the publication.
meta (dotdict):
A dotdict object that holds metadata about the publication.
jisc_papers (pandas.DataFrame, optional):
A pandas DataFrame object that holds information about the JISC
papers.
Attributes:
root: An xml element that represents the root of the publication.
collection: A string that represents the collection of the publication.
meta: A dotdict object that holds metadata about the publication.
jisc_papers: A pandas DataFrame object for JISC paper information.
"""

kind = "newspaper"
Expand Down Expand Up @@ -334,7 +330,7 @@ class Item(Cache):
item, i.e. an article. The class has several properties and methods that
allow the creation of an article object and the manipulation of its data.
Args:
Attributes:
root:
An xml element that represents the root of the publication
issue_code:
Expand Down Expand Up @@ -595,11 +591,9 @@ class Ingest(Cache):
The class has several properties and methods that allow the creation of an
ingest object and the manipulation of its data.
Args:
root (xml.etree.ElementTree.Element):
An xml element that represents the root of the publication
collection (str):
A string that represents the collection of the publication
Attributes:
root: An xml element that represents the root of the publication
collection: A string that represents the collection of the publication
"""

kind = "ingest"
Expand Down Expand Up @@ -639,11 +633,9 @@ class Digitisation(Cache):
digitisation. The class has several properties and methods that allow
creation of a digitisation object and the manipulation of its data.
Args:
root (xml.etree.ElementTree.Element):
An xml element that represents the root of the publication
collection (str):
A string that represents the collection of the publication
Attributes:
root: An xml element that represents the root of the publication
collection: A string that represents the collection of the publication
"""

kind = "digitisation"
Expand All @@ -656,8 +648,8 @@ def __init__(self, root: ET.Element, collection: str = ""):
if not isinstance(root, ET.Element):
raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")

self.root = root
self.collection = collection
self.root: ET.Element = root
self.collection: str = collection

def as_dict(self) -> dict:
"""
Expand Down Expand Up @@ -695,26 +687,24 @@ class DataProvider(Cache):
data provider. The class has several properties and methods that allow
creation of a data provider object and the manipulation of its data.
Args:
collection (str):
A string that represents the collection of the publication
Attributes:
collection: A string representing publication collection
kind: Indication of object type, defaults to `data-provider`
"""

kind = "data-provider"
"""A string that represents the type of the object, set to
"data-provider"."""
kind: str = "data-provider"
meta_data: list[FixtureDict] = NEWSPAPER_COLLECTION_METADATA

def __init__(self, collection: str = None):
def __init__(self, collection: str):
"""Constructor method."""
self.collection = collection
self.collection: str = collection

def as_dict(self) -> dict:
"""
A method that returns a dictionary representation of the data provider
object.
Return a `dict` of the data provider object.
:return: Dictionary representation of the DataProvider object
:rtype: dict
Returns:
Dictionary representation of the DataProvider object
"""
return {
"name": self.collection,
Expand Down Expand Up @@ -860,43 +850,35 @@ def newspaper(self) -> Newspaper:


class Archive:
"""
The Archive class represents a zip archive of XML files. The class is used
"""Manage extracting information from a ZIP archive.
The ``Archive`` class represents a zip archive of XML files. The class is used
to extract information from a ZIP archive, and it contains several methods
to process the data contained in the archive.
.. describe:: open(Archive) context manager
!!! info "`open(Archive)` context manager"
Archive can be opened with a context manager, which creates a meta
object, with timings for the object. When closed, it will save the
meta JSON to the correct paths.
.. describe:: len(Archive)
Getting the length of the Archive returns the number of files inside
the zip archive.
Args:
path (str):
The path to the zip archive.
collection (str, optional):
The collection of the XML files in the archive. Default is "".
report_id (str, optional):
The report ID for the archive. If not provided, a random UUID is
Attributes:
path: The path to the zip archive.
collection: The collection of the XML files in the archive. Default is "".
report_id: The report ID for the archive. If not provided, a random UUID is
generated.
jisc_papers (pandas.DataFrame): A DataFrame of JISC papers.
jisc_papers: A DataFrame of JISC papers.
Raises:
RuntimeError:
If the ``path`` does not exist.
RuntimeError: If the ``path`` does not exist.
"""

def __init__(
self,
path: str,
collection: str = "",
report_id: Optional[str] = None,
jisc_papers: Optional[pd.DataFrame] = None,
path: str | Path,
collection: str | None = "",
report_id: str | None = None,
jisc_papers: pd.DataFrame | None = None,
):
"""Constructor method."""

Expand Down Expand Up @@ -941,6 +923,7 @@ def __init__(
"""The file path of the report file for the archive."""

def __len__(self):
"""The number of files inside the zip archive."""
return len(self.filelist)

def __str__(self):
Expand Down
52 changes: 51 additions & 1 deletion alto2txt2fixture/settings.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,61 @@
import os
from typing import Final

from rich.console import Console
from rich.table import Table

from .types import dotdict
from .types import FixtureDict, dotdict

# To understand the settings object, see documentation.


# Fixture records describing each newspaper data provider (collection).
#
# Every entry is a ``FixtureDict`` targeting the ``newspapers.dataprovider``
# model. ``pk`` is the record's primary key and therefore MUST be unique
# within this list: two entries sharing a ``pk`` would overwrite one another
# when the fixture is loaded.
NEWSPAPER_COLLECTION_METADATA: Final[list[FixtureDict]] = [
    FixtureDict(
        pk=1,
        model="newspapers.dataprovider",
        fields={
            "name": "FindMyPast",
            "code": "fmp",
            "legacy_code": "bna",
            "collection": "newspapers",
            "source_note": "FindMyPast-funded digitised newspapers provided by the British Newspaper Archive",
        },
    ),
    FixtureDict(
        pk=2,
        model="newspapers.dataprovider",
        fields={
            "name": "Heritage Made Digital",
            "code": "bl-hmd",
            "legacy_code": "hmd",
            "collection": "newspapers",
            "source_note": "British Library-funded digitised newspapers provided by the British Newspaper Archive",
        },
    ),
    FixtureDict(
        pk=3,
        model="newspapers.dataprovider",
        fields={
            "name": "Joint Information Systems Committee",
            "code": "jisc",
            "legacy_code": "jisc",
            "collection": "newspapers",
            "source_note": "JISC-funded digitised newspapers provided by the British Newspaper Archive",
        },
    ),
    FixtureDict(
        # Previously duplicated pk=3 (the JISC record above); each provider
        # needs its own primary key.
        pk=4,
        model="newspapers.dataprovider",
        fields={
            "name": "Living with Machines",
            # NOTE(review): underscore here vs. hyphen in "bl-hmd" above;
            # left unchanged because consumers may match on this exact
            # string -- confirm the intended form.
            "code": "bl_lwm",
            "legacy_code": "lwm",
            "collection": "newspapers",
            "source_note": "Living with Machines-funded digitised newspapers provided by the British Newspaper Archive",
        },
    ),
]

settings = dotdict(
**{
"MOUNTPOINT": "./input/alto2txt/",
Expand All @@ -20,6 +69,7 @@
"JISC_PAPERS_CSV": "./input/JISC papers.csv",
"MAX_ELEMENTS_PER_FILE": int(2e6),
"REPORT_DIR": "./output/reports/",
"NEWSPAPER_COLLECTION_METADATA": NEWSPAPER_COLLECTION_METADATA,
}
)

Expand Down
Loading

0 comments on commit 777bf3f

Please sign in to comment.