feat: add NEWSPAPER_COLLECTION_METADATA to settings.py for #12

Living-with-machines · Jul 31, 2023 · 777bf3f · 777bf3f
1 parent cd9ecf9
commit 777bf3f
Show file tree

Hide file tree

Showing 6 changed files with 148 additions and 108 deletions.
diff --git a/alto2txt2fixture/__main__.py b/alto2txt2fixture/__main__.py
@@ -57,7 +57,7 @@ def parse_args(argv=None):
     return parser.parse_args(argv)
 
 
-def run():
+def run() -> None:
     """
     The run function is the main function that starts the alto2txt2fixture
     process.
@@ -77,8 +77,6 @@ def run():
 
     Finally, the ``clear_cache`` function is called to clear the cache
     (pending the user's confirmation).
-
-    :return: None
     """
 
     args = parse_args()

diff --git a/alto2txt2fixture/parser.py b/alto2txt2fixture/parser.py
@@ -1,27 +1,14 @@
 import gc
 import json
 from pathlib import Path
-from typing import Any, Generator, NamedTuple, TypedDict, Union
+from typing import Any, Generator, Union
 
 from tqdm import tqdm
 
+from .types import FixtureDict, TranslatorTuple
 from .utils import NOW_str
 
 
-class FixtureDict(TypedDict):
-    """A `dict` structure to ease use as a `json` database fixture.
-
-    Attributes:
-        pk: an id to uniquely define and query each entry
-        model: what model a given record is for
-        fields: a `dict` of record information conforming to ``model`` table
-    """
-
-    pk: int
-    model: str
-    fields: dict[str, Any]
-
-
 def get_key_from(item: Path, x: str) -> str:
     """
     Retrieves a specific key from a file and returns its value.
@@ -60,7 +47,7 @@ def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:
             items.
 
     Yields:
-        A unique item from the filelist as a FixtureDict`` based on the specified keys.
+        A unique item from `filelist`.
     """
 
     seen = set()
@@ -212,29 +199,6 @@ def reset_fixture_dir(output: str | Path) -> None:
     return
 
 
-class TranslatorTuple(NamedTuple):
-    """A named tuple of fields for translation.
-
-    Attributes:
-        start: A string representing the starting field name.
-        finish: A string or list specifying the field(s) to be translated.
-            If it is a string, the translated field
-            will be a direct mapping of the specified field in
-            each item of the input list.
-            If it is a list, the translated field will be a
-            hyphen-separated concatenation of the specified fields
-            in each item of the input list.
-        lst: A list of dictionaries representing the items to be
-            translated. Each dictionary should contain the necessary
-            fields for translation, with the field names specified in
-            the `start` parameter.
-    """
-
-    start: str
-    finish: str | list
-    lst: list[dict]
-
-
 def get_translator(
     fields: list[TranslatorTuple] = [TranslatorTuple("", "", [])]
 ) -> dict:
@@ -569,7 +533,11 @@ def digitisation_in_x(x):
 
     # Process issue
     translate = get_translator(
-        [("publication__publication_code", "publication_code", newspaper_json)]
+        [
+            TranslatorTuple(
+                "publication__publication_code", "publication_code", newspaper_json
+            )
+        ]
     )
     rename = {"publication": {"publication_code": "newspaper_id"}}
 
@@ -586,10 +554,10 @@ def digitisation_in_x(x):
     # Create translator/clear up memory before processing items
     translate = get_translator(
         [
-            TranslatorTuple("issue__issue_identifier", "issue_code", issue_json),
-            TranslatorTuple("digitisation__software", "software", digitisation_json),
-            TranslatorTuple("data_provider__name", "name", data_provider_json),
-            TranslatorTuple(
+            ("issue__issue_identifier", "issue_code", issue_json),
+            ("digitisation__software", "software", digitisation_json),
+            ("data_provider__name", "name", data_provider_json),
+            (
                 "ingest__lwm_tool_identifier",
                 ["lwm_tool_name", "lwm_tool_version"],
                 ingest_json,

diff --git a/alto2txt2fixture/router.py b/alto2txt2fixture/router.py
@@ -11,7 +11,8 @@
 from .jisc import get_jisc_title, setup_jisc_papers
 from .log import error, warning
 from .patterns import PUBLICATION_CODE
-from .types import dotdict
+from .settings import NEWSPAPER_COLLECTION_METADATA
+from .types import FixtureDict, dotdict
 from .utils import get_now, get_size_from_path, write_json
 
 
@@ -79,21 +80,16 @@ def write_to_cache(self) -> Optional[bool]:
 
 
 class Newspaper(Cache):
-    """
-    The Newspaper class extends the Cache class and represents a newspaper.
+    """The Newspaper class extends the Cache class and represents a newspaper.
+
     The class has several properties and methods that allow the creation of a
     newspaper object and the manipulation of its data.
 
-    Args:
-        root (xml.etree.ElementTree.Element):
-            An xml element that represents the root of the publication.
-        collection (str):
-            A string that represents the collection of the publication.
-        meta (dotdict):
-            A dotdict object that holds metadata about the publication.
-        jisc_papers (pandas.DataFrame, optional):
-            A pandas DataFrame object that holds information about the JISC
-            papers.
+    Attributes:
+        root: An xml element that represents the root of the publication.
+        collection: A string that represents the collection of the publication.
+        meta: A dotdict object that holds metadata about the publication.
+        jisc_papers: A pandas DataFrame object for JISC paper information.
     """
 
     kind = "newspaper"
@@ -334,7 +330,7 @@ class Item(Cache):
     item, i.e. an article. The class has several properties and methods that
     allow the creation of an article object and the manipulation of its data.
 
-    Args:
+    Attributes:
         root:
             An xml element that represents the root of the publication
         issue_code:
@@ -595,11 +591,9 @@ class Ingest(Cache):
     The class has several properties and methods that allow the creation of an
     ingest object and the manipulation of its data.
 
-    Args:
-        root (xml.etree.ElementTree.Element):
-            An xml element that represents the root of the publication
-        collection (str):
-            A string that represents the collection of the publication
+    Attributes:
+        root: An xml element that represents the root of the publication
+        collection: A string that represents the collection of the publication
     """
 
     kind = "ingest"
@@ -639,11 +633,9 @@ class Digitisation(Cache):
     digitisation. The class has several properties and methods that allow
     creation of an digitisation object and the manipulation of its data.
 
-    Args:
-        root (xml.etree.ElementTree.Element):
-            An xml element that represents the root of the publication
-        collection (str):
-            A string that represents the collection of the publication
+    Attributes:
+        root: An xml element that represents the root of the publication
+        collection: A string that represents the collection of the publication
     """
 
     kind = "digitisation"
@@ -656,8 +648,8 @@ def __init__(self, root: ET.Element, collection: str = ""):
         if not isinstance(root, ET.Element):
             raise RuntimeError(f"Expected root to be xml.etree.Element: {type(root)}")
 
-        self.root = root
-        self.collection = collection
+        self.root: ET.Element = root
+        self.collection: str = collection
 
     def as_dict(self) -> dict:
         """
@@ -695,26 +687,24 @@ class DataProvider(Cache):
     data provider. The class has several properties and methods that allow
     creation of a data provider object and the manipulation of its data.
 
-    Args:
-        collection (str):
-            A string that represents the collection of the publication
+    Attributes:
+        collection: A string representing publication collection
+        kind: Indication of object type, defaults to `data-provider`
     """
 
-    kind = "data-provider"
-    """A string that represents the type of the object, set to
-    "data-provider"."""
+    kind: str = "data-provider"
+    meta_data: list[FixtureDict] = NEWSPAPER_COLLECTION_METADATA
 
-    def __init__(self, collection: str = None):
+    def __init__(self, collection: str):
         """Constructor method."""
-        self.collection = collection
+        self.collection: str = collection
 
     def as_dict(self) -> dict:
         """
-        A method that returns a dictionary representation of the data provider
-        object.
+        Return a `dict` of the data provider object.
 
-        :return: Dictionary representation of the DataProvider object
-        :rtype: dict
+        Returns:
+            Dictionary representation of the DataProvider object
         """
         return {
             "name": self.collection,
@@ -860,43 +850,35 @@ def newspaper(self) -> Newspaper:
 
 
 class Archive:
-    """
-    The Archive class represents a zip archive of XML files. The class is used
+    """Manage extracting information from a ZIP archive.
+
+    The ``Archive`` class represents a zip archive of XML files. The class is used
     to extract information from a ZIP archive, and it contains several methods
     to process the data contained in the archive.
 
-    .. describe:: open(Archive) context manager
+    !!! info "`open(Archive)` context manager"
 
         Archive can be opened with a context manager, which creates a meta
         object, with timings for the object. When closed, it will save the
         meta JSON to the correct paths.
 
-    .. describe:: len(Archive)
-
-        Getting the length of the Archive returns the number of files inside
-        the zip archive.
-
-    Args:
-        path (str):
-            The path to the zip archive.
-        collection (str, optional):
-            The collection of the XML files in the archive. Default is "".
-        report_id (str, optional):
-            The report ID for the archive. If not provided, a random UUID is
+    Attributes:
+        path: The path to the zip archive.
+        collection: The collection of the XML files in the archive. Default is "".
+        report_id: The report ID for the archive. If not provided, a random UUID is
             generated.
-        jisc_papers (pandas.DataFrame): A DataFrame of JISC papers.
+        jisc_papers: A DataFrame of JISC papers.
 
     Raises:
-        RuntimeError:
-            If the ``path`` does not exist.
+        RuntimeError: If the ``path`` does not exist.
     """
 
     def __init__(
         self,
-        path: str,
-        collection: str = "",
-        report_id: Optional[str] = None,
-        jisc_papers: Optional[pd.DataFrame] = None,
+        path: str | Path,
+        collection: str | None = "",
+        report_id: str | None = None,
+        jisc_papers: pd.DataFrame | None = None,
     ):
         """Constructor method."""
 
@@ -941,6 +923,7 @@ def __init__(
         """The file path of the report file for the archive."""
 
     def __len__(self):
+        """Return the number of files inside the zip archive (``len(self.filelist)``)."""
         return len(self.filelist)
 
     def __str__(self):

diff --git a/alto2txt2fixture/settings.py b/alto2txt2fixture/settings.py
@@ -1,12 +1,61 @@
 import os
+from typing import Final
 
 from rich.console import Console
 from rich.table import Table
 
-from .types import dotdict
+from .types import FixtureDict, dotdict
 
 # To understand the settings object, see documentation.
 
+
+NEWSPAPER_COLLECTION_METADATA: Final[list[FixtureDict]] = [  # data provider records
+    FixtureDict(
+        pk=1,
+        model="newspapers.dataprovider",
+        fields={
+            "name": "FindMyPast",
+            "code": "fmp",
+            "legacy_code": "bna",
+            "collection": "newspapers",
+            "source_note": "FindMyPast-funded digitised newspapers provided by the British Newspaper Archive",
+        },
+    ),
+    FixtureDict(
+        pk=2,
+        model="newspapers.dataprovider",
+        fields={
+            "name": "Heritage Made Digital",
+            "code": "bl-hmd",
+            "legacy_code": "hmd",
+            "collection": "newspapers",
+            "source_note": "British Library-funded digitised newspapers provided by the British Newspaper Archive",
+        },
+    ),
+    FixtureDict(
+        pk=3,
+        model="newspapers.dataprovider",
+        fields={
+            "name": "Joint Information Systems Committee",
+            "code": "jisc",
+            "legacy_code": "jisc",
+            "collection": "newspapers",
+            "source_note": "JISC-funded digitised newspapers provided by the British Newspaper Archive",
+        },
+    ),
+    FixtureDict(
+        pk=4,  # every record of the same model needs a distinct pk
+        model="newspapers.dataprovider",
+        fields={
+            "name": "Living with Machines",
+            "code": "bl_lwm",  # NOTE(review): "_" here vs "-" in "bl-hmd"; confirm
+            "legacy_code": "lwm",
+            "collection": "newspapers",
+            "source_note": "Living with Machines-funded digitised newspapers provided by the British Newspaper Archive",
+        },
+    ),
+]
+
 settings = dotdict(
     **{
         "MOUNTPOINT": "./input/alto2txt/",
@@ -20,6 +69,7 @@
         "JISC_PAPERS_CSV": "./input/JISC papers.csv",
         "MAX_ELEMENTS_PER_FILE": int(2e6),
         "REPORT_DIR": "./output/reports/",
+        "NEWSPAPER_COLLECTION_METADATA": NEWSPAPER_COLLECTION_METADATA,
     }
 )