# alto2txt2fixture

`alto2txt2fixture` is a standalone tool to convert `alto2txt` XML output and other related datasets into JSON (and, where feasible, CSV) data with corresponding relational IDs to ease general use and ingestion into a relational database.

We target the JSON produced for importing into `lwmdb`: a database built using the `Django` Python web framework's database fixture structure.

We provide a command line interface to process `alto2txt` XML files stored locally (or mounted via `azure` `blobfuse`), and for additional public data we automate a means of downloading those automatically.
## Installation

We recommend downloading a copy of the repository or using `git clone`. From a local copy, use `poetry` to install dependencies:
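```console
$ cd alto2txt2fixture
$ poetry install
```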
If you would like to test, render documentation and/or contribute to the code, include the `dev` dependencies in a local install:
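A minimal sketch, assuming the `dev` dependencies are declared as a `poetry` dependency group in `pyproject.toml`:

```console
$ poetry install --with dev
```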
## Processing local `alto2txt` XML

To process newspaper metadata with a local copy of `alto2txt` XML results, it's easiest to have that data in the same folder as your `alto2txt2fixture` checkout and `poetry` installed folder. Once arranged, you should be able to begin the JSON conversion.
## Downloading related data

To generate related data in JSON and CSV form, assuming you have an internet connection and access to a `living-with-machines` `azure` account, related data can be downloaded into JSON and CSV files. The JSON results should be consistent with `lwmdb` tables for ease of import.
## `alto2txt2fixture.__main__`

Entry point for `alto2txt2fixture.parse` to convert `alto2txt` XML -> JSON.

This module defines the `run` function, which is the main driver for the entire process. It imports various functions from other modules and uses them to route and parse XML data generated by `alto2txt`.

The following steps are performed in the `run` function:

1. Routing the `alto2txt` data into subdirectories with structured files.
2. Parsing the resulting JSON files.

If the script is run as a main program (i.e. if the name of the script is `__main__`), the `run()` function is executed.

Note: at present this does not include any functionality in `create_adjacent_tables.py`.
### `parse_args`

Manage command line arguments for `run()`.

This constructs an `ArgumentParser` instance to manage configuring calls of `run()` for newspaper XML to JSON conversion.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `argv` | `list[str] \| None` | If `None`, defaults to `sys.argv` | `None` |

Returns:

| Type | Description |
| --- | --- |
| `Namespace` | A `Namespace` of parsed command line arguments |

Source code in `alto2txt2fixture/__main__.py`
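A minimal sketch of calling `parse_args` directly, assuming the long flags documented in the optional parameters table below (`--collections`, `--output`, `--mountpoint`):

```python
from alto2txt2fixture.__main__ import parse_args

# Parse an explicit argument list instead of `sys.argv`:
args = parse_args(["--collections", "hmd", "--output", "./output/fixtures/"])
print(args.collections, args.output)  # attribute names assumed from the flags
```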
### `run`

Manage running newspaper XML to JSON conversion.

First `parse_args` is called for command line arguments, including:

* `collections`
* `output`
* `mountpoint`

If any of these arguments are specified, they will be used; otherwise they will default to the values in the `settings` module.

The `show_setup` function is then called to display the configuration being used.

The `route` function is then called to route the `alto2txt` files into subdirectories with structured files.

The `parse` function is then called to parse the resulting JSON files.

Finally, the `clear_cache` function is called to clear the cache (pending the user's confirmation).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `local_args` | `list[str] \| None` | Options passed to `parse_args()` | `None` |

Source code in `alto2txt2fixture/__main__.py`
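A minimal sketch of calling `run` from Python rather than the command line, assuming the same flags as above:

```python
from alto2txt2fixture.__main__ import run

# Equivalent to passing `--collections hmd` on the command line:
run(local_args=["--collections", "hmd"])
```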
## `alto2txt2fixture.cli`

### `show_fixture_tables`

```python
show_fixture_tables(
    run_settings: dotdict = settings,
    print_in_call: bool = True,
    data_provider_index: str = DATA_PROVIDER_INDEX,
) -> list[Table]
```

Print fixture tables specified in `settings.fixture_tables` in `rich.Table` format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `run_settings` | `dotdict` | Settings used to generate the fixture tables | `settings` |
| `print_in_call` | `bool` | Whether to print to console | `True` |
| `data_provider_index` | `str` | Key to index the data provider table | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| --- | --- |
| `list[Table]` | A `list` of `rich.Table` instances |

Example:

```pycon
>>> fixture_tables: list[Table] = show_fixture_tables(
...     settings,
...     print_in_call=False)
>>> len(fixture_tables)
1
>>> fixture_tables[0].title
'dataprovider'
>>> [column.header for column in fixture_tables[0].columns]
['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
>>> fixture_tables = show_fixture_tables(settings)
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
<BLANKLINE>
...dataprovider...Heritage...│ bl-hmd...│ hmd...
```

Note

It is possible for the example test to fail with different screen sizes. Try increasing the window or screen width of the terminal used to check before raising an issue.

Source code in `alto2txt2fixture/cli.py`
Generate a `rich.table.Table` for printing configuration to console.

Source code in `alto2txt2fixture/cli.py`

## `alto2txt2fixture.create_adjacent_tables`

Returns a list with corrected data from a provided dictionary.

Source code in `alto2txt2fixture/create_adjacent_tables.py`
### `csv2json_list`

```python
csv2json_list(
    csv_path: PathLike,
    output_path: Path = OUTPUT,
    saved: list[Path] | None = None,
    indent: int = JSON_INDENT,
) -> list
```

Save `csv_path` as a `json` file and return it as a `list`.

Source code in `alto2txt2fixture/create_adjacent_tables.py`
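A minimal sketch, assuming a `mitchells.Entry.csv` table (as named in the `get_outpaths_dict` example below) is present locally:

```python
from alto2txt2fixture.create_adjacent_tables import csv2json_list

# Assumed to also write a matching `.json` file to the default
# OUTPUT path; the rows are returned as a list:
records: list = csv2json_list("mitchells.Entry.csv")
```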
### `download_data`

```python
download_data(
    files_dict: RemoteDataFilesType = {},
    overwrite: bool = OVERWRITE,
    exclude: list[str] = [],
) -> None
```

Download files in `files_dict`, overwriting if specified.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `files_dict` | `RemoteDataFilesType` | `dict` of remote data files to download | `{}` |
| `overwrite` | `bool` | Whether to overwrite existing files | `OVERWRITE` |
| `exclude` | `list[str]` | Names of files to exclude from download | `[]` |

Example:

```pycon
>>> tmp: Path = getfixture('tmpdir')
>>> set_path: Path = tmp.chdir()
>>> download_data(exclude=[
...     "mitchells", "Newspaper-1", "linking"
... ])  # doctest: +ELLIPSIS
Excluding mitchells...
Excluding Newspaper-1...
Excluding linking...
Downloading cache...dict_admin_counties.json
100% ... 37/37 bytes
Downloading cache...dict_countries.json
100% ... 33.2/33.2 kB
Downloading cache...dict_historic_counties.json
100% ... 41.4/41.4 kB
Downloading cache...nlp_loc_wikidata_concat.csv
100% ... 59.8/59.8 kB
Downloading cache...wikidata_gazetteer_selected_columns.csv
100% ... 47.8/47.8 MB
```

Source code in `alto2txt2fixture/create_adjacent_tables.py`
Get a list from a string, which contains
### `get_outpaths_dict`

Return a `dict` of `csv` and `json` paths for each `module_name` table.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `names` | `Sequence[str]` | Iterable of names of each table | required |
| `module_name` | `str` | Name of the module each name is part of, that is added as a prefix | required |

Returns:

| Type | Description |
| --- | --- |
| `TableOutputConfigType` | A `dict` of table names to `csv` and `json` output paths |

Example:

```pycon
>>> from pprint import pprint
>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, "mitchells"))
{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},
 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},
 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',
                      'json': 'mitchells.PoliticalLeaning.json'},
 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}
```

Source code in `alto2txt2fixture/create_adjacent_tables.py`
### `run`

```python
run(
    files_dict: dict = {},
    files_to_download_overwrite: bool = OVERWRITE,
    saved: list[PathLike] = SAVED,
    time_stamp: str = "",
    output_path: Path = OUTPUT,
) -> None
```

Download, process and link `files_dict` to `json` and `csv`.

This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.

Source code in `alto2txt2fixture/create_adjacent_tables.py`
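A minimal sketch, assuming network access to the blob storage URL above:

```python
from alto2txt2fixture.create_adjacent_tables import run

# Download remote source tables and write linked JSON/CSV outputs
# to the default OUTPUT path:
run()
```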
## `alto2txt2fixture.jisc`

### `get_jisc_title`

```python
get_jisc_title(
    title: str,
    issue_date: str,
    jisc_papers: pd.DataFrame,
    input_sub_path: str,
    publication_code: str,
    abbr: str | None = None,
) -> str
```

Match a newspaper `title` with `jisc_papers` records.

Takes an `input_sub_path`, a `publication_code`, and an (optional) abbreviation for any newspaper to locate the `title` in the `jisc_papers` `DataFrame`. `jisc_papers` is usually loaded via the `setup_jisc_papers` function.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `title` | `str` | Target newspaper title | required |
| `issue_date` | `str` | Target newspaper issue date | required |
| `jisc_papers` | `pd.DataFrame` | `DataFrame` of JISC papers | required |
| `input_sub_path` | `str` | Path of files used to narrow down the query | required |
| `publication_code` | `str` | Unique code to match newspaper records | required |
| `abbr` | `str \| None` | An optional abbreviation of the newspaper title | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | A string estimating the JISC equivalent newspaper title |

Source code in `alto2txt2fixture/jisc.py`
### `setup_jisc_papers`

Create a `DataFrame` with the information in `JISC_PAPERS_CSV` in settings.

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | A `DataFrame` of JISC paper information |

Source code in `alto2txt2fixture/jisc.py`
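A minimal sketch combining the two functions above; the title, sub-path and publication code are hypothetical:

```python
import pandas as pd

from alto2txt2fixture.jisc import get_jisc_title, setup_jisc_papers

jisc_papers: pd.DataFrame = setup_jisc_papers()
title: str = get_jisc_title(
    title="The Example Gazette",         # hypothetical title
    issue_date="1865-01-01",
    jisc_papers=jisc_papers,
    input_sub_path="0000001/1865/0101",  # hypothetical sub-path
    publication_code="0000001",          # hypothetical code
)
```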
## `alto2txt2fixture.log`

Print `msg` in `colorama` `Fore.RED` and `exit()`.

If `silent`, `exit()` after call, else raise `RuntimeError` if `crash=True`.

Source code in `alto2txt2fixture/log.py`
## `alto2txt2fixture.parser`

### `fixtures`

```python
fixtures(
    filelist: list = [],
    model: str = "",
    translate: dict = {},
    rename: dict = {},
    uniq_keys: list = [],
) -> Generator[FixtureDict, None, None]
```

Generates fixtures for a specified model using a list of files.

This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `filelist` | `list` | A list of files to process and generate fixtures from. | `[]` |
| `model` | `str` | The name of the model for which fixtures are generated. | `''` |
| `translate` | `dict` | A nested dictionary representing the translation mapping for fields. The translated fields will be used as keys, and their corresponding primary keys (obtained from the provided files) will be used as values in the generated fixtures. | `{}` |
| `rename` | `dict` | A dictionary representing the renaming mapping for fields. | `{}` |
| `uniq_keys` | `list` | A list of fields that need to be considered for uniqueness in the fixtures. If specified, the fixtures will yield only unique items based on the combination of these fields. | `[]` |

Yields:

| Type | Description |
| --- | --- |
| `FixtureDict` | `FixtureDict` instances for the specified `model` |

Returns:

| Type | Description |
| --- | --- |
| `Generator[FixtureDict, None, None]` | This function generates fixtures but does not return any value. |

Source code in `alto2txt2fixture/parser.py`
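A minimal sketch, with a hypothetical cache file path and uniqueness key:

```python
from alto2txt2fixture.parser import fixtures

for fixture in fixtures(
    filelist=["cache/newspapers/issue/0000001.json"],  # hypothetical path
    model="newspapers.issue",
    uniq_keys=["issue_code"],  # hypothetical uniqueness field
):
    # Each yielded FixtureDict has `pk`, `model` and `fields` keys:
    print(fixture["model"], fixture["pk"])
```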
### `get_fields`

```python
get_fields(
    file: Union[Path, str, dict],
    translate: dict = {},
    rename: dict = {},
    allow_null: bool = False,
) -> dict
```

Retrieves fields from a file and performs modifications and checks.

This function takes a file (in various formats: `Path`, `str`, or `dict`) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Union[Path, str, dict]` | The file from which the fields are retrieved. | required |
| `translate` | `dict` | A nested dictionary representing the translation mapping for fields. | `{}` |
| `rename` | `dict` | A dictionary representing the renaming mapping for fields. | `{}` |
| `allow_null` | `bool` | Determines whether to allow `null` values in the retrieved fields. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `dict` | A dictionary representing the retrieved fields from the file, with modifications and checks applied. |

Raises:

| Type | Description |
| --- | --- |
| `RuntimeError` | If the file type is unsupported or if an error occurs during field retrieval or processing. |

Source code in `alto2txt2fixture/parser.py`
### `get_key_from`

Retrieves a specific key from a file and returns its value.

This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `item` | `Path` | The file from which the key is extracted. | required |
| `x` | `str` | The key to be retrieved from the file. | required |

Returns:

| Type | Description |
| --- | --- |
| `str` | The value of the specified key from the file. |

Source code in `alto2txt2fixture/parser.py`
Converts a list of fields into a nested dictionary representing a translator.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fields` | `list[TranslatorTuple]` | A list of tuples representing fields to be translated. | `[TranslatorTuple('', '', [])]` |

Returns:

| Type | Description |
| --- | --- |
| `dict` | A nested dictionary representing the translator. |

Source code in `alto2txt2fixture/parser.py`
### `parse`

Parses files from collections and generates fixtures for various models.

This function processes files from the specified collections and generates fixtures for different models, such as `newspapers.dataprovider`, `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`, `newspapers.issue`, and `newspapers.item`.

It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collections` | `list` | A list of collections from which files are processed and fixtures are generated. | required |
| `cache_home` | `str` | The directory path where the collections are located. | required |
| `output` | `str` | The directory path where the fixtures will be saved. | required |
| `max_elements_per_file` | `int` | The maximum number of elements per file when saving fixtures. | required |

Returns:

| Type | Description |
| --- | --- |
| `None` | This function generates fixtures but does not return any value. |

Source code in `alto2txt2fixture/parser.py`
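A minimal sketch, with hypothetical cache and output directories:

```python
from alto2txt2fixture.parser import parse

parse(
    collections=["hmd"],
    cache_home="./cache",             # hypothetical cache directory
    output="./output/fixtures/",
    max_elements_per_file=2_000_000,  # hypothetical batch size
)
```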
Resets the fixture directory by removing all JSON files inside it.

This function takes a directory path (`output`) as input and removes all JSON files within the directory.

Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `output` | `str \| Path` | The directory path of the fixture directory to be reset. | required |

Raises:

| Type | Description |
| --- | --- |
| `RuntimeError` | If an error occurs while clearing the fixture directory. |

Source code in `alto2txt2fixture/parser.py`

Generates unique items from a list of files based on specified keys.

This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the `get_key_from` function, and duplicate items are ignored.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `filelist` | `list` | A list of files from which unique items are generated. | required |
| `keys` | `list` | A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items. | `[]` |

Yields:

| Type | Description |
| --- | --- |
| `Any` | A unique item from `filelist`. |

Source code in `alto2txt2fixture/parser.py`
## `alto2txt2fixture.router`

### `Archive`

```python
Archive(
    path: str | Path,
    collection: str = "",
    report_id: str | None = None,
    jisc_papers: pd.DataFrame | None = None,
    json_indent: int = JSON_INDENT,
)
```

Manage extracting information from a ZIP archive.

The `Archive` class represents a zip archive of XML files. The class is used to extract information from a ZIP archive, and it contains several methods to process the data contained in the archive.

`open(Archive)` context manager

An `Archive` can be opened with a context manager, which creates a meta object, with timings for the object. When closed, it will save the meta JSON to the correct paths.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `path` | `Path` | The path to the zip archive. |
| `collection` | `str` | The collection of the XML files in the archive. Default is "". |
| `report` | `Path` | The file path of the report file for the archive. |
| `report_id` | `str` | The report ID for the archive. If not provided, a random UUID is generated. |
| `report_parent` | `Path` | The parent directory of the report file for the archive. |
| `jisc_papers` | `pd.DataFrame` | A DataFrame of JISC papers. |
| `size` | `str \| float` | The size of the archive, in human-readable format. |
| `size_raw` | `str \| float` | The raw size of the archive, in bytes. |
| `roots` | `Generator[ET.Element, None, None]` | The root elements of the XML documents contained in the archive. |
| `meta` | `dotdict` | Metadata about the archive, such as its path, size, and number of contents. |
| `json_indent` | `int` | Indentation formatting of `json` output files. |

Raises:

| Type | Description |
| --- | --- |
| `RuntimeError` | If the `path` does not exist. |

Constructor method.

Source code in `alto2txt2fixture/router.py`
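A minimal sketch of the context manager described above, with a hypothetical archive path:

```python
from alto2txt2fixture.router import Archive

archive = Archive("hmd-example.zip", collection="hmd")  # hypothetical zip
with archive:
    # `documents` yields a Document instance per XML file:
    for document in archive.documents:
        ...
```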
#### `documents` (property)

Property that calls the `get_documents` method.

#### `filelist` (property)

Returns the list of files in the zip file.

#### `get_documents`

A generator that yields instances of the `Document` class for each XML file in the ZIP archive.

It uses the `tqdm` library to display a progress bar in the terminal while it is running.

If the contents of the ZIP file are not empty, the method creates an instance of the `Document` class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the `Document` class. The instance of the `Document` class is then returned by the generator.

Yields:

| Type | Description |
| --- | --- |
| `Document` | A `Document` instance for each XML file in the archive |

Source code in `alto2txt2fixture/router.py`

#### `get_roots`

Yields the root elements of the XML documents contained in the archive.

Source code in `alto2txt2fixture/router.py`

### `Cache`

The `Cache` class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.

It is inherited by many other classes in this document.

Initializes the `Cache` class object.

Source code in `alto2txt2fixture/router.py`

#### `get_cache_path`

Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (`collection`, `kind`, and `id`) but can be changed when inherited.

Source code in `alto2txt2fixture/router.py`

#### `write_to_cache`

Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the `as_dict` method. If the cache path already exists, the function returns `True`.

Source code in `alto2txt2fixture/router.py`
### `Collection`

A `Collection` represents a group of newspaper archives from any passed `alto2txt` metadata output.

A `Collection` is initialised with a name and an optional pandas `DataFrame` of JISC papers. The `archives` property returns an iterable of the `Archive` objects within the collection.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `name` | `str` | Name of the collection (default "hmd") |
| `jisc_papers` | `pandas.DataFrame` | DataFrame of JISC papers, optional |

Constructor method.

Source code in `alto2txt2fixture/router.py`
### `DataProvider`

Bases: `Cache`

The `DataProvider` class extends the `Cache` class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `collection` | `str` | A string representing the publication collection |
| `kind` | `str` | Indication of object type |
| `providers_meta_data` | `list[FixtureDict]` | Structured dict of metadata for known collection sources |
| `collection_type` | `str` | Related data sources and potential linkage source |
| `index_field` | `str` | Field name for querying existing records |

Example:

```pycon
>>> from pprint import pprint
>>> hmd = DataProvider("hmd")
>>> hmd.pk
2
>>> pprint(hmd.as_dict())
{'code': 'bl-hmd',
 'collection': 'newspapers',
 'legacy_code': 'hmd',
 'name': 'Heritage Made Digital',
 'source_note': 'British Library-funded digitised newspapers provided by the '
                'British Newspaper Archive'}
```

Constructor method.

Source code in `alto2txt2fixture/router.py`

Properties:

* Return `self.providers_meta_data[self.collection]` or `{}`.
* Return `self.providers_meta_data[self.collection]` fields or `{}`.
* `pk`: Return `pk` if provided via `providers_meta_data`, else `None`.
* Return all `self.index_field` values from `providers_meta_data`.

#### `as_dict`

Return a `dict` of the data provider object.

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the DataProvider object |

Source code in `alto2txt2fixture/router.py`
### `Digitisation`

Bases: `Cache`

The `Digitisation` class extends the `Cache` class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.

Source code in `alto2txt2fixture/router.py`

`kind` (class attribute): A string that represents the type of the object, set to `"digitisation"`.

#### `as_dict`

A method that returns a dictionary representation of the digitisation object.

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the Digitisation object |

Source code in `alto2txt2fixture/router.py`
### `Document`

The `Document` class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `collection` | `str \| None` | A string that represents the collection of the publication |
| `root` | `ET.Element \| None` | An `xml` element that represents the root of the publication |
| `zip_file` | `str \| None` | A path to a valid `zip` file |
| `jisc_papers` | `pd.DataFrame \| None` | A `DataFrame` of JISC papers |
| `meta` | `dotdict \| None` | TODO |

Constructor method.

Source code in `alto2txt2fixture/router.py`
### `Ingest`

Bases: `Cache`

The `Ingest` class extends the `Cache` class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `collection` | `str` | A string that represents the collection of the publication |

Constructor method.

Source code in `alto2txt2fixture/router.py`

`kind` (class attribute): A string that represents the type of the object, set to `"ingest"`.

#### `as_dict`

A method that returns a dictionary representation of the ingest object.

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the Ingest object |

Source code in `alto2txt2fixture/router.py`
### `Issue`

```python
Issue(
    publication: ET.Element,
    newspaper: Optional[Newspaper] = None,
    collection: str = "",
    input_sub_path: str = "",
    meta: dotdict = dotdict(),
)
```

Bases: `Cache`

The `Issue` class extends the `Cache` class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `root` | | An xml element that represents the root of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `collection` | `str` | A string that represents the collection of the publication |
| `input_sub_path` | `str` | TODO |
| `meta` | `dotdict` | TODO |

Constructor method.

Source code in `alto2txt2fixture/router.py`

* `issue_code` (property): Sets up and saves the issue code for easy access as a property.
* `issue_date` (property): Sets up and saves the issue date for easy access as a property.
* `kind` (class attribute): A string that represents the type of the object, set to `"issue"`.

#### `as_dict`

A method that returns a dictionary representation of the issue object.

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the Issue object |

Source code in `alto2txt2fixture/router.py`

#### `get_cache_path`

Returns the path to the cache file for the issue object.

Returns:

| Type | Description |
| --- | --- |
| `Path` | Path to the cache file for the issue object |

Source code in `alto2txt2fixture/router.py`
### `Item`

```python
Item(
    root: ET.Element,
    issue_code: str = "",
    digitisation: dict = {},
    ingest: dict = {},
    collection: str = "",
    newspaper: Optional[Newspaper] = None,
    meta: dotdict = dotdict(),
)
```

Bases: `Cache`

The `Item` class extends the `Cache` class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `root` | `ET.Element` | An xml element that represents the root of the publication |
| `issue_code` | `str` | A string that represents the issue code |
| `digitisation` | `dict` | TODO |
| `ingest` | `dict` | TODO |
| `collection` | `str` | A string that represents the collection of the publication |
| `newspaper` | `Newspaper \| None` | The parent newspaper |
| `meta` | `dotdict` | TODO |

Constructor method.

Source code in `alto2txt2fixture/router.py`

* `item_code` (property): Sets up and saves the item code for easy access as a property.
* A property that sets up and saves the issue XML item for easy access.
* `kind` (class attribute): A string that represents the type of the object, set to `"item"`.

#### `as_dict`

A method that returns a dictionary representation of the item object (i.e. article).

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the Item object |

Source code in `alto2txt2fixture/router.py`

#### `get_cache_path`

Returns the path to the cache file for the item (article) object.

Returns:

| Type | Description |
| --- | --- |
| `Path` | Path to the cache file for the article object |

Source code in `alto2txt2fixture/router.py`

#### `write_to_cache`

Special cache-write function that appends rather than writes at the end of the process.

Returns:

| Type | Description |
| --- | --- |
| `None` | None |

Source code in `alto2txt2fixture/router.py`
### `Newspaper`

```python
Newspaper(
    root: ET.Element,
    collection: str = "",
    meta: dotdict = dotdict(),
    jisc_papers: Optional[pd.DataFrame] = None,
)
```

Bases: `Cache`

The `Newspaper` class extends the `Cache` class and represents a newspaper.

The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.

Attributes:

| Name | Description |
| --- | --- |
| `root` | An xml element that represents the root of the publication. |
| `collection` | A string that represents the collection of the publication. |
| `meta` | A dotdict object that holds metadata about the publication. |
| `jisc_papers` | A pandas DataFrame object for JISC paper information. |

Constructor method.

Source code in `alto2txt2fixture/router.py`

`kind` (class attribute): A string that represents the type of the object, set to `"newspaper"`.

Properties:

* A property that returns the nested directories in which we want to save the cache file, as a `list` of the desired directories in descending order.
* A property that returns the code of the publication, as a `str`.
* A property that returns the title of the newspaper, as a `str`.

#### `as_dict`

A method that returns a dictionary representation of the newspaper object.

Returns:

| Type | Description |
| --- | --- |
| `dict` | Dictionary representation of the Newspaper object |

Source code in `alto2txt2fixture/router.py`

#### `get_cache_path`

Returns the path to the cache file for the newspaper object.

Returns:

| Type | Description |
| --- | --- |
| `Path` | Path to the cache file for the newspaper object |

Source code in `alto2txt2fixture/router.py`

A method that returns the publication code from the input sub-path of the publication process.

Returns:

| Type | Description |
| --- | --- |
| `str \| None` | The code of the publication |

Source code in `alto2txt2fixture/router.py`
### `route`

```python
route(
    collections: list,
    cache_home: str,
    mountpoint: str,
    jisc_papers_path: str,
    report_dir: str,
) -> None
```

This function is responsible for setting up the path for the alto2txt mountpoint, setting up the JISC papers and routing the collections for processing.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collections` | `list` | List of collection names | required |
| `cache_home` | `str` | Directory path for the cache | required |
| `mountpoint` | `str` | Directory path for the alto2txt mountpoint | required |
| `jisc_papers_path` | `str` | Path to the JISC papers | required |
| `report_dir` | `str` | Path to the report directory | required |

Returns:

| Type | Description |
| --- | --- |
| `None` | None |

Source code in `alto2txt2fixture/router.py`
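A minimal sketch with hypothetical paths, reusing the CLI default for the mountpoint:

```python
from alto2txt2fixture.router import route

route(
    collections=["hmd"],
    cache_home="./cache",                        # hypothetical
    mountpoint="./input/alto2txt/",              # documented CLI default
    jisc_papers_path="./input/jisc_papers.csv",  # hypothetical
    report_dir="./output/reports/",              # hypothetical
)
```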
## `alto2txt2fixture.settings`

The `settings` module provides configuration for running `alto2txt2fixture`.

Most of these are managed within the `settings` variable within this module.

Note

See the command line interface parameters documentation for means of modifying `settings` when run.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `JSON_INDEX` | | Amount of indentation to include in output `JSON` files |
| `DATA_PROVIDER_INDEX` | `Final[str]` | The index field used for data provider records |
| `NEWSPAPER_COLLECTION_METADATA` | `Final[list[FixtureDict]]` | A list of `FixtureDict` records for newspaper collections |
| `SETUP_TITLE` | `str` | The title printed at the command line |
| `settings` | `dotdict` | A `dotdict` of configuration settings |
## Types

### `FixtureDict`

Bases: `TypedDict`

A `dict` structure to ease use as a `json` database fixture.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `pk` | `int` | An id to uniquely define and query each entry |
| `model` | `str` | What model a given record is for |
| `fields` | `dict[str, Any]` | A `dict` of each record's fields |
### `TranslatorTuple`

Bases: `NamedTuple`

A named tuple of fields for translation.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `start` | `str` | A string representing the starting field name. |
| `finish` | `str \| list` | A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list. |
| `lst` | `list[dict]` | A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the `start` and `finish` attributes. |
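A minimal sketch of constructing a `TranslatorTuple`, with hypothetical field names and items (the import path is assumed):

```python
from alto2txt2fixture.types import TranslatorTuple  # assumed import path

items = [{"pk": 1, "fields": {"code": "bl-hmd"}}]  # hypothetical items
field = TranslatorTuple(
    start="data_provider__code",  # hypothetical starting field name
    finish="code",                # direct mapping from each item's `code`
    lst=items,
)
```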
### `dotdict`

Bases: `dict`

dot.notation access to dictionary attributes.
## `alto2txt2fixture.utils`

### `check_newspaper_collection_configuration`

```python
check_newspaper_collection_configuration(
    collections: Iterable[str] = settings.COLLECTIONS,
    newspaper_collections: Iterable[
        FixtureDict
    ] = NEWSPAPER_COLLECTION_METADATA,
    data_provider_index: str = DATA_PROVIDER_INDEX,
) -> set[str]
```

Check that the names in `collections` match the names in `newspaper_collections`.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collections` | `Iterable[str]` | Names of newspaper collections, defaults to `settings.COLLECTIONS` | `settings.COLLECTIONS` |
| `newspaper_collections` | `Iterable[FixtureDict]` | Newspaper collections in a list of `FixtureDict` records | `NEWSPAPER_COLLECTION_METADATA` |
| `data_provider_index` | `str` | Key used to index `newspaper_collections` | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| --- | --- |
| `set[str]` | A `set` of collection names that do not match |

Source code in `alto2txt2fixture/utils.py`
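A minimal sketch using the documented defaults:

```python
from alto2txt2fixture.utils import check_newspaper_collection_configuration

# With no arguments this compares settings.COLLECTIONS against
# NEWSPAPER_COLLECTION_METADATA; an empty set is assumed to mean
# all names match:
unmatched: set[str] = check_newspaper_collection_configuration()
print(unmatched)
```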
### `clear_cache`

Clears the cache directory by removing all `.json` files in it.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dir` | `str \| Path` | The path of the directory to be cleared. | required |

Source code in `alto2txt2fixture/utils.py`

Create a lookup dictionary from a list of dictionaries.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lst` | `list` | A list of dictionaries that should be used to generate the lookup. | `[]` |
| `on` | `list` | A list of keys from the dictionaries in the list that should be used as the keys in the lookup. | `[]` |

Returns:

| Type | Description |
| --- | --- |
| `dict` | The generated lookup dictionary. |

Source code in `alto2txt2fixture/utils.py`
### `dict_from_list_fixture_fields`

```python
dict_from_list_fixture_fields(
    fixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,
    field_name: str = DATA_PROVIDER_INDEX,
) -> dict[str, FixtureDict]
```

Create a `dict` from `fixture_list` with `field_name` as the key.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fixture_list` | `Iterable[FixtureDict]` | `Iterable` of `FixtureDict` records to index | `NEWSPAPER_COLLECTION_METADATA` |
| `field_name` | `str` | Key for values within `fixture_list` fields | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| --- | --- |
| `dict[str, FixtureDict]` | A `dict` of `FixtureDict` records indexed by `field_name` values |

Source code in `alto2txt2fixture/utils.py`
### `export_fixtures`

```python
export_fixtures(
    fixture_tables: dict[str, Sequence[FixtureDict]],
    path: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,
    prefix: str = "test-",
    add_created: bool = True,
    formats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,
) -> None
```

Export `fixture_tables` in `formats`.

Note

This is still in an experimental phase of development and not recommended for production.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fixture_tables` | `dict[str, Sequence[FixtureDict]]` | `dict` of table names to sequences of `FixtureDict` records | required |
| `path` | `str \| PathLike` | Path to save exports in | `settings.FIXTURE_TABLES_OUTPUT` |
| `prefix` | `str` | A `str` to prefix export filenames with | `'test-'` |
| `formats` | `Sequence[EXPORT_FORMATS]` | List of `EXPORT_FORMATS` to export | `settings.FIXTURE_TABLES_FORMATS` |

Example:

```pycon
>>> test_fixture_tables: dict[str, FixtureDict] = {
...     'test0': NEWSPAPER_COLLECTION_METADATA,
...     'test1': NEWSPAPER_COLLECTION_METADATA}
>>> export_fixtures(test_fixture_tables, path='tests/')
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
<BLANKLINE>
...Warning: Saving test0...
...Warning: Saving test1...
>>> from pandas import read_csv
>>> fixture0_json = load_json('tests/test-test0-1.json')
>>> fixture0_df = read_csv('tests/test-test0-1.csv')
>>> fixture1_json = load_json('tests/test-test1-1.json')
>>> fixture1_df = read_csv('tests/test-test1-1.csv')
>>> fixture0_json == fixture1_json
True
>>> all(fixture0_df == fixture1_df)
True
>>> all(field in fixture0_json[0]['fields']
...     for field in ['created_at', 'updated_at'])
True
>>> fixture0_json[1]['pk']
2
>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]
'hmd'
>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()
[2, 'hmd']
```

Source code in `alto2txt2fixture/utils.py`
### `filter_json_fields`

```python
filter_json_fields(
    json_results: list | dict | None = None,
    file_path: PathLike | None = None,
    fields: Sequence[str] = [],
    value: Hashable = "",
    **kwargs: Hashable
) -> dict | list
```

Return keys and values from `json_results` where any `fields` equal `value`.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `PathLike \| None` | The file path to load and filter if `json_results` is not provided | `None` |
| `fields` | `Sequence[str]` | Which fields to check equal `value` | `[]` |
| `value` | `Hashable` | Value to filter by | `''` |

Returns:

| Type | Description |
| --- | --- |
| `dict \| list` | A `dict` or `list` of records where `fields` equal `value` |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If neither `json_results` nor `file_path` is provided |

Example:

```pycon
>>> from pprint import pprint
>>> entry_fixture: dict = [
...     {"pk": 4889, "model": "mitchells.entry",
...      "fields": {"title": "BIRMINGHAM POST .",
...                 "price_raw": ['2d'],
...                 "year": 1920,
...                 "date_established_raw": "1857",
...                 "persons": [], "newspaper": ""}},
...     {"pk": 9207, "model": "mitchells.entry",
...      "fields": {"title": "ULVERSTONE ADVERTISER .",
...                 "price_raw": ['2 ½ d', '3 ½ d'],
...                 "year": 1856,
...                 "date_established_raw": "1848",
...                 "persons": ['Stephen Soulby'],
...                 "newspaper": "",}},
...     {"pk": 15, "model": "mitchells.entry",
...      "fields": {"title": "LLOYD'S WEEKLY LONDON NEWSPAPER .",
...                 "price_raw": ['2d', '3d'],
...                 "year": 1857,
...                 "date_established_raw": "November , 1842",
...                 "persons": ['Mr. Douglas Jerrold', 'Edward Lloyd'],
...                 "newspaper": 1187}}
...     ]
>>> pprint(filter_json_fields(entry_fixture,
...                           fields=("newspaper", "persons"),
...                           value=""))
[{'fields': {'date_established_raw': '1857',
             'newspaper': '',
             'persons': [],
             'price_raw': ['2d'],
             'title': 'BIRMINGHAM POST .',
             'year': 1920},
  'model': 'mitchells.entry',
  'pk': 4889},
 {'fields': {'date_established_raw': '1848',
             'newspaper': '',
             'persons': ['Stephen Soulby'],
             'price_raw': ['2 ½ d', '3 ½ d'],
             'title': 'ULVERSTONE ADVERTISER .',
             'year': 1856},
  'model': 'mitchells.entry',
  'pk': 9207}]
```

Source code in `alto2txt2fixture/utils.py`
### `fixture_fields`

```python
fixture_fields(
    fixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False
) -> tuple[str, ...] | dict[str, Any]
```

Generate a tuple of `FixtureDict` `field` names.

This is not in the `utils` module to avoid a circular import.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fixture_dict` | `FixtureDict` | A `FixtureDict` to generate field names from | required |
| `include_pk` | `bool` | Whether to include the `pk` field | `True` |

Example:

```pycon
>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])
('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')
>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)
('name', 'code', 'legacy_code', 'collection', 'source_note')
>>> hmd_dict: dict[str, Any] = fixture_fields(
...     NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)
>>> hmd_dict['code']
'bl-hmd'
>>> hmd_dict['pk']
2
>>> hmd_dict = fixture_fields(
...     NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)
>>> 'pk' in hmd_dict
False
```

Source code in `alto2txt2fixture/utils.py`
### `fixture_or_default_dict`

```python
fixture_or_default_dict(
    key: str,
    fixture_dict: dict[str, FixtureDict],
    default_dict: FixtureDict | dict = {},
) -> FixtureDict | dict
```

Return a `FixtureDict` from `fixture_dict` via `key` index, else `default_dict`.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `key` | `str` | A `str` to query `fixture_dict` with | required |
| `fixture_dict` | `dict[str, FixtureDict]` | A `dict` of `str` keys to `FixtureDict` records | required |
| `default_dict` | `FixtureDict \| dict` | A `dict` to return if `key` is not in `fixture_dict` | `{}` |

Example:

```pycon
>>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(
...     NEWSPAPER_COLLECTION_METADATA)
>>> hmd_dict: FixtureDict = fixture_or_default_dict(
...     'hmd', newspaper_dict
... )
>>> fixture_or_default_dict(
...     'hmd', NEWSPAPER_COLLECTION_METADATA
... )
{}
>>> fixture_or_default_dict(
...     'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}
... )
{'a': 'default'}
```

Source code in `alto2txt2fixture/utils.py`
### `fixtures_dict2csv`

```python
fixtures_dict2csv(
    fixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],
    prefix: str = "",
    output_path: PathLike | str = settings.OUTPUT,
    index: bool = False,
    max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
) -> None
```

Saves fixtures generated by a generator to separate CSV files.

This function takes an `Iterable` or `Generator` of fixtures and saves them to separate CSV files. The fixtures are saved in batches, where each batch is determined by the `max_elements_per_file` parameter.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fixtures` | `Iterable[FixtureDict] \| Generator[FixtureDict, None, None]` | An iterable or generator of the fixtures to be saved. | required |
| `prefix` | `str` | A string prefix to be added to the file names of the saved fixtures. | `''` |
| `output_path` | `PathLike \| str` | Path to the folder fixtures are saved to | `settings.OUTPUT` |
| `max_elements_per_file` | `int` | Maximum number of elements to write per file | `settings.MAX_ELEMENTS_PER_FILE` |

Returns:

| Type | Description |
| --- | --- |
| `None` | This function saves fixtures to files and does not return a value. |

Source code in `alto2txt2fixture/utils.py`
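A minimal sketch, mirroring the `save_fixture` example below but writing CSV (the metadata import path is assumed):

```python
from alto2txt2fixture.settings import NEWSPAPER_COLLECTION_METADATA  # assumed import path
from alto2txt2fixture.utils import fixtures_dict2csv

# Assumed to write batches such as `tests/test-1.csv`:
fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA, prefix="test", output_path="tests/")
```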
### `gen_fixture_tables`

```python
gen_fixture_tables(
    fixture_tables: dict[str, list[FixtureDict]] = {},
    include_fixture_pk_column: bool = True,
) -> Generator[Table, None, None]
```

Generator of `rich.Table` instances from `FixtureDict` configuration tables.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fixture_tables` | `dict[str, list[FixtureDict]]` | `dict` of table names to lists of `FixtureDict` records | `{}` |
| `include_fixture_pk_column` | `bool` | Whether to include the `pk` column | `True` |

Example:

```pycon
>>> table_name: str = "data_provider"
>>> tables = tuple(
...     gen_fixture_tables(
...         {table_name: NEWSPAPER_COLLECTION_METADATA}
...     ))
>>> len(tables)
1
>>> assert tables[0].title == table_name
>>> [column.header for column in tables[0].columns]
['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']
```

Source code in `alto2txt2fixture/utils.py`
This function takes in a `Path` object `path` and returns a list of lists of `zipfiles` sorted and chunked according to certain conditions defined in the `settings` object (see `settings.CHUNK_THRESHOLD`).

Note: the function will also skip zip files of a certain file size, which can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `Path` | The input path where the zipfiles are located | required |

Returns:

| Type | Description |
| --- | --- |
| `list` | A list of lists of `zipfiles`, chunked as described above |

Source code in `alto2txt2fixture/utils.py`
Get a string key from a dictionary using values from specified keys.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `x` | `dict` | A dictionary from which the key is generated. | `dict()` |
| `on` | `list` | A list of keys from the dictionary that should be used to generate the key. | `[]` |

Returns:

| Type | Description |
| --- | --- |
| `str` | The generated string key. |

Source code in `alto2txt2fixture/utils.py`

Provides the path to any given lockfile, which controls whether any existing files should be overwritten or not.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collection` | `str` | Collection folder name | required |
| `kind` | `NewspaperElements` | Either `newspaper`, `issue` or `item` | required |
| `dic` | `dict` | A dictionary with required information for either `kind` of lockfile | required |

Returns:

| Type | Description |
| --- | --- |
| `Path` | Path to the resulting lockfile |

Source code in `alto2txt2fixture/utils.py`
Return `datetime.now()` as either a string or `datetime` object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `as_str` | `bool` | Whether to return `now` as a `str` | `False` |

Returns:

| Type | Description |
| --- | --- |
| `datetime.datetime \| str` | `datetime.now()` as a `datetime` object, or as a `str` if `as_str` is `True` |

Source code in `alto2txt2fixture/utils.py`

Converts an input value into a `Path` object if it's not already one.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str \| Path` | The input value, which can be a string or a `Path` object. | required |

Returns:

| Type | Description |
| --- | --- |
| `Path` | The input value as a `Path` object. |

Source code in `alto2txt2fixture/utils.py`
Returns a nice string for any given file size.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str \| Path` | Path to read the size from | required |
| `raw` | `bool` | Whether to return the file size as total number of bytes or a human-readable MB/GB amount | `False` |

Returns:

| Type | Description |
| --- | --- |
| `str \| float` | The file size in bytes if `raw` is `True`, else a human-readable `str` |

Source code in `alto2txt2fixture/utils.py`

Return an ordered glob, filtering out any pesky, unwanted `.DS_Store` files from macOS.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str` | Path to a directory to filter | required |

Returns:

| Type | Description |
| --- | --- |
| `list` | Sorted list of files contained in the provided path without the ones whose names start with a `.` |

Source code in `alto2txt2fixture/utils.py`
### `list_json_files`

```python
list_json_files(
    p: str | Path,
    drill: bool = False,
    exclude_names: list = [],
    include_names: list = [],
) -> Generator[Path, None, None] | list[Path]
```

List `json` files under the path specified in `p`.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str \| Path` | The path to search for `json` files | required |
| `drill` | `bool` | A flag indicating whether to drill down the subdirectories or not. Default is `False` | `False` |
| `exclude_names` | `list` | A list of file names to exclude from the search result. Default is an empty list | `[]` |
| `include_names` | `list` | A list of file names to include in the search result. If provided, the `exclude_names` argument is ignored. Default is an empty list | `[]` |

Returns:

| Type | Description |
| --- | --- |
| `Generator[Path, None, None] \| list[Path]` | A list of `Path` objects pointing to the found `json` files |

Source code in `alto2txt2fixture/utils.py`
### `load_json`

Easier access to reading `json` files.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str \| Path` | Path to read `json` from | required |
| `crash` | `bool` | Whether the program should crash if there is a `json` decode error | `False` |

Returns:

| Type | Description |
| --- | --- |
| `dict \| list` | The decoded `json` contents, or an empty `dict` or `list` if the file cannot be decoded and `crash` is `False` |

Source code in `alto2txt2fixture/utils.py`

### `load_multiple_json`

```python
load_multiple_json(
    p: str | Path,
    drill: bool = False,
    filter_na: bool = True,
    crash: bool = False,
) -> list
```

Load multiple `json` files and return a list of their content.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `p` | `str \| Path` | The path to search for `json` files | required |
| `drill` | `bool` | A flag indicating whether to drill down the subdirectories or not. Default is `False` | `False` |
| `filter_na` | `bool` | A flag indicating whether to filter out content that is `None`. Default is `True` | `True` |
| `crash` | `bool` | A flag indicating whether to raise an exception when an error occurs while loading a `json` file. Default is `False` | `False` |

Returns:

| Type | Description |
| --- | --- |
| `list` | A `list` of the content of the loaded `json` files |

Source code in `alto2txt2fixture/utils.py`
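A minimal sketch of the two listing/loading helpers together, assuming JSON fixtures exist under `./output/fixtures/`:

```python
from alto2txt2fixture.utils import list_json_files, load_multiple_json

paths = list(list_json_files("./output/fixtures/", drill=True))
contents: list = load_multiple_json("./output/fixtures/", drill=True)
print(len(paths), len(contents))
```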
Writes a '.' to a lockfile, after making sure the parent directory exists.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lockfile` | `Path` | The path to the lock file to be created | required |

Returns:

| Type | Description |
| --- | --- |
| `None` | None |

Source code in `alto2txt2fixture/utils.py`
save_fixture(
+ generator: Sequence | Generator = [],
+ prefix: str = "",
+ output_path: PathLike | str = settings.OUTPUT,
+ max_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,
+ add_created: bool = True,
+ json_indent: int = JSON_INDENT,
+) -> None
+
Saves fixtures generated by a generator to separate JSON files.
+This function takes a generator and saves the generated fixtures to
+separate JSON files. The fixtures are saved in batches, where each batch
+is determined by the max_elements_per_file
parameter.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
generator |
+
+ Sequence | Generator
+ |
+
+
+
+ A generator that yields the fixtures to be saved. + |
+
+ []
+ |
+
prefix |
+
+ str
+ |
+
+
+
+ A string prefix to be added to the file names of the +saved fixtures. + |
+
+ ''
+ |
+
output_path |
+
+ PathLike | str
+ |
+
+
+
+ Path to folder fixtures are saved to + |
+
+ settings.OUTPUT
+ |
+
max_elements_per_file |
+
+ int
+ |
+
+
+
+ Maximum |
+
+ settings.MAX_ELEMENTS_PER_FILE
+ |
+
add_created |
+
+ bool
+ |
+
+
+
+ Whether to add |
+
+ True
+ |
+
json_indent |
+
+ int
+ |
+
+
+
+ Number of indent spaces per line in saved |
+
+ JSON_INDENT
+ |
+
Returns:

| Type | Description |
|------|-------------|
| `None` | This function saves the fixtures to files but does not return any value |

Example:

```pycon
>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,
...              prefix='test', output_path='tests/')
>>> imported_fixture = load_json('tests/test-1.json')
>>> imported_fixture[1]['pk']
2
>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]
'hmd'
>>> 'created_at' in imported_fixture[1]['fields']
True
```

Source code in `alto2txt2fixture/utils.py`
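Beyond the doctest above, the batching behaviour can be sketched as follows (the fixture dicts are illustrative, not real collection metadata):

```python
from alto2txt2fixture.utils import save_fixture

# a sketch: 5,000 illustrative fixture dicts are split into numbered files
# of at most 2,000 records each: demo-1.json, demo-2.json, demo-3.json
fixtures = (
    {"pk": pk, "model": "newspapers.newspaper", "fields": {"title": f"Paper {pk}"}}
    for pk in range(1, 5001)
)
save_fixture(fixtures, prefix="demo", output_path="output/", max_elements_per_file=2000)
```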
write_json

```python
write_json(
    p: str | Path,
    o: dict,
    add_created: bool = True,
    json_indent: int = JSON_INDENT,
) -> None
```

Easier access to writing `json` files. Checks whether the parent directory exists.
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | `str \| Path` | Path to write the `json` to | *required* |
| `o` | `dict` | Object to write to `json` | *required* |
| `add_created` | `bool` | If set to `True`, will add `created_at` and `updated_at` timestamps | `True` |
| `json_indent` | `int` | What indentation format to write out the `JSON` with | `JSON_INDENT` |
Returns:

| Type | Description |
|------|-------------|
| `None` | None |

Source code in `alto2txt2fixture/utils.py`
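Together with `load_json`, this gives a simple round trip (a sketch; the path and payload are illustrative):

```python
from alto2txt2fixture.utils import load_json, write_json

# a sketch: write a fixture-style dict, then read it back
payload = {"pk": 1, "model": "newspapers.newspaper", "fields": {"title": "Example"}}
write_json("example.json", payload, add_created=False)
assert load_json("example.json")["pk"] == 1
```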
Using `poetry` to run

The program should run automatically with the following command:

```console
$ poetry run a2t2f-news
```

Alternatively, if you want to add optional parameters and don't want to use the standard `poetry` script to run, you can use the (somewhat convoluted) `poetry run alto2txt2fixture/run.py` and provide any optional parameters. You can see a list of all the "Optional parameters" below. For example, if you want to only include the `hmd` collection:

```console
$ poetry run alto2txt2fixture/run.py --collections hmd
```

Alternative: Run the script without poetry

If you find yourself in trouble with `poetry`, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:

```console
$ python alto2txt2fixture/run.py --collections hmd
```
Note

See the list under `[tool.poetry.dependencies]` in `pyproject.toml` for the dependencies that would need to be installed for `alto2txt2fixture` to work outside a `python` `poetry` environment.
The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are; a combined example follows the table.
| Flag | Description | Default value |
|------|-------------|---------------|
| `-c`, `--collections` | Which collections to process in the mounted alto2txt directory | `hmd`, `lwm`, `jisc`, `bna` |
| `-o`, `--output` | Into which directory should the processed files be put? | `./output/fixtures/` |
| `-m`, `--mountpoint` | Where are the alto2txt directories mounted? | `./input/alto2txt/` |
| `-t`, `--test-config` | Print the config table but do not run | `False` |
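For instance, several of these flags can be combined in a single call (a sketch; the paths are the defaults shown above):

```console
$ poetry run alto2txt2fixture/run.py \
    --collections hmd lwm \
    --output ./output/fixtures/ \
    --test-config
```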
"},{"location":"running.html","title":"Running the Program","text":""},{"location":"running.html#using-poetry-to-run","title":"Using poetry
to run","text":"The program should run automatically with the following command:
$ poetry run a2t2f-news\n
Alternatively, if you want to add optional parameters and don\u2019t want to use the standard poetry
script to run, you can use the (somewhat convoluted) poetry run alto2txt2fixture/run.py
and provide any optional parameters. You can see a list of all the \u201cOptional parameters\u201d below. For example, if you want to only include the hmd
collection:
$ poetry run alto2txt2fixture/run.py --collections hmd\n
"},{"location":"running.html#alternative-run-the-script-without-poetry","title":"Alternative: Run the script without poetry","text":"If you find yourself in trouble with poetry
, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:
$ python alto2txt2fixture/run.py --collections hmd\n
Note
See the list under [tool.poetry.dependencies]
in pyproject.toml
for a list of dependencies that would need to be installed for alto2txt2fixture
to work outside a python poetry
environment.
The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are.
Flag Description Default value-c
, --collections
Which collections to process in the mounted alto2txt directory hmd
, lwm
, jisc
, bna
-o
, --output
Into which directory should the processed files be put? ./output/fixtures/
-m
, --mountpoint
Where is the alto2txt directories mounted? ./input/alto2txt/
-t
, --test-config
Print the config table but do not run False
"},{"location":"running.html#successfully-running-the-program-an-example","title":"Successfully running the program: An example","text":""},{"location":"understanding-results.html","title":"Understanding the Results","text":""},{"location":"understanding-results.html#the-resulting-file-structure","title":"The resulting file structure","text":"The examples below follow standard settings
If you choose other settings for when you run the program, your output directory may look different from the information on this page.
"},{"location":"understanding-results.html#reports","title":"Reports","text":"Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports
directory, you\u2019ll find a JSON file for each alto2txt
directory (organised by NLP identifier).
The report structure, thus, looks like this:
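(A sketch of that structure; the hash and the NLP identifiers are illustrative:)

```
reports/
└── <unique-hash>/
    ├── 0002090.json
    └── 0002194.json
```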
The JSON file has some good troubleshooting information. You'll find that the contents are structured as a Python `dictionary` (or JavaScript `Object`). Here is an example:
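(Reconstructed as a sketch; every value below is illustrative, but the keys match the table that follows:)

```json
{
  "path": "input/alto2txt/hmd/0002090.zip",
  "bytes": 1048576,
  "size": "1 MB",
  "contents": 1234,
  "start": "2023-01-01 12:00:00",
  "newspaper_paths": ["cache/0002090/"],
  "publication_codes": ["0002090"],
  "issue_paths": ["cache/0002090/1850/0101/"],
  "item_paths": ["cache/0002090/1850/0101/art0001.json"],
  "end": "2023-01-01 12:00:42",
  "seconds": 42,
  "microseconds": 123456
}
```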
Here is an explanation of each of the keys in the dictionary:
| Key | Explanation | Data type |
|-----|-------------|-----------|
| `path` | The input path for the zip file that is being converted. | `string` |
| `bytes` | The size of the input zip file represented in bytes. | `integer` |
| `size` | The size of the input zip file represented in a human-readable string. | `string` |
| `contents` | #TODO #3 | `integer` |
| `start` | Date and time when processing started (see also `end` below). | `datestring` |
| `newspaper_paths` | #TODO #3 | `list` (`string`) |
| `publication_codes` | A list of the NLPs that are contained in the input zip file. | `list` (`string`) |
| `issue_paths` | A list of all the issue paths that are contained in the cache directory. | `list` (`string`) |
| `item_paths` | A list of all the item paths that are contained in the cache directory. | `list` (`string`) |
| `end` | Date and time when processing ended (see also `start` above). | `datestring` |
| `seconds` | Seconds that the script spent interpreting the zip file (should be added to the `microseconds` below). | `integer` |
| `microseconds` | Microseconds that the script spent interpreting the zip file (should be added to the `seconds` above). | `integer` |
"},{"location":"understanding-results.html#fixtures","title":"Fixtures","text":"The most important output of the script is contained in the fixtures
directory. This directory contains JSON files for all the different columns in the corresponding Django metadata database (i.e. DataProvider
, Digitisation
, Ingest
, Issue
, Newspaper
, and Item
). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6
elements*:
Each JSON file contains a Python-like list
(JavaScript Array
) of dictionaries
(JavaScript Objects
), which have a primary key (pk
), the related database model (in the example below the Django newspapers
app\u2019s newspaper
table), and a nested dictionary
/Object
which contains all the values for the database\u2019s table entry:
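(A sketch of that shape; the `title` value is illustrative:)

```json
[
  {
    "pk": 1,
    "model": "newspapers.newspaper",
    "fields": {
      "title": "The Example Gazette",
      "created_at": "2023-01-01T12:00:00",
      "updated_at": "2023-01-01T12:00:00"
    }
  }
]
```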
* The maximum elements per file can be adjusted in the `settings.py` file's `settings` object's `MAX_ELEMENTS_PER_FILE` value.
"},{"location":"reference/alto2txt2fixture/cli.html","title":"cli","text":""},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_fixture_tables","title":"show_fixture_tables","text":"show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]\n
Print fixture tables specified in settings.fixture_tables
in rich.Table
format.
Parameters:
Name Type Description Defaultrun_settings
dotdict
alto2txt2fixture
run configuration
settings
print_in_call
bool
whether to print to console (will use console
variable if so)
True
data_provider_index
str
key to index dataprovider
from NEWSPAPER_COLLECTION_METADATA
DATA_PROVIDER_INDEX
Returns:
Type Descriptionlist[Table]
A list
of rich.Table
renders from configurations in run_settings.FIXTURE_TABLES
>>> fixture_tables: list[Table] = show_fixture_tables(\n... settings,\n... print_in_call=False)\n>>> len(fixture_tables)\n1\n>>> fixture_tables[0].title\n'dataprovider'\n>>> [column.header for column in fixture_tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n>>> fixture_tables = show_fixture_tables(settings)\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n
Note It is possible for the example test to fail in different screen sizes. Try increasing the window or screen width of terminal used to check before raising an issue.
Source code inalto2txt2fixture/cli.py
def show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]:\n\"\"\"Print fixture tables specified in ``settings.fixture_tables`` in `rich.Table` format.\n Arguments:\n run_settings: `alto2txt2fixture` run configuration\n print_in_call: whether to print to console (will use ``console`` variable if so)\n data_provider_index: key to index `dataprovider` from ``NEWSPAPER_COLLECTION_METADATA``\n Returns:\n A `list` of `rich.Table` renders from configurations in ``run_settings.FIXTURE_TABLES``\n Example:\n ```pycon\n >>> fixture_tables: list[Table] = show_fixture_tables(\n ... settings,\n ... print_in_call=False)\n >>> len(fixture_tables)\n 1\n >>> fixture_tables[0].title\n 'dataprovider'\n >>> [column.header for column in fixture_tables[0].columns]\n ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n >>> fixture_tables = show_fixture_tables(settings)\n ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n <BLANKLINE>\n ...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n ```\n Note:\n It is possible for the example test to fail in different screen sizes. Try\n increasing the window or screen width of terminal used to check before\n raising an issue.\n \"\"\"\nif run_settings.FIXTURE_TABLES:\nif \"dataprovider\" in run_settings.FIXTURE_TABLES:\ncheck_newspaper_collection_configuration(\nrun_settings.COLLECTIONS,\nrun_settings.FIXTURE_TABLES[\"dataprovider\"],\ndata_provider_index=data_provider_index,\n)\nconsole_tables: list[Table] = list(\ngen_fixture_tables(run_settings.FIXTURE_TABLES)\n)\nif print_in_call:\nfor console_table in console_tables:\nconsole.print(console_table)\nreturn console_tables\nelse:\nreturn []\n
"},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_setup","title":"show_setup","text":"show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs: str) -> None\n
Generate a rich.table.Table
for printing configuration to console.
alto2txt2fixture/cli.py
def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:\n\"\"\"Generate a `rich.table.Table` for printing configuration to console.\"\"\"\nif clear and os.name == \"posix\":\nos.system(\"clear\")\nelif clear:\nos.system(\"cls\")\ntable = Table(title=title)\ntable.add_column(\"Setting\", justify=\"right\", style=\"cyan\", no_wrap=True)\ntable.add_column(\"Value\", style=\"magenta\")\nfor key, value in kwargs.items():\ntable.add_row(str(key), str(value))\nconsole.print(table)\nreturn\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html","title":"create_adjacent_tables","text":""},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.correct_dict","title":"correct_dict","text":"correct_dict(o: dict) -> list\n
Returns a list with corrected data from a provided dictionary.
Source code inalto2txt2fixture/create_adjacent_tables.py
def correct_dict(o: dict) -> list:\n\"\"\"Returns a list with corrected data from a provided dictionary.\"\"\"\nreturn [(k, v[0], v[1]) for k, v in o.items() if not v[0].startswith(\"Q\")] + [\n(k, v[1], v[0]) for k, v in o.items() if v[0].startswith(\"Q\")\n]\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.csv2json_list","title":"csv2json_list","text":"csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list\n
Save csv_path
as a json
file and return as a list
.
alto2txt2fixture/create_adjacent_tables.py
def csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list:\n\"\"\"Save `csv_path` as a `json` file and return as a `list`.\"\"\"\njson_data = []\n# See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486\ndf = (\npd.read_csv(csv_path, index_col=0).fillna(np.nan).replace([np.nan], [None])\n) # fillna(None)\nif \"political_leanings\" in df.columns:\ndf[\"political_leanings\"] = df[\"political_leanings\"].apply(json.loads)\nif \"prices\" in df.columns:\ndf[\"prices\"] = df[\"prices\"].apply(json.loads)\nmodel = Path(csv_path).stem.lower()\nfor pk, row in df.iterrows():\nfields = row.to_dict()\njson_data.append({\"pk\": pk, \"model\": model, \"fields\": fields})\n(Path(output_path) / csv_path).parent.mkdir(parents=True, exist_ok=True)\nPath(output_path / f\"{Path(csv_path).stem}.json\").write_text(\njson.dumps(json_data, indent=indent)\n)\nif not saved is None:\nsaved.append(output_path / f\"{Path(csv_path).stem}.json\")\nreturn json_data\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.download_data","title":"download_data","text":"download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None\n
Download files in files_dict
, overwrite if specified.
Parameters:
Name Type Description Defaultfiles_dict
RemoteDataFilesType
dict
of related files to download
{}
overwrite
bool
bool
to overwrite LOCAL_CACHE
files or not
OVERWRITE
exclude
list[str]
list
of files to exclude from files_dict
[]
Example >>> tmp: Path = getfixture('tmpdir')\n>>> set_path: Path = tmp.chdir()\n>>> download_data(exclude=[\n... \"mitchells\", \"Newspaper-1\", \"linking\"\n... ]) # doctest: +ELLIPSIS\nExcluding mitchells...\nExcluding Newspaper-1...\nExcluding linking...\nDownloading cache...dict_admin_counties.json\n100% ... 37/37 bytes\nDownloading cache...dict_countries.json\n100% ... 33.2/33.2 kB\nDownloading cache...dict_historic_counties.json\n100% ... 41.4/41.4 kB\nDownloading cache...nlp_loc_wikidata_concat.csv\n100% ... 59.8/59.8 kB\nDownloading cache...wikidata_gazetteer_selected_columns.csv\n100% ... 47.8/47.8 MB\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None:\n\"\"\"Download files in ``files_dict``, overwrite if specified.\n Args:\n files_dict: `dict` of related files to download\n overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not\n exclude: `list` of files to exclude from ``files_dict``\n Example:\n ```pycon\n >>> tmp: Path = getfixture('tmpdir')\n >>> set_path: Path = tmp.chdir()\n >>> download_data(exclude=[\n ... \"mitchells\", \"Newspaper-1\", \"linking\"\n ... ]) # doctest: +ELLIPSIS\n Excluding mitchells...\n Excluding Newspaper-1...\n Excluding linking...\n Downloading cache...dict_admin_counties.json\n 100% ... 37/37 bytes\n Downloading cache...dict_countries.json\n 100% ... 33.2/33.2 kB\n Downloading cache...dict_historic_counties.json\n 100% ... 41.4/41.4 kB\n Downloading cache...nlp_loc_wikidata_concat.csv\n 100% ... 59.8/59.8 kB\n Downloading cache...wikidata_gazetteer_selected_columns.csv\n 100% ... 47.8/47.8 MB\n ```\n \"\"\"\nif not files_dict:\nfiles_dict = deepcopy(FILES)\nfor data_source in exclude:\nif data_source in files_dict:\nprint(f\"Excluding {data_source}...\")\nfiles_dict.pop(data_source, 0)\nelse:\nlogger.warning(\nf'\"{data_source}\" not an option to exclude from {files_dict}'\n)\n# Describe whether local file exists\nfor k in files_dict.keys():\nfiles_dict[k][\"exists\"] = files_dict[k][\"local\"].exists()\nfiles_to_download = [\n(v[\"remote\"], v[\"local\"], v[\"exists\"])\nfor v in files_dict.values()\nif \"exists\" in v and not v[\"exists\"] or overwrite\n]\nfor url, out, exists in files_to_download:\nrmtree(Path(out), ignore_errors=True) if exists else None\nprint(f\"Downloading {out}\")\nPath(out).parent.mkdir(parents=True, exist_ok=True)\nassert isinstance(url, str)\nwith urlopen(url) as response, open(out, \"wb\") as out_file:\ntotal: int = int(response.info()[\"Content-length\"])\nwith Progress(\n\"[progress.percentage]{task.percentage:>3.0f}%\",\nBarColumn(), # removed bar_width=None to avoid too long when resized\nDownloadColumn(),\n) as progress:\ndownload_task = progress.add_task(\"Download\", total=total)\nfor chunk in response:\nout_file.write(chunk)\nprogress.update(download_task, advance=len(chunk))\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_list","title":"get_list","text":"get_list(x)\n
Get a list from a string, which contains as separator. If no string is encountered, the function returns an empty list. Source code in alto2txt2fixture/create_adjacent_tables.py
def get_list(x):\n\"\"\"Get a list from a string, which contains <SEP> as separator. If no\n string is encountered, the function returns an empty list.\"\"\"\nreturn x.split(\"<SEP>\") if isinstance(x, str) else []\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_outpaths_dict","title":"get_outpaths_dict","text":"get_outpaths_dict(\nnames: Sequence[str], module_name: str\n) -> TableOutputConfigType\n
Return a dict
of csv
and json
paths for each module_name
table.
The csv
and json
paths
Parameters:
Name Type Description Defaultnames
Sequence[str]
iterable of names of each module_name
's component. Main target is csv
and json
table names
module_name
str
name of module each name is part of, that is added as a prefix
requiredReturns:
Type DescriptionTableOutputConfigType
A TableOutputConfigType
: a dict
of table names
and output csv
and json
filenames.
>>> from pprint import pprint\n>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n 'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConfigType:\n\"\"\"Return a `dict` of `csv` and `json` paths for each `module_name` table.\n The `csv` and `json` paths\n Args:\n names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names\n module_name: name of module each name is part of, that is added as a prefix\n Returns:\n A ``TableOutputConfigType``: a `dict` of table ``names`` and output\n `csv` and `json` filenames.\n Example:\n ```pycon\n >>> from pprint import pprint\n >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n 'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n ```\n \"\"\"\nreturn {\nname: OutputPathDict(\ncsv=f\"{module_name}.{name}.csv\",\njson=f\"{module_name}.{name}.json\",\n)\nfor name in names\n}\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.run","title":"run","text":"run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None\n
Download, process and link files_dict
to json
and csv
.
This will require access to https://zooniversedata.blob.core.windows.net/downloads/
.
alto2txt2fixture/create_adjacent_tables.py
def run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None:\n\"\"\"Download, process and link ``files_dict`` to `json` and `csv`.\n Note:\n This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.\n \"\"\"\n# Ensure time_stamp from the point of calling `run`\nif not time_stamp:\ntime_stamp = get_now(as_str=False).strftime(TIME_FORMAT)\n# Ensure an independent deepcopy of FILES to avoid modifying subsequent runs\nif not files_dict:\nfiles_dict = deepcopy(FILES)\n# Download non-existing files\ndownload_data(files_dict=files_dict, overwrite=files_to_download_overwrite)\n# Create the output directory (defined in output_path)\noutput_path.mkdir(exist_ok=True, parents=True)\n# Read all the Wikidata Q values from Mitchells\nmitchells_df = pd.read_csv(files_dict[\"mitchells\"][\"local\"], index_col=0)\nmitchell_wikidata_mentions = sorted(\nlist(mitchells_df.PLACE_PUB_WIKI.unique()),\nkey=lambda x: int(x.replace(\"Q\", \"\")),\n)\n# Set up wikidata_gazetteer\ngaz_cols = [\"wikidata_id\", \"english_label\", \"latitude\", \"longitude\", \"geonamesIDs\"]\nwikidata_gazetteer = pd.read_csv(\nfiles_dict[\"wikidata_gazetteer_selected_columns\"][\"local\"], usecols=gaz_cols\n)\nwikidata_gazetteer.rename(\n{\n\"wikidata_id\": \"place_wikidata_id\",\n\"english_label\": \"place_label\",\n\"geonamesIDs\": \"geonames_ids\",\n},\naxis=1,\ninplace=True,\n)\n# Read in + fix all dictionaries\ndict_historic_counties = json.loads(\nPath(files_dict[\"dict_historic_counties\"][\"local\"]).read_text()\n)\ndict_admin_counties = json.loads(\nPath(files_dict[\"dict_admin_counties\"][\"local\"]).read_text()\n)\ndict_countries = json.loads(Path(files_dict[\"dict_countries\"][\"local\"]).read_text())\ndict_historic_counties = correct_dict(dict_historic_counties)\ndict_admin_counties = correct_dict(dict_admin_counties)\ndict_countries = correct_dict(dict_countries)\n# Create assisting frames\nhistorical_counties_df = pd.DataFrame(\ndict_historic_counties,\ncolumns=[\"place_wikidata_id\", \"hcounty_label\", \"hcounty_wikidata_id\"],\n)\nadmin_county_df = pd.DataFrame(\ndict_admin_counties,\ncolumns=[\n\"place_wikidata_id\",\n\"admin_county_label\",\n\"admin_county_wikidata_id\",\n],\n)\ncountries_df = pd.DataFrame(\ndict_countries,\ncolumns=[\"place_wikidata_id\", \"country_label\", \"country_wikidata_id\"],\n)\nwikidata_gazetteer = wikidata_gazetteer[\nwikidata_gazetteer.place_wikidata_id.isin(mitchell_wikidata_mentions)\n].sort_values(\"place_wikidata_id\")\nwikidata_gazetteer[\"place_pk\"] = np.arange(1, len(wikidata_gazetteer) + 1)\nwikidata_gazetteer = wikidata_gazetteer[\n[\"place_pk\"] + [x for x in wikidata_gazetteer.columns if not x == \"place_pk\"]\n]\n# Merge wikidata_gazetteer with all the assisting frames (and rename the\n# resulting columns)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, historical_counties_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, admin_county_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, countries_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer.rename(\n{\n\"admin_county_label\": \"admin_county__label\",\n\"admin_county_wikidata_id\": \"admin_county__wikidata_id\",\n\"hcounty_label\": \"historic_county__label\",\n\"hcounty_wikidata_id\": \"historic_county__wikidata_id\",\n\"country_label\": 
\"country__label\",\n\"country_wikidata_id\": \"country__wikidata_id\",\n},\naxis=1,\ninplace=True,\n)\n# Split back up into dataframes specific for the tables\nhistoric_county_table = (\nwikidata_gazetteer[[\"historic_county__label\", \"historic_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nhistoric_county_table = historic_county_table.replace({\"\": np.nan}).dropna()\nhistoric_county_table[\"historic_county__pk\"] = np.arange(\n1, len(historic_county_table) + 1\n)\nadmin_county_table = (\nwikidata_gazetteer[[\"admin_county__label\", \"admin_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nadmin_county_table = admin_county_table.replace({\"\": np.nan}).dropna()\nadmin_county_table[\"admin_county__pk\"] = np.arange(1, len(admin_county_table) + 1)\ncountry_table = (\nwikidata_gazetteer[[\"country__label\", \"country__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\ncountry_table = country_table.replace({\"\": np.nan}).dropna()\ncountry_table[\"country__pk\"] = np.arange(1, len(country_table) + 1)\n# Set up place_table from wikidata_gazetteer\nplace_table = wikidata_gazetteer.copy()\nplace_table = (\npd.merge(\nplace_table,\nhistoric_county_table,\non=[\"historic_county__label\", \"historic_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"historic_county__label\", \"historic_county__wikidata_id\"], axis=1)\n.rename({\"historic_county__pk\": \"historic_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\nadmin_county_table,\non=[\"admin_county__label\", \"admin_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"admin_county__label\", \"admin_county__wikidata_id\"], axis=1)\n.rename({\"admin_county__pk\": \"admin_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\ncountry_table,\non=[\"country__label\", \"country__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"country__label\", \"country__wikidata_id\"], axis=1)\n.rename({\"country__pk\": \"country_id\"}, axis=1)\n)\nplace_table.fillna(\"\", inplace=True)\nplace_table.set_index(\"place_pk\", inplace=True)\nplace_table.rename(\n{\"place_label\": \"label\", \"place_wikidata_id\": \"wikidata_id\"},\naxis=1,\ninplace=True,\n)\nplace_table[\"historic_county_id\"] = (\nplace_table[\"historic_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"admin_county_id\"] = (\nplace_table[\"admin_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"country_id\"] = (\nplace_table[\"country_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table.index.rename(\"pk\", inplace=True)\nplace_table.rename(\n{\n\"historic_county_id\": \"historic_county\",\n\"admin_county_id\": \"admin_county\",\n\"country_id\": \"country\",\n},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.set_index(\"historic_county__pk\", inplace=True)\nhistoric_county_table.rename(\n{x: x.split(\"__\")[1] for x in historic_county_table.columns},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.index.rename(\"pk\", inplace=True)\nadmin_county_table.set_index(\"admin_county__pk\", inplace=True)\nadmin_county_table.rename(\n{x: x.split(\"__\")[1] for x in admin_county_table.columns}, axis=1, inplace=True\n)\nadmin_county_table.index.rename(\"pk\", inplace=True)\ncountry_table.set_index(\"country__pk\", inplace=True)\ncountry_table.rename(\n{x: x.split(\"__\")[1] for x in country_table.columns}, axis=1, inplace=True\n)\ncountry_table.index.rename(\"pk\", inplace=True)\n# Adding created_at, updated_at to all the 
gazetteer tables\nplace_table[\"created_at\"] = time_stamp\nplace_table[\"updated_at\"] = time_stamp\nadmin_county_table[\"created_at\"] = time_stamp\nadmin_county_table[\"updated_at\"] = time_stamp\nhistoric_county_table[\"created_at\"] = time_stamp\nhistoric_county_table[\"updated_at\"] = time_stamp\ncountry_table[\"created_at\"] = time_stamp\ncountry_table[\"updated_at\"] = time_stamp\n# Save CSV files for gazetteer tables\nplace_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"])\nadmin_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"]\n)\nhistoric_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"]\n)\ncountry_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"])\nsaved.extend(\n[\noutput_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"],\n]\n)\n# Fix up Mitchells (already loaded)\nmitchells_df[\"politics\"] = mitchells_df.POLITICS.apply(get_list)\nmitchells_df[\"persons\"] = mitchells_df.PERSONS.apply(get_list)\nmitchells_df[\"organisations\"] = mitchells_df.ORGANIZATIONS.apply(get_list)\nmitchells_df[\"price\"] = mitchells_df.PRICE.apply(get_list)\nmitchells_df.rename(\n{\n\"ID\": \"mpd_id\",\n\"TITLE\": \"title\",\n\"politics\": \"political_leaning_raw\",\n\"price\": \"price_raw\",\n\"YEAR\": \"year\",\n\"PLACE_PUB_WIKI\": \"place_of_publication_id\",\n\"ESTABLISHED_DATE\": \"date_established_raw\",\n\"PUBLISED_DATE\": \"day_of_publication_raw\",\n},\naxis=1,\ninplace=True,\n)\ndrop_cols = [\n\"CHAIN_ID\",\n\"POLITICS\",\n\"PERSONS\",\n\"ORGANIZATIONS\",\n\"PRICE\",\n\"PLACE_PUB\",\n\"PLACE_PUB_COORD\",\n\"PLACES\",\n\"PLACES_TRES\",\n\"TEXT\",\n]\nmitchells_df.drop(columns=drop_cols, inplace=True)\n# Create derivative tables (from Mitchells) = political_leanings, prices,\n# issues\npolitical_leanings = sorted(\nlist(set([y.strip() for x in mitchells_df.political_leaning_raw for y in x]))\n)\npolitical_leanings_table = pd.DataFrame()\npolitical_leanings_table[\"political_leaning__pk\"] = np.arange(\n1, len(political_leanings) + 1\n)\npolitical_leanings_table[\"political_leaning__label\"] = political_leanings\nexport = political_leanings_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"political_leaning__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nprices = sorted(list(set([y.strip() for x in mitchells_df.price_raw for y in x])))\nprices_table = pd.DataFrame()\nprices_table[\"price__pk\"] = np.arange(1, len(prices) + 1)\nprices_table[\"price__label\"] = prices\nexport = prices_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"price__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nissues = 
sorted(list(mitchells_df.year.unique()))\nissues_table = pd.DataFrame()\nissues_table[\"issue__pk\"] = np.arange(1, len(issues) + 1)\nissues_table[\"issue__year\"] = issues\nexport = issues_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"issue__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\n# Set up linking on Mitchells dataframe\nlinking_df = pd.read_csv(\nfiles_dict[\"linking\"][\"local\"],\nindex_col=0,\ndtype={\"NLP\": str},\nusecols=[\n\"NLP\",\n\"Title\",\n\"AcquiredYears\",\n\"Editions\",\n\"EditionTitles\",\n\"City\",\n\"Publisher\",\n\"UnavailableYears\",\n\"Collection\",\n\"UK\",\n\"Complete\",\n\"Notes\",\n\"County\",\n\"HistoricCounty\",\n\"First date held\",\n\"Publication title\",\n\"link_to_mpd\",\n],\n)\nlinking_df[\"NLP\"] = linking_df.index\nlinking_df.rename(\n{\"link_to_mpd\": \"mpd_id\", \"NLP\": \"newspaper\"}, axis=1, inplace=True\n)\n# Link Mitchells with all the other data\nmitchells_df = pd.merge(mitchells_df, linking_df, on=\"mpd_id\", how=\"inner\")\n# Create entry_table\nentry_table = mitchells_df.copy()\nentry_table[\"place_of_circulation_raw\"] = \"\"\nentry_table[\"publication_district_raw\"] = \"\"\nentry_table[\"publication_county_raw\"] = \"\"\n# TODO: What happened to the three columns above? (Check w Kaspar?)\n# Only keep relevant columns\nentry_table = entry_table[\n[\n\"title\",\n\"political_leaning_raw\",\n\"price_raw\",\n\"year\",\n\"date_established_raw\",\n\"day_of_publication_raw\",\n\"place_of_circulation_raw\",\n\"publication_district_raw\",\n\"publication_county_raw\",\n\"organisations\",\n\"persons\",\n\"place_of_publication_id\",\n\"newspaper\",\n]\n]\n# Fix refs to political_leanings_table\nrev = political_leanings_table.set_index(\"political_leaning__label\")\nentry_table[\"political_leanings\"] = entry_table.political_leaning_raw.apply(\nlambda x: [rev.at[y, \"political_leaning__pk\"] for y in x]\n)\n# Fix refs to prices_table\nrev = prices_table.set_index(\"price__label\")\nentry_table[\"prices\"] = entry_table.price_raw.apply(\nlambda x: [rev.at[y.strip(), \"price__pk\"] for y in x]\n)\n# Fix refs to issues_table\nrev = issues_table.set_index(\"issue__year\")\nentry_table[\"issue\"] = entry_table.year.apply(lambda x: rev.at[x, \"issue__pk\"])\n# Fix refs to place_table\nrev = place_table.copy()\nrev[\"place__pk\"] = rev.index\nrev.set_index(\"wikidata_id\", inplace=True)\nentry_table[\"place_of_publication\"] = entry_table.place_of_publication_id.apply(\ntest_place, rev=rev\n)\nentry_table.drop(columns=[\"place_of_publication_id\"], inplace=True)\n# Set up ref to newspapers\nrev = json.loads(files_dict[\"Newspaper-1\"][\"local\"].read_text())\nrev = [dict(pk=v[\"pk\"], **v[\"fields\"]) for v in rev]\nrev = pd.DataFrame(rev)\nrev.set_index(\"publication_code\", inplace=True)\nentry_table[\"newspaper\"] = entry_table.newspaper.str.zfill(7)\nentry_table[\"newspaper\"] = entry_table.newspaper.apply(test_paper, rev=rev)\n# Create PK for entries\nentry_table[\"pk\"] = np.arange(1, len(entry_table) + 1)\n# Sort columns in entries file\nentry_table = entry_table[\n[\"pk\"] + [col for col in entry_table.columns if not col == \"pk\"]\n]\n# Add created_at, modified_at to entry_table\nentry_table[\"created_at\"] 
= time_stamp\nentry_table[\"updated_at\"] = time_stamp\n# Export entry_table\nentry_table.set_index(\"pk\").to_csv(\noutput_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"]\n)\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"])\n# ######\u00a0NOW WE CAN EASILY CREATE JSON files_dict\nfor csv_file_path in output_path.glob(\"*.csv\"):\ncsv2json_list(csv_file_path)\nprint(\"Finished - saved files:\")\nprint(\"- \" + \"\\n- \".join([str(x) for x in saved]))\n
"},{"location":"reference/alto2txt2fixture/jisc.html","title":"jisc","text":""},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.get_jisc_title","title":"get_jisc_title","text":"get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str\n
Match a newspaper title
with jisc_papers
records.
Takes an input_sub_path
, a publication_code
, and an (optional) abbreviation for any newspaper to locate the title
in the jisc_papers
DataFrame
. jisc_papers
is usually loaded via the setup_jisc_papers
function.
Parameters:
Name Type Description Defaulttitle
str
target newspaper title
requiredissue_date
str
target newspaper issue_date
requiredjisc_papers
pd.DataFrame
DataFrame
of jisc_papers
to match
input_sub_path
str
path of files to narrow down query input_sub_path
requiredpublication_code
str
unique codes to match newspaper records
requiredabbr
str | None
an optional abbreviation of the newspaper title
None
Returns:
Type Descriptionstr
Matched title
str
or abbr
.
Returns:
Type Descriptionstr
A string estimating the JISC equivalent newspaper title
Source code inalto2txt2fixture/jisc.py
def get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str:\n\"\"\"\n Match a newspaper ``title`` with ``jisc_papers`` records.\n Takes an ``input_sub_path``, a ``publication_code``, and an (optional)\n abbreviation for any newspaper to locate the ``title`` in the\n ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the\n ``setup_jisc_papers`` function.\n Args:\n title: target newspaper title\n issue_date: target newspaper issue_date\n jisc_papers: `DataFrame` of `jisc_papers` to match\n input_sub_path: path of files to narrow down query input_sub_path\n publication_code: unique codes to match newspaper records\n abbr: an optional abbreviation of the newspaper title\n Returns:\n Matched ``title`` `str` or ``abbr``.\n Returns:\n A string estimating the JISC equivalent newspaper title\n \"\"\"\n# First option, search the input_sub_path for a valid-looking publication_code\ng = PUBLICATION_CODE.findall(input_sub_path)\nif len(g) == 1:\npublication_code = g[0]\n# Let's see if we can find title:\ntitle = (\njisc_papers[\njisc_papers.publication_code == publication_code\n].title.to_list()[0]\nif jisc_papers[\njisc_papers.publication_code == publication_code\n].title.count()\n== 1\nelse title\n)\nreturn title\n# Second option, look through JISC papers for best match (on publication_code if we have it, but abbr more importantly if we have it)\nif abbr:\n_publication_code = publication_code\npublication_code = abbr\nif jisc_papers.abbr[jisc_papers.abbr == publication_code].count():\ndate = datetime.strptime(issue_date, \"%Y-%m-%d\")\nmask = (\n(jisc_papers.abbr == publication_code)\n& (date >= jisc_papers.start_date)\n& (date <= jisc_papers.end_date)\n)\nfiltered = jisc_papers.loc[mask]\nif filtered.publication_code.count() == 1:\npublication_code = filtered.publication_code.to_list()[0]\ntitle = filtered.title.to_list()[0]\nreturn title\n# Last option: let's find all the possible titles in the jisc_papers for the abbreviation, and if it's just one unique title, let's pick it!\nif abbr:\ntest = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test[0]\nelse:\nmask1 = (jisc_papers.abbr == publication_code) & (\njisc_papers.publication_code == _publication_code\n)\ntest1 = jisc_papers.loc[mask1]\ntest1 = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test1[0]\n# Fallback: if abbreviation is set, we'll return that:\nif abbr:\n# For these exceptions, see issue comment:\n# https://github.com/alan-turing-institute/Living-with-Machines/issues/2453#issuecomment-1050652587\nif abbr == \"IPJL\":\nreturn \"Ipswich Journal\"\nelif abbr == \"BHCH\":\nreturn \"Bath Chronicle\"\nelif abbr == \"LSIR\":\nreturn \"Leeds Intelligencer\"\nelif abbr == \"AGER\":\nreturn \"Lancaster Gazetter, And General Advertiser For Lancashire West\"\nreturn abbr\nraise RuntimeError(f\"Title {title} could not be found.\")\n
"},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.setup_jisc_papers","title":"setup_jisc_papers","text":"setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame\n
Create a DataFrame
with information in JISC_PAPERS_CSV
in settings.
Returns:
Type Descriptionpd.DataFrame
DataFrame
with all JISC titles.
alto2txt2fixture/jisc.py
def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:\n\"\"\"\n Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.\n Returns:\n `DataFrame` with all JISC titles.\n \"\"\"\nif not Path(path).exists():\nraise RuntimeError(\nf\"Could not find required JISC papers file. Put {Path(path).name} in {Path(path).parent} or correct the settings with a different path.\"\n)\nmonths = {\n\"Jan\": 1,\n\"Feb\": 2,\n\"Mar\": 3,\n\"Apr\": 4,\n\"May\": 5,\n\"Jun\": 6,\n\"June\": 6,\n\"Jul\": 7,\n\"July\": 7,\n\"Aug\": 8,\n\"Sep\": 9,\n\"Sept\": 9,\n\"Oct\": 10,\n\"Nov\": 11,\n\"Dec\": 12,\n\"Dec.\": 12,\n}\njisc_papers = pd.read_csv(\npath,\nusecols=[\n\"Newspaper Title\",\n\"NLP\",\n\"Abbr\",\n\"StartD\",\n\"StartM\",\n\"StartY\",\n\"EndD\",\n\"EndM\",\n\"EndY\",\n],\n)\njisc_papers[\"start_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.StartY),\nmonth=months[x.StartM.strip(\".\").strip()],\nday=int(x.StartD),\n),\naxis=1,\n)\njisc_papers[\"end_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.EndY), month=months[x.EndM.strip(\".\").strip()], day=int(x.EndD)\n),\naxis=1,\n)\njisc_papers.drop(\n[\"StartD\", \"StartM\", \"StartY\", \"EndD\", \"EndM\", \"EndY\"],\naxis=\"columns\",\ninplace=True,\n)\njisc_papers.rename(\n{\"Newspaper Title\": \"title\", \"NLP\": \"publication_code\", \"Abbr\": \"abbr\"},\naxis=1,\ninplace=True,\n)\njisc_papers[\"title\"] = jisc_papers[\"title\"].apply(\nlambda x: \"The \" + x[:-5] if x.strip()[-5:].lower() == \", the\" else x\n)\njisc_papers[\"publication_code\"] = jisc_papers[\"publication_code\"].apply(\nlambda x: str(x).zfill(7)\n)\nreturn jisc_papers\n
"},{"location":"reference/alto2txt2fixture/log.html","title":"log","text":""},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.error","title":"error","text":"error(msg: str, crash: bool = True, silent: bool = True) -> None\n
Print msg
in colorama
Force.RED
and exit()
If silent
exit()
after call, else raise
RuntimeError
if crash=True
.
alto2txt2fixture/log.py
def error(msg: str, crash: bool = True, silent: bool = True) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.RED` and `exit()`\n If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``.\"\"\"\nif crash and silent:\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nexit()\nelif crash:\nraise RuntimeError(msg) from None\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.info","title":"info","text":"info(msg: str) -> None\n
Print msg
in colorama
Force.CYAN
colour.
alto2txt2fixture/log.py
def info(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.CYAN` colour.\"\"\"\nprint(f\"{Fore.CYAN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.success","title":"success","text":"success(msg: str) -> None\n
Print msg
in colorama
Force.GREEN
colour.
alto2txt2fixture/log.py
def success(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.GREEN` colour.\"\"\"\nprint(f\"{Fore.GREEN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.warning","title":"warning","text":"warning(msg: str) -> None\n
Print msg
in colorama
Force.YELLOW
colour.
alto2txt2fixture/log.py
def warning(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Force.YELLOW` colour.\"\"\"\nprint(f\"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html","title":"parser","text":""},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.fixtures","title":"fixtures","text":"fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]\n
Generates fixtures for a specified model using a list of files.
This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.
Parameters:
Name Type Description Defaultfilelist
list
A list of files to process and generate fixtures from.
[]
model
str
The name of the model for which fixtures are generated. translate: A nested dictionary representing the translation mapping for fields. The structure of the translator follows the format:
{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
The translated fields will be used as keys, and their corresponding primary keys (obtained from the provided files) will be used as values in the generated fixtures. ''
rename
dict
A nested dictionary representing the field renaming mapping. The structure of the dictionary follows the format:
{\n'part1': {\n'part2': 'new_field_name'\n}\n}\n
The fields specified in the dictionary will be renamed to the provided new field names in the generated fixtures. {}
uniq_keys
list
A list of fields that need to be considered for uniqueness in the fixtures. If specified, the fixtures will yield only unique items based on the combination of these fields.
[]
Yields:
Type DescriptionFixtureDict
FixtureDict
from model
, pk
and dict
of fields
.
Returns:
Type DescriptionGenerator[FixtureDict, None, None]
This function generates fixtures but does not return any value.
Source code inalto2txt2fixture/parser.py
def fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]:\n\"\"\"\n Generates fixtures for a specified model using a list of files.\n This function takes a list of files and generates fixtures for a specified\n model. The fixtures can be used to populate a database or perform other\n data-related operations.\n Args:\n filelist: A list of files to process and generate fixtures from.\n model: The name of the model for which fixtures are generated.\n translate: A nested dictionary representing the translation mapping\n for fields. The structure of the translator follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n The translated fields will be used as keys, and their\n corresponding primary keys (obtained from the provided files) will\n be used as values in the generated fixtures.\n rename: A nested dictionary representing the field renaming\n mapping. The structure of the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': 'new_field_name'\n }\n }\n ```\n The fields specified in the dictionary will be renamed to the\n provided new field names in the generated fixtures.\n uniq_keys: A list of fields that need to be considered for\n uniqueness in the fixtures. If specified, the fixtures will yield\n only unique items based on the combination of these fields.\n Yields:\n `FixtureDict` from ``model``, ``pk`` and `dict` of ``fields``.\n Returns:\n This function generates fixtures but does not return any value.\n \"\"\"\nfilelist = sorted(filelist, key=lambda x: str(x).split(\"/\")[:-1])\ncount = len(filelist)\n# Process JSONL\nif [x for x in filelist if \".jsonl\" in x.name]:\npk = 0\n# In the future, we might want to show progress here (tqdm or suchlike)\nfor file in filelist:\nfor line in file.read_text().splitlines():\npk += 1\nline = json.loads(line)\nyield FixtureDict(\npk=pk,\nmodel=model,\nfields=dict(**get_fields(line, translate=translate, rename=rename)),\n)\nreturn\nelse:\n# Process JSON\npks = [x for x in range(1, count + 1)]\nif len(uniq_keys):\nuniq_files = list(uniq(filelist, uniq_keys))\ncount = len(uniq_files)\nzipped = zip(uniq_files, pks)\nelse:\nzipped = zip(filelist, pks)\nfor x in tqdm(\nzipped, total=count, desc=f\"{model} ({count:,} objs)\", leave=False\n):\nyield FixtureDict(\npk=x[1],\nmodel=model,\nfields=dict(**get_fields(x[0], translate=translate, rename=rename)),\n)\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_fields","title":"get_fields","text":"get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict\n
Retrieves fields from a file and performs modifications and checks.
This function takes a file (in various formats: Path
, str
, or dict
) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.
Parameters:
Name Type Description Defaultfile
Union[Path, str, dict]
The file from which the fields are retrieved.
requiredtranslate
dict
A nested dictionary representing the translation mapping for fields. The structure of the translator follows the format:
{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
The translated fields will be used to replace the original fields in the retrieved fields. {}
rename
dict
A nested dictionary representing the field renaming mapping. The structure of the dictionary follows the format:
{\n'part1': {\n'part2': 'new_field_name'\n}\n}\n
The fields specified in the dictionary will be renamed to the provided new field names in the retrieved fields. {}
allow_null
bool
Determines whether to allow None
values for relational fields. If set to True
, relational fields with missing values will be assigned None
. If set to False
, an error will be raised.
False
Returns:
Type Descriptiondict
A dictionary representing the retrieved fields from the file, with modifications and checks applied.
Raises:
Type DescriptionRuntimeError
If the file type is unsupported or if an error occurs during field retrieval or processing.
Source code inalto2txt2fixture/parser.py
def get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict:\n\"\"\"\n Retrieves fields from a file and performs modifications and checks.\n This function takes a file (in various formats: `Path`, `str`, or `dict`)\n and processes its fields. It retrieves the fields from the file and\n performs modifications, translations, and checks on the fields.\n Args:\n file: The file from which the fields are retrieved.\n translate: A nested dictionary representing the translation mapping\n for fields. The structure of the translator follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n The translated fields will be used to replace the original fields\n in the retrieved fields.\n rename: A nested dictionary representing the field renaming\n mapping. The structure of the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': 'new_field_name'\n }\n }\n ```\n The fields specified in the dictionary will be renamed to the\n provided new field names in the retrieved fields.\n allow_null: Determines whether to allow ``None`` values for\n relational fields. If set to ``True``, relational fields with\n missing values will be assigned ``None``. If set to ``False``, an\n error will be raised.\n Returns:\n A dictionary representing the retrieved fields from the file,\n with modifications and checks applied.\n Raises:\n RuntimeError: If the file type is unsupported or if an error occurs\n during field retrieval or processing.\n \"\"\"\nif isinstance(file, Path):\ntry:\nfields = json.loads(file.read_text())\nexcept Exception as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, str):\nif \"\\n\" in file:\nraise RuntimeError(\"File has multiple lines.\")\ntry:\nfields = json.loads(file)\nexcept json.decoder.JSONDecodeError as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, dict):\nfields = file\nelse:\nraise RuntimeError(f\"Cannot process type {type(file)}.\")\n# Fix relational fields for any file\nfor key in [key for key in fields.keys() if \"__\" in key]:\nparts = key.split(\"__\")\ntry:\nbefore = fields[key]\nif before:\nbefore = before.replace(\"---\", \"/\")\nloc = translate.get(parts[0], {}).get(parts[1], {})\nfields[key] = loc.get(before)\nif fields[key] is None:\nraise RuntimeError(\nf\"Cannot translate fields.{key} from {before}: {loc}\"\n)\nexcept AttributeError:\nif allow_null:\nfields[key] = None\nelse:\nprint(\n\"Content had relational fields, but something went wrong in parsing the data:\"\n)\nprint(\"file\", file)\nprint(\"fields\", fields)\nprint(\"KEY:\", key)\nraise RuntimeError()\nnew_name = rename.get(parts[0], {}).get(parts[1], None)\nif new_name:\nfields[new_name] = fields[key]\ndel fields[key]\nfields[\"created_at\"] = NOW_str\nfields[\"updated_at\"] = NOW_str\ntry:\nfields[\"item_type\"] = str(fields[\"item_type\"]).upper()\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_mean\"] == \"\":\nfields[\"ocr_quality_mean\"] = 0\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_sd\"] == \"\":\nfields[\"ocr_quality_sd\"] = 0\nexcept KeyError:\npass\nreturn fields\n
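A short sketch of the `dict` input path (the field names and translation values are illustrative assumptions):
```python
fields = get_fields(
    {"publication__publication_code": "0002645", "item_type": "article"},
    translate={"publication": {"publication_code": {"0002645": 12}}},
    rename={"publication": {"publication_code": "newspaper_id"}},
)
# The relational key is translated to the primary key 12 and renamed to
# "newspaper_id"; "item_type" is upper-cased to "ARTICLE"; created_at and
# updated_at timestamps are also added.
```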
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_key_from","title":"get_key_from","text":"get_key_from(item: Path, x: str) -> str\n
Retrieves a specific key from a file and returns its value.
This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.
Parameters:
Name Type Description Defaultitem
Path
The file from which the key is extracted.
requiredx
str
The key to be retrieved from the file.
requiredReturns:
Type Descriptionstr
The value of the specified key from the file.
Source code inalto2txt2fixture/parser.py
def get_key_from(item: Path, x: str) -> str:\n\"\"\"\n Retrieves a specific key from a file and returns its value.\n This function reads a file and extracts the value of a specified\n key. If the key is not found or an error occurs while processing\n the file, a warning is printed, and an empty string is returned.\n Args:\n item: The file from which the key is extracted.\n x: The key to be retrieved from the file.\n Returns:\n The value of the specified key from the file.\n \"\"\"\nresult = json.loads(item.read_text()).get(x, None)\nif not result:\nprint(f\"[WARN] Could not find key {x} in {item}\")\nresult = \"\"\nreturn result\n
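For example (the cached file path is a hypothetical assumption; a missing key prints a warning and yields an empty string):
```python
from pathlib import Path

# Read the "software" key from a cached digitisation JSON file.
software = get_key_from(Path("cache/hmd/digitisation/1.json"), "software")
```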
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_translator","title":"get_translator","text":"get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict\n
Converts a list of fields into a nested dictionary representing a translator.
Parameters:
Name Type Description Defaultfields
list[TranslatorTuple]
A list of tuples representing fields to be translated.
[TranslatorTuple('', '', [])]
Returns:
Type Descriptiondict
A nested dictionary representing the translator. The structure of the dictionary follows the format:
{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
Example >>> fields = [\n... TranslatorTuple(\n... start='start__field1',\n... finish='field1',\n... lst=[{\n... 'fields': {'field1': 'translation1'},\n... 'pk': 1}],\n... )]\n>>> get_translator(fields)\n{'start': {'field1': {'translation1': 1}}}\n
Source code in alto2txt2fixture/parser.py
def get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict:\n\"\"\"\n Converts a list of fields into a nested dictionary representing a\n translator.\n Args:\n fields: A list of tuples representing fields to be translated.\n Returns:\n A nested dictionary representing the translator. The structure of\n the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n Example:\n ```pycon\n >>> fields = [\n ... TranslatorTuple(\n ... start='start__field1',\n ... finish='field1',\n ... lst=[{\n ... 'fields': {'field1': 'translation1'},\n ... 'pk': 1}],\n ... )]\n >>> get_translator(fields)\n {'start': {'field1': {'translation1': 1}}}\n ```\n \"\"\"\n_ = dict()\nfor field in fields:\nstart, finish, lst = field\npart1, part2 = start.split(\"__\")\nif part1 not in _:\n_[part1] = {}\nif part2 not in _[part1]:\n_[part1][part2] = {}\nif isinstance(finish, str):\n_[part1][part2] = {o[\"fields\"][finish]: o[\"pk\"] for o in lst}\nelif isinstance(finish, list):\n_[part1][part2] = {\n\"-\".join([o[\"fields\"][x] for x in finish]): o[\"pk\"] for o in lst\n}\nreturn _\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.parse","title":"parse","text":"parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None\n
Parses files from collections and generates fixtures for various models.
This function processes files from the specified collections and generates fixtures for different models, such as newspapers.dataprovider
, newspapers.ingest
, newspapers.digitisation
, newspapers.newspaper
, newspapers.issue
, and newspapers.item
.
It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.
Parameters:
Name Type Description Defaultcollections
list
A list of collections from which files are processed and fixtures are generated.
requiredcache_home
str
The directory path where the collections are located.
requiredoutput
str
The directory path where the fixtures will be saved.
requiredmax_elements_per_file
int
The maximum number of elements per file when saving fixtures.
requiredReturns:
Type DescriptionNone
This function generates fixtures but does not return any value.
Source code inalto2txt2fixture/parser.py
def parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None:\n\"\"\"\n Parses files from collections and generates fixtures for various models.\n This function processes files from the specified collections and generates\n fixtures for different models, such as `newspapers.dataprovider`,\n `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`,\n `newspapers.issue`, and `newspapers.item`.\n It performs various steps, such as file listing, fixture generation,\n translation mapping, renaming fields, and saving fixtures to files.\n Args:\n collections: A list of collections from which files are\n processed and fixtures are generated.\n cache_home: The directory path where the collections are located.\n output: The directory path where the fixtures will be saved.\n max_elements_per_file: The maximum number of elements per file\n when saving fixtures.\n Returns:\n This function generates fixtures but does not return any value.\n \"\"\"\nglobal CACHE_HOME\nglobal OUTPUT\nglobal MAX_ELEMENTS_PER_FILE\nCACHE_HOME = cache_home\nOUTPUT = output\nMAX_ELEMENTS_PER_FILE = max_elements_per_file\n# Set up output directory\nreset_fixture_dir(OUTPUT)\n# Get file lists\nprint(\"\\nGetting file lists...\")\ndef issues_in_x(x):\nreturn \"issues\" in str(x.parent).split(\"/\")\ndef newspapers_in_x(x):\nreturn not any(\n[\ncondition\nfor y in str(x.parent).split(\"/\")\nfor condition in [\n\"issues\" in y,\n\"ingest\" in y,\n\"digitisation\" in y,\n\"data-provider\" in y,\n]\n]\n)\nall_json = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.json\")\n]\nall_jsonl = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.jsonl\")\n]\nprint(f\"--> {len(all_json):,} JSON files altogether\")\nprint(f\"--> {len(all_jsonl):,} JSONL files altogether\")\nprint(\"\\nSetting up fixtures...\")\n# Process data providers\ndef data_provider_in_x(x):\nreturn \"data-provider\" in str(x.parent).split(\"/\")\ndata_provider_json = list(\nfixtures(\nmodel=\"newspapers.dataprovider\",\nfilelist=[x for x in all_json if data_provider_in_x(x)],\nuniq_keys=[\"name\"],\n)\n)\nprint(f\"--> {len(data_provider_json):,} DataProvider fixtures\")\n# Process ingest\ndef ingest_in_x(x):\nreturn \"ingest\" in str(x.parent).split(\"/\")\ningest_json = list(\nfixtures(\nmodel=\"newspapers.ingest\",\nfilelist=[x for x in all_json if ingest_in_x(x)],\nuniq_keys=[\"lwm_tool_name\", \"lwm_tool_version\"],\n)\n)\nprint(f\"--> {len(ingest_json):,} Ingest fixtures\")\n# Process digitisation\ndef digitisation_in_x(x):\nreturn \"digitisation\" in str(x.parent).split(\"/\")\ndigitisation_json = list(\nfixtures(\nmodel=\"newspapers.digitisation\",\nfilelist=[x for x in all_json if digitisation_in_x(x)],\nuniq_keys=[\"software\"],\n)\n)\nprint(f\"--> {len(digitisation_json):,} Digitisation fixtures\")\n# Process newspapers\nnewspaper_json = list(\nfixtures(\nmodel=\"newspapers.newspaper\",\nfilelist=[file for file in all_json if newspapers_in_x(file)],\n)\n)\nprint(f\"--> {len(newspaper_json):,} Newspaper fixtures\")\n# Process issue\ntranslate = get_translator(\n[\nTranslatorTuple(\n\"publication__publication_code\", \"publication_code\", newspaper_json\n)\n]\n)\nrename = {\"publication\": {\"publication_code\": \"newspaper_id\"}}\nissue_json = list(\nfixtures(\nmodel=\"newspapers.issue\",\nfilelist=[file for file in all_json if issues_in_x(file)],\ntranslate=translate,\nrename=rename,\n)\n)\nprint(f\"--> {len(issue_json):,} Issue fixtures\")\n# Create translator/clear up memory before processing items\ntranslate = get_translator(\n[\n(\"issue__issue_identifier\", \"issue_code\", issue_json),\n(\"digitisation__software\", \"software\", digitisation_json),\n(\"data_provider__name\", \"name\", data_provider_json),\n(\n\"ingest__lwm_tool_identifier\",\n[\"lwm_tool_name\", \"lwm_tool_version\"],\ningest_json,\n),\n]\n)\nrename = {\n\"issue\": {\"issue_identifier\": \"issue_id\"},\n\"digitisation\": {\"software\": \"digitisation_id\"},\n\"data_provider\": {\"name\": \"data_provider_id\"},\n\"ingest\": {\"lwm_tool_identifier\": \"ingest_id\"},\n}\nsave_fixture(newspaper_json, \"Newspaper\")\nsave_fixture(issue_json, \"Issue\")\ndel newspaper_json\ndel issue_json\ngc.collect()\nprint(\"\\nSaving...\")\nsave_fixture(digitisation_json, \"Digitisation\")\nsave_fixture(ingest_json, \"Ingest\")\nsave_fixture(data_provider_json, \"DataProvider\")\n# Process items\nitem_json = fixtures(\nmodel=\"newspapers.item\",\nfilelist=all_jsonl,\ntranslate=translate,\nrename=rename,\n)\nsave_fixture(item_json, \"Item\")\nreturn\n
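A sketch of calling `parse` directly (the paths and per-file limit are illustrative assumptions; note that `reset_fixture_dir` prompts for confirmation before clearing the output directory):
```python
# Process the hmd collection from a local cache into fixture files.
parse(
    collections=["hmd"],
    cache_home="./cache",
    output="./output/fixtures",
    max_elements_per_file=2_000_000,  # hypothetical chunk size
)
```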
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.reset_fixture_dir","title":"reset_fixture_dir","text":"reset_fixture_dir(output: str | Path) -> None\n
Resets the fixture directory by removing all JSON files inside it.
This function takes a directory path (output
) as input and removes all JSON files within the directory.
Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.
Parameters:
Name Type Description Defaultoutput
str | Path
The directory path of the fixture directory to be reset.
requiredRaises:
Type DescriptionRuntimeError
If the output
directory is not specified as a string.
alto2txt2fixture/parser.py
def reset_fixture_dir(output: str | Path) -> None:\n\"\"\"\n Resets the fixture directory by removing all JSON files inside it.\n This function takes a directory path (``output``) as input and removes all\n JSON files within the directory.\n Prior to removal, it prompts the user for confirmation to proceed. If the\n user confirms, the function clears the fixture directory by deleting the\n JSON files.\n Args:\n output: The directory path of the fixture directory to be reset.\n Raises:\n RuntimeError: If the ``output`` directory is not specified as a string.\n \"\"\"\nif not isinstance(output, str):\nraise RuntimeError(\"`output` directory needs to be specified as a string.\")\noutput = Path(output)\ny = input(\nf\"This command will automatically empty the fixture directory ({output.absolute()}). \"\n\"Do you want to proceed? [y/N]\"\n)\nif not y.lower() == \"y\":\noutput.mkdir(parents=True, exist_ok=True)\nreturn\nprint(\"\\nClearing up the fixture directory\")\n# Ensure directory exists\noutput.mkdir(parents=True, exist_ok=True)\n# Drop all JSON files\n[x.unlink() for x in Path(output).glob(\"*.json\")]\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.uniq","title":"uniq","text":"uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]\n
Generates unique items from a list of files based on specified keys.
This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the get_key_from
function, and duplicate items are ignored.
Parameters:
Name Type Description Defaultfilelist
list
A list of files from which unique items are generated.
requiredkeys
list
A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items.
[]
Yields:
Type DescriptionAny
A unique item from filelist
.
alto2txt2fixture/parser.py
def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:\n\"\"\"\n Generates unique items from a list of files based on specified keys.\n This function takes a list of files and yields unique items based on a\n combination of keys. The keys are extracted from each file using the\n ``get_key_from`` function, and duplicate items are ignored.\n Args:\n filelist: A list of files from which unique items are\n generated.\n keys: A list of keys used for uniqueness. Each key specifies\n a field to be used for uniqueness checking in the generated\n items.\n Yields:\n A unique item from `filelist`.\n \"\"\"\nseen = set()\nfor item in filelist:\nkey = \"-\".join([get_key_from(item, x) for x in keys])\nif key not in seen:\nseen.add(key)\nyield item\nelse:\n# Drop it if duplicate\npass\n
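For example, keeping one file per unique data provider name (the directory is an illustrative assumption; each file's JSON content is expected to include a "name" key):
```python
from pathlib import Path

files = list(Path("cache/hmd/data-provider").glob("*.json"))
unique_files = list(uniq(files, keys=["name"]))
```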
"},{"location":"reference/alto2txt2fixture/patterns.html","title":"patterns","text":"Useful regular expressions, intially just PUBLICATION_CODE
.
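A minimal sketch of applying a compiled pattern such as PUBLICATION_CODE (the sub-path is a hypothetical example; compare Newspaper.publication_code_from_input_sub_path in the router module):
```python
# findall() returns all matches; exactly one match is treated as a
# valid publication code.
matches = PUBLICATION_CODE.findall("0002645/1898/0107")  # hypothetical sub-path
publication_code = matches[0] if len(matches) == 1 else None
```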
Archive(\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n)\n
Manage extracting information from a ZIP archive.
The Archive
class represents a zip archive of XML files. The class is used to extract information from a ZIP archive, and it contains several methods to process the data contained in the archive.
open(Archive)
context manager
Archive can be opened with a context manager, which creates a meta object, with timings for the object. When closed, it will save the meta JSON to the correct paths.
Attributes:
Name Type Descriptionpath
Path
The path to the zip archive.
collection
str
The collection of the XML files in the archive. Default is \"\".
report
Path
The file path of the report file for the archive.
report_id
str
The report ID for the archive. If not provided, a random UUID is generated.
report_parent
Path
The parent directory of the report file for the archive.
jisc_papers
pd.DataFrame
A DataFrame of JISC papers.
size
str | float
The size of the archive, in human-readable format.
size_raw
str | float
The raw size of the archive, in bytes.
roots
Generator[ET.Element, None, None]
The root elements of the XML documents contained in the archive.
meta
dotdict
Metadata about the archive, such as its path, size, and number of contents.
json_indent
int
Indentation formatting of json
output
Raises:
Type DescriptionRuntimeError
If the path
does not exist.
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n):\n\"\"\"Constructor method.\"\"\"\nself.path: Path = Path(path)\nif not self.path.exists():\nraise RuntimeError(\"Path does not exist.\")\nself.size: str | float = get_size_from_path(self.path)\nself.size_raw: str | float = get_size_from_path(self.path, raw=True)\nself.zip_file: zipfile.ZipFile = zipfile.ZipFile(self.path)\nself.collection: str = collection\nself.roots: Generator[ET.Element, None, None] = self.get_roots()\nself.meta: dotdict = dotdict(\npath=str(self.path),\nbytes=self.size_raw,\nsize=self.size,\ncontents=len(self.filelist),\n)\nif not report_id:\nself.report_id: str = str(uuid.uuid4())\nelse:\nself.report_id = report_id\nself.jisc_papers: pd.DataFrame = jisc_papers\nself.report_parent: Path = Path(f\"{REPORT_DIR}/{self.report_id}\")\nself.report: Path = (\nself.report_parent / f\"{self.path.stem.replace('_metadata', '')}.json\"\n)\nself.json_indent: int = json_indent\n
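A sketch of the context-manager usage described above (the zip path is illustrative, and an empty DataFrame stands in for real JISC paper data, since Document raises a RuntimeError unless a DataFrame is passed):
```python
import pandas as pd

archive = Archive("hmd_metadata.zip", collection="hmd", jisc_papers=pd.DataFrame())
with archive as _:
    # Iterate Document instances parsed from the zipped XML files and
    # write each newspaper record to the cache.
    for document in archive.documents:
        document.newspaper.write_to_cache()
```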
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.documents","title":"documents property
","text":"documents\n
Property that calls the get_documents
method
property
","text":"filelist\n
Returns the list of files in the zip file
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.__len__","title":"__len__","text":"__len__()\n
The number of files inside the zip archive.
Source code inalto2txt2fixture/router.py
def __len__(self):\n\"\"\"The number of files inside the zip archive.\"\"\"\nreturn len(self.filelist)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_documents","title":"get_documents","text":"get_documents() -> Generator[Document, None, None]\n
A generator that yields instances of the Document class for each XML file in the ZIP archive.
It uses the tqdm
library to display a progress bar in the terminal while it is running.
If the contents of the ZIP file are not empty, the method creates an instance of the Document
class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the Document
class. The instance of the Document
class is then returned by the generator.
Yields:
Type DescriptionDocument
Document
class instance for each unzipped XML
file.
alto2txt2fixture/router.py
def get_documents(self) -> Generator[Document, None, None]:\n\"\"\"\n A generator that yields instances of the Document class for each XML\n file in the ZIP archive.\n It uses the `tqdm` library to display a progress bar in the terminal\n while it is running.\n If the contents of the ZIP file are not empty, the method creates an\n instance of the ``Document`` class by passing the root element of the XML\n file, the collection name, meta information about the archive, and the\n JISC papers data frame (if provided) to the constructor of the\n ``Document`` class. The instance of the ``Document`` class is then\n returned by the generator.\n Yields:\n ``Document`` class instance for each unzipped `XML` file.\n \"\"\"\nfor xml_file in tqdm(\nself.filelist,\ndesc=f\"{Path(self.zip_file.filename).stem} ({self.meta.size})\",\nleave=False,\ncolour=\"green\",\n):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield Document(\nroot=ET.fromstring(xml),\ncollection=self.collection,\nmeta=self.meta,\njisc_papers=self.jisc_papers,\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_roots","title":"get_roots","text":"get_roots() -> Generator[ET.Element, None, None]\n
Yields the root elements of the XML documents contained in the archive.
Source code inalto2txt2fixture/router.py
def get_roots(self) -> Generator[ET.Element, None, None]:\n\"\"\"\n Yields the root elements of the XML documents contained in the archive.\n \"\"\"\nfor xml_file in tqdm(self.filelist, leave=False, colour=\"blue\"):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield ET.fromstring(xml)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache","title":"Cache","text":"Cache()\n
The Cache class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.
It is inherited by many other classes in this module.
Initializes the Cache class object.
Source code inalto2txt2fixture/router.py
def __init__(self):\n\"\"\"\n Initializes the Cache class object.\n \"\"\"\npass\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.__str__","title":"__str__","text":"__str__() -> str\n
Returns the string representation of the cache data as a dictionary.
Source code inalto2txt2fixture/router.py
def __str__(self) -> str:\n\"\"\"\n Returns the string representation of the cache data as a dictionary.\n \"\"\"\nreturn str(self.as_dict())\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.as_dict","title":"as_dict","text":"as_dict() -> dict\n
Converts the cache data to a dictionary and returns it.
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n Converts the cache data to a dictionary and returns it.\n \"\"\"\nreturn {}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (collection, kind, and id) but can be changed when inherited.
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the cache path, which is used to store the cache data.\n The path is normally constructed using some of the object's\n properties (collection, kind, and id) but can be changed when\n inherited.\n \"\"\"\nreturn Path(f\"{CACHE_HOME}/{self.collection}/{self.kind}/{self.id}.json\")\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.write_to_cache","title":"write_to_cache","text":"write_to_cache(json_indent: int = JSON_INDENT) -> Optional[bool]\n
Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the as_dict method. If the cache path already exists, the function returns True.
Source code inalto2txt2fixture/router.py
def write_to_cache(self, json_indent: int = JSON_INDENT) -> Optional[bool]:\n\"\"\"\n Writes the cache data to a file at the specified cache path. The cache\n data is first converted to a dictionary using the as_dict method. If\n the cache path already exists, the function returns True.\n \"\"\"\npath = self.get_cache_path()\ntry:\nif path.exists():\nreturn True\nexcept AttributeError:\nerror(\nf\"Error occurred when getting cache path for \"\nf\"{self.kind}: {path}. It was not of expected \"\nf\"type Path but of type {type(path)}:\",\n)\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"w+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent))\nreturn\n
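A minimal hypothetical subclass illustrating what write_to_cache relies on: the collection, kind and id attributes used by get_cache_path, the as_dict hook, and the module-level CACHE_HOME setting:
```python
class MinimalCache(Cache):
    kind = "minimal"  # used in the cache path

    def __init__(self, collection: str, id: str, data: dict):
        self.collection = collection
        self.id = id
        self.data = data

    def as_dict(self) -> dict:
        return self.data

# Writes {"example": true} to f"{CACHE_HOME}/hmd/minimal/0001.json"
MinimalCache("hmd", "0001", {"example": True}).write_to_cache()
```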
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Collection","title":"Collection","text":"Collection(name: str = 'hmd', jisc_papers: Optional[pd.DataFrame] = None)\n
A Collection represents a group of newspaper archives from any passed alto2txt metadata output.
A Collection is initialised with a name and an optional pandas DataFrame of JISC papers. The archives
property returns an iterable of the Archive
objects within the collection.
Attributes:
Name Type Descriptionname
str
Name of the collection (default \"hmd\")
jisc_papers
pandas.DataFrame
DataFrame of JISC papers, optional
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, name: str = \"hmd\", jisc_papers: Optional[pd.DataFrame] = None):\n\"\"\"Constructor method.\"\"\"\nself.name: str = name\nself.jisc_papers: pd.DataFrame | None = jisc_papers\nself.dir: Path = Path(f\"{MNT}/{self.name}-alto2txt/metadata\")\nself.zip_files: list[Path] = sorted(\nlist(self.dir.glob(\"*.zip\")), key=lambda x: x.stat().st_size\n)\nself.zip_file_count: int = sum([1 for _ in self.dir.glob(\"*.zip\")])\nself.report_id: str = str(uuid.uuid4())\nself.empty: bool = self.zip_file_count == 0\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider","title":"DataProvider","text":"DataProvider(collection: str)\n
Bases: Cache
The DataProvider class extends the Cache class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.
Attributes:
Name Type Descriptioncollection
str
A string representing publication collection
kind
str
Indication of object type, defaults to data-provider
providers_meta_data
list[FixtureDict]
structured dict of metadata for known collection sources
collection_type
str
related data sources and potential linkage source
index_field
str
field name for querying existing records
Example>>> from pprint import pprint\n>>> hmd = DataProvider(\"hmd\")\n>>> hmd.pk\n2\n>>> pprint(hmd.as_dict())\n{'code': 'bl-hmd',\n 'collection': 'newspapers',\n 'legacy_code': 'hmd',\n 'name': 'Heritage Made Digital',\n 'source_note': 'British Library-funded digitised newspapers provided by the '\n 'British Newspaper Archive'}\n
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, collection: str):\n\"\"\"Constructor method.\"\"\"\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data","title":"meta_data property
","text":"meta_data: FixtureDict | dict\n
Return self.providers_meta_data[self.collection]
or {}
.
property
","text":"meta_data_fields: FixtureDict | dict\n
Return self.providers_meta_data[self.collection]
or {}
.
property
","text":"pk: int | None\n
Return pk
if provided via providers_meta_data
, else None
.
property
","text":"providers_index_dict: dict[str, FixtureDict]\n
Return all self.index_field
values from providers_meta_data
.
as_dict() -> dict\n
Return a dict
of the data provider object.
Returns:
Type Descriptiondict
Dictionary representation of the DataProvider object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n Return a `dict` of the data provider object.\n Returns:\n Dictionary representation of the DataProvider object\n \"\"\"\nif self.meta_data:\nreturn {\n\"name\": self.meta_data_fields[\"name\"],\n\"code\": self.meta_data_fields[\"code\"],\n\"legacy_code\": self.collection,\n\"source_note\": self.meta_data_fields[\"source_note\"],\n\"collection\": self.collection_type,\n}\nelse:\nreturn {\n\"name\": self.collection,\n\"code\": slugify(self.collection),\n\"source_note\": \"\",\n\"legacy_code\": None,\n\"collection\": self.collection_type,\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation","title":"Digitisation","text":"Digitisation(root: ET.Element, collection: str = '')\n
Bases: Cache
The Digitisation class extends the Cache class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
collection
str
A string that represents the collection of the publication
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'digitisation'\n
A string that represents the type of the object, set to \"digitisation\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the digitisation object.
Returns:
Type Descriptiondict
Dictionary representation of the Digitising object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the digitisation\n object.\n Returns:\n Dictionary representation of the Digitising object\n \"\"\"\ndic = {\nx.tag: x.text or \"\"\nfor x in self.root.findall(\"./process/*\")\nif x.tag\nin [\n\"xml_flavour\",\n\"software\",\n\"mets_namespace\",\n\"alto_namespace\",\n]\n}\nif not dic.get(\"software\"):\nreturn {}\nreturn dic\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document","title":"Document","text":"Document(*args, **kwargs)\n
The Document class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.
Attributes:
Name Type Descriptioncollection
str | None
A string that represents the collection of the publication
root
ET.Element | None
An XML
element that represents the root of the publication
zip_file
str | None
A path to a valid zip
file
jisc_papers
pd.DataFrame | None
A pandas
DataFrame
object that holds information about the JISC papers
meta
dotdict | None
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, *args, **kwargs):\n\"\"\"Constructor method.\"\"\"\nself.collection: str | None = kwargs.get(\"collection\")\nif not self.collection or not isinstance(self.collection, str):\nraise RuntimeError(\"A valid collection must be passed\")\nself.root: ET.Element | None = kwargs.get(\"root\")\nif not self.root or not isinstance(self.root, ET.Element):\nraise RuntimeError(\"A valid XML root must be passed\")\nself.zip_file: str | None = kwargs.get(\"zip_file\")\nif self.zip_file and not isinstance(self.zip_file, str):\nraise RuntimeError(\"A valid zip file must be passed\")\nself.jisc_papers: pd.DataFrame | None = kwargs.get(\"jisc_papers\")\nif not isinstance(self.jisc_papers, pd.DataFrame):\nraise RuntimeError(\n\"A valid DataFrame containing JISC papers must be passed\"\n)\nself.meta: dotdict | None = kwargs.get(\"meta\")\nself._publication_elem = None\nself._input_sub_path = None\nself._ingest = None\nself._digitisation = None\nself._item = None\nself._issue = None\nself._newspaper = None\nself._data_provider = None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document.publication","title":"publication property
","text":"publication: ET.Element\n
This property returns an ElementTree object representing the publication information in the XML document.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest","title":"Ingest","text":"Ingest(root: ET.Element, collection: str = '')\n
Bases: Cache
The Ingest class extends the Cache class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
collection
str
A string that represents the collection of the publication
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'ingest'\n
A string that represents the type of the object, set to \"ingest\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the ingest object.
Returns:
Type Descriptiondict
Dictionary representation of the Ingest object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the ingest\n object.\n Returns:\n Dictionary representation of the Ingest object\n \"\"\"\nreturn {\nf\"lwm_tool_{x.tag}\": x.text or \"\"\nfor x in self.root.findall(\"./process/lwm_tool/*\")\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue","title":"Issue","text":"Issue(\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n)\n
Bases: Cache
The Issue class extends the Cache class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
An xml element that represents the root of the publication
newspaper
Newspaper | None
The parent newspaper
collection
str
A string that represents the collection of the publication
input_sub_path
str
TODO
meta
dotdict
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nself.publication: ET.Element = publication\nself.newspaper: Newspaper | None = newspaper\nself.collection: str = collection\nself.input_sub_path: str = input_sub_path\nself.meta: dotdict = meta\nself._issue = None\nself._issue_date = None\npath: str = str(self.get_cache_path())\nif not self.meta.issue_paths:\nself.meta.issue_paths = [path]\nelif path not in self.meta.issue_paths:\nself.meta.issue_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_code","title":"issue_code property
","text":"issue_code: str\n
Sets up and saves the issue code for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_date","title":"issue_dateproperty
","text":"issue_date: str\n
Sets up and saves the issue date for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.kind","title":"kindclass-attribute
instance-attribute
","text":"kind = 'issue'\n
A string that represents the type of the object, set to \"issue\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the issue object.
Returns:
Type Descriptiondict
Dictionary representation of the Issue object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the issue\n object.\n Returns:\n Dictionary representation of the Issue object\n \"\"\"\nif not self._issue:\nself._issue = dict(\nissue_code=self.issue_code,\nissue_date=self.issue_date,\npublication__publication_code=self.newspaper.publication_code,\ninput_sub_path=self.input_sub_path,\n)\nreturn self._issue\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the issue object.
Returns:
Type DescriptionPath
Path to the cache file for the issue object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the issue object.\n Returns:\n Path to the cache file for the issue object\n \"\"\"\njson_file = f\"/{self.newspaper.publication_code}/issues/{self.issue_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item","title":"Item","text":"Item(\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n)\n
Bases: Cache
The Item class extends the Cache class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
issue_code
str
A string that represents the issue code
digitisation
dict
TODO
ingest
dict
TODO
collection
str
A string that represents the collection of the publication
newspaper
Newspaper | None
The parent newspaper
meta
dotdict
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nif not isinstance(newspaper, Newspaper):\nraise RuntimeError(\"Expected newspaper to be of type router.Newspaper\")\nself.root: ET.Element = root\nself.issue_code: str = issue_code\nself.digitisation: dict = digitisation\nself.ingest: dict = ingest\nself.collection: str = collection\nself.newspaper: Newspaper | None = newspaper\nself.meta: dotdict = meta\nself._item_elem = None\nself._item_code = None\nself._item = None\npath: str = str(self.get_cache_path())\nif not self.meta.item_paths:\nself.meta.item_paths = [path]\nelif path not in self.meta.item_paths:\nself.meta.item_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_code","title":"item_code property
","text":"item_code: str\n
Sets up and saves the item code for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_elem","title":"item_elemproperty
","text":"item_elem\n
Sets up and saves the issue XML item for easy access as a property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.kind","title":"kindclass-attribute
instance-attribute
","text":"kind = 'item'\n
A string that represents the type of the object, set to \"item\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the item object (i.e. article).
Returns:
Type Descriptiondict
Dictionary representation of the Item object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the item object\n (i.e. article).\n Returns:\n Dictionary representation of the Item object\n \"\"\"\nif not self._item:\nself._item = {\nf\"{x.tag}\": x.text or \"\"\nfor x in self.item_elem.findall(\"*\")\nif x.tag\nin [\n\"title\",\n\"word_count\",\n\"ocr_quality_mean\",\n\"ocr_quality_sd\",\n\"plain_text_file\",\n\"item_type\",\n]\n}\nself._item[\"title\"] = self._item.get(\"title\", \"\")[:2097151]\nself._item = {\n\"item_code\": self.item_code,\n\"word_count\": self._item.get(\"word_count\", 0),\n\"title\": self._item.get(\"title\"),\n\"item_type\": self._item.get(\"item_type\"),\n\"input_filename\": self._item.get(\"plain_text_file\", \"\"),\n\"ocr_quality_mean\": self._item.get(\"ocr_quality_mean\", 0),\n\"ocr_quality_sd\": self._item.get(\"ocr_quality_sd\", 0),\n\"digitisation__software\": self.digitisation.id,\n\"ingest__lwm_tool_identifier\": self.ingest.id,\n\"issue__issue_identifier\": self.issue_code,\n\"data_provider__name\": self.collection,\n}\nreturn self._item\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the item (article) object.
Returns:
Type DescriptionPath
Path to the cache file for the article object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the item (article) object.\n Returns:\n Path to the cache file for the article object\n \"\"\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ f\"/{self.newspaper.publication_code}/items.jsonl\"\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.write_to_cache","title":"write_to_cache","text":"write_to_cache(json_indent = JSON_INDENT) -> None\n
Special cache-write function that appends to the cache file rather than writing it once at the end of the process.
Returns:
Type DescriptionNone
None.
Source code inalto2txt2fixture/router.py
def write_to_cache(self, json_indent=JSON_INDENT) -> None:\n\"\"\"\n Special cache-write function that appends rather than writes at the\n end of the process.\n Returns:\n None.\n \"\"\"\npath = self.get_cache_path()\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"a+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent) + \"\\n\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper","title":"Newspaper","text":"Newspaper(\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n)\n
Bases: Cache
The Newspaper class extends the Cache class and represents a newspaper.
The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
An xml element that represents the root of the publication.
collection
A string that represents the collection of the publication.
meta
A dotdict object that holds metadata about the publication.
jisc_papers
A pandas DataFrame object for JISC paper information.
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.publication = root.find(\"./publication\")\nself.input_sub_path = root.find(\"./process/input_sub_path\").text\nself.issue_date = self.publication.find(\"./issue/date\").text\nself.collection = collection\nself.meta = meta\nself.jisc_papers = jisc_papers\nself._newspaper = None\nself._title = None\nself._publication_code = None\npath = str(self.get_cache_path())\nif not self.meta.newspaper_paths:\nself.meta.newspaper_paths = []\nelif path not in self.meta.newspaper_paths:\nself.meta.newspaper_paths.append(path)\nif not self.meta.publication_codes:\nself.meta.publication_codes = [self.publication_code]\nelif self.publication_code not in self.meta.publication_codes:\nself.meta.publication_codes.append(self.publication_code)\nself.zip_file = Path(meta.path).name\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'newspaper'\n
A string that represents the type of the object, set to \"newspaper\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.number_paths","title":"number_pathsproperty
","text":"number_paths: list\n
Returns the nested directories in which we want to save the cache file.
Returns:
Type Descriptionlist
List of the desired directories in descending order
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code","title":"publication_codeproperty
","text":"publication_code: str\n
A property that returns the code of the publication.
Returns:
Type Descriptionstr
The code of the publication
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.title","title":"titleproperty
","text":"title: str\n
A property that returns the title of the newspaper.
Returns:
Type Descriptionstr
The title of the newspaper
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the newspaper object.
Returns:
Type Descriptiondict
Dictionary representation of the Newspaper object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the newspaper\n object.\n Returns:\n Dictionary representation of the Newspaper object\n \"\"\"\nif not self._newspaper:\nself._newspaper = dict(\n**dict(publication_code=self.publication_code, title=self.title),\n**{\nx.tag: x.text or \"\"\nfor x in self.publication.findall(\"*\")\nif x.tag in [\"location\"]\n},\n)\nreturn self._newspaper\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the newspaper object.
Returns:
Type DescriptionPath
Path to the cache file for the newspaper object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the newspaper object.\n Returns:\n Path to the cache file for the newspaper object\n \"\"\"\njson_file = f\"/{self.publication_code}/{self.publication_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\" + \"/\".join(self.number_paths) + json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code_from_input_sub_path","title":"publication_code_from_input_sub_path","text":"publication_code_from_input_sub_path() -> str | None\n
A method that returns the publication code from the input sub-path of the publication process.
Returns:
Type Descriptionstr | None
The code of the publication
Source code inalto2txt2fixture/router.py
def publication_code_from_input_sub_path(self) -> str | None:\n\"\"\"\n A method that returns the publication code from the input sub-path of\n the publication process.\n Returns:\n The code of the publication\n \"\"\"\ng = PUBLICATION_CODE.findall(self.input_sub_path)\nif len(g) == 1:\nreturn g[0]\nreturn None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.route","title":"route","text":"route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None\n
This function is responsible for setting up the path for the alto2txt mountpoint, setting up the JISC papers and routing the collections for processing.
Parameters:
Name Type Description Defaultcollections
list
List of collection names
requiredcache_home
str
Directory path for the cache
requiredmountpoint
str
Directory path for the alto2txt mountpoint
requiredjisc_papers_path
str
Path to the JISC papers
requiredreport_dir
str
Path to the report directory
requiredReturns:
Type DescriptionNone
None
Source code inalto2txt2fixture/router.py
def route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None:\n\"\"\"\n This function is responsible for setting up the path for the alto2txt\n mountpoint, setting up the JISC papers and routing the collections for\n processing.\n Args:\n collections: List of collection names\n cache_home: Directory path for the cache\n mountpoint: Directory path for the alto2txt mountpoint\n jisc_papers_path: Path to the JISC papers\n report_dir: Path to the report directory\n Returns:\n None\n \"\"\"\nglobal CACHE_HOME\nglobal MNT\nglobal REPORT_DIR\nCACHE_HOME = cache_home\nREPORT_DIR = report_dir\nMNT = Path(mountpoint) if isinstance(mountpoint, str) else mountpoint\nif not MNT.exists():\nerror(\nf\"The mountpoint provided for alto2txt does not exist. \"\nf\"Either create a local copy or blobfuse it to \"\nf\"`{MNT.absolute()}`.\"\n)\njisc_papers = setup_jisc_papers(path=jisc_papers_path)\nfor collection_name in collections:\ncollection = Collection(name=collection_name, jisc_papers=jisc_papers)\nif collection.empty:\nerror(\nf\"It looks like {collection_name} is empty in the \"\nf\"alto2txt mountpoint: `{collection.dir.absolute()}`.\"\n)\nfor archive in collection.archives:\nwith archive as _:\n[\n(\ndoc.item.write_to_cache(),\ndoc.newspaper.write_to_cache(),\ndoc.issue.write_to_cache(),\ndoc.data_provider.write_to_cache(),\ndoc.ingest.write_to_cache(),\ndoc.digitisation.write_to_cache(),\n)\nfor doc in archive.documents\n]\nreturn\n
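A sketch of a direct call (every path here is an illustrative assumption):
```python
route(
    collections=["hmd"],
    cache_home="./cache",
    mountpoint="./input",
    jisc_papers_path="./input/jisc_papers.csv",  # hypothetical CSV location
    report_dir="./reports",
)
```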
"},{"location":"reference/alto2txt2fixture/settings.html","title":"settings","text":"The settings
module provides configuration for running alto2txt2fixture
.
Most of these are managed within the settings
variable within this module.
Note
See the command line interface parameters documentation for means of modifying settings
when run.
Attributes:
Name Type DescriptionJSON_INDENT
Amount of indentation to include in output JSON
files
DATA_PROVIDER_INDEX
Final[str]
The field
used to index DataProvider
records
NEWSPAPER_COLLECTION_METADATA
Final[list[FixtureDict]]
A list of FixtureDict
s specifying specific newspaper data providers
SETUP_TITLE
str
the title printed at the commandline via cli.show_setup()
function
settings
dotdict
a dotdict
configuration for running newspaper
portions of alto2txt2fixture
Bases: TypedDict
A dict
structure to ease use as a json
database fixture.
Attributes:
Name Type Descriptionpk
int
an id to uniquely define and query each entry
model
str
what model a given record is for
fields
dict[str, Any]
a dict
of record information conforming to model
table
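An illustrative FixtureDict, shaped like the items fixtures() yields (the pk and field values are hypothetical):
```python
fixture: FixtureDict = FixtureDict(
    pk=1,
    model="newspapers.newspaper",
    fields={"publication_code": "0002645", "title": "The Example Gazette"},
)
```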
Bases: NamedTuple
A named tuple of fields for translation.
Attributes:
Name Type Descriptionstart
str
A string representing the starting field name.
finish
str | list
A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list.
lst
list[dict]
A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the start
parameter.
Bases: dict
dot.notation access to dictionary attributes
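A sketch of the common recipe behind this behaviour (an assumption, not quoted from the source): attribute access is mapped onto dictionary keys, with missing keys yielding None rather than raising AttributeError:
```python
class dotdict(dict):
    """dot.notation access to dictionary attributes (assumed recipe)."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

meta = dotdict(path="archive.zip")
assert meta.path == "archive.zip"
assert meta.missing is None  # absent keys return None via dict.get
```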
"},{"location":"reference/alto2txt2fixture/utils.html","title":"utils","text":""},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.check_newspaper_collection_configuration","title":"check_newspaper_collection_configuration","text":"check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[\nFixtureDict\n] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]\n
Check the names in collections
match the names in newspaper_collections
.
Parameters:
Name Type Description Defaultcollections
Iterable[str]
Names of newspaper collections, defaults to settings.COLLECTIONS
settings.COLLECTIONS
newspaper_collections
Iterable[FixtureDict]
Newspaper collections in a list of FixtureDict
format. Defaults to settings.FIXTURE_TABLE['dataprovider']
NEWSPAPER_COLLECTION_METADATA
data_provider_index
str
dict
fields
key
used to check matching
name
DATA_PROVIDER_INDEX
Returns:
Type Descriptionset[str]
A set of collections
without a matching newspaper_collections
record.
>>> check_newspaper_collection_configuration()\nset()\n
Source code in alto2txt2fixture/utils.py
def check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]:\n\"\"\"Check the names in `collections` match the names in `newspaper_collections`.\n Arguments:\n collections:\n Names of newspaper collections, defaults to ``settings.COLLECTIONS``\n newspaper_collections:\n Newspaper collections in a list of `FixtureDict` format. Defaults\n to ``settings.FIXTURE_TABLE['dataprovider]``\n data_provider_index:\n `dict` `fields` `key` used to check matchiching `collections` name\n Returns:\n A set of ``collections`` without a matching `newspaper_collections` record.\n Example:\n ```pycon\n >>> check_newspaper_collection_configuration()\n set()\n ```\n \"\"\"\nnewspaper_collection_names: tuple[str, ...] = tuple(\ndict_from_list_fixture_fields(\nnewspaper_collections, field_name=data_provider_index\n).keys()\n)\ncollection_diff: set[str] = set(collections) - set(newspaper_collection_names)\nif collection_diff:\nwarning(\nf\"{len(collection_diff)} `collections` \"\nf\"not in `newspaper_collections`: {collection_diff}\"\n)\nreturn collection_diff\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.clear_cache","title":"clear_cache","text":"clear_cache(dir: str | Path) -> None\n
Clears the cache directory by removing all .json
files in it.
Parameters:
Name Type Description Defaultdir
str | Path
The path of the directory to be cleared.
required Source code inalto2txt2fixture/utils.py
def clear_cache(dir: str | Path) -> None:\n\"\"\"\n Clears the cache directory by removing all `.json` files in it.\n Args:\n dir: The path of the directory to be cleared.\n \"\"\"\ndir = get_path_from(dir)\ny = input(\nf\"Do you want to erase the cache path now that the \"\nf\"files have been generated ({dir.absolute()})? [y/N]\"\n)\nif y.lower() == \"y\":\ninfo(\"Clearing up the cache directory\")\nfor x in dir.glob(\"*.json\"):\nx.unlink()\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.create_lookup","title":"create_lookup","text":"create_lookup(lst: list = [], on: list = []) -> dict\n
Create a lookup dictionary from a list of dictionaries.
Parameters:
Name Type Description Defaultlst
list
A list of dictionaries that should be used to generate the lookup.
[]
on
list
A list of keys from the dictionaries in the list that should be used as the keys in the lookup.
[]
Returns:
Type Descriptiondict
The generated lookup dictionary.
Source code inalto2txt2fixture/utils.py
def create_lookup(lst: list = [], on: list = []) -> dict:\n\"\"\"\n Create a lookup dictionary from a list of dictionaries.\n Args:\n lst: A list of dictionaries that should be used to generate the lookup.\n on: A list of keys from the dictionaries in the list that should be used as the keys in the lookup.\n Returns:\n The generated lookup dictionary.\n \"\"\"\nreturn {get_key(x, on): x[\"pk\"] for x in lst}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.dict_from_list_fixture_fields","title":"dict_from_list_fixture_fields","text":"dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]\n
Create a dict
from fixture_list
with field_name
as key
.
Parameters:
Name Type Description Defaultfixture_list
Iterable[FixtureDict]
list
of FixtureDict
with field_name
key fields
.
NEWSPAPER_COLLECTION_METADATA
field_name
str
key for values within fixture_list
fields
.
DATA_PROVIDER_INDEX
Returns:
Type Descriptiondict[str, FixtureDict]
A dict
where extracted field_name
is key for related FixtureDict
values.
>>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n>>> fixture_dict['hmd']['pk']\n2\n>>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture_dict['hmd']['fields']['code']\n'bl-hmd'\n
Source code in alto2txt2fixture/utils.py
def dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]:\n\"\"\"Create a `dict` from ``fixture_list`` with ``field_name`` as `key`.\n Args:\n fixture_list: `list` of `FixtureDict` with ``field_name`` key `fields`.\n field_name: key for values within ``fixture_list`` `fields`.\n Returns:\n A `dict` where extracted `field_name` is key for related `FixtureDict` values.\n Example:\n ```pycon\n >>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n >>> fixture_dict['hmd']['pk']\n 2\n >>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> fixture_dict['hmd']['fields']['code']\n 'bl-hmd'\n ```\n \"\"\"\nreturn {record[\"fields\"][field_name]: record for record in fixture_list}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.export_fixtures","title":"export_fixtures","text":"export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None\n
Export fixture_tables
in formats
.
This is still in an experimental phase of development and is not recommended for production.
Parameters:
Name Type Description Defaultfixture_tables
dict[str, Sequence[FixtureDict]]
dict
of table name (eg: dataprovider
) and FixtureDict
path
str | PathLike
Path to save exports in
settings.FIXTURE_TABLES_OUTPUT
prefix
str
str
to prefix export filenames with
'test-'
formats
Sequence[EXPORT_FORMATS]
list of EXPORT_FORMATS
to export
settings.FIXTURE_TABLES_FORMATS
Example >>> test_fixture_tables: dict[str, FixtureDict] = {\n... 'test0': NEWSPAPER_COLLECTION_METADATA,\n... 'test1': NEWSPAPER_COLLECTION_METADATA}\n>>> export_fixtures(test_fixture_tables, path='tests/')\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...Warning: Saving test0...\n...Warning: Saving test1...\n>>> from pandas import read_csv\n>>> fixture0_json = load_json('tests/test-test0-1.json')\n>>> fixture0_df = read_csv('tests/test-test0-1.csv')\n>>> fixture1_json = load_json('tests/test-test1-1.json')\n>>> fixture1_df = read_csv('tests/test-test1-1.csv')\n>>> fixture0_json == fixture1_json\nTrue\n>>> all(fixture0_df == fixture1_df)\nTrue\n>>> all(field in fixture0_json[0]['fields']\n... for field in ['created_at', 'updated_at'])\nTrue\n>>> fixture0_json[1]['pk']\n2\n>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n[2, 'hmd']\n
Source code in alto2txt2fixture/utils.py
def export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None:\n\"\"\"Export ``fixture_tables`` in ``formats``.\n Note:\n This is still in an experimental phase of development and is not\n recommended for production.\n Args:\n fixture_tables: `dict` of table name (eg: `dataprovider`) and `FixtureDict`\n path: Path to save exports in\n prefix: `str` to prefix export filenames with\n add_created: Whether to add `created_at` and `updated_at` timestamps\n formats: list of `EXPORT_FORMATS` to export\n Example:\n ```pycon\n >>> test_fixture_tables: dict[str, FixtureDict] = {\n ... 'test0': NEWSPAPER_COLLECTION_METADATA,\n ... 'test1': NEWSPAPER_COLLECTION_METADATA}\n >>> export_fixtures(test_fixture_tables, path='tests/')\n ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n <BLANKLINE>\n ...Warning: Saving test0...\n ...Warning: Saving test1...\n >>> from pandas import read_csv\n >>> fixture0_json = load_json('tests/test-test0-1.json')\n >>> fixture0_df = read_csv('tests/test-test0-1.csv')\n >>> fixture1_json = load_json('tests/test-test1-1.json')\n >>> fixture1_df = read_csv('tests/test-test1-1.csv')\n >>> fixture0_json == fixture1_json\n True\n >>> all(fixture0_df == fixture1_df)\n True\n >>> all(field in fixture0_json[0]['fields']\n ... for field in ['created_at', 'updated_at'])\n True\n >>> fixture0_json[1]['pk']\n 2\n >>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n [2, 'hmd']\n ```\n \"\"\"\nfor table_name, records in fixture_tables.items():\nwarning(\nf\"Saving {table_name} fixture in {formats} formats \"\nf\"to {path} *without* checks...\"\n)\nif \"json\" in formats:\nsave_fixture(\nrecords,\nprefix=f\"{prefix}{table_name}\",\noutput_path=path,\nadd_created=add_created,\n)\nif \"csv\" in formats:\nfixtures_dict2csv(records, prefix=f\"{prefix}{table_name}\", output_path=path)\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.filter_json_fields","title":"filter_json_fields","text":"filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs: Hashable\n) -> dict | list\n
Return keys
and values
from json_results
where any fields
equal value
.
Parameters:
Name Type Description Defaultfile_path
PathLike | None
The file path
to load based on extension and filter
None
fields
Sequence[str]
Which fields to check equal value
[]
value
Hashable
Value to filter by
''
Returns:
Type Descriptiondict | list
A dict
of records indexed by pk
which fit filter criteria
Raises:
Type DescriptionValueError
file_path
must have a .json
suffix
>>> from pprint import pprint\n>>> entry_fixture: dict = [\n... {\"pk\": 4889, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n... \"price_raw\": ['2d'],\n... \"year\": 1920,\n... \"date_established_raw\": \"1857\",\n... \"persons\": [], \"newspaper\": \"\"}},\n... {\"pk\": 9207, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n... \"price_raw\": ['2 \u00bd d', '3 \u00bd d'],\n... \"year\": 1856,\n... \"date_established_raw\": \"1848\",\n... \"persons\": ['Stephen Soulby'],\n... \"newspaper\": \"\",}},\n... {\"pk\": 15, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n... \"price_raw\": ['2d', '3d'],\n... \"year\": 1857,\n... \"date_established_raw\": \"November , 1842\",\n... \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n... \"newspaper\": 1187}}\n... ]\n>>> pprint(filter_json_fields(entry_fixture,\n... fields=(\"newspaper\", \"persons\"),\n... value=\"\"))\n[{'fields': {'date_established_raw': '1857',\n 'newspaper': '',\n 'persons': [],\n 'price_raw': ['2d'],\n 'title': 'BIRMINGHAM POST .',\n 'year': 1920},\n 'model': 'mitchells.entry',\n 'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n 'newspaper': '',\n 'persons': ['Stephen Soulby'],\n 'price_raw': ['2 \u00bd d', '3 \u00bd d'],\n 'title': 'ULVERSTONE ADVERTISER .',\n 'year': 1856},\n 'model': 'mitchells.entry',\n 'pk': 9207}]\n
Source code in alto2txt2fixture/utils.py
def filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs,\n) -> dict | list:\n\"\"\"Return `keys` and `values` from ``json_results`` where any `fields` equal `value`.\n Args:\n json_results: The `json` data to filter; if `None`, loaded from ``file_path``\n file_path: The file `path` to load based on extension and filter\n fields: Which fields to check equal `value`\n value: Value to filter by\n Returns:\n A `dict` of records indexed by `pk` which fit filter criteria\n Raises:\n ValueError: ``file_path`` must have a `.json` `suffix`\n Example:\n ```pycon\n >>> from pprint import pprint\n >>> entry_fixture: dict = [\n ... {\"pk\": 4889, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n ... \"price_raw\": ['2d'],\n ... \"year\": 1920,\n ... \"date_established_raw\": \"1857\",\n ... \"persons\": [], \"newspaper\": \"\"}},\n ... {\"pk\": 9207, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n ... \"price_raw\": ['2 \\u00bd d', '3 \\u00bd d'],\n ... \"year\": 1856,\n ... \"date_established_raw\": \"1848\",\n ... \"persons\": ['Stephen Soulby'],\n ... \"newspaper\": \"\",}},\n ... {\"pk\": 15, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n ... \"price_raw\": ['2d', '3d'],\n ... \"year\": 1857,\n ... \"date_established_raw\": \"November , 1842\",\n ... \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n ... \"newspaper\": 1187}}\n ... ]\n >>> pprint(filter_json_fields(entry_fixture,\n ... fields=(\"newspaper\", \"persons\"),\n ... value=\"\"))\n [{'fields': {'date_established_raw': '1857',\n 'newspaper': '',\n 'persons': [],\n 'price_raw': ['2d'],\n 'title': 'BIRMINGHAM POST .',\n 'year': 1920},\n 'model': 'mitchells.entry',\n 'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n 'newspaper': '',\n 'persons': ['Stephen Soulby'],\n 'price_raw': ['2 \\u00bd d', '3 \\u00bd d'],\n 'title': 'ULVERSTONE ADVERTISER .',\n 'year': 1856},\n 'model': 'mitchells.entry',\n 'pk': 9207}]\n ```\n \"\"\"\nif not json_results:\nassert file_path\ntry:\nassert Path(file_path).suffix == \".json\"\nexcept AssertionError:\nraise ValueError(f\"{file_path} must be `json` format.\")\njson_results = load_json(Path(file_path), **kwargs)\nassert json_results\nif isinstance(json_results, dict):\nreturn {\nk: v\nfor k, v in json_results.items()\nif any(v[\"fields\"][field] == value for field in fields)\n}\nelse:\nreturn [\nv\nfor v in json_results\nif any(v[\"fields\"][field] == value for field in fields)\n]\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_fields","title":"fixture_fields","text":"fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]\n
Generate a tuple of FixtureDict
field
names.
This is not in the utils
module to avoid a circular import.
Parameters:
Name Type Description Defaultfixture_dict
FixtureDict
A FixtureDict
instance to extract names from fields
include_pk
bool
Whether to include the pk
(primary key) column
True
Example >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n('name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> hmd_dict: dict[str, Any] = fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n>>> hmd_dict['code']\n'bl-hmd'\n>>> hmd_dict['pk']\n2\n>>> hmd_dict = fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n>>> 'pk' in hmd_dict\nFalse\n
Source code in alto2txt2fixture/utils.py
def fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]:\n\"\"\"Generate a tuple of `FixtureDict` `field` names.\n Note:\n This is not in the `utils` module to avoid a circular import.\n Args:\n fixture_dict: A `FixtureDict` instance to extract names from `fields`\n include_pk: Whether to include the `pk` (primary key) column\n as_dict: Whether to return `fields` as a `dict` of names to values\n rather than a `tuple` of names\n Example:\n ```pycon\n >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n ('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n ('name', 'code', 'legacy_code', 'collection', 'source_note')\n >>> hmd_dict: dict[str, Any] = fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n >>> hmd_dict['code']\n 'bl-hmd'\n >>> hmd_dict['pk']\n 2\n >>> hmd_dict = fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n >>> 'pk' in hmd_dict\n False\n ```\n \"\"\"\nfields: OrderedDict[str, Any] = OrderedDict(fixture_dict[\"fields\"])\nif include_pk:\nfields[\"pk\"] = fixture_dict[\"pk\"]\nfields.move_to_end(\"pk\", last=False)\nif as_dict:\nreturn fields\nelse:\nreturn tuple(fields.keys())\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_or_default_dict","title":"fixture_or_default_dict","text":"fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict\n
Return a FixtureDict
from fixture_dict
via key
index, else default_dict
.
Parameters:
Name Type Description Defaultkey
str
a str
to query fixture_dict
with
fixture_dict
dict[str, FixtureDict]
a dict
of str
to FixtureDict
, often generated by dict_from_list_fixture_fields
default_dict
FixtureDict | dict
a dict
to return if key
is not in fixture_dict
index
{}
Example >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA)\n>>> hmd_dict: FixtureDict = fixture_or_default_dict(\n... 'hmd', newspaper_dict\n... )\n>>> fixture_or_default_dict(\n... 'hmd', NEWSPAPER_COLLECTION_METADATA\n... )\n{}\n>>> fixture_or_default_dict(\n... 'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n... )\n{'a': 'default'}\n
Source code in alto2txt2fixture/utils.py
def fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict:\n\"\"\"Return a `FixtureDict` from ``fixture_dict`` via ``key`` index, else ``default_dict``.\n Args:\n key:\n a `str` to query ``fixture_dict`` with\n fixture_dict: a `dict` of `str` to `FixtureDict`, often generated by\n ``dict_from_list_fixture_fields``\n default_dict: a `dict` to return if ``key`` is not in\n ``fixture_dict`` index\n Example:\n ```pycon\n >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA)\n >>> hmd_dict: FixtureDict = fixture_or_default_dict(\n ... 'hmd', newspaper_dict\n ... )\n >>> fixture_or_default_dict(\n ... 'hmd', NEWSPAPER_COLLECTION_METADATA\n ... )\n {}\n >>> fixture_or_default_dict(\n ... 'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n ... )\n {'a': 'default'}\n ```\n \"\"\"\nif key in fixture_dict:\nreturn fixture_dict[key]\nelse:\nreturn default_dict\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixtures_dict2csv","title":"fixtures_dict2csv","text":"fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None\n
Saves fixtures generated by a generator to separate CSV
files.
This function takes an Iterable
or Generator
of fixtures and saves them to separate CSV
files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file
parameter.
Parameters:
Name Type Description Defaultfixtures
Iterable[FixtureDict] | Generator[FixtureDict, None, None]
An Iterable
or Generator
of the fixtures to be saved.
prefix
str
A string prefix to be added to the file names of the saved fixtures.
''
output_path
PathLike | str
Path to folder fixtures are saved to
settings.OUTPUT
max_elements_per_file
int
Maximum JSON
records saved in each file
settings.MAX_ELEMENTS_PER_FILE
Returns:
Type DescriptionNone
This function saves fixtures to files and does not return a value.
Example>>> from pandas import read_csv\n>>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n... prefix='test', output_path='tests/')\n>>> imported_fixture = read_csv('tests/test-1.csv')\n>>> imported_fixture.iloc[1]['pk']\n2\n>>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate `CSV` files.\n This function takes an `Iterable` or `Generator` of fixtures and saves them\n to separate `CSV` files. The fixtures are saved in batches, where each batch\n is determined by the ``max_elements_per_file`` parameter.\n Args:\n fixtures: An `Iterable` or `Generator` of the fixtures to be saved.\n prefix: A string prefix to be added to the file names of the\n saved fixtures.\n output_path: Path to folder fixtures are saved to\n index: Whether to include the `DataFrame` index in saved `CSV` files\n max_elements_per_file: Maximum `JSON` records saved in each file\n Returns:\n This function saves fixtures to files and does not return a value.\n Example:\n ```pycon\n >>> from pandas import read_csv\n >>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n ... prefix='test', output_path='tests/')\n >>> imported_fixture = read_csv('tests/test-1.csv')\n >>> imported_fixture.iloc[1]['pk']\n 2\n >>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n 'hmd'\n ```\n \"\"\"\ninternal_counter: int = 1\ncounter: int = 1\nlst: list = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in fixtures:\nlst.append(fixture_fields(item, as_dict=True))\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst: list = []\ninternal_counter = 1\ncounter += 1\nelse:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.gen_fixture_tables","title":"gen_fixture_tables","text":"gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]\n
Generator of rich.Table
instances from FixtureDict
configuration tables.
Parameters:
Name Type Description Defaultfixture_tables
dict[str, list[FixtureDict]]
dict
where key
is for Table
title and value
is a FixtureDict
{}
include_fixture_pk_column
bool
whether to include the pk
field from FixtureDict
True
Example >>> table_name: str = \"data_provider\"\n>>> tables = tuple(\n... gen_fixture_tables(\n... {table_name: NEWSPAPER_COLLECTION_METADATA}\n... ))\n>>> len(tables)\n1\n>>> assert tables[0].title == table_name\n>>> [column.header for column in tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n
Source code in alto2txt2fixture/utils.py
def gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]:\n\"\"\"Generator of `rich.Table` instances from `FixtureDict` configuration tables.\n Args:\n fixture_tables: `dict` where `key` is for `Table` title and `value` is a `FixtureDict`\n include_fixture_pk_column: whether to include the `pk` field from `FixtureDict`\n Example:\n ```pycon\n >>> table_name: str = \"data_provider\"\n >>> tables = tuple(\n ... gen_fixture_tables(\n ... {table_name: NEWSPAPER_COLLECTION_METADATA}\n ... ))\n >>> len(tables)\n 1\n >>> assert tables[0].title == table_name\n >>> [column.header for column in tables[0].columns]\n ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n ```\n \"\"\"\nfor name, fixture_records in fixture_tables.items():\nfixture_table: Table = Table(title=name)\nfor i, fixture_dict in enumerate(fixture_records):\nif i == 0:\n[\nfixture_table.add_column(name)\nfor name in fixture_fields(fixture_dict, include_fixture_pk_column)\n]\nrow_values: tuple[str, ...] = tuple(\nstr(x) for x in (fixture_dict[\"pk\"], *fixture_dict[\"fields\"].values())\n)\nfixture_table.add_row(*row_values)\nyield fixture_table\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_chunked_zipfiles","title":"get_chunked_zipfiles","text":"get_chunked_zipfiles(path: Path) -> list\n
This function takes in a Path
object path
and returns a list of lists of zipfiles
sorted and chunked according to certain conditions defined in the settings
object (see settings.CHUNK_THRESHOLD
).
Note: the function will also skip zip files larger than a certain size, which can be specified in the settings
object (see settings.SKIP_FILE_SIZE
).
Parameters:
Name Type Description Defaultpath
Path
The input path where the zipfiles are located
requiredReturns:
Type Descriptionlist
A list of lists of zipfiles
, each inner list represents a chunk of zipfiles.
alto2txt2fixture/utils.py
def get_chunked_zipfiles(path: Path) -> list:\n\"\"\"This function takes in a `Path` object `path` and returns a list of lists\n of `zipfiles` sorted and chunked according to certain conditions defined\n in the `settings` object (see `settings.CHUNK_THRESHOLD`).\n Note: the function will also skip zip files larger than a certain size,\n which can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).\n Args:\n path: The input path where the zipfiles are located\n Returns:\n A list of lists of `zipfiles`, each inner list represents a chunk of\n zipfiles.\n \"\"\"\nzipfiles = sorted(\npath.glob(\"*.zip\"),\nkey=lambda x: x.stat().st_size,\nreverse=settings.START_WITH_LARGEST,\n)\nzipfiles = [x for x in zipfiles if x.stat().st_size <= settings.SKIP_FILE_SIZE]\nif len(zipfiles) > settings.CHUNK_THRESHOLD:\nchunks = array_split(zipfiles, len(zipfiles) / settings.CHUNK_THRESHOLD)\nelse:\nchunks = [zipfiles]\nreturn chunks\n
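Example (a sketch with a hypothetical mountpoint; the chunk sizes shown are illustrative and depend on the zip files present and the `settings` values):
```pycon
>>> chunks = get_chunked_zipfiles(Path("./input/alto2txt/hmd"))  # doctest: +SKIP
>>> [len(chunk) for chunk in chunks]  # doctest: +SKIP
[24, 24, 23]
```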
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_key","title":"get_key","text":"get_key(x: dict = dict(), on: list = []) -> str\n
Get a string key from a dictionary using values from specified keys.
Parameters:
Name Type Description Defaultx
dict
A dictionary from which the key is generated.
dict()
on
list
A list of keys from the dictionary that should be used to generate the key.
[]
Returns:
Type Descriptionstr
The generated string key.
Source code inalto2txt2fixture/utils.py
def get_key(x: dict = dict(), on: list = []) -> str:\n\"\"\"\n Get a string key from a dictionary using values from specified keys.\n Args:\n x: A dictionary from which the key is generated.\n on: A list of keys from the dictionary that should be used to\n generate the key.\n Returns:\n The generated string key.\n \"\"\"\nreturn f\"{'-'.join([str(x['fields'][y]) for y in on])}\"\n
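Example (with a hypothetical fixture-style record):
```pycon
>>> get_key({"fields": {"code": "hmd", "year": 1857}}, on=["code", "year"])
'hmd-1857'
```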
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_lockfile","title":"get_lockfile","text":"get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path\n
Provides the path to any given lockfile, which controls whether any existing files should be overwritten or not.
Parameters:
Name Type Description Defaultcollection
str
Collection folder name
requiredkind
NewspaperElements
Either newspaper
or issue
or item
dic
dict
A dictionary with required information for either kind
passed
Returns:
Type DescriptionPath
Path to the resulting lockfile
Source code inalto2txt2fixture/utils.py
def get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path:\n\"\"\"\n Provides the path to any given lockfile, which controls whether any\n existing files should be overwritten or not.\n Args:\n collection: Collection folder name\n kind: Either `newspaper` or `issue` or `item`\n dic: A dictionary with required information for either `kind` passed\n Returns:\n Path to the resulting lockfile\n \"\"\"\np: Path\nbase = Path(f\"cache-lockfiles/{collection}\")\nif kind == \"newspaper\":\np = base / f\"newspapers/{dic['publication_code']}\"\nelif kind == \"issue\":\np = base / f\"issues/{dic['publication__publication_code']}/{dic['issue_code']}\"\nelif kind == \"item\":\ntry:\nif dic.get(\"issue_code\"):\np = base / f\"items/{dic['issue_code']}/{dic['item_code']}\"\nelif dic.get(\"issue__issue_identifier\"):\np = base / f\"items/{dic['issue__issue_identifier']}/{dic['item_code']}\"\nexcept KeyError:\nerror(\"An unknown error occurred (in get_lockfile)\")\nelse:\np = base / \"lockfile\"\np.parent.mkdir(parents=True, exist_ok=True) if settings.WRITE_LOCKFILES else None\nreturn p\n
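Example (the `publication_code` is hypothetical; note the parent directory is only created when `settings.WRITE_LOCKFILES` is set):
```pycon
>>> get_lockfile("hmd", "newspaper", {"publication_code": "0002083"})  # doctest: +SKIP
PosixPath('cache-lockfiles/hmd/newspapers/0002083')
```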
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_now","title":"get_now","text":"get_now(as_str: bool = False) -> datetime.datetime | str\n
Return datetime.now()
as either a string or datetime
object.
Parameters:
Name Type Description Defaultas_str
bool
Whether to return now
time
as a str
or not, default: False
False
Returns:
Type Descriptiondatetime.datetime | str
datetime.now()
in pytz.UTC
time zone as a string if as_str
, else as a datetime.datetime
object.
alto2txt2fixture/utils.py
def get_now(as_str: bool = False) -> datetime.datetime | str:\n\"\"\"\n Return `datetime.now()` as either a string or `datetime` object.\n Args:\n as_str: Whether to return `now` `time` as a `str` or not, default: `False`\n Returns:\n `datetime.now()` in `pytz.UTC` time zone as a string if `as_str`, else\n as a `datetime.datetime` object.\n \"\"\"\nnow = datetime.datetime.now(tz=pytz.UTC)\nif as_str:\nreturn str(now)\nelse:\nassert isinstance(now, datetime.datetime)\nreturn now\n
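Example (the timestamp shown is illustrative):
```pycon
>>> isinstance(get_now(), datetime.datetime)
True
>>> get_now(as_str=True)  # doctest: +SKIP
'2023-08-01 12:00:00.123456+00:00'
```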
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_path_from","title":"get_path_from","text":"get_path_from(p: str | Path) -> Path\n
Converts an input value into a Path object if it's not already one.
Parameters:
Name Type Description Defaultp
str | Path
The input value, which can be a string or a Path object.
requiredReturns:
Type DescriptionPath
The input value as a Path object.
Source code inalto2txt2fixture/utils.py
def get_path_from(p: str | Path) -> Path:\n\"\"\"\n Converts an input value into a Path object if it's not already one.\n Args:\n p: The input value, which can be a string or a Path object.\n Returns:\n The input value as a Path object.\n \"\"\"\nif isinstance(p, str):\np = Path(p)\nif not isinstance(p, Path):\nraise RuntimeError(f\"Unable to handle type: {type(p)}\")\nreturn p\n
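Example (on a POSIX system):
```pycon
>>> get_path_from("cache/example.json")
PosixPath('cache/example.json')
```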
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_size_from_path","title":"get_size_from_path","text":"get_size_from_path(p: str | Path, raw: bool = False) -> str | float\n
Returns a human-readable string for any given file size.
Parameters:
Name Type Description Defaultp
str | Path
Path to read the size from
requiredraw
bool
Whether to return the file size as total number of bytes or a human-readable MB/GB amount
False
Returns:
Type Descriptionstr | float
Return str
followed by MB
or GB
for size if not raw
otherwise float
.
alto2txt2fixture/utils.py
def get_size_from_path(p: str | Path, raw: bool = False) -> str | float:\n\"\"\"\n Returns a human-readable string for any given file size.\n Args:\n p: Path to read the size from\n raw: Whether to return the file size as total number of bytes or\n a human-readable MB/GB amount\n Returns:\n Return `str` followed by `MB` or `GB` for size if not `raw` otherwise `float`.\n \"\"\"\np = get_path_from(p)\nbytes = p.stat().st_size\nif raw:\nreturn bytes\nrel_size: float | int | str = round(bytes / 1000 / 1000 / 1000, 1)\nassert not isinstance(rel_size, str)\nif rel_size < 0.5:\nrel_size = round(bytes / 1000 / 1000, 1)\nrel_size = f\"{rel_size}MB\"\nelse:\nrel_size = f\"{rel_size}GB\"\nreturn rel_size\n
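Example (a hypothetical file; the sizes shown are illustrative):
```pycon
>>> get_size_from_path("./input/example.zip")  # doctest: +SKIP
'1.2MB'
>>> get_size_from_path("./input/example.zip", raw=True)  # doctest: +SKIP
1200000
```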
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.glob_filter","title":"glob_filter","text":"glob_filter(p: str) -> list\n
Return an ordered glob, filtering out unwanted files such as macOS .DS_Store.
Parameters:
Name Type Description Defaultp
str
Path to a directory to filter
requiredReturns:
Type Descriptionlist
Sorted list of files contained in the provided path without the ones
list
whose names start with a .
alto2txt2fixture/utils.py
def glob_filter(p: str) -> list:\n\"\"\"\n Return an ordered glob, filtering out unwanted files such as macOS `.DS_Store`.\n Args:\n p: Path to a directory to filter\n Returns:\n Sorted list of files contained in the provided path without the ones\n whose names start with a `.`\n \"\"\"\nreturn sorted([x for x in get_path_from(p).glob(\"*\") if not x.name.startswith(\".\")])\n
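Example (a hypothetical directory that also contains a `.DS_Store` file, which is dropped):
```pycon
>>> glob_filter("./input/alto2txt")  # doctest: +SKIP
[PosixPath('input/alto2txt/bna'), PosixPath('input/alto2txt/hmd'), PosixPath('input/alto2txt/lwm')]
```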
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.list_json_files","title":"list_json_files","text":"list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]\n
List json
files under the path specified in p
.
Parameters:
Name Type Description Defaultp
str | Path
The path to search for json
files
drill
bool
A flag indicating whether to drill down the subdirectories or not. Default is False
False
exclude_names
list
A list of file names to exclude from the search result. Default is an empty list
[]
include_names
list
A list of file names to include in search result. If provided, the exclude_names
argument will be ignored. Default is an empty list
[]
Returns:
Type DescriptionGenerator[Path, None, None] | list[Path]
A list of Path
objects pointing to the found json
files
alto2txt2fixture/utils.py
def list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]:\n\"\"\"\n List `json` files under the path specified in ``p``.\n Args:\n p: The path to search for `json` files\n drill: A flag indicating whether to drill down the subdirectories\n or not. Default is ``False``\n exclude_names: A list of file names to exclude from the search\n result. Default is an empty list\n include_names: A list of file names to include in search result.\n If provided, the ``exclude_names`` argument will be ignored.\n Default is an empty list\n Returns:\n A list of `Path` objects pointing to the found `json` files\n \"\"\"\nq: str = \"**/*.json\" if drill else \"*.json\"\nfiles = get_path_from(p).glob(q)\nif exclude_names:\nfiles = list({x for x in files if x.name not in exclude_names})\nelif include_names:\nfiles = list({x for x in files if x.name in include_names})\nreturn sorted(files)\n
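Example (the output file names below are hypothetical):
```pycon
>>> list_json_files("./output/fixtures")  # doctest: +SKIP
[PosixPath('output/fixtures/Newspaper-1.json'), PosixPath('output/fixtures/Newspaper-2.json')]
```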
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_json","title":"load_json","text":"load_json(p: str | Path, crash: bool = False) -> dict | list\n
Easier access to reading json
files.
Parameters:
Name Type Description Defaultp
str | Path
Path to read json
from
crash
bool
Whether the program should crash if there is a json
decode error, default: False
False
Returns:
Type Descriptiondict | list
The decoded json
contents from the path, but an empty dictionary
dict | list
if the file cannot be decoded and crash
is set to False
alto2txt2fixture/utils.py
def load_json(p: str | Path, crash: bool = False) -> dict | list:\n\"\"\"\n Easier access to reading `json` files.\n Args:\n p: Path to read `json` from\n crash: Whether the program should crash if there is a `json` decode\n error, default: ``False``\n Returns:\n The decoded `json` contents from the path, but an empty dictionary\n if the file cannot be decoded and ``crash`` is set to ``False``\n \"\"\"\np = get_path_from(p)\ntry:\nreturn json.loads(p.read_text())\nexcept json.JSONDecodeError:\nmsg = f\"Error: {p.read_text()}\"\nerror(msg, crash=crash)\nreturn {}\n
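Example (round-tripping a minimal, made-up record via `write_json`, with `add_created=False` so no timestamps are appended):
```pycon
>>> write_json('tests/load-json-example.json',
...            {'pk': 1, 'model': 'example', 'fields': {}},
...            add_created=False)
>>> load_json('tests/load-json-example.json')
{'pk': 1, 'model': 'example', 'fields': {}}
```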
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_multiple_json","title":"load_multiple_json","text":"load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list\n
Load multiple json
files and return a list of their content.
Parameters:
Name Type Description Defaultp
str | Path
The path to search for json
files
drill
bool
A flag indicating whether to drill down the subdirectories or not. Default is False
False
filter_na
bool
A flag indicating whether to filter out the content that is None
. Default is True
.
True
crash
bool
A flag indicating whether to raise an exception when an error occurs while loading a json
file. Default is False
.
False
Returns:
Type Descriptionlist
A list
of the content of the loaded json
files.
alto2txt2fixture/utils.py
def load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list:\n\"\"\"\n Load multiple `json` files and return a list of their content.\n Args:\n p: The path to search for `json` files\n drill: A flag indicating whether to drill down the subdirectories\n or not. Default is `False`\n filter_na: A flag indicating whether to filter out the content that\n is `None`. Default is `True`.\n crash: A flag indicating whether to raise an exception when an\n error occurs while loading a `json` file. Default is `False`.\n Returns:\n A `list` of the content of the loaded `json` files.\n \"\"\"\nfiles = list_json_files(p, drill=drill)\ncontent = [load_json(x, crash=crash) for x in files]\nreturn [x for x in content if x] if filter_na else content\n
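Example (a sketch; the output depends entirely on the `json` files found under the path):
```pycon
>>> load_multiple_json('./output/fixtures')  # doctest: +SKIP
[[{'pk': 1, 'model': 'newspapers.newspaper', 'fields': {...}}], ...]
```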
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.lock","title":"lock","text":"lock(lockfile: Path) -> None\n
Creates an empty lockfile, after making sure the parent directory exists.
Parameters:
Name Type Description Defaultlockfile
Path
The path to the lock file to be created
requiredReturns:
Type DescriptionNone
None
Source code inalto2txt2fixture/utils.py
def lock(lockfile: Path) -> None:\n\"\"\"\n Creates an empty lockfile, after making sure the parent directory exists.\n Args:\n lockfile: The path to the lock file to be created\n Returns:\n None\n \"\"\"\nlockfile.parent.mkdir(parents=True, exist_ok=True)\nlockfile.write_text(\"\")\nreturn\n
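Example (a hypothetical lockfile path; the file is created empty):
```pycon
>>> lockfile = Path('cache-lockfiles/hmd/newspapers/0002083')
>>> lock(lockfile)
>>> lockfile.exists()
True
>>> lockfile.read_text()
''
```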
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.save_fixture","title":"save_fixture","text":"save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n
Saves fixtures generated by a generator to separate JSON files.
This function takes a generator and saves the generated fixtures to separate JSON files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file
parameter.
Parameters:
Name Type Description Defaultgenerator
Sequence | Generator
A generator that yields the fixtures to be saved.
[]
prefix
str
A string prefix to be added to the file names of the saved fixtures.
''
output_path
PathLike | str
Path to folder fixtures are saved to
settings.OUTPUT
max_elements_per_file
int
Maximum JSON
records saved in each file
settings.MAX_ELEMENTS_PER_FILE
add_created
bool
Whether to add created_at
and updated_at
timestamps
True
json_indent
int
Number of indent spaces per line in saved JSON
JSON_INDENT
Returns:
Type DescriptionNone
This function saves the fixtures to files but does not return any value.
Example>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n... prefix='test', output_path='tests/')\n>>> imported_fixture = load_json('tests/test-1.json')\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> 'created_at' in imported_fixture[1]['fields']\nTrue\n
Source code in alto2txt2fixture/utils.py
def save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate JSON files.\n This function takes a generator and saves the generated fixtures to\n separate JSON files. The fixtures are saved in batches, where each batch\n is determined by the ``max_elements_per_file`` parameter.\n Args:\n generator: A generator that yields the fixtures to be saved.\n prefix: A string prefix to be added to the file names of the\n saved fixtures.\n output_path: Path to folder fixtures are saved to\n max_elements_per_file: Maximum `JSON` records saved in each file\n add_created: Whether to add `created_at` and `updated_at` `timestamps`\n json_indent: Number of indent spaces per line in saved `JSON`\n Returns:\n This function saves the fixtures to files but does not return\n any value.\n Example:\n ```pycon\n >>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n ... prefix='test', output_path='tests/')\n >>> imported_fixture = load_json('tests/test-1.json')\n >>> imported_fixture[1]['pk']\n 2\n >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> 'created_at' in imported_fixture[1]['fields']\n True\n ```\n \"\"\"\ninternal_counter = 1\ncounter = 1\nlst = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in generator:\nlst.append(item)\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst = []\ninternal_counter = 1\ncounter += 1\nelse:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.write_json","title":"write_json","text":"write_json(\np: str | Path,\no: dict,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n
Easier access to writing json
files. Checks whether parent exists.
Parameters:
Name Type Description Defaultp
str | Path
Path to write json
to
o
dict
Object to write to json
file
add_created
bool
If set to True will add created_at
and updated_at
to the dictionary's fields. If created_at
and updated_at
already exist in the fields, they will be forcefully updated.
True
json_indent
int
What indentation format to write out JSON
file in
JSON_INDENT
Returns:
Type DescriptionNone
None
Example>>> path = 'test-write-json/example.json'\n>>> write_json(p=path,\n... o=NEWSPAPER_COLLECTION_METADATA,\n... add_created=True)\n>>> imported_fixture = load_json(path)\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def write_json(\np: str | Path, o: dict, add_created: bool = True, json_indent: int = JSON_INDENT\n) -> None:\n\"\"\"\n Easier access to writing `json` files. Checks whether parent exists.\n Args:\n p: Path to write `json` to\n o: Object to write to `json` file\n add_created:\n If set to True will add `created_at` and `updated_at`\n to the dictionary's fields. If `created_at` and `updated_at`\n already exist in the fields, they will be forcefully updated.\n json_indent:\n What indentation format to write out `JSON` file in\n Returns:\n None\n Example:\n ```pycon\n >>> path = 'test-write-json/example.json'\n >>> write_json(p=path,\n ... o=NEWSPAPER_COLLECTION_METADATA,\n ... add_created=True)\n >>> imported_fixture = load_json(path)\n >>> imported_fixture[1]['pk']\n 2\n >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n ```\n \"\"\"\np = get_path_from(p)\nif not (isinstance(o, dict) or isinstance(o, list)):\nraise RuntimeError(f\"Unable to handle data of type: {type(o)}\")\ndef _append_created_fields(o: dict):\n\"\"\"Add `created_at` and `updated_at` fields to a `dict` with `FixtureDict` values.\"\"\"\nreturn dict(\n**{k: v for k, v in o.items() if not k == \"fields\"},\nfields=dict(\n**{\nk: v\nfor k, v in o[\"fields\"].items()\nif not k == \"created_at\" and not k == \"updated_at\"\n},\n**{\"created_at\": NOW_str, \"updated_at\": NOW_str},\n),\n)\ntry:\nif add_created and isinstance(o, dict):\no = _append_created_fields(o)\nelif add_created and isinstance(o, list):\no = [_append_created_fields(x) for x in o]\nexcept KeyError:\nerror(\"An unknown error occurred (in write_json)\")\np.parent.mkdir(parents=True, exist_ok=True)\np.write_text(json.dumps(o, indent=json_indent))\nreturn\n
"},{"location":"tutorial/first-steps.html","title":"First Steps","text":""},{"location":"tutorial/first-steps.html#installing","title":"Installing","text":"The installation process should be fairly easy to take care of, using poetry
:
$ poetry install\n
However, this is only the first step in the process. As the script works through the alto2txt
collections, you will either need to choose the slower option \u2014 mounting them to your computer (using blobfuse
) \u2014\u00a0or the faster option \u2014 downloading the required zip files from the Azure storage to your local hard drive. In the two following sections, both of those options are described.
alto2txt
to the program","text":""},{"location":"tutorial/first-steps.html#downloading-local-copies-of-alto2txt-on-your-computer","title":"Downloading local copies of alto2txt
on your computer","text":"This option will take up a lot of hard drive space
As of the time of writing, downloading all of alto2txt
\u2019s metadata takes up about 185GB on your local drive.
You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.
"},{"location":"tutorial/first-steps.html#step-1-log-in-to-azure-using-microsoft-azure-storage-explorer","title":"Step 1: Log in to Azure using Microsoft Azure Storage Explorer","text":"Microsoft Azure Storage Explorer (MASE) is a great and free tool for downloading content off Azure. Your first step is to download and install this product on your local computer.
Once you have opened MASE, you will need to sign into the appropriate Azure account.
"},{"location":"tutorial/first-steps.html#step-2-download-the-alto2txt-blob-container-to-your-hard-drive","title":"Step 2: Download thealto2txt
blob container to your hard drive","text":"On your left-hand side, you should see a menu where you can navigate to the correct \u201cblob container\u201d: Living with Machines
> Storage Accounts
> alto2txt
> Blob Containers
:
You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:
Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata
directory only onto your computer:
Select all of the files and press the download button:
Make sure you save all the zip files inside the correct local folder:
The \u201cActivities\u201d bar will now show you the progress and speed:
"},{"location":"tutorial/first-steps.html#mounting-alto2txt-on-your-computer","title":"Mountingalto2txt
on your computer","text":"This option will only work on a Linux or UNIX computer
If you have a Mac, your only option is the one above.
"},{"location":"tutorial/first-steps.html#step-1-install-blobfuse","title":"Step 1: Install BlobFuse","text":"Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.
"},{"location":"tutorial/first-steps.html#step-2-set-up-sas-tokens","title":"Step 2: Set up SAS tokens","text":"Follow the instructions for setting up access to your Azure storage account.
"},{"location":"tutorial/first-steps.html#step-3-mount-your-blobs","title":"Step 3: Mount your blobs","text":"TODO #3: Write this section.
Note that you can also search online for ideas on how to write local scripts that make reconnecting easier next time.
"}]} \ No newline at end of file diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 0000000..8713024 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"alto2txt2fixture","text":"alto2txt2fixture
is a standalone tool to convert alto2txt
XML
output and other related datasets into JSON
(and where feasible CSV
) data with corresponding relational IDs to ease general use and ingestion into a relational database.
We target the JSON
produced for importing into lwmdb
: a database built using the Django
python
webframework database fixture
structure.
We provide a command line interface to process alto2txt
XML
files stored locally (or mounted via azure
blobfuse
), and for additional public data we automate a means of downloading those automatically.
We recommend downloading a copy of the repository or using git clone
. From a local copy use poetry
to install dependencies:
$ cd alto2txt2fixture\n$ poetry install\n
If you would like to test, render documentation and/or contribute to the code included dev
dependencies in a local install:
$ poetry install --with dev\n
"},{"location":"index.html#simple-use","title":"Simple use","text":"To processing newspaper metadata with a local copy of alto2txt
XML
results, it's easiest to have that data in the same folder as your alto2txt2fixture
checkout and poetry
installed folder. Once arranged, you should be able to begin the JSON
conversion with
$ poetry run a2t2f-news\n
To generate related data in JSON
and CSV
form, assuming you have an internet connection and access to a living-with-machines
azure
account, the following will download related data into JSON
and CSV
files. The JSON
results should be consistent with lwmdb
tables for ease of import.
$ poetry run a2t2f-adj\n
"},{"location":"running.html","title":"Running the Program","text":""},{"location":"running.html#using-poetry-to-run","title":"Using poetry
to run","text":"The program should run automatically with the following command:
$ poetry run a2t2f-news\n
Alternatively, if you want to add optional parameters and don\u2019t want to use the standard poetry
script to run, you can use the (somewhat convoluted) poetry run alto2txt2fixture/run.py
and provide any optional parameters. You can see a list of all the \u201cOptional parameters\u201d below. For example, if you want to only include the hmd
collection:
$ poetry run alto2txt2fixture/run.py --collections hmd\n
"},{"location":"running.html#alternative-run-the-script-without-poetry","title":"Alternative: Run the script without poetry","text":"If you find yourself in trouble with poetry
, the program should run perfectly fine on its own, assuming the dependencies are installed. The same command, then, would be:
$ python alto2txt2fixture/run.py --collections hmd\n
Note
See the list under [tool.poetry.dependencies]
in pyproject.toml
for a list of dependencies that would need to be installed for alto2txt2fixture
to work outside a python poetry
environment.
The program has a number of optional parameters that you can choose to include or not. The table below describes each parameter, how to pass it to the program, and what its defaults are.
Flag Description Default value-c
, --collections
Which collections to process in the mounted alto2txt directory hmd
, lwm
, jisc
, bna
-o
, --output
Into which directory should the processed files be put? ./output/fixtures/
-m
, --mountpoint
Where are the alto2txt directories mounted? ./input/alto2txt/
-t
, --test-config
Print the config table but do not run False
"},{"location":"running.html#successfully-running-the-program-an-example","title":"Successfully running the program: An example","text":""},{"location":"understanding-results.html","title":"Understanding the Results","text":""},{"location":"understanding-results.html#the-resulting-file-structure","title":"The resulting file structure","text":"The examples below follow standard settings
If you choose other settings for when you run the program, your output directory may look different from the information on this page.
"},{"location":"understanding-results.html#reports","title":"Reports","text":"Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports
directory, you\u2019ll find a JSON file for each alto2txt
directory (organised by NLP identifier).
The report structure, thus, looks like this:
The JSON file has some good troubleshooting information. You\u2019ll find that the contents are structured as a Python dictionary
(or JavaScript Object
). Here is an example:
Here is an explanation of each of the keys in the dictionary:
Key Explanation Data typepath
The input path for the zip file that is being converted. string
bytes
The size of the input zip file represented in bytes. integer
size
The size of the input zip file represented in a human-readable string. string
contents
#TODO #3 integer
start
Date and time when processing started (see also end
below). datestring
newspaper_paths
#TODO #3 list
(string
) publication_codes
A list of the NLPs that are contained in the input zip file. list
(string
) issue_paths
A list of all the issue paths that are contained in the cache directory. list
(string
) item_paths
A list of all the item paths that are contained in the cache directory. list
(string
) end
Date and time when processing ended (see also start
above). datestring
seconds
Seconds that the script spent interpreting the zip file (should be added to the microseconds
below). integer
microseconds
Microseconds that the script spent interpreting the zip file (should be added to the seconds
above). integer
"},{"location":"understanding-results.html#fixtures","title":"Fixtures","text":"The most important output of the script is contained in the fixtures
directory. This directory contains JSON files for all the different columns in the corresponding Django metadata database (i.e. DataProvider
, Digitisation
, Ingest
, Issue
, Newspaper
, and Item
). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6
elements*:
Each JSON file contains a Python-like list
(JavaScript Array
) of dictionaries
(JavaScript Objects
), which have a primary key (pk
), the related database model (in the example below the Django newspapers
app\u2019s newspaper
table), and a nested dictionary
/Object
which contains all the values for the database\u2019s table entry:
* The maximum elements per file can be adjusted in the settings.py
file\u2019s settings
object\u2019s MAX_ELEMENTS_PER_FILE
value.
Entry point for alto2txt2fixture.parse
to convert alto2txt
XML
-> JSON
.
This module defines the run function which is the main driver for the entire process.
It imports various functions from other modules and uses them to route and parse XML
data generated by alto2txt
.
The following steps are performed in the run function:
alto2txt
data into subdirectories with structured files.JSON
files.If the script is run as a main
program (i.e. if the name of the script is __main__
), the run()
function is executed.
Note: at present this does not include any functunality in create_adjacent_tables.py
parse_args(argv: list[str] | None = None) -> Namespace\n
Manage command line arguments for run()
This constructs an ArgumentParser
instance to manage configurating calls of run()
to manage newspaper
XML
to JSON
converstion.
Parameters:
Name Type Description Defaultargv
list[str] | None
If None
treat as equivalent of ['--help], if a
listof
strpass those options to
ArgumentParser`
None
Returns:
Type DescriptionNamespace
A Namespace
dict
-like configuration for run()
alto2txt2fixture/__main__.py
def parse_args(argv: list[str] | None = None) -> Namespace:\n\"\"\"Manage command line arguments for `run()`\n This constructs an `ArgumentParser` instance to manage\n configurating calls of `run()` to manage `newspaper`\n `XML` to `JSON` converstion.\n Arguments:\n argv:\n If `None` treat as equivalent of ['--help`],\n if a `list` of `str` pass those options to `ArgumentParser`\n Returns:\n A `Namespace` `dict`-like configuration for `run()`\n \"\"\"\nargv = None if not argv else argv\nparser = ArgumentParser(\nprog=\"a2t2f-news\",\ndescription=\"Process alto2txt XML into and Django JSON Fixture files\",\nepilog=(\n\"Note: this is still in beta mode and contributions welcome\\n\\n\" + __doc__\n),\nformatter_class=RawTextHelpFormatter,\n)\nparser.add_argument(\n\"-c\",\n\"--collections\",\nnargs=\"+\",\nhelp=\"<Optional> Set collections\",\nrequired=False,\n)\nparser.add_argument(\n\"-m\",\n\"--mountpoint\",\ntype=str,\nhelp=\"<Optional> Mountpoint\",\nrequired=False,\n)\nparser.add_argument(\n\"-o\",\n\"--output\",\ntype=str,\nhelp=\"<Optional> Set an output directory\",\nrequired=False,\n)\nparser.add_argument(\n\"-t\",\n\"--test-config\",\ndefault=False,\nhelp=\"Only print the configuration\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"-f\",\n\"--show-fixture-tables\",\ndefault=True,\nhelp=\"Print included fixture table configurations\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--export-fixture-tables\",\ndefault=True,\nhelp=\"Experimental: export fixture tables prior to data processing\",\naction=BooleanOptionalAction,\n)\nparser.add_argument(\n\"--data-provider-field\",\ntype=str,\ndefault=DATA_PROVIDER_INDEX,\nhelp=\"Key for indexing DataProvider records\",\n)\nreturn parser.parse_args(argv)\n
"},{"location":"reference/alto2txt2fixture/__main__.html#alto2txt2fixture.__main__.run","title":"run","text":"run(local_args: list[str] | None = None) -> None\n
Manage running newspaper XML
to JSON
conversion.
First parse_args
is called for command line arguments including:
collections
output
mountpoint
If any of these arguments are specified, they will be used, otherwise they will default to the values in the settings
module.
The show_setup
function is then called to display the configurations being used.
The route
function is then called to route the alto2txt files into subdirectories with structured files.
The parse
function is then called to parse the resulting JSON files.
Finally, the clear_cache
function is called to clear the cache (pending the user's confirmation).
Parameters:
Name Type Description Defaultlocal_args
list[str] | None
Options passed to parse_args()
None
Source code in alto2txt2fixture/__main__.py
def run(local_args: list[str] | None = None) -> None:\n\"\"\"Manage running newspaper `XML` to `JSON` conversion.\n First `parse_args` is called for command line arguments including:\n - `collections`\n - `output`\n - `mountpoint`\n If any of these arguments are specified, they will be used, otherwise they\n will default to the values in the `settings` module.\n The `show_setup` function is then called to display the configurations\n being used.\n The `route` function is then called to route the alto2txt files into\n subdirectories with structured files.\n The `parse` function is then called to parse the resulting JSON files.\n Finally, the `clear_cache` function is called to clear the cache\n (pending the user's confirmation).\n Arguments:\n local_args:\n Options passed to `parse_args()`\n \"\"\"\nargs: Namespace = parse_args(argv=local_args)\nif args.collections:\nCOLLECTIONS = [x.lower() for x in args.collections]\nelse:\nCOLLECTIONS = settings.COLLECTIONS\nif args.output:\nOUTPUT = args.output.rstrip(\"/\")\nelse:\nOUTPUT = settings.OUTPUT\nif args.mountpoint:\nMOUNTPOINT = args.mountpoint.rstrip(\"/\")\nelse:\nMOUNTPOINT = settings.MOUNTPOINT\nshow_setup(\nCOLLECTIONS=COLLECTIONS,\nOUTPUT=OUTPUT,\nCACHE_HOME=settings.CACHE_HOME,\nMOUNTPOINT=MOUNTPOINT,\nJISC_PAPERS_CSV=settings.JISC_PAPERS_CSV,\nREPORT_DIR=settings.REPORT_DIR,\nMAX_ELEMENTS_PER_FILE=settings.MAX_ELEMENTS_PER_FILE,\n)\nif args.show_fixture_tables:\n# Show a table of fixtures used, defaults to DataProvider Table\nshow_fixture_tables(settings, data_provider_index=args.data_provider_field)\nif args.export_fixture_tables:\nexport_fixtures(\nfixture_tables=settings.FIXTURE_TABLES,\npath=OUTPUT,\nformats=settings.FIXTURE_TABLES_FORMATS,\n)\nif not args.test_config:\n# Routing alto2txt into subdirectories with structured files\nroute(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nMOUNTPOINT,\nsettings.JISC_PAPERS_CSV,\nsettings.REPORT_DIR,\n)\n# Parsing the resulting JSON files\nparse(\nCOLLECTIONS,\nsettings.CACHE_HOME,\nOUTPUT,\nsettings.MAX_ELEMENTS_PER_FILE,\n)\nclear_cache(settings.CACHE_HOME)\n
"},{"location":"reference/alto2txt2fixture/cli.html","title":"cli","text":""},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_fixture_tables","title":"show_fixture_tables","text":"show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]\n
Print fixture tables specified in `settings.fixture_tables` in `rich.Table` format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `run_settings` | `dotdict` | `alto2txt2fixture` run configuration | `settings` |
| `print_in_call` | `bool` | whether to print to console (will use the `console` variable if so) | `True` |
| `data_provider_index` | `str` | key to index `dataprovider` from `NEWSPAPER_COLLECTION_METADATA` | `DATA_PROVIDER_INDEX` |

Returns:

| Type | Description |
| --- | --- |
| `list[Table]` | A `list` of `rich.Table` renders from configurations in `run_settings.FIXTURE_TABLES` |
>>> fixture_tables: list[Table] = show_fixture_tables(\n... settings,\n... print_in_call=False)\n>>> len(fixture_tables)\n1\n>>> fixture_tables[0].title\n'dataprovider'\n>>> [column.header for column in fixture_tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n>>> fixture_tables = show_fixture_tables(settings)\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n
Note: It is possible for the example test to fail at different screen sizes. Try increasing the window or screen width of the terminal used before raising an issue.
Source code in `alto2txt2fixture/cli.py`
def show_fixture_tables(\nrun_settings: dotdict = settings,\nprint_in_call: bool = True,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> list[Table]:\n\"\"\"Print fixture tables specified in ``settings.fixture_tables`` in `rich.Table` format.\n Arguments:\n run_settings: `alto2txt2fixture` run configuration\n print_in_call: whether to print to console (will use ``console`` variable if so)\n data_provider_index: key to index `dataprovider` from ``NEWSPAPER_COLLECTION_METADATA``\n Returns:\n A `list` of `rich.Table` renders from configurations in ``run_settings.FIXTURE_TABLES``\n Example:\n ```pycon\n >>> fixture_tables: list[Table] = show_fixture_tables(\n ... settings,\n ... print_in_call=False)\n >>> len(fixture_tables)\n 1\n >>> fixture_tables[0].title\n 'dataprovider'\n >>> [column.header for column in fixture_tables[0].columns]\n ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n >>> fixture_tables = show_fixture_tables(settings)\n ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n <BLANKLINE>\n ...dataprovider...Heritage...\u2502 bl-hmd...\u2502 hmd...\n ```\n Note:\n It is possible for the example test to fail in different screen sizes. Try\n increasing the window or screen width of terminal used to check before\n raising an issue.\n \"\"\"\nif run_settings.FIXTURE_TABLES:\nif \"dataprovider\" in run_settings.FIXTURE_TABLES:\ncheck_newspaper_collection_configuration(\nrun_settings.COLLECTIONS,\nrun_settings.FIXTURE_TABLES[\"dataprovider\"],\ndata_provider_index=data_provider_index,\n)\nconsole_tables: list[Table] = list(\ngen_fixture_tables(run_settings.FIXTURE_TABLES)\n)\nif print_in_call:\nfor console_table in console_tables:\nconsole.print(console_table)\nreturn console_tables\nelse:\nreturn []\n
"},{"location":"reference/alto2txt2fixture/cli.html#alto2txt2fixture.cli.show_setup","title":"show_setup","text":"show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs: str) -> None\n
Generate a rich.table.Table
for printing configuration to console.
alto2txt2fixture/cli.py
def show_setup(clear: bool = True, title: str = SETUP_TITLE, **kwargs) -> None:\n\"\"\"Generate a `rich.table.Table` for printing configuration to console.\"\"\"\nif clear and os.name == \"posix\":\nos.system(\"clear\")\nelif clear:\nos.system(\"cls\")\ntable = Table(title=title)\ntable.add_column(\"Setting\", justify=\"right\", style=\"cyan\", no_wrap=True)\ntable.add_column(\"Value\", style=\"magenta\")\nfor key, value in kwargs.items():\ntable.add_row(str(key), str(value))\nconsole.print(table)\nreturn\n
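For example (a sketch; `show_setup` simply prints each keyword argument as a `Setting`/`Value` row, so any keys may be passed):

```python
from alto2txt2fixture.cli import show_setup

# Prints a two-column rich table without clearing the terminal first
show_setup(clear=False, COLLECTIONS=["hmd"], OUTPUT="./output")
```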
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html","title":"create_adjacent_tables","text":""},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.correct_dict","title":"correct_dict","text":"correct_dict(o: dict) -> list\n
Returns a `list` of `(key, value, value)` tuples from a provided dictionary, swapping any value pair whose first element is a Wikidata `Q` identifier so that the `Q` identifier always comes last.
Source code inalto2txt2fixture/create_adjacent_tables.py
def correct_dict(o: dict) -> list:\n\"\"\"Returns a list with corrected data from a provided dictionary.\"\"\"\nreturn [(k, v[0], v[1]) for k, v in o.items() if not v[0].startswith(\"Q\")] + [\n(k, v[1], v[0]) for k, v in o.items() if v[0].startswith(\"Q\")\n]\n
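An illustrative example (the keys and values here are hypothetical):

```pycon
>>> from alto2txt2fixture.create_adjacent_tables import correct_dict
>>> correct_dict({"Q1": ("Q42", "Somewhere"), "Q2": ("Elsewhere", "Q7")})
[('Q2', 'Elsewhere', 'Q7'), ('Q1', 'Somewhere', 'Q42')]
```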
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.csv2json_list","title":"csv2json_list","text":"csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list\n
Save `csv_path` as a `json` file and return as a `list`.
alto2txt2fixture/create_adjacent_tables.py
def csv2json_list(\ncsv_path: PathLike,\noutput_path: Path = OUTPUT,\nsaved: list[Path] | None = None,\nindent: int = JSON_INDENT,\n) -> list:\n\"\"\"Save `csv_path` as a `json` file and return as a `list`.\"\"\"\njson_data = []\n# See this suggestion for `nan` values: https://stackoverflow.com/a/62691803/678486\ndf = (\npd.read_csv(csv_path, index_col=0).fillna(np.nan).replace([np.nan], [None])\n) # fillna(None)\nif \"political_leanings\" in df.columns:\ndf[\"political_leanings\"] = df[\"political_leanings\"].apply(json.loads)\nif \"prices\" in df.columns:\ndf[\"prices\"] = df[\"prices\"].apply(json.loads)\nmodel = Path(csv_path).stem.lower()\nfor pk, row in df.iterrows():\nfields = row.to_dict()\njson_data.append({\"pk\": pk, \"model\": model, \"fields\": fields})\n(Path(output_path) / csv_path).parent.mkdir(parents=True, exist_ok=True)\nPath(output_path / f\"{Path(csv_path).stem}.json\").write_text(\njson.dumps(json_data, indent=indent)\n)\nif not saved is None:\nsaved.append(output_path / f\"{Path(csv_path).stem}.json\")\nreturn json_data\n
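Each CSV row becomes a `Django`-style fixture record keyed by the CSV index column, with the `model` name taken from the lower-cased file stem. A sketch (the file name here is hypothetical):

```python
from alto2txt2fixture.create_adjacent_tables import csv2json_list

saved: list = []
records = csv2json_list("mitchells.Entry.csv", saved=saved)
# records[0] resembles:
# {'pk': 1, 'model': 'mitchells.entry', 'fields': {...}}
```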
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.download_data","title":"download_data","text":"download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None\n
Download files in `files_dict`, overwrite if specified.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `files_dict` | `RemoteDataFilesType` | `dict` of related files to download | `{}` |
| `overwrite` | `bool` | `bool` to overwrite `LOCAL_CACHE` files or not | `OVERWRITE` |
| `exclude` | `list[str]` | `list` of files to exclude from `files_dict` | `[]` |
Example >>> tmp: Path = getfixture('tmpdir')\n>>> set_path: Path = tmp.chdir()\n>>> download_data(exclude=[\n... \"mitchells\", \"Newspaper-1\", \"linking\"\n... ]) # doctest: +ELLIPSIS\nExcluding mitchells...\nExcluding Newspaper-1...\nExcluding linking...\nDownloading cache...dict_admin_counties.json\n100% ... 37/37 bytes\nDownloading cache...dict_countries.json\n100% ... 33.2/33.2 kB\nDownloading cache...dict_historic_counties.json\n100% ... 41.4/41.4 kB\nDownloading cache...nlp_loc_wikidata_concat.csv\n100% ... 59.8/59.8 kB\nDownloading cache...wikidata_gazetteer_selected_columns.csv\n100% ... 47.8/47.8 MB\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def download_data(\nfiles_dict: RemoteDataFilesType = {},\noverwrite: bool = OVERWRITE,\nexclude: list[str] = [],\n) -> None:\n\"\"\"Download files in ``files_dict``, overwrite if specified.\n Args:\n files_dict: `dict` of related files to download\n overwrite: `bool` to overwrite ``LOCAL_CACHE`` files or not\n exclude: `list` of files to exclude from ``files_dict``\n Example:\n ```pycon\n >>> tmp: Path = getfixture('tmpdir')\n >>> set_path: Path = tmp.chdir()\n >>> download_data(exclude=[\n ... \"mitchells\", \"Newspaper-1\", \"linking\"\n ... ]) # doctest: +ELLIPSIS\n Excluding mitchells...\n Excluding Newspaper-1...\n Excluding linking...\n Downloading cache...dict_admin_counties.json\n 100% ... 37/37 bytes\n Downloading cache...dict_countries.json\n 100% ... 33.2/33.2 kB\n Downloading cache...dict_historic_counties.json\n 100% ... 41.4/41.4 kB\n Downloading cache...nlp_loc_wikidata_concat.csv\n 100% ... 59.8/59.8 kB\n Downloading cache...wikidata_gazetteer_selected_columns.csv\n 100% ... 47.8/47.8 MB\n ```\n \"\"\"\nif not files_dict:\nfiles_dict = deepcopy(FILES)\nfor data_source in exclude:\nif data_source in files_dict:\nprint(f\"Excluding {data_source}...\")\nfiles_dict.pop(data_source, 0)\nelse:\nlogger.warning(\nf'\"{data_source}\" not an option to exclude from {files_dict}'\n)\n# Describe whether local file exists\nfor k in files_dict.keys():\nfiles_dict[k][\"exists\"] = files_dict[k][\"local\"].exists()\nfiles_to_download = [\n(v[\"remote\"], v[\"local\"], v[\"exists\"])\nfor v in files_dict.values()\nif \"exists\" in v and not v[\"exists\"] or overwrite\n]\nfor url, out, exists in files_to_download:\nrmtree(Path(out), ignore_errors=True) if exists else None\nprint(f\"Downloading {out}\")\nPath(out).parent.mkdir(parents=True, exist_ok=True)\nassert isinstance(url, str)\nwith urlopen(url) as response, open(out, \"wb\") as out_file:\ntotal: int = int(response.info()[\"Content-length\"])\nwith Progress(\n\"[progress.percentage]{task.percentage:>3.0f}%\",\nBarColumn(), # removed bar_width=None to avoid too long when resized\nDownloadColumn(),\n) as progress:\ndownload_task = progress.add_task(\"Download\", total=total)\nfor chunk in response:\nout_file.write(chunk)\nprogress.update(download_task, advance=len(chunk))\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_list","title":"get_list","text":"get_list(x)\n
Get a list from a string that contains `<SEP>` as separator. If no string is encountered, the function returns an empty list.
def get_list(x):\n\"\"\"Get a list from a string, which contains <SEP> as separator. If no\n string is encountered, the function returns an empty list.\"\"\"\nreturn x.split(\"<SEP>\") if isinstance(x, str) else []\n
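For example:

```pycon
>>> from alto2txt2fixture.create_adjacent_tables import get_list
>>> get_list("Conservative<SEP>Liberal")
['Conservative', 'Liberal']
>>> get_list(None)
[]
```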
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.get_outpaths_dict","title":"get_outpaths_dict","text":"get_outpaths_dict(\nnames: Sequence[str], module_name: str\n) -> TableOutputConfigType\n
Return a `dict` of `csv` and `json` paths for each `module_name` table.
The `csv` and `json` paths are built from each table name, prefixed with `module_name`.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `names` | `Sequence[str]` | iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names | required |
| `module_name` | `str` | name of module each name is part of, that is added as a prefix | required |

Returns:

| Type | Description |
| --- | --- |
| `TableOutputConfigType` | A `TableOutputConfigType`: a `dict` of table `names` and output `csv` and `json` filenames. |
>>> from pprint import pprint\n>>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n{'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n 'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n
Source code in alto2txt2fixture/create_adjacent_tables.py
def get_outpaths_dict(names: Sequence[str], module_name: str) -> TableOutputConfigType:\n\"\"\"Return a `dict` of `csv` and `json` paths for each `module_name` table.\n The `csv` and `json` paths\n Args:\n names: iterable of names of each `module_name`'s component. Main target is `csv` and `json` table names\n module_name: name of module each name is part of, that is added as a prefix\n Returns:\n A ``TableOutputConfigType``: a `dict` of table ``names`` and output\n `csv` and `json` filenames.\n Example:\n ```pycon\n >>> from pprint import pprint\n >>> pprint(get_outpaths_dict(MITCHELLS_TABELS, \"mitchells\"))\n {'Entry': {'csv': 'mitchells.Entry.csv', 'json': 'mitchells.Entry.json'},\n 'Issue': {'csv': 'mitchells.Issue.csv', 'json': 'mitchells.Issue.json'},\n 'PoliticalLeaning': {'csv': 'mitchells.PoliticalLeaning.csv',\n 'json': 'mitchells.PoliticalLeaning.json'},\n 'Price': {'csv': 'mitchells.Price.csv', 'json': 'mitchells.Price.json'}}\n ```\n \"\"\"\nreturn {\nname: OutputPathDict(\ncsv=f\"{module_name}.{name}.csv\",\njson=f\"{module_name}.{name}.json\",\n)\nfor name in names\n}\n
"},{"location":"reference/alto2txt2fixture/create_adjacent_tables.html#alto2txt2fixture.create_adjacent_tables.run","title":"run","text":"run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None\n
Download, process and link `files_dict` to `json` and `csv`.
Note: this will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.
alto2txt2fixture/create_adjacent_tables.py
def run(\nfiles_dict: dict = {},\nfiles_to_download_overwrite: bool = OVERWRITE,\nsaved: list[PathLike] = SAVED,\ntime_stamp: str = \"\",\noutput_path: Path = OUTPUT,\n) -> None:\n\"\"\"Download, process and link ``files_dict`` to `json` and `csv`.\n Note:\n This will require access to `https://zooniversedata.blob.core.windows.net/downloads/`.\n \"\"\"\n# Ensure time_stamp from the point of calling `run`\nif not time_stamp:\ntime_stamp = get_now(as_str=False).strftime(TIME_FORMAT)\n# Ensure an independent deepcopy of FILES to avoid modifying subsequent runs\nif not files_dict:\nfiles_dict = deepcopy(FILES)\n# Download non-existing files\ndownload_data(files_dict=files_dict, overwrite=files_to_download_overwrite)\n# Create the output directory (defined in output_path)\noutput_path.mkdir(exist_ok=True, parents=True)\n# Read all the Wikidata Q values from Mitchells\nmitchells_df = pd.read_csv(files_dict[\"mitchells\"][\"local\"], index_col=0)\nmitchell_wikidata_mentions = sorted(\nlist(mitchells_df.PLACE_PUB_WIKI.unique()),\nkey=lambda x: int(x.replace(\"Q\", \"\")),\n)\n# Set up wikidata_gazetteer\ngaz_cols = [\"wikidata_id\", \"english_label\", \"latitude\", \"longitude\", \"geonamesIDs\"]\nwikidata_gazetteer = pd.read_csv(\nfiles_dict[\"wikidata_gazetteer_selected_columns\"][\"local\"], usecols=gaz_cols\n)\nwikidata_gazetteer.rename(\n{\n\"wikidata_id\": \"place_wikidata_id\",\n\"english_label\": \"place_label\",\n\"geonamesIDs\": \"geonames_ids\",\n},\naxis=1,\ninplace=True,\n)\n# Read in + fix all dictionaries\ndict_historic_counties = json.loads(\nPath(files_dict[\"dict_historic_counties\"][\"local\"]).read_text()\n)\ndict_admin_counties = json.loads(\nPath(files_dict[\"dict_admin_counties\"][\"local\"]).read_text()\n)\ndict_countries = json.loads(Path(files_dict[\"dict_countries\"][\"local\"]).read_text())\ndict_historic_counties = correct_dict(dict_historic_counties)\ndict_admin_counties = correct_dict(dict_admin_counties)\ndict_countries = correct_dict(dict_countries)\n# Create assisting frames\nhistorical_counties_df = pd.DataFrame(\ndict_historic_counties,\ncolumns=[\"place_wikidata_id\", \"hcounty_label\", \"hcounty_wikidata_id\"],\n)\nadmin_county_df = pd.DataFrame(\ndict_admin_counties,\ncolumns=[\n\"place_wikidata_id\",\n\"admin_county_label\",\n\"admin_county_wikidata_id\",\n],\n)\ncountries_df = pd.DataFrame(\ndict_countries,\ncolumns=[\"place_wikidata_id\", \"country_label\", \"country_wikidata_id\"],\n)\nwikidata_gazetteer = wikidata_gazetteer[\nwikidata_gazetteer.place_wikidata_id.isin(mitchell_wikidata_mentions)\n].sort_values(\"place_wikidata_id\")\nwikidata_gazetteer[\"place_pk\"] = np.arange(1, len(wikidata_gazetteer) + 1)\nwikidata_gazetteer = wikidata_gazetteer[\n[\"place_pk\"] + [x for x in wikidata_gazetteer.columns if not x == \"place_pk\"]\n]\n# Merge wikidata_gazetteer with all the assisting frames (and rename the\n# resulting columns)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, historical_counties_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, admin_county_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer = pd.merge(\nwikidata_gazetteer, countries_df, on=\"place_wikidata_id\", how=\"left\"\n)\nwikidata_gazetteer.rename(\n{\n\"admin_county_label\": \"admin_county__label\",\n\"admin_county_wikidata_id\": \"admin_county__wikidata_id\",\n\"hcounty_label\": \"historic_county__label\",\n\"hcounty_wikidata_id\": \"historic_county__wikidata_id\",\n\"country_label\": 
\"country__label\",\n\"country_wikidata_id\": \"country__wikidata_id\",\n},\naxis=1,\ninplace=True,\n)\n# Split back up into dataframes specific for the tables\nhistoric_county_table = (\nwikidata_gazetteer[[\"historic_county__label\", \"historic_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nhistoric_county_table = historic_county_table.replace({\"\": np.nan}).dropna()\nhistoric_county_table[\"historic_county__pk\"] = np.arange(\n1, len(historic_county_table) + 1\n)\nadmin_county_table = (\nwikidata_gazetteer[[\"admin_county__label\", \"admin_county__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\nadmin_county_table = admin_county_table.replace({\"\": np.nan}).dropna()\nadmin_county_table[\"admin_county__pk\"] = np.arange(1, len(admin_county_table) + 1)\ncountry_table = (\nwikidata_gazetteer[[\"country__label\", \"country__wikidata_id\"]]\n.drop_duplicates()\n.copy()\n)\ncountry_table = country_table.replace({\"\": np.nan}).dropna()\ncountry_table[\"country__pk\"] = np.arange(1, len(country_table) + 1)\n# Set up place_table from wikidata_gazetteer\nplace_table = wikidata_gazetteer.copy()\nplace_table = (\npd.merge(\nplace_table,\nhistoric_county_table,\non=[\"historic_county__label\", \"historic_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"historic_county__label\", \"historic_county__wikidata_id\"], axis=1)\n.rename({\"historic_county__pk\": \"historic_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\nadmin_county_table,\non=[\"admin_county__label\", \"admin_county__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"admin_county__label\", \"admin_county__wikidata_id\"], axis=1)\n.rename({\"admin_county__pk\": \"admin_county_id\"}, axis=1)\n)\nplace_table = (\npd.merge(\nplace_table,\ncountry_table,\non=[\"country__label\", \"country__wikidata_id\"],\nhow=\"left\",\n)\n.drop([\"country__label\", \"country__wikidata_id\"], axis=1)\n.rename({\"country__pk\": \"country_id\"}, axis=1)\n)\nplace_table.fillna(\"\", inplace=True)\nplace_table.set_index(\"place_pk\", inplace=True)\nplace_table.rename(\n{\"place_label\": \"label\", \"place_wikidata_id\": \"wikidata_id\"},\naxis=1,\ninplace=True,\n)\nplace_table[\"historic_county_id\"] = (\nplace_table[\"historic_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"admin_county_id\"] = (\nplace_table[\"admin_county_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table[\"country_id\"] = (\nplace_table[\"country_id\"]\n.replace(r\"^\\s*$\", 0, regex=True)\n.astype(int)\n.replace(0, \"\")\n)\nplace_table.index.rename(\"pk\", inplace=True)\nplace_table.rename(\n{\n\"historic_county_id\": \"historic_county\",\n\"admin_county_id\": \"admin_county\",\n\"country_id\": \"country\",\n},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.set_index(\"historic_county__pk\", inplace=True)\nhistoric_county_table.rename(\n{x: x.split(\"__\")[1] for x in historic_county_table.columns},\naxis=1,\ninplace=True,\n)\nhistoric_county_table.index.rename(\"pk\", inplace=True)\nadmin_county_table.set_index(\"admin_county__pk\", inplace=True)\nadmin_county_table.rename(\n{x: x.split(\"__\")[1] for x in admin_county_table.columns}, axis=1, inplace=True\n)\nadmin_county_table.index.rename(\"pk\", inplace=True)\ncountry_table.set_index(\"country__pk\", inplace=True)\ncountry_table.rename(\n{x: x.split(\"__\")[1] for x in country_table.columns}, axis=1, inplace=True\n)\ncountry_table.index.rename(\"pk\", inplace=True)\n# Adding created_at, updated_at to all the 
gazetteer tables\nplace_table[\"created_at\"] = time_stamp\nplace_table[\"updated_at\"] = time_stamp\nadmin_county_table[\"created_at\"] = time_stamp\nadmin_county_table[\"updated_at\"] = time_stamp\nhistoric_county_table[\"created_at\"] = time_stamp\nhistoric_county_table[\"updated_at\"] = time_stamp\ncountry_table[\"created_at\"] = time_stamp\ncountry_table[\"updated_at\"] = time_stamp\n# Save CSV files for gazetteer tables\nplace_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"])\nadmin_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"]\n)\nhistoric_county_table.to_csv(\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"]\n)\ncountry_table.to_csv(output_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"])\nsaved.extend(\n[\noutput_path / GAZETTEER_OUT_FILENAMES[PLACE][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[ADMIN_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[HISTORIC_COUNTY][\"csv\"],\noutput_path / GAZETTEER_OUT_FILENAMES[COUNTRY][\"csv\"],\n]\n)\n# Fix up Mitchells (already loaded)\nmitchells_df[\"politics\"] = mitchells_df.POLITICS.apply(get_list)\nmitchells_df[\"persons\"] = mitchells_df.PERSONS.apply(get_list)\nmitchells_df[\"organisations\"] = mitchells_df.ORGANIZATIONS.apply(get_list)\nmitchells_df[\"price\"] = mitchells_df.PRICE.apply(get_list)\nmitchells_df.rename(\n{\n\"ID\": \"mpd_id\",\n\"TITLE\": \"title\",\n\"politics\": \"political_leaning_raw\",\n\"price\": \"price_raw\",\n\"YEAR\": \"year\",\n\"PLACE_PUB_WIKI\": \"place_of_publication_id\",\n\"ESTABLISHED_DATE\": \"date_established_raw\",\n\"PUBLISED_DATE\": \"day_of_publication_raw\",\n},\naxis=1,\ninplace=True,\n)\ndrop_cols = [\n\"CHAIN_ID\",\n\"POLITICS\",\n\"PERSONS\",\n\"ORGANIZATIONS\",\n\"PRICE\",\n\"PLACE_PUB\",\n\"PLACE_PUB_COORD\",\n\"PLACES\",\n\"PLACES_TRES\",\n\"TEXT\",\n]\nmitchells_df.drop(columns=drop_cols, inplace=True)\n# Create derivative tables (from Mitchells) = political_leanings, prices,\n# issues\npolitical_leanings = sorted(\nlist(set([y.strip() for x in mitchells_df.political_leaning_raw for y in x]))\n)\npolitical_leanings_table = pd.DataFrame()\npolitical_leanings_table[\"political_leaning__pk\"] = np.arange(\n1, len(political_leanings) + 1\n)\npolitical_leanings_table[\"political_leaning__label\"] = political_leanings\nexport = political_leanings_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"political_leaning__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[POLITICAL_LEANING][\"csv\"])\nprices = sorted(list(set([y.strip() for x in mitchells_df.price_raw for y in x])))\nprices_table = pd.DataFrame()\nprices_table[\"price__pk\"] = np.arange(1, len(prices) + 1)\nprices_table[\"price__label\"] = prices\nexport = prices_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"price__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[PRICE][\"csv\"])\nissues = 
sorted(list(mitchells_df.year.unique()))\nissues_table = pd.DataFrame()\nissues_table[\"issue__pk\"] = np.arange(1, len(issues) + 1)\nissues_table[\"issue__year\"] = issues\nexport = issues_table.copy()\nexport[\"created_at\"] = time_stamp\nexport[\"updated_at\"] = time_stamp\nexport.set_index(\"issue__pk\", inplace=True)\nexport.index.rename(\"pk\", inplace=True)\nexport.rename(\n{x: x.split(\"__\")[1] if len(x.split(\"__\")) > 1 else x for x in export.columns},\naxis=1,\ninplace=True,\n)\nexport.to_csv(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ISSUE][\"csv\"])\n# Set up linking on Mitchells dataframe\nlinking_df = pd.read_csv(\nfiles_dict[\"linking\"][\"local\"],\nindex_col=0,\ndtype={\"NLP\": str},\nusecols=[\n\"NLP\",\n\"Title\",\n\"AcquiredYears\",\n\"Editions\",\n\"EditionTitles\",\n\"City\",\n\"Publisher\",\n\"UnavailableYears\",\n\"Collection\",\n\"UK\",\n\"Complete\",\n\"Notes\",\n\"County\",\n\"HistoricCounty\",\n\"First date held\",\n\"Publication title\",\n\"link_to_mpd\",\n],\n)\nlinking_df[\"NLP\"] = linking_df.index\nlinking_df.rename(\n{\"link_to_mpd\": \"mpd_id\", \"NLP\": \"newspaper\"}, axis=1, inplace=True\n)\n# Link Mitchells with all the other data\nmitchells_df = pd.merge(mitchells_df, linking_df, on=\"mpd_id\", how=\"inner\")\n# Create entry_table\nentry_table = mitchells_df.copy()\nentry_table[\"place_of_circulation_raw\"] = \"\"\nentry_table[\"publication_district_raw\"] = \"\"\nentry_table[\"publication_county_raw\"] = \"\"\n# TODO: What happened to the three columns above? (Check w Kaspar?)\n# Only keep relevant columns\nentry_table = entry_table[\n[\n\"title\",\n\"political_leaning_raw\",\n\"price_raw\",\n\"year\",\n\"date_established_raw\",\n\"day_of_publication_raw\",\n\"place_of_circulation_raw\",\n\"publication_district_raw\",\n\"publication_county_raw\",\n\"organisations\",\n\"persons\",\n\"place_of_publication_id\",\n\"newspaper\",\n]\n]\n# Fix refs to political_leanings_table\nrev = political_leanings_table.set_index(\"political_leaning__label\")\nentry_table[\"political_leanings\"] = entry_table.political_leaning_raw.apply(\nlambda x: [rev.at[y, \"political_leaning__pk\"] for y in x]\n)\n# Fix refs to prices_table\nrev = prices_table.set_index(\"price__label\")\nentry_table[\"prices\"] = entry_table.price_raw.apply(\nlambda x: [rev.at[y.strip(), \"price__pk\"] for y in x]\n)\n# Fix refs to issues_table\nrev = issues_table.set_index(\"issue__year\")\nentry_table[\"issue\"] = entry_table.year.apply(lambda x: rev.at[x, \"issue__pk\"])\n# Fix refs to place_table\nrev = place_table.copy()\nrev[\"place__pk\"] = rev.index\nrev.set_index(\"wikidata_id\", inplace=True)\nentry_table[\"place_of_publication\"] = entry_table.place_of_publication_id.apply(\ntest_place, rev=rev\n)\nentry_table.drop(columns=[\"place_of_publication_id\"], inplace=True)\n# Set up ref to newspapers\nrev = json.loads(files_dict[\"Newspaper-1\"][\"local\"].read_text())\nrev = [dict(pk=v[\"pk\"], **v[\"fields\"]) for v in rev]\nrev = pd.DataFrame(rev)\nrev.set_index(\"publication_code\", inplace=True)\nentry_table[\"newspaper\"] = entry_table.newspaper.str.zfill(7)\nentry_table[\"newspaper\"] = entry_table.newspaper.apply(test_paper, rev=rev)\n# Create PK for entries\nentry_table[\"pk\"] = np.arange(1, len(entry_table) + 1)\n# Sort columns in entries file\nentry_table = entry_table[\n[\"pk\"] + [col for col in entry_table.columns if not col == \"pk\"]\n]\n# Add created_at, modified_at to entry_table\nentry_table[\"created_at\"] 
= time_stamp\nentry_table[\"updated_at\"] = time_stamp\n# Export entry_table\nentry_table.set_index(\"pk\").to_csv(\noutput_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"]\n)\nsaved.append(output_path / MITCHELLS_OUT_FILENAMES[ENTRY][\"csv\"])\n# ######\u00a0NOW WE CAN EASILY CREATE JSON files_dict\nfor csv_file_path in output_path.glob(\"*.csv\"):\ncsv2json_list(csv_file_path)\nprint(\"Finished - saved files:\")\nprint(\"- \" + \"\\n- \".join([str(x) for x in saved]))\n
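A minimal sketch of a full build, assuming network access to the `zooniversedata` blob storage noted above (all defaults come from the module's settings):

```python
from alto2txt2fixture.create_adjacent_tables import run

# Downloads any missing source files, then writes the gazetteer and
# Mitchells tables as CSV plus matching JSON fixtures to the output path
run()
```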
"},{"location":"reference/alto2txt2fixture/jisc.html","title":"jisc","text":""},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.get_jisc_title","title":"get_jisc_title","text":"get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str\n
Match a newspaper `title` with `jisc_papers` records.
Takes an `input_sub_path`, a `publication_code`, and an (optional) abbreviation for any newspaper to locate the `title` in the `jisc_papers` `DataFrame`. `jisc_papers` is usually loaded via the `setup_jisc_papers` function.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `title` | `str` | target newspaper title | required |
| `issue_date` | `str` | target newspaper issue date | required |
| `jisc_papers` | `pd.DataFrame` | `DataFrame` of `jisc_papers` to match | required |
| `input_sub_path` | `str` | path of files to narrow down the query | required |
| `publication_code` | `str` | unique code to match newspaper records | required |
| `abbr` | `str \| None` | an optional abbreviation of the newspaper title | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | A string estimating the JISC equivalent newspaper title: the matched `title` or, failing a match, `abbr`. |

Source code in `alto2txt2fixture/jisc.py`
def get_jisc_title(\ntitle: str,\nissue_date: str,\njisc_papers: pd.DataFrame,\ninput_sub_path: str,\npublication_code: str,\nabbr: str | None = None,\n) -> str:\n\"\"\"\n Match a newspaper ``title`` with ``jisc_papers`` records.\n Takes an ``input_sub_path``, a ``publication_code``, and an (optional)\n abbreviation for any newspaper to locate the ``title`` in the\n ``jisc_papers`` `DataFrame`. ``jisc_papers`` is usually loaded via the\n ``setup_jisc_papers`` function.\n Args:\n title: target newspaper title\n issue_date: target newspaper issue_date\n jisc_papers: `DataFrame` of `jisc_papers` to match\n input_sub_path: path of files to narrow down query input_sub_path\n publication_code: unique codes to match newspaper records\n abbr: an optional abbreviation of the newspaper title\n Returns:\n Matched ``title`` `str` or ``abbr``.\n Returns:\n A string estimating the JISC equivalent newspaper title\n \"\"\"\n# First option, search the input_sub_path for a valid-looking publication_code\ng = PUBLICATION_CODE.findall(input_sub_path)\nif len(g) == 1:\npublication_code = g[0]\n# Let's see if we can find title:\ntitle = (\njisc_papers[\njisc_papers.publication_code == publication_code\n].title.to_list()[0]\nif jisc_papers[\njisc_papers.publication_code == publication_code\n].title.count()\n== 1\nelse title\n)\nreturn title\n# Second option, look through JISC papers for best match (on publication_code if we have it, but abbr more importantly if we have it)\nif abbr:\n_publication_code = publication_code\npublication_code = abbr\nif jisc_papers.abbr[jisc_papers.abbr == publication_code].count():\ndate = datetime.strptime(issue_date, \"%Y-%m-%d\")\nmask = (\n(jisc_papers.abbr == publication_code)\n& (date >= jisc_papers.start_date)\n& (date <= jisc_papers.end_date)\n)\nfiltered = jisc_papers.loc[mask]\nif filtered.publication_code.count() == 1:\npublication_code = filtered.publication_code.to_list()[0]\ntitle = filtered.title.to_list()[0]\nreturn title\n# Last option: let's find all the possible titles in the jisc_papers for the abbreviation, and if it's just one unique title, let's pick it!\nif abbr:\ntest = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test[0]\nelse:\nmask1 = (jisc_papers.abbr == publication_code) & (\njisc_papers.publication_code == _publication_code\n)\ntest1 = jisc_papers.loc[mask1]\ntest1 = list({x for x in jisc_papers[jisc_papers.abbr == abbr].title})\nif len(test) == 1:\nreturn test1[0]\n# Fallback: if abbreviation is set, we'll return that:\nif abbr:\n# For these exceptions, see issue comment:\n# https://github.com/alan-turing-institute/Living-with-Machines/issues/2453#issuecomment-1050652587\nif abbr == \"IPJL\":\nreturn \"Ipswich Journal\"\nelif abbr == \"BHCH\":\nreturn \"Bath Chronicle\"\nelif abbr == \"LSIR\":\nreturn \"Leeds Intelligencer\"\nelif abbr == \"AGER\":\nreturn \"Lancaster Gazetter, And General Advertiser For Lancashire West\"\nreturn abbr\nraise RuntimeError(f\"Title {title} could not be found.\")\n
"},{"location":"reference/alto2txt2fixture/jisc.html#alto2txt2fixture.jisc.setup_jisc_papers","title":"setup_jisc_papers","text":"setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame\n
Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.
Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | `DataFrame` with all JISC titles. |
alto2txt2fixture/jisc.py
def setup_jisc_papers(path: str = settings.JISC_PAPERS_CSV) -> pd.DataFrame:\n\"\"\"\n Create a `DataFrame` with information in `JISC_PAPERS_CSV` in settings.\n Returns:\n `DataFrame` with all JISC titles.\n \"\"\"\nif not Path(path).exists():\nraise RuntimeError(\nf\"Could not find required JISC papers file. Put {Path(path).name} in {Path(path).parent} or correct the settings with a different path.\"\n)\nmonths = {\n\"Jan\": 1,\n\"Feb\": 2,\n\"Mar\": 3,\n\"Apr\": 4,\n\"May\": 5,\n\"Jun\": 6,\n\"June\": 6,\n\"Jul\": 7,\n\"July\": 7,\n\"Aug\": 8,\n\"Sep\": 9,\n\"Sept\": 9,\n\"Oct\": 10,\n\"Nov\": 11,\n\"Dec\": 12,\n\"Dec.\": 12,\n}\njisc_papers = pd.read_csv(\npath,\nusecols=[\n\"Newspaper Title\",\n\"NLP\",\n\"Abbr\",\n\"StartD\",\n\"StartM\",\n\"StartY\",\n\"EndD\",\n\"EndM\",\n\"EndY\",\n],\n)\njisc_papers[\"start_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.StartY),\nmonth=months[x.StartM.strip(\".\").strip()],\nday=int(x.StartD),\n),\naxis=1,\n)\njisc_papers[\"end_date\"] = jisc_papers.apply(\nlambda x: datetime(\nyear=int(x.EndY), month=months[x.EndM.strip(\".\").strip()], day=int(x.EndD)\n),\naxis=1,\n)\njisc_papers.drop(\n[\"StartD\", \"StartM\", \"StartY\", \"EndD\", \"EndM\", \"EndY\"],\naxis=\"columns\",\ninplace=True,\n)\njisc_papers.rename(\n{\"Newspaper Title\": \"title\", \"NLP\": \"publication_code\", \"Abbr\": \"abbr\"},\naxis=1,\ninplace=True,\n)\njisc_papers[\"title\"] = jisc_papers[\"title\"].apply(\nlambda x: \"The \" + x[:-5] if x.strip()[-5:].lower() == \", the\" else x\n)\njisc_papers[\"publication_code\"] = jisc_papers[\"publication_code\"].apply(\nlambda x: str(x).zfill(7)\n)\nreturn jisc_papers\n
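A sketch combining the two functions above (the path fragment and publication code are hypothetical):

```python
from alto2txt2fixture.jisc import get_jisc_title, setup_jisc_papers

jisc_papers = setup_jisc_papers()  # reads settings.JISC_PAPERS_CSV
title = get_jisc_title(
    title="The Ipswich Journal",
    issue_date="1870-01-01",
    jisc_papers=jisc_papers,
    input_sub_path="0000071/1870/0101",
    publication_code="0000071",
)
```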
"},{"location":"reference/alto2txt2fixture/log.html","title":"log","text":""},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.error","title":"error","text":"error(msg: str, crash: bool = True, silent: bool = True) -> None\n
Print `msg` in `colorama` `Fore.RED` and `exit()`.
If `crash=True` and `silent=True` (the defaults), `msg` is printed and `exit()` is called; if `crash=True` and `silent=False`, a `RuntimeError` is raised; otherwise `msg` is simply printed.
alto2txt2fixture/log.py
def error(msg: str, crash: bool = True, silent: bool = True) -> None:\n\"\"\"Print ``msg`` in `colorama` `Fore.RED` and `exit()`\n If `silent` `exit()` after call, else `raise` `RuntimeError` if ``crash=True``.\"\"\"\nif crash and silent:\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nexit()\nelif crash:\nraise RuntimeError(msg) from None\nprint(f\"{Fore.RED}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.info","title":"info","text":"info(msg: str) -> None\n
Print `msg` in `colorama` `Fore.CYAN` colour.
alto2txt2fixture/log.py
def info(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Fore.CYAN` colour.\"\"\"\nprint(f\"{Fore.CYAN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.success","title":"success","text":"success(msg: str) -> None\n
Print `msg` in `colorama` `Fore.GREEN` colour.
alto2txt2fixture/log.py
def success(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Fore.GREEN` colour.\"\"\"\nprint(f\"{Fore.GREEN}{msg}{Style.RESET_ALL}\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/log.html#alto2txt2fixture.log.warning","title":"warning","text":"warning(msg: str) -> None\n
Print `msg` in `colorama` `Fore.YELLOW` colour.
alto2txt2fixture/log.py
def warning(msg: str) -> None:\n\"\"\"Print ``msg`` in `colorama` `Fore.YELLOW` colour.\"\"\"\nprint(f\"{Fore.YELLOW}Warning: {msg}{Style.RESET_ALL}\")\nreturn\n
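A sketch of the four helpers together:

```python
from alto2txt2fixture.log import error, info, success, warning

info("Processing collection...")                # cyan
warning("Cache directory already exists")       # yellow, prefixed "Warning: "
success("Fixtures saved")                       # green
error("Missing JISC papers file", crash=False)  # red; prints without exiting
```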
"},{"location":"reference/alto2txt2fixture/parser.html","title":"parser","text":""},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.fixtures","title":"fixtures","text":"fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]\n
Generates fixtures for a specified model using a list of files.
This function takes a list of files and generates fixtures for a specified model. The fixtures can be used to populate a database or perform other data-related operations.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `filelist` | `list` | A list of files to process and generate fixtures from. | `[]` |
| `model` | `str` | The name of the model for which fixtures are generated. | `''` |
| `translate` | `dict` | A nested dictionary representing the translation mapping for fields, of the form `{'part1': {'part2': {'translated_field': 'pk'}}}`. The translated fields will be used as keys, and their corresponding primary keys (obtained from the provided files) will be used as values in the generated fixtures. | `{}` |
| `rename` | `dict` | A nested dictionary representing the field renaming mapping, of the form `{'part1': {'part2': 'new_field_name'}}`. The fields specified in the dictionary will be renamed to the provided new field names in the generated fixtures. | `{}` |
| `uniq_keys` | `list` | A list of fields that need to be considered for uniqueness in the fixtures. If specified, the fixtures will yield only unique items based on the combination of these fields. | `[]` |

Yields:

| Type | Description |
| --- | --- |
| `FixtureDict` | A `FixtureDict` from `model`, `pk` and a `dict` of `fields`. |

Returns:

| Type | Description |
| --- | --- |
| `Generator[FixtureDict, None, None]` | This function generates fixtures but does not return any value. |
Source code in `alto2txt2fixture/parser.py`
def fixtures(\nfilelist: list = [],\nmodel: str = \"\",\ntranslate: dict = {},\nrename: dict = {},\nuniq_keys: list = [],\n) -> Generator[FixtureDict, None, None]:\n\"\"\"\n Generates fixtures for a specified model using a list of files.\n This function takes a list of files and generates fixtures for a specified\n model. The fixtures can be used to populate a database or perform other\n data-related operations.\n Args:\n filelist: A list of files to process and generate fixtures from.\n model: The name of the model for which fixtures are generated.\n translate: A nested dictionary representing the translation mapping\n for fields. The structure of the translator follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n The translated fields will be used as keys, and their\n corresponding primary keys (obtained from the provided files) will\n be used as values in the generated fixtures.\n rename: A nested dictionary representing the field renaming\n mapping. The structure of the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': 'new_field_name'\n }\n }\n ```\n The fields specified in the dictionary will be renamed to the\n provided new field names in the generated fixtures.\n uniq_keys: A list of fields that need to be considered for\n uniqueness in the fixtures. If specified, the fixtures will yield\n only unique items based on the combination of these fields.\n Yields:\n `FixtureDict` from ``model``, ``pk`` and `dict` of ``fields``.\n Returns:\n This function generates fixtures but does not return any value.\n \"\"\"\nfilelist = sorted(filelist, key=lambda x: str(x).split(\"/\")[:-1])\ncount = len(filelist)\n# Process JSONL\nif [x for x in filelist if \".jsonl\" in x.name]:\npk = 0\n# In the future, we might want to show progress here (tqdm or suchlike)\nfor file in filelist:\nfor line in file.read_text().splitlines():\npk += 1\nline = json.loads(line)\nyield FixtureDict(\npk=pk,\nmodel=model,\nfields=dict(**get_fields(line, translate=translate, rename=rename)),\n)\nreturn\nelse:\n# Process JSON\npks = [x for x in range(1, count + 1)]\nif len(uniq_keys):\nuniq_files = list(uniq(filelist, uniq_keys))\ncount = len(uniq_files)\nzipped = zip(uniq_files, pks)\nelse:\nzipped = zip(filelist, pks)\nfor x in tqdm(\nzipped, total=count, desc=f\"{model} ({count:,} objs)\", leave=False\n):\nyield FixtureDict(\npk=x[1],\nmodel=model,\nfields=dict(**get_fields(x[0], translate=translate, rename=rename)),\n)\nreturn\n
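A sketch mirroring how `parse` (below) uses this generator, assuming cached `JSON` files that contain a `name` field:

```python
from pathlib import Path
from alto2txt2fixture.parser import fixtures

data_provider_fixtures = list(
    fixtures(
        filelist=list(Path("./cache/hmd").glob("**/*.json")),
        model="newspapers.dataprovider",
        uniq_keys=["name"],  # yield one fixture per distinct "name"
    )
)
```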
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_fields","title":"get_fields","text":"get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict\n
Retrieves fields from a file and performs modifications and checks.
This function takes a file (in various formats: `Path`, `str`, or `dict`) and processes its fields. It retrieves the fields from the file and performs modifications, translations, and checks on the fields.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Union[Path, str, dict]` | The file from which the fields are retrieved. | required |
| `translate` | `dict` | A nested dictionary representing the translation mapping for fields, of the form `{'part1': {'part2': {'translated_field': 'pk'}}}`. The translated fields will be used to replace the original fields in the retrieved fields. | `{}` |
| `rename` | `dict` | A nested dictionary representing the field renaming mapping, of the form `{'part1': {'part2': 'new_field_name'}}`. The fields specified in the dictionary will be renamed to the provided new field names in the retrieved fields. | `{}` |
| `allow_null` | `bool` | Determines whether to allow `None` values for relational fields. If set to `True`, relational fields with missing values will be assigned `None`. If set to `False`, an error will be raised. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `dict` | A dictionary representing the retrieved fields from the file, with modifications and checks applied. |

Raises:

| Type | Description |
| --- | --- |
| `RuntimeError` | If the file type is unsupported or if an error occurs during field retrieval or processing. |
Source code in `alto2txt2fixture/parser.py`
def get_fields(\nfile: Union[Path, str, dict],\ntranslate: dict = {},\nrename: dict = {},\nallow_null: bool = False,\n) -> dict:\n\"\"\"\n Retrieves fields from a file and performs modifications and checks.\n This function takes a file (in various formats: `Path`, `str`, or `dict`)\n and processes its fields. It retrieves the fields from the file and\n performs modifications, translations, and checks on the fields.\n Args:\n file: The file from which the fields are retrieved.\n translate: A nested dictionary representing the translation mapping\n for fields. The structure of the translator follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n The translated fields will be used to replace the original fields\n in the retrieved fields.\n rename: A nested dictionary representing the field renaming\n mapping. The structure of the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': 'new_field_name'\n }\n }\n ```\n The fields specified in the dictionary will be renamed to the\n provided new field names in the retrieved fields.\n allow_null: Determines whether to allow ``None`` values for\n relational fields. If set to ``True``, relational fields with\n missing values will be assigned ``None``. If set to ``False``, an\n error will be raised.\n Returns:\n A dictionary representing the retrieved fields from the file,\n with modifications and checks applied.\n Raises:\n RuntimeError: If the file type is unsupported or if an error occurs\n during field retrieval or processing.\n \"\"\"\nif isinstance(file, Path):\ntry:\nfields = json.loads(file.read_text())\nexcept Exception as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, str):\nif \"\\n\" in file:\nraise RuntimeError(\"File has multiple lines.\")\ntry:\nfields = json.loads(file)\nexcept json.decoder.JSONDecodeError as e:\nraise RuntimeError(f\"Cannot interpret JSON ({e}): {file}\")\nelif isinstance(file, dict):\nfields = file\nelse:\nraise RuntimeError(f\"Cannot process type {type(file)}.\")\n# Fix relational fields for any file\nfor key in [key for key in fields.keys() if \"__\" in key]:\nparts = key.split(\"__\")\ntry:\nbefore = fields[key]\nif before:\nbefore = before.replace(\"---\", \"/\")\nloc = translate.get(parts[0], {}).get(parts[1], {})\nfields[key] = loc.get(before)\nif fields[key] is None:\nraise RuntimeError(\nf\"Cannot translate fields.{key} from {before}: {loc}\"\n)\nexcept AttributeError:\nif allow_null:\nfields[key] = None\nelse:\nprint(\n\"Content had relational fields, but something went wrong in parsing the data:\"\n)\nprint(\"file\", file)\nprint(\"fields\", fields)\nprint(\"KEY:\", key)\nraise RuntimeError()\nnew_name = rename.get(parts[0], {}).get(parts[1], None)\nif new_name:\nfields[new_name] = fields[key]\ndel fields[key]\nfields[\"created_at\"] = NOW_str\nfields[\"updated_at\"] = NOW_str\ntry:\nfields[\"item_type\"] = str(fields[\"item_type\"]).upper()\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_mean\"] == \"\":\nfields[\"ocr_quality_mean\"] = 0\nexcept KeyError:\npass\ntry:\nif fields[\"ocr_quality_sd\"] == \"\":\nfields[\"ocr_quality_sd\"] = 0\nexcept KeyError:\npass\nreturn fields\n
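A sketch with a plain `dict` (the simplest input form); `created_at`/`updated_at` are stamped with the module's `NOW_str`:

```python
from alto2txt2fixture.parser import get_fields

fields = get_fields({"item_type": "article", "ocr_quality_mean": ""})
# fields now resembles:
# {'item_type': 'ARTICLE', 'ocr_quality_mean': 0,
#  'created_at': '...', 'updated_at': '...'}
```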
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_key_from","title":"get_key_from","text":"get_key_from(item: Path, x: str) -> str\n
Retrieves a specific key from a file and returns its value.
This function reads a file and extracts the value of a specified key. If the key is not found or an error occurs while processing the file, a warning is printed, and an empty string is returned.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `item` | `Path` | The file from which the key is extracted. | required |
| `x` | `str` | The key to be retrieved from the file. | required |

Returns:

| Type | Description |
| --- | --- |
| `str` | The value of the specified key from the file. |
Source code in `alto2txt2fixture/parser.py`
def get_key_from(item: Path, x: str) -> str:\n\"\"\"\n Retrieves a specific key from a file and returns its value.\n This function reads a file and extracts the value of a specified\n key. If the key is not found or an error occurs while processing\n the file, a warning is printed, and an empty string is returned.\n Args:\n item: The file from which the key is extracted.\n x: The key to be retrieved from the file.\n Returns:\n The value of the specified key from the file.\n \"\"\"\nresult = json.loads(item.read_text()).get(x, None)\nif not result:\nprint(f\"[WARN] Could not find key {x} in {item}\")\nresult = \"\"\nreturn result\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.get_translator","title":"get_translator","text":"get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict\n
Converts a list of fields into a nested dictionary representing a translator.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fields` | `list[TranslatorTuple]` | A list of tuples representing fields to be translated. | `[TranslatorTuple('', '', [])]` |

Returns:

| Type | Description |
| --- | --- |
| `dict` | A nested dictionary representing the translator. The structure of the dictionary follows the format: |
{\n'part1': {\n'part2': {\n'translated_field': 'pk'\n}\n}\n}\n
Example >>> fields = [\n... TranslatorTuple(\n... start='start__field1',\n... finish='field1',\n... lst=[{\n... 'fields': {'field1': 'translation1'},\n... 'pk': 1}],\n... )]\n>>> get_translator(fields)\n{'start': {'field1': {'translation1': 1}}}\n
Source code in alto2txt2fixture/parser.py
def get_translator(\nfields: list[TranslatorTuple] = [TranslatorTuple(\"\", \"\", [])]\n) -> dict:\n\"\"\"\n Converts a list of fields into a nested dictionary representing a\n translator.\n Args:\n fields: A list of tuples representing fields to be translated.\n Returns:\n A nested dictionary representing the translator. The structure of\n the dictionary follows the format:\n ```python\n {\n 'part1': {\n 'part2': {\n 'translated_field': 'pk'\n }\n }\n }\n ```\n Example:\n ```pycon\n >>> fields = [\n ... TranslatorTuple(\n ... start='start__field1',\n ... finish='field1',\n ... lst=[{\n ... 'fields': {'field1': 'translation1'},\n ... 'pk': 1}],\n ... )]\n >>> get_translator(fields)\n {'start': {'field1': {'translation1': 1}}}\n ```\n \"\"\"\n_ = dict()\nfor field in fields:\nstart, finish, lst = field\npart1, part2 = start.split(\"__\")\nif part1 not in _:\n_[part1] = {}\nif part2 not in _[part1]:\n_[part1][part2] = {}\nif isinstance(finish, str):\n_[part1][part2] = {o[\"fields\"][finish]: o[\"pk\"] for o in lst}\nelif isinstance(finish, list):\n_[part1][part2] = {\n\"-\".join([o[\"fields\"][x] for x in finish]): o[\"pk\"] for o in lst\n}\nreturn _\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.parse","title":"parse","text":"parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None\n
Parses files from collections and generates fixtures for various models.
This function processes files from the specified collections and generates fixtures for different models, such as `newspapers.dataprovider`, `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`, `newspapers.issue`, and `newspapers.item`.
It performs various steps, such as file listing, fixture generation, translation mapping, renaming fields, and saving fixtures to files.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `collections` | `list` | A list of collections from which files are processed and fixtures are generated. | required |
| `cache_home` | `str` | The directory path where the collections are located. | required |
| `output` | `str` | The directory path where the fixtures will be saved. | required |
| `max_elements_per_file` | `int` | The maximum number of elements per file when saving fixtures. | required |

Returns:

| Type | Description |
| --- | --- |
| `None` | This function generates fixtures but does not return any value. |
Source code inalto2txt2fixture/parser.py
def parse(\ncollections: list, cache_home: str, output: str, max_elements_per_file: int\n) -> None:\n\"\"\"\n Parses files from collections and generates fixtures for various models.\n This function processes files from the specified collections and generates\n fixtures for different models, such as `newspapers.dataprovider`,\n `newspapers.ingest`, `newspapers.digitisation`, `newspapers.newspaper`,\n `newspapers.issue`, and `newspapers.item`.\n It performs various steps, such as file listing, fixture generation,\n translation mapping, renaming fields, and saving fixtures to files.\n Args:\n collections: A list of collections from which files are\n processed and fixtures are generated.\n cache_home: The directory path where the collections are located.\n output: The directory path where the fixtures will be saved.\n max_elements_per_file: The maximum number of elements per file\n when saving fixtures.\n Returns:\n This function generates fixtures but does not return any value.\n \"\"\"\nglobal CACHE_HOME\nglobal OUTPUT\nglobal MAX_ELEMENTS_PER_FILE\nCACHE_HOME = cache_home\nOUTPUT = output\nMAX_ELEMENTS_PER_FILE = max_elements_per_file\n# Set up output directory\nreset_fixture_dir(OUTPUT)\n# Get file lists\nprint(\"\\nGetting file lists...\")\ndef issues_in_x(x):\nreturn \"issues\" in str(x.parent).split(\"/\")\ndef newspapers_in_x(x):\nreturn not any(\n[\ncondition\nfor y in str(x.parent).split(\"/\")\nfor condition in [\n\"issues\" in y,\n\"ingest\" in y,\n\"digitisation\" in y,\n\"data-provider\" in y,\n]\n]\n)\nall_json = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.json\")\n]\nall_jsonl = [\nx for y in collections for x in (Path(CACHE_HOME) / y).glob(\"**/*.jsonl\")\n]\nprint(f\"--> {len(all_json):,} JSON files altogether\")\nprint(f\"--> {len(all_jsonl):,} JSONL files altogether\")\nprint(\"\\nSetting up fixtures...\")\n# Process data providers\ndef data_provider_in_x(x):\nreturn \"data-provider\" in str(x.parent).split(\"/\")\ndata_provider_json = list(\nfixtures(\nmodel=\"newspapers.dataprovider\",\nfilelist=[x for x in all_json if data_provider_in_x(x)],\nuniq_keys=[\"name\"],\n)\n)\nprint(f\"--> {len(data_provider_json):,} DataProvider fixtures\")\n# Process ingest\ndef ingest_in_x(x):\nreturn \"ingest\" in str(x.parent).split(\"/\")\ningest_json = list(\nfixtures(\nmodel=\"newspapers.ingest\",\nfilelist=[x for x in all_json if ingest_in_x(x)],\nuniq_keys=[\"lwm_tool_name\", \"lwm_tool_version\"],\n)\n)\nprint(f\"--> {len(ingest_json):,} Ingest fixtures\")\n# Process digitisation\ndef digitisation_in_x(x):\nreturn \"digitisation\" in str(x.parent).split(\"/\")\ndigitisation_json = list(\nfixtures(\nmodel=\"newspapers.digitisation\",\nfilelist=[x for x in all_json if digitisation_in_x(x)],\nuniq_keys=[\"software\"],\n)\n)\nprint(f\"--> {len(digitisation_json):,} Digitisation fixtures\")\n# Process newspapers\nnewspaper_json = list(\nfixtures(\nmodel=\"newspapers.newspaper\",\nfilelist=[file for file in all_json if newspapers_in_x(file)],\n)\n)\nprint(f\"--> {len(newspaper_json):,} Newspaper fixtures\")\n# Process issue\ntranslate = get_translator(\n[\nTranslatorTuple(\n\"publication__publication_code\", \"publication_code\", newspaper_json\n)\n]\n)\nrename = {\"publication\": {\"publication_code\": \"newspaper_id\"}}\nissue_json = list(\nfixtures(\nmodel=\"newspapers.issue\",\nfilelist=[file for file in all_json if issues_in_x(file)],\ntranslate=translate,\nrename=rename,\n)\n)\nprint(f\"--> {len(issue_json):,} Issue fixtures\")\n# Create translator/clear 
up memory before processing items\ntranslate = get_translator(\n[\n(\"issue__issue_identifier\", \"issue_code\", issue_json),\n(\"digitisation__software\", \"software\", digitisation_json),\n(\"data_provider__name\", \"name\", data_provider_json),\n(\n\"ingest__lwm_tool_identifier\",\n[\"lwm_tool_name\", \"lwm_tool_version\"],\ningest_json,\n),\n]\n)\nrename = {\n\"issue\": {\"issue_identifier\": \"issue_id\"},\n\"digitisation\": {\"software\": \"digitisation_id\"},\n\"data_provider\": {\"name\": \"data_provider_id\"},\n\"ingest\": {\"lwm_tool_identifier\": \"ingest_id\"},\n}\nsave_fixture(newspaper_json, \"Newspaper\")\nsave_fixture(issue_json, \"Issue\")\ndel newspaper_json\ndel issue_json\ngc.collect()\nprint(\"\\nSaving...\")\nsave_fixture(digitisation_json, \"Digitisation\")\nsave_fixture(ingest_json, \"Ingest\")\nsave_fixture(data_provider_json, \"DataProvider\")\n# Process items\nitem_json = fixtures(\nmodel=\"newspapers.item\",\nfilelist=all_jsonl,\ntranslate=translate,\nrename=rename,\n)\nsave_fixture(item_json, \"Item\")\nreturn\n
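A sketch of a direct call, mirroring the values `run()` passes through from `settings` (the paths and element cap here are hypothetical):

```python
from alto2txt2fixture.parser import parse

# Note: `reset_fixture_dir` prompts before clearing the output directory
parse(
    collections=["hmd"],
    cache_home="./cache",
    output="./output",
    max_elements_per_file=2_000_000,
)
```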
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.reset_fixture_dir","title":"reset_fixture_dir","text":"reset_fixture_dir(output: str | Path) -> None\n
Resets the fixture directory by removing all JSON files inside it.
This function takes a directory path (`output`) as input and removes all JSON files within the directory.
Prior to removal, it prompts the user for confirmation to proceed. If the user confirms, the function clears the fixture directory by deleting the JSON files.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `output` | `str \| Path` | The directory path of the fixture directory to be reset. | required |

Raises:

| Type | Description |
| --- | --- |
| `RuntimeError` | If the `output` directory is not specified as a string. |
alto2txt2fixture/parser.py
def reset_fixture_dir(output: str | Path) -> None:\n\"\"\"\n Resets the fixture directory by removing all JSON files inside it.\n This function takes a directory path (``output``) as input and removes all\n JSON files within the directory.\n Prior to removal, it prompts the user for confirmation to proceed. If the\n user confirms, the function clears the fixture directory by deleting the\n JSON files.\n Args:\n output: The directory path of the fixture directory to be reset.\n Raises:\n RuntimeError: If the ``output`` directory is not specified as a string.\n \"\"\"\nif not isinstance(output, str):\nraise RuntimeError(\"`output` directory needs to be specified as a string.\")\noutput = Path(output)\ny = input(\nf\"This command will automatically empty the fixture directory ({output.absolute()}). \"\n\"Do you want to proceed? [y/N]\"\n)\nif not y.lower() == \"y\":\noutput.mkdir(parents=True, exist_ok=True)\nreturn\nprint(\"\\nClearing up the fixture directory\")\n# Ensure directory exists\noutput.mkdir(parents=True, exist_ok=True)\n# Drop all JSON files\n[x.unlink() for x in Path(output).glob(\"*.json\")]\nreturn\n
"},{"location":"reference/alto2txt2fixture/parser.html#alto2txt2fixture.parser.uniq","title":"uniq","text":"uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]\n
Generates unique items from a list of files based on specified keys.
This function takes a list of files and yields unique items based on a combination of keys. The keys are extracted from each file using the `get_key_from` function, and duplicate items are ignored.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `filelist` | `list` | A list of files from which unique items are generated. | required |
| `keys` | `list` | A list of keys used for uniqueness. Each key specifies a field to be used for uniqueness checking in the generated items. | `[]` |

Yields:

| Type | Description |
| --- | --- |
| `Any` | A unique item from `filelist`. |
alto2txt2fixture/parser.py
def uniq(filelist: list, keys: list = []) -> Generator[Any, None, None]:\n\"\"\"\n Generates unique items from a list of files based on specified keys.\n This function takes a list of files and yields unique items based on a\n combination of keys. The keys are extracted from each file using the\n ``get_key_from`` function, and duplicate items are ignored.\n Args:\n filelist: A list of files from which unique items are\n generated.\n keys: A list of keys used for uniqueness. Each key specifies\n a field to be used for uniqueness checking in the generated\n items.\n Yields:\n A unique item from `filelist`.\n \"\"\"\nseen = set()\nfor item in filelist:\nkey = \"-\".join([get_key_from(item, x) for x in keys])\nif key not in seen:\nseen.add(key)\nyield item\nelse:\n# Drop it if duplicate\npass\n
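For example (a sketch; this is effectively how `fixtures` applies `uniq_keys`):

```python
from pathlib import Path
from alto2txt2fixture.parser import uniq

filelist = sorted(Path("./cache/hmd").glob("**/*.json"))
# Keep the first file seen for each distinct "name" value
unique_files = list(uniq(filelist, keys=["name"]))
```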
"},{"location":"reference/alto2txt2fixture/patterns.html","title":"patterns","text":"Useful regular expressions, intially just PUBLICATION_CODE
.
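A usage sketch mirroring how the router module applies it; the input path and the matched code below are illustrative only, as the exact pattern is defined in this module:
>>> from alto2txt2fixture.patterns import PUBLICATION_CODE\n>>> PUBLICATION_CODE.findall(\"0002647/1894/0101\")  # hypothetical input_sub_path\n['0002647']\n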
Archive(\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n)\n
Manage extracting information from a ZIP archive.
The Archive
class represents a ZIP archive of XML files and provides several methods to extract and process the data contained in the archive.
open(Archive)
context manager
An Archive can be opened with a context manager, which creates a meta object that records timings for the archive. When closed, it will save the meta JSON to the correct paths.
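A usage sketch mirroring how route() drives archives; the ZIP path and collection name are hypothetical:
archive = Archive(\"0002647_metadata.zip\", collection=\"hmd\")  # hypothetical path\nwith archive as _:\n    for doc in archive.documents:\n        doc.newspaper.write_to_cache()\n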
Attributes:
Name Type Descriptionpath
Path
The path to the zip archive.
collection
str
The collection of the XML files in the archive. Default is \"\".
report
Path
The file path of the report file for the archive.
report_id
str
The report ID for the archive. If not provided, a random UUID is generated.
report_parent
Path
The parent directory of the report file for the archive.
jisc_papers
pd.DataFrame
A DataFrame of JISC papers.
size
str | float
The size of the archive, in human-readable format.
size_raw
str | float
The raw size of the archive, in bytes.
roots
Generator[ET.Element, None, None]
The root elements of the XML documents contained in the archive.
meta
dotdict
Metadata about the archive, such as its path, size, and number of contents.
json_indent
int
Indentation formatting of json
output
Raises:
Type DescriptionRuntimeError
If the path
does not exist.
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\npath: str | Path,\ncollection: str = \"\",\nreport_id: str | None = None,\njisc_papers: pd.DataFrame | None = None,\njson_indent: int = JSON_INDENT,\n):\n\"\"\"Constructor method.\"\"\"\nself.path: Path = Path(path)\nif not self.path.exists():\nraise RuntimeError(\"Path does not exist.\")\nself.size: str | float = get_size_from_path(self.path)\nself.size_raw: str | float = get_size_from_path(self.path, raw=True)\nself.zip_file: zipfile.ZipFile = zipfile.ZipFile(self.path)\nself.collection: str = collection\nself.roots: Generator[ET.Element, None, None] = self.get_roots()\nself.meta: dotdict = dotdict(\npath=str(self.path),\nbytes=self.size_raw,\nsize=self.size,\ncontents=len(self.filelist),\n)\nif not report_id:\nself.report_id: str = str(uuid.uuid4())\nelse:\nself.report_id = report_id\nself.jisc_papers: pd.DataFrame = jisc_papers\nself.report_parent: Path = Path(f\"{REPORT_DIR}/{self.report_id}\")\nself.report: Path = (\nself.report_parent / f\"{self.path.stem.replace('_metadata', '')}.json\"\n)\nself.json_indent: int = json_indent\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.documents","title":"documents property
","text":"documents\n
Property that calls the get_documents
method
property
","text":"filelist\n
Returns the list of files in the zip file
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.__len__","title":"__len__","text":"__len__()\n
The number of files inside the zip archive.
Source code inalto2txt2fixture/router.py
def __len__(self):\n\"\"\"The number of files inside the zip archive.\"\"\"\nreturn len(self.filelist)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_documents","title":"get_documents","text":"get_documents() -> Generator[Document, None, None]\n
A generator that yields instances of the Document class for each XML file in the ZIP archive.
It uses the tqdm
library to display a progress bar in the terminal while it is running.
If the contents of the ZIP file are not empty, the method creates an instance of the Document
class by passing the root element of the XML file, the collection name, meta information about the archive, and the JISC papers data frame (if provided) to the constructor of the Document
class. The instance of the Document
class is then returned by the generator.
Yields:
Type DescriptionDocument
Document
class instance for each unzipped XML
file.
alto2txt2fixture/router.py
def get_documents(self) -> Generator[Document, None, None]:\n\"\"\"\n A generator that yields instances of the Document class for each XML\n file in the ZIP archive.\n It uses the `tqdm` library to display a progress bar in the terminal\n while it is running.\n If the contents of the ZIP file are not empty, the method creates an\n instance of the ``Document`` class by passing the root element of the XML\n file, the collection name, meta information about the archive, and the\n JISC papers data frame (if provided) to the constructor of the\n ``Document`` class. The instance of the ``Document`` class is then\n returned by the generator.\n Yields:\n ``Document`` class instance for each unzipped `XML` file.\n \"\"\"\nfor xml_file in tqdm(\nself.filelist,\ndesc=f\"{Path(self.zip_file.filename).stem} ({self.meta.size})\",\nleave=False,\ncolour=\"green\",\n):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield Document(\nroot=ET.fromstring(xml),\ncollection=self.collection,\nmeta=self.meta,\njisc_papers=self.jisc_papers,\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Archive.get_roots","title":"get_roots","text":"get_roots() -> Generator[ET.Element, None, None]\n
Yields the root elements of the XML documents contained in the archive.
Source code inalto2txt2fixture/router.py
def get_roots(self) -> Generator[ET.Element, None, None]:\n\"\"\"\n Yields the root elements of the XML documents contained in the archive.\n \"\"\"\nfor xml_file in tqdm(self.filelist, leave=False, colour=\"blue\"):\nwith self.zip_file.open(xml_file) as f:\nxml = f.read()\nif xml:\nyield ET.fromstring(xml)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache","title":"Cache","text":"Cache()\n
The Cache class provides a blueprint for creating and managing cache data. The class has several methods that help in getting the cache path, converting the data to a dictionary, and writing the cache data to a file.
It is inherited by many of the other classes in this module.
Initializes the Cache class object.
Source code inalto2txt2fixture/router.py
def __init__(self):\n\"\"\"\n Initializes the Cache class object.\n \"\"\"\npass\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.__str__","title":"__str__","text":"__str__() -> str\n
Returns the string representation of the cache data as a dictionary.
Source code inalto2txt2fixture/router.py
def __str__(self) -> str:\n\"\"\"\n Returns the string representation of the cache data as a dictionary.\n \"\"\"\nreturn str(self.as_dict())\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.as_dict","title":"as_dict","text":"as_dict() -> dict\n
Converts the cache data to a dictionary and returns it.
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n Converts the cache data to a dictionary and returns it.\n \"\"\"\nreturn {}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the cache path, which is used to store the cache data. The path is normally constructed using some of the object's properties (collection, kind, and id) but can be changed when inherited.
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the cache path, which is used to store the cache data.\n The path is normally constructed using some of the object's\n properties (collection, kind, and id) but can be changed when\n inherited.\n \"\"\"\nreturn Path(f\"{CACHE_HOME}/{self.collection}/{self.kind}/{self.id}.json\")\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Cache.write_to_cache","title":"write_to_cache","text":"write_to_cache(json_indent: int = JSON_INDENT) -> Optional[bool]\n
Writes the cache data to a file at the specified cache path. The cache data is first converted to a dictionary using the as_dict method. If the cache path already exists, the function returns True.
Source code inalto2txt2fixture/router.py
def write_to_cache(self, json_indent: int = JSON_INDENT) -> Optional[bool]:\n\"\"\"\n Writes the cache data to a file at the specified cache path. The cache\n data is first converted to a dictionary using the as_dict method. If\n the cache path already exists, the function returns True.\n \"\"\"\npath = self.get_cache_path()\ntry:\nif path.exists():\nreturn True\nexcept AttributeError:\nerror(\nf\"Error occurred when getting cache path for \"\nf\"{self.kind}: {path}. It was not of expected \"\nf\"type Path but of type {type(path)}:\",\n)\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"w+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent))\nreturn\n
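A minimal sketch of how subclasses plug into this machinery: anything defining collection, kind, id and as_dict() inherits a working write_to_cache(). The class below is illustrative only, not part of the library:
class ExampleCache(Cache):\n    kind = \"example\"  # determines the cache sub-directory\n    def __init__(self, collection: str, identifier: str):\n        self.collection = collection\n        self.id = identifier\n    def as_dict(self) -> dict:\n        return {\"id\": self.id}\n\nExampleCache(\"hmd\", \"0001\").write_to_cache()  # -> CACHE_HOME/hmd/example/0001.json\n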
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Collection","title":"Collection","text":"Collection(name: str = 'hmd', jisc_papers: Optional[pd.DataFrame] = None)\n
A Collection represents a group of newspaper archives built from any alto2txt metadata output passed to it.
A Collection is initialised with a name and an optional pandas DataFrame of JISC papers. The archives
property returns an iterable of the Archive
objects within the collection.
Attributes:
Name Type Descriptionname
str
Name of the collection (default \"hmd\")
jisc_papers
pandas.DataFrame
DataFrame of JISC papers, optional
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, name: str = \"hmd\", jisc_papers: Optional[pd.DataFrame] = None):\n\"\"\"Constructor method.\"\"\"\nself.name: str = name\nself.jisc_papers: pd.DataFrame | None = jisc_papers\nself.dir: Path = Path(f\"{MNT}/{self.name}-alto2txt/metadata\")\nself.zip_files: list[Path] = sorted(\nlist(self.dir.glob(\"*.zip\")), key=lambda x: x.stat().st_size\n)\nself.zip_file_count: int = sum([1 for _ in self.dir.glob(\"*.zip\")])\nself.report_id: str = str(uuid.uuid4())\nself.empty: bool = self.zip_file_count == 0\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider","title":"DataProvider","text":"DataProvider(collection: str)\n
Bases: Cache
The DataProvider class extends the Cache class and represents a newspaper data provider. The class has several properties and methods that allow creation of a data provider object and the manipulation of its data.
Attributes:
Name Type Descriptioncollection
str
A string representing publication collection
kind
str
Indication of object type, defaults to data-provider
providers_meta_data
list[FixtureDict]
structured dict of metadata for known collection sources
collection_type
str
related data sources and potential linkage source
index_field
str
field name for querying existing records
Example>>> from pprint import pprint\n>>> hmd = DataProvider(\"hmd\")\n>>> hmd.pk\n2\n>>> pprint(hmd.as_dict())\n{'code': 'bl-hmd',\n 'collection': 'newspapers',\n 'legacy_code': 'hmd',\n 'name': 'Heritage Made Digital',\n 'source_note': 'British Library-funded digitised newspapers provided by the '\n 'British Newspaper Archive'}\n
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, collection: str):\n\"\"\"Constructor method.\"\"\"\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.DataProvider.meta_data","title":"meta_data property
","text":"meta_data: FixtureDict | dict\n
Return self.providers_meta_data[self.collection]
or {}
.
property
","text":"meta_data_fields: FixtureDict | dict\n
Return self.providers_meta_data[self.collection]
or {}
.
property
","text":"pk: int | None\n
Return pk
if provided via providers_meta_data
, else None
.
property
","text":"providers_index_dict: dict[str, FixtureDict]\n
Return all self.index_field
values from providers_meta_data
.
as_dict() -> dict\n
Return a dict
of the data provider object.
Returns:
Type Descriptiondict
Dictionary representation of the DataProvider object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n Return a `dict` of the data provider object.\n Returns:\n Dictionary representation of the DataProvider object\n \"\"\"\nif self.meta_data:\nreturn {\n\"name\": self.meta_data_fields[\"name\"],\n\"code\": self.meta_data_fields[\"code\"],\n\"legacy_code\": self.collection,\n\"source_note\": self.meta_data_fields[\"source_note\"],\n\"collection\": self.collection_type,\n}\nelse:\nreturn {\n\"name\": self.collection,\n\"code\": slugify(self.collection),\n\"source_note\": \"\",\n\"legacy_code\": None,\n\"collection\": self.collection_type,\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation","title":"Digitisation","text":"Digitisation(root: ET.Element, collection: str = '')\n
Bases: Cache
The Digitisation class extends the Cache class and represents a newspaper digitisation. The class has several properties and methods that allow creation of a digitisation object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
collection
str
A string that represents the collection of the publication
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'digitisation'\n
A string that represents the type of the object, set to \"digitisation\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Digitisation.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the digitisation object.
Returns:
Type Descriptiondict
Dictionary representation of the Digitising object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the digitisation\n object.\n Returns:\n Dictionary representation of the Digitising object\n \"\"\"\ndic = {\nx.tag: x.text or \"\"\nfor x in self.root.findall(\"./process/*\")\nif x.tag\nin [\n\"xml_flavour\",\n\"software\",\n\"mets_namespace\",\n\"alto_namespace\",\n]\n}\nif not dic.get(\"software\"):\nreturn {}\nreturn dic\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document","title":"Document","text":"Document(*args, **kwargs)\n
The Document class is a representation of a document that contains information about a publication, newspaper, item, digitisation, and ingest. This class holds all the relevant information about a document in a structured manner and provides properties that can be used to access different aspects of the document.
Attributes:
Name Type Descriptioncollection
str | None
A string that represents the collection of the publication
root
ET.Element | None
An XML
element that represents the root of the publication
zip_file
str | None
A path to a valid zip
file
jisc_papers
pd.DataFrame | None
A pandas
DataFrame
object that holds information about the JISC papers
meta
dotdict | None
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, *args, **kwargs):\n\"\"\"Constructor method.\"\"\"\nself.collection: str | None = kwargs.get(\"collection\")\nif not self.collection or not isinstance(self.collection, str):\nraise RuntimeError(\"A valid collection must be passed\")\nself.root: ET.Element | None = kwargs.get(\"root\")\nif not self.root or not isinstance(self.root, ET.Element):\nraise RuntimeError(\"A valid XML root must be passed\")\nself.zip_file: str | None = kwargs.get(\"zip_file\")\nif self.zip_file and not isinstance(self.zip_file, str):\nraise RuntimeError(\"A valid zip file must be passed\")\nself.jisc_papers: pd.DataFrame | None = kwargs.get(\"jisc_papers\")\nif not isinstance(self.jisc_papers, pd.DataFrame):\nraise RuntimeError(\n\"A valid DataFrame containing JISC papers must be passed\"\n)\nself.meta: dotdict | None = kwargs.get(\"meta\")\nself._publication_elem = None\nself._input_sub_path = None\nself._ingest = None\nself._digitisation = None\nself._item = None\nself._issue = None\nself._newspaper = None\nself._data_provider = None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Document.publication","title":"publication property
","text":"publication: ET.Element\n
This property returns the XML element representing the publication information in the document.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest","title":"Ingest","text":"Ingest(root: ET.Element, collection: str = '')\n
Bases: Cache
The Ingest class extends the Cache class and represents a newspaper ingest. The class has several properties and methods that allow the creation of an ingest object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
collection
str
A string that represents the collection of the publication
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(self, root: ET.Element, collection: str = \"\"):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.root: ET.Element = root\nself.collection: str = collection\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'ingest'\n
A string that represents the type of the object, set to \"ingest\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Ingest.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the ingest object.
Returns:
Type Descriptiondict
Dictionary representation of the Ingest object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the ingest\n object.\n Returns:\n Dictionary representation of the Ingest object\n \"\"\"\nreturn {\nf\"lwm_tool_{x.tag}\": x.text or \"\"\nfor x in self.root.findall(\"./process/lwm_tool/*\")\n}\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue","title":"Issue","text":"Issue(\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n)\n
Bases: Cache
The Issue class extends the Cache class and represents a newspaper issue. The class has several properties and methods that allow the creation of an issue object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
An xml element that represents the root of the publication
newspaper
Newspaper | None
The parent newspaper
collection
str
A string that represents the collection of the publication
input_sub_path
str
TODO
meta
dotdict
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\npublication: ET.Element,\nnewspaper: Optional[Newspaper] = None,\ncollection: str = \"\",\ninput_sub_path: str = \"\",\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nself.publication: ET.Element = publication\nself.newspaper: Newspaper | None = newspaper\nself.collection: str = collection\nself.input_sub_path: str = input_sub_path\nself.meta: dotdict = meta\nself._issue = None\nself._issue_date = None\npath: str = str(self.get_cache_path())\nif not self.meta.issue_paths:\nself.meta.issue_paths = [path]\nelif path not in self.meta.issue_paths:\nself.meta.issue_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_code","title":"issue_code property
","text":"issue_code: str\n
Sets up and saves the issue code for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.issue_date","title":"issue_dateproperty
","text":"issue_date: str\n
Sets up and saves the issue date for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.kind","title":"kindclass-attribute
instance-attribute
","text":"kind = 'issue'\n
A string that represents the type of the object, set to \"issue\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the issue object.
Returns:
Type Descriptiondict
Dictionary representation of the Issue object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the issue\n object.\n Returns:\n Dictionary representation of the Issue object\n \"\"\"\nif not self._issue:\nself._issue = dict(\nissue_code=self.issue_code,\nissue_date=self.issue_date,\npublication__publication_code=self.newspaper.publication_code,\ninput_sub_path=self.input_sub_path,\n)\nreturn self._issue\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Issue.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the issue object.
Returns:
Type DescriptionPath
Path to the cache file for the issue object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the issue object.\n Returns:\n Path to the cache file for the issue object\n \"\"\"\njson_file = f\"/{self.newspaper.publication_code}/issues/{self.issue_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item","title":"Item","text":"Item(\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n)\n
Bases: Cache
The Item class extends the Cache class and represents a newspaper item, i.e. an article. The class has several properties and methods that allow the creation of an article object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
ET.Element
An xml element that represents the root of the publication
issue_code
str
A string that represents the issue code
digitisation
dict
TODO
ingest
dict
TODO
collection
str
A string that represents the collection of the publication
newspaper
Newspaper | None
The parent newspaper
meta
dotdict
TODO
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\nissue_code: str = \"\",\ndigitisation: dict = {},\ningest: dict = {},\ncollection: str = \"\",\nnewspaper: Optional[Newspaper] = None,\nmeta: dotdict = dotdict(),\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nif not isinstance(newspaper, Newspaper):\nraise RuntimeError(\"Expected newspaper to be of type router.Newspaper\")\nself.root: ET.Element = root\nself.issue_code: str = issue_code\nself.digitisation: dict = digitisation\nself.ingest: dict = ingest\nself.collection: str = collection\nself.newspaper: Newspaper | None = newspaper\nself.meta: dotdict = meta\nself._item_elem = None\nself._item_code = None\nself._item = None\npath: str = str(self.get_cache_path())\nif not self.meta.item_paths:\nself.meta.item_paths = [path]\nelif path not in self.meta.item_paths:\nself.meta.item_paths.append(path)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_code","title":"item_code property
","text":"item_code: str\n
Sets up and saves the item code for easy access as property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.item_elem","title":"item_elemproperty
","text":"item_elem\n
Sets up and saves the issue XML item for easy access as a property.
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.kind","title":"kindclass-attribute
instance-attribute
","text":"kind = 'item'\n
A string that represents the type of the object, set to \"item\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the item object (i.e. article).
Returns:
Type Descriptiondict
Dictionary representation of the Item object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the item object\n (i.e. article).\n Returns:\n Dictionary representation of the Item object\n \"\"\"\nif not self._item:\nself._item = {\nf\"{x.tag}\": x.text or \"\"\nfor x in self.item_elem.findall(\"*\")\nif x.tag\nin [\n\"title\",\n\"word_count\",\n\"ocr_quality_mean\",\n\"ocr_quality_sd\",\n\"plain_text_file\",\n\"item_type\",\n]\n}\nself._item[\"title\"] = self._item.get(\"title\", \"\")[:2097151]\nself._item = {\n\"item_code\": self.item_code,\n\"word_count\": self._item.get(\"word_count\", 0),\n\"title\": self._item.get(\"title\"),\n\"item_type\": self._item.get(\"item_type\"),\n\"input_filename\": self._item.get(\"plain_text_file\", \"\"),\n\"ocr_quality_mean\": self._item.get(\"ocr_quality_mean\", 0),\n\"ocr_quality_sd\": self._item.get(\"ocr_quality_sd\", 0),\n\"digitisation__software\": self.digitisation.id,\n\"ingest__lwm_tool_identifier\": self.ingest.id,\n\"issue__issue_identifier\": self.issue_code,\n\"data_provider__name\": self.collection,\n}\nreturn self._item\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the item (article) object.
Returns:
Type DescriptionPath
Path to the cache file for the article object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the item (article) object.\n Returns:\n Path to the cache file for the article object\n \"\"\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\"\n+ \"/\".join(self.newspaper.number_paths)\n+ f\"/{self.newspaper.publication_code}/items.jsonl\"\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Item.write_to_cache","title":"write_to_cache","text":"write_to_cache(json_indent = JSON_INDENT) -> None\n
Special cache-write method that appends to the cache file rather than writing it once at the end of the process.
Returns:
Type DescriptionNone
None.
Source code inalto2txt2fixture/router.py
def write_to_cache(self, json_indent=JSON_INDENT) -> None:\n\"\"\"\n Special cache-write function that appends rather than writes at the\n end of the process.\n Returns:\n None.\n \"\"\"\npath = self.get_cache_path()\npath.parent.mkdir(parents=True, exist_ok=True)\nwith open(path, \"a+\") as f:\nf.write(json.dumps(self.as_dict(), indent=json_indent) + \"\\n\")\nreturn\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper","title":"Newspaper","text":"Newspaper(\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n)\n
Bases: Cache
The Newspaper class extends the Cache class and represents a newspaper.
The class has several properties and methods that allow the creation of a newspaper object and the manipulation of its data.
Attributes:
Name Type Descriptionroot
An xml element that represents the root of the publication.
collection
A string that represents the collection of the publication.
meta
A dotdict object that holds metadata about the publication.
jisc_papers
A pandas DataFrame object for JISC paper information.
Constructor method.
Source code inalto2txt2fixture/router.py
def __init__(\nself,\nroot: ET.Element,\ncollection: str = \"\",\nmeta: dotdict = dotdict(),\njisc_papers: Optional[pd.DataFrame] = None,\n):\n\"\"\"Constructor method.\"\"\"\nif not isinstance(root, ET.Element):\nraise RuntimeError(f\"Expected root to be xml.etree.Element: {type(root)}\")\nself.publication = root.find(\"./publication\")\nself.input_sub_path = root.find(\"./process/input_sub_path\").text\nself.issue_date = self.publication.find(\"./issue/date\").text\nself.collection = collection\nself.meta = meta\nself.jisc_papers = jisc_papers\nself._newspaper = None\nself._title = None\nself._publication_code = None\npath = str(self.get_cache_path())\nif not self.meta.newspaper_paths:\nself.meta.newspaper_paths = []\nelif path not in self.meta.newspaper_paths:\nself.meta.newspaper_paths.append(path)\nif not self.meta.publication_codes:\nself.meta.publication_codes = [self.publication_code]\nelif self.publication_code not in self.meta.publication_codes:\nself.meta.publication_codes.append(self.publication_code)\nself.zip_file = Path(meta.path).name\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.kind","title":"kind class-attribute
instance-attribute
","text":"kind = 'newspaper'\n
A string that represents the type of the object, set to \"newspaper\".
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.number_paths","title":"number_pathsproperty
","text":"number_paths: list\n
Returns the nested directories in which we want to save the cache file.
Returns:
Type Descriptionlist
List of the desired directories in descending order
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code","title":"publication_codeproperty
","text":"publication_code: str\n
A property that returns the code of the publication.
Returns:
Type Descriptionstr
The code of the publication
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.title","title":"titleproperty
","text":"title: str\n
A property that returns the title of the newspaper.
Returns:
Type Descriptionstr
The title of the newspaper
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.as_dict","title":"as_dict","text":"as_dict() -> dict\n
A method that returns a dictionary representation of the newspaper object.
Returns:
Type Descriptiondict
Dictionary representation of the Newspaper object
Source code inalto2txt2fixture/router.py
def as_dict(self) -> dict:\n\"\"\"\n A method that returns a dictionary representation of the newspaper\n object.\n Returns:\n Dictionary representation of the Newspaper object\n \"\"\"\nif not self._newspaper:\nself._newspaper = dict(\n**dict(publication_code=self.publication_code, title=self.title),\n**{\nx.tag: x.text or \"\"\nfor x in self.publication.findall(\"*\")\nif x.tag in [\"location\"]\n},\n)\nreturn self._newspaper\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.get_cache_path","title":"get_cache_path","text":"get_cache_path() -> Path\n
Returns the path to the cache file for the newspaper object.
Returns:
Type DescriptionPath
Path to the cache file for the newspaper object
Source code inalto2txt2fixture/router.py
def get_cache_path(self) -> Path:\n\"\"\"\n Returns the path to the cache file for the newspaper object.\n Returns:\n Path to the cache file for the newspaper object\n \"\"\"\njson_file = f\"/{self.publication_code}/{self.publication_code}.json\"\nreturn Path(\nf\"{CACHE_HOME}/{self.collection}/\" + \"/\".join(self.number_paths) + json_file\n)\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.Newspaper.publication_code_from_input_sub_path","title":"publication_code_from_input_sub_path","text":"publication_code_from_input_sub_path() -> str | None\n
A method that returns the publication code from the input sub-path of the publication process.
Returns:
Type Descriptionstr | None
The code of the publication
Source code inalto2txt2fixture/router.py
def publication_code_from_input_sub_path(self) -> str | None:\n\"\"\"\n A method that returns the publication code from the input sub-path of\n the publication process.\n Returns:\n The code of the publication\n \"\"\"\ng = PUBLICATION_CODE.findall(self.input_sub_path)\nif len(g) == 1:\nreturn g[0]\nreturn None\n
"},{"location":"reference/alto2txt2fixture/router.html#alto2txt2fixture.router.route","title":"route","text":"route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None\n
This function is responsible for setting up the path for the alto2txt mountpoint, setting up the JISC papers and routing the collections for processing.
Parameters:
Name Type Description Defaultcollections
list
List of collection names
requiredcache_home
str
Directory path for the cache
requiredmountpoint
str
Directory path for the alto2txt mountpoint
requiredjisc_papers_path
str
Path to the JISC papers
requiredreport_dir
str
Path to the report directory
requiredReturns:
Type DescriptionNone
None
Source code inalto2txt2fixture/router.py
def route(\ncollections: list,\ncache_home: str,\nmountpoint: str,\njisc_papers_path: str,\nreport_dir: str,\n) -> None:\n\"\"\"\n This function is responsible for setting up the path for the alto2txt\n mountpoint, setting up the JISC papers and routing the collections for\n processing.\n Args:\n collections: List of collection names\n cache_home: Directory path for the cache\n mountpoint: Directory path for the alto2txt mountpoint\n jisc_papers_path: Path to the JISC papers\n report_dir: Path to the report directory\n Returns:\n None\n \"\"\"\nglobal CACHE_HOME\nglobal MNT\nglobal REPORT_DIR\nCACHE_HOME = cache_home\nREPORT_DIR = report_dir\nMNT = Path(mountpoint) if isinstance(mountpoint, str) else mountpoint\nif not MNT.exists():\nerror(\nf\"The mountpoint provided for alto2txt does not exist. \"\nf\"Either create a local copy or blobfuse it to \"\nf\"`{MNT.absolute()}`.\"\n)\njisc_papers = setup_jisc_papers(path=jisc_papers_path)\nfor collection_name in collections:\ncollection = Collection(name=collection_name, jisc_papers=jisc_papers)\nif collection.empty:\nerror(\nf\"It looks like {collection_name} is empty in the \"\nf\"alto2txt mountpoint: `{collection.dir.absolute()}`.\"\n)\nfor archive in collection.archives:\nwith archive as _:\n[\n(\ndoc.item.write_to_cache(),\ndoc.newspaper.write_to_cache(),\ndoc.issue.write_to_cache(),\ndoc.data_provider.write_to_cache(),\ndoc.ingest.write_to_cache(),\ndoc.digitisation.write_to_cache(),\n)\nfor doc in archive.documents\n]\nreturn\n
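A usage sketch with illustrative paths; in normal use these arguments come from the settings module or command line options:
route(\n    collections=[\"hmd\"],\n    cache_home=\"./cache\",\n    mountpoint=\"./alto2txt-mnt\",\n    jisc_papers_path=\"./jisc_papers.csv\",\n    report_dir=\"./reports\",\n)\n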
"},{"location":"reference/alto2txt2fixture/settings.html","title":"settings","text":"The settings
module provides configuration for running alto2txt2fixture
.
Most of these are managed via the settings
variable in this module.
Note
See the command line interface parameters documentation for means of modifying settings
when run.
Attributes:
Name Type DescriptionJSON_INDENT
Amount of indentation to include in output JSON
files
DATA_PROVIDER_INDEX
Final[str]
The field
used to index DataProvider
records
NEWSPAPER_COLLECTION_METADATA
Final[list[FixtureDict]]
A list of FixtureDict
s specifying specific newspaper data providers
SETUP_TITLE
str
the title printed at the command line via the cli.show_setup()
function
settings
dotdict
a dotdict
configuration for running newspaper
portions of alto2txt2fixture
Bases: TypedDict
A dict
structure to ease use as a json
database fixture.
Attributes:
Name Type Descriptionpk
int
an id to uniquely define and query each entry
model
str
what model a given record is for
fields
dict[str, Any]
a dict
of record information conforming to model
table
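An illustrative instance; the model label and field values are hypothetical but follow the Django fixture layout used throughout these docs:
fixture: FixtureDict = {\n    \"pk\": 2,\n    \"model\": \"newspapers.dataprovider\",  # hypothetical model label\n    \"fields\": {\"name\": \"Heritage Made Digital\", \"code\": \"bl-hmd\"},\n}\n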
Bases: NamedTuple
A named tuple of fields for translation.
Attributes:
Name Type Descriptionstart
str
A string representing the starting field name.
finish
str | list
A string or list specifying the field(s) to be translated. If it is a string, the translated field will be a direct mapping of the specified field in each item of the input list. If it is a list, the translated field will be a hyphen-separated concatenation of the specified fields in each item of the input list.
lst
list[dict]
A list of dictionaries representing the items to be translated. Each dictionary should contain the necessary fields for translation, with the field names specified in the start
parameter.
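An illustrative entry of this shape, modelled on the get_translator() call in the parser module; the lst record is hypothetical:
entry = TranslatorTuple(\n    start=\"issue__issue_identifier\",\n    finish=\"issue_code\",\n    lst=[{\"pk\": 57, \"fields\": {\"issue_code\": \"0002647-18940101\"}}],\n)\n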
Bases: dict
dot.notation access to dictionary attributes
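For example:
>>> d = dotdict({\"collection\": \"hmd\"})\n>>> d.collection\n'hmd'\n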
"},{"location":"reference/alto2txt2fixture/utils.html","title":"utils","text":""},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.check_newspaper_collection_configuration","title":"check_newspaper_collection_configuration","text":"check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[\nFixtureDict\n] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]\n
Check the names in collections
match the names in newspaper_collections
.
Parameters:
Name Type Description Defaultcollections
Iterable[str]
Names of newspaper collections, defaults to settings.COLLECTIONS
settings.COLLECTIONS
newspaper_collections
Iterable[FixtureDict]
Newspaper collections in a list of FixtureDict
format. Defaults to settings.FIXTURE_TABLE['dataprovider']
NEWSPAPER_COLLECTION_METADATA
data_provider_index
str
dict
fields
key
used to check matching collections
name
DATA_PROVIDER_INDEX
Returns:
Type Descriptionset[str]
A set of collections
without a matching newspaper_collections
record.
>>> check_newspaper_collection_configuration()\nset()\n
Source code in alto2txt2fixture/utils.py
def check_newspaper_collection_configuration(\ncollections: Iterable[str] = settings.COLLECTIONS,\nnewspaper_collections: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\ndata_provider_index: str = DATA_PROVIDER_INDEX,\n) -> set[str]:\n\"\"\"Check the names in `collections` match the names in `newspaper_collections`.\n Arguments:\n collections:\n Names of newspaper collections, defaults to ``settings.COLLECTIONS``\n newspaper_collections:\n Newspaper collections in a list of `FixtureDict` format. Defaults\n to ``settings.FIXTURE_TABLE['dataprovider]``\n data_provider_index:\n `dict` `fields` `key` used to check matchiching `collections` name\n Returns:\n A set of ``collections`` without a matching `newspaper_collections` record.\n Example:\n ```pycon\n >>> check_newspaper_collection_configuration()\n set()\n ```\n \"\"\"\nnewspaper_collection_names: tuple[str, ...] = tuple(\ndict_from_list_fixture_fields(\nnewspaper_collections, field_name=data_provider_index\n).keys()\n)\ncollection_diff: set[str] = set(collections) - set(newspaper_collection_names)\nif collection_diff:\nwarning(\nf\"{len(collection_diff)} `collections` \"\nf\"not in `newspaper_collections`: {collection_diff}\"\n)\nreturn collection_diff\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.clear_cache","title":"clear_cache","text":"clear_cache(dir: str | Path) -> None\n
Clears the cache directory by removing all .json
files in it.
Parameters:
Name Type Description Defaultdir
str | Path
The path of the directory to be cleared.
required Source code inalto2txt2fixture/utils.py
def clear_cache(dir: str | Path) -> None:\n\"\"\"\n Clears the cache directory by removing all `.json` files in it.\n Args:\n dir: The path of the directory to be cleared.\n \"\"\"\ndir = get_path_from(dir)\ny = input(\nf\"Do you want to erase the cache path now that the \"\nf\"files have been generated ({dir.absolute()})? [y/N]\"\n)\nif y.lower() == \"y\":\ninfo(\"Clearing up the cache directory\")\nfor x in dir.glob(\"*.json\"):\nx.unlink()\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.create_lookup","title":"create_lookup","text":"create_lookup(lst: list = [], on: list = []) -> dict\n
Create a lookup dictionary from a list of dictionaries.
Parameters:
Name Type Description Defaultlst
list
A list of dictionaries that should be used to generate the lookup.
[]
on
list
A list of keys from the dictionaries in the list that should be used as the keys in the lookup.
[]
Returns:
Type Descriptiondict
The generated lookup dictionary.
Source code inalto2txt2fixture/utils.py
def create_lookup(lst: list = [], on: list = []) -> dict:\n\"\"\"\n Create a lookup dictionary from a list of dictionaries.\n Args:\n lst: A list of dictionaries that should be used to generate the lookup.\n on: A list of keys from the dictionaries in the list that should be used as the keys in the lookup.\n Returns:\n The generated lookup dictionary.\n \"\"\"\nreturn {get_key(x, on): x[\"pk\"] for x in lst}\n
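An illustrative sketch, assuming get_key joins the values of the on keys with a hyphen; the record is hypothetical:
>>> create_lookup(\n...     [{\"pk\": 7, \"publication_code\": \"0002647\", \"year\": \"1894\"}],\n...     on=[\"publication_code\", \"year\"],\n... )\n{'0002647-1894': 7}\n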
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.dict_from_list_fixture_fields","title":"dict_from_list_fixture_fields","text":"dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]\n
Create a dict
from fixture_list
with field_name
as key
.
Parameters:
Name Type Description Defaultfixture_list
Iterable[FixtureDict]
list
of FixtureDict
with field_name
key fields
.
NEWSPAPER_COLLECTION_METADATA
field_name
str
key for values within fixture_list
fields
.
DATA_PROVIDER_INDEX
Returns:
Type Descriptiondict[str, FixtureDict]
A dict
where extracted field_name
is key for related FixtureDict
values.
>>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n>>> fixture_dict['hmd']['pk']\n2\n>>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture_dict['hmd']['fields']['code']\n'bl-hmd'\n
Source code in alto2txt2fixture/utils.py
def dict_from_list_fixture_fields(\nfixture_list: Iterable[FixtureDict] = NEWSPAPER_COLLECTION_METADATA,\nfield_name: str = DATA_PROVIDER_INDEX,\n) -> dict[str, FixtureDict]:\n\"\"\"Create a `dict` from ``fixture_list`` with ``attr_name`` as `key`.\n Args:\n fixture_list: `list` of `FixtureDict` with ``attr_name`` key `fields`.\n field_name: key for values within ``fixture_list`` `fields`.\n Returns:\n A `dict` where extracted `field_name` is key for related `FixtureDict` values.\n Example:\n ```pycon\n >>> fixture_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields()\n >>> fixture_dict['hmd']['pk']\n 2\n >>> fixture_dict['hmd']['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> fixture_dict['hmd']['fields']['code']\n 'bl-hmd'\n ```\n \"\"\"\nreturn {record[\"fields\"][field_name]: record for record in fixture_list}\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.export_fixtures","title":"export_fixtures","text":"export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None\n
Export fixture_tables
in formats
.
This is still in an experimental phase of development and is not recommended for production.
Parameters:
Name Type Description Defaultfixture_tables
dict[str, Sequence[FixtureDict]]
dict
of table name (eg: dataprovider
) and FixtureDict
path
str | PathLike
Path to save exports in
settings.FIXTURE_TABLES_OUTPUT
prefix
str
str
to prefix export filenames with
'test-'
formats
Sequence[EXPORT_FORMATS]
list of EXPORT_FORMATS
to export
settings.FIXTURE_TABLES_FORMATS
Example >>> test_fixture_tables: dict[str, FixtureDict] = {\n... 'test0': NEWSPAPER_COLLECTION_METADATA,\n... 'test1': NEWSPAPER_COLLECTION_METADATA}\n>>> export_fixtures(test_fixture_tables, path='tests/')\n... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n<BLANKLINE>\n...Warning: Saving test0...\n...Warning: Saving test1...\n>>> from pandas import read_csv\n>>> fixture0_json = load_json('tests/test-test0-1.json')\n>>> fixture0_df = read_csv('tests/test-test0-1.csv')\n>>> fixture1_json = load_json('tests/test-test1-1.json')\n>>> fixture1_df = read_csv('tests/test-test1-1.csv')\n>>> fixture0_json == fixture1_json\nTrue\n>>> all(fixture0_df == fixture1_df)\nTrue\n>>> all(field in fixture0_json[0]['fields']\n... for field in ['created_at', 'updated_at'])\nTrue\n>>> fixture0_json[1]['pk']\n2\n>>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n[2, 'hmd']\n
Source code in alto2txt2fixture/utils.py
def export_fixtures(\nfixture_tables: dict[str, Sequence[FixtureDict]],\npath: str | PathLike = settings.FIXTURE_TABLES_OUTPUT,\nprefix: str = \"test-\",\nadd_created: bool = True,\nformats: Sequence[EXPORT_FORMATS] = settings.FIXTURE_TABLES_FORMATS,\n) -> None:\n\"\"\"Export ``fixture_tables`` in ``formats``.\n Note:\n This is still in experimental phase of development and not recommended\n for production.\n Args:\n fixture_tables: `dict` of table name (eg: `dataprovider`) and `FixtureDict`\n path: Path to save exports in\n prefix: `str` to prefix export filenames with\n formats: list of `EXPORT_FORMATS` to export\n Example:\n ```pycon\n >>> test_fixture_tables: dict[str, FixtureDict] = {\n ... 'test0': NEWSPAPER_COLLECTION_METADATA,\n ... 'test1': NEWSPAPER_COLLECTION_METADATA}\n >>> export_fixtures(test_fixture_tables, path='tests/')\n ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE\n <BLANKLINE>\n ...Warning: Saving test0...\n ...Warning: Saving test1...\n >>> from pandas import read_csv\n >>> fixture0_json = load_json('tests/test-test0-1.json')\n >>> fixture0_df = read_csv('tests/test-test0-1.csv')\n >>> fixture1_json = load_json('tests/test-test1-1.json')\n >>> fixture1_df = read_csv('tests/test-test1-1.csv')\n >>> fixture0_json == fixture1_json\n True\n >>> all(fixture0_df == fixture1_df)\n True\n >>> all(field in fixture0_json[0]['fields']\n ... for field in ['created_at', 'updated_at'])\n True\n >>> fixture0_json[1]['pk']\n 2\n >>> fixture0_json[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> fixture0_df[['pk', DATA_PROVIDER_INDEX]].iloc[1].to_list()\n [2, 'hmd']\n ```\n \"\"\"\nfor table_name, records in fixture_tables.items():\nwarning(\nf\"Saving {table_name} fixture in {formats} formats \"\nf\"to {path} *without* checks...\"\n)\nif \"json\" in formats:\nsave_fixture(\nrecords,\nprefix=f\"{prefix}{table_name}\",\noutput_path=path,\nadd_created=add_created,\n)\nif \"csv\" in formats:\nfixtures_dict2csv(records, prefix=f\"{prefix}{table_name}\", output_path=path)\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.filter_json_fields","title":"filter_json_fields","text":"filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs: Hashable\n) -> dict | list\n
Return keys
and values
from json_results
where any fields
equal value
.
Parameters:
Name Type Description Defaultfile_path
PathLike | None
The file path
to load based on extension and filter
None
fields
Sequence[str]
Which fields to check equal value
[]
value
Hashable
Value to filter by
''
Returns:
Type Descriptiondict | list
A dict
of records indexed by pk
which fit filter criteria
Raises:
Type DescriptionValueError
file_path
must have a .json
suffix
>>> from pprint import pprint\n>>> entry_fixture: dict = [\n... {\"pk\": 4889, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n... \"price_raw\": ['2d'],\n... \"year\": 1920,\n... \"date_established_raw\": \"1857\",\n... \"persons\": [], \"newspaper\": \"\"}},\n... {\"pk\": 9207, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n... \"price_raw\": ['2 \u00bd d', '3 \u00bd d'],\n... \"year\": 1856,\n... \"date_established_raw\": \"1848\",\n... \"persons\": ['Stephen Soulby'],\n... \"newspaper\": \"\",}},\n... {\"pk\": 15, \"model\": \"mitchells.entry\",\n... \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n... \"price_raw\": ['2d', '3d'],\n... \"year\": 1857,\n... \"date_established_raw\": \"November , 1842\",\n... \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n... \"newspaper\": 1187}}\n... ]\n>>> pprint(filter_json_fields(entry_fixture,\n... fields=(\"newspaper\", \"persons\"),\n... value=\"\"))\n[{'fields': {'date_established_raw': '1857',\n 'newspaper': '',\n 'persons': [],\n 'price_raw': ['2d'],\n 'title': 'BIRMINGHAM POST .',\n 'year': 1920},\n 'model': 'mitchells.entry',\n 'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n 'newspaper': '',\n 'persons': ['Stephen Soulby'],\n 'price_raw': ['2 \u00bd d', '3 \u00bd d'],\n 'title': 'ULVERSTONE ADVERTISER .',\n 'year': 1856},\n 'model': 'mitchells.entry',\n 'pk': 9207}]\n
Source code in alto2txt2fixture/utils.py
def filter_json_fields(\njson_results: list | dict | None = None,\nfile_path: PathLike | None = None,\nfields: Sequence[str] = [],\nvalue: Hashable = \"\",\n**kwargs,\n) -> dict | list:\n\"\"\"Return `keys` and `values` from `json_dict` where any `fields` equal `value`.\n Args:\n file_path: The file `path` to load based on extension and filter\n fields: Which fields to check equal `value`\n value: Value to filter by\n Returns:\n A `dict` of records indexed by `pk` which fit filter criteria\n Raises:\n ValueError: ``file_path`` must have a `.json` `suffix`\n Example:\n ```pycon\n >>> from pprint import pprint\n >>> entry_fixture: dict = [\n ... {\"pk\": 4889, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"BIRMINGHAM POST .\",\n ... \"price_raw\": ['2d'],\n ... \"year\": 1920,\n ... \"date_established_raw\": \"1857\",\n ... \"persons\": [], \"newspaper\": \"\"}},\n ... {\"pk\": 9207, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"ULVERSTONE ADVERTISER .\",\n ... \"price_raw\": ['2 \\u00bd d', '3 \\u00bd d'],\n ... \"year\": 1856,\n ... \"date_established_raw\": \"1848\",\n ... \"persons\": ['Stephen Soulby'],\n ... \"newspaper\": \"\",}},\n ... {\"pk\": 15, \"model\": \"mitchells.entry\",\n ... \"fields\": {\"title\": \"LLOYD'S WEEKLY LONDON NEWSPAPER .\",\n ... \"price_raw\": ['2d', '3d'],\n ... \"year\": 1857,\n ... \"date_established_raw\": \"November , 1842\",\n ... \"persons\": ['Mr. Douglas Jerrold', 'Edward Lloyd'],\n ... \"newspaper\": 1187}}\n ... ]\n >>> pprint(filter_json_fields(entry_fixture,\n ... fields=(\"newspaper\", \"persons\"),\n ... value=\"\"))\n [{'fields': {'date_established_raw': '1857',\n 'newspaper': '',\n 'persons': [],\n 'price_raw': ['2d'],\n 'title': 'BIRMINGHAM POST .',\n 'year': 1920},\n 'model': 'mitchells.entry',\n 'pk': 4889},\n {'fields': {'date_established_raw': '1848',\n 'newspaper': '',\n 'persons': ['Stephen Soulby'],\n 'price_raw': ['2 \\u00bd d', '3 \\u00bd d'],\n 'title': 'ULVERSTONE ADVERTISER .',\n 'year': 1856},\n 'model': 'mitchells.entry',\n 'pk': 9207}]\n ```\n \"\"\"\nif not json_results:\nassert file_path\ntry:\nassert Path(file_path).suffix == \".json\"\nexcept AssertionError:\nraise ValueError(f\"{file_path} must be `json` format.\")\njson_results = load_json(Path(file_path), **kwargs)\nassert json_results\nif isinstance(json_results, dict):\nreturn {\nk: v\nfor k, v in json_results.items()\nif any(v[\"fields\"][field] == value for field in fields)\n}\nelse:\nreturn [\nv\nfor v in json_results\nif any(v[\"fields\"][field] == value for field in fields)\n]\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_fields","title":"fixture_fields","text":"fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]\n
Generate a tuple of FixtureDict
field
names.
This is not in the utils
module to avoid a circular import.
Parameters:
Name Type Description Defaultfixture_dict
FixtureDict
A FixtureDict
instance to extract names from fields
include_pk
bool
Whether to include the pk
(primary key) column
True
Example >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n('name', 'code', 'legacy_code', 'collection', 'source_note')\n>>> hmd_dict: dict[str, Any] = fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n>>> hmd_dict['code']\n'bl-hmd'\n>>> hmd_dict['pk']\n2\n>>> hmd_dict = fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n>>> 'pk' in hmd_dict\nFalse\n
Source code in alto2txt2fixture/utils.py
def fixture_fields(\nfixture_dict: FixtureDict, include_pk: bool = True, as_dict: bool = False\n) -> tuple[str, ...] | dict[str, Any]:\n\"\"\"Generate a tuple of `FixtureDict` `field` names.\n Note:\n This is not in the `utils` module to avoid a circular import.\n Args:\n fixture_dict: A `FixtureDict` instance to extract names from `fields`\n include_pk: Whether to include the `pk` (primary key) column\n Example:\n ```pycon\n >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0])\n ('pk', 'name', 'code', 'legacy_code', 'collection', 'source_note')\n >>> fixture_fields(NEWSPAPER_COLLECTION_METADATA[0], include_pk=False)\n ('name', 'code', 'legacy_code', 'collection', 'source_note')\n >>> hmd_dict: dict[str, Any] = fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA[1], as_dict=True)\n >>> hmd_dict['code']\n 'bl-hmd'\n >>> hmd_dict['pk']\n 2\n >>> hmd_dict = fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA[1], include_pk=False, as_dict=True)\n >>> 'pk' in hmd_dict\n False\n ```\n \"\"\"\nfields: OrderedDict[str, Any] = OrderedDict(fixture_dict[\"fields\"])\nif include_pk:\nfields[\"pk\"] = fixture_dict[\"pk\"]\nfields.move_to_end(\"pk\", last=False)\nif as_dict:\nreturn fields\nelse:\nreturn tuple(fields.keys())\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixture_or_default_dict","title":"fixture_or_default_dict","text":"fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict\n
Return a FixtureDict
from fixture_dict
via key
index, else default_dict
.
Parameters:
Name Type Description Defaultkey
str
a str
to query fixture_dict
with
fixture_dict
dict[str, FixtureDict]
a dict
of str
to FixtureDict
, often generated by dict_from_list_fixture_fields
default_dict
FixtureDict | dict
a dict
to return if key
is not in fixture_dict
index
{}
Example >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n... NEWSPAPER_COLLECTION_METADATA)\n>>> hmd_dict: FixtureDict = fixture_or_default_dict(\n... 'hmd', newspaper_dict\n... )\n>>> fixture_or_default_dict(\n... 'hmd', NEWSPAPER_COLLECTION_METADATA\n... )\n{}\n>>> fixture_or_default_dict(\n... 'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n... )\n{'a': 'default'}\n
Source code in alto2txt2fixture/utils.py
def fixture_or_default_dict(\nkey: str,\nfixture_dict: dict[str, FixtureDict],\ndefault_dict: FixtureDict | dict = {},\n) -> FixtureDict | dict:\n\"\"\"Return a `FixtureDict` from ``fixture_list`` via ``key`` index, else ``default_dict``.\n Args:\n key:\n a `str` to query ``fixture_dict`` with\n fixture_dict: a `dict` of `str` to `FixtureDict`, often generated by\n ``dict_from_list_fixture_fields``\n default_dict: a `dict` to return if ``key`` is not in\n ``fixture_dict`` index\n Example:\n ```pycon\n >>> newspaper_dict: dict[str, FixtureDict] = dict_from_list_fixture_fields(\n ... NEWSPAPER_COLLECTION_METADATA)\n >>> hmd_dict: FixtureDict = fixture_or_default_dict(\n ... 'hmd', newspaper_dict\n ... )\n >>> fixture_or_default_dict(\n ... 'hmd', NEWSPAPER_COLLECTION_METADATA\n ... )\n {}\n >>> fixture_or_default_dict(\n ... 'hmd', NEWSPAPER_COLLECTION_METADATA, {'a': 'default'}\n ... )\n {'a': 'default'}\n ```\n \"\"\"\nif key in fixture_dict:\nreturn fixture_dict[key]\nelse:\nreturn default_dict\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.fixtures_dict2csv","title":"fixtures_dict2csv","text":"fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None\n
Saves fixtures generated by a generator to separate CSV
files.
This function takes an Iterable
or Generator
of fixtures and saves to separate CSV
files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file
parameter.
Parameters:
Name Type Description Defaultfixtures
Iterable[FixtureDict] | Generator[FixtureDict, None, None]
An Iterable
or Generator
of the fixtures to be saved.
prefix
str
A string prefix to be added to the file names of the saved fixtures.
''
output_path
PathLike | str
Path to folder fixtures are saved to
settings.OUTPUT
index
bool
Whether to include the DataFrame index column in the saved CSV files
False
max_elements_per_file
int
Maximum JSON
records saved in each file
settings.MAX_ELEMENTS_PER_FILE
Returns:
Type DescriptionNone
This function saves fixtures to files and does not return a value.
Example>>> from pandas import read_csv\n>>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n... prefix='test', output_path='tests/')\n>>> imported_fixture = read_csv('tests/test-1.csv')\n>>> imported_fixture.iloc[1]['pk']\n2\n>>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def fixtures_dict2csv(\nfixtures: Iterable[FixtureDict] | Generator[FixtureDict, None, None],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nindex: bool = False,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate `CSV` files.\n This function takes an `Iterable` or `Generator` of fixtures and saves them to\n separate `CSV` files. The fixtures are saved in batches, where each batch\n is determined by the ``max_elements_per_file`` parameter.\n Args:\n fixtures: An `Iterable` or `Generator` of the fixtures to be saved.\n prefix: A string prefix to be added to the file names of the\n saved fixtures.\n output_path: Path to folder fixtures are saved to\n index: Whether to include the `DataFrame` index column in saved `CSV` files\n max_elements_per_file: Maximum `JSON` records saved in each file\n Returns:\n This function saves fixtures to files and does not return a value.\n Example:\n ```pycon\n >>> from pandas import read_csv\n >>> fixtures_dict2csv(NEWSPAPER_COLLECTION_METADATA,\n ... prefix='test', output_path='tests/')\n >>> imported_fixture = read_csv('tests/test-1.csv')\n >>> imported_fixture.iloc[1]['pk']\n 2\n >>> imported_fixture.iloc[1][DATA_PROVIDER_INDEX]\n 'hmd'\n ```\n \"\"\"\ninternal_counter: int = 1\ncounter: int = 1\nlst: list = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in fixtures:\nlst.append(fixture_fields(item, as_dict=True))\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst: list = []\ninternal_counter = 1\ncounter += 1\nelse:\ndf: DataFrame = DataFrame.from_records(lst)\ndf.to_csv(Path(f\"{output_path}/{prefix}-{counter}.csv\"), index=index)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.gen_fixture_tables","title":"gen_fixture_tables","text":"gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]\n
Generator of rich.Table
instances from FixtureDict
configuration tables.
Parameters:
Name Type Description Defaultfixture_tables
dict[str, list[FixtureDict]]
dict
where key
is for Table
title and value
is a FixtureDict
{}
include_fixture_pk_column
bool
whether to include the pk
field from FixtureDict
True
Example >>> table_name: str = \"data_provider\"\n>>> tables = tuple(\n... gen_fixture_tables(\n... {table_name: NEWSPAPER_COLLECTION_METADATA}\n... ))\n>>> len(tables)\n1\n>>> assert tables[0].title == table_name\n>>> [column.header for column in tables[0].columns]\n['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n
Source code in alto2txt2fixture/utils.py
def gen_fixture_tables(\nfixture_tables: dict[str, list[FixtureDict]] = {},\ninclude_fixture_pk_column: bool = True,\n) -> Generator[Table, None, None]:\n\"\"\"Generator of `rich.Table` instances from `FixtureDict` configuration tables.\n Args:\n fixture_tables: `dict` where `key` is for `Table` title and `value` is a `FixtureDict`\n include_fixture_pk_column: whether to include the `pk` field from `FixtureDict`\n Example:\n ```pycon\n >>> table_name: str = \"data_provider\"\n >>> tables = tuple(\n ... gen_fixture_tables(\n ... {table_name: NEWSPAPER_COLLECTION_METADATA}\n ... ))\n >>> len(tables)\n 1\n >>> assert tables[0].title == table_name\n >>> [column.header for column in tables[0].columns]\n ['pk', 'name', 'code', 'legacy_code', 'collection', 'source_note']\n ```\n \"\"\"\nfor name, fixture_records in fixture_tables.items():\nfixture_table: Table = Table(title=name)\nfor i, fixture_dict in enumerate(fixture_records):\nif i == 0:\n[\nfixture_table.add_column(name)\nfor name in fixture_fields(fixture_dict, include_fixture_pk_column)\n]\nrow_values: tuple[str, ...] = tuple(\nstr(x) for x in (fixture_dict[\"pk\"], *fixture_dict[\"fields\"].values())\n)\nfixture_table.add_row(*row_values)\nyield fixture_table\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_chunked_zipfiles","title":"get_chunked_zipfiles","text":"get_chunked_zipfiles(path: Path) -> list\n
This function takes in a Path
object path
and returns a list of lists of zipfiles
sorted and chunked according to certain conditions defined in the settings
object (see settings.CHUNK_THRESHOLD
).
Note: the function will also skip zip files above a certain size, which can be specified in the settings
object (see settings.SKIP_FILE_SIZE
).
Parameters:
Name Type Description Defaultpath
Path
The input path where the zipfiles are located
requiredReturns:
Type Descriptionlist
A list of lists of zipfiles
, where each inner list represents a chunk of zipfiles.
alto2txt2fixture/utils.py
def get_chunked_zipfiles(path: Path) -> list:\n\"\"\"This function takes in a `Path` object `path` and returns a list of lists\n of `zipfiles` sorted and chunked according to certain conditions defined\n in the `settings` object (see `settings.CHUNK_THRESHOLD`).\n Note: the function will also skip zip files of a certain file size, which\n can be specified in the `settings` object (see `settings.SKIP_FILE_SIZE`).\n Args:\n path: The input path where the zipfiles are located\n Returns:\n A list of lists of `zipfiles`, each inner list represents a chunk of\n zipfiles.\n \"\"\"\nzipfiles = sorted(\npath.glob(\"*.zip\"),\nkey=lambda x: x.stat().st_size,\nreverse=settings.START_WITH_LARGEST,\n)\nzipfiles = [x for x in zipfiles if x.stat().st_size <= settings.SKIP_FILE_SIZE]\nif len(zipfiles) > settings.CHUNK_THRESHOLD:\nchunks = array_split(zipfiles, len(zipfiles) / settings.CHUNK_THRESHOLD)\nelse:\nchunks = [zipfiles]\nreturn chunks\n
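An illustrative sketch (not one of the package's own doctests; it assumes `settings.CHUNK_THRESHOLD` is positive): a directory containing no zip files yields a single empty chunk:

```pycon
>>> from pathlib import Path
>>> empty_dir = Path('tests/no-zips')  # hypothetical scratch folder
>>> empty_dir.mkdir(parents=True, exist_ok=True)
>>> get_chunked_zipfiles(empty_dir)
[[]]
```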
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_key","title":"get_key","text":"get_key(x: dict = dict(), on: list = []) -> str\n
Get a string key from a dictionary using values from specified keys.
Parameters:
Name Type Description Defaultx
dict
A dictionary from which the key is generated.
dict()
on
list
A list of keys from the dictionary that should be used to generate the key.
[]
Returns:
Type Descriptionstr
The generated string key.
Source code inalto2txt2fixture/utils.py
def get_key(x: dict = dict(), on: list = []) -> str:\n\"\"\"\n Get a string key from a dictionary using values from specified keys.\n Args:\n x: A dictionary from which the key is generated.\n on: A list of keys from the dictionary that should be used to\n generate the key.\n Returns:\n The generated string key.\n \"\"\"\nreturn f\"{'-'.join([str(x['fields'][y]) for y in on])}\"\n
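An illustrative sketch with invented field values, showing how the requested `fields` are joined with `-`:

```pycon
>>> get_key({'fields': {'code': 'hmd', 'kind': 'newspaper'}}, on=['code', 'kind'])
'hmd-newspaper'
```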
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_lockfile","title":"get_lockfile","text":"get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path\n
Provides the path to any given lockfile, which controls whether any existing files should be overwritten or not.
Parameters:
Name Type Description Defaultcollection
str
Collection folder name
requiredkind
NewspaperElements
Either newspaper
or issue
or item
dic
dict
A dictionary with required information for either kind
passed
Returns:
Type DescriptionPath
Path to the resulting lockfile
Source code inalto2txt2fixture/utils.py
def get_lockfile(collection: str, kind: NewspaperElements, dic: dict) -> Path:\n\"\"\"\n Provides the path to any given lockfile, which controls whether any\n existing files should be overwritten or not.\n Args:\n collection: Collection folder name\n kind: Either `newspaper` or `issue` or `item`\n dic: A dictionary with required information for either `kind` passed\n Returns:\n Path to the resulting lockfile\n \"\"\"\np: Path\nbase = Path(f\"cache-lockfiles/{collection}\")\nif kind == \"newspaper\":\np = base / f\"newspapers/{dic['publication_code']}\"\nelif kind == \"issue\":\np = base / f\"issues/{dic['publication__publication_code']}/{dic['issue_code']}\"\nelif kind == \"item\":\ntry:\nif dic.get(\"issue_code\"):\np = base / f\"items/{dic['issue_code']}/{dic['item_code']}\"\nelif dic.get(\"issue__issue_identifier\"):\np = base / f\"items/{dic['issue__issue_identifier']}/{dic['item_code']}\"\nexcept KeyError:\nerror(\"An unknown error occurred (in get_lockfile)\")\nelse:\np = base / \"lockfile\"\np.parent.mkdir(parents=True, exist_ok=True) if settings.WRITE_LOCKFILES else None\nreturn p\n
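An illustrative sketch with an invented publication code (the `PosixPath` repr assumes a POSIX system; the parent directory is only created when `settings.WRITE_LOCKFILES` is set):

```pycon
>>> get_lockfile('hmd', 'newspaper', {'publication_code': '0002083'})
PosixPath('cache-lockfiles/hmd/newspapers/0002083')
```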
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_now","title":"get_now","text":"get_now(as_str: bool = False) -> datetime.datetime | str\n
Return datetime.now()
as either a string or datetime
object.
Parameters:
Name Type Description Defaultas_str
bool
Whether to return now
time
as a str
or not, default: False
False
Returns:
Type Descriptiondatetime.datetime | str
datetime.now()
in pytz.UTC
time zone as a string if as_str
, else as a datetime.datetime
object.
alto2txt2fixture/utils.py
def get_now(as_str: bool = False) -> datetime.datetime | str:\n\"\"\"\n Return `datetime.now()` as either a string or `datetime` object.\n Args:\n as_str: Whether to return `now` `time` as a `str` or not, default: `False`\n Returns:\n `datetime.now()` in `pytz.UTC` time zone as a string if `as_str`, else\n as a `datetime.datetime` object.\n \"\"\"\nnow = datetime.datetime.now(tz=pytz.UTC)\nif as_str:\nreturn str(now)\nelse:\nassert isinstance(now, datetime.datetime)\nreturn now\n
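A quick sketch of both return types (not one of the package's own doctests):

```pycon
>>> get_now().tzinfo
<UTC>
>>> isinstance(get_now(as_str=True), str)
True
```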
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_path_from","title":"get_path_from","text":"get_path_from(p: str | Path) -> Path\n
Converts an input value into a Path object if it's not already one.
Parameters:
Name Type Description Defaultp
str | Path
The input value, which can be a string or a Path object.
requiredReturns:
Type DescriptionPath
The input value as a Path object.
Source code inalto2txt2fixture/utils.py
def get_path_from(p: str | Path) -> Path:\n\"\"\"\n Converts an input value into a Path object if it's not already one.\n Args:\n p: The input value, which can be a string or a Path object.\n Returns:\n The input value as a Path object.\n \"\"\"\nif isinstance(p, str):\np = Path(p)\nif not isinstance(p, Path):\nraise RuntimeError(f\"Unable to handle type: {type(p)}\")\nreturn p\n
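An illustrative sketch (the `PosixPath` repr assumes a POSIX system):

```pycon
>>> from pathlib import Path
>>> get_path_from('cache/newspapers')
PosixPath('cache/newspapers')
>>> get_path_from(Path('cache/newspapers'))
PosixPath('cache/newspapers')
```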
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.get_size_from_path","title":"get_size_from_path","text":"get_size_from_path(p: str | Path, raw: bool = False) -> str | float\n
Returns a human-readable string for any given file size.
Parameters:
Name Type Description Defaultp
str | Path
Path to read the size from
requiredraw
bool
Whether to return the file size as total number of bytes or a human-readable MB/GB amount
False
Returns:
Type Descriptionstr | float
The size as a str ending in MB or GB if not raw, otherwise the size in bytes as a float.
alto2txt2fixture/utils.py
def get_size_from_path(p: str | Path, raw: bool = False) -> str | float:\n\"\"\"\n Returns a nice string for any given file size.\n Args:\n p: Path to read the size from\n raw: Whether to return the file size as total number of bytes or\n a human-readable MB/GB amount\n Returns:\n Return `str` followed by `MB` or `GB` for size if not `raw` otherwise `float`.\n \"\"\"\np = get_path_from(p)\nbytes = p.stat().st_size\nif raw:\nreturn bytes\nrel_size: float | int | str = round(bytes / 1000 / 1000 / 1000, 1)\nassert not isinstance(rel_size, str)\nif rel_size < 0.5:\nrel_size = round(bytes / 1000 / 1000, 1)\nrel_size = f\"{rel_size}MB\"\nelse:\nrel_size = f\"{rel_size}GB\"\nreturn rel_size\n
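An illustrative sketch using a small scratch file (the `tests/` folder is an assumption mirroring the other examples on this page):

```pycon
>>> from pathlib import Path
>>> p = Path('tests/size-example.txt')
>>> p.parent.mkdir(parents=True, exist_ok=True)
>>> _ = p.write_text('x' * 1000)
>>> get_size_from_path(p, raw=True)
1000
>>> get_size_from_path(p)
'0.0MB'
```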
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.glob_filter","title":"glob_filter","text":"glob_filter(p: str) -> list\n
Return an ordered glob, filtering out any unwanted .DS_Store files from macOS.
Parameters:
Name Type Description Defaultp
str
Path to a directory to filter
requiredReturns:
Type Descriptionlist
Sorted list of files contained in the provided path without the ones
list
whose names start with a .
alto2txt2fixture/utils.py
def glob_filter(p: str) -> list:\n\"\"\"\n Return ordered glob, filtered out any pesky, unwanted .DS_Store from macOS.\n Args:\n p: Path to a directory to filter\n Returns:\n Sorted list of files contained in the provided path without the ones\n whose names start with a `.`\n \"\"\"\nreturn sorted([x for x in get_path_from(p).glob(\"*\") if not x.name.startswith(\".\")])\n
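An illustrative sketch (not one of the package's own doctests) showing both the sorting and the dotfile filtering:

```pycon
>>> from pathlib import Path
>>> d = Path('tests/glob-example')  # hypothetical scratch folder
>>> d.mkdir(parents=True, exist_ok=True)
>>> for name in ('.DS_Store', 'b.txt', 'a.txt'):
...     _ = (d / name).write_text('')
>>> [f.name for f in glob_filter(str(d))]
['a.txt', 'b.txt']
```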
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.list_json_files","title":"list_json_files","text":"list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]\n
List json
files under the path specified in p
.
Parameters:
Name Type Description Defaultp
str | Path
The path to search for json
files
drill
bool
A flag indicating whether to drill down the subdirectories or not. Default is False
False
exclude_names
list
A list of file names to exclude from the search result. Default is an empty list
[]
include_names
list
A list of file names to include in search result. If provided, the exclude_names
argument will be ignored. Default is an empty list
[]
Returns:
Type DescriptionGenerator[Path, None, None] | list[Path]
A list of Path
objects pointing to the found json
files
alto2txt2fixture/utils.py
def list_json_files(\np: str | Path,\ndrill: bool = False,\nexclude_names: list = [],\ninclude_names: list = [],\n) -> Generator[Path, None, None] | list[Path]:\n\"\"\"\n List `json` files under the path specified in ``p``.\n Args:\n p: The path to search for `json` files\n drill: A flag indicating whether to drill down the subdirectories\n or not. Default is ``False``\n exclude_names: A list of file names to exclude from the search\n result. Default is an empty list\n include_names: A list of file names to include in search result.\n If provided, the ``exclude_names`` argument will be ignored.\n Default is an empty list\n Returns:\n A list of `Path` objects pointing to the found `json` files\n \"\"\"\nq: str = \"**/*.json\" if drill else \"*.json\"\nfiles = get_path_from(p).glob(q)\nif exclude_names:\nfiles = list({x for x in files if x.name not in exclude_names})\nelif include_names:\nfiles = list({x for x in files if x.name in include_names})\nreturn sorted(files)\n
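An illustrative sketch of the `drill` flag (not one of the package's own doctests):

```pycon
>>> from pathlib import Path
>>> d = Path('tests/json-example')  # hypothetical scratch folder
>>> (d / 'sub').mkdir(parents=True, exist_ok=True)
>>> _ = (d / 'a.json').write_text('{}')
>>> _ = (d / 'sub' / 'b.json').write_text('{}')
>>> [f.name for f in list_json_files(d)]
['a.json']
>>> [f.name for f in list_json_files(d, drill=True)]
['a.json', 'b.json']
```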
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_json","title":"load_json","text":"load_json(p: str | Path, crash: bool = False) -> dict | list\n
Easier access to reading json
files.
Parameters:
Name Type Description Defaultp
str | Path
Path to read json
from
crash
bool
Whether the program should crash if there is a json
decode error, default: False
False
Returns:
Type Descriptiondict | list
The decoded json
contents from the path, but an empty dictionary
dict | list
if the file cannot be decoded and crash
is set to False
alto2txt2fixture/utils.py
def load_json(p: str | Path, crash: bool = False) -> dict | list:\n\"\"\"\n Easier access to reading `json` files.\n Args:\n p: Path to read `json` from\n crash: Whether the program should crash if there is a `json` decode\n error, default: ``False``\n Returns:\n The decoded `json` contents from the path, but an empty dictionary\n if the file cannot be decoded and ``crash`` is set to ``False``\n \"\"\"\np = get_path_from(p)\ntry:\nreturn json.loads(p.read_text())\nexcept json.JSONDecodeError:\nmsg = f\"Error: {p.read_text()}\"\nerror(msg, crash=crash)\nreturn {}\n
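An illustrative happy-path sketch; on a decode error with `crash=False` the function logs the problem and returns `{}` instead:

```pycon
>>> from pathlib import Path
>>> p = Path('tests/load-example.json')  # hypothetical scratch file
>>> p.parent.mkdir(parents=True, exist_ok=True)
>>> _ = p.write_text('{"code": "hmd"}')
>>> load_json(p)
{'code': 'hmd'}
```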
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.load_multiple_json","title":"load_multiple_json","text":"load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list\n
Load multiple json
files and return a list of their content.
Parameters:
Name Type Description Defaultp
str | Path
The path to search for json
files
drill
bool
A flag indicating whether to drill down the subdirectories or not. Default is False
False
filter_na
bool
A flag indicating whether to filter out the content that is None
. Default is True
.
True
crash
bool
A flag indicating whether to raise an exception when an error occurs while loading a json
file. Default is False
.
False
Returns:
Type Descriptionlist
A list
of the content of the loaded json
files.
alto2txt2fixture/utils.py
def load_multiple_json(\np: str | Path,\ndrill: bool = False,\nfilter_na: bool = True,\ncrash: bool = False,\n) -> list:\n\"\"\"\n Load multiple `json` files and return a list of their content.\n Args:\n p: The path to search for `json` files\n drill: A flag indicating whether to drill down the subdirectories\n or not. Default is `False`\n filter_na: A flag indicating whether to filter out the content that\n is `None`. Default is `True`.\n crash: A flag indicating whether to raise an exception when an\n error occurs while loading a `json` file. Default is `False`.\n Returns:\n A `list` of the content of the loaded `json` files.\n \"\"\"\nfiles = list_json_files(p, drill=drill)\ncontent = [load_json(x, crash=crash) for x in files]\nreturn [x for x in content if x] if filter_na else content\n
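An illustrative sketch combining the `list_json_files` and `load_json` behaviour documented above:

```pycon
>>> from pathlib import Path
>>> d = Path('tests/multi-json')  # hypothetical scratch folder
>>> d.mkdir(parents=True, exist_ok=True)
>>> _ = (d / 'a.json').write_text('{"pk": 1}')
>>> _ = (d / 'b.json').write_text('{"pk": 2}')
>>> load_multiple_json(d)
[{'pk': 1}, {'pk': 2}]
```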
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.lock","title":"lock","text":"lock(lockfile: Path) -> None\n
Writes an empty lockfile, after making sure the parent directory exists.
Parameters:
Name Type Description Defaultlockfile
Path
The path to the lock file to be created
requiredReturns:
Type DescriptionNone
None
Source code inalto2txt2fixture/utils.py
def lock(lockfile: Path) -> None:\n\"\"\"\n Writes an empty lockfile, after making sure the parent directory exists.\n Args:\n lockfile: The path to the lock file to be created\n Returns:\n None\n \"\"\"\nlockfile.parent.mkdir(parents=True, exist_ok=True)\nlockfile.write_text(\"\")\nreturn\n
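An illustrative sketch (the file is created empty):

```pycon
>>> from pathlib import Path
>>> lockfile = Path('tests/lockfile-example/newspaper')  # hypothetical path
>>> lock(lockfile)
>>> lockfile.exists()
True
```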
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.save_fixture","title":"save_fixture","text":"save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n
Saves fixtures generated by a generator to separate JSON files.
This function takes a generator and saves the generated fixtures to separate JSON files. The fixtures are saved in batches, where each batch is determined by the max_elements_per_file
parameter.
Parameters:
Name Type Description Defaultgenerator
Sequence | Generator
A generator that yields the fixtures to be saved.
[]
prefix
str
A string prefix to be added to the file names of the saved fixtures.
''
output_path
PathLike | str
Path to folder fixtures are saved to
settings.OUTPUT
max_elements_per_file
int
Maximum JSON
records saved in each file
settings.MAX_ELEMENTS_PER_FILE
add_created
bool
Whether to add created_at
and updated_at
timestamps
True
json_indent
int
Number of indent spaces per line in saved JSON
JSON_INDENT
Returns:
Type DescriptionNone
This function saves the fixtures to files but does not return any value.
Example>>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n... prefix='test', output_path='tests/')\n>>> imported_fixture = load_json('tests/test-1.json')\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n>>> 'created_at' in imported_fixture[1]['fields']\nTrue\n
Source code in alto2txt2fixture/utils.py
def save_fixture(\ngenerator: Sequence | Generator = [],\nprefix: str = \"\",\noutput_path: PathLike | str = settings.OUTPUT,\nmax_elements_per_file: int = settings.MAX_ELEMENTS_PER_FILE,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None:\n\"\"\"Saves fixtures generated by a generator to separate JSON files.\n This function takes a generator and saves the generated fixtures to\n separate JSON files. The fixtures are saved in batches, where each batch\n is determined by the ``max_elements_per_file`` parameter.\n Args:\n generator: A generator that yields the fixtures to be saved.\n prefix: A string prefix to be added to the file names of the\n saved fixtures.\n output_path: Path to folder fixtures are saved to\n max_elements_per_file: Maximum `JSON` records saved in each file\n add_created: Whether to add `created_at` and `updated_at` `timestamps`\n json_indent: Number of indent spaces per line in saved `JSON`\n Returns:\n This function saves the fixtures to files but does not return\n any value.\n Example:\n ```pycon\n >>> save_fixture(NEWSPAPER_COLLECTION_METADATA,\n ... prefix='test', output_path='tests/')\n >>> imported_fixture = load_json('tests/test-1.json')\n >>> imported_fixture[1]['pk']\n 2\n >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n >>> 'created_at' in imported_fixture[1]['fields']\n True\n ```\n \"\"\"\ninternal_counter = 1\ncounter = 1\nlst = []\nPath(output_path).mkdir(parents=True, exist_ok=True)\nfor item in generator:\nlst.append(item)\ninternal_counter += 1\nif internal_counter > max_elements_per_file:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\n# Save up some memory\ndel lst\ngc.collect()\n# Re-instantiate\nlst = []\ninternal_counter = 1\ncounter += 1\nelse:\nwrite_json(\np=Path(f\"{output_path}/{prefix}-{counter}.json\"),\no=lst,\nadd_created=add_created,\njson_indent=json_indent,\n)\nreturn\n
"},{"location":"reference/alto2txt2fixture/utils.html#alto2txt2fixture.utils.write_json","title":"write_json","text":"write_json(\np: str | Path,\no: dict,\nadd_created: bool = True,\njson_indent: int = JSON_INDENT,\n) -> None\n
Easier access to writing json
files. Checks whether parent exists.
Parameters:
Name Type Description Defaultp
str | Path
Path to write json
to
o
dict
Object to write to json
file
add_created
bool
If set to True will add created_at
and updated_at
to the dictionary's fields. If created_at
and updated_at
already exist in the fields, they will be forcefully updated.
True
json_indent
int
What indetation format to write out JSON
file in
JSON_INDENT
Returns:
Type DescriptionNone
None
Example>>> path = 'test-write-json/example.json'\n>>> write_json(p=path,\n... o=NEWSPAPER_COLLECTION_METADATA,\n... add_created=True)\n>>> imported_fixture = load_json(path)\n>>> imported_fixture[1]['pk']\n2\n>>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n'hmd'\n
Source code in alto2txt2fixture/utils.py
def write_json(\np: str | Path, o: dict, add_created: bool = True, json_indent: int = JSON_INDENT\n) -> None:\n\"\"\"\n Easier access to writing `json` files. Checks whether parent exists.\n Args:\n p: Path to write `json` to\n o: Object to write to `json` file\n add_created:\n If set to True will add `created_at` and `updated_at`\n to the dictionary's fields. If `created_at` and `updated_at`\n already exist in the fields, they will be forcefully updated.\n json_indent:\n What indentation format to write out `JSON` file in\n Returns:\n None\n Example:\n ```pycon\n >>> path = 'test-write-json/example.json'\n >>> write_json(p=path,\n ... o=NEWSPAPER_COLLECTION_METADATA,\n ... add_created=True)\n >>> imported_fixture = load_json(path)\n >>> imported_fixture[1]['pk']\n 2\n >>> imported_fixture[1]['fields'][DATA_PROVIDER_INDEX]\n 'hmd'\n ```\n \"\"\"\np = get_path_from(p)\nif not (isinstance(o, dict) or isinstance(o, list)):\nraise RuntimeError(f\"Unable to handle data of type: {type(o)}\")\ndef _append_created_fields(o: dict):\n\"\"\"Add `created_at` and `updated_at` fields to a `dict` with `FixtureDict` values.\"\"\"\nreturn dict(\n**{k: v for k, v in o.items() if not k == \"fields\"},\nfields=dict(\n**{\nk: v\nfor k, v in o[\"fields\"].items()\nif not k == \"created_at\" and not k == \"updated_at\"\n},\n**{\"created_at\": NOW_str, \"updated_at\": NOW_str},\n),\n)\ntry:\nif add_created and isinstance(o, dict):\no = _append_created_fields(o)\nelif add_created and isinstance(o, list):\no = [_append_created_fields(x) for x in o]\nexcept KeyError:\nerror(\"An unknown error occurred (in write_json)\")\np.parent.mkdir(parents=True, exist_ok=True)\np.write_text(json.dumps(o, indent=json_indent))\nreturn\n
"},{"location":"tutorial/first-steps.html","title":"First Steps","text":""},{"location":"tutorial/first-steps.html#installing","title":"Installing","text":"The installation process should be fairly easy to take care of, using poetry
:
$ poetry install\n
However, this is only the first step in the process. As the script works through the alto2txt
collections, you will either need to choose the slower option \u2014 mounting them to your computer (using blobfuse
) \u2014\u00a0or the faster option \u2014 downloading the required zip files from the Azure storage to your local hard drive. In the two following sections, both of those options are described.
alto2txt
to the program","text":""},{"location":"tutorial/first-steps.html#downloading-local-copies-of-alto2txt-on-your-computer","title":"Downloading local copies of alto2txt
on your computer","text":"This option will take up a lot of hard drive space
As of the time of writing, downloading all of alto2txt
\u2019s metadata takes up about 185GB on your local drive.
You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.
"},{"location":"tutorial/first-steps.html#step-1-log-in-to-azure-using-microsoft-azure-storage-explorer","title":"Step 1: Log in to Azure using Microsoft Azure Storage Explorer","text":"Microsoft Azure Storage Explorer (MASE) is a great and free tool for downloading content off Azure. Your first step is to download and install this product on your local computer.
Once you have opened MASE, you will need to sign into the appropriate Azure account.
"},{"location":"tutorial/first-steps.html#step-2-download-the-alto2txt-blob-container-to-your-hard-drive","title":"Step 2: Download thealto2txt
blob container to your hard drive","text":"On your left-hand side, you should see a menu where you can navigate to the correct \u201cblob container\u201d: Living with Machines
> Storage Accounts
> alto2txt
> Blob Containers
:
You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:
Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata
directory only onto your computer:
Select all of the files and press the download button:
Make sure you save all the zip files inside the correct local folder:
The \u201cActivities\u201d bar will now show you the progress and speed:
"},{"location":"tutorial/first-steps.html#mounting-alto2txt-on-your-computer","title":"Mountingalto2txt
on your computer","text":"This option will only work on a Linux or UNIX computer
If you have a mac, your only option is the one below.
"},{"location":"tutorial/first-steps.html#step-1-install-blobfuse","title":"Step 1: Install BlobFuse","text":"Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.
"},{"location":"tutorial/first-steps.html#step-2-set-up-sas-tokens","title":"Step 2: Set up SAS tokens","text":"Follow the instructions for setting up access to your Azure storage account.
"},{"location":"tutorial/first-steps.html#step-3-mount-your-blobs","title":"Step 3: Mount your blobs","text":"TODO #3: Write this section.
Note that you can also search on the internet for ideas on how to create local scripts to facilitate easier connection next time.
"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..1cce305 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,83 @@ + +The installation process should be fairly easy to take care of, using poetry
:
However, this is only the first step in the process. As the script works through the alto2txt
collections, you will either need to choose the slower option — mounting them to your computer (using blobfuse
) — or the faster option — downloading the required zip files from the Azure storage to your local hard drive. In the two following sections, both of those options are described.
alto2txt
to the programalto2txt
on your computerThis option will take up a lot of hard drive space
+As of the time of writing, downloading all of alto2txt
’s metadata takes up about 185GB on your local drive.
You do not have to download all of the collections or all of the zip files for each collection, as long as you are aware that the resulting fixtures will be limited in scope.
+Microsoft Azure Storage Explorer (MASE) is a great and free tool for downloading content off Azure. Your first step is to download and install this product on your local computer.
+Once you have opened MASE, you will need to sign into the appropriate Azure account.
+alto2txt
blob container to your hard driveOn your left-hand side, you should see a menu where you can navigate to the correct “blob container”: Living with Machines
> Storage Accounts
> alto2txt
> Blob Containers
:
You will want to replicate the same structure as the Blob Container itself in a folder on your hard drive:
+ +Once you have the structure set up, you are ready to download all of the files needed. For each of the blob containers, make sure that you download the metadata
directory only onto your computer:
Select all of the files and press the download button:
+ +Make sure you save all the zip files inside the correct local folder:
+ +The “Activities” bar will now show you the progress and speed:
+ +alto2txt
on your computerThis option will only work on a Linux or UNIX computer
+If you have a mac, your only option is the one below.
+Follow the instructions for installing BlobFuse and the instructions for how to prepare your drive for mounting.
+Follow the instructions for setting up access to your Azure storage account.
+TODO #3: Write this section.
+Note that you can also search on the internet for ideas on how to create local scripts to facilitate easier connection next time.
+ + + + + + +The examples below follow standard settings
+If you choose other settings for when you run the program, your output directory may look different from the information on this page.
+Reports are automatically generated with a unique hash as the overarching folder structure. Inside the reports
directory, you’ll find a JSON file for each alto2txt
directory (organised by NLP identifier).
The report structure, thus, looks like this:
+ +The JSON file has some good troubleshooting information. You’ll find that the contents are structured as a Python dictionary
(or JavaScript Object
). Here is an example:
Here is an explanation of each of the keys in the dictionary:
+Key | +Explanation | +Data type | +
---|---|---|
path |
+The input path for the zip file that is being converted. | +string |
+
bytes |
+The size of the input zip file represented in bytes. | +integer |
+
size |
+The size of the input zip file represented in a human-readable string. | +string |
+
contents |
+#TODO #3 | +integer |
+
start |
+Date and time when processing started (see also end below). |
+datestring |
+
newspaper_paths |
+#TODO #3 | +list (string ) |
+
publication_codes |
+A list of the NLPs that are contained in the input zip file. | +list (string ) |
+
issue_paths |
+A list of all the issue paths that are contained in the cache directory. | +list (string ) |
+
item_paths |
+A list of all the item paths that are contained in the cache directory. | +list (string ) |
+
end |
+Date and time when processing ended (see also start above). |
+datestring |
+
seconds |
+Seconds that the script spent interpreting the zip file (should be added to the microseconds below). |
+integer |
+
microseconds |
+Microseconds that the script spent interpreting the zip file (should be added to the seconds above). |
+integer |
+
The most important output of the script is contained in the fixtures
directory. This directory contains JSON files for all the different tables in the corresponding Django metadata database (i.e. DataProvider
, Digitisation
, Ingest
, Issue
, Newspaper
, and Item
). The numbering at the end of each file indicates the order of the files as they are divided into a maximum of 2e6
elements*:
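An illustrative sketch of the layout (file names and counts are invented; they will depend on your collections):

```
fixtures/
├── DataProvider-1.json
├── Digitisation-1.json
├── Ingest-1.json
├── Issue-1.json
├── Item-1.json
├── Item-2.json
└── Newspaper-1.json
```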
Each JSON file contains a Python-like list
(JavaScript Array
) of dictionaries
(JavaScript Objects
), which have a primary key (pk
), the related database model (in the example below the Django newspapers
app’s newspaper
table), and a nested dictionary
/Object
which contains all the values for the database’s table entry:
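A hypothetical entry sketching that shape (the exact field names depend on the lwmdb schema and are invented here for illustration):

```json
[
  {
    "pk": 1,
    "model": "newspapers.newspaper",
    "fields": {
      "publication_code": "0002083",
      "title": "The Example Gazette",
      "created_at": "2023-05-31T12:14:02+00:00",
      "updated_at": "2023-05-31T12:14:02+00:00"
    }
  }
]
```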
* The maximum elements per file can be adjusted in the settings.py
file’s settings
object’s MAX_ELEMENTS_PER_FILE
value.