Skip to content

Commit

Permalink
run linters over all files
Browse files Browse the repository at this point in the history
  • Loading branch information
blester125 committed Apr 10, 2024
1 parent aa76251 commit 5485376
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 36 deletions.
37 changes: 26 additions & 11 deletions bhl/build-index.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,35 @@
"""Build index of Biodiversity Heritage Library books"""

import argparse
import xml.etree.ElementTree as ET
import os
import logging
import json
import logging
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

from tqdm.auto import tqdm


logging.basicConfig(level=logging.INFO, format="build-index: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s")
logging.basicConfig(
level=logging.INFO,
format="build-index: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


SOURCE_NAME = "biodiversity-heritage-library"


def parse_args():
parser = argparse.ArgumentParser("Biodiversity Heritage Library index builder")
parser.add_argument("--metadata-file", default=f"data/{SOURCE_NAME}/raw/metadata/bhlitem.mods.xml", help="Path to XML metadata file")
parser.add_argument("--output-dir", default=f"data/{SOURCE_NAME}/raw/", help="Path to output directory")
parser.add_argument(
"--metadata-file",
default=f"data/{SOURCE_NAME}/raw/metadata/bhlitem.mods.xml",
help="Path to XML metadata file",
)
parser.add_argument(
"--output-dir",
default=f"data/{SOURCE_NAME}/raw/",
help="Path to output directory",
)
return parser.parse_args()


Expand All @@ -28,7 +38,7 @@ def main(args):

logging.info(f"Loading metadata file from {args.metadata_file}")
metadata = ET.parse(args.metadata_file).getroot()

num_entries = 0
pbar = tqdm(metadata)
for entry in pbar:
Expand All @@ -45,12 +55,17 @@ def main(args):
break

pbar.set_postfix({"Entries w/ License Info": num_entries})

logging.info("Computing summary statistics")
counts = {license: len(uris) for license, uris in index.items()}
print("\nLicense Summary Statistics:")
print(json.dumps(dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])), indent=4))

print(
json.dumps(
dict(sorted(counts.items(), reverse=True, key=lambda entry: entry[1])),
indent=4,
)
)

logging.info(f"Saving index to {args.output_dir}")
os.makedirs(args.output_dir, exist_ok=True)
with open(os.path.join(args.output_dir, "index.json"), "w") as f:
Expand Down
66 changes: 49 additions & 17 deletions bhl/extract-files.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
"""Extract license-whitelisted Biodiversity Heritage Library files from the content archive"""

import argparse
import tarfile
import os
import logging
import json
import logging
import os
import tarfile
from collections import defaultdict

from tqdm.auto import tqdm


logging.basicConfig(level=logging.INFO, format="extract-files: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s")
logging.basicConfig(
level=logging.INFO,
format="extract-files: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


SOURCE_NAME = "biodiversity-heritage-library"


def parse_args():
parser = argparse.ArgumentParser("Biodiversity Heritage Library file extractor")
parser.add_argument("--index-file", default=f"data/{SOURCE_NAME}/raw/index.json", help="Path to JSON index")
parser.add_argument("--whitelist-file", default="bhl/license_whitelist.json", help="Path to JSON file of whitelisted license strings")
parser.add_argument("--content-file", default=f"data/{SOURCE_NAME}/raw/data/bhl-ocr-20230823.tar.bz2", help="Path to tar-ed and bz2 compressed content file")
parser.add_argument("--output-dir", default=f"data/{SOURCE_NAME}/raw/extracted_data", help="Path to output directory")
parser.add_argument(
"--index-file",
default=f"data/{SOURCE_NAME}/raw/index.json",
help="Path to JSON index",
)
parser.add_argument(
"--whitelist-file",
default="bhl/license_whitelist.json",
help="Path to JSON file of whitelisted license strings",
)
parser.add_argument(
"--content-file",
default=f"data/{SOURCE_NAME}/raw/data/bhl-ocr-20230823.tar.bz2",
help="Path to tar-ed and bz2 compressed content file",
)
parser.add_argument(
"--output-dir",
default=f"data/{SOURCE_NAME}/raw/extracted_data",
help="Path to output directory",
)
return parser.parse_args()


Expand All @@ -30,18 +49,26 @@ def main(args):
logging.info(f"Loading index file from {args.index_file}")
with open(args.index_file, "r") as f:
index = json.load(f)

logging.info(f"Loading license whitelist file from {args.whitelist_file}")
with open(args.whitelist_file, "r") as f:
whitelist = json.load(f)

logging.info(f"Loading content file from {args.content_file}")
content_file = tarfile.open(args.content_file, "r:bz2")

logging.info("Constructing list of all whitelisted items")
whitelisted_items = set(sum([[uri.split("/")[-1].zfill(6) for uri in index[license]] for license in whitelist], start=[]))
whitelisted_items = set(
sum(
[
[uri.split("/")[-1].zfill(6) for uri in index[license]]
for license in whitelist
],
start=[],
)
)
logging.info(f"Found {len(whitelisted_items)} whitelisted items")

num_extracted_files = 0
extracted_size = 0
pbar = tqdm(content_file)
Expand All @@ -50,12 +77,17 @@ def main(args):
continue
item_id = item_info.path.split("/")[2]
if item_id in whitelisted_items:
content_file.extract(item_info, path=args.output_dir)
content_file.extract(item_info, path=args.output_dir)
num_extracted_files += 1
extracted_size += item_info.size

pbar.set_postfix({"Extracted Files": num_extracted_files, "Extracted Size": f"{extracted_size / 2**30:.3f} GB"})


pbar.set_postfix(
{
"Extracted Files": num_extracted_files,
"Extracted Size": f"{extracted_size / 2**30:.3f} GB",
}
)


if __name__ == "__main__":
args = parse_args()
Expand Down
30 changes: 22 additions & 8 deletions bhl/to-dolma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,32 @@
import argparse
import datetime
import glob
import logging
import json
import logging
import os

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma


logging.basicConfig(level=logging.INFO, format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s")
logging.basicConfig(
level=logging.INFO,
format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s",
)


BASE_URL = "https://www.biodiversitylibrary.org/page"
SOURCE_NAME = "biodiversity-heritage-library"

parser = argparse.ArgumentParser(description="Convert data to dolma.")
parser.add_argument(
"--data", default=f"data/{SOURCE_NAME}/extracted_data", help="Path to the directory containing BHL data."
"--data",
default=f"data/{SOURCE_NAME}/extracted_data",
help="Path to the directory containing BHL data.",
)
parser.add_argument(
"--output_dir",
default=f"data/{SOURCE_NAME}/v0",
help="Where the dolma formatted data goes."
help="Where the dolma formatted data goes.",
)
parser.add_argument(
"--filename", default="bhl.jsonl.gz", help="The base filename for the BHL data"
Expand All @@ -34,8 +38,12 @@
)


def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL):
item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[0].split("-")
def format_dolma(
content_file: str, source_name: str = SOURCE_NAME, base_url: str = BASE_URL
):
item_id, page_id, page_num = os.path.splitext(os.path.basename(content_file))[
0
].split("-")
with open(content_file) as f:
try:
text = f.read()
Expand All @@ -60,7 +68,13 @@ def format_dolma(content_file: str, source_name: str = SOURCE_NAME, base_url: st

def main(args):
# Use iterators so we don't have to load the whole dataset in memory.
content_pages = filter(lambda x: x is not None, map(format_dolma, glob.iglob(os.path.join(args.data, "**", "*.txt"), recursive=True)))
content_pages = filter(
lambda x: x is not None,
map(
format_dolma,
glob.iglob(os.path.join(args.data, "**", "*.txt"), recursive=True),
),
)
to_dolma(content_pages, args.output_dir, args.filename, args.shard_size)


Expand Down
1 change: 1 addition & 0 deletions gutenberg/possible-rights.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import argparse
import glob

import tqdm
from rdflib import Graph

Expand Down

0 comments on commit 5485376

Please sign in to comment.