
- black + isort
- wrap download-files construct_record in try/except
- if a <pre> tag is available, use the pre-formatted text; otherwise use
  trafilatura.extract to convert to markdown
nkandpa2 committed Jun 25, 2024
1 parent f14783b commit 8d46b15
Showing 2 changed files with 59 additions and 23 deletions.
67 changes: 49 additions & 18 deletions usgpo/download-files.py
@@ -4,9 +4,10 @@

import jsonlines
import trafilatura
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from utils import api_query

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma
@@ -44,25 +45,55 @@ def construct_record(api_key, file):
    return text


def parse_html(html):
    # Most documents are pre-formatted text inside of a <pre> tag.
    # For the rest of the documents, we use trafilatura to extract to markdown.
    soup = BeautifulSoup(html, "html.parser")
    pre_tag = soup.find("pre")
    if pre_tag:
        text = pre_tag.get_text()
    else:
        text = trafilatura.extract(html, output_format="markdown")
    return text
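
For reference, a minimal sketch (not part of the commit) of how the new fallback behaves on hypothetical inputs, assuming the parse_html and imports defined above:

govinfo_html = "<html><body><pre>SEC. 1. SHORT TITLE.</pre></body></html>"
print(parse_html(govinfo_html))  # "SEC. 1. SHORT TITLE." -- taken verbatim from the <pre> tag

article_html = "<html><body><article><h1>A Title</h1><p>Body text.</p></article></body></html>"
# No <pre> tag here, so this falls through to trafilatura.extract(..., output_format="markdown"),
# which may return None for trivial inputs like this one.
print(parse_html(article_html))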


def construct_record(api_key, file):
    file_url = file["links"].get("txtLink")
    if file_url is None:
logger = logs.get_logger("usgpo")
try:
links = file.get("links")
if links is None:
return None

file_url = links.get("txtLink")
# Occassionally there will be multiple txtLinks pointing to the same URL. Just take the first.
if isinstance(file_url, list):
file_url = file_url[0]

if file_url is None:
return None

html = download_file(api_key, file_url)
text = parse_html(html)

if text is None or len(text) == 0:
return None

return {
"id": file["package_id"],
"title": file["title"],
"date": file["date"],
"author": file["author"],
"publisher": file["publisher"],
"category": file["category"],
"text": text,
"source": SOURCE_NAME,
"added": datetime.datetime.utcnow().isoformat(),
"metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
}

except Exception as e:
logger.error(f"Failed to download package {file['package_id']}: {e}")
return None
    html = download_file(api_key, file_url)
    text = trafilatura.extract(html)

    return {
        "id": file["package_id"],
        "title": file["title"],
        "date": file["date"],
        "author": file["author"],
        "publisher": file["publisher"],
        "category": file["category"],
        "text": text,
        "source": SOURCE_NAME,
        "added": datetime.datetime.utcnow().isoformat(),
        "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
    }


def generate_records(args):
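Since construct_record now returns None when a package fails to download or parse, downstream code presumably filters those records out. A hedged sketch of such a consumer (generate_records' actual body is collapsed above; this is illustrative only):

def generate_records_sketch(api_key, files):
    # Hypothetical consumer: skip packages whose download or parse failed,
    # mirroring the new None-on-error contract of construct_record.
    for file in files:
        record = construct_record(api_key, file)
        if record is not None:
            yield record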
15 changes: 10 additions & 5 deletions usgpo/get-links.py
Original file line number Diff line number Diff line change
@@ -5,8 +5,8 @@

import jsonlines
from tqdm.auto import tqdm

from utils import api_query

from licensed_pile import logs


@@ -110,16 +110,21 @@ def get_package_metadata(api_key, package):
def main(args):
    logger = logs.get_logger("usgpo")
    os.makedirs(args.output_dir, exist_ok=True)

    # Get packages from the specified USGPO collections from `args.start_date` to the current day
    logger.info(f"Getting packages from the following collections: {args.collections}")
    packages = get_packages(args.api_key, args.collections, args.start_date)

    logger.info(f"Getting package metadata and writing out to {args.output_dir}")
    with jsonlines.open(os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True) as writer:
    with jsonlines.open(
        os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True
    ) as writer:
        # Spawn multiple worker threads to get the metadata associated with all packages
        with ThreadPoolExecutor(max_workers=args.workers) as executor:
            metadata_futures_to_package = {executor.submit(get_package_metadata, args.api_key, package): package for package in packages}
            metadata_futures_to_package = {
                executor.submit(get_package_metadata, args.api_key, package): package
                for package in packages
            }

            # Write out package metadata to file
            for metadata_future in tqdm(as_completed(metadata_futures_to_package)):
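The metadata_futures_to_package dict maps each submitted future back to its originating package, which is what lets the (collapsed) writer loop above recover the package for each completed future. A small self-contained sketch of the same pattern, with a toy fetch function standing in for get_package_metadata:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_metadata(package):
    # Toy stand-in for get_package_metadata; returns a trivial dict.
    return {"package_id": package}

packages = ["PKG-1", "PKG-2", "PKG-3"]
with ThreadPoolExecutor(max_workers=2) as executor:
    futures_to_package = {executor.submit(fetch_metadata, p): p for p in packages}
    for future in as_completed(futures_to_package):
        package = futures_to_package[future]  # map the finished future back to its input
        print(package, future.result())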
