diff --git a/usgpo/download-files.py b/usgpo/download-files.py
index 69dad32..11dbd89 100644
--- a/usgpo/download-files.py
+++ b/usgpo/download-files.py
@@ -4,9 +4,10 @@
 
 import jsonlines
 import trafilatura
+from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
-
 from utils import api_query
+
 from licensed_pile import logs
 from licensed_pile.licenses import PermissiveLicenses
 from licensed_pile.write import to_dolma
@@ -44,25 +45,55 @@ def download_file(api_key, file_url):
     return text
 
 
+def parse_html(html):
+    # Most documents are pre-formatted text inside of a <pre> tag
+    # For the rest of the documents, we use trafilatura to extract markdown
+    soup = BeautifulSoup(html, "html.parser")
+    pre_tag = soup.find("pre")
+    if pre_tag:
+        text = pre_tag.get_text()
+    else:
+        text = trafilatura.extract(html, output_format="markdown")
+    return text
+
+
 def construct_record(api_key, file):
-    file_url = file["links"].get("txtLink")
-    if file_url is None:
+    logger = logs.get_logger("usgpo")
+    try:
+        links = file.get("links")
+        if links is None:
+            return None
+
+        file_url = links.get("txtLink")
+        # Occasionally there will be multiple txtLinks pointing to the same URL. Just take the first.
+        if isinstance(file_url, list):
+            file_url = file_url[0]
+
+        if file_url is None:
+            return None
+
+        html = download_file(api_key, file_url)
+        text = parse_html(html)
+
+        if text is None or len(text) == 0:
+            return None
+
+        return {
+            "id": file["package_id"],
+            "title": file["title"],
+            "date": file["date"],
+            "author": file["author"],
+            "publisher": file["publisher"],
+            "category": file["category"],
+            "text": text,
+            "source": SOURCE_NAME,
+            "added": datetime.datetime.utcnow().isoformat(),
+            "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to download package {file['package_id']}: {e}")
         return None
 
-    html = download_file(api_key, file_url)
-    text = trafilatura.extract(html)
-
-    return {
-        "id": file["package_id"],
-        "title": file["title"],
-        "date": file["date"],
-        "author": file["author"],
-        "publisher": file["publisher"],
-        "category": file["category"],
-        "text": text,
-        "source": SOURCE_NAME,
-        "added": datetime.datetime.utcnow().isoformat(),
-        "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
-    }
 
 def generate_records(args):
diff --git a/usgpo/get-links.py b/usgpo/get-links.py
index b6ae62d..3ed37a6 100644
--- a/usgpo/get-links.py
+++ b/usgpo/get-links.py
@@ -5,8 +5,8 @@
 
 import jsonlines
 from tqdm.auto import tqdm
-
 from utils import api_query
+
 from licensed_pile import logs
 
 
@@ -110,16 +110,21 @@ def get_package_metadata(api_key, package):
 
 def main(args):
     logger = logs.get_logger("usgpo")
     os.makedirs(args.output_dir, exist_ok=True)
-
+    # Get packages from the specified USGPO collections from `args.start_date` to the current day
     logger.info(f"Getting packages from the following collections: {args.collections}")
     packages = get_packages(args.api_key, args.collections, args.start_date)
-
+
     logger.info(f"Getting package metadata and writing out to {args.output_dir}")
-    with jsonlines.open(os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True) as writer:
+    with jsonlines.open(
+        os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True
+    ) as writer:
         # Spawn multiple worker threads to get the metadata associated with all packages
         with ThreadPoolExecutor(max_workers=args.workers) as executor:
-            metadata_futures_to_package = {executor.submit(get_package_metadata, args.api_key, package): package for package in packages}
+            metadata_futures_to_package = {
+                executor.submit(get_package_metadata, args.api_key, package): package
+                for package in packages
+            }
 
             # Write out package metadata to file
             for metadata_future in tqdm(as_completed(metadata_futures_to_package)):