diff --git a/usgpo/download-files.py b/usgpo/download-files.py
index 69dad32..11dbd89 100644
--- a/usgpo/download-files.py
+++ b/usgpo/download-files.py
@@ -4,9 +4,10 @@
 
 import jsonlines
 import trafilatura
+from bs4 import BeautifulSoup
 from tqdm.auto import tqdm
-
 from utils import api_query
+
 from licensed_pile import logs
 from licensed_pile.licenses import PermissiveLicenses
 from licensed_pile.write import to_dolma
@@ -44,25 +45,55 @@ def download_file(api_key, file_url):
     return text
 
 
+def parse_html(html):
+    # Most documents are pre-formatted text inside of a <pre> tag
+    # For the rest of the documents, we use trafilatura to extract to markdown
+    soup = BeautifulSoup(html, "html.parser")
+    pre_tag = soup.find("pre")
+    if pre_tag:
+        text = pre_tag.get_text()
+    else:
+        text = trafilatura.extract(html, output_format="markdown")
+    return text
+
+
 def construct_record(api_key, file):
-    file_url = file["links"].get("txtLink")
-    if file_url is None:
+    logger = logs.get_logger("usgpo")
+    try:
+        links = file.get("links")
+        if links is None:
+            return None
+
+        file_url = links.get("txtLink")
+        # Occasionally there will be multiple txtLinks pointing to the same URL. Just take the first.
+        if isinstance(file_url, list):
+            file_url = file_url[0]
+
+        if file_url is None:
+            return None
+
+        html = download_file(api_key, file_url)
+        text = parse_html(html)
+
+        if text is None or len(text) == 0:
+            return None
+
+        return {
+            "id": file["package_id"],
+            "title": file["title"],
+            "date": file["date"],
+            "author": file["author"],
+            "publisher": file["publisher"],
+            "category": file["category"],
+            "text": text,
+            "source": SOURCE_NAME,
+            "added": datetime.datetime.utcnow().isoformat(),
+            "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to download package {file['package_id']}: {e}")
         return None
-    html = download_file(api_key, file_url)
-    text = trafilatura.extract(html)
-
-    return {
-        "id": file["package_id"],
-        "title": file["title"],
-        "date": file["date"],
-        "author": file["author"],
-        "publisher": file["publisher"],
-        "category": file["category"],
-        "text": text,
-        "source": SOURCE_NAME,
-        "added": datetime.datetime.utcnow().isoformat(),
-        "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
-    }
 
 
 def generate_records(args):
diff --git a/usgpo/get-links.py b/usgpo/get-links.py
index b6ae62d..3ed37a6 100644
--- a/usgpo/get-links.py
+++ b/usgpo/get-links.py
@@ -5,8 +5,8 @@
 
 import jsonlines
 from tqdm.auto import tqdm
-
 from utils import api_query
+
 from licensed_pile import logs
 
 
@@ -110,16 +110,21 @@ def get_package_metadata(api_key, package):
 def main(args):
     logger = logs.get_logger("usgpo")
     os.makedirs(args.output_dir, exist_ok=True)
-    
+
     # Get packages from the specified USGPO collections from `args.start_date` to current day
     logger.info(f"Getting packages from the following collections: {args.collections}")
     packages = get_packages(args.api_key, args.collections, args.start_date)
-    
+
     logger.info(f"Getting package metadata and writing out to {args.output_dir}")
-    with jsonlines.open(os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True) as writer:
+    with jsonlines.open(
+        os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True
+    ) as writer:
         # Spawn multiple worker threads to get the metadata associated with all packages
         with ThreadPoolExecutor(max_workers=args.workers) as executor:
-            metadata_futures_to_package = {executor.submit(get_package_metadata, args.api_key, package): package for package in packages}
+            metadata_futures_to_package = {
+                executor.submit(get_package_metadata, args.api_key, package): package
+                for package in packages
+            }
 
             # Write out package metadata to file
             for metadata_future in tqdm(as_completed(metadata_futures_to_package)):