webrecorder · anjackson · Sep 13, 2023 · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/wacz/main.py b/wacz/main.py
@@ -1,11 +1,12 @@
 from argparse import ArgumentParser, RawTextHelpFormatter
 from io import BytesIO, StringIO, TextIOWrapper
-import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources
+import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, zlib
 from wacz.waczindexer import WACZIndexer
 from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
 from wacz.validate import Validation, OUTDATED_WACZ
 from wacz.util import validateJSON, get_py_wacz_version
 from warcio.timeutils import iso_date_to_timestamp
+from warcio.bufferedreaders import DecompressingBufferedReader
 
 """
 WACZ Generator
@@ -103,6 +104,26 @@ def main(args=None):
         help="URL of verify server to verify the signature, if any, in dapackage-digest.json",
     )
 
+    index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
+    index.add_argument(
+        "-f", "--file", required=True, help="The WACZ file to read and index."
+    )
+    index.add_argument(
+        "-o",
+        "--output-file",
+        required=False,
+        default="-",
+        help="The CDXJ output file. Defaults to '-' which means to print to STDOUT.",
+    )
+    index.add_argument(
+        "-p",
+        "--wacz-prefix",
+        required=False,
+        default=None,
+        help="Prefix to use when referring to the WACZ file from the CDXJ index. e.g. if the prefix is '/disk/path/' and the WACZ is called example.wacz then the CDXJ file will refer to '/disk/path/example.wacz'.",
+    )
+    index.set_defaults(func=index_wacz)
+
     cmd = parser.parse_args(args=args)
 
     if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
@@ -154,6 +175,60 @@ def validate_wacz(res):
     return 0
 
 
+def index_wacz(res):
+    # Open up the ZIP:
+    with zipfile.ZipFile(res.file) as zf:
+        # Determine the WACZ filename/path to use:
+        wacz_path = os.path.basename(res.file)
+        # Allow users to specify the prefix where the WACZ is stored:
+        if res.wacz_prefix:
+            wacz_path = f"{res.wacz_prefix}{wacz_path}"
+
+        # Get a look-up table for offsets for each archive file:
+        archive_offsets = {}
+        archives_prefix = "archive/"
+        for zinfo in zf.infolist():
+            if zinfo.filename.startswith(archives_prefix):
+                archive_name = zinfo.filename[len(archives_prefix) :]
+                archive_offsets[archive_name] = zinfo.header_offset + len(
+                    zinfo.FileHeader()
+                )
+                if zinfo.compress_type != 0:
+                    raise Exception(
+                        "Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
+                    )
+
+        # Set up the output stream:
+        with open(
+            res.output_file, "w"
+        ) if res.output_file != "-" else sys.stdout as f_out:
+            # Stream through the index in the WACZ:
+            index_file = "indexes/index.cdx.gz"
+            zinfo = zf.getinfo(index_file)
+            with zf.open(index_file) as f:
+                reader = DecompressingBufferedReader(f)
+                while True:
+                    line = reader.readline()
+                    # If we reach the end, end:
+                    if len(line) == 0:
+                        break
+                    # Otherwise, decode the line:
+                    surt, timestamp, json_data_str = (
+                        line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
+                    )
+                    json_data = json.loads(json_data_str)
+                    # Also override the filename to point at the WACZ:
+                    archive_filename = json_data["filename"]
+                    json_data["filename"] = wacz_path
+                    # Override the offset to include of file offset in the ZIP:
+                    archive_offset = json_data["offset"]
+                    json_data["offset"] = archive_offsets[archive_filename] + int(
+                        archive_offset
+                    )
+                    # Output the modified values:
+                    f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")
+
+
 def create_wacz(res):
     wacz = zipfile.ZipFile(res.output, "w")