Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add index generation system that uses offsets into the WACZ itself. #38

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 76 additions & 1 deletion wacz/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from argparse import ArgumentParser, RawTextHelpFormatter
from io import BytesIO, StringIO, TextIOWrapper
import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources
import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, zlib
from wacz.waczindexer import WACZIndexer
from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
from wacz.validate import Validation, OUTDATED_WACZ
from wacz.util import validateJSON, get_py_wacz_version
from warcio.timeutils import iso_date_to_timestamp
from warcio.bufferedreaders import DecompressingBufferedReader

"""
WACZ Generator
Expand Down Expand Up @@ -103,6 +104,26 @@ def main(args=None):
help="URL of verify server to verify the signature, if any, in dapackage-digest.json",
)

index = subparsers.add_parser("index", help="generate a WACZ-level CDXJ index")
index.add_argument(
"-f", "--file", required=True, help="The WACZ file to read and index."
)
index.add_argument(
"-o",
"--output-file",
required=False,
default="-",
help="The CDXJ output file. Defaults to '-' which means to print to STDOUT.",
)
index.add_argument(
"-p",
"--wacz-prefix",
required=False,
default=None,
help="Prefix to use when referring to the WACZ file from the CDXJ index. e.g. if the prefix is '/disk/path/' and the WACZ is called example.wacz then the CDXJ file will refer to '/disk/path/example.wacz'.",
)
index.set_defaults(func=index_wacz)

cmd = parser.parse_args(args=args)

if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
Expand Down Expand Up @@ -154,6 +175,60 @@ def validate_wacz(res):
return 0


def index_wacz(res):
# Open up the ZIP:
with zipfile.ZipFile(res.file) as zf:
# Determine the WACZ filename/path to use:
wacz_path = os.path.basename(res.file)
# Allow users to specify the prefix where the WACZ is stored:
if res.wacz_prefix:
wacz_path = f"{res.wacz_prefix}{wacz_path}"

# Get a look-up table for offsets for each archive file:
archive_offsets = {}
archives_prefix = "archive/"
for zinfo in zf.infolist():
if zinfo.filename.startswith(archives_prefix):
archive_name = zinfo.filename[len(archives_prefix) :]
archive_offsets[archive_name] = zinfo.header_offset + len(
zinfo.FileHeader()
)
if zinfo.compress_type != 0:
raise Exception(
"Can't generate WACZ-level index from compressed WARC records! This file does not conform to the WACZ standard!"
)

# Set up the output stream:
with open(
res.output_file, "w"
) if res.output_file != "-" else sys.stdout as f_out:
# Stream through the index in the WACZ:
index_file = "indexes/index.cdx.gz"
zinfo = zf.getinfo(index_file)
with zf.open(index_file) as f:
reader = DecompressingBufferedReader(f)
while True:
line = reader.readline()
# If we reach the end, end:
if len(line) == 0:
break
# Otherwise, decode the line:
surt, timestamp, json_data_str = (
line.decode("utf-8").rstrip("\n").split(" ", maxsplit=2)
)
json_data = json.loads(json_data_str)
# Also override the filename to point at the WACZ:
archive_filename = json_data["filename"]
json_data["filename"] = wacz_path
# Override the offset to include of file offset in the ZIP:
archive_offset = json_data["offset"]
json_data["offset"] = archive_offsets[archive_filename] + int(
archive_offset
)
# Output the modified values:
f_out.write(f"{surt} {timestamp} {json.dumps(json_data)}\n")


def create_wacz(res):
wacz = zipfile.ZipFile(res.output, "w")

Expand Down
Loading