Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

consume packed wheel cache in zipapp creation #2175

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions pex/bin/pex.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,23 @@ def configure_clp_pex_options(parser):
),
)

group.add_argument(
"--cache-dists",
"--no-cache-dists",
dest="cache_dists",
default=None,
action=HandleBoolAction,
help=(
"Whether to zip up each dist contained in the output PEX file into a fingerprinted "
"cache directory to speed up later PEX file builds. For `--layout packed`, this "
"behavior is enabled by default. "
"For `--layout zipapp`, this synthesizes the zip file from those cached zips with an "
"experimental zip merging technique, so this flag is disabled by default when building "
"a zipapp. This will re-use the same caches as `--layout packed`, so creating a "
"zipapp or packed PEX file from the same inputs will only populate the cache once. "
"This flag and behavior do not apply to other layouts."
),
)
group.add_argument(
"--compress",
"--compressed",
Expand All @@ -175,7 +192,11 @@ def configure_clp_pex_options(parser):
action=HandleBoolAction,
help=(
"Whether to compress zip entries when creating either a zipapp PEX file or a packed "
"PEX's bootstrap and dependency zip files. Does nothing for loose layout PEXes."
"PEX's bootstrap and dependency zip files. "
"Uncompressed PEX files are much faster to create from an empty cache, but are no "
"faster after the cache has been populated, and uncompressed cache entries will "
Copy link
Member

@jsirois jsirois Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading a second time there is just no reason to cache uncompressed at all. I definitely vote for fail fast. It seems compress=False, cached_dists=True should not be allowed at all. Further, there is no reason for the current packed layout behavior to be toggleable. If you're building a packed layout, the zips should be compressed zips since the whole point is cache friendliness. If there is no call to add a degree of freedom, don't add it because, at least in Pex, you can never take it back.

"consume many times more space on disk. "
"Does nothing for loose layout PEXes."
),
)

Expand All @@ -200,7 +221,7 @@ def configure_clp_pex_options(parser):
action=HandleVenvAction,
help="Convert the pex file to a venv before executing it. If 'prepend' or 'append' is "
"specified, then all scripts and console scripts provided by distributions in the pex file "
"will be added to the PATH in the corresponding position. If the the pex file will be run "
"will be added to the PATH in the corresponding position. If the pex file will be run "
"multiple times under a stable runtime PEX_ROOT, the venv creation will only be done once "
"and subsequent runs will enjoy lower startup latency.",
)
Expand Down Expand Up @@ -282,10 +303,12 @@ def configure_clp_pex_options(parser):
dest="compile",
default=False,
action=HandleBoolAction,
help="Compiling means that the built pex will include .pyc files, which will result in "
"slightly faster startup performance. However, compiling means that the generated pex "
help="Compiling means that the built PEX will include .pyc files, which will result in "
"slightly faster startup performance. However, compiling means that the generated PEX "
"likely will not be reproducible, meaning that if you were to run `./pex -o` with the "
"same inputs then the new pex would not be byte-for-byte identical to the original.",
"same inputs then the new PEX would not be byte-for-byte identical to the original. "
"Note that all PEX files are now unzipped and compiled when first executed, so this "
"flag only affects the startup performance of the first execution.",
)

group.add_argument(
Expand All @@ -294,10 +317,14 @@ def configure_clp_pex_options(parser):
dest="use_system_time",
default=False,
action=HandleBoolAction,
help="Use the current system time to generate timestamps for the new pex. Otherwise, Pex "
"will use midnight on January 1, 1980. By using system time, the generated pex "
"will not be reproducible, meaning that if you were to run `./pex -o` with the "
"same inputs then the new pex would not be byte-for-byte identical to the original.",
help="Convert modification times from the filesystem into timestamps for any zip file "
"entries. Otherwise, Pex will use midnight on January 1, 1980. By using system time, the "
"generated PEX will not be reproducible, meaning that if you were to run `./pex -o` with "
"the same inputs then the new PEX would not be byte-for-byte identical to the original. "
"Note that zip file entries synthesized from the pex cache (including any resolved "
"distributions) will always use the reproducible timestamp regardless of this flag. "
"Any unzipped output files will retain the timestamps of their sources regardless of this "
"flag, although this will not affect their checksum.",
)

group.add_argument(
Expand Down Expand Up @@ -949,6 +976,7 @@ def do_main(
deterministic_timestamp=not options.use_system_time,
layout=options.layout,
compress=options.compress,
cache_dists=options.cache_dists,
)
if options.seed != Seed.NONE:
seed_info = seed_cache(
Expand Down
74 changes: 62 additions & 12 deletions pex/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import atexit
import contextlib
import errno
import io
import itertools
import os
import re
Expand Down Expand Up @@ -38,6 +39,8 @@
Union,
)

_DateTime = Tuple[int, int, int, int, int, int]


# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of
# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT).
Expand Down Expand Up @@ -137,6 +140,23 @@ def do_copy():
do_copy()


def copy_file_range(source, destination, length, buffer_size=io.DEFAULT_BUFFER_SIZE):
    # type: (io.BufferedIOBase, io.BufferedIOBase, int, int) -> None
    """Implementation of shutil.copyfileobj() that only copies exactly `length` bytes.

    :param source: A buffered stream to read from; must be positioned at the start of the range.
    :param destination: A buffered stream to write the copied bytes to.
    :param length: The exact number of bytes to transfer.
    :param buffer_size: The chunk size used for intermediate copies.
    """
    # We require a BufferedIOBase in order to avoid handling short reads or writes.
    remaining_length = length
    if buffer_size > length:
        buffer_size = length
    cur_buf = bytearray(buffer_size)
    while remaining_length > buffer_size:
        # NB: The I/O calls must NOT live inside `assert` statements: asserts are stripped
        # under `python -O`, which would silently skip the reads and writes entirely.
        read_count = source.readinto(cur_buf)
        assert read_count == buffer_size
        write_count = destination.write(cur_buf)
        assert write_count == buffer_size
        remaining_length -= buffer_size
    remainder = source.read(remaining_length)
    assert len(remainder) == remaining_length
    write_count = destination.write(remainder)
    assert write_count == remaining_length


# See http://stackoverflow.com/questions/2572172/referencing-other-modules-in-atexit
class MktempTeardownRegistry(object):
def __init__(self):
Expand Down Expand Up @@ -173,7 +193,14 @@ class ZipEntry(namedtuple("ZipEntry", ["info", "data"])):
pass

@classmethod
def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
def zip_entry_from_file(
cls,
filename, # type: str
arcname=None, # type: Optional[str]
date_time=None, # type: Optional[Tuple[int, ...]]
compression=zipfile.ZIP_STORED, # type: int
):
# type: (...) -> PermPreservingZipFile.ZipEntry
"""Construct a ZipEntry for a file on the filesystem.

Usually a similar `zip_info_from_file` method is provided by `ZipInfo`, but it is not
Expand All @@ -192,16 +219,20 @@ def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
arcname += "/"
if date_time is None:
date_time = time.localtime(st.st_mtime)
zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6])
zinfo = zipfile.ZipInfo(filename=arcname, date_time=cast("_DateTime", date_time[:6]))
zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes
if isdir:
zinfo.file_size = 0
zinfo.external_attr |= 0x10 # MS-DOS directory flag
# Always store directories decompressed, because they are empty but take up 2 bytes when
# compressed.
zinfo.compress_type = zipfile.ZIP_STORED
data = b""
else:
zinfo.file_size = st.st_size
zinfo.compress_type = zipfile.ZIP_DEFLATED
# File contents may be compressed or decompressed. Decompressed is significantly faster
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a gratuitous comment. You're selling here or else speaking to an end user. Clearly this is not the place to speak to the user and the coder doesn't need to be sold to, if a function takes a param ... it takes a param. If its a public function maybe sell in the API doc.

# to write, but caching makes up for that.
zinfo.compress_type = compression
with open(filename, "rb") as fp:
data = fp.read()
return cls.ZipEntry(info=zinfo, data=data)
Expand Down Expand Up @@ -281,18 +312,32 @@ def safe_mkdir(directory, clean=False):
return directory


def _ensure_parent(filename):
    # type: (str) -> None
    """Create any missing directory components leading up to `filename`."""
    parent = os.path.dirname(filename)
    if parent:
        safe_mkdir(parent)


def safe_open(filename, *args, **kwargs):
    """Safely open a file.

    ``safe_open`` first creates any missing directory components leading up to the
    specified file, then opens it with the given arguments.
    """
    _ensure_parent(filename)
    return open(filename, *args, **kwargs)  # noqa: T802


def safe_io_open(filename, *args, **kwargs):
    # type: (str, Any, Any) -> io.IOBase
    """``safe_open()``, but using ``io.open()`` instead.

    With the right arguments, this ensures the result produces a buffered file handle on py2."""
    _ensure_parent(filename)
    handle = io.open(filename, *args, **kwargs)
    return cast("io.IOBase", handle)


def safe_delete(filename):
# type: (str) -> None
"""Delete a file safely.
Expand Down Expand Up @@ -608,9 +653,13 @@ def delete(self):
# type: () -> None
shutil.rmtree(self.chroot)

# This directory traversal, file I/O, and compression can be made faster with complex
# parallelism and pipelining in a compiled language, but the result is much harder to package,
# and is still less performant than effective caching. See investigation in
# https://github.com/pantsbuild/pex/issues/2158 and https://github.com/pantsbuild/pex/pull/2175.
def zip(
self,
filename, # type: str
output_file, # type: Union[str, io.IOBase, io.BufferedRandom]
mode="w", # type: str
deterministic_timestamp=False, # type: bool
exclude_file=lambda _: False, # type: Callable[[str], bool]
Expand All @@ -628,7 +677,7 @@ def zip(
selected_files = self.files()

compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
with open_zip(filename, mode, compression) as zf:
with open_zip(output_file, mode, compression) as zf:

def write_entry(
filename, # type: str
Expand All @@ -638,11 +687,12 @@ def write_entry(
zip_entry = zf.zip_entry_from_file(
filename=filename,
arcname=os.path.relpath(arcname, strip_prefix) if strip_prefix else arcname,
date_time=DETERMINISTIC_DATETIME.timetuple()
if deterministic_timestamp
else None,
date_time=(
DETERMINISTIC_DATETIME.timetuple() if deterministic_timestamp else None
),
compression=compression,
)
zf.writestr(zip_entry.info, zip_entry.data, compression)
zf.writestr(zip_entry.info, zip_entry.data)

def get_parent_dir(path):
# type: (str) -> Optional[str]
Expand Down
Loading
Loading