-
-
Notifications
You must be signed in to change notification settings - Fork 261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
consume packed wheel cache in zipapp creation #2175
Changes from all commits
347e208
feff2c8
e587fd3
2131a26
31592ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import atexit | ||
import contextlib | ||
import errno | ||
import io | ||
import itertools | ||
import os | ||
import re | ||
|
@@ -38,6 +39,8 @@ | |
Union, | ||
) | ||
|
||
_DateTime = Tuple[int, int, int, int, int, int] | ||
|
||
|
||
# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of | ||
# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). | ||
|
@@ -137,6 +140,23 @@ def do_copy(): | |
do_copy() | ||
|
||
|
||
def copy_file_range(source, destination, length, buffer_size=io.DEFAULT_BUFFER_SIZE): | ||
# type: (io.BufferedIOBase, io.BufferedIOBase, int, int) -> None | ||
"""Implementation of shutil.copyfileobj() that only copies exactly `length` bytes.""" | ||
# We require a BufferedIOBase in order to avoid handling short reads or writes. | ||
remaining_length = length | ||
if buffer_size > length: | ||
buffer_size = length | ||
cur_buf = bytearray(buffer_size) | ||
while remaining_length > buffer_size: | ||
assert source.readinto(cur_buf) == buffer_size | ||
assert destination.write(cur_buf) == buffer_size | ||
remaining_length -= buffer_size | ||
remainder = source.read(remaining_length) | ||
assert len(remainder) == remaining_length | ||
assert destination.write(remainder) == remaining_length | ||
|
||
|
||
# See http://stackoverflow.com/questions/2572172/referencing-other-modules-in-atexit | ||
class MktempTeardownRegistry(object): | ||
def __init__(self): | ||
|
@@ -173,7 +193,14 @@ class ZipEntry(namedtuple("ZipEntry", ["info", "data"])): | |
pass | ||
|
||
@classmethod | ||
def zip_entry_from_file(cls, filename, arcname=None, date_time=None): | ||
def zip_entry_from_file( | ||
cls, | ||
filename, # type: str | ||
arcname=None, # type: Optional[str] | ||
date_time=None, # type: Optional[Tuple[int, ...]] | ||
compression=zipfile.ZIP_STORED, # type: int | ||
): | ||
# type: (...) -> PermPreservingZipFile.ZipEntry | ||
"""Construct a ZipEntry for a file on the filesystem. | ||
|
||
Usually a similar `zip_info_from_file` method is provided by `ZipInfo`, but it is not | ||
|
@@ -192,16 +219,20 @@ def zip_entry_from_file(cls, filename, arcname=None, date_time=None): | |
arcname += "/" | ||
if date_time is None: | ||
date_time = time.localtime(st.st_mtime) | ||
zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6]) | ||
zinfo = zipfile.ZipInfo(filename=arcname, date_time=cast("_DateTime", date_time[:6])) | ||
zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes | ||
if isdir: | ||
zinfo.file_size = 0 | ||
zinfo.external_attr |= 0x10 # MS-DOS directory flag | ||
# Always store directories decompressed, because they are empty but take up 2 bytes when | ||
# compressed. | ||
zinfo.compress_type = zipfile.ZIP_STORED | ||
data = b"" | ||
else: | ||
zinfo.file_size = st.st_size | ||
zinfo.compress_type = zipfile.ZIP_DEFLATED | ||
# File contents may be compressed or decompressed. Decompressed is significantly faster | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a gratuitous comment. You're selling here or else speaking to an end user. Clearly this is not the place to speak to the user and the coder doesn't need to be sold to, if a function takes a param ... it takes a param. If its a public function maybe sell in the API doc. |
||
# to write, but caching makes up for that. | ||
zinfo.compress_type = compression | ||
with open(filename, "rb") as fp: | ||
data = fp.read() | ||
return cls.ZipEntry(info=zinfo, data=data) | ||
|
@@ -281,18 +312,32 @@ def safe_mkdir(directory, clean=False): | |
return directory | ||
|
||
|
||
def _ensure_parent(filename): | ||
# type: (str) -> None | ||
parent_dir = os.path.dirname(filename) | ||
if parent_dir: | ||
safe_mkdir(parent_dir) | ||
|
||
|
||
def safe_open(filename, *args, **kwargs): | ||
"""Safely open a file. | ||
|
||
``safe_open`` ensures that the directory components leading up the specified file have been | ||
created first. | ||
""" | ||
parent_dir = os.path.dirname(filename) | ||
if parent_dir: | ||
safe_mkdir(parent_dir) | ||
_ensure_parent(filename) | ||
return open(filename, *args, **kwargs) # noqa: T802 | ||
|
||
|
||
def safe_io_open(filename, *args, **kwargs): | ||
# type: (str, Any, Any) -> io.IOBase | ||
"""``safe_open()``, but using ``io.open()`` instead. | ||
|
||
With the right arguments, this ensures the result produces a buffered file handle on py2.""" | ||
_ensure_parent(filename) | ||
return cast("io.IOBase", io.open(filename, *args, **kwargs)) | ||
|
||
|
||
def safe_delete(filename): | ||
# type: (str) -> None | ||
"""Delete a file safely. | ||
|
@@ -608,9 +653,13 @@ def delete(self): | |
# type: () -> None | ||
shutil.rmtree(self.chroot) | ||
|
||
# This directory traversal, file I/O, and compression can be made faster with complex | ||
# parallelism and pipelining in a compiled language, but the result is much harder to package, | ||
# and is still less performant than effective caching. See investigation in | ||
# https://github.com/pantsbuild/pex/issues/2158 and https://github.com/pantsbuild/pex/pull/2175. | ||
def zip( | ||
self, | ||
filename, # type: str | ||
output_file, # type: Union[str, io.IOBase, io.BufferedRandom] | ||
mode="w", # type: str | ||
deterministic_timestamp=False, # type: bool | ||
exclude_file=lambda _: False, # type: Callable[[str], bool] | ||
|
@@ -628,7 +677,7 @@ def zip( | |
selected_files = self.files() | ||
|
||
compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED | ||
with open_zip(filename, mode, compression) as zf: | ||
with open_zip(output_file, mode, compression) as zf: | ||
|
||
def write_entry( | ||
filename, # type: str | ||
|
@@ -638,11 +687,12 @@ def write_entry( | |
zip_entry = zf.zip_entry_from_file( | ||
filename=filename, | ||
arcname=os.path.relpath(arcname, strip_prefix) if strip_prefix else arcname, | ||
date_time=DETERMINISTIC_DATETIME.timetuple() | ||
if deterministic_timestamp | ||
else None, | ||
date_time=( | ||
DETERMINISTIC_DATETIME.timetuple() if deterministic_timestamp else None | ||
), | ||
compression=compression, | ||
) | ||
zf.writestr(zip_entry.info, zip_entry.data, compression) | ||
zf.writestr(zip_entry.info, zip_entry.data) | ||
|
||
def get_parent_dir(path): | ||
# type: (str) -> Optional[str] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reading a second time there is just no reason to cache uncompressed at all. I definitely vote for fail fast. It seems
compress=False
,cached_dists=True
should not be allowed at all. Further, there is no reason for the current packed layout behavior to be toggleable. If you're building a packed layout, the zips should be compressed zips since the whole point is cache friendliness. If there is no call to add a degree of freedom, don't add it because, at least in Pex, you can never take it back.