Skip to content

Commit

Permalink
fix: Prevent truncated Parquet files in S3 after failed CreateMultipartUpload
Browse files Browse the repository at this point in the history

During a call to s3.to_parquet(), if the size of the data exceeds 5 MB, a multipart upload
operation is initiated.
Previously, if the S3 call to CreateMultipartUpload failed (for example, with a 503 SlowDown error),
the incomplete Parquet file data was written to S3 using 'put_object' during close().
This resulted in broken Parquet files in S3, causing errors when queried by services like Athena.

Now, the data buffer is cleared at the end of the call to flush() -- even when an exception occurs.
  • Loading branch information
rdwebster authored Oct 8, 2024
1 parent 9b2cdd9 commit f69e058
Showing 1 changed file with 36 additions and 31 deletions.
67 changes: 36 additions & 31 deletions awswrangler/s3/_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,39 +406,44 @@ def flush(self, force: bool = False) -> None:
return None
if total_size == 0:
return None
_logger.debug("Flushing: %s bytes", total_size)
self._mpu = self._mpu or _utils.try_it(
f=self._client.create_multipart_upload, # type: ignore[arg-type]
ex=_S3_RETRYABLE_ERRORS,
base=0.5,
max_num_tries=6,
Bucket=self._bucket,
Key=self._key,
**get_botocore_valid_kwargs(
function_name="create_multipart_upload", s3_additional_kwargs=self._s3_additional_kwargs
),
)
self._buffer.seek(0)
for chunk_size in _utils.get_even_chunks_sizes(
total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False
):
_logger.debug("chunk_size: %s bytes", chunk_size)
self._parts_count += 1
self._upload_proxy.upload(
bucket=self._bucket,
key=self._key,
part=self._parts_count,
upload_id=self._mpu["UploadId"],
data=self._buffer.read(chunk_size),
s3_client=self._client,
boto3_kwargs=get_botocore_valid_kwargs(
function_name="upload_part", s3_additional_kwargs=self._s3_additional_kwargs

try:
_logger.debug("Flushing: %s bytes", total_size)
self._mpu = self._mpu or _utils.try_it(
f=self._client.create_multipart_upload, # type: ignore[arg-type]
ex=_S3_RETRYABLE_ERRORS,
base=0.5,
max_num_tries=6,
Bucket=self._bucket,
Key=self._key,
**get_botocore_valid_kwargs(
function_name="create_multipart_upload", s3_additional_kwargs=self._s3_additional_kwargs
),
)
self._buffer.seek(0)
self._buffer.truncate(0)
self._buffer.close()
self._buffer = io.BytesIO()
self._buffer.seek(0)
for chunk_size in _utils.get_even_chunks_sizes(
total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False
):
_logger.debug("chunk_size: %s bytes", chunk_size)
self._parts_count += 1
self._upload_proxy.upload(
bucket=self._bucket,
key=self._key,
part=self._parts_count,
upload_id=self._mpu["UploadId"],
data=self._buffer.read(chunk_size),
s3_client=self._client,
boto3_kwargs=get_botocore_valid_kwargs(
function_name="upload_part", s3_additional_kwargs=self._s3_additional_kwargs
),
)
finally:
# Ensure that the buffer is cleared (even in the event of an exception) so that
# any partial data doesn't get written when close() is called.
self._buffer.seek(0)
self._buffer.truncate(0)
self._buffer.close()
self._buffer = io.BytesIO()
return None

def readable(self) -> bool:
Expand Down

0 comments on commit f69e058

Please sign in to comment.