Skip to content

Commit b820690

Browse files
authored
fix: Prevent truncated Parquet files in S3 after failed CreateMultipartUpload (#2993)
During a call to s3.to_parquet(), if the size of the data exceeds 5 MB, a multipart upload operation is initiated. If the S3 call to CreateMultipartUpload failed (for example, with a 503 SlowDown error), the incomplete Parquet file data was written to S3 using 'put_object' during close(). This resulted in broken Parquet files in S3, causing errors when queried by services like Athena. Now, the data buffer is cleared at the end of the call to flush() -- even when an exception occurs.
1 parent 066134f commit b820690

File tree

1 file changed

+36
-31
lines changed

1 file changed

+36
-31
lines changed

awswrangler/s3/_fs.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -406,39 +406,44 @@ def flush(self, force: bool = False) -> None:
406406
return None
407407
if total_size == 0:
408408
return None
409-
_logger.debug("Flushing: %s bytes", total_size)
410-
self._mpu = self._mpu or _utils.try_it(
411-
f=self._client.create_multipart_upload, # type: ignore[arg-type]
412-
ex=_S3_RETRYABLE_ERRORS,
413-
base=0.5,
414-
max_num_tries=6,
415-
Bucket=self._bucket,
416-
Key=self._key,
417-
**get_botocore_valid_kwargs(
418-
function_name="create_multipart_upload", s3_additional_kwargs=self._s3_additional_kwargs
419-
),
420-
)
421-
self._buffer.seek(0)
422-
for chunk_size in _utils.get_even_chunks_sizes(
423-
total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False
424-
):
425-
_logger.debug("chunk_size: %s bytes", chunk_size)
426-
self._parts_count += 1
427-
self._upload_proxy.upload(
428-
bucket=self._bucket,
429-
key=self._key,
430-
part=self._parts_count,
431-
upload_id=self._mpu["UploadId"],
432-
data=self._buffer.read(chunk_size),
433-
s3_client=self._client,
434-
boto3_kwargs=get_botocore_valid_kwargs(
435-
function_name="upload_part", s3_additional_kwargs=self._s3_additional_kwargs
409+
410+
try:
411+
_logger.debug("Flushing: %s bytes", total_size)
412+
self._mpu = self._mpu or _utils.try_it(
413+
f=self._client.create_multipart_upload, # type: ignore[arg-type]
414+
ex=_S3_RETRYABLE_ERRORS,
415+
base=0.5,
416+
max_num_tries=6,
417+
Bucket=self._bucket,
418+
Key=self._key,
419+
**get_botocore_valid_kwargs(
420+
function_name="create_multipart_upload", s3_additional_kwargs=self._s3_additional_kwargs
436421
),
437422
)
438-
self._buffer.seek(0)
439-
self._buffer.truncate(0)
440-
self._buffer.close()
441-
self._buffer = io.BytesIO()
423+
self._buffer.seek(0)
424+
for chunk_size in _utils.get_even_chunks_sizes(
425+
total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False
426+
):
427+
_logger.debug("chunk_size: %s bytes", chunk_size)
428+
self._parts_count += 1
429+
self._upload_proxy.upload(
430+
bucket=self._bucket,
431+
key=self._key,
432+
part=self._parts_count,
433+
upload_id=self._mpu["UploadId"],
434+
data=self._buffer.read(chunk_size),
435+
s3_client=self._client,
436+
boto3_kwargs=get_botocore_valid_kwargs(
437+
function_name="upload_part", s3_additional_kwargs=self._s3_additional_kwargs
438+
),
439+
)
440+
finally:
441+
# Ensure that the buffer is cleared (even in the event of an exception) so that
442+
# any partial data doesn't get written when close() is called.
443+
self._buffer.seek(0)
444+
self._buffer.truncate(0)
445+
self._buffer.close()
446+
self._buffer = io.BytesIO()
442447
return None
443448

444449
def readable(self) -> bool:

0 commit comments

Comments
 (0)