Skip to content

Commit

Permalink
S3 Downloader: download in batches, check existing files better
Browse files (browse the repository at this point in the history)
  • Loading branch information
bghira committed Sep 15, 2023
1 parent 2b89535 commit 7d38cc3
Showing 1 changed file with 1 addition and 8 deletions.
9 changes: 1 addition & 8 deletions toolkit/datasets/csv_to_s3.py
Original file line number | Diff line number | Diff line change
@@ -558,14 +558,7 @@ def main():
     # Fetch and process images
     to_fetch = df.to_dict(orient="records")
     logger.info(f"Fetching {len(to_fetch)} images...")
-    # Split data into batches and process each batch
-    num_batches = len(to_fetch) // BATCH_SIZE + (len(to_fetch) % BATCH_SIZE != 0)
-
-    for i in range(num_batches):
-        start_idx = i * BATCH_SIZE
-        end_idx = start_idx + BATCH_SIZE
-        batch = to_fetch[start_idx:end_idx]
-        process_batch(batch, existing_files, s3_client, args, uri_column)
+    fetch_data(s3_client, to_fetch, args, uri_column)


 if __name__ == "__main__":
Expand Down

0 comments on commit 7d38cc3

Please sign in to comment.