From f60aef562f75ab546a5fbf910003790041a7bc3b Mon Sep 17 00:00:00 2001
From: Francois Chollet
Date: Tue, 9 Jan 2024 21:38:55 -0800
Subject: [PATCH] Avoid unnecessary file uploads

---
 scripts/upload.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/scripts/upload.py b/scripts/upload.py
index 441585c940..127e615779 100644
--- a/scripts/upload.py
+++ b/scripts/upload.py
@@ -1,17 +1,31 @@
 import boto3
 from pathlib import Path
 import mimetypes
+import hashlib
 import os
+import json
 from multiprocessing.pool import ThreadPool
 
 AKEY = os.environ["AWS_S3_ACCESS_KEY"]
 SKEY = os.environ["AWS_S3_SECRET_KEY"]
+
 BUCKET = "keras.io"
 USE_THREADING = True
+HASH_CACHE = "contents_hashes.json"
 
 s3 = boto3.client("s3", aws_access_key_id=AKEY, aws_secret_access_key=SKEY)
 
 
+def hash_file(fpath):
+    h = hashlib.sha256()
+    b = bytearray(128 * 1024)
+    mv = memoryview(b)
+    with open(fpath, "rb", buffering=0) as f:
+        while n := f.readinto(mv):
+            h.update(mv[:n])
+    return h.hexdigest()[:8]
+
+
 def upload_file(bucket, fpath, key_name, redirect=None):
     print(f"...Upload to {bucket}:{key_name}")
     mime = mimetypes.guess_type(fpath)[0]
@@ -23,6 +37,19 @@ def upload_file(bucket, fpath, key_name, redirect=None):
     )
 
 
+def load_hash_cache():
+    s3.download_file(BUCKET, HASH_CACHE, HASH_CACHE)
+    with open(HASH_CACHE) as f:
+        contents = f.read()
+    return json.loads(contents)
+
+
+def save_hash_cache(hash_cache):
+    with open(HASH_CACHE, "w") as f:
+        f.write(json.dumps(hash_cache))
+    upload_file(BUCKET, HASH_CACHE, HASH_CACHE)
+
+
 def wrapped_upload_file(args):
     bucket, fpath, key_name = args
     upload_file(bucket, fpath, key_name)
@@ -44,10 +71,10 @@ def cleanup(site_directory, redirect_directory):
             s3.delete_object(Bucket=BUCKET, Key=key)
 
 
-def upload_dir(directory, include_img=True):
+def upload_dir(directory, include_img=True, hash_cache=None):
     print(f"Uploading files from '{directory}'...")
     all_targets = []
-    for dp, dn, fn in os.walk(directory):
+    for dp, _, fn in os.walk(directory):
         if fn:
             for f in fn:
                 fpath = os.path.join(dp, f)
@@ -60,6 +87,18 @@ def upload_dir(directory, include_img=True):
                     print("> " + fpath)
                     print(">>>>>> " + key_name)
                     all_targets.append((BUCKET, fpath, key_name))
+
+    if hash_cache is not None:
+        filtered_targets = []
+        new_hash_cache = {}
+        for bucket, fpath, key_name in all_targets:
+            new_hash = hash_file(fpath)
+            old_hash = hash_cache.get(key_name)
+            if new_hash != old_hash:
+                filtered_targets.append((bucket, fpath, key_name))
+            new_hash_cache[key_name] = new_hash
+        all_targets = filtered_targets
+
     if USE_THREADING:
         pool = ThreadPool(processes=8)
         pool.map(wrapped_upload_file, all_targets)
@@ -67,10 +106,13 @@ def upload_dir(directory, include_img=True):
         for args in all_targets:
             wrapped_upload_file(args)
 
+    if hash_cache is not None:
+        return new_hash_cache
+
 
 def upload_redirects(directory):
     print("Uploading redirects...")
-    for dp, dn, fn in os.walk(directory):
+    for dp, _, fn in os.walk(directory):
         if fn:
             for f in fn:
                 fpath = os.path.join(dp, f)
@@ -87,4 +129,8 @@ def upload_redirects(directory):
 
 if __name__ == "__main__":
     root = Path(__file__).parent.parent.resolve()
-    upload_dir(os.path.join(root, "site"), include_img=True)
+    hash_cache = load_hash_cache()
+    hash_cache = upload_dir(
+        os.path.join(root, "site"), include_img=True, hash_cache=hash_cache
+    )
+    save_hash_cache(hash_cache)
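
Note (illustration only, not part of the patch): below is a minimal standalone sketch of the change-detection scheme this patch introduces. Each local file is hashed, compared against the hash map cached from the previous deploy, and only files whose content changed are re-uploaded. hash_file mirrors the helper added in the patch; select_changed_files, the dry-run block, and the key-naming convention are hypothetical names and assumptions introduced here for clarity.

import hashlib
import json
import os


def hash_file(fpath):
    # Stream the file in 128 KiB chunks so large files never sit in memory;
    # a truncated sha256 digest is enough to detect content changes.
    h = hashlib.sha256()
    buf = bytearray(128 * 1024)
    mv = memoryview(buf)
    with open(fpath, "rb", buffering=0) as f:
        while n := f.readinto(mv):
            h.update(mv[:n])
    return h.hexdigest()[:8]


def select_changed_files(targets, old_hashes):
    # targets: list of (bucket, fpath, key_name) tuples, like the ones built
    # by upload_dir(). Returns the subset whose content hash differs from the
    # cached value, plus the refreshed hash map to persist after the upload.
    changed = []
    new_hashes = {}
    for bucket, fpath, key_name in targets:
        new_hash = hash_file(fpath)
        new_hashes[key_name] = new_hash
        if new_hash != old_hashes.get(key_name):
            changed.append((bucket, fpath, key_name))
    return changed, new_hashes


if __name__ == "__main__":
    # Hypothetical local dry run: report which files under ./site would be
    # re-uploaded if the cached hash map were present on disk.
    try:
        with open("contents_hashes.json") as f:
            old_hashes = json.load(f)
    except FileNotFoundError:
        old_hashes = {}  # first run: every file counts as changed
    targets = [
        # Key naming here is an approximation of the script's scheme.
        ("keras.io", os.path.join(dp, f), os.path.relpath(os.path.join(dp, f), "site"))
        for dp, _, fn in os.walk("site")
        for f in fn
    ]
    changed, new_hashes = select_changed_files(targets, old_hashes)
    print(f"{len(changed)} of {len(targets)} files would be uploaded")

The 8-character digest matches the patch's hash_file and keeps the JSON cache small; a full digest could be stored instead if the (tiny) risk of a hash collision silently skipping an upload is a concern.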