From db2b55226a4eb7878fb980bbf539d3395a2f6dbc Mon Sep 17 00:00:00 2001 From: Alexandre de Siqueira Date: Mon, 15 Mar 2021 11:26:08 -0700 Subject: [PATCH] Refactoring hash-dependent functions (#63) * Fixing PermissionError on NamedTempFile * Revert "Fixing PermissionError on NamedTempFile" This reverts commit d6eb1467e39bb8361aeb374d704e07c1e8915b3c. * Improving how to deal with hashes --- butterfly/connection.py | 128 ++++++++------------------ models/.SHA256SUM_ONLINE-id_gender | 1 - models/.SHA256SUM_ONLINE-id_position | 1 - models/.SHA256SUM_ONLINE-segmentation | 1 - models/SHA256SUM-id_gender | 1 - models/SHA256SUM-id_position | 1 - models/SHA256SUM-segmentation | 1 - 7 files changed, 36 insertions(+), 98 deletions(-) delete mode 100644 models/.SHA256SUM_ONLINE-id_gender delete mode 100644 models/.SHA256SUM_ONLINE-id_position delete mode 100644 models/.SHA256SUM_ONLINE-segmentation delete mode 100644 models/SHA256SUM-id_gender delete mode 100644 models/SHA256SUM-id_position delete mode 100644 models/SHA256SUM-segmentation diff --git a/butterfly/connection.py b/butterfly/connection.py index dd4de75..b0535b1 100644 --- a/butterfly/connection.py +++ b/butterfly/connection.py @@ -1,6 +1,8 @@ from pathlib import Path from pooch import retrieve +from urllib import request +import hashlib import socket @@ -12,15 +14,9 @@ } URL_HASH = { - 'id_gender' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/id_gender/.SHA256SUM_ONLINE-id_gender', - 'id_position' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/id_position/.SHA256SUM_ONLINE-id_position', - 'segmentation' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/segmentation/.SHA256SUM_ONLINE-segmentation' - } - -LOCAL_HASH = { - 'id_gender' : Path('./models/SHA256SUM-id_gender'), - 'id_position' : Path('./models/SHA256SUM-id_position'), - 'segmentation' : Path('./models/SHA256SUM-segmentation') + 'id_gender' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/id_gender/SHA256SUM-id_gender', + 'id_position' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/id_position/SHA256SUM-id_position', + 'segmentation' : 'https://gitlab.com/alexdesiqueira/mothra-models/-/raw/main/models/segmentation/SHA256SUM-segmentation' } @@ -39,39 +35,8 @@ def _get_model_info(weights): URL of the file for the latest model. url_hash : str URL of the hash file for the latest model. - local_hash : pathlib.Path - Path of the local hash file. - """ - return (URL_MODEL.get(weights.stem), URL_HASH.get(weights.stem), - LOCAL_HASH.get(weights.stem)) - - -def _check_hashes(weights): - """Helping function. Downloads hashes for `weights` if they are not - present. - - Parameters - ---------- - weights : str or pathlib.Path - Path of the file containing weights. - - Returns - ------- - None """ - _, url_hash, local_hash = _get_model_info(weights) - - if not local_hash.is_file(): - download_hash_from_url(url_hash=url_hash, filename=local_hash) - - # creating filename to save url_hash. - filename = local_hash.parent/Path(url_hash).name - - if not filename.is_file(): - download_hash_from_url(url_hash=url_hash, filename=filename) - - - return None + return (URL_MODEL.get(weights.stem), URL_HASH.get(weights.stem)) def download_weights(weights): @@ -86,9 +51,7 @@ def download_weights(weights): ------- None """ - # check if hashes are in disk, then get info from the model. - _check_hashes(weights) - _, url_hash, local_hash = _get_model_info(weights) + _, url_hash = _get_model_info(weights) # check if weights is in its folder. If not, download the file. if not weights.is_file(): @@ -97,9 +60,8 @@ def download_weights(weights): # file exists: check if we have the last version; download if not. else: if has_internet(): - local_hash_val = read_hash_local(filename=local_hash) - url_hash_val = read_hash_from_url(path=local_hash.parent, - url_hash=url_hash) + local_hash_val = read_hash_local(weights) + url_hash_val = read_hash_from_url(url_hash) if local_hash_val != url_hash_val: print('New training data available. Downloading...') fetch_data(weights) @@ -107,24 +69,6 @@ def download_weights(weights): return None -def download_hash_from_url(url_hash, filename): - """Downloads hash from `url_hash`. - - Parameters - ---------- - url_hash : str - URL of the SHA256 hash. - filename : str - Filename to save the SHA256 hash locally. - - Returns - ------- - None - """ - retrieve(url=url_hash, known_hash=None, fname=filename, path='.') - return None - - def fetch_data(weights): """Downloads and checks the hash of `weights`, according to its filename. @@ -137,15 +81,11 @@ def fetch_data(weights): ------- None """ - url_model, url_hash, local_hash = _get_model_info(weights) + url_model, url_hash = _get_model_info(weights) - # creating filename to save url_hash. - filename = local_hash.parent/Path(url_hash).name - - download_hash_from_url(url_hash=url_hash, filename=filename) - local_hash_val = read_hash_local(local_hash) + url_hash_val = read_hash_from_url(url_hash) retrieve(url=url_model, - known_hash=f'sha256:{local_hash_val}', + known_hash=f'sha256:{url_hash_val}', fname=weights, path='.') @@ -166,40 +106,44 @@ def has_internet(): return socket.gethostbyname(socket.gethostname()) != '127.0.0.1' -def read_hash_local(filename): - """Reads local SHA256 hash file. +def read_hash_local(weights): + """Reads local SHA256 hash from weights. Parameters ---------- - filename : pathlib.Path - Path of the hash file. + weights : str or pathlib.Path + Path of the file containing weights. Returns ------- - local_hash : str - SHA256 hash. + local_hash : str or None + SHA256 hash of weights file. Notes ----- Returns None if file is not found. """ + BUFFER_SIZE = 65536 + sha256 = hashlib.sha256() + try: - with open(filename, 'r') as file_hash: - hashes = [line for line in file_hash] - # expecting only one hash, and not interested in the filename: - local_hash, _ = hashes[0].split() + with open(weights, 'rb') as file_weights: + while True: + data = file_weights.read(BUFFER_SIZE) + if not data: + break + sha256.update(data) + local_hash = sha256.hexdigest() except FileNotFoundError: local_hash = None return local_hash -def read_hash_from_url(path, url_hash): - """Downloads and returns the SHA256 hash online for the file in `url_hash`. +def read_hash_from_url(url_hash): + """Returns the SHA256 hash online for the file in `url_hash`. Parameters ---------- - path : str - Where to look for the hash file. url_hash : str URL of the hash file for the latest model. @@ -208,14 +152,14 @@ def read_hash_from_url(path, url_hash): online_hash : str SHA256 hash for the file in `url_hash`. """ - filename = Path(url_hash).name - latest_hash = Path(f'{path}/{filename}') + user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' + headers = {'User-Agent':user_agent,} - download_hash_from_url(url_hash=url_hash, filename=filename) - with open(latest_hash, 'r') as file_hash: - hashes = [line for line in file_hash] + aux_req = request.Request(url_hash, None, headers) + response = request.urlopen(aux_req) + hashes = response.read() # expecting only one hash, and not interested in the filename: - online_hash, _ = hashes[0].split() + online_hash, _ = hashes.decode('ascii').split() return online_hash diff --git a/models/.SHA256SUM_ONLINE-id_gender b/models/.SHA256SUM_ONLINE-id_gender deleted file mode 100644 index 8dfe3cd..0000000 --- a/models/.SHA256SUM_ONLINE-id_gender +++ /dev/null @@ -1 +0,0 @@ -8fd76a968c63f27324da500fe6984f3b8e1703ee5adfcb588ca21d08e7464b89 id_gender.pkl diff --git a/models/.SHA256SUM_ONLINE-id_position b/models/.SHA256SUM_ONLINE-id_position deleted file mode 100644 index 1f5aade..0000000 --- a/models/.SHA256SUM_ONLINE-id_position +++ /dev/null @@ -1 +0,0 @@ -911829014692e7dd711ab89f623f1bec1ec5643bfc3a5c7a906e4ab86eefb80b id_position.pkl diff --git a/models/.SHA256SUM_ONLINE-segmentation b/models/.SHA256SUM_ONLINE-segmentation deleted file mode 100644 index c7ced8c..0000000 --- a/models/.SHA256SUM_ONLINE-segmentation +++ /dev/null @@ -1 +0,0 @@ -2a702e09b2c94898db06e0d5393ded3f2010ade7b640c69d8149be5724cc78fc segmentation.pkl diff --git a/models/SHA256SUM-id_gender b/models/SHA256SUM-id_gender deleted file mode 100644 index 8dfe3cd..0000000 --- a/models/SHA256SUM-id_gender +++ /dev/null @@ -1 +0,0 @@ -8fd76a968c63f27324da500fe6984f3b8e1703ee5adfcb588ca21d08e7464b89 id_gender.pkl diff --git a/models/SHA256SUM-id_position b/models/SHA256SUM-id_position deleted file mode 100644 index 1f5aade..0000000 --- a/models/SHA256SUM-id_position +++ /dev/null @@ -1 +0,0 @@ -911829014692e7dd711ab89f623f1bec1ec5643bfc3a5c7a906e4ab86eefb80b id_position.pkl diff --git a/models/SHA256SUM-segmentation b/models/SHA256SUM-segmentation deleted file mode 100644 index c7ced8c..0000000 --- a/models/SHA256SUM-segmentation +++ /dev/null @@ -1 +0,0 @@ -2a702e09b2c94898db06e0d5393ded3f2010ade7b640c69d8149be5724cc78fc segmentation.pkl