[WIP] Fix several issues with the hashing method #2049

Closed · wants to merge 2 commits
255 changes: 124 additions & 131 deletions aiida/common/hashing.py
@@ -14,28 +14,27 @@
import random
import time
import uuid
import codecs
from datetime import datetime

import six
from six.moves import range
from passlib.context import CryptContext

try:  # Python3
from functools import singledispatch
from collections import abc
except ImportError:  # Python2
from singledispatch import singledispatch
import collections as abc

import numpy as np

from .folders import Folder

"""
Here we define a single password hashing instance for the full AiiDA.
"""


# The prefix of passwords hashed with the pbkdf2_sha256 algorithm in Django
HASHING_PREFIX_DJANGO = "pbkdf2_sha256"
# The prefix of passwords hashed with the pbkdf2_sha256 algorithm in Passlib
@@ -46,7 +45,7 @@
# Number of random chars to add after UNUSABLE_PASSWORD_PREFIX
UNUSABLE_PASSWORD_SUFFIX_LENGTH = 40

HASHING_KEY="HashingKey"
HASHING_KEY = "HashingKey"

pwd_context = CryptContext(
# The list of hashes that we support
@@ -56,12 +55,11 @@

# We set the number of rounds that should be used...
pbkdf2_sha256__default_rounds=8000,
)


def create_unusable_pass():
return UNUSABLE_PASSWORD_PREFIX + get_random_string(
UNUSABLE_PASSWORD_SUFFIX_LENGTH)
return UNUSABLE_PASSWORD_PREFIX + get_random_string(UNUSABLE_PASSWORD_SUFFIX_LENGTH)


def is_password_usable(enc_pass):
@@ -73,13 +71,13 @@ def is_password_usable(enc_pass):

# Backward compatibility for old Django hashing
if enc_pass.startswith(HASHING_PREFIX_DJANGO):
enc_pass = enc_pass.replace(HASHING_PREFIX_DJANGO,
HASHING_PREFIX_PBKDF2_SHA256, 1)
enc_pass = enc_pass.replace(HASHING_PREFIX_DJANGO, HASHING_PREFIX_PBKDF2_SHA256, 1)
if pwd_context.identify(enc_pass) is not None:
return True

return False


###################################################################
# THE FOLLOWING WAS TAKEN FROM DJANGO BUT IT CAN BE EASILY REPLACED
###################################################################
@@ -95,9 +93,7 @@ def is_password_usable(enc_pass):
using_sysrandom = False


def get_random_string(length=12,
allowed_chars='abcdefghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'):
def get_random_string(length=12, allowed_chars='abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'):
"""
Returns a securely generated random string.

@@ -111,31 +107,10 @@ def get_random_string(length=12,
# time a random string is required. This may change the
# properties of the chosen random sequence slightly, but this
# is better than absolute predictability.
random.seed(
hashlib.sha256(
("%s%s%s" % (
random.getstate(),
time.time(),
HASHING_KEY)).encode('utf-8')
).digest())
random.seed(hashlib.sha256(("%s%s%s" % (random.getstate(), time.time(), HASHING_KEY)).encode('utf-8')).digest())
return ''.join(random.choice(allowed_chars) for i in range(length))


def make_hash_with_type(type_chr, string_to_hash):
"""
get a hash digest for a given enumerated type and its content

:param type_chr: a single char, lower case for simple datatypes, upper case for composite datatypes
:param string_to_hash: an encoded string (a `str` in Python 2, latin1-encoded `bytes` in Python 3)

We don't check anything for speed efficiency.

The `latin1` here is not an error. Since this was introduced in Python 2 and no proper care was
taken to properly encode/decode strings, the default was used, which was `latin1` at that time.
"""
return hashlib.sha224(type_chr.encode('latin1') + string_to_hash).hexdigest()

@singledispatch
def make_hash(object_to_hash, **kwargs):
"""
Makes a hash from a dictionary, list, tuple or set to any level, that contains
@@ -204,91 +179,120 @@ def make_hash(object_to_hash, **kwargs):
the string of dictionary do not suffice if we want to check for equality
of dictionaries using hashes.
"""
raise ValueError("Value of type {} cannot be hashed".format(
type(object_to_hash))
)
int_hash = _make_int_hash(object_to_hash, **kwargs)
return u'{:056x}'.format(int_hash)

@make_hash.register(abc.Sequence)

@singledispatch
def _make_int_hash(object_to_hash, **kwargs):
"""
Implementation of the ``make_hash`` function. The hash is created as a
28 byte integer, and only later converted to a string.
"""
raise ValueError("Value of type {} cannot be hashed".format(type(object_to_hash)))

# int hashes with size 28 bytes
INT_HASH_MASK = 2**(8 * 28) - 1

def _hash_combine(hash1, hash2):
"""
Combine two hashes, using the approach of boost::hash_combine.
"""
return (hash1 ^ (hash2 + 0x9e3779b9 + (hash1 << 6) + (hash1 >> 2))) & INT_HASH_MASK


def _combine_hash_list(hashes):
"""
Combines multiple hashes.
"""
if not hashes:
return 0
res = hashes[0]
for val in hashes[1:]:
res = _hash_combine(res, val)
return res
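
For readers unfamiliar with the boost::hash_combine idiom, a minimal standalone sketch of the mixing step (the constant and the 28-byte mask mirror the definitions above; the sha224-based input hashes are only illustrative):

import hashlib

INT_HASH_MASK = 2**(8 * 28) - 1  # 28-byte hashes, as above

def hash_combine(hash1, hash2):
    # Mix hash2 into hash1 using the golden-ratio constant and two shifts,
    # then truncate the result back to 28 bytes.
    return (hash1 ^ (hash2 + 0x9e3779b9 + (hash1 << 6) + (hash1 >> 2))) & INT_HASH_MASK

h_a = int(hashlib.sha224(b'a').hexdigest(), 16)
h_b = int(hashlib.sha224(b'b').hexdigest(), 16)
# The mixing is not commutative, which is why the Set and Mapping handlers
# below sort their per-item hashes before combining them.
print(hash_combine(h_a, h_b))
print(hash_combine(h_b, h_a))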


def _add_type_salt(type_bytes):
"""
Decorator which combines the output of the decorated function with a given
'type salt', which is a type-specific hash. This ensures that two objects
which have the same binary structure but different types do not hash to the
same value.
"""
assert isinstance(type_bytes, six.binary_type)
salt_hash = int(codecs.encode(hashlib.sha224(type_bytes).digest(), 'hex'), 16)

def decorator(func):

def inner(object_to_hash, **kwargs):
return _hash_combine(salt_hash, func(object_to_hash, **kwargs))

return inner

return decorator
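
To make the purpose of the type salt concrete, a small standalone sketch (it re-implements the salt-and-combine step locally instead of importing this module; the payload bytes are arbitrary):

import codecs
import hashlib

INT_HASH_MASK = 2**(8 * 28) - 1

def hash_combine(hash1, hash2):
    return (hash1 ^ (hash2 + 0x9e3779b9 + (hash1 << 6) + (hash1 >> 2))) & INT_HASH_MASK

def bytes_hash(data):
    # Base hash: the sha224 digest read as a big integer, as in the bytes handler below.
    return int(codecs.encode(hashlib.sha224(data).digest(), 'hex'), 16)

payload = bytes_hash(b'1,2,3')
as_list = hash_combine(bytes_hash(b'L'), payload)  # salted with the Sequence type salt
as_set = hash_combine(bytes_hash(b'S'), payload)   # salted with the Set type salt
print(as_list == as_set)  # False (barring an astronomically unlikely collision)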


@_make_int_hash.register(abc.Sequence)
@_add_type_salt(b'L')
def _(sequence, **kwargs):
hashes = tuple([
make_hash(x, **kwargs) for x in sequence
])
return make_hash_with_type('L', ",".join(hashes).encode('latin1'))
return _combine_hash_list([_make_int_hash(x, **kwargs) for x in sequence])


@make_hash.register(abc.Set)
@_make_int_hash.register(abc.Set)
@_add_type_salt(b'S')
def _(object_to_hash, **kwargs):
hashes = tuple([
make_hash(x, **kwargs)
for x
in sorted(object_to_hash)
])
return make_hash_with_type('S', ",".join(hashes).encode('latin1'))

@make_hash.register(abc.Mapping)
hashes = sorted(_make_int_hash(x, **kwargs) for x in object_to_hash)
return _combine_hash_list(hashes)


@_make_int_hash.register(abc.Mapping)
@_add_type_salt(b'D')
def _(mapping, **kwargs):
hashed_dictionary = {
k: make_hash(v, **kwargs)
for k,v
in mapping.items()
}
return make_hash_with_type(
'D',
make_hash(sorted(hashed_dictionary.items()), **kwargs).encode('latin1')
)

@make_hash.register(numbers.Real)
def _(object_to_hash, **kwargs):
return make_hash_with_type(
'f',
truncate_float64(object_to_hash).tobytes()
)
hashed_items = sorted(_make_int_hash(item, **kwargs) for item in mapping.items())
return _combine_hash_list(hashed_items)
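
For reference, the user-facing behaviour these handlers are meant to give (a hypothetical usage sketch, assuming the branch of this PR is installed; concrete digest values are omitted):

from aiida.common.hashing import make_hash

# Mappings and sets hash independently of iteration order, because the
# per-item hashes are sorted before being combined.
assert make_hash({'a': 1, 'b': [2, 3]}) == make_hash({'b': [2, 3], 'a': 1})

# Sequences stay order-sensitive (barring a hash collision).
assert make_hash([1, 2]) != make_hash([2, 1])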

@make_hash.register(numbers.Complex)
def _(object_to_hash, **kwargs):
return make_hash_with_type(
'c',
','.join([
make_hash(object_to_hash.real, **kwargs),
make_hash(object_to_hash.imag, **kwargs)
]).encode('latin1')
)

@make_hash.register(numbers.Integral)
def _(object_to_hash, **kwargs):
return make_hash_with_type('i', str(object_to_hash).encode('latin1'))

# if the type is unicode in Python 2 or a str in Python 3, convert it
# to a str in Python 2 and bytes in Python 3 using the default Python 2 encoding.
# This should emulate what the hashlib has been doing internally: converting
# unicode strings to latin1 bytes representation before hashing.
@make_hash.register(six.text_type)
@_make_int_hash.register(numbers.Real)
@_add_type_salt(b'f')
def _(object_to_hash, **kwargs):
return make_hash_with_type('s', object_to_hash.encode('latin1'))
return _make_int_hash(truncate_float64(object_to_hash).tobytes(), **kwargs)

# for str in Python 2 and bytes in Python 3, simply forward them to
# the hashing function, without trying to encode them
@make_hash.register(six.binary_type)
def _(object_to_hash, **kwargs):
return make_hash_with_type('s', object_to_hash)

@make_hash.register(bool)
@_make_int_hash.register(numbers.Complex)
@_add_type_salt(b'c')
def _(object_to_hash, **kwargs):
return make_hash_with_type('b', str(object_to_hash).encode('latin1'))
return _combine_hash_list(
[_make_int_hash(object_to_hash.real, **kwargs),
_make_int_hash(object_to_hash.imag, **kwargs)])

@make_hash.register(type(None))
def _(object_to_hash, **kwargs):
return make_hash_with_type('n', str(object_to_hash).encode('latin1'))

@make_hash.register(datetime)
# If the type is unicode in Python 2 or a str in Python 3, convert it
# to a str in Python 2 and bytes in Python 3 using the utf-8 encoding.
@_make_int_hash.register(six.text_type)
@_add_type_salt(b's')
def _(object_to_hash, **kwargs):
return make_hash_with_type('d', str(object_to_hash).encode('latin1'))
return _make_int_hash(object_to_hash.encode('utf-8'), **kwargs)

@make_hash.register(uuid.UUID)

@_make_int_hash.register(six.binary_type)
def _(object_to_hash, **kwargs):
return make_hash_with_type('u', str(object_to_hash).encode('latin1'))
return int(codecs.encode(hashlib.sha224(object_to_hash).digest(), 'hex'), 16)


def _make_int_hash_from_str_repr(object_to_hash):
return _make_int_hash(str(object_to_hash).encode('utf-8'))


@make_hash.register(Folder)
_make_int_hash.register(int)(_add_type_salt(b'i')(_make_int_hash_from_str_repr))
_make_int_hash.register(bool)(_add_type_salt(b'b')(_make_int_hash_from_str_repr))
_make_int_hash.register(type(None))(_add_type_salt(b'n')(_make_int_hash_from_str_repr))
_make_int_hash.register(datetime)(_add_type_salt(b'd')(_make_int_hash_from_str_repr))
_make_int_hash.register(uuid.UUID)(_add_type_salt(b'u')(_make_int_hash_from_str_repr))


@_make_int_hash.register(Folder)
@_add_type_salt(b'pd')
def _(folder, **kwargs):
# make sure file is closed after being read
def _read_file(folder, name):
@@ -297,39 +301,27 @@ def _read_file(folder, name):

ignored_folder_content = kwargs.get('ignored_folder_content', [])

return make_hash_with_type(
'pd',
make_hash([
(
name,
folder.get_subfolder(name) if folder.isdir(name) else
make_hash_with_type('pf', _read_file(folder, name))
)
for name in sorted(folder.get_content_list())
if name not in ignored_folder_content
], **kwargs).encode('latin1')
)

@make_hash.register(np.ndarray)
return _make_int_hash([(name, folder.get_subfolder(name) if folder.isdir(name) else _hash_combine(
_make_int_hash(b'pf', **kwargs), _make_int_hash(_read_file(folder, name), **kwargs)))
for name in sorted(folder.get_content_list())
if name not in ignored_folder_content], **kwargs)


@_make_int_hash.register(np.ndarray)
def _(object_to_hash, **kwargs):
if object_to_hash.dtype == np.float64:
return make_hash_with_type(
'af',
make_hash(truncate_array64(object_to_hash).tobytes(), **kwargs).encode('latin1')
)
return _hash_combine(
_make_int_hash(b'af', **kwargs),
_make_int_hash(truncate_array64(object_to_hash).tobytes(), **kwargs))
elif object_to_hash.dtype == np.complex128:
return make_hash_with_type(
'ac',
make_hash([
object_to_hash.real,
object_to_hash.imag
], **kwargs).encode('latin1')
)
return _hash_combine(
_make_int_hash(b'ac', **kwargs),
_make_int_hash([object_to_hash.real, object_to_hash.imag], **kwargs))
else:
return make_hash_with_type(
'ao',
make_hash(object_to_hash.tobytes(), **kwargs).encode('latin1')
)
return _hash_combine(
_make_int_hash(b'ao', **kwargs),
_make_int_hash(object_to_hash.tobytes(), **kwargs))


def truncate_float64(x, num_bits=4):
mask = ~(2**num_bits - 1)
@@ -338,6 +330,7 @@ def truncate_float64(x, num_bits=4):
truncated_x = masked_int.view(np.float64)
return truncated_x


def truncate_array64(x, num_bits=4):
mask = ~(2**num_bits - 1)
int_array = np.array(x, dtype=np.float64).view(np.int64)
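
As a closing note on the truncation helpers, a standalone sketch of their effect (the body of truncate_float64 is reconstructed here on the assumption that the collapsed lines simply mask the integer bit pattern, consistent with the visible `mask` and `.view` lines): floats that differ only in their lowest mantissa bits are mapped to the same value, so round-off noise does not change the hash.

import numpy as np

def truncate_float64_sketch(x, num_bits=4):
    # Zero the lowest `num_bits` mantissa bits by masking the integer bit pattern.
    mask = ~(2**num_bits - 1)
    int_repr = np.float64(x).view(np.int64)
    masked_int = int_repr & mask
    return masked_int.view(np.float64)

a = 1.0
b = 1.0 + 1e-15  # differs from 1.0 only in the lowest mantissa bits
print(a == b)                                                    # False
print(truncate_float64_sketch(a) == truncate_float64_sketch(b))  # True: both truncate to the same value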