Skip to content

Commit

Permalink
Slightly change API and add a few more tests
Browse files Browse the repository at this point in the history
Signed-off-by: Mihai Maruseac <[email protected]>
  • Loading branch information
mihaimaruseac committed May 23, 2024
1 parent d5c0642 commit dcb9ace
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 89 deletions.
64 changes: 21 additions & 43 deletions model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
>>> with open("/tmp/file", "w") as f:
... f.write("abcd")
>>> hasher = FileHasher("/tmp/file", SHA256())
>>> hasher.compute().hex()
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
>>> hasher.compute()
>>> hasher.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
Expand All @@ -30,8 +29,7 @@
>>> with open("/tmp/file", "w") as f:
... f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> hasher.compute().hex()
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
>>> hasher.compute()
>>> hasher.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
Expand All @@ -40,11 +38,8 @@
```python
>>> with open("/tmp/file", "w") as f:
... f.write("abcd")
>>> inner_hasher = SHA256()
>>> inner_hasher.update(b"0123")
>>> hasher = FileHasher("/tmp/file", inner_hasher)
>>> hasher.compute().hex()
'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
>>> hasher = FileHasher("/tmp/file", SHA256(b"0123"))
>>> hasher.compute()
>>> hasher.digest_hex
'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
```
Expand All @@ -54,8 +49,7 @@
>>> with open("/tmp/file", "w") as f:
... f.write("0123abcd")
>>> hasher = FileHasher("/tmp/file", SHA256())
>>> hasher.compute().hex()
'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
>>> hasher.compute()
>>> hasher.digest_hex
'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
```
Expand All @@ -77,12 +71,6 @@ class FileHasher(hashing.HashEngine):
chunk size changes. As such, we can dynamically determine an optimal value
for the chunk argument.
Because we only read a file once, it is an error to call the `update` and
`finalize` methods of `FileHashEngine`. Instead, hashing is computed in a
`compute` method, which, for convenience, returns the digest. The
`digest_value` and `digest_hex` properties can still be used to retrieve the
digest, after `compute` is called.
The `digest_name()` method MUST record all parameters that influence the
hash output. For example, if a file is split into shards which are hashed
separately and the final digest value is computed by aggregating these
Expand Down Expand Up @@ -123,8 +111,19 @@ def __init__(
self._digest = b""
self._digest_name_override = digest_name_override

def compute(self) -> bytes:
"""Computes the digest of the file."""
@override
@property
def digest_name(self) -> str:
if self._digest_name_override is not None:
return self._digest_name_override
return f"file-{self._content_hasher.digest_name}"

@override
def update(self, data: bytes) -> None:
raise TypeError("The hash engine does not support calling update().")

@override
def compute(self) -> None:
if self._chunk_size == 0:
with open(self._file, "rb") as f:
self._content_hasher.update(f.read())
Expand All @@ -136,28 +135,8 @@ def compute(self) -> bytes:
break
self._content_hasher.update(data)

self._content_hasher.finalize()
self._content_hasher.compute()
self._digest = self._content_hasher.digest_value
return self._digest

@override
@property
def digest_name(self) -> str:
if self._digest_name_override is not None:
return self._digest_name_override
return f"file-{self._content_hasher.digest_name}"

@override
def update(self, data: bytes) -> None:
raise TypeError(
"FileHashEngine does not support `update`. Use `compute` instead."
)

@override
def finalize(self) -> None:
raise TypeError(
"FileHashEngine does not support `finalize`. Use `compute` instead."
)

@override
@property
Expand Down Expand Up @@ -238,7 +217,7 @@ def __init__(
self._shard_size = shard_size

@override
def compute(self) -> bytes:
def compute(self) -> None:
with open(self._file, "rb") as f:
f.seek(self._start)
to_read = self._end - self._start
Expand All @@ -253,9 +232,8 @@ def compute(self) -> bytes:
to_read -= len(data)
self._content_hasher.update(data)

self._content_hasher.finalize()
self._content_hasher.compute()
self._digest = self._content_hasher.digest_value
return self._digest

@override
@property
Expand Down
30 changes: 14 additions & 16 deletions model_signing/hashing/file_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,22 @@ def sample_file_content_only(tmp_path_factory):
def expected_digest():
# To ensure that the expected file digest is always up to date, use the
# memory hashing and create a fixture for the expected value.
hasher = memory.SHA256()
hasher.update(_FULL_CONTENT.encode("utf-8"))
hasher.finalize()
hasher = memory.SHA256(_FULL_CONTENT.encode("utf-8"))
hasher.compute()
return hasher.digest_hex


@pytest.fixture(scope="session")
def expected_header_digest():
hasher = memory.SHA256()
hasher.update(_HEADER.encode("utf-8"))
hasher.finalize()
hasher = memory.SHA256(_HEADER.encode("utf-8"))
hasher.compute()
return hasher.digest_hex


@pytest.fixture(scope="session")
def expected_content_digest():
hasher = memory.SHA256()
hasher.update(_CONTENT.encode("utf-8"))
hasher.finalize()
hasher = memory.SHA256(_CONTENT.encode("utf-8"))
hasher.compute()
return hasher.digest_hex


Expand All @@ -71,11 +68,6 @@ def test_fails_with_negative_chunk_size(self):
with pytest.raises(ValueError, match="Chunk size must be non-negative"):
file.FileHasher("unused", memory.SHA256(), chunk_size=-2)

def test_compute_and_digest_value_are_the_same(self, sample_file):
hasher = file.FileHasher(sample_file, memory.SHA256())
computed_digest = hasher.compute()
assert computed_digest == hasher.digest_value

def test_hash_of_known_file(self, sample_file, expected_digest):
hasher = file.FileHasher(sample_file, memory.SHA256())
hasher.compute()
Expand All @@ -91,11 +83,17 @@ def test_hash_of_known_file_small_chunk(self, sample_file, expected_digest):
hasher.compute()
assert hasher.digest_hex == expected_digest

def test_hash_file_twice(self, sample_file):
hasher1 = file.FileHasher(sample_file, memory.SHA256())
hasher1.compute()
hasher2 = file.FileHasher(sample_file, memory.SHA256())
hasher2.compute()
assert hasher1.digest_value == hasher2.digest_value

def test_hash_of_known_file_with_header(
self, sample_file_content_only, expected_digest
):
inner_hasher = memory.SHA256()
inner_hasher.update(_HEADER.encode("utf-8"))
inner_hasher = memory.SHA256(_HEADER.encode("utf-8"))
hasher = file.FileHasher(sample_file_content_only, inner_hasher)
hasher.compute()
assert hasher.digest_hex == expected_digest
Expand Down
30 changes: 16 additions & 14 deletions model_signing/hashing/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,31 @@ class HashEngine(metaclass=ABCMeta):

@abstractmethod
def update(self, data: bytes) -> None:
"""Updates the digest based on new bytes.
"""Appends additional bytes to the data to be hashed.
Repeated calls are equivalent to a single call with the concatenation of
all the arguments. That is, `he.update(a); he.update(b)` is the same as
`he.update(a + b)`.
Implementations might decide to not support this operation.
If `finalize` has been called, the behavior of subsequent calls to
`update` is implementation specific.
Similarly, implementations might decide to not support this operation
after `compute` has been called. Or, they might decide that additional
calls to `update` after `compute` has been called have no effect.
Implementations may update internal data on each call to `update`
instead of performing the entire digest computation on `compute`.
"""
pass

@abstractmethod
def finalize(self) -> None:
"""Records that the entire object has been passed to the engine.
def compute(self) -> None:
"""Computes the digest of the entire data passed to the engine.
This method MUST be called only once, after which only the computed
This method should be called only once, after which only the computed
digest and the name of the algorithm can be accessed. This is to ensure
that hashing methods that rely on FFI or use aditional resources (e.g.,
compute the digest on GPU) can properly free allocated resources.
Calling `finalize` more than once may not result in an error. Instead,
in order to allow for implementations that don't need to perform
additional testing, the behavior is implementation specific.
Calling `compute` more than once may not result in an error. Instead, in
order to allow for implementations that don't need to perform additional
testing, the behavior is implementation specific.
"""
pass

Expand All @@ -76,7 +78,7 @@ def digest_name(self) -> str:
def digest_value(self) -> bytes:
"""The digest of the data passed to the hash engine.
The returned value is only valid if `finalize` has been previously
The returned value is only valid if `compute` has been previously
called. For all other cases, the returned value is implementation
specific.
"""
Expand All @@ -89,7 +91,7 @@ def digest_hex(self) -> str:
In general, this method should be used only in tests and for debugging.
The returned value is only valid if `finalize` has been previously
The returned value is only valid if `compute` has been previously
called. For all other cases, the returned value is implementation
specific.
"""
Expand Down
18 changes: 13 additions & 5 deletions model_signing/hashing/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@
```python
>>> hasher = SHA256()
>>> hasher.update(b"abcd")
>>> hasher.finalize()
>>> hasher.compute()
>>> hasher.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
Or, passing the data directly in the constructor:
```python
>>> hasher = SHA256(b"abcd")
>>> hasher.compute()
>>> hasher.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
```
Expand All @@ -36,16 +44,16 @@
class SHA256(hashing.HashEngine):
"""A wrapper around `hashlib.sha256`."""

def __init__(self):
self._hasher = hashlib.sha256()
def __init__(self, initial_data: bytes = b""):
self._hasher = hashlib.sha256(initial_data)

@override
def update(self, data: bytes) -> None:
self._hasher.update(data)

@override
def finalize(self) -> None:
pass # nothing to do, `hashlib` methods don't require finalizers
def compute(self) -> None:
pass # nothing to do, digest already computed

@override
@property
Expand Down
20 changes: 15 additions & 5 deletions model_signing/hashing/memory_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
class TestPrecomputedDigest:

def test_hash_known_value(self):
hasher = memory.SHA256()
hasher.update("Test string".encode("utf-8"))
hasher.finalize()
hasher = memory.SHA256(b"Test string")
hasher.compute()
expected = (
"a3e49d843df13c2e2a7786f6ecd7e0d184f45d718d1ac1a8a63e570466e489dd"
)
Expand All @@ -33,12 +32,23 @@ def test_hash_update_twice_is_the_same_as_update_with_concatenation(self):
hasher1 = memory.SHA256()
hasher1.update(str1.encode("utf-8"))
hasher1.update(str2.encode("utf-8"))
hasher1.finalize()
hasher1.compute()

str_all = str1 + str2
hasher2 = memory.SHA256()
hasher2.update(str_all.encode("utf-8"))
hasher2.finalize()
hasher2.compute()

assert hasher1.digest_hex == hasher2.digest_hex
assert hasher1.digest_value == hasher2.digest_value

def test_hash_update_empty(self):
hasher1 = memory.SHA256(b"Test string")
hasher1.update(b"")
hasher1.compute()

hasher2 = memory.SHA256(b"Test string")
hasher2.compute()

assert hasher1.digest_hex == hasher2.digest_hex
assert hasher1.digest_value == hasher2.digest_value
6 changes: 3 additions & 3 deletions model_signing/hashing/precomputed.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
Example usage:
```python
>>> hasher = PrecomputedDigest("short-hash", b"abcd")
>>> hasher.finalize()
>>> hasher.compute()
>>> hasher.digest_hex
'61626364'
```
Expand All @@ -41,10 +41,10 @@ class PrecomputedDigest(hashing.HashEngine):

@override
def update(self, data: bytes) -> None:
pass # nothing to do, hash already computed
raise TypeError("The hash engine does not support calling update().")

@override
def finalize(self) -> None:
def compute(self) -> None:
pass # nothing to do, hash already computed

@override
Expand Down
18 changes: 15 additions & 3 deletions model_signing/hashing/precomputed_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,29 @@

class TestPrecomputedDigest:

def test_finalize_does_not_change_hash(self):
def test_compute_does_not_change_hash(self):
hash_value = b"value"
hasher = precomputed.PrecomputedDigest("test", hash_value)
assert hasher.digest_value == hash_value
hasher.finalize()
hasher.compute()
assert hasher.digest_value == hash_value

def test_expected_hash_and_hex(self):
hash_value = b"abcd"
hash_hex_value = "61626364"
hasher = precomputed.PrecomputedDigest("test", hash_value)
hasher.finalize()
hasher.compute()
assert hasher.digest_value == hash_value
assert hasher.digest_hex == hash_hex_value

def test_expected_hash_and_hex_unicode(self):
hash_value = "*哈¥эш希".encode("utf-8")
hash_hex_value = "2ae59388c2a5d18dd188e5b88c"
hasher = precomputed.PrecomputedDigest("test", hash_value)
hasher.compute()
assert hasher.digest_value == hash_value
assert hasher.digest_hex == hash_hex_value

def test_expected_hash_type(self):
hasher = precomputed.PrecomputedDigest("test", b"abcd")
assert hasher.digest_name == "test"

0 comments on commit dcb9ace

Please sign in to comment.