From dcb9acefa574750a340f164d526307dafdd94159 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 23 May 2024 09:18:22 -0700 Subject: [PATCH] Slightly change API and add a few more tests Signed-off-by: Mihai Maruseac --- model_signing/hashing/file.py | 64 ++++++++--------------- model_signing/hashing/file_test.py | 30 +++++------ model_signing/hashing/hashing.py | 30 ++++++----- model_signing/hashing/memory.py | 18 +++++-- model_signing/hashing/memory_test.py | 20 +++++-- model_signing/hashing/precomputed.py | 6 +-- model_signing/hashing/precomputed_test.py | 18 +++++-- 7 files changed, 97 insertions(+), 89 deletions(-) diff --git a/model_signing/hashing/file.py b/model_signing/hashing/file.py index e34b9b68..4f5d0a1e 100644 --- a/model_signing/hashing/file.py +++ b/model_signing/hashing/file.py @@ -19,8 +19,7 @@ >>> with open("/tmp/file", "w") as f: ... f.write("abcd") >>> hasher = FileHasher("/tmp/file", SHA256()) ->>> hasher.compute().hex() -'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' +>>> hasher.compute() >>> hasher.digest_hex '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' ``` @@ -30,8 +29,7 @@ >>> with open("/tmp/file", "w") as f: ... f.write("0123abcd") >>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8) ->>> hasher.compute().hex() -'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' +>>> hasher.compute() >>> hasher.digest_hex '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' ``` @@ -40,11 +38,8 @@ ```python >>> with open("/tmp/file", "w") as f: ... f.write("abcd") ->>> inner_hasher = SHA256() ->>> inner_hasher.update(b"0123") ->>> hasher = FileHasher("/tmp/file", inner_hasher) ->>> hasher.compute().hex() -'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' +>>> hasher = FileHasher("/tmp/file", SHA256(b"0123")) +>>> hasher.compute() >>> hasher.digest_hex '64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' ``` @@ -54,8 +49,7 @@ >>> with open("/tmp/file", "w") as f: ... f.write("0123abcd") >>> hasher = FileHasher("/tmp/file", SHA256()) ->>> hasher.compute().hex() -'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' +>>> hasher.compute() >>> hasher.digest_hex '64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' ``` @@ -77,12 +71,6 @@ class FileHasher(hashing.HashEngine): chunk size changes. As such, we can dynamically determine an optimal value for the chunk argument. - Because we only read a file once, it is an error to call the `update` and - `finalize` methods of `FileHashEngine`. Instead, hashing is computed in a - `compute` method, which, for convenience, returns the digest. The - `digest_value` and `digest_hex` properties can still be used to retrieve the - digest, after `compute` is called. - The `digest_name()` method MUST record all parameters that influence the hash output. For example, if a file is split into shards which are hashed separately and the final digest value is computed by aggregating these @@ -123,8 +111,19 @@ def __init__( self._digest = b"" self._digest_name_override = digest_name_override - def compute(self) -> bytes: - """Computes the digest of the file.""" + @override + @property + def digest_name(self) -> str: + if self._digest_name_override is not None: + return self._digest_name_override + return f"file-{self._content_hasher.digest_name}" + + @override + def update(self, data: bytes) -> None: + raise TypeError("The hash engine does not support calling update().") + + @override + def compute(self) -> None: if self._chunk_size == 0: with open(self._file, "rb") as f: self._content_hasher.update(f.read()) @@ -136,28 +135,8 @@ def compute(self) -> bytes: break self._content_hasher.update(data) - self._content_hasher.finalize() + self._content_hasher.compute() self._digest = self._content_hasher.digest_value - return self._digest - - @override - @property - def digest_name(self) -> str: - if self._digest_name_override is not None: - return self._digest_name_override - return f"file-{self._content_hasher.digest_name}" - - @override - def update(self, data: bytes) -> None: - raise TypeError( - "FileHashEngine does not support `update`. Use `compute` instead." - ) - - @override - def finalize(self) -> None: - raise TypeError( - "FileHashEngine does not support `finalize`. Use `compute` instead." - ) @override @property @@ -238,7 +217,7 @@ def __init__( self._shard_size = shard_size @override - def compute(self) -> bytes: + def compute(self) -> None: with open(self._file, "rb") as f: f.seek(self._start) to_read = self._end - self._start @@ -253,9 +232,8 @@ def compute(self) -> bytes: to_read -= len(data) self._content_hasher.update(data) - self._content_hasher.finalize() + self._content_hasher.compute() self._digest = self._content_hasher.digest_value - return self._digest @override @property diff --git a/model_signing/hashing/file_test.py b/model_signing/hashing/file_test.py index 1f2778eb..ac3c2ede 100644 --- a/model_signing/hashing/file_test.py +++ b/model_signing/hashing/file_test.py @@ -43,25 +43,22 @@ def sample_file_content_only(tmp_path_factory): def expected_digest(): # To ensure that the expected file digest is always up to date, use the # memory hashing and create a fixture for the expected value. - hasher = memory.SHA256() - hasher.update(_FULL_CONTENT.encode("utf-8")) - hasher.finalize() + hasher = memory.SHA256(_FULL_CONTENT.encode("utf-8")) + hasher.compute() return hasher.digest_hex @pytest.fixture(scope="session") def expected_header_digest(): - hasher = memory.SHA256() - hasher.update(_HEADER.encode("utf-8")) - hasher.finalize() + hasher = memory.SHA256(_HEADER.encode("utf-8")) + hasher.compute() return hasher.digest_hex @pytest.fixture(scope="session") def expected_content_digest(): - hasher = memory.SHA256() - hasher.update(_CONTENT.encode("utf-8")) - hasher.finalize() + hasher = memory.SHA256(_CONTENT.encode("utf-8")) + hasher.compute() return hasher.digest_hex @@ -71,11 +68,6 @@ def test_fails_with_negative_chunk_size(self): with pytest.raises(ValueError, match="Chunk size must be non-negative"): file.FileHasher("unused", memory.SHA256(), chunk_size=-2) - def test_compute_and_digest_value_are_the_same(self, sample_file): - hasher = file.FileHasher(sample_file, memory.SHA256()) - computed_digest = hasher.compute() - assert computed_digest == hasher.digest_value - def test_hash_of_known_file(self, sample_file, expected_digest): hasher = file.FileHasher(sample_file, memory.SHA256()) hasher.compute() @@ -91,11 +83,17 @@ def test_hash_of_known_file_small_chunk(self, sample_file, expected_digest): hasher.compute() assert hasher.digest_hex == expected_digest + def test_hash_file_twice(self, sample_file): + hasher1 = file.FileHasher(sample_file, memory.SHA256()) + hasher1.compute() + hasher2 = file.FileHasher(sample_file, memory.SHA256()) + hasher2.compute() + assert hasher1.digest_value == hasher2.digest_value + def test_hash_of_known_file_with_header( self, sample_file_content_only, expected_digest ): - inner_hasher = memory.SHA256() - inner_hasher.update(_HEADER.encode("utf-8")) + inner_hasher = memory.SHA256(_HEADER.encode("utf-8")) hasher = file.FileHasher(sample_file_content_only, inner_hasher) hasher.compute() assert hasher.digest_hex == expected_digest diff --git a/model_signing/hashing/hashing.py b/model_signing/hashing/hashing.py index f7034825..5fe768e7 100644 --- a/model_signing/hashing/hashing.py +++ b/model_signing/hashing/hashing.py @@ -29,29 +29,31 @@ class HashEngine(metaclass=ABCMeta): @abstractmethod def update(self, data: bytes) -> None: - """Updates the digest based on new bytes. + """Appends additional bytes to the data to be hashed. - Repeated calls are equivalent to a single call with the concatenation of - all the arguments. That is, `he.update(a); he.update(b)` is the same as - `he.update(a + b)`. + Implementations might decide to not support this operation. - If `finalize` has been called, the behavior of subsequent calls to - `update` is implementation specific. + Similarly, implementations might decide to not support this operation + after `compute` has been called. Or, they might decide that additional + calls to `update` after `compute` has been called have no effect. + + Implementations may update internal data on each call to `update` + instead of performing the entire digest computation on `compute`. """ pass @abstractmethod - def finalize(self) -> None: - """Records that the entire object has been passed to the engine. + def compute(self) -> None: + """Computes the digest of the entire data passed to the engine. - This method MUST be called only once, after which only the computed + This method should be called only once, after which only the computed digest and the name of the algorithm can be accessed. This is to ensure that hashing methods that rely on FFI or use aditional resources (e.g., compute the digest on GPU) can properly free allocated resources. - Calling `finalize` more than once may not result in an error. Instead, - in order to allow for implementations that don't need to perform - additional testing, the behavior is implementation specific. + Calling `compute` more than once may not result in an error. Instead, in + order to allow for implementations that don't need to perform additional + testing, the behavior is implementation specific. """ pass @@ -76,7 +78,7 @@ def digest_name(self) -> str: def digest_value(self) -> bytes: """The digest of the data passed to the hash engine. - The returned value is only valid if `finalize` has been previously + The returned value is only valid if `compute` has been previously called. For all other cases, the returned value is implementation specific. """ @@ -89,7 +91,7 @@ def digest_hex(self) -> str: In general, this method should be used only in tests and for debugging. - The returned value is only valid if `finalize` has been previously + The returned value is only valid if `compute` has been previously called. For all other cases, the returned value is implementation specific. """ diff --git a/model_signing/hashing/memory.py b/model_signing/hashing/memory.py index 333c9d9d..df8893b7 100644 --- a/model_signing/hashing/memory.py +++ b/model_signing/hashing/memory.py @@ -21,7 +21,15 @@ ```python >>> hasher = SHA256() >>> hasher.update(b"abcd") ->>> hasher.finalize() +>>> hasher.compute() +>>> hasher.digest_hex +'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' +``` + +Or, passing the data directly in the constructor: +```python +>>> hasher = SHA256(b"abcd") +>>> hasher.compute() >>> hasher.digest_hex '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' ``` @@ -36,16 +44,16 @@ class SHA256(hashing.HashEngine): """A wrapper around `hashlib.sha256`.""" - def __init__(self): - self._hasher = hashlib.sha256() + def __init__(self, initial_data: bytes = b""): + self._hasher = hashlib.sha256(initial_data) @override def update(self, data: bytes) -> None: self._hasher.update(data) @override - def finalize(self) -> None: - pass # nothing to do, `hashlib` methods don't require finalizers + def compute(self) -> None: + pass # nothing to do, digest already computed @override @property diff --git a/model_signing/hashing/memory_test.py b/model_signing/hashing/memory_test.py index 5b4c8acc..0e47fcff 100644 --- a/model_signing/hashing/memory_test.py +++ b/model_signing/hashing/memory_test.py @@ -18,9 +18,8 @@ class TestPrecomputedDigest: def test_hash_known_value(self): - hasher = memory.SHA256() - hasher.update("Test string".encode("utf-8")) - hasher.finalize() + hasher = memory.SHA256(b"Test string") + hasher.compute() expected = ( "a3e49d843df13c2e2a7786f6ecd7e0d184f45d718d1ac1a8a63e570466e489dd" ) @@ -33,12 +32,23 @@ def test_hash_update_twice_is_the_same_as_update_with_concatenation(self): hasher1 = memory.SHA256() hasher1.update(str1.encode("utf-8")) hasher1.update(str2.encode("utf-8")) - hasher1.finalize() + hasher1.compute() str_all = str1 + str2 hasher2 = memory.SHA256() hasher2.update(str_all.encode("utf-8")) - hasher2.finalize() + hasher2.compute() + + assert hasher1.digest_hex == hasher2.digest_hex + assert hasher1.digest_value == hasher2.digest_value + + def test_hash_update_empty(self): + hasher1 = memory.SHA256(b"Test string") + hasher1.update(b"") + hasher1.compute() + + hasher2 = memory.SHA256(b"Test string") + hasher2.compute() assert hasher1.digest_hex == hasher2.digest_hex assert hasher1.digest_value == hasher2.digest_value diff --git a/model_signing/hashing/precomputed.py b/model_signing/hashing/precomputed.py index 3afb6929..c5f98e2c 100644 --- a/model_signing/hashing/precomputed.py +++ b/model_signing/hashing/precomputed.py @@ -20,7 +20,7 @@ Example usage: ```python >>> hasher = PrecomputedDigest("short-hash", b"abcd") ->>> hasher.finalize() +>>> hasher.compute() >>> hasher.digest_hex '61626364' ``` @@ -41,10 +41,10 @@ class PrecomputedDigest(hashing.HashEngine): @override def update(self, data: bytes) -> None: - pass # nothing to do, hash already computed + raise TypeError("The hash engine does not support calling update().") @override - def finalize(self) -> None: + def compute(self) -> None: pass # nothing to do, hash already computed @override diff --git a/model_signing/hashing/precomputed_test.py b/model_signing/hashing/precomputed_test.py index c267e48d..70f55107 100644 --- a/model_signing/hashing/precomputed_test.py +++ b/model_signing/hashing/precomputed_test.py @@ -17,17 +17,29 @@ class TestPrecomputedDigest: - def test_finalize_does_not_change_hash(self): + def test_compute_does_not_change_hash(self): hash_value = b"value" hasher = precomputed.PrecomputedDigest("test", hash_value) assert hasher.digest_value == hash_value - hasher.finalize() + hasher.compute() assert hasher.digest_value == hash_value def test_expected_hash_and_hex(self): hash_value = b"abcd" hash_hex_value = "61626364" hasher = precomputed.PrecomputedDigest("test", hash_value) - hasher.finalize() + hasher.compute() assert hasher.digest_value == hash_value assert hasher.digest_hex == hash_hex_value + + def test_expected_hash_and_hex_unicode(self): + hash_value = "*哈¥эш希".encode("utf-8") + hash_hex_value = "2ae59388c2a5d18dd188e5b88c" + hasher = precomputed.PrecomputedDigest("test", hash_value) + hasher.compute() + assert hasher.digest_value == hash_value + assert hasher.digest_hex == hash_hex_value + + def test_expected_hash_type(self): + hasher = precomputed.PrecomputedDigest("test", b"abcd") + assert hasher.digest_name == "test"