Slightly change API and add a few more tests

Signed-off-by: Mihai Maruseac <[email protected]>
sigstore · May 23, 2024 · dcb9ace · dcb9ace
1 parent d5c0642
commit dcb9ace
Show file tree

Hide file tree

Showing 7 changed files with 97 additions and 89 deletions.
diff --git a/model_signing/hashing/file.py b/model_signing/hashing/file.py
@@ -19,8 +19,7 @@
 >>> with open("/tmp/file", "w") as f:
 ...     f.write("abcd")
 >>> hasher = FileHasher("/tmp/file", SHA256())
->>> hasher.compute().hex()
-'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
+>>> hasher.compute()
 >>> hasher.digest_hex
 '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
 ```
@@ -30,8 +29,7 @@
 >>> with open("/tmp/file", "w") as f:
 ...     f.write("0123abcd")
 >>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
->>> hasher.compute().hex()
-'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
+>>> hasher.compute()
 >>> hasher.digest_hex
 '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
 ```
@@ -40,11 +38,8 @@
 ```python
 >>> with open("/tmp/file", "w") as f:
 ...     f.write("abcd")
->>> inner_hasher = SHA256()
->>> inner_hasher.update(b"0123")
->>> hasher = FileHasher("/tmp/file", inner_hasher)
->>> hasher.compute().hex()
-'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
+>>> hasher = FileHasher("/tmp/file", SHA256(b"0123"))
+>>> hasher.compute()
 >>> hasher.digest_hex
 '64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
 ```
@@ -54,8 +49,7 @@
 >>> with open("/tmp/file", "w") as f:
 ...     f.write("0123abcd")
 >>> hasher = FileHasher("/tmp/file", SHA256())
->>> hasher.compute().hex()
-'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
+>>> hasher.compute()
 >>> hasher.digest_hex
 '64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848'
 ```
@@ -77,12 +71,6 @@ class FileHasher(hashing.HashEngine):
     chunk size changes. As such, we can dynamically determine an optimal value
     for the chunk argument.
 
-    Because we only read a file once, it is an error to call the `update` and
-    `finalize` methods of `FileHashEngine`. Instead, hashing is computed in a
-    `compute` method, which, for convenience, returns the digest. The
-    `digest_value` and `digest_hex` properties can still be used to retrieve the
-    digest, after `compute` is called.
-
     The `digest_name()` method MUST record all parameters that influence the
     hash output. For example, if a file is split into shards which are hashed
     separately and the final digest value is computed by aggregating these
@@ -123,8 +111,19 @@ def __init__(
         self._digest = b""
         self._digest_name_override = digest_name_override
 
-    def compute(self) -> bytes:
-        """Computes the digest of the file."""
+    @override
+    @property
+    def digest_name(self) -> str:
+        if self._digest_name_override is not None:
+            return self._digest_name_override
+        return f"file-{self._content_hasher.digest_name}"
+
+    @override
+    def update(self, data: bytes) -> None:
+        raise TypeError("The hash engine does not support calling update().")
+
+    @override
+    def compute(self) -> None:
         if self._chunk_size == 0:
             with open(self._file, "rb") as f:
                 self._content_hasher.update(f.read())
@@ -136,28 +135,8 @@ def compute(self) -> bytes:
                         break
                     self._content_hasher.update(data)
 
-        self._content_hasher.finalize()
+        self._content_hasher.compute()
         self._digest = self._content_hasher.digest_value
-        return self._digest
-
-    @override
-    @property
-    def digest_name(self) -> str:
-        if self._digest_name_override is not None:
-            return self._digest_name_override
-        return f"file-{self._content_hasher.digest_name}"
-
-    @override
-    def update(self, data: bytes) -> None:
-        raise TypeError(
-            "FileHashEngine does not support `update`. Use `compute` instead."
-        )
-
-    @override
-    def finalize(self) -> None:
-        raise TypeError(
-            "FileHashEngine does not support `finalize`. Use `compute` instead."
-        )
 
     @override
     @property
@@ -238,7 +217,7 @@ def __init__(
         self._shard_size = shard_size
 
     @override
-    def compute(self) -> bytes:
+    def compute(self) -> None:
         with open(self._file, "rb") as f:
             f.seek(self._start)
             to_read = self._end - self._start
@@ -253,9 +232,8 @@ def compute(self) -> bytes:
                     to_read -= len(data)
                     self._content_hasher.update(data)
 
-        self._content_hasher.finalize()
+        self._content_hasher.compute()
         self._digest = self._content_hasher.digest_value
-        return self._digest
 
     @override
     @property

diff --git a/model_signing/hashing/file_test.py b/model_signing/hashing/file_test.py
@@ -43,25 +43,22 @@ def sample_file_content_only(tmp_path_factory):
 def expected_digest():
     # To ensure that the expected file digest is always up to date, use the
     # memory hashing and create a fixture for the expected value.
-    hasher = memory.SHA256()
-    hasher.update(_FULL_CONTENT.encode("utf-8"))
-    hasher.finalize()
+    hasher = memory.SHA256(_FULL_CONTENT.encode("utf-8"))
+    hasher.compute()
     return hasher.digest_hex
 
 
 @pytest.fixture(scope="session")
 def expected_header_digest():
-    hasher = memory.SHA256()
-    hasher.update(_HEADER.encode("utf-8"))
-    hasher.finalize()
+    hasher = memory.SHA256(_HEADER.encode("utf-8"))
+    hasher.compute()
     return hasher.digest_hex
 
 
 @pytest.fixture(scope="session")
 def expected_content_digest():
-    hasher = memory.SHA256()
-    hasher.update(_CONTENT.encode("utf-8"))
-    hasher.finalize()
+    hasher = memory.SHA256(_CONTENT.encode("utf-8"))
+    hasher.compute()
     return hasher.digest_hex
 
 
@@ -71,11 +68,6 @@ def test_fails_with_negative_chunk_size(self):
         with pytest.raises(ValueError, match="Chunk size must be non-negative"):
             file.FileHasher("unused", memory.SHA256(), chunk_size=-2)
 
-    def test_compute_and_digest_value_are_the_same(self, sample_file):
-        hasher = file.FileHasher(sample_file, memory.SHA256())
-        computed_digest = hasher.compute()
-        assert computed_digest == hasher.digest_value
-
     def test_hash_of_known_file(self, sample_file, expected_digest):
         hasher = file.FileHasher(sample_file, memory.SHA256())
         hasher.compute()
@@ -91,11 +83,17 @@ def test_hash_of_known_file_small_chunk(self, sample_file, expected_digest):
         hasher.compute()
         assert hasher.digest_hex == expected_digest
 
+    def test_hash_file_twice(self, sample_file):
+        hasher1 = file.FileHasher(sample_file, memory.SHA256())
+        hasher1.compute()
+        hasher2 = file.FileHasher(sample_file, memory.SHA256())
+        hasher2.compute()
+        assert hasher1.digest_value == hasher2.digest_value
+
     def test_hash_of_known_file_with_header(
         self, sample_file_content_only, expected_digest
     ):
-        inner_hasher = memory.SHA256()
-        inner_hasher.update(_HEADER.encode("utf-8"))
+        inner_hasher = memory.SHA256(_HEADER.encode("utf-8"))
         hasher = file.FileHasher(sample_file_content_only, inner_hasher)
         hasher.compute()
         assert hasher.digest_hex == expected_digest

diff --git a/model_signing/hashing/hashing.py b/model_signing/hashing/hashing.py
@@ -29,29 +29,31 @@ class HashEngine(metaclass=ABCMeta):
 
     @abstractmethod
     def update(self, data: bytes) -> None:
-        """Updates the digest based on new bytes.
+        """Appends additional bytes to the data to be hashed.
 
-        Repeated calls are equivalent to a single call with the concatenation of
-        all the arguments. That is, `he.update(a); he.update(b)` is the same as
-        `he.update(a + b)`.
+        Implementations might decide to not support this operation.
 
-        If `finalize` has been called, the behavior of subsequent calls to
-        `update` is implementation specific.
+        Similarly, implementations might decide to not support this operation
+        after `compute` has been called. Or, they might decide that additional
+        calls to `update` after `compute` has been called have no effect.
+
+        Implementations may update internal data on each call to `update`
+        instead of performing the entire digest computation on `compute`.
         """
         pass
 
     @abstractmethod
-    def finalize(self) -> None:
-        """Records that the entire object has been passed to the engine.
+    def compute(self) -> None:
+        """Computes the digest of the entire data passed to the engine.
 
-        This method MUST be called only once, after which only the computed
+        This method should be called only once, after which only the computed
         digest and the name of the algorithm can be accessed. This is to ensure
         that hashing methods that rely on FFI or use aditional resources (e.g.,
         compute the digest on GPU) can properly free allocated resources.
 
-        Calling `finalize` more than once may not result in an error. Instead,
-        in order to allow for implementations that don't need to perform
-        additional testing, the behavior is implementation specific.
+        Calling `compute` more than once may not result in an error. Instead, in
+        order to allow for implementations that don't need to perform additional
+        testing, the behavior is implementation specific.
         """
         pass
 
@@ -76,7 +78,7 @@ def digest_name(self) -> str:
     def digest_value(self) -> bytes:
         """The digest of the data passed to the hash engine.
 
-        The returned value is only valid if `finalize` has been previously
+        The returned value is only valid if `compute` has been previously
         called. For all other cases, the returned value is implementation
         specific.
         """
@@ -89,7 +91,7 @@ def digest_hex(self) -> str:
 
         In general, this method should be used only in tests and for debugging.
 
-        The returned value is only valid if `finalize` has been previously
+        The returned value is only valid if `compute` has been previously
         called. For all other cases, the returned value is implementation
         specific.
         """

diff --git a/model_signing/hashing/memory.py b/model_signing/hashing/memory.py
@@ -21,7 +21,15 @@
 ```python
 >>> hasher = SHA256()
 >>> hasher.update(b"abcd")
->>> hasher.finalize()
+>>> hasher.compute()
+>>> hasher.digest_hex
+'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
+```
+
+Or, passing the data directly in the constructor:
+```python
+>>> hasher = SHA256(b"abcd")
+>>> hasher.compute()
 >>> hasher.digest_hex
 '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
 ```
@@ -36,16 +44,16 @@
 class SHA256(hashing.HashEngine):
     """A wrapper around `hashlib.sha256`."""
 
-    def __init__(self):
-        self._hasher = hashlib.sha256()
+    def __init__(self, initial_data: bytes = b""):
+        self._hasher = hashlib.sha256(initial_data)
 
     @override
     def update(self, data: bytes) -> None:
         self._hasher.update(data)
 
     @override
-    def finalize(self) -> None:
-        pass  # nothing to do, `hashlib` methods don't require finalizers
+    def compute(self) -> None:
+        pass  # nothing to do, digest already computed
 
     @override
     @property

diff --git a/model_signing/hashing/memory_test.py b/model_signing/hashing/memory_test.py
@@ -18,9 +18,8 @@
 class TestPrecomputedDigest:
 
     def test_hash_known_value(self):
-        hasher = memory.SHA256()
-        hasher.update("Test string".encode("utf-8"))
-        hasher.finalize()
+        hasher = memory.SHA256(b"Test string")
+        hasher.compute()
         expected = (
             "a3e49d843df13c2e2a7786f6ecd7e0d184f45d718d1ac1a8a63e570466e489dd"
         )
@@ -33,12 +32,23 @@ def test_hash_update_twice_is_the_same_as_update_with_concatenation(self):
         hasher1 = memory.SHA256()
         hasher1.update(str1.encode("utf-8"))
         hasher1.update(str2.encode("utf-8"))
-        hasher1.finalize()
+        hasher1.compute()
 
         str_all = str1 + str2
         hasher2 = memory.SHA256()
         hasher2.update(str_all.encode("utf-8"))
-        hasher2.finalize()
+        hasher2.compute()
+
+        assert hasher1.digest_hex == hasher2.digest_hex
+        assert hasher1.digest_value == hasher2.digest_value
+
+    def test_hash_update_empty(self):
+        hasher1 = memory.SHA256(b"Test string")
+        hasher1.update(b"")
+        hasher1.compute()
+
+        hasher2 = memory.SHA256(b"Test string")
+        hasher2.compute()
 
         assert hasher1.digest_hex == hasher2.digest_hex
         assert hasher1.digest_value == hasher2.digest_value
diff --git a/model_signing/hashing/precomputed.py b/model_signing/hashing/precomputed.py
@@ -20,7 +20,7 @@
 Example usage:
 ```python
 >>> hasher = PrecomputedDigest("short-hash", b"abcd")
->>> hasher.finalize()
+>>> hasher.compute()
 >>> hasher.digest_hex
 '61626364'
 ```
@@ -41,10 +41,10 @@ class PrecomputedDigest(hashing.HashEngine):
 
     @override
     def update(self, data: bytes) -> None:
-        pass  # nothing to do, hash already computed
+        raise TypeError("The hash engine does not support calling update().")
 
     @override
-    def finalize(self) -> None:
+    def compute(self) -> None:
         pass  # nothing to do, hash already computed
 
     @override

diff --git a/model_signing/hashing/precomputed_test.py b/model_signing/hashing/precomputed_test.py
@@ -17,17 +17,29 @@
 
 class TestPrecomputedDigest:
 
-    def test_finalize_does_not_change_hash(self):
+    def test_compute_does_not_change_hash(self):
         hash_value = b"value"
         hasher = precomputed.PrecomputedDigest("test", hash_value)
         assert hasher.digest_value == hash_value
-        hasher.finalize()
+        hasher.compute()
         assert hasher.digest_value == hash_value
 
     def test_expected_hash_and_hex(self):
         hash_value = b"abcd"
         hash_hex_value = "61626364"
         hasher = precomputed.PrecomputedDigest("test", hash_value)
-        hasher.finalize()
+        hasher.compute()
         assert hasher.digest_value == hash_value
         assert hasher.digest_hex == hash_hex_value
+
+    def test_expected_hash_and_hex_unicode(self):
+        hash_value = "*哈¥эш希".encode("utf-8")
+        hash_hex_value = "2ae59388c2a5d18dd188e5b88c"
+        hasher = precomputed.PrecomputedDigest("test", hash_value)
+        hasher.compute()
+        assert hasher.digest_value == hash_value
+        assert hasher.digest_hex == hash_hex_value
+
+    def test_expected_hash_type(self):
+        hasher = precomputed.PrecomputedDigest("test", b"abcd")
+        assert hasher.digest_name == "test"