diff --git a/model_signing/hashing/file.py b/model_signing/hashing/file.py
index a6e94ec0..ef88407d 100644
--- a/model_signing/hashing/file.py
+++ b/model_signing/hashing/file.py
@@ -28,7 +28,7 @@
 ```python
 >>> with open("/tmp/file", "w") as f:
 ...     f.write("0123abcd")
->>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
+>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
 >>> digest = hasher.compute()
 >>> digest.digest_hex
 '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
@@ -144,8 +144,7 @@ def __init__(
         Args:
             file: The file to hash. Use `set_file` to reset it.
             content_hasher: A `hashing.HashEngine` instance used to compute the
-                digest of the file. This instance must not be used outside of this
-                instance. However, it may be pre-initialized with a header.
+                digest of the file.
             start: The file offset to start reading from. Must be valid. Reset
                 with `set_shard`.
             end: The file offset to start reading from. Must be stricly greater
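For readers unfamiliar with the sharded hashing API touched above, here is a minimal usage sketch built from the doctest and the `Args` documentation in this hunk. The `set_shard(start, end)` signature and the `memory.SHA256` import path are assumptions inferred from how they are referenced elsewhere in this change, not part of the diff itself.

```python
import pathlib

from model_signing.hashing import file
from model_signing.hashing import memory

# Recreate the 8-byte file used in the doctest above.
path = pathlib.Path("/tmp/file")
path.write_text("0123abcd")

# Hash the first half of the file as one shard.
hasher = file.ShardedFileHasher("/tmp/file", memory.SHA256(), start=0, end=4)
first_half = hasher.compute()   # digest covering bytes [0, 4)

# Assumed signature: `set_shard` resets the [start, end) range to hash next.
hasher.set_shard(start=4, end=8)
second_half = hasher.compute()  # digest covering bytes [4, 8)

print(first_half.digest_hex, second_half.digest_hex)
```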
+""" + +from abc import ABCMeta +from dataclasses import dataclass + +from model_signing.hashing import hashing + + +class Manifest(metaclass=ABCMeta): + """Generic manifest file to represent a model.""" + + pass + + +@dataclass +class DigestManifest(Manifest): + """A manifest that is just a hash.""" + + digest: hashing.Digest diff --git a/model_signing/serializing/__init__.py b/model_signing/serializing/__init__.py new file mode 100644 index 00000000..0888a055 --- /dev/null +++ b/model_signing/serializing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_signing/serializing/dfs.py b/model_signing/serializing/dfs.py new file mode 100644 index 00000000..5d42c4c1 --- /dev/null +++ b/model_signing/serializing/dfs.py @@ -0,0 +1,120 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Model serializers that build a single hash out of a DFS traversal.""" + +import base64 +import pathlib +from typing import Callable +from typing_extensions import override + +from model_signing.hashing import file +from model_signing.hashing import hashing +from model_signing.manifest import manifest +from model_signing.serializing import serializing + + +def _check_file_or_directory(path: pathlib.Path) -> bool: + """Checks that the given path is either a file or a directory. + + There is no support for sockets, pipes, or any other operating system + concept abstracted as a file. + + Furthermore, this would return False if the path is a broken symlink, if it + doesn't exists or if there are permission errors. + """ + return path.is_file() or path.is_dir() + + +def _build_header(*, entry_name: str, entry_type: str) -> bytes: + """Builds a header to encode a path with given name and type. + + Args: + entry_name: The name of the entry to build the header for. + entry_type: The type of the entry (file or directory). + """ + encoded_type = entry_type.encode("utf-8") + # Prevent confusion if name has a "." inside by encoding to base64. + encoded_name = base64.b64encode(entry_name.encode("utf-8")) + # Note: make sure to end with a ".". + return b".".join([encoded_type, encoded_name, b""]) + + +class DFSSerializer(serializing.Serializer): + """Serializer for a model that performs a traversal of the model directory. + + This serializer produces a single hash for the entire model. If the model is + a file, the hash is the digest of the file. 
diff --git a/model_signing/serializing/__init__.py b/model_signing/serializing/__init__.py
new file mode 100644
index 00000000..0888a055
--- /dev/null
+++ b/model_signing/serializing/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/model_signing/serializing/dfs.py b/model_signing/serializing/dfs.py
new file mode 100644
index 00000000..5d42c4c1
--- /dev/null
+++ b/model_signing/serializing/dfs.py
@@ -0,0 +1,120 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Model serializers that build a single hash out of a DFS traversal."""
+
+import base64
+import pathlib
+from typing import Callable
+from typing_extensions import override
+
+from model_signing.hashing import file
+from model_signing.hashing import hashing
+from model_signing.manifest import manifest
+from model_signing.serializing import serializing
+
+
+def _check_file_or_directory(path: pathlib.Path) -> bool:
+    """Checks whether the given path is either a file or a directory.
+
+    There is no support for sockets, pipes, or any other operating system
+    concept abstracted as a file.
+
+    Furthermore, this returns False if the path is a broken symlink, if it
+    doesn't exist, or if there are permission errors.
+    """
+    return path.is_file() or path.is_dir()
+
+
+def _build_header(*, entry_name: str, entry_type: str) -> bytes:
+    """Builds a header to encode a path with the given name and type.
+
+    Args:
+        entry_name: The name of the entry to build the header for.
+        entry_type: The type of the entry (file or directory).
+    """
+    encoded_type = entry_type.encode("utf-8")
+    # Prevent confusion if name has a "." inside by encoding to base64.
+    encoded_name = base64.b64encode(entry_name.encode("utf-8"))
+    # Note: make sure to end with a ".".
+    return b".".join([encoded_type, encoded_name, b""])
+
+
+class DFSSerializer(serializing.Serializer):
+    """Serializer for a model that performs a traversal of the model directory.
+
+    This serializer produces a single hash for the entire model. If the model
+    is a file, the hash is the digest of the file. If the model is a directory,
+    we perform a depth-first traversal of the directory, hash each individual
+    file, and aggregate the hashes together.
+    """
+
+    def __init__(
+        self,
+        file_hasher: file.FileHasher,
+        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
+    ):
+        """Initializes an instance to serialize a model into a single digest.
+
+        Args:
+            file_hasher: The hash engine used to hash the individual files.
+            merge_hasher_factory: A callable that returns a
+                `hashing.StreamingHashEngine` instance used to merge individual
+                file digests to compute an aggregate digest.
+        """
+        self._file_hasher = file_hasher
+        self._merge_hasher_factory = merge_hasher_factory
+
+    @override
+    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
+        # TODO(mihaimaruseac): Add checks to exclude symlinks if desired
+        if not _check_file_or_directory(model_path):
+            raise ValueError(
+                f"Cannot use '{model_path}' as file or directory. It could be a"
+                " special file, it could be missing, or there might be a"
+                " permission issue."
+            )
+
+        if model_path.is_file():
+            self._file_hasher.set_file(model_path)
+            return manifest.DigestManifest(self._file_hasher.compute())
+
+        return manifest.DigestManifest(self._dfs(model_path))
+
+    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
+        # TODO(mihaimaruseac): Add support for excluded files
+        children = sorted([x for x in directory.iterdir()])
+
+        hasher = self._merge_hasher_factory()
+        for child in children:
+            if not _check_file_or_directory(child):
+                raise ValueError(
+                    f"Cannot use '{child}' as file or directory. It could be a"
+                    " special file, it could be missing, or there might be a"
+                    " permission issue."
+                )
+
+            if child.is_file():
+                header = _build_header(entry_name=child.name, entry_type="file")
+                hasher.update(header)
+                self._file_hasher.set_file(child)
+                digest = self._file_hasher.compute()
+                hasher.update(digest.digest_value)
+            else:
+                header = _build_header(entry_name=child.name, entry_type="dir")
+                hasher.update(header)
+                digest = self._dfs(child)
+                hasher.update(digest.digest_value)
+
+        return hasher.compute()
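Condensed from the tests that follow, a typical way to drive `DFSSerializer` looks like the sketch below; the `"unused"` placeholder path is immediately replaced via `set_file` for every file visited, and `"path/to/model"` is a hypothetical model location used only for illustration.

```python
import pathlib

from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.serializing import dfs

# The file hasher hashes individual files; the SHA256 class itself serves as
# the factory that produces fresh hashers for merging digests.
file_hasher = file.FileHasher("unused", memory.SHA256())
serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)

model_manifest = serializer.serialize(pathlib.Path("path/to/model"))
print(model_manifest.digest.digest_hex)
```

For reference, `_build_header(entry_name="f0", entry_type="file")` evaluates to `b"file.ZjA=."`; the base64 step keeps a `.` inside an entry name from being confused with the field separators.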
diff --git a/model_signing/serializing/dfs_test.py b/model_signing/serializing/dfs_test.py
new file mode 100644
index 00000000..8ff67c20
--- /dev/null
+++ b/model_signing/serializing/dfs_test.py
@@ -0,0 +1,300 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from model_signing.hashing import file
+from model_signing.hashing import memory
+from model_signing.serializing import dfs
+
+
+# some constants used throughout testing
+_KNOWN_MODEL_TEXT: bytes = b"This is a simple model"
+_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model"
+
+
+# Note: Don't make fixtures with global scope as we are altering the models!
+@pytest.fixture
+def sample_model_file(tmp_path_factory):
+    file = tmp_path_factory.mktemp("model") / "file"
+    file.write_bytes(_KNOWN_MODEL_TEXT)
+    return file
+
+
+@pytest.fixture
+def empty_model_file(tmp_path_factory):
+    file = tmp_path_factory.mktemp("model") / "file"
+    file.write_bytes(b"")
+    return file
+
+
+@pytest.fixture
+def sample_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+
+    for i in range(2):
+        root_dir = model_root / f"d{i}"
+        root_dir.mkdir()
+        for j in range(3):
+            dir_file = root_dir / f"f{i}{j}"
+            dir_file.write_text(f"This is file f{i}{j} in d{i}.")
+
+    for i in range(4):
+        root_file = model_root / f"f{i}"
+        root_file.write_text(f"This is file f{i} in root.")
+
+    return model_root
+
+
+@pytest.fixture
+def empty_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+    return model_root
+
+
+@pytest.fixture
+def deep_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+
+    current = model_root
+    for i in range(5):
+        current = current / f"d{i}"
+        current.mkdir()
+
+    for i in range(4):
+        file = current / f"f{i}"
+        file.write_text(f"This is file f{i}.")
+
+    return model_root
+
+
+class TestDFSSerializer:
+
+    def test_known_file(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+        expected = (
+            "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_file_hash_is_same_as_hash_of_content(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex == digest.digest_hex
+
+    def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+
+        new_name = sample_model_file.with_name("new-file")
+        new_file = sample_model_file.rename(new_name)
+        new_manifest = serializer.serialize(new_file)
+
+        assert manifest == new_manifest
+
+    def test_file_model_hash_changes_if_content_changes(
+        self, sample_model_file
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+
+        sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT)
+        new_manifest = serializer.serialize(sample_model_file)
+
+        assert manifest.digest.algorithm == new_manifest.digest.algorithm
+        assert manifest.digest.digest_value != new_manifest.digest.digest_value
+
+    def test_directory_model_with_only_known_file(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+
+        model = sample_model_file.parent
+        manifest = serializer.serialize(model)
+
+        expected = (
+            "a0865eb7e299e3bca3951e24930c56dcf1533ecff63bda06a9be67906773c628"
+        )
+        assert manifest.digest.digest_hex == expected
+
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex != digest.digest_hex
+    def test_known_folder(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_folder)
+        expected = (
+            "310af4fc4c52bf63cd1687c67076ed3e56bc5480a1b151539e6c550506ae0301"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_folder_model_hash_is_same_if_model_is_moved(
+        self, sample_model_folder
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_folder)
+
+        new_name = sample_model_folder.with_name("new-root")
+        new_model = sample_model_folder.rename(new_name)
+        new_manifest = serializer.serialize(new_model)
+
+        assert manifest == new_manifest
+
+    def test_empty_file(self, empty_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_file)
+        expected = (
+            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_directory_model_with_only_empty_file(self, empty_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_file)
+        model = empty_model_file.parent
+        manifest = serializer.serialize(model)
+        expected = (
+            "8a587b2129fdecfbea38d5152b626299f5994d9b99d36b321aea356f69b38c61"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_empty_folder(self, empty_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_folder)
+        expected = (
+            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_empty_folder_hashes_the_same_as_empty_file(
+        self, empty_model_file, empty_model_folder
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        folder_manifest = serializer.serialize(empty_model_folder)
+        file_manifest = serializer.serialize(empty_model_file)
+        assert (
+            folder_manifest.digest.digest_hex == file_manifest.digest.digest_hex
+        )
+
+    def test_folder_model_empty_entry(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        new_empty_dir = altered_dir / "empty"
+        new_empty_dir.mkdir()
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        new_empty_dir.rmdir()
+
+        new_empty_file = altered_dir / "empty"
+        new_empty_file.write_text("")
+        manifest2 = serializer.serialize(sample_model_folder)
+
+        assert manifest1.digest != manifest2.digest
+    def test_folder_model_rename_file(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_rename = files[0]
+
+        new_name = file_to_rename.with_name("new-file")
+        file_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_rename_dir(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        dir_to_rename = dirs[0]
+
+        new_name = dir_to_rename.with_name("new-dir")
+        dir_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_replace_file_empty_folder(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Replace first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_replace = files[0]
+        file_to_replace.unlink()
+        file_to_replace.mkdir()
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_change_file(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_change = files[0]
+        file_to_change.write_bytes(_KNOWN_MODEL_TEXT)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_deep_folder(self, deep_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(deep_model_folder)
+        expected = (
+            "36eed9389ebbbe15ac15d33c81dabb60ccb7c945ff641d78f59db9aa9dc47ac9"
+        )
+        assert manifest.digest.digest_hex == expected
+""" + +from abc import ABCMeta, abstractmethod +import pathlib + +from model_signing.manifest import manifest + + +class Serializer(metaclass=ABCMeta): + """Generic ML model format serializer.""" + + @abstractmethod + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: + """Serializes the model given by the `model_path` argument.""" + pass