Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate serialize_v0 to new API (as part of serialization layer) #190

Merged
merged 5 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
```python
>>> with open("/tmp/file", "w") as f:
... f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
Expand Down Expand Up @@ -144,8 +144,7 @@ def __init__(
Args:
file: The file to hash. Use `set_file` to reset it.
content_hasher: A `hashing.HashEngine` instance used to compute the
digest of the file. This instance must not be used outside of this
instance. However, it may be pre-initialized with a header.
digest of the file.
start: The file offset to start reading from. Must be valid. Reset
with `set_shard`.
end: The file offset to stop reading at. Must be strictly greater
Expand Down
13 changes: 13 additions & 0 deletions model_signing/manifest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
39 changes: 39 additions & 0 deletions model_signing/manifest/manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for representing a serialized representation of an ML model.

Currently, we only support a manifest that wraps around a digest. But, to
support incremental updates and partial signature verification, we need a
manifest that lists files and their digests. That will come in a future change,
soon.
"""

from abc import ABCMeta
from dataclasses import dataclass

from model_signing.hashing import hashing


class Manifest(metaclass=ABCMeta):
    """Common base type for every manifest that represents a model.

    The class carries no state and declares no methods of its own; concrete
    manifest flavours subclass it so that callers can type against a single
    manifest type.
    """


@dataclass
class DigestManifest(Manifest):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does a model look like as a manifest?
As the digest manifest is defined now, it contains a single digest. Shouldn't it contain a list of digests to represent the model?

Where would we place the signature in this manifest?

Perhaps we could align a bit more with the sigstore bundle type and at least provide a model manifest that contains the digests, the signature and verification material.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Working on that now, hope to have a PR later today/early tomorrow. This PR and the one after it only migrated the old serialize_v0/serialize_v1 versions

"""A manifest that is just a hash."""

digest: hashing.Digest
13 changes: 13 additions & 0 deletions model_signing/serializing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
120 changes: 120 additions & 0 deletions model_signing/serializing/dfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model serializers that build a single hash out of a DFS traversal."""

import base64
import pathlib
from typing import Callable
from typing_extensions import override

from model_signing.hashing import file
from model_signing.hashing import hashing
from model_signing.manifest import manifest
from model_signing.serializing import serializing


def _check_file_or_directory(path: pathlib.Path) -> bool:
    """Tells whether `path` is a regular file or a directory.

    Sockets, pipes and every other operating system concept that is merely
    abstracted as a file are not supported.

    The result is also False when the path is a broken symlink, when it does
    not exist, or when there are permission errors.

    Args:
        path: The path to check.

    Returns:
        True if `path` is a file or a directory, False otherwise.
    """
    if path.is_file():
        return True
    return path.is_dir()
mihaimaruseac marked this conversation as resolved.
Show resolved Hide resolved


def _build_header(*, entry_name: str, entry_type: str) -> bytes:
    """Encodes the name and type of a path entry as a header.

    The header has the shape `<type>.<base64(name)>.`, where both parts are
    UTF-8 encoded before being combined.

    Args:
        entry_name: The name of the entry to build the header for.
        entry_type: The type of the entry (file or directory).

    Returns:
        The header bytes, always terminated by a ".".
    """
    separator = b"."
    type_part = entry_type.encode("utf-8")
    # The name may itself contain a ".", so it is base64-encoded to keep the
    # separator unambiguous.
    name_part = base64.b64encode(entry_name.encode("utf-8"))
    # The header must end with the separator as well.
    return type_part + separator + name_part + separator


class DFSSerializer(serializing.Serializer):
    """Serializer for a model that performs a traversal of the model directory.

    This serializer produces a single hash for the entire model. If the model is
    a file, the hash is the digest of the file. If the model is a directory, we
    perform a depth-first traversal of the directory, hash each individual file
    and aggregate the hashes together.
    """

    def __init__(
        self,
        file_hasher: file.FileHasher,
        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
    ):
        """Initializes an instance to serialize a model with this serializer.

        Args:
            file_hasher: The file hasher used to hash the individual files. It
              is pointed at each file in turn via `set_file`.
            merge_hasher_factory: A callable that returns a
              `hashing.StreamingHashEngine` instance used to merge individual
              file digests to compute an aggregate digest.
        """
        self._file_hasher = file_hasher
        self._merge_hasher_factory = merge_hasher_factory

    @staticmethod
    def _require_file_or_directory(path: pathlib.Path) -> None:
        """Raises `ValueError` unless `path` is a file or a directory."""
        if not _check_file_or_directory(path):
            raise ValueError(
                f"Cannot use '{path}' as file or directory. It could be a"
                " special file, it could be missing, or there might be a"
                " permission issue."
            )

    @override
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        """Serializes the model at `model_path` into a digest manifest.

        Args:
            model_path: The path to the model, either a file or a directory.

        Returns:
            A `manifest.DigestManifest` wrapping the digest of the file, or
            the aggregate digest of the directory traversal.

        Raises:
            ValueError: If `model_path`, or any entry found while traversing
              it, is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add checks to exclude symlinks if desired
        self._require_file_or_directory(model_path)

        if model_path.is_file():
            self._file_hasher.set_file(model_path)
            return manifest.DigestManifest(self._file_hasher.compute())

        return manifest.DigestManifest(self._dfs(model_path))

    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
        """Computes the aggregate digest of `directory`, recursively.

        Children are visited in sorted order. For each child, a header built
        from its name and type is fed to the merge hasher, followed by the
        child's own digest (file digest, or recursive directory digest).

        Args:
            directory: The directory to traverse.

        Returns:
            The digest computed by the merge hasher over all children.

        Raises:
            ValueError: If any entry is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add support for excluded files
        children = sorted(directory.iterdir())

        hasher = self._merge_hasher_factory()
        for child in children:
            self._require_file_or_directory(child)

            is_file = child.is_file()
            # The header goes in before the child's digest is computed.
            hasher.update(
                _build_header(
                    entry_name=child.name,
                    entry_type="file" if is_file else "dir",
                )
            )
            if is_file:
                self._file_hasher.set_file(child)
                digest = self._file_hasher.compute()
            else:
                digest = self._dfs(child)
            hasher.update(digest.digest_value)

        return hasher.compute()
Loading
Loading