Skip to content

Commit

Permalink
unzip -> extract
Browse files Browse the repository at this point in the history
  • Loading branch information
FynnBe committed Nov 4, 2024
1 parent 2c7145f commit dd75554
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 87 deletions.
127 changes: 110 additions & 17 deletions bioimageio/spec/_internal/io.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
from __future__ import annotations

# pyright: reportUnnecessaryTypeIgnoreComment=warning
import hashlib
import sys
import warnings
import zipfile
from abc import abstractmethod
from collections.abc import Mapping as MappingAbc

# pyright: reportUnnecessaryTypeIgnoreComment=warning
from contextlib import nullcontext
from dataclasses import dataclass
from datetime import date as _date
from datetime import datetime as _datetime
from functools import lru_cache
from math import ceil
from pathlib import Path, PurePath
from tempfile import mktemp
from typing import (
Any,
Dict,
Expand Down Expand Up @@ -518,9 +521,16 @@ class OpenedBioimageioYaml:


@dataclass
class DownloadedFile:
path: Union[FilePath, zipfile.Path]
original_root: Union[AbsoluteDirectory, RootHttpUrl, ZipFile]
class LocalFile:
path: FilePath
original_root: Union[AbsoluteDirectory, RootHttpUrl, FileInZip]
original_file_name: FileName


@dataclass
class FileInZip:
    """A file that lives inside a zip archive (not extracted to disk)."""

    # location of the member inside the archive
    path: zipfile.Path
    # the archive that contains the member
    original_root: ZipFile
    # file name recorded for the member
    original_file_name: FileName


Expand Down Expand Up @@ -578,29 +588,84 @@ def reset(self): ...
def close(self): ...


# def open(source: Union[PermissiveFileSource, FileDescr], /, progressbar: Union[Progressbar, bool, None] = None,
# **kwargs: Unpack[HashKwargs]):
# if isinstance(source, FileDescr):
# if kwargs.get("sha256") is None:
# kwargs["sha256"] = source.sha256
# source = source.source
def extract(
    source: Union[FilePath, ZipFile, zipfile.Path],
    folder: Optional[DirectoryPath] = None,
    overwrite: bool = False,
) -> DirectoryPath:
    """Extract a zip archive, or a single member of it, into a folder.

    Args:
        source: Path to a zip file, an already opened `ZipFile`, or a
            `zipfile.Path` pointing at one archive member. In the last case
            only that member is extracted.
        folder: Output directory. If omitted, it is derived from the archive
            file name by appending ".unzip" to its suffix; an archive without
            a file name gets a temporary path.
        overwrite: Extract even if the output already exists.

    Returns:
        The directory the content was extracted to. This differs from
        `folder` if an existing, incomplete extraction was found there and
        `overwrite` is False; then a new, renamed sibling folder is used.
    """
    extract_member = None
    if isinstance(source, zipfile.Path):
        extract_member = source.at
        source = source.root

    if isinstance(source, ZipFile):
        # The caller owns the opened archive; do not close it on exit.
        zip_context = nullcontext(source)
        if folder is None:
            if source.filename is None:
                # NOTE(review): `mktemp` is deprecated in the stdlib docs;
                # kept because the path must not exist before extraction.
                folder = Path(mktemp())
            else:
                zip_path = Path(source.filename)
                folder = zip_path.with_suffix(zip_path.suffix + ".unzip")
    else:
        zip_context = ZipFile(source, "r")
        if folder is None:
            folder = source.with_suffix(source.suffix + ".unzip")

    if overwrite and folder.exists():
        warnings.warn(f"Overwriting existing unzipped archive at {folder}")

    with zip_context as f:
        if extract_member is not None:
            extracted_file_path = folder / extract_member
            if extracted_file_path.exists() and not overwrite:
                warnings.warn(f"Found unzipped {extracted_file_path}.")
            else:
                _ = f.extract(extract_member, folder)

            return folder

        if overwrite or not folder.exists():
            f.extractall(folder)
            return folder

        # Compare files only and walk the folder recursively: archive member
        # names are full relative paths (e.g. "sub/file.txt"), so a top-level
        # listing would report every nested member as missing.
        found_content = {
            p.relative_to(folder).as_posix()
            for p in folder.rglob("*")
            if p.is_file()
        }
        expected_content = {
            info.filename for info in f.filelist if not info.is_dir()
        }
        expected_missing = expected_content - found_content
        if not expected_missing:
            warnings.warn(
                f"Found unzipped archive with all expected files at {folder}."
            )
            return folder

        # Derive a sibling folder name by adding/incrementing a leading
        # counter in the last "_"-separated part of the folder name.
        parts = folder.name.split("_")
        nr, *suffixes = parts[-1].split(".")
        if nr.isdecimal():
            nr = str(int(nr) + 1)
        else:
            nr = f"1.{nr}"

        parts[-1] = ".".join([nr, *suffixes])
        out_path_new = folder.with_name("_".join(parts))
        warnings.warn(
            f"Unzipped archive at {folder} is missing expected files"
            + f" {expected_missing}."
            + f" Unzipping to {out_path_new} instead to avoid overwriting."
        )
        return extract(f, out_path_new, overwrite=overwrite)


def resolve(
source: Union[PermissiveFileSource, FileDescr, zipfile.Path],
/,
progressbar: Union[Progressbar, bool, None] = None,
**kwargs: Unpack[HashKwargs],
) -> DownloadedFile:
"""download `source` URL (or pass local file path)"""
) -> Union[LocalFile, FileInZip]:
"""Resolve file `source` (download if needed)"""
if isinstance(source, FileDescr):
return source.download()
elif isinstance(source, zipfile.Path):
zip_root = source.root
assert isinstance(zip_root, ZipFile)
return DownloadedFile(
return FileInZip(
source,
zip_root,
extract_file_name(source),
Expand All @@ -610,7 +675,7 @@ def download(
if isinstance(strict_source, RelativeFilePath):
strict_source = strict_source.absolute()
if isinstance(strict_source, zipfile.Path):
return DownloadedFile(
return FileInZip(
strict_source, strict_source.root, extract_file_name(strict_source)
)

Expand Down Expand Up @@ -650,13 +715,41 @@ def download(
local_source = Path(_ls).absolute()
root = strict_source.parent

return DownloadedFile(
return LocalFile(
local_source,
root,
extract_file_name(strict_source),
)


download = resolve


def resolve_and_extract(
    source: Union[PermissiveFileSource, FileDescr, zipfile.Path],
    /,
    progressbar: Union[Progressbar, bool, None] = None,
    **kwargs: Unpack[HashKwargs],
) -> LocalFile:
    """Resolve `source` within current ValidationContext,
    download if needed and
    extract file if within zip archive.

    note: If source points to a zip file it is not extracted

    Args:
        source: The file source to resolve.
        progressbar: Passed on to `resolve`.
        **kwargs: Hash keyword arguments, passed on to `resolve`.

    Returns:
        A `LocalFile` whose `path` points at a file on disk. If the resolved
        file is a member of a zip archive, `original_root` is the `FileInZip`
        it was extracted from.
    """
    # Forward the hash kwargs as well; otherwise they are silently ignored.
    local = resolve(source, progressbar=progressbar, **kwargs)
    if isinstance(local, LocalFile):
        return local

    # `extract` returns the output folder, not the extracted file itself;
    # the member is found below that folder at its in-archive path.
    folder = extract(local.path)

    return LocalFile(
        folder / local.path.at,
        original_root=local,
        original_file_name=local.original_file_name,
    )


class LightHttpFileDescr(Node):
"""http source with sha256 value (minimal validation)"""

Expand Down
77 changes: 7 additions & 70 deletions bioimageio/spec/_internal/io_utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
import io
import warnings
import zipfile
from contextlib import nullcontext
from difflib import get_close_matches
from pathlib import Path
from tempfile import mktemp
from types import MappingProxyType
from typing import (
IO,
Any,
Dict,
Mapping,
Optional,
Union,
cast,
)
Expand All @@ -21,7 +18,7 @@
import requests
from loguru import logger
from numpy.typing import NDArray
from pydantic import DirectoryPath, FilePath, NewPath, RootModel
from pydantic import FilePath, NewPath, RootModel
from ruyaml import YAML
from typing_extensions import Unpack

Expand All @@ -30,6 +27,7 @@
BIOIMAGEIO_YAML,
BioimageioYamlContent,
FileDescr,
FileInZip,
HashKwargs,
LightHttpFileDescr,
OpenedBioimageioYaml,
Expand Down Expand Up @@ -146,7 +144,11 @@ def open_bioimageio_yaml(
root = downloaded.original_root

content = _sanitize_bioimageio_yaml(read_yaml(local_source))
return OpenedBioimageioYaml(content, root, downloaded.original_file_name)
return OpenedBioimageioYaml(
content,
root.original_root if isinstance(root, FileInZip) else root,
downloaded.original_file_name,
)


_IdMap = RootModel[Dict[str, LightHttpFileDescr]]
Expand Down Expand Up @@ -182,71 +184,6 @@ def get_id_map() -> Mapping[str, LightHttpFileDescr]:
return MappingProxyType(ret)


def unzip(
    zip_file: Union[FilePath, ZipFile, zipfile.Path],
    out_path: Optional[DirectoryPath] = None,
    overwrite: bool = False,
) -> DirectoryPath:
    """Unzip an archive, or a single member of it, into a folder.

    Args:
        zip_file: Path to a zip file, an already opened `ZipFile`, or a
            `zipfile.Path` pointing at one archive member. In the last case
            only that member is extracted.
        out_path: Output directory. If omitted, it is derived from the archive
            file name by appending ".unzip" to its suffix; an archive without
            a file name gets a temporary path.
        overwrite: Extract even if the output already exists.

    Returns:
        The directory the content was extracted to. This differs from
        `out_path` if an existing, incomplete extraction was found there and
        `overwrite` is False; then a new, renamed sibling folder is used.
    """
    extract_member = None
    if isinstance(zip_file, zipfile.Path):
        extract_member = zip_file.at
        zip_file = zip_file.root

    if isinstance(zip_file, ZipFile):
        # The caller owns the opened archive; do not close it on exit.
        zip_context = nullcontext(zip_file)
        if out_path is None:
            if zip_file.filename is None:
                out_path = Path(mktemp())
            else:
                zip_path = Path(zip_file.filename)
                out_path = zip_path.with_suffix(zip_path.suffix + ".unzip")
    else:
        zip_context = ZipFile(zip_file, "r")
        if out_path is None:
            out_path = zip_file.with_suffix(zip_file.suffix + ".unzip")

    if overwrite and out_path.exists():
        warnings.warn(f"Overwriting existing unzipped archive at {out_path}")

    with zip_context as f:
        if extract_member is not None:
            extracted_file_path = out_path / extract_member
            if extracted_file_path.exists() and not overwrite:
                warnings.warn(f"Found unzipped {extracted_file_path}.")
            else:
                _ = f.extract(extract_member, out_path)

            return out_path

        if overwrite or not out_path.exists():
            f.extractall(out_path)
            return out_path

        # Compare files only and walk the folder recursively: archive member
        # names are full relative paths (e.g. "sub/file.txt"), so a top-level
        # listing would report every nested member as missing.
        found_content = {
            p.relative_to(out_path).as_posix()
            for p in out_path.rglob("*")
            if p.is_file()
        }
        expected_content = {
            info.filename for info in f.filelist if not info.is_dir()
        }
        expected_missing = expected_content - found_content
        if not expected_missing:
            warnings.warn(
                f"Found unzipped archive with all expected files at {out_path}."
            )
            return out_path

        # Derive a sibling folder name by adding/incrementing a leading
        # counter in the last "_"-separated part of the folder name.
        parts = out_path.name.split("_")
        nr, *suffixes = parts[-1].split(".")
        if nr.isdecimal():
            nr = str(int(nr) + 1)
        else:
            nr = f"1.{nr}"

        parts[-1] = ".".join([nr, *suffixes])
        out_path_new = out_path.with_name("_".join(parts))
        warnings.warn(
            f"Unzipped archive at {out_path} is missing expected files"
            + f" {expected_missing}."
            + f" Unzipping to {out_path_new} instead to avoid overwriting."
        )
        return unzip(f, out_path_new, overwrite=overwrite)


def write_content_to_zip(
content: Mapping[FileName, Union[str, FilePath, zipfile.Path, Dict[Any, Any]]],
zip: zipfile.ZipFile,
Expand Down

0 comments on commit dd75554

Please sign in to comment.