From 81c5a7b60ca9988f63ba1e071d833e224570e01c Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Wed, 17 Apr 2024 19:07:07 +0000 Subject: [PATCH 1/6] Improve metadata management for Zarr Sink --- .../zarr/large_image_source_zarr/__init__.py | 169 ++++++++++++++++-- 1 file changed, 153 insertions(+), 16 deletions(-) diff --git a/sources/zarr/large_image_source_zarr/__init__.py b/sources/zarr/large_image_source_zarr/__init__.py index 1de7d443f..dfeeb2203 100644 --- a/sources/zarr/large_image_source_zarr/__init__.py +++ b/sources/zarr/large_image_source_zarr/__init__.py @@ -75,6 +75,7 @@ def __init__(self, path, **kwargs): def _initOpen(self, **kwargs): self._largeImagePath = str(self._getLargeImagePath()) self._zarr = None + self._editable = False if not os.path.isfile(self._largeImagePath) and '//:' not in self._largeImagePath: raise TileSourceFileNotFoundError(self._largeImagePath) from None try: @@ -122,6 +123,9 @@ def _initNew(self, path, **kwargs): self._framecount = 0 self._mm_x = 0 self._mm_y = 0 + self._channelNames = [] + self._channelColors = [] + self._imageDescription = None self._levels = [] def __del__(self): @@ -308,6 +312,8 @@ def _validateZarr(self): Validate that we can read tiles from the zarr parent group in self._zarr. Set up the appropriate class variables. """ + if self._editable: + self._writeInternalMetadata() found = self._scanZarrGroup(self._zarr) if found['best'] is None: msg = 'No data array that can be used.' @@ -425,6 +431,8 @@ def getInternalMetadata(self, **kwargs): :returns: a dictionary of data or None. """ + if self._editable: + self._writeInternalMetadata() result = {} result['zarr'] = { 'base': self._zarr.attrs.asdict(), @@ -561,8 +569,6 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): :param kwargs: start locations for any additional axes. Note that ``level`` is a reserved word and not permitted for an axis name. """ - # TODO: improve band bookkeeping - self._checkEditable() store_path = str(kwargs.pop('level', 0)) placement = { @@ -625,20 +631,8 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): path=store_path, ) - # If base data changed, update large_image attributes and OME metadata + # If base data changed, update large_image attributes if store_path == '0': - self._zarr.attrs.update({ - 'multiscales': [{ - 'version': '0.5-dev', - 'axes': [{ - 'name': a, - 'type': 'space' if a in ['x', 'y'] else 'other', - } for a in axes], - 'datasets': [{'path': 0}], - }], - 'omero': {'version': '0.5-dev'}, - }) - self._dtype = tile.dtype self._bandCount = new_dims.get(axes[-1]) # last axis is assumed to be bands self.sizeX = new_dims.get('x') @@ -653,6 +647,92 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): self.levels = int(max(1, math.ceil(math.log(max( self.sizeX / self.tileWidth, self.sizeY / self.tileHeight)) / math.log(2)) + 1)) + def _writeInternalMetadata(self): + self._checkEditable() + with self._addLock: + name = str(self._tempdir.name).split('/')[-1] + arrays = dict(self._zarr.arrays()) + channel_axis = self._axes.get('s') or self._axes.get('c') + datasets = [] + axes = [] + channels = [] + rdefs = {'model': 'color' if len(self._channelColors) else 'greyscale'} + sorted_axes = [a[0] for a in sorted(self._axes.items(), key=lambda item: item[1])] + for arr_name in arrays: + level = int(arr_name) + scale = [1.0 for a in sorted_axes] + scale[self._axes.get('x')] = self._mm_x * (2 ** level) + scale[self._axes.get('y')] = self._mm_y * (2 ** level) + dataset_metadata = { + 'path': arr_name, + 'coordinateTransformations': [{ + 'type': 'scale', + 'scale': scale, + }], + } + datasets.append(dataset_metadata) + for a in sorted_axes: + axis_metadata = {'name': a} + if a in ['x', 'y']: + axis_metadata['type'] = 'space' + axis_metadata['unit'] = 'millimeter' + elif a in ['s', 'c']: + axis_metadata['type'] = 'channel' + elif a == 't': + rdefs['defaultT'] = 0 + elif a == 'z': + rdefs['defaultZ'] = 0 + axes.append(axis_metadata) + if channel_axis and len(arrays) > 0: + base_array = list(arrays.values())[0] + base_shape = base_array.shape + for c in range(base_shape[channel_axis]): + channel_metadata = { + 'active': True, + 'coefficient': 1, + 'color': 'FFFFFF', + 'family': 'linear', + 'inverted': False, + 'label': f'Band {c + 1}', + } + channel_data = base_array[..., c] + channel_min = np.min(channel_data) + channel_max = np.max(channel_data) + channel_metadata['window'] = { + 'end': channel_max, + 'max': channel_max, + 'min': channel_min, + 'start': channel_min, + } + if len(self._channelNames) > c: + channel_metadata['label'] = self._channelNames[c] + if len(self._channelColors) > c: + channel_metadata['color'] = self._channelColors[c] + channels.append(channel_metadata) + # Guidelines from https://ngff.openmicroscopy.org/latest/ + self._zarr.attrs.update({ + 'multiscales': [{ + 'version': '0.5', + 'name': name, + 'axes': axes, + 'datasets': datasets, + 'metadata': { + 'description': self._imageDescription or '', + 'kwargs': { + 'multichannel': (channel_axis is not None), + }, + }, + }], + 'omero': { + 'id': 1, + 'version': '0.5', + 'name': name, + 'channels': channels, + 'rdefs': rdefs, + }, + 'bioformats2raw.layout': 3, + }) + @property def crop(self): """ @@ -678,6 +758,59 @@ def crop(self, value): raise TileSourceError(msg) self._crop = (x, y, w, h) + @property + def mm_x(self): + return self._mm_x + + @mm_x.setter + def mm_x(self, value): + self._checkEditable() + value = float(value) if value is not None else None + if value is not None and value <= 0: + msg = 'mm_x must be positive or None' + raise TileSourceError(msg) + self._mm_x = value + + @property + def mm_y(self): + return self._mm_y + + @mm_y.setter + def mm_y(self, value): + self._checkEditable() + value = float(value) if value is not None else None + if value is not None and value <= 0: + msg = 'mm_y must be positive or None' + raise TileSourceError(msg) + self._mm_y = value + + @property + def imageDescription(self): + return self._imageDescription + + @imageDescription.setter + def imageDescription(self, description): + self._checkEditable() + self._imageDescription = description + + @property + def channelNames(self): + return self._channelNames + + @channelNames.setter + def channelNames(self, names): + self._checkEditable() + self._channelNames = names + + @property + def channelColors(self): + return self._channelColors + + @channelColors.setter + def channelColors(self, colors): + self._checkEditable() + self._channelColors = colors + def _generateDownsampledLevels(self, resample_method): self._checkEditable() current_arrays = dict(self._zarr.arrays()) @@ -701,6 +834,7 @@ def _generateDownsampledLevels(self, resample_method): width=4096 + tile_overlap['x'], height=4096 + tile_overlap['y'], ) + sorted_axes = [a[0] for a in sorted(self._axes.items(), key=lambda item: item[1])] for level in range(1, self.levels): scale_factor = 2 ** level iterator_output = dict( @@ -735,7 +869,7 @@ def _generateDownsampledLevels(self, resample_method): x=x, y=y, **frame_position, - axes=list(self._axes.keys()), + axes=sorted_axes, level=level, ) self._validateZarr() # refresh self._levels before continuing @@ -795,6 +929,8 @@ def write( **frame_position, ) + source._writeInternalMetadata() + if suffix in ['.zarr', '.db', '.sqlite', '.zip']: if resample is None: resample = ( @@ -803,6 +939,7 @@ def write( else ResampleMethod.NP_NEAREST ) source._generateDownsampledLevels(resample) + source._writeInternalMetadata() # rewrite with new level datasets if suffix == '.zarr': shutil.copytree(source._tempdir.name, path) From d27858df30904b0ec5ff531bdf2ff3321e786e70 Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Wed, 17 Apr 2024 19:07:29 +0000 Subject: [PATCH 2/6] Add `testMetadata` to pytests --- test/test_sink.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/test/test_sink.py b/test/test_sink.py index ab5cc5712..7d1cee4dc 100644 --- a/test/test_sink.py +++ b/test/test_sink.py @@ -1,9 +1,12 @@ +import math + import large_image_source_test import large_image_source_zarr import numpy as np import pytest import large_image +from large_image.constants import NEW_IMAGE_PATH_FLAG from large_image.tilesource.resample import ResampleMethod TMP_DIR = 'tmp/zarr_sink' @@ -333,3 +336,94 @@ def testCropToTiff(tmp_path): source = large_image.open(output_file) assert source.sizeX == 1800 assert source.sizeY == 1825 + + +def testMetadata(tmp_path): + output_file = tmp_path / 'test.db' + sink = large_image_source_zarr.new() + + description = 'random data image for testing internal metadata' + channel_names = ['red', 'green', 'blue', 'IR', 'UV'] + channel_colors = ['FF0000', '00FF00', '0000FF', 'FFFF00', 'FF00FF'] + num_frames = 4 + num_bands = 5 + for z in range(num_frames): + sink.addTile(np.random.random((1000, 1000, num_bands)), 0, 0, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 950, 0, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 0, 900, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 950, 900, z=z) + + sink.imageDescription = description + sink.channelNames = channel_names + sink.channelColors = channel_colors + sink.mm_x = 5 + sink.mm_y = 5 + + sink.write(output_file) + written = large_image_source_zarr.open(output_file) + assert written._is_ome + + int_metadata = written.getInternalMetadata() + base_metadata = int_metadata.get('zarr', {}).get('base') + assert base_metadata is not None + assert base_metadata['bioformats2raw.layout'] == 3 + + multiscales = base_metadata.get('multiscales') + assert multiscales is not None + assert len(multiscales) == 1 + assert multiscales[0].get('version') == '0.5' + assert NEW_IMAGE_PATH_FLAG in multiscales[0].get('name') + + axes = multiscales[0].get('axes') + assert axes is not None + assert len(axes) == 4 + assert {'name': 'z'} in axes + assert {'name': 'y', 'type': 'space', 'unit': 'millimeter'} in axes + assert {'name': 'x', 'type': 'space', 'unit': 'millimeter'} in axes + assert {'name': 's', 'type': 'channel'} in axes + + datasets = multiscales[0].get('datasets') + assert len(datasets) == 3 + for i, d in enumerate(datasets): + assert d.get('path') == str(i) + coord_transforms = d.get('coordinateTransformations') + assert coord_transforms is not None + assert len(coord_transforms) == 1 + assert coord_transforms[0].get('type') == 'scale' + assert coord_transforms[0].get('scale') == [ + 1.0, 5 * 2 ** i, 5 * 2 ** i, 1.0, + ] + + nested_metadata = multiscales[0].get('metadata') + assert nested_metadata is not None + assert nested_metadata.get('description') == description + assert nested_metadata.get('kwargs', {}).get('multichannel') + + omero = base_metadata.get('omero') + assert omero is not None + assert omero.get('id') == 1 + assert omero.get('version') == '0.5' + assert NEW_IMAGE_PATH_FLAG in omero.get('name') + + channels = omero.get('channels') + assert channels is not None + assert len(channels) == num_bands + for i, c in enumerate(channels): + assert c.get('active') + assert c.get('coefficient') == 1 + assert c.get('color') == channel_colors[i] + assert c.get('family') == 'linear' + assert not c.get('inverted') + assert c.get('label') == channel_names[i] + window = c.get('window') + assert window is not None + # max should be nearly 1 and min should be nearly 0 + assert math.ceil(window.get('end')) == 1 + assert math.ceil(window.get('max')) == 1 + assert math.floor(window.get('start')) == 0 + assert math.floor(window.get('min')) == 0 + + rdefs = omero.get('rdefs') + assert rdefs is not None + assert rdefs.get('model') == 'color' + assert rdefs.get('defaultZ') == 0 From 0d3f325862955bf397f3f58824d95bdf43ab96f6 Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Thu, 18 Apr 2024 14:08:40 +0000 Subject: [PATCH 3/6] Implement `addAssociatedImage` method and add a test for its behavior --- .../zarr/large_image_source_zarr/__init__.py | 18 +++++++++ test/test_sink.py | 40 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/sources/zarr/large_image_source_zarr/__init__.py b/sources/zarr/large_image_source_zarr/__init__.py index dfeeb2203..cd364410c 100644 --- a/sources/zarr/large_image_source_zarr/__init__.py +++ b/sources/zarr/large_image_source_zarr/__init__.py @@ -127,6 +127,7 @@ def _initNew(self, path, **kwargs): self._channelColors = [] self._imageDescription = None self._levels = [] + self._associatedImages = [] def __del__(self): if not hasattr(self, '_derivedSource'): @@ -647,6 +648,23 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): self.levels = int(max(1, math.ceil(math.log(max( self.sizeX / self.tileWidth, self.sizeY / self.tileHeight)) / math.log(2)) + 1)) + def addAssociatedImage(self, image, overwrite=False): + """ + Add an associated image to this source. + + :param image: a numpy array, PIL Image, or a binary string + with an image. The numpy array can have 2 or 3 dimensions. + :param overwrite: If true, allow overwriting an existing associated image with the same label + """ + data, _ = _imageToNumpy(image) + with self._addLock: + # Each associated image should be in its own group + num_existing = len(self.getAssociatedImagesList()) + name = f'associated_{num_existing + 1}' + group = self._zarr.require_group(name, overwrite=overwrite) + arr = zarr.array(data, overwrite=overwrite, store=self._zarr_store, path=f'{name}/image') + self._associatedImages.append((group, arr)) + def _writeInternalMetadata(self): self._checkEditable() with self._addLock: diff --git a/test/test_sink.py b/test/test_sink.py index 7d1cee4dc..7f51f35b2 100644 --- a/test/test_sink.py +++ b/test/test_sink.py @@ -1,4 +1,5 @@ import math +from PIL import Image import large_image_source_test import large_image_source_zarr @@ -427,3 +428,42 @@ def testMetadata(tmp_path): assert rdefs is not None assert rdefs.get('model') == 'color' assert rdefs.get('defaultZ') == 0 + +def testAddAssociatedImages(tmp_path): + output_file = tmp_path / 'test.db' + sink = large_image_source_zarr.new() + + num_frames = 4 + num_bands = 5 + for z in range(num_frames): + sink.addTile(np.random.random((1000, 1000, num_bands)), 0, 0, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 950, 0, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 0, 900, z=z) + sink.addTile(np.random.random((1000, 1000, num_bands)), 950, 900, z=z) + + image_sizes = [ + (200, 300, 3), + (400, 500, 3), + (600, 700, 3) + ] + + for image_size in image_sizes: + image_data = (np.random.random(image_size) * 255).astype(np.uint8) + img = Image.fromarray(image_data) + sink.addAssociatedImage(img) + + original_image_list = sink.getAssociatedImagesList() + + sink.write(output_file) + written = large_image_source_zarr.open(output_file) + written_image_list = written.getAssociatedImagesList() + + for image_list in [original_image_list, written_image_list]: + assert len(image_list) == len(image_sizes) + for i, image_name in enumerate(image_list): + retrieved = sink._getAssociatedImage(image_name) + expected_size = image_sizes[i] + assert retrieved is not None + assert isinstance(retrieved, Image.Image) + # PIL Image size doesn't include bands and swaps x & y + assert retrieved.size == (expected_size[1], expected_size[0]) \ No newline at end of file From 77744de2168e2fa5df407cd91d5c490ff96d7483 Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Thu, 18 Apr 2024 14:11:12 +0000 Subject: [PATCH 4/6] Reformat with `tox -e format` --- sources/zarr/large_image_source_zarr/__init__.py | 8 ++++++-- test/test_sink.py | 9 +++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sources/zarr/large_image_source_zarr/__init__.py b/sources/zarr/large_image_source_zarr/__init__.py index cd364410c..90c3fb388 100644 --- a/sources/zarr/large_image_source_zarr/__init__.py +++ b/sources/zarr/large_image_source_zarr/__init__.py @@ -662,9 +662,13 @@ def addAssociatedImage(self, image, overwrite=False): num_existing = len(self.getAssociatedImagesList()) name = f'associated_{num_existing + 1}' group = self._zarr.require_group(name, overwrite=overwrite) - arr = zarr.array(data, overwrite=overwrite, store=self._zarr_store, path=f'{name}/image') + arr = zarr.array( + data, + overwrite=overwrite, + store=self._zarr_store, + path=f'{name}/image') self._associatedImages.append((group, arr)) - + def _writeInternalMetadata(self): self._checkEditable() with self._addLock: diff --git a/test/test_sink.py b/test/test_sink.py index 7f51f35b2..a947f2753 100644 --- a/test/test_sink.py +++ b/test/test_sink.py @@ -1,10 +1,10 @@ import math -from PIL import Image import large_image_source_test import large_image_source_zarr import numpy as np import pytest +from PIL import Image import large_image from large_image.constants import NEW_IMAGE_PATH_FLAG @@ -429,6 +429,7 @@ def testMetadata(tmp_path): assert rdefs.get('model') == 'color' assert rdefs.get('defaultZ') == 0 + def testAddAssociatedImages(tmp_path): output_file = tmp_path / 'test.db' sink = large_image_source_zarr.new() @@ -444,7 +445,7 @@ def testAddAssociatedImages(tmp_path): image_sizes = [ (200, 300, 3), (400, 500, 3), - (600, 700, 3) + (600, 700, 3), ] for image_size in image_sizes: @@ -453,7 +454,7 @@ def testAddAssociatedImages(tmp_path): sink.addAssociatedImage(img) original_image_list = sink.getAssociatedImagesList() - + sink.write(output_file) written = large_image_source_zarr.open(output_file) written_image_list = written.getAssociatedImagesList() @@ -466,4 +467,4 @@ def testAddAssociatedImages(tmp_path): assert retrieved is not None assert isinstance(retrieved, Image.Image) # PIL Image size doesn't include bands and swaps x & y - assert retrieved.size == (expected_size[1], expected_size[0]) \ No newline at end of file + assert retrieved.size == (expected_size[1], expected_size[0]) From 0b86db74e541104543e238a60207d6ce3ca25cb0 Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Thu, 18 Apr 2024 14:15:24 +0000 Subject: [PATCH 5/6] Remove unnecessary `overwrite` arg from `addAssociatedImage` --- sources/zarr/large_image_source_zarr/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sources/zarr/large_image_source_zarr/__init__.py b/sources/zarr/large_image_source_zarr/__init__.py index 90c3fb388..89c64b3a5 100644 --- a/sources/zarr/large_image_source_zarr/__init__.py +++ b/sources/zarr/large_image_source_zarr/__init__.py @@ -648,23 +648,21 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): self.levels = int(max(1, math.ceil(math.log(max( self.sizeX / self.tileWidth, self.sizeY / self.tileHeight)) / math.log(2)) + 1)) - def addAssociatedImage(self, image, overwrite=False): + def addAssociatedImage(self, image): """ Add an associated image to this source. :param image: a numpy array, PIL Image, or a binary string with an image. The numpy array can have 2 or 3 dimensions. - :param overwrite: If true, allow overwriting an existing associated image with the same label """ data, _ = _imageToNumpy(image) with self._addLock: # Each associated image should be in its own group num_existing = len(self.getAssociatedImagesList()) name = f'associated_{num_existing + 1}' - group = self._zarr.require_group(name, overwrite=overwrite) + group = self._zarr.require_group(name) arr = zarr.array( data, - overwrite=overwrite, store=self._zarr_store, path=f'{name}/image') self._associatedImages.append((group, arr)) From 8976f3df1790217e1b45ca6466907488ff7c3cc0 Mon Sep 17 00:00:00 2001 From: Anne Haley Date: Mon, 3 Jun 2024 13:35:36 +0000 Subject: [PATCH 6/6] Store associated images as dict with image keys --- .../zarr/large_image_source_zarr/__init__.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/sources/zarr/large_image_source_zarr/__init__.py b/sources/zarr/large_image_source_zarr/__init__.py index 89c64b3a5..adb0abffa 100644 --- a/sources/zarr/large_image_source_zarr/__init__.py +++ b/sources/zarr/large_image_source_zarr/__init__.py @@ -127,7 +127,7 @@ def _initNew(self, path, **kwargs): self._channelColors = [] self._imageDescription = None self._levels = [] - self._associatedImages = [] + self._associatedImages = {} def __del__(self): if not hasattr(self, '_derivedSource'): @@ -327,8 +327,10 @@ def _validateZarr(self): msg = 'Conflicting xy axis data.' raise TileSourceError(msg) self._channels = found['channels'] - self._associatedImages = [ - (g, a) for g, a in found['associated'] if not any(g is gb for gb, _ in self._series)] + self._associatedImages = { + g.name.replace('/', ''): (g, a) + for g, a in found['associated'] if not any(g is gb for gb, _ in self._series) + } self.sizeX = baseArray.shape[self._axes['x']] self.sizeY = baseArray.shape[self._axes['y']] self.tileWidth = ( @@ -450,7 +452,7 @@ def getAssociatedImagesList(self): :return: the list of image keys. """ - return [f'image_{idx}' for idx in range(len(self._associatedImages))] + return list(self._associatedImages.keys()) def _getAssociatedImage(self, imageKey): """ @@ -459,15 +461,9 @@ def _getAssociatedImage(self, imageKey): :param imageKey: the key of the associated image. :return: the image in PIL format or None. """ - if not imageKey.startswith('image_'): - return - try: - idx = int(imageKey[6:]) - except Exception: - return - if idx < 0 or idx >= len(self._associatedImages): + if imageKey not in self._associatedImages: return - group, arr = self._associatedImages[idx] + group, arr = self._associatedImages[imageKey] axes = self._getGeneralAxes(arr) trans = [idx for idx in range(len(arr.shape)) if idx not in axes.values()] + [axes['y'], axes['x']] @@ -648,7 +644,7 @@ def addTile(self, tile, x=0, y=0, mask=None, axes=None, **kwargs): self.levels = int(max(1, math.ceil(math.log(max( self.sizeX / self.tileWidth, self.sizeY / self.tileHeight)) / math.log(2)) + 1)) - def addAssociatedImage(self, image): + def addAssociatedImage(self, image, imageKey=None): """ Add an associated image to this source. @@ -657,15 +653,17 @@ def addAssociatedImage(self, image): """ data, _ = _imageToNumpy(image) with self._addLock: - # Each associated image should be in its own group - num_existing = len(self.getAssociatedImagesList()) - name = f'associated_{num_existing + 1}' - group = self._zarr.require_group(name) + if imageKey is None: + # Each associated image should be in its own group + num_existing = len(self.getAssociatedImagesList()) + imageKey = f'image_{num_existing + 1}' + group = self._zarr.require_group(imageKey) arr = zarr.array( data, store=self._zarr_store, - path=f'{name}/image') - self._associatedImages.append((group, arr)) + path=f'{imageKey}/image', + ) + self._associatedImages[imageKey] = (group, arr) def _writeInternalMetadata(self): self._checkEditable()