Merge pull request #1524 from girder/plottable-data

Get all plottable data
girder · Jul 10, 2024 · 87034d0 · 87034d0
2 parents d8d094c + b20bd99
commit 87034d0
Show file tree

Hide file tree

Showing 3 changed files with 327 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Show a loading spinner on the image display in geojs in girder ([#1559](../../pull/1559))
 - Better handle images that are composed of a folder and an item ([#1561](../../pull/1561))
 - Allow specifying which sources are checked with canReadList ([#1562](../../pull/1562))
+- Added endpoints to get plottable data related to annotations ([#1524](../../pull/1524))
 
 ### Bug Fixes
 - Fix a compositing error in transformed multi source images ([#1560](../../pull/1560))

diff --git a/girder_annotation/girder_large_image_annotation/rest/annotation.py b/girder_annotation/girder_large_image_annotation/rest/annotation.py
@@ -35,7 +35,7 @@
 from girder.utility import JsonEncoder
 from girder.utility.progress import setResponseTimeLimit
 
-from .. import constants
+from .. import constants, utils
 from ..models.annotation import Annotation, AnnotationSchema
 from ..models.annotationelement import Annotationelement
 
@@ -65,6 +65,8 @@ def __init__(self):
         self.route('GET', ('item', ':id'), self.getItemAnnotations)
         self.route('POST', ('item', ':id'), self.createItemAnnotations)
         self.route('DELETE', ('item', ':id'), self.deleteItemAnnotations)
+        self.route('POST', ('item', ':id', 'plot', 'list'), self.getItemPlottableElements)
+        self.route('POST', ('item', ':id', 'plot', 'data'), self.getItemPlottableData)
         self.route('GET', ('folder', ':id'), self.returnFolderAnnotations)
         self.route('GET', ('folder', ':id', 'present'), self.existFolderAnnotations)
         self.route('GET', ('folder', ':id', 'create'), self.canCreateFolderAnnotations)
@@ -617,6 +619,45 @@ def deleteItemAnnotations(self, item):
                 count += 1
         return count
 
+    @autoDescribeRoute(
+        Description('Get a list of plottable data related to an item and its annotations.')
+        .modelParam('id', model=Item, level=AccessType.READ)
+        # The underscores are escaped the same way as on the plot/data
+        # endpoint; presumably this keeps a Markdown-rendering API browser
+        # from showing __all__ as bold text.
+        .jsonParam('annotations', 'A JSON list of annotation IDs that should '
+                   'be included.  An entry of \\__all__ will include all '
+                   'annotations.', paramType='formData', requireArray=True,
+                   required=False)
+        .errorResponse('ID was invalid.')
+        .errorResponse('Read access was denied for the item.', 403),
+    )
+    @access.public(cookie=True, scope=TokenScope.DATA_READ)
+    def getItemPlottableElements(self, item, annotations):
+        """
+        List the plottable columns available for an item.
+
+        :param item: the item record loaded from the route id.
+        :param annotations: None or a list of annotation ids; a first entry
+            of __all__ selects every annotation on the item.
+        :returns: the column descriptions from PlottableItemData.columns.
+        """
+        user = self.getCurrentUser()
+        data = utils.PlottableItemData(user, item, annotations=annotations)
+        return data.columns
+
+    @autoDescribeRoute(
+        Description('Get plottable data related to an item and its annotations.')
+        .modelParam('id', model=Item, level=AccessType.READ)
+        .param('adjacentItems', 'Whether to include adjacent item data.',
+               required=False, default=True, dataType='boolean')
+        .param('keys', 'A comma separated list of data keys to retrieve (not json).',
+               required=True)
+        .param('requiredKeys', 'A comma separated list of data keys that must '
+               'be non null in all response rows (not json).', required=False)
+        .jsonParam('annotations', 'A JSON list of annotation IDs that should '
+                   'be included.  An entry of \\__all__ will include all '
+                   'annotations.', paramType='formData', requireArray=True,
+                   required=False)
+        .errorResponse('ID was invalid.')
+        .errorResponse('Read access was denied for the item.', 403),
+    )
+    @access.public(cookie=True, scope=TokenScope.DATA_READ)
+    def getItemPlottableData(self, item, keys, adjacentItems, annotations, requiredKeys):
+        """
+        Return the requested plottable columns and their row values.
+
+        :param item: the item record loaded from the route id.
+        :param keys: comma separated column keys to return.
+        :param adjacentItems: whether other items in the folder contribute.
+        :param annotations: None or a list of annotation ids.
+        :param requiredKeys: comma separated column keys that must be non
+            null in every returned row, or None.
+        :returns: the result of PlottableItemData.data.
+        """
+        plottable = utils.PlottableItemData(
+            self.getCurrentUser(), item,
+            annotations=annotations, adjacentItems=adjacentItems)
+        return plottable.data(keys, requiredKeys)
+
     def getFolderAnnotations(self, id, recurse, user, limit=False, offset=False, sort=False,
                              sortDir=False, count=False):
 

diff --git a/girder_annotation/girder_large_image_annotation/utils/__init__.py b/girder_annotation/girder_large_image_annotation/utils/__init__.py
@@ -1,5 +1,12 @@
 import json
 import math
+import re
+
+from bson.objectid import ObjectId
+
+from girder import logger
+from girder.constants import AccessType
+from girder.models.folder import Folder
 
 
 class AnnotationGeoJSON:
@@ -334,3 +341,280 @@ def isGeoJSON(annotation):
         'Feature', 'FeatureCollection', 'GeometryCollection', 'Point',
         'LineString', 'Polygon', 'MultiPoint', 'MultiLineString',
         'MultiPolygon'}
+
+
+class PlottableItemData:
+    maxItems = 1000
+    maxAnnotationElements = 10000
+    maxDistinct = 20
+    allowedTypes = (str, bool, int, float)
+
+    def __init__(self, user, item, annotations=None, adjacentItems=False):
+        """
+        Collect the items and annotations that plottable data is drawn from.
+
+        :param user: the user used when loading the item's folder.
+        :param item: the primary item record.
+        :param annotations: None, a list of annotation ids, or __all__.  If
+            adjacent items are included, the most recent annotation with the
+            same name on each of them is included as well.
+        :param adjacentItems: if True, also include data from the other items
+            in the same folder.
+        """
+        # Lazily filled caches; see the columns property and data().
+        self._data = None
+        self._datacolumns = None
+        self._columns = None
+        self.user = user
+        self._findItems(item, adjacentItems)
+        self._findAnnotations(annotations)
+
+    def _findItems(self, item, adjacentItems=False):
+        """
+        Record the primary item, its folder, and optionally its siblings.
+
+        Sets self.item, self.folder, and self.items; self.items always starts
+        with the primary item and holds at most maxItems entries.
+
+        :param item: the primary item record.
+        :param adjacentItems: if True, append the other items of the folder.
+        """
+        self._columns = None
+        self.item = item
+        self.folder = Folder().load(id=item['folderId'], user=self.user, level=AccessType.READ)
+        self.items = [item]
+        # TODO: find csv/xlsx/dataframe items in the folder, exclude them from
+        # the item list but include them in general
+        if not adjacentItems:
+            return
+        for sibling in Folder().childItems(self.folder):
+            if len(self.items) >= self.maxItems:
+                break
+            if sibling['_id'] == item['_id']:
+                continue
+            # TODO: skip the sibling if it doesn't have appropriate metadata
+            # or annotations.  If skipping, add it to a list to check whether
+            # it is a dataframe.
+            self.items.append(sibling)
+
+    def _findAnnotations(self, annotations):
+        """
+        Locate the annotations that contribute plottable data.
+
+        Sets self.annotations to None when nothing was requested or nothing
+        matched.  Otherwise it is a list with one entry per item in
+        self.items: the first entry is the list of matching annotations on
+        the primary item, and each later entry is a parallel list holding,
+        for every primary annotation, the first same-named annotation found
+        on that adjacent item (highest _version first) or None.
+
+        :param annotations: None, a list of annotation ids, a comma-separated
+            string of ids, or a list whose first entry is __all__.
+        """
+        from ..models.annotation import Annotation
+
+        self._columns = None
+        if isinstance(annotations, str):
+            annotations = annotations.split(',')
+        self.annotations = None
+        if annotations:
+            query = {'_active': {'$ne': False}, 'itemId': self.item['_id']}
+            if annotations[0] != '__all__':
+                query['_id'] = {'$in': [ObjectId(annotId) for annotId in annotations]}
+            primary = list(Annotation().find(query, limit=0, sort=[('_version', -1)]))
+            if primary:
+                self.annotations = [primary]
+        # Find adjacent annotations.  This checks self.annotations rather than
+        # the request: if annotations were requested but none matched, there
+        # is no primary list to index.
+        if self.annotations and len(self.items) > 1:
+            names = {}
+            for idx, annot in enumerate(self.annotations[0]):
+                # Only the first (most recent) annotation of each name is used
+                # to match annotations on adjacent items.
+                names.setdefault(annot['annotation']['name'], idx)
+            for adjitem in self.items[1:]:
+                query = {'_active': {'$ne': False}, 'itemId': adjitem['_id']}
+                annotList = [None] * len(self.annotations[0])
+                for annot in Annotation().find(query, limit=0, sort=[('_version', -1)]):
+                    idx = names.get(annot['annotation']['name'])
+                    if idx is not None and annotList[idx] is None:
+                        annotList[idx] = annot
+                self.annotations.append(annotList)
+
+    def _addColumn(self, columns, fullkey, title, root, key, source):
+        """
+        Register a data location for a column, creating the column if needed.
+
+        :param columns: dictionary of column records keyed by full key; this
+            is modified in place.
+        :param fullkey: the unique key of the column.
+        :param title: a human readable title, used when creating the column.
+        :param root: the name of the data array the value comes from.
+        :param key: the key within each record of the data array.
+        :param source: the kind of object the data array is stored on.
+        :returns: a tuple of (root, source, index) where index counts the
+            locations with the same root and source that were registered on
+            this column up to and including this one.
+        """
+        location = [root, key, source]
+        if fullkey not in columns:
+            columns[fullkey] = {
+                'key': fullkey,
+                'type': 'number',
+                'where': [location],
+                'title': title,
+                'count': 0,
+                'distinct': set(),
+                'min': None,
+                'max': None,
+            }
+            return (root, source, 0)
+        locations = columns[fullkey]['where']
+        if location not in locations:
+            locations.append(location)
+        position = -1
+        for known in locations:
+            if known[0] == root and known[2] == source:
+                position += 1
+            if tuple(known) == (root, key, source):
+                break
+        return (root, source, position)
+
+    def _columnKey(self, source, root, key):
+        """
+        Get the column key and title for a data location.
+
+        Results are memoized on the instance.  Keys whose names match one of
+        the known patterns are mapped onto the shared item name or bounding
+        box columns; all others get a lower-cased root.key.source key.
+
+        :param source: the kind of object the data array is stored on.
+        :param root: the name of the data array.
+        :param key: the key within each record of the data array.
+        :returns: a tuple of (fullkey, title).
+        """
+        try:
+            cache = self._columnKeyCache
+        except AttributeError:
+            cache = self._columnKeyCache = {}
+        lookup = (source, root, key)
+        if lookup in cache:
+            return cache[lookup]
+        # (pattern, column key, column title); the first match wins.
+        aliases = (
+            (r'(?i)(item|image)_(id|name)$', '_0_item.name', 'Item Name'),
+            (r'(?i)(low|min)(_|)x', '_bbox.x0', 'Bounding Box Low X'),
+            (r'(?i)(low|min)(_|)y', '_bbox.y0', 'Bounding Box Low Y'),
+            (r'(?i)(high|max)(_|)x', '_bbox.x1', 'Bounding Box High X'),
+            (r'(?i)(high|max)(_|)y', '_bbox.y1', 'Bounding Box High Y'),
+        )
+        resolved = (f'{root}.{key}.{source}'.lower(), f'{root} {key}')
+        for pattern, aliaskey, aliastitle in aliases:
+            if re.match(pattern, key):
+                resolved = (aliaskey, aliastitle)
+                break
+        cache[lookup] = resolved
+        return resolved
+
+    def _scanColumnByKey(self, result, key, entry, where=0):
+        """
+        Fold the values of one key from a list of records into a column.
+
+        Updates the column's count, distinct values, and min/max.  A column
+        starts as a number and is switched to a string as soon as a value
+        cannot be converted to float.  If data collection is active for this
+        column, each value is also stored under (where, record index).
+
+        :param result: the column record to update in place.
+        :param key: the key to read from each record.
+        :param entry: a list of record dictionaries.
+        :param where: an identifier of the data location, used as the first
+            part of the row id when collecting data.
+        """
+        if result['type'] == 'number':
+            try:
+                for record in entry:
+                    if isinstance(record.get(key), self.allowedTypes):
+                        float(record[key])
+            except Exception:
+                # Demote the whole column, including values seen so far.
+                result['type'] = 'string'
+                result['distinct'] = {str(v) for v in result['distinct']}
+        numeric = result['type'] == 'number'
+        target = None
+        if self._datacolumns and result['key'] in self._datacolumns:
+            target = self._datacolumns[result['key']]
+        for ridx, record in enumerate(entry):
+            value = record.get(key)
+            if not isinstance(value, self.allowedTypes):
+                continue
+            result['count'] += 1
+            value = float(value) if numeric else str(value)
+            # One value beyond maxDistinct is kept so the caller can tell the
+            # limit was exceeded.
+            if len(result['distinct']) <= self.maxDistinct:
+                result['distinct'].add(value)
+            if numeric:
+                if result['min'] is None:
+                    result['min'] = result['max'] = value
+                result['min'] = min(result['min'], value)
+                result['max'] = max(result['max'], value)
+            if target is not None:
+                target[(where, ridx)] = value
+
+    def _scanColumn(self, meta, source, columns, auxmeta=None):
+        """
+        Find plottable columns in a metadata dictionary.
+
+        Only top-level values that are non-empty lists whose first element is
+        a dictionary are considered.  The keys of that first dictionary whose
+        values are of an allowed type each become (or extend) a column.
+
+        :param meta: the metadata dictionary to scan.
+        :param source: the kind of object the metadata is stored on.
+        :param columns: dictionary of column records, modified in place.
+        :param auxmeta: optional list of further metadata dictionaries; their
+            values are folded into columns found in meta, but they do not
+            introduce columns of their own.
+        """
+        for root, records in meta.items():
+            if not (isinstance(records, list) and len(records) and
+                    isinstance(records[0], dict)):
+                continue
+            for key, sample in records[0].items():
+                if not isinstance(sample, self.allowedTypes):
+                    continue
+                fullkey, title = self._columnKey(source, root, key)
+                where = self._addColumn(columns, fullkey, title, root, key, source)
+                column = columns[fullkey]
+                self._scanColumnByKey(column, key, records, where)
+                for aux in auxmeta or ():
+                    auxrecords = aux.get(root)
+                    if not isinstance(auxrecords, list) or not len(auxrecords):
+                        continue
+                    if isinstance(auxrecords[0], dict) and key in auxrecords[0]:
+                        self._scanColumnByKey(column, key, auxrecords, where)
+
+    @property
+    def columns(self):
+        """
+        Get a sorted list of plottable columns with some metadata for each.
+
+        The list is computed on first access and cached.  While it is being
+        computed, values of any columns listed in self._datacolumns are
+        recorded there as a side effect of scanning.
+
+        Each data entry contains
+
+            :key: a unique string.  This is a good first-order sort
+            :where: a list of [root, key, source] entries, where root is the
+                root data array, key is the specific data tag, and source is
+                the source of the data (folder, item, annotation,
+                annotationelement, base)
+            :type: string or number
+            :title: a human readable title
+            :count: the number of values that were scanned
+            :[distinct]: a sorted list of distinct values if there are no more
+                than some maximum number of distinct values.  This might not
+                include values from adjacent items
+            :[distinctcount]: the length of distinct, when it is present
+            :[min]: for number data types, the lowest value present
+            :[max]: for number data types, the highest value present
+
+        :returns: a list of data entries sorted by key.
+        """
+        if self._columns is not None:
+            return self._columns
+        columns = {}
+        # Columns that are always present; the numeric prefix in the key
+        # fixes their position in the sorted output.
+        self._addColumn(
+            columns, '_0_item.name', 'Item Name', 'Item', 'name', 'base')
+        self._addColumn(
+            columns, '_2_item.id', 'Item ID', 'Item', '_id', 'base')
+        self._scanColumn(self.folder.get('meta', {}), 'folder', columns)
+        # Adjacent items only add values to columns found on the primary item.
+        self._scanColumn(self.item.get('meta', {}), 'item', columns,
+                         [item.get('meta', {}) for item in self.items[1:]])
+        for anidx, annot in enumerate(self.annotations[0] if self.annotations is not None else []):
+            # self.annotations[1:] are per-adjacent-item lists parallel to the
+            # primary list; entries are None where no annotation matched.
+            self._scanColumn(
+                annot.get('attributes', {}), 'annotation', columns,
+                [itemannot[anidx].get('attributes', {})
+                 for itemannot in self.annotations[1:]
+                 if itemannot[anidx] is not None])
+            if not anidx:
+                # Annotation-level columns are added once, when at least one
+                # annotation is present.
+                self._addColumn(
+                    columns, '_1_annotation.name', 'Annotation Name',
+                    'Annotation', 'name', 'base')
+                self._addColumn(
+                    columns, '_3_annotation.id', 'Annotation ID',
+                    'Annotation', '_id', 'base')
+                self._addColumn(
+                    columns, '_bbox.x0', 'Bounding Box Low X', 'bbox', 'lowx',
+                    'annotationelement')
+                self._addColumn(
+                    columns, '_bbox.y0', 'Bounding Box Low Y', 'bbox', 'lowy',
+                    'annotationelement')
+                self._addColumn(
+                    columns, '_bbox.x1', 'Bounding Box High X', 'bbox',
+                    'highx', 'annotationelement')
+                self._addColumn(
+                    columns, '_bbox.y1', 'Bounding Box High Y', 'bbox',
+                    'highy', 'annotationelement')
+        # TODO: add annotation elements
+        # TODO: bbox could be from min/max query
+        # TODO: Add csv
+        for result in columns.values():
+            # Scanning stores at most maxDistinct + 1 values, so a larger set
+            # means the limit was exceeded and the list is dropped.
+            if len(result['distinct']) <= self.maxDistinct:
+                result['distinct'] = sorted(result['distinct'])
+                result['distinctcount'] = len(result['distinct'])
+            else:
+                result.pop('distinct', None)
+            if result['type'] != 'number' or result['min'] is None:
+                result.pop('min', None)
+                result.pop('max', None)
+        self._columns = sorted(columns.values(), key=lambda x: x['key'])
+        return self._columns
+
+    def data(self, columns, requiredColumns=None):
+        """
+        Get plottable data.
+
+        :param columns: the columns to return.  Either a list of column names
+            or a comma-delimited string.
+        :param requiredColumns: only return data rows where all of these
+            columns are non-None.  Either a list of column names or a
+            comma-delimited string.
+        :returns: a dictionary with 'columns', a list of the matching column
+            records each with an added 'index' giving its position within a
+            row, and 'data', a list of rows of values.
+        """
+        if not isinstance(columns, list):
+            columns = columns.split(',')
+        if not isinstance(requiredColumns, list):
+            requiredColumns = requiredColumns.split(',') if requiredColumns is not None else []
+        # TODO: Always augment columns with item id, annotation id?
+        self._datacolumns = {c: {} for c in columns}
+        # Values are only gathered while the column list is being built, so
+        # discard any cached list; otherwise a prior access of self.columns
+        # would leave every requested column empty.
+        self._columns = None
+        # collects data as a side effect
+        collist = self.columns
+        rows = set()
+        for coldata in self._datacolumns.values():
+            rows |= set(coldata.keys())
+        rows = sorted(rows)
+        colsout = [col.copy() for col in collist if col['key'] in columns]
+        for cidx, col in enumerate(colsout):
+            col['index'] = cidx
+        logger.info(f'Gathering {len(self._datacolumns)} x {len(rows)} data')
+        data = [[None] * len(self._datacolumns) for _ in range(len(rows))]
+        for cidx, col in enumerate(colsout):
+            colkey = col['key']
+            if colkey in self._datacolumns:
+                datacol = self._datacolumns[colkey]
+                for ridx, rowid in enumerate(rows):
+                    data[ridx][cidx] = datacol.get(rowid, None)
+        for cidx, col in enumerate(colsout):
+            colkey = col['key']
+            numrows = len(data)
+            if colkey in requiredColumns:
+                data = [row for row in data if row[cidx] is not None]
+            if len(data) < numrows:
+                logger.info(f'Reduced row count from {numrows} to {len(data)} '
+                            f'because of None values in column {colkey}')
+        return {
+            'columns': colsout,
+            'data': data}