diff --git a/.bumpversion.cfg b/.bumpversion.cfg index ee1374ac..16ad673c 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.10.1.dev0 +current_version = 0.11.2.dev0 commit = True tag = True sign_tags = True diff --git a/CHANGES.rst b/CHANGELOG.md similarity index 78% rename from CHANGES.rst rename to CHANGELOG.md index df2fc3bd..b1bca662 100644 --- a/CHANGES.rst +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ -CHANGES -======= +0.11.1 +------ + +* Reduce logging level of post_process statement ([#78](https://github.com/ome/omero-metadata/pull/78)) + +0.11.0 +------ + +* Add support for column type auto-detection using pandas ([#67](https://github.com/ome/omero-metadata/pull/67), [#71](https://github.com/ome/omero-metadata/pull/71), [#72](https://github.com/ome/omero-metadata/pull/72), [#75](https://github.com/ome/omero-metadata/pull/75), [#77](https://github.com/ome/omero-metadata/pull/77)) +* Skip empty rows when reading CSV files ([#70](https://github.com/ome/omero-metadata/pull/70)) 0.10.0 ------ diff --git a/README.rst b/README.rst index 1e568745..89b1741c 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ conflicts when importing the Python module. Usage ===== -The plugin is called from the command-line using the `omero` command:: +The plugin is called from the command-line using the ``omero metadata`` command:: $ omero metadata @@ -64,15 +64,58 @@ populate -------- This command creates an ``OMERO.table`` (bulk annotation) from a ``CSV`` file and links -the table as a ``File Annotation`` to a parent container such as Screen, Plate, Project +the table as a ``File Annotation`` to a parent container such as Screen, Plate, Project, Dataset or Image. It also attempts to convert Image, Well or ROI names from the ``CSV`` into object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``. 
-If you wish to ensure that ``number`` columns are created for numerical data, this will -allow you to make numerical queries on the table. -Column Types are: +OMERO.tables have defined column types to specify the data-type such as ``double`` or ``long`` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn``. + +The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as follows: + +* Columns named with a supported object-type (e.g. ``plate``, ``well``, ``image``, ``dataset``, or ``roi``), with `` id`` or `` name`` will generate the corresponding column type in the OMERO.table. See table below for full list of supported column names. + +============ ================= ==================== ==================================================================== +Column Name Column type Detected Header Type Notes +============ ================= ==================== ==================================================================== +Image ``ImageColumn`` ``image`` Accepts image IDs. Appends new 'Image Name' column with image names. +Image Name ``StringColumn`` ``s`` Accepts image names. Appends new 'Image' column with image IDs. +Image ID ``ImageColumn`` ``image`` Accepts image IDs. Appends new 'Image Name' column with image names. +Dataset ``DatasetColumn`` ``dataset`` Accepts dataset IDs. +Dataset Name ``StringColumn`` ``s`` Accepts dataset names. +Dataset ID ``DatasetColumn`` ``dataset`` Accepts dataset IDs. +Plate ``PlateColumn`` ``plate`` Accepts plate names. Adds new 'Plate' column with plate IDs. +Plate Name ``PlateColumn`` ``plate`` Accepts plate names. Adds new 'Plate' column with plate IDs. +Plate ID ``LongColumn`` ``l`` Accepts plate IDs. +Well ``WellColumn`` ``well`` Accepts well names. Adds new 'Well' column with well IDs. +Well Name ``WellColumn`` ``well`` Accepts well names. Adds new 'Well' column with well IDs. 
+Well ID ``LongColumn`` ``l`` Accepts well IDs. +ROI ``RoiColumn`` ``roi`` Accepts ROI IDs. Appends new 'ROI Name' column with ROI names. +ROI Name ``StringColumn`` ``s`` Accepts ROI names. Appends new 'ROI' column with ROI IDs. +ROI ID ``RoiColumn`` ``roi`` Accepts ROI IDs. Appends new 'ROI Name' column with ROI names. +============ ================= ==================== ==================================================================== + +Note: Column names are case insensitive. Space, no space, and underscore are all accepted as separators for column names (i.e. `` name``/`` id``, ``name``/``id``, ``_name``/``_id`` are all accepted) + +NB: Column names should not contain spaces if you want to be able to query by these columns. + +* All other column types will be detected based on the column's data using the pandas library. See table below. + +=============== ================= ==================== +Column Name Column type Detected Header Type +=============== ================= ==================== +Example String ``StringColumn`` ``s`` +Example Long ``LongColumn`` ``l`` +Example Float ``DoubleColumn`` ``d`` +Example boolean ``BoolColumn`` ``b`` +=============== ================= ==================== + +In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the +``omero metadata populate`` command, missing values in floating-point columns will be detected as ``DoubleColumn`` and the +missing values will be stored as NaN. + +However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. 
The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): - ``d``: ``DoubleColumn``, for floating point numbers - ``l``: ``LongColumn``, for integer numbers @@ -80,29 +123,34 @@ Column Types are: - ``b``: ``BoolColumn``, for true/false - ``plate``, ``well``, ``image``, ``dataset``, ``roi`` to specify objects -These can be specified in the first row of a ``CSV`` with a ``# header`` tag (see examples below). -The ``# header`` row is optional. Default column type is ``String``. +Automatic header detection can also be ignored if using the ``--manual_header`` flag. If the ``# header`` is not present and this flag is used, column types will default to ``String`` (unless the column names correspond to OMERO objects such as ``image`` or ``plate``). -NB: Column names should not contain spaces if you want to be able to query -by these columns. + +Examples +^^^^^^^^^ + +The examples below will use the default automatic column type detection behaviour. It is possible to achieve the same results (or a different desired result) by manually adding a custom ``# header`` row at the top of the CSV. 
**Project / Dataset** +^^^^^^^^^^^^^^^^^^^^^^ -To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` +To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` or ``Dataset ID`` and ``Image Name`` or ``Image ID``:: $ omero metadata populate Project:1 --file path/to/project.csv + +Using ``Image Name`` and ``Dataset Name``: project.csv:: - # header s,s,d,l,s Image Name,Dataset Name,ROI_Area,Channel_Index,Channel_Name img-01.png,dataset01,0.0469,1,DAPI img-02.png,dataset01,0.142,2,GFP img-03.png,dataset01,0.093,3,TRITC img-04.png,dataset01,0.429,4,Cy5 + -This will create an OMERO.table linked to the Project like this with +The previous example will create an OMERO.table linked to the Project as follows with a new ``Image`` column with IDs: ========== ============ ======== ============= ============ ===== @@ -114,23 +162,52 @@ img-03.png dataset01 0.093 3 TRITC 36640 img-04.png dataset01 0.429 4 Cy5 36641 ========== ============ ======== ============= ============ ===== -If the target is a Dataset instead of a Project, the ``Dataset Name`` column is not needed. +Note: equivalent to adding ``# header s,s,d,l,s`` row to the top of the ``project.csv`` for manual definition. 
+ +Using ``Image ID`` and ``Dataset ID``: + +project.csv:: + + image id,Dataset ID,ROI_Area,Channel_Index,Channel_Name + 36638,101,0.0469,1,DAPI + 36639,101,0.142,2,GFP + 36640,101,0.093,3,TRITC + 36641,101,0.429,4,Cy5 +The previous example will create an OMERO.table linked to the Project as follows with +a new ``Image Name`` column with Names: + +===== ======= ======== ============= ============ ========== +Image Dataset ROI_Area Channel_Index Channel_Name Image Name +===== ======= ======== ============= ============ ========== +36638 101 0.0469 1 DAPI img-01.png +36639 101 0.142 2 GFP img-02.png +36640 101 0.093 3 TRITC img-03.png +36641 101 0.429 4 Cy5 img-04.png +===== ======= ======== ============= ============ ========== + +Note: equivalent to adding ``# header image,dataset,d,l,s`` row to the top of the ``project.csv`` for manual definition. + +For both examples above, alternatively, if the target is a Dataset instead of a Project, the ``Dataset`` or ``Dataset Name`` column is not needed. + **Screen / Plate** +^^^^^^^^^^^^^^^^^^^ To add a table to a Screen, the ``CSV`` file needs to specify ``Plate`` name and ``Well``. -If a ``# header`` is specified, column types must be ``well`` and ``plate``. +If a ``# header`` is specified, column types must be ``well`` and ``plate``:: + + $ omero metadata populate Screen:1 --file path/to/screen.csv screen.csv:: - # header well,plate,s,d,l,d Well,Plate,Drug,Concentration,Cell_Count,Percent_Mitotic A1,plate01,DMSO,10.1,10,25.4 A2,plate01,DMSO,0.1,1000,2.54 A3,plate01,DMSO,5.5,550,4 B1,plate01,DrugX,12.3,50,44.43 + This will create an OMERO.table linked to the Screen, with the ``Well Name`` and ``Plate Name`` columns added and the ``Well`` and ``Plate`` columns used for IDs: @@ -146,7 +223,10 @@ Well Plate Drug Concentration Cell_Count Percent_Mitotic Well Name Plat If the target is a Plate instead of a Screen, the ``Plate`` column is not needed. 
+Note: equivalent to adding ``# header well,plate,s,d,l,d`` row to the top of the ``screen.csv`` for manual definition. + **ROIs** +^^^^^^^^^ If the target is an Image or a Dataset, a ``CSV`` with ROI-level or Shape-level data can be used to create an ``OMERO.table`` (bulk annotation) as a ``File Annotation`` linked to the target object. @@ -154,21 +234,19 @@ If there is an ``roi`` column (header type ``roi``) containing ROI IDs, an ``Roi column will be appended automatically (see example below). If a column of Shape IDs named ``shape`` of type ``l`` is included, the Shape IDs will be validated (and set to -1 if invalid). Also if an ``image`` column of Image IDs is included, an ``Image Name`` column will be added. -NB: Columns of type ``shape`` aren't yet supported on the OMERO.server. +NB: Columns of type ``shape`` aren't yet supported on the OMERO.server:: -Alternatively, if the target is an Image, the ROI input column can be -``Roi Name`` (with type ``s``), and an ``roi`` type column will be appended containing ROI IDs. -In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set. + $ omero metadata populate Image:1 --file path/to/image.csv image.csv:: - # header roi,l,l,d,l Roi,shape,object,probability,area 501,1066,1,0.8,250 502,1067,2,0.9,500 503,1068,3,0.2,25 503,1069,4,0.8,400 503,1070,5,0.5,200 + This will create an OMERO.table linked to the Image like this: @@ -182,6 +260,12 @@ Roi shape object probability area Roi Name 503 1070 5 0.5 200 Sample3 === ===== ====== =========== ==== ======== +Note: equivalent to adding ``# header roi,l,l,d,l`` row to the top of the ``image.csv`` for manual definition. + +Alternatively, if the target is an Image, the ROI input column can be +``Roi Name`` (with type ``s``), and an ``roi`` type column will be appended containing ROI IDs. +In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set. 
+ Note that the ROI-level data from an ``OMERO.table`` is not visible in the OMERO.web UI right-hand panel under the ``Tables`` tab, but the table can be visualized by clicking the "eye" on the bulk annotation attachment on the Image. @@ -204,4 +288,4 @@ licensed under the terms of the GNU General Public License (GPL) v2 or later. Copyright --------- -2018-2021, The Open Microscopy Environment +2018-2022, The Open Microscopy Environment and Glencoe Software, Inc diff --git a/setup.py b/setup.py index 745aee7d..239e8d6c 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.10.1.dev0' +version = '0.11.2.dev0' url = "https://github.com/ome/omero-metadata/" setup( @@ -127,7 +127,8 @@ def read(fname): 'future', 'omero-py>=5.6.0', 'PyYAML', - 'jinja2' + 'jinja2', + 'pandas' ], python_requires='>=3', tests_require=[ diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index bc788371..3836051b 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -32,6 +32,8 @@ from omero.grid import LongColumn from omero.model.enums import UnitsLength +import pandas as pd + HELP = """Metadata utilities Provides access to and editing of the metadata which @@ -239,8 +241,13 @@ def _configure(self, parser): populate.add_argument("--localcfg", help=( "Local configuration file or a JSON object string")) - populate.add_argument("--allow_nan", action="store_true", help=( - "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument( + "--allow-nan", "--allow_nan", action="store_true", help=( + "Allow empty values to become Nan in Long or Double columns")) + + populate.add_argument( + "--manual-header", "--manual_header", action="store_true", help=( + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -483,6 +490,49 @@ def testtables(self, args): if not 
initialized: self.ctx.die(100, "Failed to initialize Table") + @staticmethod + def detect_headers(csv_path, keep_default_na=True): + ''' + Function to automatically detect headers from a CSV file. This function + loads the table to pandas to detects the column type and match headers + ''' + + conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] + headers = [] + table = pd.read_csv(csv_path, keep_default_na=keep_default_na) + col_types = table.dtypes.values.tolist() + cols = list(table.columns) + + for index, col_type in enumerate(col_types): + col = cols[index] + if col.lower() in conserved_headers: + headers.append(col.lower()) + elif col.lower() == 'image id' or col.lower() == 'imageid' or \ + col.lower() == 'image_id': + headers.append('image') + elif col.lower() == 'roi id' or col.lower() == 'roiid' or \ + col.lower() == 'roi_id': + headers.append('roi') + elif col.lower() == 'dataset id' or \ + col.lower() == 'datasetid' or \ + col.lower() == 'dataset_id': + headers.append('dataset') + elif col.lower() == 'plate name' or col.lower() == 'platename' or \ + col.lower() == 'plate_name': + headers.append('plate') + elif col.lower() == 'well name' or col.lower() == 'wellname' or \ + col.lower() == 'well_name': + headers.append('well') + elif col_type.name == 'object': + headers.append('s') + elif col_type.name == 'float64': + headers.append('d') + elif col_type.name == 'int64': + headers.append('l') + elif col_type.name == 'bool': + headers.append('b') + return headers + # WRITE def populate(self, args): @@ -521,6 +571,20 @@ def populate(self, args): cfgid = cfgann.getFile().getId() md.linkAnnotation(cfgann) + header_type = None + # To use auto detect header by default unless instructed not to + # AND + # Check if first row contains `# header` + first_row = pd.read_csv(args.file, nrows=1, header=None) + if not args.manual_header and \ + not first_row[0].str.contains('# header').bool(): + omero_metadata.populate.log.info("Detecting header types") + 
header_type = MetadataControl.detect_headers( + args.file, keep_default_na=args.allow_nan) + if args.dry_run: + omero_metadata.populate.log.info(f"Header Types:{header_type}") + else: + omero_metadata.populate.log.info("Using user defined header types") loops = 0 ms = 0 wait = args.wait @@ -533,7 +597,7 @@ def populate(self, args): cfg=args.cfg, cfgid=cfgid, attach=args.attach, options=localcfg, batch_size=args.batch, loops=loops, ms=ms, dry_run=args.dry_run, - allow_nan=args.allow_nan) + allow_nan=args.allow_nan, column_types=header_type) ctx.parse() def rois(self, args): diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index a4f9013b..94e4e0f5 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -309,6 +309,13 @@ def _create_columns(self, klass): self.DEFAULT_COLUMN_SIZE, list())) # Ensure RoiColumn is named 'Roi' column.name = "Roi" + if column.__class__ is DatasetColumn: + # This breaks the code, as currently there is no implementation + # of a method to populate the 'Dataset Name' column + # append.append(StringColumn(DATASET_NAME_COLUMN, '', + # self.DEFAULT_COLUMN_SIZE, list())) + # Ensure DatasetColumn is named 'Dataset' + column.name = "Dataset" # If image/roi name, then add ID column" if column.name == IMAGE_NAME_COLUMN: append.append(ImageColumn("Image", '', list())) @@ -412,8 +419,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset name": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_name[column_value].id.val ] @@ -423,8 +428,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_id[ int(column_value)].id.val @@ -904,7 +907,10 @@ def get_image_name_by_id(self, 
iid, did=None): def resolve_dataset(self, column, row, value): try: - return self.datasets_by_name[value].id.val + if column.name.lower() == 'dataset': + return self.datasets_by_id[int(value)].id.val + else: + return self.datasets_by_name[value].id.val except KeyError: log.warn('Project is missing dataset: %s' % value) return Skip() @@ -1250,6 +1256,8 @@ def preprocess_data(self, reader): column.values.append(value) elif column.name.lower() == "plate": column.values.append(value) + elif column.name.lower() == "dataset": + column.values.append(value) except TypeError: log.error('Original value "%s" now "%s" of bad type!' % ( original_value, value)) @@ -1303,14 +1311,17 @@ def populate_from_reader(self, for (r, row) in enumerate(reader): log.debug('Row %d', r) if filter_function(row): - self.populate_row(row) - row_count = row_count + 1 - if row_count >= batch_size: - self.post_process() - table.addData(self.columns) - for column in self.columns: - column.values = [] - row_count = 0 + if row: + self.populate_row(row) + row_count = row_count + 1 + if row_count >= batch_size: + self.post_process() + table.addData(self.columns) + for column in self.columns: + column.values = [] + row_count = 0 + else: + log.warning('Skip empty row %d', r + 1) if row_count != 0: log.debug("DATA TO ADD") log.debug(self.columns) @@ -1340,7 +1351,10 @@ def populate(self, rows): nrows = len(rows) for (r, row) in enumerate(rows): log.debug('Row %d/%d', r + 1, nrows) - self.populate_row(row) + if row: + self.populate_row(row) + else: + log.warning('Skip empty row %d', r + 1) def post_process(self): target_class = self.target_object.__class__ @@ -1394,7 +1408,7 @@ def post_process(self): if well_name_column is None and plate_name_column is None \ and image_name_column is None and roi_name_column is None \ and roi_column is None: - log.info('Nothing to do during post processing.') + log.debug('Nothing to do during post processing.') return sz = max([len(x.values) for x in self.columns]) @@ 
-1666,7 +1680,7 @@ class BulkToMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=10, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object :param target_object: The object to be annotated @@ -1999,7 +2013,7 @@ class DeleteMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=500, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py old mode 100644 new mode 100755 index efcb998d..bcd9b3d6 --- a/test/integration/metadata/test_populate.py +++ b/test/integration/metadata/test_populate.py @@ -190,7 +190,7 @@ def assert_columns(self, columns): col_names = "Well,Well Type,Concentration,Well Name" assert col_names == ",".join([c.name for c in columns]) - def assert_table_row(self, row_values, row_index): + def assert_values(self, row_values): # Check rows, based on self.create_csv() # Unsure where the lower-casing is happening if "A1" in row_values or "a1" in row_values: @@ -254,8 +254,8 @@ def assert_columns(self, columns): "Concentration,Plate Name,Well Name,Image") assert col_names == ",".join([c.name for c in columns]) - def assert_table_row(self, row_values, row_index): - super(Screen2Plates, self).assert_table_row(row_values, row_index) + def assert_values(self, row_values): + super(Screen2Plates, self).assert_values(row_values) # last column should contain valid Image ID image_id = row_values[-1] image_name = row_values[2] @@ -807,6 +807,14 @@ def assert_columns(self, columns): def assert_row_count(self, rows): assert rows == len(self.roi_names) + def assert_values(self, row_values): + if "roi1" in row_values: 
+ assert 0.5 in row_values + assert 100 in row_values + elif "roi2" in row_values: + assert 'nan' in [str(value) for value in row_values] + assert 200 in row_values + def get_target(self): if not self.image: image = self.test.make_image() @@ -1258,7 +1266,7 @@ def _assert_parsing_context_values(self, t, fixture): row_values = [col.values[0] for col in t.read( list(range(len(cols))), hit, hit+1).columns] assert len(row_values) == fixture.count - fixture.assert_table_row(row_values, hit) + fixture.assert_values(row_values) def _test_bulk_to_map_annotation_context(self, fixture, batch_size): # self._testPopulateMetadataPlate() diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py new file mode 100755 index 00000000..2fd0bcc7 --- /dev/null +++ b/test/unit/test_automatic_header.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2022 Glencoe Software, Inc. All rights reserved. +# +# This software is distributed under the terms described by the LICENSE.txt +# file you can find at the root of the distribution bundle. 
If the file is +# missing please request a copy by contacting info@glencoesoftware.com + + +from omero.model import ScreenI, ProjectI +from omero_metadata.populate import HeaderResolver +from omero_metadata.cli import MetadataControl +import pandas as pd +import tempfile +from omero.grid import ImageColumn, LongColumn, PlateColumn, RoiColumn, \ + StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn + + +class TestDetectHeaders: + """Test the MetadataControl.detect_headers API""" + def assert_detect_headers(self, **kwargs): + df = pd.DataFrame(data=self.d) + tmp = tempfile.NamedTemporaryFile() + df.to_csv(tmp.name, index=False) + header = MetadataControl.detect_headers(tmp.name, **kwargs) + assert header == self.expected_header + + def create_objects_dictionary(self): + # Create a dictionary with every combination of headers + # eg plate_name/platename/plate name/plate_id/plateid/plate id + self.d = {} + prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] + for prefix in prefix_list: + self.d[f'{prefix}_name'] = ['a', 'b', 'c'] + self.d[f'{prefix} name'] = ['a', 'b', 'c'] + self.d[f'{prefix}name'] = ['a', 'b', 'c'] + self.d[f'{prefix}_id'] = [1, 2, 3] + self.d[f'{prefix} id'] = [1, 2, 3] + self.d[f'{prefix}id'] = [1, 2, 3] + self.d[f'{prefix}'] = [1, 2, 3] + self.expected_header = [ + 's', 's', 's', 'l', 'l', 'l', 'l', + 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', + 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', + 'well', 'well', 'well', 'l', 'l', 'l', 'well', + 's', 's', 's', 'image', 'image', 'image', 'image', + 's', 's', 's', 'roi', 'roi', 'roi', 'roi' + ] + + def test_objects_columns(self): + self.create_objects_dictionary() + self.assert_detect_headers() + + def test_dense_columns(self): + ''' + Test of the default automatic column type detection behaviour + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, 22, 33], + 'measurement 2': [0.1, 0.2, 0.3], + 'measurement 3': ['a', 
'b', 'c'], + 'measurement 4': [True, True, False], + 'measurement 5': [11, 0.1, True] + }) + self.expected_header.extend(['l', 'd', 's', 'b', 's']) + self.assert_detect_headers() + + def test_sparse_default_na(self): + ''' + Test default handling of missing values + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['d', 'd', 's', 's']) + self.assert_detect_headers(keep_default_na=True) + + def test_sparse_no_default_na(self): + ''' + Test handling of missing values as string columns + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['s', 's', 's', 's']) + self.assert_detect_headers(keep_default_na=False) + + +class TestColumnTypes: + ''' + To test resolved column types and column names. 
+ ''' + def assert_expected( + self, target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names + ): + header_resolver = HeaderResolver( + target_object, column_name, column_types=header_type) + resolved_column_types = header_resolver.create_columns() + for index, col in enumerate(resolved_column_types): + assert col.__class__ == expected_resolved_column_type[index] + assert col.name == expected_resolved_column_names[index] + + def test_plate_name_well_name(self): + column_name = [ + 'plate_name', 'well_name', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['plate', 'well', 'l', 'd', 's', 'b'] + + # We expect populate to append 'Plate Name' and 'Well Name' at the end + expected_resolved_column_names = [ + 'Plate', 'Well', 'measurement 1', 'measurement 2', 'measurement 3', + 'measurement 4', 'Plate Name', 'Well Name'] + + expected_resolved_column_type = [ + PlateColumn, WellColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ScreenI(0, None) # Target is agnostic + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_plate_id_well_id(self): + column_name = [ + 'plate_id', 'well_id', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + # plate_id = 'l' since 'plate' header type is not supported for plateid + header_type = ['l', 'l', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'plate_id', 'well_id', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4'] + + expected_resolved_column_type = [ + LongColumn, LongColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn] + + target_object = ScreenI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def 
test_plate_well(self): + column_name = [ + 'plate', 'well', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['plate', 'well', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Plate', 'Well', 'measurement 1', 'measurement 2', 'measurement 3', + 'measurement 4', 'Plate Name', 'Well Name'] + + expected_resolved_column_type = [ + PlateColumn, WellColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ScreenI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_name_image_name(self): + ''' + In the case column name is 'Image Name' (case sensitive), + specific behaviour is executed. + ''' + column_name = [ + 'dataset_name', 'Image Name', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['s', 's', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'dataset_name', 'Image Name', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image'] + + expected_resolved_column_type = [ + StringColumn, StringColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, ImageColumn] + + target_object = ProjectI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_id_image_id(self): + column_name = [ + 'dataset_id', 'image_id', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['dataset', 'image', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Dataset', 'Image', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name'] + + expected_resolved_column_type = [ + DatasetColumn, ImageColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] + + target_object = 
ProjectI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_image(self): + column_name = [ + 'dataset', 'image', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['dataset', 'image', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Dataset', 'Image', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name', ] + + expected_resolved_column_type = [ + DatasetColumn, ImageColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] + + target_object = ProjectI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_roi(self): + column_name = [ + 'image', 'roi', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['image', 'roi', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Image', 'Roi', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name', 'Roi Name'] + + expected_resolved_column_type = [ + ImageColumn, RoiColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ProjectI(0, None) # Target is agnostic + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names)