From 03ebc18b4aa19696241f9d5af233d6fa7079fcb8 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:20:14 +0100 Subject: [PATCH 1/8] Convert detect_headers unit tests into class --- test/unit/test_automatic_header.py | 86 +++++++++++++++++------------- 1 file changed, 49 insertions(+), 37 deletions(-) mode change 100644 => 100755 test/unit/test_automatic_header.py diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py old mode 100644 new mode 100755 index 3e553525..67d14975 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -17,43 +17,55 @@ StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn -def test_detect_headers(): - ''' - Test of the default automatic column type detection behaviour - ''' - d = { - 'measurement 1': [11, 22, 33], - 'measurement 2': [0.1, 0.2, 0.3], - 'measurement 3': ['a', 'b', 'c'], - 'measurement 4': [True, True, False], - 'measurement 5': [11, 0.1, True] - } - prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] - # Create a dictionary with every combination of headers - # eg plate_name/platename/plate name/plate_id/plateid/plate id - for prefix in prefix_list: - d[f'{prefix}_name'] = ['a', 'b', 'c'] - d[f'{prefix} name'] = ['a', 'b', 'c'] - d[f'{prefix}name'] = ['a', 'b', 'c'] - d[f'{prefix}_id'] = [1, 2, 3] - d[f'{prefix} id'] = [1, 2, 3] - d[f'{prefix}id'] = [1, 2, 3] - d[f'{prefix}'] = [1, 2, 3] - - df = pd.DataFrame(data=d) - tmp = tempfile.NamedTemporaryFile() - df.to_csv(tmp.name, index=False) - header = MetadataControl.detect_headers(tmp.name) - expected_header = [ - 'l', 'd', 's', 'b', 's', - 's', 's', 's', 'l', 'l', 'l', 'l', - 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', - 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', - 'well', 'well', 'well', 'l', 'l', 'l', 'well', - 's', 's', 's', 'image', 'image', 'image', 'image', - 's', 's', 's', 'roi', 'roi', 'roi', 'roi' - ] - assert header == expected_header +class TestDetectHeaders: + """Test the MetadataControl.detect_headers API""" + def assert_detect_headers(self): + df = pd.DataFrame(data=self.d) + tmp = tempfile.NamedTemporaryFile() + df.to_csv(tmp.name, index=False) + header = MetadataControl.detect_headers(tmp.name) + assert header == self.expected_header + + def create_objects_dictionary(self): + # Create a dictionary with every combination of headers + # eg plate_name/platename/plate name/plate_id/plateid/plate id + self.d = {} + prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] + for prefix in prefix_list: + self.d[f'{prefix}_name'] = ['a', 'b', 'c'] + self.d[f'{prefix} name'] = ['a', 'b', 'c'] + self.d[f'{prefix}name'] = ['a', 'b', 'c'] + self.d[f'{prefix}_id'] = [1, 2, 3] + self.d[f'{prefix} id'] = [1, 2, 3] + self.d[f'{prefix}id'] = [1, 2, 3] + self.d[f'{prefix}'] = [1, 2, 3] + self.expected_header = [ + 's', 's', 's', 'l', 'l', 'l', 'l', + 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', + 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', + 'well', 'well', 'well', 'l', 'l', 'l', 'well', + 's', 's', 's', 'image', 'image', 'image', 'image', + 's', 's', 's', 'roi', 'roi', 'roi', 'roi' + ] + + def test_objects_columns(self): + self.create_objects_dictionary() + self.assert_detect_headers() + + def test_dense_extra_columns(self): + ''' + Test of the default automatic column type detection behaviour + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, 22, 33], + 'measurement 2': [0.1, 0.2, 0.3], + 'measurement 3': ['a', 'b', 'c'], + 'measurement 4': [True, True, False], + 'measurement 5': [11, 0.1, True] + }) + self.expected_header.extend(['l', 'd', 's', 'b', 's']) + self.assert_detect_headers() class TestColumnTypes: From 7ff94a2a6ee3b0bea8629af9beb8cc3ff8a92d79 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:32:36 +0100 Subject: [PATCH 2/8] Allow MetadataControl.detect_header to control default NA --- src/omero_metadata/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index edce60b6..54c57422 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -489,7 +489,7 @@ def testtables(self, args): self.ctx.die(100, "Failed to initialize Table") @staticmethod - def detect_headers(csv_path): + def detect_headers(csv_path, keep_default_na=True): ''' Function to automatically detect headers from a CSV file. This function loads the table to pandas to detects the column type and match headers @@ -497,7 +497,7 @@ def detect_headers(csv_path): conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] headers = [] - table = pd.read_csv(csv_path) + table = pd.read_csv(csv_path, keep_default_na=keep_default_na) col_types = table.dtypes.values.tolist() cols = list(table.columns) From 31d562bd0f4e7d09ee3a5329438be6f061580a7e Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:49:18 +0100 Subject: [PATCH 3/8] Add unit tests for the detection of sparse columns --- test/unit/test_automatic_header.py | 34 +++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index 67d14975..2fd0bcc7 100755 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -19,11 +19,11 @@ class TestDetectHeaders: """Test the MetadataControl.detect_headers API""" - def assert_detect_headers(self): + def assert_detect_headers(self, **kwargs): df = pd.DataFrame(data=self.d) tmp = tempfile.NamedTemporaryFile() df.to_csv(tmp.name, index=False) - header = MetadataControl.detect_headers(tmp.name) + header = MetadataControl.detect_headers(tmp.name, **kwargs) assert header == self.expected_header def create_objects_dictionary(self): @@ -52,7 +52,7 @@ def test_objects_columns(self): self.create_objects_dictionary() self.assert_detect_headers() - def test_dense_extra_columns(self): + def test_dense_columns(self): ''' Test of the default automatic column type detection behaviour ''' @@ -67,6 +67,34 @@ def test_dense_extra_columns(self): self.expected_header.extend(['l', 'd', 's', 'b', 's']) self.assert_detect_headers() + def test_sparse_default_na(self): + ''' + Test default handling of missing values + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['d', 'd', 's', 's']) + self.assert_detect_headers(keep_default_na=True) + + def test_sparse_no_default_na(self): + ''' + Test handling of missing values as string columns + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['s', 's', 's', 's']) + self.assert_detect_headers(keep_default_na=False) + class TestColumnTypes: ''' From 1f637bd0f73a2023112d4e87786b7c4432b1c298 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 15:05:47 +0100 Subject: [PATCH 4/8] Use --allow_nan flag in MetadataControl.detect_headers From 1ffac2ae5d4d9e8b12f3761ad486587b317ffa79 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 15:21:25 +0100 Subject: [PATCH 5/8] Allow fixtures to override the value assertion with their expectations --- test/integration/metadata/test_populate.py | 27 +++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) mode change 100644 => 100755 test/integration/metadata/test_populate.py diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py old mode 100644 new mode 100755 index 619bc968..42af38e7 --- a/test/integration/metadata/test_populate.py +++ b/test/integration/metadata/test_populate.py @@ -175,6 +175,13 @@ def assert_columns(self, columns): col_names = "Well,Well Type,Concentration,Well Name" assert col_names == ",".join([c.name for c in columns]) + def assert_values(self, row_values): + # Unsure where the lower-casing is happening + if "A1" in row_values or "a1" in row_values: + assert "Control" in row_values + elif "A2" in row_values or "a2" in row_values: + assert "Treatment" in row_values + def assert_child_annotations(self, oas): for ma, wid, wr, wc in oas: assert isinstance(ma, MapAnnotationI) @@ -767,6 +774,14 @@ def assert_columns(self, columns): def assert_row_count(self, rows): assert rows == len(self.roi_names) + def assert_values(self, row_values): + if "roi1" in row_values: + assert 0.5 in row_values + assert 100 in row_values + elif "roi2" in row_values: + assert 'nan' in [str(value) for value in row_values] + assert 200 in row_values + def get_target(self): if not self.image: image = self.test.make_image() @@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture): row_values = [col.values[0] for col in t.read( list(range(len(cols))), hit, hit+1).columns] assert len(row_values) == fixture.count - # Unsure where the lower-casing is happening - if "A1" in row_values or "a1" in row_values: - assert "Control" in row_values - elif "A2" in row_values or "a2" in row_values: - assert "Treatment" in row_values - elif "roi1" in row_values: - assert 0.5 in row_values - assert 100 in row_values - elif "roi2" in row_values: - assert 'nan' in [str(value) for value in row_values] - assert 200 in row_values + fixture.assert_values(row_values) def _test_bulk_to_map_annotation_context(self, fixture, batch_size): # self._testPopulateMetadataPlate() From fe73a17d2a71fd7d220c48891a2364110b59f4f1 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 16:18:30 +0100 Subject: [PATCH 6/8] Add GNU-style hyphen-separated version of CLI arguments --- src/omero_metadata/cli.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 54c57422..874d085e 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -241,11 +241,13 @@ def _configure(self, parser): populate.add_argument("--localcfg", help=( "Local configuration file or a JSON object string")) - populate.add_argument("--allow_nan", action="store_true", help=( - "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument( + "--allow-nan", "--allow_nan", action="store_true", help=( + "Allow empty values to become Nan in Long or Double columns")) - populate.add_argument("--manual_header", action="store_true", help=( - "Disable automatic header detection during population")) + populate.add_argument( + "--manual-header", "--manual_header", action="store_true", help=( + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, From 89b8084f7c142bf70d2ef9d0ce745067234ede99 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Sun, 5 Jun 2022 14:47:05 +0100 Subject: [PATCH 7/8] Use --allow-nan flag to determine NA behavior in detect_headers --- src/omero_metadata/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 874d085e..3836051b 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -579,7 +579,8 @@ def populate(self, args): if not args.manual_header and \ not first_row[0].str.contains('# header').bool(): omero_metadata.populate.log.info("Detecting header types") - header_type = MetadataControl.detect_headers(args.file) + header_type = MetadataControl.detect_headers( + args.file, keep_default_na=args.allow_nan) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") else: From 27050f2c84350bffcdbdec7610f7d56bf5dd913c Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 09:23:39 +0100 Subject: [PATCH 8/8] Add note to README about the missing value handling --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index a4bd3241..89b1741c 100644 --- a/README.rst +++ b/README.rst @@ -111,6 +111,9 @@ Example Float ``DoubleColumn`` ``d`` Example boolean ``BoolColumn`` ``b`` =============== ================= ==================== +In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the +``omero metadata populate`` commands, missing values in floating-point columns will be detected as ``DoubleColumn`` and the +missing values will be stored as NaN. However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below):