From 03ebc18b4aa19696241f9d5af233d6fa7079fcb8 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 14:20:14 +0100
Subject: [PATCH 1/8] Convert detect_headers unit tests into class

---
 test/unit/test_automatic_header.py | 86 +++++++++++++++++-------------
 1 file changed, 49 insertions(+), 37 deletions(-)
 mode change 100644 => 100755 test/unit/test_automatic_header.py

diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py
old mode 100644
new mode 100755
index 3e553525..67d14975
--- a/test/unit/test_automatic_header.py
+++ b/test/unit/test_automatic_header.py
@@ -17,43 +17,55 @@
     StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn
 
 
-def test_detect_headers():
-    '''
-    Test of the default automatic column type detection behaviour
-    '''
-    d = {
-        'measurement 1': [11, 22, 33],
-        'measurement 2': [0.1, 0.2, 0.3],
-        'measurement 3': ['a', 'b', 'c'],
-        'measurement 4': [True, True, False],
-        'measurement 5': [11, 0.1, True]
-    }
-    prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ]
-    # Create a dictionary with every combination of headers
-    # eg plate_name/platename/plate name/plate_id/plateid/plate id
-    for prefix in prefix_list:
-        d[f'{prefix}_name'] = ['a', 'b', 'c']
-        d[f'{prefix} name'] = ['a', 'b', 'c']
-        d[f'{prefix}name'] = ['a', 'b', 'c']
-        d[f'{prefix}_id'] = [1, 2, 3]
-        d[f'{prefix} id'] = [1, 2, 3]
-        d[f'{prefix}id'] = [1, 2, 3]
-        d[f'{prefix}'] = [1, 2, 3]
-
-    df = pd.DataFrame(data=d)
-    tmp = tempfile.NamedTemporaryFile()
-    df.to_csv(tmp.name, index=False)
-    header = MetadataControl.detect_headers(tmp.name)
-    expected_header = [
-        'l', 'd', 's', 'b', 's',
-        's', 's', 's', 'l', 'l', 'l', 'l',
-        's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
-        'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
-        'well', 'well', 'well', 'l', 'l', 'l', 'well',
-        's', 's', 's', 'image', 'image', 'image', 'image',
-        's', 's', 's', 'roi', 'roi', 'roi', 'roi'
-    ]
-    assert header == expected_header
+class TestDetectHeaders:
+    """Test the MetadataControl.detect_headers API"""
+    def assert_detect_headers(self):
+        df = pd.DataFrame(data=self.d)
+        tmp = tempfile.NamedTemporaryFile()
+        df.to_csv(tmp.name, index=False)
+        header = MetadataControl.detect_headers(tmp.name)
+        assert header == self.expected_header
+
+    def create_objects_dictionary(self):
+        # Create a dictionary with every combination of headers
+        # eg plate_name/platename/plate name/plate_id/plateid/plate id
+        self.d = {}
+        prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ]
+        for prefix in prefix_list:
+            self.d[f'{prefix}_name'] = ['a', 'b', 'c']
+            self.d[f'{prefix} name'] = ['a', 'b', 'c']
+            self.d[f'{prefix}name'] = ['a', 'b', 'c']
+            self.d[f'{prefix}_id'] = [1, 2, 3]
+            self.d[f'{prefix} id'] = [1, 2, 3]
+            self.d[f'{prefix}id'] = [1, 2, 3]
+            self.d[f'{prefix}'] = [1, 2, 3]
+        self.expected_header = [
+            's', 's', 's', 'l', 'l', 'l', 'l',
+            's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
+            'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
+            'well', 'well', 'well', 'l', 'l', 'l', 'well',
+            's', 's', 's', 'image', 'image', 'image', 'image',
+            's', 's', 's', 'roi', 'roi', 'roi', 'roi'
+        ]
+
+    def test_objects_columns(self):
+        self.create_objects_dictionary()
+        self.assert_detect_headers()
+
+    def test_dense_extra_columns(self):
+        '''
+        Test of the default automatic column type detection behaviour
+        '''
+        self.create_objects_dictionary()
+        self.d.update({
+            'measurement 1': [11, 22, 33],
+            'measurement 2': [0.1, 0.2, 0.3],
+            'measurement 3': ['a', 'b', 'c'],
+            'measurement 4': [True, True, False],
+            'measurement 5': [11, 0.1, True]
+        })
+        self.expected_header.extend(['l', 'd', 's', 'b', 's'])
+        self.assert_detect_headers()
 
 
 class TestColumnTypes:

From 7ff94a2a6ee3b0bea8629af9beb8cc3ff8a92d79 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 14:32:36 +0100
Subject: [PATCH 2/8] Allow MetadataControl.detect_header to control default NA

---
 src/omero_metadata/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py
index edce60b6..54c57422 100755
--- a/src/omero_metadata/cli.py
+++ b/src/omero_metadata/cli.py
@@ -489,7 +489,7 @@ def testtables(self, args):
             self.ctx.die(100, "Failed to initialize Table")
 
     @staticmethod
-    def detect_headers(csv_path):
+    def detect_headers(csv_path, keep_default_na=True):
         '''
         Function to automatically detect headers from a CSV file. This function
         loads the table to pandas to detects the column type and match headers
@@ -497,7 +497,7 @@ def detect_headers(csv_path):
 
         conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
         headers = []
-        table = pd.read_csv(csv_path)
+        table = pd.read_csv(csv_path, keep_default_na=keep_default_na)
         col_types = table.dtypes.values.tolist()
         cols = list(table.columns)
 

From 31d562bd0f4e7d09ee3a5329438be6f061580a7e Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 14:49:18 +0100
Subject: [PATCH 3/8] Add unit tests for the detection of sparse columns

---
 test/unit/test_automatic_header.py | 34 +++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py
index 67d14975..2fd0bcc7 100755
--- a/test/unit/test_automatic_header.py
+++ b/test/unit/test_automatic_header.py
@@ -19,11 +19,11 @@
 
 class TestDetectHeaders:
     """Test the MetadataControl.detect_headers API"""
-    def assert_detect_headers(self):
+    def assert_detect_headers(self, **kwargs):
         df = pd.DataFrame(data=self.d)
         tmp = tempfile.NamedTemporaryFile()
         df.to_csv(tmp.name, index=False)
-        header = MetadataControl.detect_headers(tmp.name)
+        header = MetadataControl.detect_headers(tmp.name, **kwargs)
         assert header == self.expected_header
 
     def create_objects_dictionary(self):
@@ -52,7 +52,7 @@ def test_objects_columns(self):
         self.create_objects_dictionary()
         self.assert_detect_headers()
 
-    def test_dense_extra_columns(self):
+    def test_dense_columns(self):
         '''
         Test of the default automatic column type detection behaviour
         '''
@@ -67,6 +67,34 @@ def test_dense_extra_columns(self):
         self.expected_header.extend(['l', 'd', 's', 'b', 's'])
         self.assert_detect_headers()
 
+    def test_sparse_default_na(self):
+        '''
+        Test default handling of missing values
+        '''
+        self.create_objects_dictionary()
+        self.d.update({
+            'measurement 1': [11, None, 33],
+            'measurement 2': [0.1, 0.2, None],
+            'measurement 3': ['a', 'b', None],
+            'measurement 4': [True, None, False],
+        })
+        self.expected_header.extend(['d', 'd', 's', 's'])
+        self.assert_detect_headers(keep_default_na=True)
+
+    def test_sparse_no_default_na(self):
+        '''
+        Test handling of missing values as string columns
+        '''
+        self.create_objects_dictionary()
+        self.d.update({
+            'measurement 1': [11, None, 33],
+            'measurement 2': [0.1, 0.2, None],
+            'measurement 3': ['a', 'b', None],
+            'measurement 4': [True, None, False],
+        })
+        self.expected_header.extend(['s', 's', 's', 's'])
+        self.assert_detect_headers(keep_default_na=False)
+
 
 class TestColumnTypes:
     '''

From 1f637bd0f73a2023112d4e87786b7c4432b1c298 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 15:05:47 +0100
Subject: [PATCH 4/8] Use --allow_nan flag in MetadataControl.detect_headers


From 1ffac2ae5d4d9e8b12f3761ad486587b317ffa79 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 15:21:25 +0100
Subject: [PATCH 5/8] Allow fixtures to override the value assertion with their
 expectations

---
 test/integration/metadata/test_populate.py | 27 +++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 test/integration/metadata/test_populate.py

diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py
old mode 100644
new mode 100755
index 619bc968..42af38e7
--- a/test/integration/metadata/test_populate.py
+++ b/test/integration/metadata/test_populate.py
@@ -175,6 +175,13 @@ def assert_columns(self, columns):
         col_names = "Well,Well Type,Concentration,Well Name"
         assert col_names == ",".join([c.name for c in columns])
 
+    def assert_values(self, row_values):
+        # Unsure where the lower-casing is happening
+        if "A1" in row_values or "a1" in row_values:
+            assert "Control" in row_values
+        elif "A2" in row_values or "a2" in row_values:
+            assert "Treatment" in row_values
+
     def assert_child_annotations(self, oas):
         for ma, wid, wr, wc in oas:
             assert isinstance(ma, MapAnnotationI)
@@ -767,6 +774,14 @@ def assert_columns(self, columns):
     def assert_row_count(self, rows):
         assert rows == len(self.roi_names)
 
+    def assert_values(self, row_values):
+        if "roi1" in row_values:
+            assert 0.5 in row_values
+            assert 100 in row_values
+        elif "roi2" in row_values:
+            assert 'nan' in [str(value) for value in row_values]
+            assert 200 in row_values
+
     def get_target(self):
         if not self.image:
             image = self.test.make_image()
@@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture):
             row_values = [col.values[0] for col in t.read(
                 list(range(len(cols))), hit, hit+1).columns]
             assert len(row_values) == fixture.count
-            # Unsure where the lower-casing is happening
-            if "A1" in row_values or "a1" in row_values:
-                assert "Control" in row_values
-            elif "A2" in row_values or "a2" in row_values:
-                assert "Treatment" in row_values
-            elif "roi1" in row_values:
-                assert 0.5 in row_values
-                assert 100 in row_values
-            elif "roi2" in row_values:
-                assert 'nan' in [str(value) for value in row_values]
-                assert 200 in row_values
+            fixture.assert_values(row_values)
 
     def _test_bulk_to_map_annotation_context(self, fixture, batch_size):
         # self._testPopulateMetadataPlate()

From fe73a17d2a71fd7d220c48891a2364110b59f4f1 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Thu, 2 Jun 2022 16:18:30 +0100
Subject: [PATCH 6/8] Add GNU-style hyphen-separated version of CLI arguments

---
 src/omero_metadata/cli.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py
index 54c57422..874d085e 100755
--- a/src/omero_metadata/cli.py
+++ b/src/omero_metadata/cli.py
@@ -241,11 +241,13 @@ def _configure(self, parser):
         populate.add_argument("--localcfg", help=(
             "Local configuration file or a JSON object string"))
 
-        populate.add_argument("--allow_nan", action="store_true", help=(
-            "Allow empty values to become Nan in Long or Double columns"))
+        populate.add_argument(
+           "--allow-nan", "--allow_nan", action="store_true", help=(
+                "Allow empty values to become Nan in Long or Double columns"))
 
-        populate.add_argument("--manual_header", action="store_true", help=(
-            "Disable automatic header detection during population"))
+        populate.add_argument(
+            "--manual-header", "--manual_header", action="store_true", help=(
+                "Disable automatic header detection during population"))
 
         populateroi.add_argument(
             "--measurement", type=int, default=None,

From 89b8084f7c142bf70d2ef9d0ce745067234ede99 Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Sun, 5 Jun 2022 14:47:05 +0100
Subject: [PATCH 7/8] Use --allow-nan flag to determine NA behavior in
 detect_headers

---
 src/omero_metadata/cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py
index 874d085e..3836051b 100755
--- a/src/omero_metadata/cli.py
+++ b/src/omero_metadata/cli.py
@@ -579,7 +579,8 @@ def populate(self, args):
         if not args.manual_header and \
                 not first_row[0].str.contains('# header').bool():
             omero_metadata.populate.log.info("Detecting header types")
-            header_type = MetadataControl.detect_headers(args.file)
+            header_type = MetadataControl.detect_headers(
+                args.file, keep_default_na=args.allow_nan)
             if args.dry_run:
                 omero_metadata.populate.log.info(f"Header Types:{header_type}")
         else:

From 27050f2c84350bffcdbdec7610f7d56bf5dd913c Mon Sep 17 00:00:00 2001
From: Sebastien Besson <seb.besson@gmail.com>
Date: Tue, 28 Jun 2022 09:23:39 +0100
Subject: [PATCH 8/8] Add note to README about the missing value handling

---
 README.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.rst b/README.rst
index a4bd3241..89b1741c 100644
--- a/README.rst
+++ b/README.rst
@@ -111,6 +111,9 @@ Example Float   ``DoubleColumn``  ``d``
 Example boolean ``BoolColumn``    ``b``      
 =============== ================= ====================
 
+In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the
+``omero metadata populate`` commands, missing values in floating-point columns will be detected as ``DoubleColumn`` and the
+missing values will be stored as NaN.
 
 However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below):