Skip to content

Commit

Permalink
Merge pull request #77 from sbesson/detect_headers_sparse
Browse files Browse the repository at this point in the history
Fix header detection for tables with sparse numerical data
  • Loading branch information
sbesson authored Jun 28, 2022
2 parents f7d5c91 + 27050f2 commit dea8834
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 55 deletions.
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ Example Float ``DoubleColumn`` ``d``
Example boolean ``BoolColumn`` ``b``
=============== ================= ====================

In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the
``omero metadata populate`` command, missing values in floating-point columns will be detected as ``DoubleColumn`` and the
missing values will be stored as ``NaN``.

However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below):

Expand Down
17 changes: 10 additions & 7 deletions src/omero_metadata/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,13 @@ def _configure(self, parser):
populate.add_argument("--localcfg", help=(
"Local configuration file or a JSON object string"))

populate.add_argument("--allow_nan", action="store_true", help=(
"Allow empty values to become Nan in Long or Double columns"))
populate.add_argument(
"--allow-nan", "--allow_nan", action="store_true", help=(
"Allow empty values to become Nan in Long or Double columns"))

populate.add_argument("--manual_header", action="store_true", help=(
"Disable automatic header detection during population"))
populate.add_argument(
"--manual-header", "--manual_header", action="store_true", help=(
"Disable automatic header detection during population"))

populateroi.add_argument(
"--measurement", type=int, default=None,
Expand Down Expand Up @@ -489,15 +491,15 @@ def testtables(self, args):
self.ctx.die(100, "Failed to initialize Table")

@staticmethod
def detect_headers(csv_path):
def detect_headers(csv_path, keep_default_na=True):
'''
Function to automatically detect headers from a CSV file. This function
loads the table to pandas to detects the column type and match headers
'''

conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi']
headers = []
table = pd.read_csv(csv_path)
table = pd.read_csv(csv_path, keep_default_na=keep_default_na)
col_types = table.dtypes.values.tolist()
cols = list(table.columns)

Expand Down Expand Up @@ -577,7 +579,8 @@ def populate(self, args):
if not args.manual_header and \
not first_row[0].str.contains('# header').bool():
omero_metadata.populate.log.info("Detecting header types")
header_type = MetadataControl.detect_headers(args.file)
header_type = MetadataControl.detect_headers(
args.file, keep_default_na=args.allow_nan)
if args.dry_run:
omero_metadata.populate.log.info(f"Header Types:{header_type}")
else:
Expand Down
27 changes: 16 additions & 11 deletions test/integration/metadata/test_populate.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ def assert_columns(self, columns):
col_names = "Well,Well Type,Concentration,Well Name"
assert col_names == ",".join([c.name for c in columns])

def assert_values(self, row_values):
    """Check that each well row carries the expected well-type value."""
    # Unsure where the lower-casing is happening
    for well, expected in (("a1", "Control"), ("a2", "Treatment")):
        if well in row_values or well.upper() in row_values:
            assert expected in row_values
            break

def assert_child_annotations(self, oas):
for ma, wid, wr, wc in oas:
assert isinstance(ma, MapAnnotationI)
Expand Down Expand Up @@ -767,6 +774,14 @@ def assert_columns(self, columns):
def assert_row_count(self, rows):
    """Every named ROI should have produced exactly one table row."""
    expected = len(self.roi_names)
    assert rows == expected

def assert_values(self, row_values):
    """Check the measurement values stored for each known ROI row."""
    as_strings = [str(value) for value in row_values]
    if "roi1" in row_values:
        # roi1 has both measurements populated
        assert 0.5 in row_values
        assert 100 in row_values
    elif "roi2" in row_values:
        # roi2 has a missing double value, stored as NaN
        assert "nan" in as_strings
        assert 200 in row_values

def get_target(self):
if not self.image:
image = self.test.make_image()
Expand Down Expand Up @@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture):
row_values = [col.values[0] for col in t.read(
list(range(len(cols))), hit, hit+1).columns]
assert len(row_values) == fixture.count
# Unsure where the lower-casing is happening
if "A1" in row_values or "a1" in row_values:
assert "Control" in row_values
elif "A2" in row_values or "a2" in row_values:
assert "Treatment" in row_values
elif "roi1" in row_values:
assert 0.5 in row_values
assert 100 in row_values
elif "roi2" in row_values:
assert 'nan' in [str(value) for value in row_values]
assert 200 in row_values
fixture.assert_values(row_values)

def _test_bulk_to_map_annotation_context(self, fixture, batch_size):
# self._testPopulateMetadataPlate()
Expand Down
114 changes: 77 additions & 37 deletions test/unit/test_automatic_header.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,83 @@
StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn


def test_detect_headers():
    '''
    Test of the default automatic column type detection behaviour
    '''
    data = {
        'measurement 1': [11, 22, 33],
        'measurement 2': [0.1, 0.2, 0.3],
        'measurement 3': ['a', 'b', 'c'],
        'measurement 4': [True, True, False],
        'measurement 5': [11, 0.1, True],
    }
    # Add every naming combination for each object column, e.g.
    # plate_name/plate name/platename/plate_id/plate id/plateid/plate
    names = ['a', 'b', 'c']
    ids = [1, 2, 3]
    for prefix in ('project', 'dataset', 'plate', 'well', 'image', 'roi'):
        for suffix in ('_name', ' name', 'name'):
            data[prefix + suffix] = names
        for suffix in ('_id', ' id', 'id', ''):
            data[prefix + suffix] = ids

    tmp = tempfile.NamedTemporaryFile()
    pd.DataFrame(data=data).to_csv(tmp.name, index=False)
    detected = MetadataControl.detect_headers(tmp.name)
    assert detected == [
        'l', 'd', 's', 'b', 's',
        's', 's', 's', 'l', 'l', 'l', 'l',
        's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
        'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
        'well', 'well', 'well', 'l', 'l', 'l', 'well',
        's', 's', 's', 'image', 'image', 'image', 'image',
        's', 's', 's', 'roi', 'roi', 'roi', 'roi',
    ]
class TestDetectHeaders:
    """Test the MetadataControl.detect_headers API"""

    def assert_detect_headers(self, **kwargs):
        # Round-trip self.d through a CSV file and compare the detected
        # column types against self.expected_header.
        frame = pd.DataFrame(data=self.d)
        tmp = tempfile.NamedTemporaryFile()
        frame.to_csv(tmp.name, index=False)
        detected = MetadataControl.detect_headers(tmp.name, **kwargs)
        assert detected == self.expected_header

    def create_objects_dictionary(self):
        """Fill self.d with every naming combination of the object columns
        (e.g. plate_name/plate name/platename/plate_id/plate id/plateid/
        plate) and self.expected_header with the matching expected types."""
        self.d = {}
        names = ['a', 'b', 'c']
        ids = [1, 2, 3]
        for prefix in ('project', 'dataset', 'plate', 'well', 'image',
                       'roi'):
            for suffix in ('_name', ' name', 'name'):
                self.d[prefix + suffix] = names
            for suffix in ('_id', ' id', 'id', ''):
                self.d[prefix + suffix] = ids
        self.expected_header = [
            's', 's', 's', 'l', 'l', 'l', 'l',
            's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset',
            'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate',
            'well', 'well', 'well', 'l', 'l', 'l', 'well',
            's', 's', 's', 'image', 'image', 'image', 'image',
            's', 's', 's', 'roi', 'roi', 'roi', 'roi',
        ]

    def test_objects_columns(self):
        '''
        Test detection of the object columns alone
        '''
        self.create_objects_dictionary()
        self.assert_detect_headers()

    def test_dense_columns(self):
        '''
        Test of the default automatic column type detection behaviour
        '''
        self.create_objects_dictionary()
        measurements = {
            'measurement 1': [11, 22, 33],
            'measurement 2': [0.1, 0.2, 0.3],
            'measurement 3': ['a', 'b', 'c'],
            'measurement 4': [True, True, False],
            'measurement 5': [11, 0.1, True],
        }
        self.d.update(measurements)
        self.expected_header += ['l', 'd', 's', 'b', 's']
        self.assert_detect_headers()

    def test_sparse_default_na(self):
        '''
        Test default handling of missing values
        '''
        self.create_objects_dictionary()
        sparse = {
            'measurement 1': [11, None, 33],
            'measurement 2': [0.1, 0.2, None],
            'measurement 3': ['a', 'b', None],
            'measurement 4': [True, None, False],
        }
        self.d.update(sparse)
        # With keep_default_na, sparse numeric columns become doubles
        self.expected_header += ['d', 'd', 's', 's']
        self.assert_detect_headers(keep_default_na=True)

    def test_sparse_no_default_na(self):
        '''
        Test handling of missing values as string columns
        '''
        self.create_objects_dictionary()
        sparse = {
            'measurement 1': [11, None, 33],
            'measurement 2': [0.1, 0.2, None],
            'measurement 3': ['a', 'b', None],
            'measurement 4': [True, None, False],
        }
        self.d.update(sparse)
        # Without default NA handling, every sparse column is a string
        self.expected_header += ['s', 's', 's', 's']
        self.assert_detect_headers(keep_default_na=False)


class TestColumnTypes:
Expand Down

0 comments on commit dea8834

Please sign in to comment.