Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Support for different CSV Encodings in Import_Scripts/Populate_Metadata.py #198

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions omero/import_scripts/Populate_Metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def populate_metadata(client, conn, script_params):
object_ids = script_params["IDs"]
object_id = object_ids[0]
data_type = script_params["Data_Type"]

encoding = script_params["CSV Encoding"]
if data_type == "Image":
try:
from omero_metadata.populate import ImageWrapper # noqa: F401
Expand All @@ -120,7 +120,7 @@ def populate_metadata(client, conn, script_params):
original_file = get_original_file(
conn, data_type, object_id, file_ann_id)
provider = DownloadingOriginalFileProvider(conn)
data_for_preprocessing = provider.get_original_file_data(original_file)
data_for_preprocessing = provider.get_original_file_data(original_file, encoding=encoding)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

presumably this line fails if you use the wrong encoding? Or only if you use utf-8?
A try/except that returns a useful message (and/or prints it to stdout) could be helpful?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately no. I discovered this issue because one of our clients was using german Umlauts in their iso-8859-1 encoded CSV, which indeed raise an UnicodeDecodeError at this position. Unfortunately this does not hold true, if it would be the other way around, i.e. specifying 'iso-8859-1' on a Unicode encoded CSV. This will not return an error, but the string is non-sensical. I presume that this can lead to failures down the line in the script (e.g. when the image name is not correct due to the mis-read string), but it could also just lead to nonsensical annotations.
I'm not sure, how one would go about catching this behaviour in general.

You can see the effect for yourself with this test code:

test_str = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ'

test_str.encode('latin-1').decode('utf-8')
> Traceback (most recent call last):
>  File "<stdin>", line 1, in <module>
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 52: invalid continuation byte

test.encode('utf-8').decode('latin-1')
> 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÃ\x84Ã\x96Ã\x9c'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so it's not guaranteed to fail, but if it does raise a UnicodeDecodeError that could be worth catching?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I will try to find ways to break the other encodings as well and check what kind of errors they will give. Might not be 100% but better catching some of the errors than none.

temp_name = data_for_preprocessing.name
# 5.9.1 returns NamedTempFile where name is a string.
if isinstance(temp_name, int):
Expand Down Expand Up @@ -172,6 +172,13 @@ def run_script():
"File_Annotation", grouping="3",
description="File Annotation ID containing metadata to populate. "
"Note this is not the same as the File ID."),

scripts.String(
"CSV Encoding", grouping="4",
description="""Encoding of the CSV File provided. Can depend on your system locale
as well as the program used to generate the CSV File. E.g. Excel defaults to machine specific
ANSI encoding during export to CSV (i.e. cp1252 on US machines, iso-8859-1 on german machines ...).""",
default="utf-8"),

authors=["Emil Rozbicki", "OME Team"],
institutions=["Glencoe Software Inc."],
Expand Down
104 changes: 104 additions & 0 deletions test/integration/test_import_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,107 @@ def test_populate_metadata_for_screen(self):
assert message is not None
assert message.getValue().startswith('Table data populated')
conn.close()

def test_populate_metadata_for_cp1252(self):
sid = super(TestImportScripts, self).get_script(populate_metadata)
assert sid > 0

client, user = self.new_client_and_user()
update_service = client.getSession().getUpdateService()
plates = self.import_plates(client, plate_cols=3, plate_rows=1)
plate = plates[0]
name = plate.name.val
screen = omero.model.ScreenI()
screen.name = omero.rtypes.rstring("test_for_cp1252")
spl = omero.model.ScreenPlateLinkI()
spl.setParent(screen)
spl.setChild(plate)
spl = update_service.saveAndReturnObject(spl)
screen_id = spl.getParent().id.val
assert screen_id > 0
assert spl.getChild().id.val == plate.id.val

cvs_file = create_path("test_cp1252", ".csv")

# create a file annotation
with open(cvs_file.abspath(), 'wb+') as f:
f.write("Well,Plate, Well Type, Facility-Salt-Batch-ID\n".encode("cp1252"))
f.write(("A01,%s,Treatment,FOOL10041-101-2\n" % name).encode("cp1252"))
f.write(("A02,%s,Control,\n" % name).encode("cp1252"))
f.write(("A03,%s,Treatment,FOOL10041-101-2\n" % name).encode("cp1252"))

conn = BlitzGateway(client_obj=client)
fa = conn.createFileAnnfromLocalFile(cvs_file, mimetype="text/csv")
assert fa is not None
assert fa.id > 0
link = omero.model.ScreenAnnotationLinkI()
link.setParent(omero.model.ScreenI(screen_id, False))
link.setChild(omero.model.FileAnnotationI(fa.id, False))
link = update_service.saveAndReturnObject(link)
assert link.id.val > 0
# run the script
screen_ids = []
screen_ids.append(spl.getParent().id)

args = {
"Data_Type": omero.rtypes.rstring("Screen"),
"IDs": omero.rtypes.rlist(screen_ids),
"File_Annotation": omero.rtypes.rstring(str(fa.id)),
"CSV Encoding": omero.rtypes.rstring(str("cp1252"))
}
message = run_script(client, sid, args, "Message")
assert message is not None
assert message.getValue().startswith('Table data populated')
conn.close()

def test_populate_metadata_for_iso8859(self):
sid = super(TestImportScripts, self).get_script(populate_metadata)
assert sid > 0

client, user = self.new_client_and_user()
update_service = client.getSession().getUpdateService()
plates = self.import_plates(client, plate_cols=3, plate_rows=1)
plate = plates[0]
name = plate.name.val
screen = omero.model.ScreenI()
screen.name = omero.rtypes.rstring("test_for_iso8859")
spl = omero.model.ScreenPlateLinkI()
spl.setParent(screen)
spl.setChild(plate)
spl = update_service.saveAndReturnObject(spl)
screen_id = spl.getParent().id.val
assert screen_id > 0
assert spl.getChild().id.val == plate.id.val

cvs_file = create_path("test_iso8859", ".csv")

# create a file annotation
with open(cvs_file.abspath(), 'wb+') as f:
f.write("Well,Plate, Well Type, Facility-Salt-Batch-ID\n".encode("iso-8859-1"))
f.write(("A01,%s,Treatment,FOOL10041-101-2\n" % name).encode("iso-8859-1"))
f.write(("A02,%s,Control,\n" % name).encode("iso-8859-1"))
f.write(("A03,%s,Treatment,FOOL10041-101-2\n" % name).encode("iso-8859-1"))

conn = BlitzGateway(client_obj=client)
fa = conn.createFileAnnfromLocalFile(cvs_file, mimetype="text/csv")
assert fa is not None
assert fa.id > 0
link = omero.model.ScreenAnnotationLinkI()
link.setParent(omero.model.ScreenI(screen_id, False))
link.setChild(omero.model.FileAnnotationI(fa.id, False))
link = update_service.saveAndReturnObject(link)
assert link.id.val > 0
# run the script
screen_ids = []
screen_ids.append(spl.getParent().id)

args = {
"Data_Type": omero.rtypes.rstring("Screen"),
"IDs": omero.rtypes.rlist(screen_ids),
"File_Annotation": omero.rtypes.rstring(str(fa.id)),
"CSV Encoding": omero.rtypes.rstring(str("iso-8859-1"))
}
message = run_script(client, sid, args, "Message")
assert message is not None
assert message.getValue().startswith('Table data populated')
conn.close()