From 691c35fcdcbd94a7954b5c736183e2cf518c81e5 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 6 Mar 2024 10:12:41 -0500 Subject: [PATCH 1/3] Testing changes to run on Windows. --- tests/convert/test_isatab2w4m.py | 17 ++++++++++---- tests/isatab/test_isatab.py | 26 ++++++++++++--------- tests/validators/test_validate_test_data.py | 19 +++++++-------- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/tests/convert/test_isatab2w4m.py b/tests/convert/test_isatab2w4m.py index 30e05bcb..5096a98a 100644 --- a/tests/convert/test_isatab2w4m.py +++ b/tests/convert/test_isatab2w4m.py @@ -1,6 +1,5 @@ # Test conversion to W4M format -import filecmp import os import shutil import tempfile @@ -9,6 +8,16 @@ from isatools.tests import utils +def universal_filecmp(f1, f2): + with open(f1, 'r') as fp1, open(f2, 'r') as fp2: + while True: + b1 = fp1.readline() + b2 = fp2.readline() + if b1 != b2: + return False + if not b1: + return True + # Test presence of data folder def setUpModule(): if not os.path.exists(utils.DATA_DIR): @@ -46,7 +55,7 @@ def plain_test(self, study, test_dir): output_file = os.path.join(self._tmp_dir, '.'.join( ['-'.join([study, 'w4m', x]), 'tsv'])) self.assertTrue(os.path.exists(output_file)) - self.assertTrue(filecmp.cmp(output_file, ref_file, shallow=False), + self.assertTrue(universal_filecmp(output_file, ref_file), 'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file)) # Test MTBLS30 @@ -89,7 +98,7 @@ def na_filtering_test(self, study, test_dir, samp_na_filtering=None, 'sample-metadata', 'variable-metadata', 'sample-variable-matrix']: self.assertTrue(os.path.exists(output_files[x])) self.assertTrue( - filecmp.cmp(output_files[x], ref_files[x]), + universal_filecmp(output_files[x], ref_files[x]), 'Output file "{0}" differs from reference file "{1}".'.format( output_files[x], ref_files[x])) @@ -140,5 +149,5 @@ def test_assay_selection(self): ['-'.join([study, 'w4m', x, assay]), 'tsv'])) 
self.assertTrue(os.path.exists(output_file)) self.assertTrue( - filecmp.cmp(output_file, ref_file), + universal_filecmp(output_file, ref_file), 'Output file "{0}" differs from reference file "{1}".'.format(output_file, ref_file)) diff --git a/tests/isatab/test_isatab.py b/tests/isatab/test_isatab.py index 1e249e50..25eee0d4 100644 --- a/tests/isatab/test_isatab.py +++ b/tests/isatab/test_isatab.py @@ -28,6 +28,9 @@ def setUpModule(): "git clone -b tests --single-branch git@github.com:ISA-tools/ISAdatasets {0}" .format(utils.DATA_DIR)) +def replace_windows_newlines(input_string): + return input_string.replace('\r\r\n', '\n').replace('\r\n', '\n').replace('\r', '\n') + class TestIsaMerge(unittest.TestCase): @@ -1069,7 +1072,7 @@ def test_source_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tSample Name source1\tsample collection\tsample1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_x2(self): i = Investigation() @@ -1167,7 +1170,7 @@ def test_source_protocol_ref_sample_with_characteristics(self): i.studies = [s] expected = """Source Name\tCharacteristics[reference descriptor]\tProtocol REF\tSample Name\tCharacteristics[organism part] source1\tnot applicable\tsample collection\tsample1\tliver""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_with_parameter_values(self): i = Investigation() @@ -1188,7 +1191,7 @@ def test_source_protocol_ref_sample_with_parameter_values(self): i.studies = [s] expected = """Source Name\tProtocol REF\tParameter Value[temperature]\tSample Name source1\tsample collection\t10\tsample1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_with_factor_values(self): i = Investigation() @@ -1216,11 +1219,11 @@ 
def test_source_protocol_ref_sample_with_factor_values(self): s.assays = [a] expected_study_table = """Source Name\tProtocol REF\tSample Name\tFactor Value[study group] source1\tsample collection\tsample1\tStudy group 1""" - self.assertIn(expected_study_table, isatab.dumps(i)) + self.assertIn(expected_study_table, replace_windows_newlines(isatab.dumps(i))) expected_assay_table = """Sample Name\tFactor Value[study group]\tProtocol REF sample1\tStudy group 1\textraction""" self.assertIn(expected_assay_table, - isatab.dumps(i, write_fvs_in_assay_table=True)) + replace_windows_newlines(isatab.dumps(i, write_fvs_in_assay_table=True))) def test_source_protocol_ref_protocol_ref_sample(self): i = Investigation() @@ -1239,7 +1242,7 @@ def test_source_protocol_ref_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tProtocol REF\tSample Name source1\tsample collection\taliquoting\taliquot1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_source_protocol_ref_sample_protocol_ref_sample(self): i = Investigation() @@ -1261,7 +1264,7 @@ def test_source_protocol_ref_sample_protocol_ref_sample(self): i.studies = [s] expected = """Source Name\tProtocol REF\tSample Name\tProtocol REF\tSample Name source1\tsample collection\tsample1\taliquoting\taliquot1""" - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data2(self): i = Investigation() @@ -1295,7 +1298,7 @@ def test_sample_protocol_ref_material_protocol_ref_data2(self): i.studies = [s] expected = (f"""Sample Name\tProtocol REF\tExtract Name\tProtocol REF\tAssay Name\tRaw Data File\tComment[checksum type]\tComment[checksum]\n""" + f"""sample1\textraction\textract1\tnucleic acid sequencing\tassay-1\tdatafile.raw\t{cs_comment1.value}\t{cs_comment2.value}""") - self.assertIn(expected, isatab.dumps(i)) + 
self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data3(self): i = Investigation() @@ -1334,7 +1337,7 @@ def test_sample_protocol_ref_material_protocol_ref_data3(self): # self.assertIn(expected_line1, dump_out) # self.assertIn(expected_line2, dump_out) - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data4(self): i = Investigation() @@ -1373,7 +1376,7 @@ def test_sample_protocol_ref_material_protocol_ref_data4(self): # self.assertIn(expected_line1, dump_out) # self.assertIn(expected_line2, dump_out) - self.assertIn(expected, isatab.dumps(i)) + self.assertIn(expected, replace_windows_newlines(isatab.dumps(i))) def test_sample_protocol_ref_material_protocol_ref_data_x2(self): i = Investigation() @@ -1710,7 +1713,7 @@ def test_isatab_preprocess_issue235(self): test_isatab_str = b""""Sample Name" "Protocol REF" "Parameter Value[medium]" "Term Source REF" "Term Accession Number" "Parameter Value[serum]" "Term Source REF" "Term Accession Number" "Parameter Value[serum concentration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[medium volume]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[migration modulator]" "Term Source REF" "Term Accession Number" "Parameter Value[modulator concentration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[modulator distribution]" "Term Source REF" "Term Accession Number" "Protocol REF" "Parameter Value[imaging technique]" "Term Source REF" "Term Accession Number" "Parameter Value[imaging technique temporal feature]" "Term Source REF" "Term Accession Number" "Parameter Value[acquisition duration]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[time interval]" "Unit" "Term Source REF" "Term Accession Number" "Parameter Value[objective type]" "Term Source REF" "Term Accession Number" 
"Parameter Value[objective magnification]" "Term Source REF" "Term Accession Number" "Parameter Value[objective numerical aperture]" "Term Source REF" "Term Accession Number" "Parameter Value[acquisition channel count]" "Term Source REF" "Term Accession Number" "Parameter Value[reporter]" "Term Source REF" "Term Accession Number" "Parameter Value[voxel size]" "Unit" "Term Source REF" "Term Accession Number" "Assay Name" "Raw Data File" "Protocol REF" "Parameter Value[software]" "Term Source REF" "Term Accession Number" "Data Transformation Name" "Derived Data File" "culture1" "migration assay" "RPMI-1640" "" "" "Heat Inactivated Fetal Bovine Serum " "" "" "10" "%" "UO" "http://purl.obolibrary.org/obo/UO_0000165" "300" "microliter" "UO" "http://purl.obolibrary.org/obo/UO_0000101" "" "" "" "" "" "" "" "gradient" "" "" "imaging" "phase-contrast microscopy" "" "" "dynamic" "" "" "6" "hour" "UO" "http://purl.obolibrary.org/obo/UO_0000032" "15" "minute" "UO" "http://purl.obolibrary.org/obo/UO_0000031" "" "" "" "20" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "culture1" "" "data transformation" "CELLMIA" "" "" "" "" """ - with tempfile.NamedTemporaryFile() as tmp: + with tempfile.NamedTemporaryFile(delete=False) as tmp: tmp.write(test_isatab_str) tmp.seek(0) study_assay_parser = isatab_parser.StudyAssayParser('mock.txt') @@ -1719,6 +1722,7 @@ def test_isatab_preprocess_issue235(self): if """Protocol REF\tData Transformation Name""" in header: self.fail('Incorrectly inserted Protocol REF before ' 'Data Transformation Name') + os.remove(tmp.name) def test_isatab_factor_value_parsing_issue270(self): with open(os.path.join(self._tab_data_dir, 'issue270', 'i_matteo.txt'), diff --git a/tests/validators/test_validate_test_data.py b/tests/validators/test_validate_test_data.py index 33fb9840..4d8c1a4e 100644 --- a/tests/validators/test_validate_test_data.py +++ b/tests/validators/test_validate_test_data.py @@ -2,6 +2,7 @@ import logging import os import unittest +import pathlib 
from jsonschema import Draft4Validator from jsonschema import RefResolver @@ -304,8 +305,8 @@ class TestIsaJsonCreateTestData(unittest.TestCase): def setUp(self): self._reporting_level = logging.ERROR - self.v2_create_schemas_path = os.path.join( - os.path.dirname(__file__), '../..', 'isatools', 'resources', 'schemas', + self.v2_create_schemas_path = pathlib.PurePosixPath( + pathlib.Path(__file__).parents[0], '..', '..', 'isatools', 'resources', 'schemas', 'isa_model_version_2_0_schemas', 'create') def test_validate_testdata_sampleassayplan_json(self): @@ -314,10 +315,9 @@ def test_validate_testdata_sampleassayplan_json(self): with open(os.path.join(self.v2_create_schemas_path, 'sample_assay_plan_schema.json')) as fp: sample_assay_plan_schema = json.load(fp) - resolver = RefResolver('file://{}'.format( - os.path.join(self.v2_create_schemas_path, - 'sample_assay_plan_schema.json')), - sample_assay_plan_schema) + res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + 'sample_assay_plan_schema.json').as_uri() + resolver = RefResolver(res_path, sample_assay_plan_schema) validator = Draft4Validator(sample_assay_plan_schema, resolver=resolver) validator.validate(json.load(test_case_fp)) @@ -342,10 +342,9 @@ def test_validate_testdata_treatment_sequence_json(self): with open(os.path.join(self.v2_create_schemas_path, 'treatment_sequence_schema.json')) as fp: treatment_sequence_schema = json.load(fp) - resolver = RefResolver('file://{}'.format( - os.path.join(self.v2_create_schemas_path, - 'treatment_sequence_schema.json')), - treatment_sequence_schema) + res_path = pathlib.PurePosixPath("file://", self.v2_create_schemas_path, + 'treatment_sequence_schema.json').as_uri() + resolver = RefResolver(res_path, treatment_sequence_schema) validator = Draft4Validator(treatment_sequence_schema, resolver=resolver) validator.validate(json.load(test_case_fp)) From 00509085ba6baadaa9ca60e10dfcb6fdd58d7110 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Thu, 22 
Feb 2024 02:18:55 -0500 Subject: [PATCH 2/3] Update defaults.py The previous regex didn't match any DOI I tried. I found this one in a blog post from Crossref where they say it matched 74.4M out of 74.9M DOIs that they have seen. The post is here: https://www.crossref.org/blog/dois-and-matching-regular-expressions/ --- isatools/isatab/defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isatools/isatab/defaults.py b/isatools/isatab/defaults.py index c38566e1..c2487123 100644 --- a/isatools/isatab/defaults.py +++ b/isatools/isatab/defaults.py @@ -34,7 +34,7 @@ def pbar(x): _RX_I_FILE_NAME = compile(r'i_(.*?)\.txt') _RX_DATA = compile(r'data\[(.*?)\]') _RX_COMMENT = compile(r'Comment\[(.*?)\]') -_RX_DOI = compile(r'(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)') +_RX_DOI = compile(r'10.\d{4,9}/[-._;()/:a-z0-9A-Z]+') _RX_PMID = compile(r'[0-9]{8}') _RX_PMCID = compile(r'PMC[0-9]{8}') _RX_CHARACTERISTICS = compile(r'Characteristics\[(.*?)\]') From 2275ee912949fc20f1d653c282842ccfb2d272a0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Wed, 6 Mar 2024 14:20:34 -0500 Subject: [PATCH 3/3] Update validate/test_core.py Since the DOI regex works now there are 2 fewer warnings. Previously it was warning about valid DOIs. 
--- tests/isatab/validate/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/isatab/validate/test_core.py b/tests/isatab/validate/test_core.py index e2b2c3cd..6c9aeda9 100644 --- a/tests/isatab/validate/test_core.py +++ b/tests/isatab/validate/test_core.py @@ -17,7 +17,7 @@ def test_b_ii_s_3(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(fp=data_file, config_dir=self.default_conf, origin="") - self.assertEqual(len(r['warnings']), 12) + self.assertEqual(len(r['warnings']), 10) def test_mtbls267(self): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'MTBLS267-partial') @@ -82,7 +82,7 @@ def is_investigation(investigation_df): data_path = path.join(path.dirname(path.abspath(__file__)), '..', '..', 'data', 'tab', 'BII-S-3') with open(path.join(data_path, 'i_gilbert.txt'), 'r') as data_file: r = validate(data_file, rules=rules) - self.assertEqual(len(r['warnings']), 12) + self.assertEqual(len(r['warnings']), 10) rule = '12000' expected_error = {