diff --git a/.gitignore b/.gitignore index 95a4edc..a7b0385 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,7 @@ celerybeat-schedule .env # virtualenv +.venv/ venv/ ENV/ diff --git a/README.md b/README.md index 6fdfd57..0e6b11f 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ meltano elt --catalog=my-catalog.json --job_id=my-job-state tap-spreadsheets-any ### JSON support -JSON files are expected to parse as a root-level array of objects where each object is a set of flat key-value pairs. +JSON files are expected to parse as a root-level array of objects where each object is a set of key-value pairs. ```json [ { "name": "row one", "key": 42}, diff --git a/tap_spreadsheets_anywhere/conversion.py b/tap_spreadsheets_anywhere/conversion.py index 93889e7..9145c76 100644 --- a/tap_spreadsheets_anywhere/conversion.py +++ b/tap_spreadsheets_anywhere/conversion.py @@ -26,8 +26,9 @@ def coerce(datum,declared_types): if datum is None or datum == '': return None - desired_type = declared_types.copy() - if isinstance(desired_type, list): + desired_type = declared_types + if isinstance(declared_types, list): + desired_type = declared_types.copy() if "null" in desired_type: desired_type.remove("null") desired_type = desired_type[0] @@ -43,6 +44,15 @@ def convert(datum, desired_type=None): if datum is None or str(datum).strip() == '': return None, None, + if isinstance(datum, bool) or desired_type == 'boolean': + return datum, 'boolean', + + if isinstance(datum, list) or desired_type == 'array': + return datum, 'array', + + if isinstance(datum, dict) or desired_type == 'object': + return datum, 'object', + if desired_type in (None, 'integer'): try: datum_int = int(datum) # Confirm it can be coerced to int @@ -117,6 +127,12 @@ def pick_datatype(counts,prefer_number_vs_integer=False): to_return = 'number' elif counts.get('date-time', 0) > 0: to_return = 'date-time' + elif counts.get('boolean', 0) > 0: + to_return = 'boolean' + elif counts.get('array', 0) > 0: + to_return = 'array' + elif counts.get('object', 0) > 0: + to_return = 'object' elif counts.get('string', 0) <= 0: LOGGER.warning(f"Unexpected data type encountered in histogram {counts}. Defaulting type to String.") @@ -149,6 +165,21 @@ def generate_schema(samples,prefer_number_vs_integer=False, prefer_schema_as_str 'type': ['null', 'string'], 'format': 'date-time', } + elif datatype == 'array': + # TODO: This currently only uses the first record. + # Extend that by sampling more values. + if samples[0][key]: + _, inner_type = convert(samples[0][key][0]) + else: + inner_type = None + to_return[key] = { + 'type': ['null', 'array'], + 'items': {'type': inner_type}, + } + elif datatype == 'object': + to_return[key] = { + 'type': ['null', 'object'], + } else: to_return[key] = { 'type': ['null', datatype], diff --git a/tap_spreadsheets_anywhere/test/test_conversion.py b/tap_spreadsheets_anywhere/test/test_conversion.py index 4def7ab..02c49f7 100644 --- a/tap_spreadsheets_anywhere/test/test_conversion.py +++ b/tap_spreadsheets_anywhere/test/test_conversion.py @@ -64,7 +64,7 @@ def test_pick_datatype(self): 'number': 1}), 'string') self.assertEqual(pick_datatype({}), 'string') - def test_generate_schema(self): + def test_generate_schema_flat(self): self.assertEqual( generate_schema([{'id': '1', 'first_name': 'Connor'}, {'id': '2', 'first_name': '1'}]), @@ -88,3 +88,35 @@ def test_generate_schema(self): {'id': '2', 'date': '2017-01-02'}]), {'id': {'type': ['null', 'integer'],}, 'date': {'type': ['null', 'string'],}}) + + def test_generate_schema_array_valid(self): + self.assertEqual( + generate_schema([{'value': ['foo', 'bar', 'baz', None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': 'string'}}}) + self.assertEqual( + generate_schema([{'value': [1, 2, 3, None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': 'integer'}}}) + self.assertEqual( + generate_schema([{'value': ['1', '2', '3', None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': 'integer'}}}) + self.assertEqual( + generate_schema([{'value': [42.42, 84.84, None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': 'number'}}}) + self.assertEqual( + generate_schema([{'value': [True, None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': 'boolean'}}}) + + def test_generate_schema_array_empty_na(self): + self.assertEqual( + generate_schema([{'value': [None]}]), + {'value': {'type': ['null', 'array'], 'items': {'type': None}}}) + self.assertEqual( + generate_schema([{'value': []}]), + {'value': {'type': ['null', 'array'], 'items': {'type': None}}}) + + def test_generate_schema_object(self): + self.assertEqual( + generate_schema([{'id': '1', 'value': {'foo': 'bar'}}, + {'id': '2', 'value': {'baz': 'qux'}}]), + {'id': {'type': ['null', 'integer'],}, + 'value': {'type': ['null', 'object'],}}) diff --git a/tap_spreadsheets_anywhere/test/test_jsonl.py b/tap_spreadsheets_anywhere/test/test_jsonl.py index 23dfe09..54595d5 100644 --- a/tap_spreadsheets_anywhere/test/test_jsonl.py +++ b/tap_spreadsheets_anywhere/test/test_jsonl.py @@ -55,6 +55,22 @@ "start_date": "2017-05-01T00:00:00Z", "key_properties": [], "format": "detect" + }, + { + "path": "file://./tap_spreadsheets_anywhere/test", + "name": "jsonl_sample_with_array", + "pattern": "sample-array\\.jsonl", + "start_date": "2017-05-01T00:00:00Z", + "key_properties": [], + "format": "detect" + }, + { + "path": "file://./tap_spreadsheets_anywhere/test", + "name": "jsonl_sample_with_object", + "pattern": "sample-object\\.jsonl", + "start_date": "2017-05-01T00:00:00Z", + "key_properties": [], + "format": "detect" } ] } @@ -121,3 +137,34 @@ def test_one_row_jsonl_file_detect(self): row_count += 1 self.assertEqual(3884, row['id'], f"ID field is {row['id']} - expected it to be 3884.") self.assertEqual(expected_row_count, row_count, f"Expected row_count to be {expected_row_count} but was {row_count}") + + def test_jsonl_with_array(self): + """ + Verify arrays are propagated without serializing them to strings. + """ + test_filename_uri = './tap_spreadsheets_anywhere/test/type-array.jsonl' + iterator = format_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][6], test_filename_uri) + records = list(iterator) + expected_row_count = 3 + row_count = len(records) + self.assertEqual({"id": 1, "value": [1.1, 2.1, 1.1, 1.3]}, records[0]) + self.assertEqual(expected_row_count, row_count, f"Expected row_count to be {expected_row_count} but was {row_count}") + + def test_jsonl_with_object(self): + """ + Verify objects are propagated without serializing them to strings. + """ + test_filename_uri = './tap_spreadsheets_anywhere/test/type-object.jsonl' + iterator = format_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][7], test_filename_uri) + records = list(iterator) + expected_row_count = 6 + row_count = len(records) + self.assertEqual([ + {"id": 1, "value": {"string": "foo"}}, + {"id": 2, "value": {"integer": 42}}, + {"id": 3, "value": {"float": 42.42}}, + {"id": 4, "value": {"boolean": True}}, + {"id": 5, "value": {"nested-array": [1, 2, 3]}}, + {"id": 6, "value": {"nested-object": {"foo": "bar"}}}, + ], records) + self.assertEqual(expected_row_count, row_count, f"Expected row_count to be {expected_row_count} but was {row_count}") diff --git a/tap_spreadsheets_anywhere/test/type-array.jsonl b/tap_spreadsheets_anywhere/test/type-array.jsonl new file mode 100644 index 0000000..479723a --- /dev/null +++ b/tap_spreadsheets_anywhere/test/type-array.jsonl @@ -0,0 +1,3 @@ +{"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]} +{"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]} +{"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]} diff --git a/tap_spreadsheets_anywhere/test/type-object.jsonl b/tap_spreadsheets_anywhere/test/type-object.jsonl new file mode 100644 index 0000000..5c68245 --- /dev/null +++ b/tap_spreadsheets_anywhere/test/type-object.jsonl @@ -0,0 +1,6 @@ +{"id": 1, "value": {"string": "foo"}} +{"id": 2, "value": {"integer": 42}} +{"id": 3, "value": {"float": 42.42}} +{"id": 4, "value": {"boolean": true}} +{"id": 5, "value": {"nested-array": [1, 2, 3]}} +{"id": 6, "value": {"nested-object": {"foo": "bar"}}}