Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for container types (array, object) when reading JSON files #75

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ celerybeat-schedule
.env

# virtualenv
.venv/
venv/
ENV/

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ meltano elt --catalog=my-catalog.json --job_id=my-job-state tap-spreadsheets-any

### JSON support

JSON files are expected to parse as a root-level array of objects where each object is a set of flat key-value pairs.
JSON files are expected to parse as a root-level array of objects where each object is a set of key-value pairs.
```json
[
{ "name": "row one", "key": 42},
Expand Down
35 changes: 33 additions & 2 deletions tap_spreadsheets_anywhere/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ def coerce(datum,declared_types):
if datum is None or datum == '':
return None

desired_type = declared_types.copy()
if isinstance(desired_type, list):
desired_type = declared_types
if isinstance(declared_types, list):
desired_type = declared_types.copy()
if "null" in desired_type:
desired_type.remove("null")
desired_type = desired_type[0]
Expand All @@ -43,6 +44,15 @@ def convert(datum, desired_type=None):
if datum is None or str(datum).strip() == '':
return None, None,

if isinstance(datum, bool) or desired_type == 'boolean':
return datum, 'boolean',

if isinstance(datum, list) or desired_type == 'array':
return datum, 'array',

if isinstance(datum, dict) or desired_type == 'object':
return datum, 'object',

if desired_type in (None, 'integer'):
try:
datum_int = int(datum) # Confirm it can be coerced to int
Expand Down Expand Up @@ -117,6 +127,12 @@ def pick_datatype(counts,prefer_number_vs_integer=False):
to_return = 'number'
elif counts.get('date-time', 0) > 0:
to_return = 'date-time'
elif counts.get('boolean', 0) > 0:
to_return = 'boolean'
elif counts.get('array', 0) > 0:
to_return = 'array'
elif counts.get('object', 0) > 0:
to_return = 'object'
elif counts.get('string', 0) <= 0:
LOGGER.warning(f"Unexpected data type encountered in histogram {counts}. Defaulting type to String.")

Expand Down Expand Up @@ -149,6 +165,21 @@ def generate_schema(samples,prefer_number_vs_integer=False, prefer_schema_as_str
'type': ['null', 'string'],
'format': 'date-time',
}
elif datatype == 'array':
# TODO: The item type is currently inferred only from the first element of
# the first sample's array. Extend that by sampling more records and elements.
if samples[0][key]:
_, inner_type = convert(samples[0][key][0])
else:
inner_type = None
to_return[key] = {
'type': ['null', 'array'],
'items': {'type': inner_type},
}
elif datatype == 'object':
to_return[key] = {
'type': ['null', 'object'],
}
else:
to_return[key] = {
'type': ['null', datatype],
Expand Down
34 changes: 33 additions & 1 deletion tap_spreadsheets_anywhere/test/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_pick_datatype(self):
'number': 1}), 'string')
self.assertEqual(pick_datatype({}), 'string')

def test_generate_schema(self):
def test_generate_schema_flat(self):
self.assertEqual(
generate_schema([{'id': '1', 'first_name': 'Connor'},
{'id': '2', 'first_name': '1'}]),
Expand All @@ -88,3 +88,35 @@ def test_generate_schema(self):
{'id': '2', 'date': '2017-01-02'}]),
{'id': {'type': ['null', 'integer'],},
'date': {'type': ['null', 'string'],}})

def test_generate_schema_array_valid(self):
    """Check the schema generated for array values of each scalar item type."""
    # Each case pairs a sampled array with the item type expected in the schema.
    # Numeric strings are expected to be reported as 'integer', like real ints.
    cases = [
        (['foo', 'bar', 'baz', None], 'string'),
        ([1, 2, 3, None], 'integer'),
        (['1', '2', '3', None], 'integer'),
        ([42.42, 84.84, None], 'number'),
        ([True, None], 'boolean'),
    ]
    for array_value, item_type in cases:
        actual = generate_schema([{'value': array_value}])
        expected = {
            'value': {
                'type': ['null', 'array'],
                'items': {'type': item_type},
            },
        }
        self.assertEqual(actual, expected)

def test_generate_schema_array_empty_na(self):
    """Check that arrays with no usable first element get an item type of None."""
    expected = {'value': {'type': ['null', 'array'], 'items': {'type': None}}}
    # An array holding only None and an empty array must both give the same schema.
    for array_value in ([None], []):
        actual = generate_schema([{'value': array_value}])
        self.assertEqual(actual, expected)

def test_generate_schema_object(self):
    """Check that dict values are described as a nullable 'object' type."""
    samples = [
        {'id': '1', 'value': {'foo': 'bar'}},
        {'id': '2', 'value': {'baz': 'qux'}},
    ]
    expected = {
        'id': {'type': ['null', 'integer']},
        'value': {'type': ['null', 'object']},
    }
    self.assertEqual(generate_schema(samples), expected)
47 changes: 47 additions & 0 deletions tap_spreadsheets_anywhere/test/test_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,22 @@
"start_date": "2017-05-01T00:00:00Z",
"key_properties": [],
"format": "detect"
},
{
"path": "file://./tap_spreadsheets_anywhere/test",
"name": "jsonl_sample_with_array",
"pattern": "sample-array\\.jsonl",
"start_date": "2017-05-01T00:00:00Z",
"key_properties": [],
"format": "detect"
},
{
"path": "file://./tap_spreadsheets_anywhere/test",
"name": "jsonl_sample_with_object",
"pattern": "sample-object\\.jsonl",
"start_date": "2017-05-01T00:00:00Z",
"key_properties": [],
"format": "detect"
}
]
}
Expand Down Expand Up @@ -121,3 +137,34 @@ def test_one_row_jsonl_file_detect(self):
row_count += 1
self.assertEqual(3884, row['id'], f"ID field is {row['id']} - expected it to be 3884.")
self.assertEqual(expected_row_count, row_count, f"Expected row_count to be {expected_row_count} but was {row_count}")

def test_jsonl_with_array(self):
    """
    Check that array values arrive as Python lists, not serialized strings.
    """
    table_spec = TEST_TABLE_SPEC['tables'][6]
    source_uri = './tap_spreadsheets_anywhere/test/type-array.jsonl'
    records = list(format_handler.get_row_iterator(table_spec, source_uri))

    expected_row_count = 3
    row_count = len(records)

    # The first record is compared in full; the rest are covered by the count.
    first_expected = {"id": 1, "value": [1.1, 2.1, 1.1, 1.3]}
    self.assertEqual(first_expected, records[0])
    self.assertEqual(
        expected_row_count,
        row_count,
        f"Expected row_count to be {expected_row_count} but was {row_count}",
    )

def test_jsonl_with_object(self):
    """
    Check that object values arrive as Python dicts, not serialized strings.
    """
    table_spec = TEST_TABLE_SPEC['tables'][7]
    source_uri = './tap_spreadsheets_anywhere/test/type-object.jsonl'
    records = list(format_handler.get_row_iterator(table_spec, source_uri))

    # One record per kind of value nested inside the object.
    nested_values = [
        {"string": "foo"},
        {"integer": 42},
        {"float": 42.42},
        {"boolean": True},
        {"nested-array": [1, 2, 3]},
        {"nested-object": {"foo": "bar"}},
    ]
    expected_records = [
        {"id": position, "value": nested}
        for position, nested in enumerate(nested_values, start=1)
    ]

    expected_row_count = 6
    row_count = len(records)
    self.assertEqual(expected_records, records)
    self.assertEqual(
        expected_row_count,
        row_count,
        f"Expected row_count to be {expected_row_count} but was {row_count}",
    )
3 changes: 3 additions & 0 deletions tap_spreadsheets_anywhere/test/type-array.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}
{"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}
{"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}
6 changes: 6 additions & 0 deletions tap_spreadsheets_anywhere/test/type-object.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"id": 1, "value": {"string": "foo"}}
{"id": 2, "value": {"integer": 42}}
{"id": 3, "value": {"float": 42.42}}
{"id": 4, "value": {"boolean": true}}
{"id": 5, "value": {"nested-array": [1, 2, 3]}}
{"id": 6, "value": {"nested-object": {"foo": "bar"}}}