diff --git a/.gitignore b/.gitignore index 95a4edc..fe009cc 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,4 @@ rsa-key tags singer-check-tap-data state.json +.venv/ diff --git a/tap_spreadsheets_anywhere/excel_handler.py b/tap_spreadsheets_anywhere/excel_handler.py index bef07b3..4aad321 100644 --- a/tap_spreadsheets_anywhere/excel_handler.py +++ b/tap_spreadsheets_anywhere/excel_handler.py @@ -6,9 +6,16 @@ LOGGER = logging.getLogger(__name__) -def generator_wrapper(reader): +def generator_wrapper(reader, table_spec: dict={}) -> dict: + skip_initial = table_spec.get("skip_initial", 0) + _skip_count = 0 header_row = None for row in reader: + if _skip_count < skip_initial: + LOGGER.debug("Skipped (%d/%d) row: %r", _skip_count, skip_initial, row) + _skip_count += 1 + continue + to_return = {} if header_row is None: header_row = row @@ -58,11 +65,11 @@ def get_legacy_row_iterator(table_spec, file_handle): except Exception as e: LOGGER.info(e) sheet = workbook.sheet_by_name(sheet_name_list[0]) - return generator_wrapper(sheet.get_rows()) + return generator_wrapper(sheet.get_rows(), table_spec) def get_row_iterator(table_spec, file_handle): - workbook = openpyxl.load_workbook(file_handle, read_only=True) + workbook = openpyxl.load_workbook(file_handle.name, read_only=True) if "worksheet_name" in table_spec: try: @@ -88,4 +95,4 @@ def get_row_iterator(table_spec, file_handle): except Exception as e: LOGGER.info(e) active_sheet = worksheets[0] - return generator_wrapper(active_sheet) + return generator_wrapper(active_sheet, table_spec) diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py index 3bf911f..bb40321 100644 --- a/tap_spreadsheets_anywhere/format_handler.py +++ b/tap_spreadsheets_anywhere/format_handler.py @@ -171,7 +171,9 @@ def get_row_iterator(table_spec, uri): except (ValueError,TypeError) as err: raise InvalidFormatError(uri,message=err) - for _ in range(skip_initial): - next(iterator) + if format != 'excel': + # Reduce the scope of changes to fix Issue #52. + for _ in range(skip_initial): + next(iterator) return iterator diff --git a/tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx b/tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx new file mode 100644 index 0000000..437351f Binary files /dev/null and b/tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx differ diff --git a/tap_spreadsheets_anywhere/test/test_excel_handler.py b/tap_spreadsheets_anywhere/test/test_excel_handler.py new file mode 100644 index 0000000..3e1dfc3 --- /dev/null +++ b/tap_spreadsheets_anywhere/test/test_excel_handler.py @@ -0,0 +1,39 @@ +import logging +import pytest +from openpyxl import Workbook +from tap_spreadsheets_anywhere.excel_handler import generator_wrapper + +LOGGER = logging.getLogger(__name__) + + +def get_worksheet(): + """Create a basic workbook that can be manipulated for tests. + See: https://openpyxl.readthedocs.io/en/stable/usage.html. + """ + wb = Workbook() + ws = wb.active + tree_data = [ + ["Type", "Leaf Color", "Height"], + ["Maple", "Red", 549], + ["Oak", "Green", 783], + ["Pine", "Green", 1204] + ] + exp_tree_data = [ + {'type': 'Maple', 'leaf_color': 'Red', 'height': 549}, + {'type': 'Oak', 'leaf_color': 'Green', 'height': 783}, + {'type': 'Pine', 'leaf_color': 'Green', 'height': 1204}, + ] + [ws.append(row) for row in tree_data] + return ws, wb, tree_data, exp_tree_data + + +class TestExcelHandlerGeneratorWrapper: + """Validate the expected state of the `excel_handler.generator_wrapper`.""" + def test_parse_data(self): + worksheet, _, _, exp = get_worksheet() + _generator = generator_wrapper(worksheet) + assert next(_generator) == exp[0] + assert next(_generator) == exp[1] + assert next(_generator) == exp[2] + with pytest.raises(StopIteration): + next(_generator) diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 40a3331..6bc3311 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -2,14 +2,18 @@ import json import logging import unittest +from datetime import datetime +from pathlib import Path from unittest.mock import patch import dateutil +import pytest import smart_open from six import StringIO from tap_spreadsheets_anywhere import configuration, file_utils, csv_handler, json_handler, generate_schema from tap_spreadsheets_anywhere.format_handler import monkey_patch_streamreader, get_row_iterator +from tap_spreadsheets_anywhere.test.test_excel_handler import get_worksheet LOGGER = logging.getLogger(__name__) @@ -204,3 +208,63 @@ def test_renamed_https_object(self): row = next(iterator) self.assertTrue(len(row)>1,"Not able to read a row.") + + +class TestFormatHandlerExcelXlsxSkipInitial: + """pytests to validate Skip Initial for Excel `.xlsx` files works as expected.""" + bad_file = Path(__file__).cwd() / "sample_with_bad_blank_line_above_headings.xlsx" + uri = f"file://{bad_file.absolute()}" + + def test_validate_iterator(self, tmpdir): + xlsx = tmpdir / "fake_test.xlsx" + uri = f"file://{xlsx}" + _, workbook, _, exp = get_worksheet() + workbook.save(xlsx) + + iterator = get_row_iterator({"format": "excel"}, uri) + assert next(iterator) == exp[0] + assert next(iterator) == exp[1] + assert next(iterator) == exp[2] + with pytest.raises(StopIteration): + next(iterator) + + def test_bad_blank_line_above_headings_raises(self): + """Test to verify a sample file that raises #52. + Iteratting through this bad sample file will currently fail + when parsing the blank line. + """ + table_spec = {"format": "excel"} + iterator = get_row_iterator(table_spec, self.uri) + with pytest.raises(IndexError): + for _ in iterator: + continue + + def test_bad_blank_line_above_headings_skip_initial_over_bad_row(self): + """Test to verify a sample file that raises #52, does not fail when + using: `skip_interval`, to avoid the bad row. + """ + # NOTE: that `get_row_iterator` will compress the header row and each + # subsequent data row together, so count one less row than in the file + # + expect a dict. + exp = { + 'account': 'Sales - Commission Fees', + 'account_code': 9999.0, + 'account_type': 'Revenue', + 'contact': 'Company A Limited', + 'credit_gbp': 123.45, + 'date': datetime(2023, 1, 31, 0, 0), + 'debit_gbp': 0.0, + 'description': 'Description for Company A', + 'gross_gbp': 123.45, + 'invoice_number': 'INV-1234', + 'net_gbp': 1234.45, + 'reference': 'REF-1234', + 'revenue_type': 'Commission Fees', + 'vat_gbp': 0.0, + } + table_spec = {"format": "excel", "skip_initial": 4} + # NOTE: `get_row_iterator` should no longer fail with Issue #52, now + # that: `excel_handler.generator_wrapper` is not parsing skipped rows. + iterator = get_row_iterator(table_spec, self.uri) + # Assert that the expected row, after skipping, is next. + assert next(iterator) == exp