Skip to content

Commit

Permalink
Merge pull request #55 from craigastill/fix_skip_interval_for_spreads…
Browse files Browse the repository at this point in the history
…heets_with_empty_rows_before_data

Fix skip interval for spreadsheets with empty rows before data
  • Loading branch information
ets authored Jan 6, 2024
2 parents 330cfbb + dd6a696 commit f811e40
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,4 @@ rsa-key
tags
singer-check-tap-data
state.json
.venv/
15 changes: 11 additions & 4 deletions tap_spreadsheets_anywhere/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@

LOGGER = logging.getLogger(__name__)

def generator_wrapper(reader):
def generator_wrapper(reader, table_spec: dict={}) -> dict:
skip_initial = table_spec.get("skip_initial", 0)
_skip_count = 0
header_row = None
for row in reader:
if _skip_count < skip_initial:
LOGGER.debug("Skipped (%d/%d) row: %r", _skip_count, skip_initial, row)
_skip_count += 1
continue

to_return = {}
if header_row is None:
header_row = row
Expand Down Expand Up @@ -58,11 +65,11 @@ def get_legacy_row_iterator(table_spec, file_handle):
except Exception as e:
LOGGER.info(e)
sheet = workbook.sheet_by_name(sheet_name_list[0])
return generator_wrapper(sheet.get_rows())
return generator_wrapper(sheet.get_rows(), table_spec)


def get_row_iterator(table_spec, file_handle):
workbook = openpyxl.load_workbook(file_handle, read_only=True)
workbook = openpyxl.load_workbook(file_handle.name, read_only=True)

if "worksheet_name" in table_spec:
try:
Expand All @@ -88,4 +95,4 @@ def get_row_iterator(table_spec, file_handle):
except Exception as e:
LOGGER.info(e)
active_sheet = worksheets[0]
return generator_wrapper(active_sheet)
return generator_wrapper(active_sheet, table_spec)
6 changes: 4 additions & 2 deletions tap_spreadsheets_anywhere/format_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def get_row_iterator(table_spec, uri):
except (ValueError,TypeError) as err:
raise InvalidFormatError(uri,message=err)

for _ in range(skip_initial):
next(iterator)
if format != 'excel':
# Reduce the scope of changes to fix Issue #52.
for _ in range(skip_initial):
next(iterator)

return iterator
Binary file not shown.
39 changes: 39 additions & 0 deletions tap_spreadsheets_anywhere/test/test_excel_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging
import pytest
from openpyxl import Workbook
from tap_spreadsheets_anywhere.excel_handler import generator_wrapper

LOGGER = logging.getLogger(__name__)


def get_worksheet():
"""Create a basic workbook that can be manipulated for tests.
See: https://openpyxl.readthedocs.io/en/stable/usage.html.
"""
wb = Workbook()
ws = wb.active
tree_data = [
["Type", "Leaf Color", "Height"],
["Maple", "Red", 549],
["Oak", "Green", 783],
["Pine", "Green", 1204]
]
exp_tree_data = [
{'type': 'Maple', 'leaf_color': 'Red', 'height': 549},
{'type': 'Oak', 'leaf_color': 'Green', 'height': 783},
{'type': 'Pine', 'leaf_color': 'Green', 'height': 1204},
]
[ws.append(row) for row in tree_data]
return ws, wb, tree_data, exp_tree_data


class TestExcelHandlerGeneratorWrapper:
"""Validate the expected state of the `excel_handler.generator_wrapper`."""
def test_parse_data(self):
worksheet, _, _, exp = get_worksheet()
_generator = generator_wrapper(worksheet)
assert next(_generator) == exp[0]
assert next(_generator) == exp[1]
assert next(_generator) == exp[2]
with pytest.raises(StopIteration):
next(_generator)
64 changes: 64 additions & 0 deletions tap_spreadsheets_anywhere/test/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
import json
import logging
import unittest
from datetime import datetime
from pathlib import Path
from unittest.mock import patch

import dateutil
import pytest
import smart_open
from six import StringIO

from tap_spreadsheets_anywhere import configuration, file_utils, csv_handler, json_handler, generate_schema
from tap_spreadsheets_anywhere.format_handler import monkey_patch_streamreader, get_row_iterator
from tap_spreadsheets_anywhere.test.test_excel_handler import get_worksheet


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -204,3 +208,63 @@ def test_renamed_https_object(self):

row = next(iterator)
self.assertTrue(len(row)>1,"Not able to read a row.")


class TestFormatHandlerExcelXlsxSkipInitial:
"""pytests to validate Skip Initial for Excel `.xlsx` files works as expected."""
bad_file = Path(__file__).cwd() / "sample_with_bad_blank_line_above_headings.xlsx"
uri = f"file://{bad_file.absolute()}"

def test_validate_iterator(self, tmpdir):
xlsx = tmpdir / "fake_test.xlsx"
uri = f"file://{xlsx}"
_, workbook, _, exp = get_worksheet()
workbook.save(xlsx)

iterator = get_row_iterator({"format": "excel"}, uri)
assert next(iterator) == exp[0]
assert next(iterator) == exp[1]
assert next(iterator) == exp[2]
with pytest.raises(StopIteration):
next(iterator)

def test_bad_blank_line_above_headings_raises(self):
"""Test to verify a sample file that raises #52.
Iteratting through this bad sample file will currently fail
when parsing the blank line.
"""
table_spec = {"format": "excel"}
iterator = get_row_iterator(table_spec, self.uri)
with pytest.raises(IndexError):
for _ in iterator:
continue

def test_bad_blank_line_above_headings_skip_initial_over_bad_row(self):
"""Test to verify a sample file that raises #52, does not fail when
using: `skip_interval`, to avoid the bad row.
"""
# NOTE: that `get_row_iterator` will compress the header row and each
# subsequent data row together, so count one less row than in the file
# + expect a dict.
exp = {
'account': 'Sales - Commission Fees',
'account_code': 9999.0,
'account_type': 'Revenue',
'contact': 'Company A Limited',
'credit_gbp': 123.45,
'date': datetime(2023, 1, 31, 0, 0),
'debit_gbp': 0.0,
'description': 'Description for Company A',
'gross_gbp': 123.45,
'invoice_number': 'INV-1234',
'net_gbp': 1234.45,
'reference': 'REF-1234',
'revenue_type': 'Commission Fees',
'vat_gbp': 0.0,
}
table_spec = {"format": "excel", "skip_initial": 4}
# NOTE: `get_row_iterator` should no longer fail with Issue #52, now
# that: `excel_handler.generator_wrapper` is not parsing skipped rows.
iterator = get_row_iterator(table_spec, self.uri)
# Assert that the expected row, after skipping, is next.
assert next(iterator) == exp

0 comments on commit f811e40

Please sign in to comment.