From 5677de02b18248ccefbb94a8b4b4edaaefdf78a4 Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:00:37 +0100 Subject: [PATCH 1/9] Ignore `venv` virtualenv folder. Issue: #52. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 95a4edc..fe009cc 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,4 @@ rsa-key tags singer-check-tap-data state.json +.venv/ From d59633f66532aa1fbd6c7492cf4daeb6f42ffc7b Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:05:34 +0100 Subject: [PATCH 2/9] Smoke test `excel_handler.generator_wrapper` with openpyxl. Issue: #52. --- .../test/test_excel_handler.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tap_spreadsheets_anywhere/test/test_excel_handler.py diff --git a/tap_spreadsheets_anywhere/test/test_excel_handler.py b/tap_spreadsheets_anywhere/test/test_excel_handler.py new file mode 100644 index 0000000..3e1dfc3 --- /dev/null +++ b/tap_spreadsheets_anywhere/test/test_excel_handler.py @@ -0,0 +1,39 @@ +import logging +import pytest +from openpyxl import Workbook +from tap_spreadsheets_anywhere.excel_handler import generator_wrapper + +LOGGER = logging.getLogger(__name__) + + +def get_worksheet(): + """Create a basic workbook that can be manipulated for tests. + See: https://openpyxl.readthedocs.io/en/stable/usage.html. + """ + wb = Workbook() + ws = wb.active + tree_data = [ + ["Type", "Leaf Color", "Height"], + ["Maple", "Red", 549], + ["Oak", "Green", 783], + ["Pine", "Green", 1204] + ] + exp_tree_data = [ + {'type': 'Maple', 'leaf_color': 'Red', 'height': 549}, + {'type': 'Oak', 'leaf_color': 'Green', 'height': 783}, + {'type': 'Pine', 'leaf_color': 'Green', 'height': 1204}, + ] + [ws.append(row) for row in tree_data] + return ws, wb, tree_data, exp_tree_data + + +class TestExcelHandlerGeneratorWrapper: + """Validate the expected state of the `excel_handler.generator_wrapper`.""" + def test_parse_data(self): + worksheet, _, _, exp = get_worksheet() + _generator = generator_wrapper(worksheet) + assert next(_generator) == exp[0] + assert next(_generator) == exp[1] + assert next(_generator) == exp[2] + with pytest.raises(StopIteration): + next(_generator) From dbb89c910fc7770af2e065f86ef79c4e4be19f96 Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:07:05 +0100 Subject: [PATCH 3/9] Smoke test `format_handler.get_row_iterator` with openpyxl. Issue: #52. --- tap_spreadsheets_anywhere/test/test_format.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 40a3331..92da224 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -5,11 +5,13 @@ from unittest.mock import patch import dateutil +import pytest import smart_open from six import StringIO from tap_spreadsheets_anywhere import configuration, file_utils, csv_handler, json_handler, generate_schema from tap_spreadsheets_anywhere.format_handler import monkey_patch_streamreader, get_row_iterator +from tap_spreadsheets_anywhere.test.test_excel_handler import get_worksheet LOGGER = logging.getLogger(__name__) @@ -204,3 +206,19 @@ def test_renamed_https_object(self): row = next(iterator) self.assertTrue(len(row)>1,"Not able to read a row.") + + +class TestFormatHandlerExcelXlsxSkipInitial: + """pytests to validate Skip Initial for Excel `.xlsx` files works as expected.""" + def test_validate_iterator(self, tmpdir): + xlsx = tmpdir / "fake_test.xlsx" + uri = f"file://{xlsx}" + _, workbook, _, exp = get_worksheet() + workbook.save(xlsx) + + iterator = get_row_iterator({"format": "excel"}, uri) + assert next(iterator) == exp[0] + assert next(iterator) == exp[1] + assert next(iterator) == exp[2] + with pytest.raises(StopIteration): + next(iterator) From 7b2c76b33f7103bbf95fa51f97bda5bc2d95b7d8 Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:07:53 +0100 Subject: [PATCH 4/9] Dump a sample `.xlsx` that exhibits the #52 failure during parsing. Issue: #52. --- ...ample_with_bad_blank_line_above_headings.xlsx | Bin 0 -> 8760 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx diff --git a/tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx b/tap_spreadsheets_anywhere/test/sample_with_bad_blank_line_above_headings.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..437351f574bd9a66c0efccbc5420f3fd2e48f2be GIT binary patch literal 8760 zcmd^k1z40_x9A|L2t!JU#0SzabV-RwN=Qo$&Cn@GBS;F25>k>PA>BxabW6$L5YnA9 z z@n^OmGiPIWD|<_qe?z!ApV`@_3>nzBa^1wP2(oVL;>Cw@R;IHb&lKYQ2y_G@Sl z{=2Wf!wLdcUTD;Iy%NXU{r0X^N75$5-t=o_M3|nW6}txX_}dQUam+#i6GnlZGWxID z?N2Kx_zMT?bN<-$GR!F>^oeNGRnQU>ndP~;GE#xJK${&cl6QST-c4&Df9>I(Bu%j5*}&|@@5ER%9lDADUe=adL|ms(@b7`O{g(usCQrH%dL1JACIQbxnZ7;_o-t>@Jc^gGXc zyK01v=HaER-2#=&Y__F_(}jKIe^i526|r%!O{@Yj(EtE!bO1p0Uy0J)!P&+IVs7s0 z0{Z^wUIf|2Mwo0);-kr;b-HxIo?tnvXf z-AJ-(eU-IgqsAi}pOF-$b!*2;O1|24>*DyjcP)-z4F?_7VuTqdCfa(}`WNB?->8Lg5m@?rq0u`SUR4eMEr|7^))Ra#kW1v6v9key-9OwMG51N#3N>$C2k0 zem*_yM|^Q%5pAfQ#jQz6m5S@D6WF`?h+d#y;B`Ndu!Moqv7V2w$ephmY5B+7BQtAT z$5!rjK8O%MV1CHtK$>3pRCgN)v9s3ZsLye0)#NFNY>OtQJASc$yXZ}F)yPIvqt3%K zJ$P+>T0hgf>am?Ax5k>l3O(&H#O%5Kxu^4C+T7TgTdC5$$*QRfVUI7jye#=o7Z>cS zPPQ$|I-NGhAwq^1emi!Eix30{O8Jw;>@Hm`MdlRUTFSSscjbNDoJXF};e%A<$a<)l zg!Sze^~{(2d#y#(^l>?l-e0JR?~#A_vQ=bAi6GC5Iojm<8K8JCKXy-Rj~-#+k$UxFjtBL8#y_@v4D#QwC^;kxs9d(+FX zs`6D&fw0BmyzTz%{7F|!0b%jv*ORvV$(w!IotBEpldd0g@w3u)4AHHpekXki;Ll_5bYu5`Gbe)n)&}hY?!e2fO+Sd5w+YSmLMWMp0Lr6bQfjr!^ zq>+l#G%UlYXfV)^%IfU|P*#DG2|52dx+4+gn#Di^7D!|dsZZi8HdXckrF0x0pVnq> zY?ZtoU8lbrnLACzVr7QdHx3hsS~M{rTEFMKFX#7U1~!9XmNF{j?f_X z767L4doiY_e>TE?RKlqvz9fNq6*-37)loA2=p&+K+8KYJt~b44IR+uJGrdPVwOrT%J(nuhM{T-GB42V7a%hi6d#PNqIYk=r2z)tOfM>e71q`oPJ8b8KH<{817DtzI7j@ErXSms;#PdtKqQ`HRjS^4TM}!e-Dg4dLwGIeVhc@X3=X) z-$@6?)IJHaG)u!F9>1d@2S&7?1HfHw>w4IzG@iWV~19IoAN5b{VW z**n9aaYz6RNdr|3lfhPT?GQo86RBjM40z*^8yM0As+c2#?c&-U2O-a-l6^A_nue;t zkQq?LE*b0?*A5kgJeNxL%YZixA;FLf5DDPL4S&^Z1l9lrJjpyXW`4kXtn^VpETz91 zy6V3mwii#$lWiTd9r96}!Nh!bshd2Z;Kt74nR&9E|3y~+^NaqczbS*)oQp=z4|s=V zGzy5I^nZ-bWexau5^pxWq-lzzX@aITy-s5coauqi^g?I)pfml@nE{pmz=i%d_xYF8 ziTM4c@n|2T%Z&p5)6pYd)X@Xfkvi(g6m{l;x>Trc-?L%gvq|5xIbUl)7^P$;)~j4J zhX2#(>?q(ZrN275qcuPouQ?A*h#!!H^gZjg03Lq-M|l6Yq7lW1P|y*y_y`I*h87<~K_`R7Cn_;56R1wI^Bk;K zd1z+VfJ92k?Ef${h*B~GOEnjb#2WDbAlfo>9@2CU^t+%rzb1Kx=YPZo*mVc&fzYzL zXN%FY{=Eky7zG4SN~)scSOf0k(dMFI@cXA@{r^M!_7Ht#z&_vl+=7edZ5sa8a?Hy) zc>M>xr%!#xeGwyU!XAj#I1XG{O&|4!qOC29!rpGwGj8!p_o^b$9de_cUFMwBhaPEQF zK`TB0Ad&XZ@?iQOC52z|V6d~f?XTG|=hbuMpyG{IE&{As;%6aTUNPfAp?sS%FYMn_ z8_8t@ZK)Y{ho_PwGiO^D`v$$+!2Ceo^4mc|!fY7}m|SH&tKIlS18}DX^jpNbk>%wK z#u+$D4Ih;GOnU^K-g=u+ls;}vh@QbE9FxPMcTcrd_9VNREgF3PnFbk^oNc%XTWby) zBEDOfgRY_$W1J)PywqSuB^+Pb0B`bOK4=JS`=bsc_sv)a$8%UiTeRgZX@juha0Ky{ z3<`7CmC7$q$a+~1`1@J_@edieLd@;VLEp#kWq^KdJ%=e;l5@;cVAnQV<0mI7{ybRL zFtvb^uE1s@?5NDd7R_qHPa-0Pe5$5!T*G>gK*)P}Ck2(|r@gKc4`JcaH=-S50cXa=U4P6$xA2;T-uJOq zLQiFu877f)j~Y4e0dxN?jM^%(5X_#uv9GJm3i)%2M zSs6Ngf=g$|PvrLA%<2YC1ca9OWFf?rP!{awTMmBKU8>yhFlE{NPyap|v*Ckf!;Emqwta7Ms8F%q~*54@D3O+?tha8HW z^)>s99Ophy%8+P16`bp@a+ulP*(Ve)DXuP057ADKSUB*da`M}|2ryanKmKis$EP$?r|{Z3zb{6LhW5K+ex8Xg%m`3{=oF?g)0` zZ9o+{Da4{4x>w4j2b#?Cl3?Bv2!e%xWtRiK;O zz&2*Ple5YEYNH(Yc+;c_O$tV;rZQ1 z-sKaRqh={QqQvuaa)464#BcCN-g-HKHfRI%mOA9wiIA?a+eVJ9w2g25p{?86wMKI{*+(=7#`u=o&PRlbMpQ$k4c;q*z&I=$P1T@-HiFb|V@$&wi z{==j5__YP+-6kuJlh2;+i1S)dq4-&>)(2!P!n3ZgAhI(PeW~a9`_DSSI9&JY$oER* z1e(G-7O#Tr(O|t)9!^Y++LiK{-l~r}z4Xxg82?Op z@cyAZt{%4L-?gS+2kZdjBDlaf#eZ{JCeL=Aq_#>i7iN`+{45qL?={=bfUko{>*Bt1 zFtTk+d|pt2%l8Q5`jq9&wr}IU!GiPzZ_6BPay&uJf-KppvQ+~rSh%9d#4t1V{7u@dHrTC(^=xl-t3F%-V~Y_ZIuZF2NaI$8irbdf?GR^Vwwf+ken~BIuuM zY0-+C&hF{%jEN~B4NeQ$&fnSz)Wo}W^W<^SLU`z+1tu!jTMGIDJu8xeIV+qk^5(`Z zqp1NA!qMW7!x4^b&eJ+Lm@x|X)TxIqpaG<=OkdGF(aq(H>Oid`(3jPppYb;5z|r(g z+d742gk-gbptD^RAIz_zSF8Y+Fq;+Rc!BfdC{upR`LN^cZf~V_ej|@8SFA9DN%6Kg z=3J2~ewphFWx^d6*CuuB)S9kdFpE9obRvtdU|;LOIhbJ>R!W7PvE}nYni6 z1-(vQuj-Drt9>@PPKV(d5xL4=Pa9d>$J1dtCwwdQ(=Bf&ywiaL2*;>t!2<-2Ym!1t zeDtxSm3pA3x+Y|BW!%|5<=te2i@QSp{DN$TyKZnwVV$xk1dLT>6u4tZ*&ODyknUUmY1k@@iW--7Y7IPKbnK(GueD4xP4=S`t5`?5RyN4>asMm#Q4G&3U5Y~sLr}nzK zo4nadE2%5o-Aw6V)LEx{)`LHJywtl;U1JxiT(uJDTxB%LZhl>NE~{)C)62^z!LXb` zx`e2rT&G0k=0=t16K|m=L*Z)W+?n8jF>V3j<@=)$+CKfjH`5vew?j2f+^H#{v=1W5 zBbSz~JlpY@dmFERq-k^1e2+1Guap}yFZk(QupBZ&OKHU_TdbAhZar>MIPY9vsx(%B zDO(9~tvCUXNyKZkrfEKPDFL%mHQn~X0h?x9+^SuH8Q$!`jv}U<}QX047>DuRsXb8Mfu|^{s_x|{g&SY^TO!+)@@t_4^@Wlg-%YC#i%KSy;Wj1 zo65!}`22;Z^e&we>;#!2Fe^eJMP?Xw(SCL@1Sv;RdhR}Fac^PE4JX3zB(`1F!`u#u zKV>3Ftf;O9QjI=m#EY%LkP zsM?1gS$ky@mnc-hX5kH*uaH&OoXF|VneOo^Svr#SSxKsBvoh?mJEtpiLTfIJ#e1Z5-ni*w{8Mv2k{1Deaa+mw~0bB%_LzlUW1yaq?I3lBjo6#1J zI=$)von($xe(UC@S|o0K%<;Z&c0s&{3Gl#}mMy)@*$6_q+Cqsofvgt;;6B#nU@Bua zY6m>6WC00vC)i6#)HD;B1p151xG#w2Qd?5+I73FV=_Ps-WE^T@tYfi`IR_8FHr-bB zNhlb;=KT)WXmf3cCb`yL_1Pa!(}p`}NWozYiVj>ibQsSWB&I_<*fW~*qy0p68ZM89 zjg#e&L0fit5axfGy68P{uy-}LcLi&BIGVc{T$y>Ldav@OR-9csr`sdtw-=^;k%VlX z$B~N#>TD>Pj)6Sj6Eo%Vx(pv^pN|^WS9RXwU9Zg9PbqK5B2JuT<7Vm1lq?Pq@O=^n zZQox}CW&paDQPjii?N;MpX<4DoGxUxe`q5*g6f$Pg3u&8pMLq1>QXV!^InoSRpK zoP`*+b(5d*Ga?W0Zj`ltub<)FE|@5zC{pUqD9lMgo|_J0O;BFGD3GNoxXnL9yGZ8v zh}B)oH;|!@iv<{$I)lysO0{YEQ)D(BNCN-uNQiQY`2bx%x^!pv-St7Qmu*4rUBYp^ zez;W8YyQ6VkbL1c_z@SI{XW4ZB5S={mV=GorEV8mp~`+e&_O+3H|R3e+}TT z^5LJLeR;=RZtM5#_;&`s260z${!eJWe7t1vKSce%bNKaAy9)Mx0_xTe4*xy$`+e%K zp7iS0`w5^+_xTT@{@We+`{ZAp%9XSI35eU@C;yG>{hiOR7JKD_ege~-KRKk|C;qB* ff0jKFH30Az=B=uTiFL*O#^o>R(iMi%Ts{3at%03U literal 0 HcmV?d00001 From be66756c0c49693c5d7b2418413367433efe7afb Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:09:17 +0100 Subject: [PATCH 5/9] Passing test to catch the thrown excpetion seen in #52. Issue: #52. --- tap_spreadsheets_anywhere/test/test_format.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 92da224..056e8bf 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -2,6 +2,7 @@ import json import logging import unittest +from pathlib import Path from unittest.mock import patch import dateutil @@ -210,6 +211,9 @@ def test_renamed_https_object(self): class TestFormatHandlerExcelXlsxSkipInitial: """pytests to validate Skip Initial for Excel `.xlsx` files works as expected.""" + bad_file = Path(__file__).cwd() / "sample_with_bad_blank_line_above_headings.xlsx" + uri = f"file://{bad_file.absolute()}" + def test_validate_iterator(self, tmpdir): xlsx = tmpdir / "fake_test.xlsx" uri = f"file://{xlsx}" @@ -222,3 +226,14 @@ def test_validate_iterator(self, tmpdir): assert next(iterator) == exp[2] with pytest.raises(StopIteration): next(iterator) + + def test_bad_blank_line_above_headings_raises(self): + """Test to verify a sample file that raises #52. + Iteratting through this bad sample file will currently fail + when parsing the blank line. + """ + table_spec = {"format": "excel"} + iterator = get_row_iterator(table_spec, self.uri) + with pytest.raises(IndexError): + for _ in iterator: + continue From 14294b20c1ce8e5231954b63b9924b9754fab08f Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:10:12 +0100 Subject: [PATCH 6/9] Failing test to highlight `skip_initial` not fixing #52 currently. Issue: #52. --- tap_spreadsheets_anywhere/test/test_format.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 056e8bf..ed16241 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -237,3 +237,37 @@ def test_bad_blank_line_above_headings_raises(self): with pytest.raises(IndexError): for _ in iterator: continue + + def test_bad_blank_line_above_headings_skip_initial_over_bad_row(self): + """Test to verify a sample file that raises #52. + Iteratting through this bad sample file will currently fail + when parsing the blank line during sampling time, even when + supplied with the: `skip_interval` argument with a row + beyond the bad row. + """ + exp = [ + "Date", + "Contact", + "Description", + "Invoice Number", + "Reference", + "Debit (GBP)", + "Credit (GBP)", + "Gross (GBP)", + "Net (GBP)", + "VAT (GBP)", + "Account Code", + "Account", + "Account Type", + "Revenue Type", + ] + table_spec = {"format": "excel", "skip_initial": 5} + # NOTE: `get_row_iterator` fails with Issue #52. This is because the + # current code parses each line against the header generated from the + # first row of the file. With the above bad file, this throws an + # `IndexError` during the sampling discovery phase. Skipping rows + # should be done in: `excel_handler.generator_wrapper`, before the rows + # are parsed. + iterator = get_row_iterator(table_spec, self.uri) + # Assert that the expected row, after skipping, is next. + assert next(iterator) == exp From a4b96cb4e02fcdd88f39a4ad2d2e4973d75c051c Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 15:30:33 +0100 Subject: [PATCH 7/9] Move `skip_initial` into `generator_wrapper` for Excel files only. Aim is to constrain this change to Excel files, but keep current `skip_initial` behaviour for the other supported file types. Moving into `excel_handler.generator_wrapper` before the `header_row` so that skipped lines are not parsed against the headings row. This will avoid failures previously exposed by the bad sample file. Issue: #52. --- tap_spreadsheets_anywhere/excel_handler.py | 13 ++++++++++--- tap_spreadsheets_anywhere/format_handler.py | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tap_spreadsheets_anywhere/excel_handler.py b/tap_spreadsheets_anywhere/excel_handler.py index bef07b3..ffd2b84 100644 --- a/tap_spreadsheets_anywhere/excel_handler.py +++ b/tap_spreadsheets_anywhere/excel_handler.py @@ -6,9 +6,16 @@ LOGGER = logging.getLogger(__name__) -def generator_wrapper(reader): +def generator_wrapper(reader, table_spec: dict={}) -> dict: + skip_initial = table_spec.get("skip_initial", 0) + _skip_count = 0 header_row = None for row in reader: + if _skip_count < skip_initial: + LOGGER.debug("Skipped (%d/%d) row: %r", _skip_count, skip_initial, row) + _skip_count += 1 + continue + to_return = {} if header_row is None: header_row = row @@ -58,7 +65,7 @@ def get_legacy_row_iterator(table_spec, file_handle): except Exception as e: LOGGER.info(e) sheet = workbook.sheet_by_name(sheet_name_list[0]) - return generator_wrapper(sheet.get_rows()) + return generator_wrapper(sheet.get_rows(), table_spec) def get_row_iterator(table_spec, file_handle): @@ -88,4 +95,4 @@ def get_row_iterator(table_spec, file_handle): except Exception as e: LOGGER.info(e) active_sheet = worksheets[0] - return generator_wrapper(active_sheet) + return generator_wrapper(active_sheet, table_spec) diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py index 3bf911f..bb40321 100644 --- a/tap_spreadsheets_anywhere/format_handler.py +++ b/tap_spreadsheets_anywhere/format_handler.py @@ -171,7 +171,9 @@ def get_row_iterator(table_spec, uri): except (ValueError,TypeError) as err: raise InvalidFormatError(uri,message=err) - for _ in range(skip_initial): - next(iterator) + if format != 'excel': + # Reduce the scope of changes to fix Issue #52. + for _ in range(skip_initial): + next(iterator) return iterator From 2700771d13385dbd27807c691c318c9b25f03410 Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 16:01:12 +0100 Subject: [PATCH 8/9] Fixed up the `skip_initial` bad `.xlsx` failing test after changes. Fixed up the expectation in the previously failing `skip_intiial` testcase, now that Production code has been changed to skip over the rows without parsing. Issue: #52. --- tap_spreadsheets_anywhere/test/test_format.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index ed16241..6bc3311 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -2,6 +2,7 @@ import json import logging import unittest +from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -239,35 +240,31 @@ def test_bad_blank_line_above_headings_raises(self): continue def test_bad_blank_line_above_headings_skip_initial_over_bad_row(self): - """Test to verify a sample file that raises #52. - Iteratting through this bad sample file will currently fail - when parsing the blank line during sampling time, even when - supplied with the: `skip_interval` argument with a row - beyond the bad row. + """Test to verify a sample file that raises #52, does not fail when + using: `skip_interval`, to avoid the bad row. """ - exp = [ - "Date", - "Contact", - "Description", - "Invoice Number", - "Reference", - "Debit (GBP)", - "Credit (GBP)", - "Gross (GBP)", - "Net (GBP)", - "VAT (GBP)", - "Account Code", - "Account", - "Account Type", - "Revenue Type", - ] - table_spec = {"format": "excel", "skip_initial": 5} - # NOTE: `get_row_iterator` fails with Issue #52. This is because the - # current code parses each line against the header generated from the - # first row of the file. With the above bad file, this throws an - # `IndexError` during the sampling discovery phase. Skipping rows - # should be done in: `excel_handler.generator_wrapper`, before the rows - # are parsed. + # NOTE: that `get_row_iterator` will compress the header row and each + # subsequent data row together, so count one less row than in the file + # + expect a dict. + exp = { + 'account': 'Sales - Commission Fees', + 'account_code': 9999.0, + 'account_type': 'Revenue', + 'contact': 'Company A Limited', + 'credit_gbp': 123.45, + 'date': datetime(2023, 1, 31, 0, 0), + 'debit_gbp': 0.0, + 'description': 'Description for Company A', + 'gross_gbp': 123.45, + 'invoice_number': 'INV-1234', + 'net_gbp': 1234.45, + 'reference': 'REF-1234', + 'revenue_type': 'Commission Fees', + 'vat_gbp': 0.0, + } + table_spec = {"format": "excel", "skip_initial": 4} + # NOTE: `get_row_iterator` should no longer fail with Issue #52, now + # that: `excel_handler.generator_wrapper` is not parsing skipped rows. iterator = get_row_iterator(table_spec, self.uri) # Assert that the expected row, after skipping, is next. assert next(iterator) == exp From dd6a696a512cad25808f646bd5aaa8fccdbfa74d Mon Sep 17 00:00:00 2001 From: Craig Astill Date: Tue, 16 May 2023 16:09:25 +0100 Subject: [PATCH 9/9] Fix `zipfile.BadZipFile` error when reading `.xlsx` files. Use the name of the `.xlsx`, when opening the workbook, since the following error is thrown when attempting to read the `io.TextIOWrapper` instance: `zipfile.BadZipFile: File is not a zip file when loading an .xlsx file` Issue: #54. --- tap_spreadsheets_anywhere/excel_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_spreadsheets_anywhere/excel_handler.py b/tap_spreadsheets_anywhere/excel_handler.py index ffd2b84..4aad321 100644 --- a/tap_spreadsheets_anywhere/excel_handler.py +++ b/tap_spreadsheets_anywhere/excel_handler.py @@ -69,7 +69,7 @@ def get_legacy_row_iterator(table_spec, file_handle): def get_row_iterator(table_spec, file_handle): - workbook = openpyxl.load_workbook(file_handle, read_only=True) + workbook = openpyxl.load_workbook(file_handle.name, read_only=True) if "worksheet_name" in table_spec: try: