From 37ed78fdb5f710c586efa6a2299cf711b24acebd Mon Sep 17 00:00:00 2001 From: chfw Date: Wed, 1 Feb 2017 20:55:29 +0000 Subject: [PATCH 01/18] detect ods types: boolean, currency, time and percentage. support repeated column and multi-line text in a cell. --- CHANGELOG.md | 3 ++ horror/ods_formats.ods | Bin 0 -> 9960 bytes messytables/ods.py | 111 +++++++++++++++++++++++++++++++++-------- messytables/types.py | 45 ++++++++++++++++- test/test_read.py | 108 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 238 insertions(+), 29 deletions(-) create mode 100644 horror/ods_formats.ods diff --git a/CHANGELOG.md b/CHANGELOG.md index a3b1d91..62bead6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +0.15.2 (unreleased) +* #: detect ods types: boolean, currency, time and percentage. support repeated columns + 0.15.1 (29 September 2016) * #158: Add CDFV2-unknown to MIMELOOKUP * #157: Fix for Python Magic API change diff --git a/horror/ods_formats.ods b/horror/ods_formats.ods new file mode 100644 index 0000000000000000000000000000000000000000..fa1f7ff9fceef1586e0b071d503a87f919be99cb GIT binary patch literal 9960 zcmdU#bzEFY)2K;s34|aC76=~PT@u`bJ3$Bc8Qhsb2(H21J-AzNw+!y?1b6sg-!Gf& zzMH-G{&{PD$L35`PgkFQ>U3AjNiTRjw`qM!{K|O5uUqM-zSO7py z)&N~=YjYEQU66^DCB40+0iBgD&;&?lWeu=2u+q1)09bAZzdttKZ+8gL z_j|{p1jGA-zO0m@FbWC^78VvB9^R{0uc)c1nVFe+cz6T^1SBLR6ciLRG&J<}_07%A zfk2>(iwhVG4hRT{h=@o?NJvjlFDNJ|D=Vw3t7~g(>*?tk9UYyXo?c#F-rCwaK0dy^ zy?rS0?yk5SoCF00+b1scR?%^4cVg0Zh4Lxv(93DPVQ*!&M;CX=oS*82K;?OoXzpJO zErlnPaWc5lh?s}zYfrYH(;ChMx86wH8Jt&WHoCGIN+erpI*;)lwzrN$MzI=?iwMn( z+}agf`7f+vlOweu>+!?9+mIS90dz7|F@TC=wk2 z&m*E@8lJj?0u<{oE!@MFj=`04oZ)f^X2Q%jqgtzV6*d-@=0@}f{e~aYI&3fo&-+<6 z0OW>77Ml83-ThW-@-AFTg}60V`vHNK`${z(Xc8j3AZ+)bagq6&lEa(pz(MGowNDvy z4!pG;R+4p10O{G+9{J-D;6YQ>UExMdQx$phpm=)em!hV}mQ-A37v|^`N#t4OdcpFS z!^Hv@BCIZwHy0Z!d}bdsaA)JFBi7fvFs_rk=$i;5=WowM2a9FMJP2+81U3rp`^PrU z0~ed6SG&Nukb(2e>6~)bPR!L$U{&el@NspXpoogs0dd5mcn)$m=nUP+oO~ZBk(wUo zGO^Yx;t*$!4Tk$!aIz5OJK^{l9{KT`Nk$Z1%fT|`9R^87PAj-u;0np$J(2#z{b{48 z`C4;D(0@RI-{x9T&~g1JYI%Otogwa8_QG<5lOGL@0`22z2+E*J_H7re#geL9QfBoh zULOw?s~20>xuU0w8%L0nqS+VR+@a6#>oUzhc*ye~>}H-)23v%+rb_Z+>cU5ttBt6F1he`2?iC|5{Hn7*G3(Tr+x(7w)2-{%u;IyddRnv3kb z#;4+9*^blWEw!C}sqp?u?0CV}di|gqUmRUfBmLlc*!o+sHgho%Nz7&Hq$2aTVGaWn+!FI>v<7*_ReFOR+sBtI&KKm&M1yTw0bu}LI=FI zC{bM|fmGbHx7G@q23q2U1rWfKXyGFn;iP^yvdpyW%F#-Ks7tBbh|7tsiutdDd40X4 zGbJ7@V=VR0c#&Jb+jpQ5*d4toJd1zD8xek3j#{?0;9~KZ zphmV_-2*LI0n9EswZvb4m+N}lT0PpW5PvKW6kY{3$N_~3Rwfbwh(&Zgx5V1e^K2Gm zdFi30OsmJduFsO-{VrV4-+%0RS9hoxlNjtC{$3qxCCU5rYvuclbYldn9W-(@ZSELA zU`Zm_U!QR8^oktqxd;yf-AG?eV$&k+YE-2cMz3ugh?d=kIof=Cp&Iu%tf@LSAn&%o zG-tY{wmm^`{>)^F`Mf;0k5C#RVc`z`++K!#jk}=-?+D(*yw$Qd1T6_>4$N<%l4O0S z>Tdes?E=i5M>9QzsKbKM>sqZI6*>Z-LRut!SapiZJ{p+^j|W-?Mkb%y*8J9$uEX>O z@@{QI`${g_%PZB>vybf-R3_U&eHQhw9{Yy}Q2TDwc059>bsOrCra2;uOv8%yAjKud z%U%H|?K>d?VQB&$FGWukiI$_qwCKDwXk! z%+qG&zHiE0FV|$sg3UTQs7KFc3(sBpCB8ZuiNy_c@y(eXZU6{B+N|=BMKkuz_Mba# zVIygkvy9Aivd#vCRHN~oFM$6)+}-VrH$|W9Yw7tvRk|lUl(?{rP?3O+_kSW40s_Jx z(eL5T4?(#<0RliE6H6l?ouh^MuKGMMjSJJg$wT<#IJZ`qWkbMdEx3Sx%3>B9Q)B+I zP&XT?)H9*0@p-7)s%;4~Ec?@e4KT5IBap-O!q#?S>4>$q!(FD zV7i@Ej{y6IhBW+T&3-&uR9;Wz%9JI;R|IWOy-L*zi?*F<&9od&C}Yzmi?R;q=%wW( zJYGn^uM8M;uqfTA_TEubP*Z4s5K&Oy<|v?VvMXTXM+P9Q{? z|7eVDZl`r63UTn^fASXO?}zq$Xr2W&Vwe3j9?53==Q<9Q2x^a3NxB#V&k>0WS&!)K z*Bidcf<(}VpVRr#N}v&h-QXX`F)l~W$_0N|xofS|+26S{>k1)b`tlCPIZFu2T=c~! zjjh)H@9>cX&DLEohj{|?-`xpPn2a_g@2H#EdA!yb1goH8u|9}Sqq*){b>~(1R``>y zzL~LwxCBjQ2@HJ^T0qQPlue85*}(kvh+|HuA0IM!it;F?MNwTF)@o<7gkUJ6b12KS zE>aJ{Z{!(`?{rvIf~#AJqf%;xEvEAzB|g?=&XA}NAJws3Y5>;d&Yu?{NH=wuwK<&m zfjeAfrE%hK<;dxS@e6ofroX83#8PhA3cAcDX-CaaHBIR!a;(*7D9pkdwm@Q_Uznyd zBo7_#^SE{A+xO;ANu%1hAz#$!dYvcE2*4YP$x_HEVjW=Ft(W`iR!BtU)||O*BSsT< z{ObFw{0=d4Qh5`qH%&|PdS2*Wr%a&NYiu)`Hj<3ta*g!kz=Wbrc>-=ytN{T&0*2nA zYUwvD8Q2StElcxIpgF(PXI&!-0I+Gg$ZGt1SMzGKXDq&2^`D{nROye40^1~uPVXK! z&R)3XMzrD~x9yT0JQhK8rZY6z%8L50c}g0iIiPMg~ixd4Fu4%cC;6t;*y{*Cp~0TG=xqi7f!%nnm* zQVYY^!)hd@TO=Gtj>ju>L(9oI3%(rPx{y4Kt^`P)Dy1=YMSvS^eN zdX%CUlw+g7oS;K?iAHn`h4;ct@&NDmZd6f-tOn2***ej4U>K&A!~*Br784NUsuNEr z>Ch*;I<$8Ev|%LSdzE=Bl{8O~sw{>MeUV|Nwwy*d(_DdTt!HaTCmXRRZPb`U`kjOH zNeLd8Btv6j(B*iTB+X8LApaq={LS9eDvR~qBK_8C)^m^XVaC8_84tG$z(=l9XRVWA zHf(Z}UNe3b3N)44OwNOqMJ{K}%Ke6?o$iBbajA6E1Wq$bEvg~=38yn8V}~|@O3e6u z4KF+9%4!)}ofmxESA3PB{El|cwkF8VE(>Ex*|kyJ&4lK)EEvxOaON9@_kC6J%5Qc~ z@hzBSK$A&Q&Jo$&t)$N1L@s-8-Tj*0`I_o2oi;#l`bJ$~xRn>qxU@wZ=wdm*2)^kZ zH{a*JT*?qL^1HimCD$!>F-)m+G~}U)B<^jztw@kF8YXZ!X_2j{T`QQiNjNQBg&9rd z+T874qx{;7{&5HSX^JabyzMd+Pq$HoFFKefF>s$w58LvKX;XcK8nv@V#RT74@^(8D0&9<-|(7Qp37^xHY@R9%Xj5n0zTgUN#3KC(?n zB4-tQ$!K&41ERRGm^ZZN=M$I9cZa>DP~r#~V*bHGvn&mZVxl_QNm?}6bwjhe>{jAD zPVIphadT>O-zvyCo|p)YoFtwsABkQ~U9R0aRDmjpU!fA?t|7|!5R51`pTSfsyu>VX zP#zfZB$jauO@ldl^WjmywYm)~xm^kfi$rs0sb$B1Vg>tL8U^sVob~-NlV%w)TMdbC zrC=->EeFIvdpNbvilvpYWF45vwCAWWm}`tdGbHO8KK3DUtsB`8tObVQQiTNSqLS(qq4eo`OC_nh zHWa~&j&Ikt_AkPfwAf_%ZKpB?_ zl~!AS+K+TkhQPLxSgVZElnj=G8GZ!8T<tdmT=4&i7M1r zGsDo-Ip{lBNF@Y|3~Ogq!olOM5ldQ2jb(fc-b7X3*+wi)>P$99+`0JQwIaoGTy{IPi-UsNJ+hf)v80n@IuhJ8W+E(T_9!1FPvuw~U z#Tu2T9-4D`W>CmL_COYQw)m@$F%h3lNdyldZ5_Ls^*NhJ z)e{{Cy)>$3&tW$jcR>?iRkF8(6S#5@7RMQsWa5wMKqVfK%6j|co?Y2Tx!V(+RzEeWV0 zcrFP}cpULaDqw3z96mYCKSVS}WBk_kR9aX(BJ`VOEY0LfK>_oaj)l;eP;&uAOVm-R zU-jMA7^C)y21fpxcU(T!Z6Cz{49BNbtTNH3wMR{Dhe#I~$Lu7E-ijCs#_b*YV&<8^ z_C^^_Kx7()WLCE=p~23Drp^Gz&5v@`}!=X{JC};K(rf zM%K*vWu@8%*g*3|#DsEIKp|d0xW*qqT`j=a+4Kk=Q`>^B+V*ViAoNWi1_$;ox6!A~fP*b`CYl;_63Mwlvg~#2NUOT+wT;snaEe6RWbOG%A6y5=ygLC`Kr#0d5C6T~yq~*!B?3&p!6LNrwXcbg|lr zWj#lOG*&jsP7to-=gW}TRoXjv^G3e1`grEvqnWHX#-17(-AVPSFFtGr=wPl-VAr8g z)_6)+SCT`5oO|&R!kH&;&ScN#xmJ4%M=?FzeFkMywG|hugn1q15()MF+k{!&qnSCV zzs?FrxsS2KK3BRLKQ3)_8);YsfOB@*&H-l>t|wFlPu}KnS?z_7GhdF=+Nq=)rdHTD z)a*KaFP~0*QGu1j9yNGvFy4&}8zeeQ7wO2y?=lBo=XbtU;3jG}fkNUTOp16EwOiPs zSW!n-u+&)iTqT?Q1mCv6V1#ad_iHrQrt#;JxR!v55ED9BjWEy;Hq zwEK=y2#B0L_CidROt9pZ}+}f(*vitPorU^4C`hUT;;0S2X7(lf#cY#b1+cg+@^2@ z=DTqt>ubJTEr>B%F5dfOG(6u9S2iyqb9>!V?ZS4WlB~^ ze?oPNYca|O8%`t^@z^bcOrqEK8Ep7e#nFWJrdd;3@(BHA=(C#IA}sB2B57|@;z5*; zM!u~$-A2zSm}|V~=Cz|!)=n%TeT9yyj~#_GkV(D5 z+O7xpwFV)-8<@S~)D=oZ+$ZNNOqm{v#lbG1i$SoJnTK9zo>vXWo=MgiuHb`JNftnt zU0l#2y&03ftElS(D-*wj7*FL&mJ`pwuBYZjx~Xa-EPlH;qb8~LA*pKCa(%j$b2;}p zO|;~@L3J=m0JVe+D^)O$REfHjV++rB6t>o9(iQJ?x&uid85aQ(%_S0dwSR2Pi^&&LQcPu2$A%1^!kcH zYc4$c+Kz{H2-gqE5@jWH!G9hoX(q3+y3hc6{7FFJ|&`18?$5@NmJWIx`3A|okhgZ+E_6dtUfu|$QrnD z=z^29ienC9f`Zf~&!qZI$~15lTVvu8qgS396d>S@6h_durQ+MR$B!q)0U?Etoyy*4 z+r-i(MAdL&z*QM~6Yq3zr0Lly19-T3@{^*%)(0FCi2Pv0MBqZvog471{PS?w+c|fT zP(MyQd!_E@-GwmH3FlRC*ihxMxemirwdQt8-P7)g9wOV)7J*uX>#91Mw8TGTwd!7m z8}pi63;9zS_pp)(sfcsK+{lsq8Mdl#kaSnjEkg4VyA-`GK0OI`=v*w>+NEP zHZnxRNF16X4W;;CSxS8ns>vGww_kJg+DGOTK%Rf|tt^%ODuCoApN83CT?9|uH4qL~ z;`i7G!D>}w%JJE=u#9am51pAYD^Rmx@YnC;tE}R*t7uWECEB}7*Q)P+qo=O`!l`N} zDz%$wwN`4%4Ihp;aHXi-M22b3lfCY0Cnb}I+ICKTl&CDKXp2faeljHQ<#E}=UN^yj zgs5^t9UeV_BWswS6fSHG%VP{NqY@1CsA$K+y0gtaRgJ5mYvUUQ%kkjDU)tD3T|(X z_78HZ@#4>3Ufj_i?k}rugf?!MsJaet&d;yg-9@Zi>nysieY*A%tBdy2lJZwYwLj*HR+1>1A%kffwUhL&!1UP2fEPE#zlM$(d-eqOs9o>WgM2Ugt<9KNkt8GRg{ zXWueWvf*NnK4-_=%#zt-S8M3V6qzfUWE#6&4FY*UjS)9--rT??ew2~ z40Ks%#lA{pF58X1!Ca~&s9PrOaImhJK-5$qN>F>6BW&!T%hYvr`*8$nE`HZn?p)#c6$nOejo%)Gdj@y^qsEn z5=_UCh@KITAcDhF>D5O)}1q5S(45OXoNY6vF5 zGcI=(8J2Mt^r0NhGrC%Tyt+&3is;L?v06&4`OO96FScWK6f4B$&Ai8#56Dfm#|f>^ z80t0{%`C4KwX}Lt&`4cQtG|j9n_$TEdd!lfRmhu(5vB}m$-Y?}R!5>@LAJGHhKBD& zlhNe2hQt%kCvWpNYHOr==35zdEbJIUY7z65;6E-Nm~15XMP~;rN3?G)%G086@!O9I zE{@O5Qv`I!j>BxNc~oe!c|j(i+l$x}XzLdV>(p=HA3|v$Ns2JWeJHIY`?pZ~NA3;) z(tQ|E2SrGRj51*QoL&2O^^F~nB(ju*2f{qlV|*rt*-|8tI33thLfsh~>G3Wp(}zy= z^1SXW!6nh)-MR>T@q#))I#^Xq4oYN7B9=>3+kw#9JIS9tmiJ{8jGWpes}7H4b99TW zsXC+fY7S6jqExAn0RUb2U48;1rb0+_RF5exs!Vy!DENInKcc{8Bia>Zc@a5{v;xy? zkue$lOfJwljY|AY>&DPCY^Ksu{jYaVSWCyl3M!tbqbCc#vria#5<5t(RhKt%2KQpZ z4+T+Ddr1ZMMgx;(S(`Q?U2k^pMS52jLl0%KL{KJpL_?+C;}LMj+bT1c*)}?6J8N_Z z2Q5q1x`!<$D<+$J0QQ}5q%Y5V+?g~rvUYV0{T_yNZtPE;Le?)WBPb0ct*UU;N+wbB zUVX^5fz;%x95^vtc7@Xibn>570S=?T4Pnz{Dd%5@PY zMQsjT?Tgoku(x1+;pe=9&Y*~7J6_bv2Kij*9c3@?SYPC6@MoAegtuf|9i}{zoW@B` zN9a$3DO3rf2rx!VAE#0f@(-Nhp{hB9#%lWGt*J&Ihoj;V6qNgY0`&LOCl3?V7P^)u zh5#Ul&ep)NKce3f%<%NXK93JzVB!ruEUYbMPmO7EGpnyEEd62Jv$@9L2xYm9yshMK z%S9s0^TFluySn&Y0i`)DiY)rC$$G?$D>kXQ>gZ}Rr@5vkDya$M2o6lanaNbO4wh7$ z+Mj)B*-^{~%hD(viGZyS@D69GxZGaI{LMhR z(Pij*`5~c_Xz@c&DL*qFfxUMh_)qu!$a3F%>`!^fNDEq78k!i{+5VB=1~Sp#=dhZ(wUVi^)2=DLyy$U84x<&vXy?_bGLf0DjFEzpa2W4w3Ydh-)C;UI^0YSPT zJK(>24*#FZv$Zm^1pxmKn*Zc;kd>ABznAw<^?T5=*0ltf|8M>NsjaZ?|EILD;#*pA z84>zF``0~A3l)1UTZx|5uT#?$MWy!V3fS1@e4`iR_gty~!yxNLQ{G-p$sOh7DVD_H z)p2t(e`z)aA~a43l0Pp&r$1Cv^kl0yc(P~^VCdjkAbJ*fES<=0;%nik`k$oLZrBY7@}90Qdhhw4*)i3deW(?-6^A!Y z%ykkQLWl@=82gT(;9|Zy{H6Z%^=kkF?JksR4ySTu-7~&nj(7g=BxAoc>9)g;g5$Qz z6cJ)NCqwZEkKmYFy&#H7!saB0<#(b-&(L1Jv|@*f?~HI?H?XYZwjAC-yCXpM=Y<-Fqs-oV6VdX~)|2iZ60M4Isd%rs6*L>gK zHGdt#KFnDE6ytkCzsz6%UGdj|{V>A)DK_^$`8nSGyWX$#{{sYnD%RuO6aRmM``@*G zh2CE)m`(84IPpXCcToJZIH?oSCN`fp1=0sMO_fAvqtZ>;*7y9;BYwF09}%!5r2qf` literal 0 HcmV?d00001 diff --git a/messytables/ods.py b/messytables/ods.py index 7b03d74..e9d56b4 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -6,17 +6,42 @@ from messytables.core import RowSet, TableSet, Cell from messytables.types import (StringType, DecimalType, - DateType) + DateType, BoolType, CurrencyType, + TimeType, PercentageType) -ODS_NAMESPACES_TAG_MATCH = re.compile(b"(]*>)", re.MULTILINE) -ODS_TABLE_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) +ODS_TABLE_MATCH = re.compile( + b".*?().*?", re.MULTILINE) ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') -ODS_ROW_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_ROW_MATCH = re.compile( + b".*?().*?", re.MULTILINE) + +NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" +NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" +NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0" +NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0" + +TABLE_CELL = 'table-cell' +VALUE_TYPE = 'value-type' +COLUMN_REPEAT = 'number-columns-repeated' + +ODS_VALUE_TOKEN = { + "float": "value", + "date": "date-value", + "time": "time-value", + "boolean": "boolean-value", + "percentage": "value", + "currency": "value" +} ODS_TYPES = { 'float': DecimalType(), - 'date': DateType(None), + 'date': DateType('%Y-%m-%d'), + 'boolean': BoolType(), + 'percentage': PercentageType(), + 'time': TimeType() } @@ -102,13 +127,13 @@ def __init__(self, sheet, window=None, namespace_tags=None): else: namespaces = { "dc": u"http://purl.org/dc/elements/1.1/", - "draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0", - "number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0", - "office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0", - "svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0", - "table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0", - "text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0", - "calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0", + "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0", + "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0", + "office": NS_OPENDOCUMENT_PTTN % u"office:1.0", + "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0", + "table": NS_OPENDOCUMENT_PTTN % u"table:1.0", + "text": NS_OPENDOCUMENT_PTTN % u"text:1.0", + "calcext": NS_CAL_PTTN % u"calcext:1.0", } ods_header = u""\ @@ -129,19 +154,61 @@ def raw(self, sample=False): block = self.namespace_tags[0] + row + self.namespace_tags[1] partial = io.BytesIO(block) - for action, elem in etree.iterparse(partial, ('end',)): - if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell': - cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type') - children = elem.getchildren() - if children: - c = Cell(children[0].text, - type=ODS_TYPES.get(cell_type, StringType())) - row_data.append(c) - - if not row_data: + for action, element in etree.iterparse(partial, ('end',)): + if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): + continue + + cell_type = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + repeat = element.attrib.get( + _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, + type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, + type=ODS_TYPES.get(cell_type, StringType())) + else: + cell = Cell('', type=StringType()) + if repeat: + number_of_repeat = int(repeat) + row_data += [cell] * number_of_repeat + else: + row_data.append(cell) + + empty_cells = [c for c in row_data if c.value == ''] + if len(empty_cells) == len(row_data): # ignore blank lines continue del partial yield row_data del rows + + +def _read_text_cell(element): + children = element.getchildren() + text_content = [] + for child in children: + if child.text: + text_content.append(child.text) + else: + text_content.append('') + if len(text_content) > 0: + cell_value = '\n'.join(text_content) + else: + cell_value = '' + return Cell(cell_value, type=StringType()) + + +def _tag(namespace, tag): + return '{%s}%s' % (namespace, tag) diff --git a/messytables/types.py b/messytables/types.py index ba017f3..04dd234 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -103,6 +103,27 @@ def cast(self, value): return decimal.Decimal(value) +class PercentageType(DecimalType): + """ Decimal number, ``decimal.Decimal`` or float numbers. """ + guessing_weight = 0 + + def cast(self, value): + result = DecimalType.cast(self, value) + if result: + result = result/decimal.Decimal('100') + return result + + +class CurrencyType(DecimalType): + guessing_weight = 0 + result_type = decimal.Decimal + + def cast(self, value): + value_without_currency = value.split(' ')[0] + return DecimalType.cast(self, + value_without_currency) + + class FloatType(DecimalType): """ FloatType is deprecated """ pass @@ -134,6 +155,25 @@ def cast(self, value): raise ValueError +class TimeType(CellType): + result_type = datetime.time + + def cast(self, value): + if isinstance(value, self.result_type): + return value + if value in ('', None): + return None + hour = int(value[2:4]) + minute = int(value[5:7]) + second = int(value[8:10]) + if hour > 24: + return datetime.timedelta(hours=hour, + minutes=minute, + seconds=second) + else: + return datetime.time(hour, minute, second) + + class DateType(CellType): """ The date type is special in that it also includes a specific date format that is used to parse the date, additionally to the @@ -186,7 +226,7 @@ class DateUtilType(CellType): def test(self, value): if len(value) == 1: - return False + return False return CellType.test(self, value) def cast(self, value): @@ -195,7 +235,8 @@ def cast(self, value): return parser.parse(value) -TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType] +TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType, + TimeType, CurrencyType, PercentageType] def type_guess(rows, types=TYPES, strict=False): diff --git a/test/test_read.py b/test/test_read.py index f4b73d1..c09a727 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest +from decimal import Decimal from . import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal @@ -22,6 +23,7 @@ import datetime stringy = type(u'') + class ReadCsvTest(unittest.TestCase): def test_utf8bom_lost(self): fh = horror_fobj('utf8bom.csv') @@ -188,7 +190,8 @@ def rows(skip_policy): row_set = table_set.tables[0] return row_set - second = lambda r: r[1].value + def second(row): + return row[1].value assert "goodbye" in list(map(second, rows(True))) assert " goodbye" in list(map(second, rows(False))) @@ -308,9 +311,9 @@ def test_read_large_ods(self): assert_equal(6, len(table_set.tables)) row_set = table_set.tables[0] row = next(row_set.raw()) - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) for row in row_set.sample: - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) def test_ods_version_4412(self): fh = horror_fobj('loffice-4.4.1.2.ods') @@ -334,6 +337,94 @@ def test_ods_read_past_blank_lines(self): assert_equal(rows[2][0], 'Jane') assert_equal(rows[3][0], 'Ian') + def test_ods_read_all_supported_formats(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = row_set_to_rows(row_set) + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0], "2014-11-11") + assert_equal(rows[2][0], "2001-01-01") + assert_equal(rows[3][0], '') + # time formats + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1], "PT11H12M12S") + assert_equal(rows[2][1], "PT00H00M12S") + assert_equal(rows[4][1], 'PT27H17M54S') + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], 'true') + assert_equal(rows[2][2], 'false') + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], '11.11') + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], '1 GBP') + assert_equal(rows[2][4], '-10000 GBP') + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], '2') + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], '3') + assert_equal(rows[4][6], '11') + # Scientifed not supported + assert_equal(rows[1][7], '100000') + # Fraction + assert_equal(rows[1][8], '1.25') + # Text + assert_equal(rows[1][9], "abc") + + def test_ods_read_all_supported_formats_casted(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = cast_row_set_to_rows(row_set) + date_format = "%d/%m/%Y" + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0].strftime(date_format), "11/11/2014") + assert_equal(rows[2][0].strftime(date_format), "01/01/2001") + assert_equal(rows[3][0], '') + # time formats + time_format = "%S:%M:%H" + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1].strftime(time_format), "12:12:11") + assert_equal(rows[2][1].strftime(time_format), "12:00:00") + assert_equal(rows[3][1], 0) + assert_equal(rows[4][1], datetime.timedelta(hours=27, + minutes=17, + seconds=54)) + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], True) + assert_equal(rows[2][2], False) + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], Decimal('11.11')) + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], Decimal('1')) + assert_equal(rows[2][4], Decimal('-10000')) + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], Decimal('0.02')) + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], 3) + assert_equal(rows[4][6], 11) + # Scientifed not supported + assert_equal(rows[1][7], 100000) + # Fraction + assert_equal(rows[1][8], 1.25) + # Text + assert_equal(rows[1][9], "abc") + + def row_set_to_rows(row_set): rows = [] for row in row_set: @@ -341,6 +432,13 @@ def row_set_to_rows(row_set): return rows +def cast_row_set_to_rows(row_set): + rows = [] + for row in row_set: + rows.append([cell.type.cast(cell.value) for cell in row]) + return rows + + class XlsxBackwardsCompatibilityTest(unittest.TestCase): def test_that_xlsx_is_handled_by_xls_table_set(self): """ @@ -573,8 +671,8 @@ def setUp(self): PDFTableSet(fh) except ImportError: # Optional library isn't installed. Skip the tests. - raise SkipTest("pdftables is not installed, skipping PDF tests") - + raise SkipTest( + "pdftables is not installed, skipping PDF tests") def test_read_simple_pdf(self): with horror_fobj('simple.pdf') as fh: From 56d91521d0b4f0bde30ba4bfb5e60ec8ea20b31c Mon Sep 17 00:00:00 2001 From: chfw Date: Wed, 1 Feb 2017 21:20:48 +0000 Subject: [PATCH 02/18] fix python 2.6 unit test failure on testing ods casted decimal values --- test/test_read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_read.py b/test/test_read.py index c09a727..dcb23ee 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -420,7 +420,7 @@ def test_ods_read_all_supported_formats_casted(self): # Scientifed not supported assert_equal(rows[1][7], 100000) # Fraction - assert_equal(rows[1][8], 1.25) + assert_equal(rows[1][8], Decimal('1.25')) # Text assert_equal(rows[1][9], "abc") From 9ae2ed6863d36663641d84dcc5b1919df31a0fa4 Mon Sep 17 00:00:00 2001 From: chfw Date: Wed, 1 Feb 2017 21:24:51 +0000 Subject: [PATCH 03/18] fix TimeType to return a datetime.time if hour is less than 24 --- messytables/types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/messytables/types.py b/messytables/types.py index 04dd234..3c70ebe 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -166,12 +166,12 @@ def cast(self, value): hour = int(value[2:4]) minute = int(value[5:7]) second = int(value[8:10]) - if hour > 24: + if hour < 24: + return datetime.time(hour, minute, second) + else: return datetime.timedelta(hours=hour, minutes=minute, seconds=second) - else: - return datetime.time(hour, minute, second) class DateType(CellType): From 92b95dc4d9b7d5b82d3cb20f43cc7d7ca0b3e945 Mon Sep 17 00:00:00 2001 From: chfw Date: Thu, 2 Feb 2017 09:23:37 +0000 Subject: [PATCH 04/18] performance improvement on test/test_read.py:ReadODSTest.test_read_large_ods: 45s down to 9s --- messytables/ods.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/messytables/ods.py b/messytables/ods.py index e9d56b4..f766661 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -153,6 +153,7 @@ def raw(self, sample=False): block = self.namespace_tags[0] + row + self.namespace_tags[1] partial = io.BytesIO(block) + empty_row = True for action, element in etree.iterparse(partial, ('end',)): if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): @@ -165,6 +166,8 @@ def raw(self, sample=False): _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) if cell_type == 'string': cell = _read_text_cell(element) + if cell.value != '': + empty_row = False elif cell_type == 'currency': value = element.attrib.get( _tag(NS_OPENDOCUMENT_OFFICE, value_token)) @@ -172,11 +175,13 @@ def raw(self, sample=False): _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) cell = Cell(value + ' ' + currency, type=CurrencyType()) + empty_row = False elif cell_type is not None: value = element.attrib.get( _tag(NS_OPENDOCUMENT_OFFICE, value_token)) cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) + empty_row = False else: cell = Cell('', type=StringType()) if repeat: @@ -185,8 +190,7 @@ def raw(self, sample=False): else: row_data.append(cell) - empty_cells = [c for c in row_data if c.value == ''] - if len(empty_cells) == len(row_data): + if empty_row: # ignore blank lines continue From 0fc889f2a7ff8729dc20b5bc611d4fa61d309ecf Mon Sep 17 00:00:00 2001 From: chfw Date: Thu, 2 Feb 2017 09:32:00 +0000 Subject: [PATCH 05/18] code refactoring: reduce the length of ODSRowSet.raw() --- messytables/ods.py | 51 +++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/messytables/ods.py b/messytables/ods.py index f766661..6f6708b 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -159,31 +159,12 @@ def raw(self, sample=False): if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): continue - cell_type = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) - value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + cell = _read_cell(element) + if cell.value != '': + empty_row = False + repeat = element.attrib.get( _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) - if cell_type == 'string': - cell = _read_text_cell(element) - if cell.value != '': - empty_row = False - elif cell_type == 'currency': - value = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, value_token)) - currency = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) - cell = Cell(value + ' ' + currency, - type=CurrencyType()) - empty_row = False - elif cell_type is not None: - value = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, value_token)) - cell = Cell(value, - type=ODS_TYPES.get(cell_type, StringType())) - empty_row = False - else: - cell = Cell('', type=StringType()) if repeat: number_of_repeat = int(repeat) row_data += [cell] * number_of_repeat @@ -199,6 +180,30 @@ def raw(self, sample=False): del rows +def _read_cell(element): + cell_type = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, + type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, + type=ODS_TYPES.get(cell_type, StringType())) + else: + cell = Cell('', type=StringType()) + + return cell + + def _read_text_cell(element): children = element.getchildren() text_content = [] From 927ed7157f136bd00d9acfa2c4a1bfe5dab1a31e Mon Sep 17 00:00:00 2001 From: chfw Date: Thu, 2 Feb 2017 09:39:18 +0000 Subject: [PATCH 06/18] add a unit test on reading multi line text cell in ods format --- horror/multilineods.ods | Bin 0 -> 8030 bytes test/test_read.py | 7 +++++++ 2 files changed, 7 insertions(+) create mode 100644 horror/multilineods.ods diff --git a/horror/multilineods.ods b/horror/multilineods.ods new file mode 100644 index 0000000000000000000000000000000000000000..c03f6456a3144882c3462a9ebbb1ba0700abcc85 GIT binary patch literal 8030 zcmeHMc|27A*B{x}WGj(`B#f~$vL#u=SW3zmV~jDDnPJ9Sl6}jb$dWACDiS3-5mH$~ z$d;v$Eo(`5W|S}W)A#i}&%e*}nb)1U_uO;d=X}one9k>{4n&unf(-zm0RRMw6}18! zz@cIQ0DyE5NdQN*Ba+~OL&9-52Q&gsKw~i=SB$k77LG^b#jrRe#u|%oaYSMWVtAZ0 z5^jyRMIs51Ele63nk`Hs%s=A<@$3v1V}nMyIB%ihB|&fm0_lJxUSgd=2p4B(5}3QA z133VqOHHx?uFJAxtBho2VMSM>hNCrus9bSN%UXr+)EBmfD>H)7giFmScm_) zl_V(~9D{WD!%Dl=fPzMH0DyFmd}3nj;%J3|qaE-d!uF{c4ucAT=xZ|49i}6)FluY5 z8xen>0RUtG8q&ImtQJuO0LUY>)lZwaC(rarICntUR1J}V_YMhPV7_1}EPU&_ZgyHm zrmmlE5ryX$y61F~((@J+lC;uKOH*FI28X{l?T_BrjJXp&DqmBrm~;iOvELwVTCM;C z>vxPVVygS7;Q#nC)T^TU+A4e&{>(fGa;wj&^zy?`cLL~b!qxcLVpP&ZOZvd$AD)+H zM9~DO3kd{4R+3Cxu~qx)%CQ;zauJDYz258dEAWcT)3QnHkIY{yvBcZbAqxldy`ipk zVNZ1AXPv@{N-BA(!tb0)77vg-zp&c9yxe6O{oKh|=+%t8g5#)YAA%(p86%#-(wJj&d7_a()u z&!rnnIJvKi<~c)}-`p}#OsHO-*e!d#)#mT>jw=Q}9_qiwNmWhf+&r2cRZ_RMAA}v} z34FD@X50iy#bUBRMuBpJE4a=f{LRU6L>%t<5Hu1-39eg7AWmBC*a5l_h?YOmBSbs5 zaQ~n5f6@~RtX09N;m{|A^~4Pd&_1iLp08%<`}dauJ&_B6#Snf4{(*~n*il*L%CDw} zF5Q(&EKDZI<;|S+NuM|dMjq_pLE8zi>W1voHk+@WgTS&JUf$>HqkJF6+FH@*A(hA} z+zpe$bi>fW>MHdKMS2vcZSDCN&uNNC?^U13hjv%Q@kDGtw@D__o zNy?eW_!*5yTWKT+p*Rzo7PQ!-+#WP(YPvjVu$tCx$?CRDbsX+$Ieu#T`m2>ie&Xd3 z`wMT5pAzNh%06Kyw%Vl?BlN)Yg1Cs7Nrg_d_W%{OeOU0@yu4dop1A}a)5yrk;*}L& zk4O<6puNfKlUlf11}M<9xX#A1*W8O~O1Jry!+uX!bOO@U;7OY$v+rbCL$X^`E9X+T zM9KiPk&yoSN6pf-^8MPn3ucXrmyt)-`fv2s)i18ArnZ}TE2^Zd>7eik@bq01^Dryd zM8@~yBKJIVBib0Q0Ocym%d_9O?S~Hdn_jxu#*;kcj~DcSImLyuGdhWc((KdhURY-? zRZE~%)8ca5FFQZh4!e_T@r_KPS)6yU3zvd zfe@}L>36p76XfhzVUmTz?ZHY8s`6PD`^0x`gHa!l8Y~Sy^)>f;9TSnANeAD!Wh1zL zWKq_of4DONxE+;UG?1;ZU;CKma8F4VO9KOQz$5MxYfVv8&8pa-Yr>_$y(0l1ZRWuT z-F54Nqf*V%UKjB9G&o%zd3mvEFn#{^$4Of+&&KM*P~5{CI@2Sdl@q})KbwEJriD~4 z^mD#EtYb9iZck9?YJHdz$3bpZtL>#fG_gFcDq?zH`QrV@JSG)~k+(mdZd?k_tTCi; zq)a%Qo?c0{5;skD@C46`!+xr0?r+Mr=nGY99^Eh!C6)ABS@+1 zBYk7&82n!GL2VCO8PMB8@c`@W3kFvb%fHOa+_rx9=&4U<{cOR#w=V-RN3O7^wyg2M z=*=9M0InryweHl1H#8Q*@QC#AhsqJ|t8$;zM#GC3Sp*p={EFUl9!l%Z=oIYl48TRj zd`rl;B)AJI7e$jpH(~V+i{C{eU$O)%u?I_YEN3xIZ+t%KdY{qJdrEXT4(Wv~Rk=KY z?4)O<6rB1wlMSR7_G6RSi z9Vyw|D=}ms#sEB$KtN+qcvAK^V`hkdFVB2!uE6q#K7&BK>>bshHuRY!d#QxX$|~9` zWO~*QHDjD63RHR^n+_`(PbUiGLUhXI`o2n_E;F6PTwNiCb{Rdj; zGDJR>i)+afg_$3koKEhie^bL5=OhI4CX`A+eh_kfg~IeuU*++R6SlE~fok8S?DPl9 zvh8PcGwG=iWY!Ud#!PvR52#O?w&b`?ZqO}=e&D(&e5whC&Uxq3=2r{DMrc%1m0jxg z9Q5{b6~8LS=M=xd%yaBPqHnRX=IUW6)apubU6g)ZXuJ_E6wo3h60}jI_(oOaT1yC1 zQ!2MtSv`zAXM=26PTODIDJ(`9x-rCX>!tCSw_QgM)9PW$bszHspBiuERZIBWed`vA z5W+EsQax=G8H^{BEIs{oz2v$I6Wf4lgfjotF6KL@Pkng{tIIt9yd_cSqn3Hm7&P4? z&FBy(#YV)TH!`8^p)IO&w7PjYr5OjafoD`NL>W$DYg@S#^F5v!OzPEvzw%t#1dGVE zT{~T^xp;;*s*v))o5ty>&8ZJ=E?-XIyu;BpA9U`A2VExXb%NHycqMuJIf1mXZG*4f zm=o4ONen{K+&-I}alncFz899m$;KF5d>PJCCc>ZkefL{52J*Esj&-S-2U%Pa@=?DAVh25Zj zGW1eW3Pa&6r9)FstnUh6t<+a$88qNHHUR0Js5*v=MzbT3wRHFiPu-#do|rZ+f#Gv_UgxJz+83f3 z`wtGWRehm7L)?Ce9a=#L;*sn>Vk8G2wP1z>zPnbZvd=H0Pe)hsU(`L#cmse+^J1G*~& z0%R_gwA&Y%C|M=Hk&*lG^a-DvU4CJMt3&(Dk2uC>jwv*R1p5btchP-{AMxRH0`x{a zmUrT{+IhlGX@t3Pd1srHGS$@$<2;^~jq%ZscG0QV zEd(3dim8M<8Vl$BxE0*{mN$UP5J(vf@R|8skvyK`*zvN!OZHUxO0WNG_n`V|MX3*% zJoDp=#vg^zeV?>u1V_plU^SVm^CPP#3)Gd}fZ;P*0hB-#wn(F720R{-Cz}&C&<0g+ z&vfB!sa7# z7mj=rH(^xAy*w09npInpj#9z&*i9gt-lOa>Ts!zx%TzTTW7F3}sb?;;pdSyF#*CS$ zuZD`#viRQax9EN`8Cdz<40~mClUHcpTZkM-l(DN~c!}Qfxu|Gg8ak(&)WS$=L!*FW z9bai2WZr!F@wQdu9P8M61l?pMP{{m}wSN=2iSsIf;<7< z&=jlM;|6hGsXa9&G8L5itJ7a|6_0`sr;Q0JyDZ~`Qj%OBBTr0rGNx8xw8FfT;9H$7)R+ zbK5LWusiFeOwQ%Y=ntNa#0yrifJ_>Xx3IEk_?+|4M<%k)k`1A&hFXZ@CNa zT1UNnR2txYZOT=awL(}=4x3QIHi@~GFSudCl+{k|$pI{brfLD#p=9>)(CWFuik z{!jLSg`qJAX&+whQnF(A_%`ErPHEuT(rM(&!>9f9JyU2k9l;FGTU}m$`!m|eZ80EO#;dJOW76Pe<#X+&MC z-9+shy`h0o10JIg6xsJuBn|ug=AhYln=xzV3;*XLs#R3s^ z7c5-N)P|-mf78zWk^6eJ*itsM;pp*P#zt;!%7P&*xzJfXZ+1rQz)GrYkIt8jb1`hx z11;H|a&l)hEpjnmFMcXi*>^bRZ2|g`4{wk%VpQrtPM`5P*GJ_YFZwuj!9ib^_hq$? zKw14K7Shbd^&*B<6dD?)7?I`@k=!yn+sfD7^sTGOJUR zffNK9PR`6`%L3SGv2|1Sa+RXGm#p`1H$@Y~#&VBH`MvKb4a|ko^HW3hystPa3yUQU zIcsW#4-WSCn+L7e+GI|Lw1rhhWK=%R^%q-`f3{@j*#Eg;$s&sQi$SJ$CaP43XGI9Xg}*z$sMtU0`SH5R?DJgHBFKna;~c z5>HC@nw~ECr0O%dGz{bmO{Fm87QMCMTF8(g-^tHN&)9!$-0XSHwZe-q=);VA{7Sc! zq1Rxd*%gl%r9rU1t~|yEjmpNe0i#_!5t~wWYjUJl!*wbY*C-_b@S0eY`u9xG5spFI zAn^n-XKR}`v2QSb;>`YYiUG)$9#s$}rL$0Tsa{b9!j=`a|C|$@saZ%Brg0Hqq-bst0p*VtumM2^dy*zgHFt}xe*2dJ3Fm*S|<14RH z*nmRr-N$tBr$&`QPkEz#RKsb{HTx6%xXbi)Ur60rGz8bC&5Sf!R(ANM6SIAhh}NFL zV1}%Z79T1f&6@iSU5&H#oJDxLbi9}!;=gH`EIDiNx%`yJ=Jn7(&L9wg4gKA3# z>U-39@xsybngt~H6@RsTOGyR*_z~Z(zgIKAl-VOupUaXKQPRfB1JzfXFKaLV>K|1US$YjdbJZg}wU*w;>|13sQu^Nphpb@-Vo*}@k9FRZ+ z)&)aQ;*;Y0NhCH9h~E53llVo$TG=5Hgl(!gALy6!QAo_sW`nW;@!=b6Eq#!*n7EjP zfGHXyCCR%(Cann=v8Yc10!fQYILZj9xu6}afnYfa>0ie0z$yM(ISN0Y^!C+16WJpF zJ?^m+Q)0cdCZ~bg`Wm2r0w8f0J-k_qspDk`obw6|o?`3qOoVjs2+q)!QCrq)N7wL* zpI=+yk9%u#`4sRxw&(HYT*>JNvec;iO7+%?veN-7MRbt0ca}qsB4@6wH_J}FXg(=@ zL?C7~P1q};1zzW@K6zeJJKZ@-06j@=IpyMdU7))A73(Co9)oDFryi&q6Gv^6^DBz8u*Zzy!xa%Lv; z?vE30si7G;bS5&|zzwzV-Z$sowH6vToDM+MEwg=7F_0Q=$mLP5*(m$&8b5GC7!iAS zzA5Ol2b(W>2wcka7*eMU>*75n$SaIhQL;#g;Fyh0v2bxH&rb4}Xw>aG^Y9d}%wbpv z3|9M{o)jUqwM4T7i4pQF1t~(3?PCM{y|YF1_O7wr=h~LqU82}-KaqM@JHv!{`e)1O zFP*Gy(7$!Ac0lj%0s0?(tl#IgBmA?AvU5GL2iQ(u>+dL}&eqQ0A^P*rdH&w#`a2S- ztFtpM?t!$^_xd{use`pMBKAPR?dpU5cX1l_z}ejw`#a9>v)b4LXQxm0ca+~}B~4sp ze|mX$-|TmsKRahT-f3+y$rY4_U^(QsRX|>&{V$_46-WR8 literal 0 HcmV?d00001 diff --git a/test/test_read.py b/test/test_read.py index dcb23ee..bfaee5b 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -424,6 +424,13 @@ def test_ods_read_all_supported_formats_casted(self): # Text assert_equal(rows[1][9], "abc") + def test_ods_read_multi_line_cell(self): + fh = horror_fobj('multilineods.ods') + table_set = ODSTableSet(fh) + row_set = table_set.tables[0] + rows = row_set_to_rows(row_set) + assert_equal(rows[0][0], '1\n2\n3\n4') + def row_set_to_rows(row_set): rows = [] From 8eb8691b5802896c06b531b07ca2f4073aad71d1 Mon Sep 17 00:00:00 2001 From: chfw Date: Thu, 2 Feb 2017 09:48:06 +0000 Subject: [PATCH 07/18] code refactoring: restore one liner where possible --- messytables/ods.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/messytables/ods.py b/messytables/ods.py index 6f6708b..3a45109 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -181,23 +181,17 @@ def raw(self, sample=False): def _read_cell(element): - cell_type = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') if cell_type == 'string': cell = _read_text_cell(element) elif cell_type == 'currency': - value = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, value_token)) - currency = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) - cell = Cell(value + ' ' + currency, - type=CurrencyType()) + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, type=CurrencyType()) elif cell_type is not None: - value = element.attrib.get( - _tag(NS_OPENDOCUMENT_OFFICE, value_token)) - cell = Cell(value, - type=ODS_TYPES.get(cell_type, StringType())) + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) else: cell = Cell('', type=StringType()) From 15a5bfbe11bda4c4c2e64354bd6905ba1811218c Mon Sep 17 00:00:00 2001 From: Rufus Pollock Date: Tue, 7 Feb 2017 09:38:33 +0800 Subject: [PATCH 08/18] [types,bug][s]: fixes #163 type_guess error "has no len" on DateUtilType. This is happening when type guessing using the `DataUtilType` (in my case on an xls). It is caused by this code https://github.com/okfn/messytables/blob/master/messytables/types.py#L188 ``` def test(self, value): if len(value) == 1: return False return CellType.test(self, value) ``` The value passed is the value of any cell and may already have been cast e.g. it could be float or a datetime -- neither of which have a len attribute. I suspect in CSV this issue does not show up because everything is a string. However, with xls parsing this can happen as xls already casts cell data. The solution was based on adapting the test from the DateType above. I have **not** created a new test case. However, these two tests in dataconverters were exercising the problem and failing. They are now passing: https://github.com/okfn/dataconverters/blob/85cf1a844a4e6b0b7de349a9c61c7c1c322d34e1/tests/test_xls_json.py#L24 https://github.com/okfn/dataconverters/blob/85cf1a844a4e6b0b7de349a9c61c7c1c322d34e1/tests/test_xls_json.py#L33 --- messytables/types.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/messytables/types.py b/messytables/types.py index ba017f3..356811c 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -185,7 +185,10 @@ class DateUtilType(CellType): result_type = datetime.datetime def test(self, value): - if len(value) == 1: + if not( + isinstance(value, datetime.datetime) or + (isinstance(value, string_types) and is_date(value)) + ): return False return CellType.test(self, value) From b50e6ef201fcd8fe87aa6fc3ecfa0c039826eca3 Mon Sep 17 00:00:00 2001 From: Rufus Pollock Date: Tue, 7 Feb 2017 09:59:28 +0800 Subject: [PATCH 09/18] [README][s]: fix broken badges at top by replacing with shields.io or removing. --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 75667cd..52fc19c 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) -[![Latest Version](https://pypip.in/version/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Downloads](https://pypip.in/download/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Supported Python versions](https://pypip.in/py_versions/messytables/badge.svg)](https://pypi.python.org/pypi/ckanserviceprovider/) -[![Development Status](https://pypip.in/status/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![License](https://pypip.in/license/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) +[![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/) + +<-- download counts not working on pypi atm https://github.com/badges/shields/issues/716 +[![Downloads](https://img.shields.io/pypi/dm/messytables.svg)](https://pypi.python.org/pypi/messytables/) +--> A library for dealing with messy tabular data in several formats, guessing types and detecting headers. From b7faca40d2eb888cfe88a0505768a379c1b51152 Mon Sep 17 00:00:00 2001 From: Rufus Pollock Date: Tue, 7 Feb 2017 10:00:35 +0800 Subject: [PATCH 10/18] [README][xs]: remove download counts badge in html comment as html comment not working in markdown on github. --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 52fc19c..5d83641 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,6 @@ [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/) -<-- download counts not working on pypi atm https://github.com/badges/shields/issues/716 -[![Downloads](https://img.shields.io/pypi/dm/messytables.svg)](https://pypi.python.org/pypi/messytables/) ---> - A library for dealing with messy tabular data in several formats, guessing types and detecting headers. See the documentation at: https://messytables.readthedocs.io From 9cb960a03ba045ded4d6469d065a20530d20484e Mon Sep 17 00:00:00 2001 From: chfw Date: Tue, 7 Feb 2017 09:26:21 +0000 Subject: [PATCH 11/18] rebase master branch --- CHANGELOG.md | 3 ++ horror/ods_formats.ods | Bin 0 -> 9960 bytes messytables/ods.py | 111 +++++++++++++++++++++++++++++++++-------- messytables/types.py | 43 +++++++++++++++- test/test_read.py | 108 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 237 insertions(+), 28 deletions(-) create mode 100644 horror/ods_formats.ods diff --git a/CHANGELOG.md b/CHANGELOG.md index a3b1d91..62bead6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +0.15.2 (unreleased) +* #: detect ods types: boolean, currency, time and percentage. support repeated columns + 0.15.1 (29 September 2016) * #158: Add CDFV2-unknown to MIMELOOKUP * #157: Fix for Python Magic API change diff --git a/horror/ods_formats.ods b/horror/ods_formats.ods new file mode 100644 index 0000000000000000000000000000000000000000..fa1f7ff9fceef1586e0b071d503a87f919be99cb GIT binary patch literal 9960 zcmdU#bzEFY)2K;s34|aC76=~PT@u`bJ3$Bc8Qhsb2(H21J-AzNw+!y?1b6sg-!Gf& zzMH-G{&{PD$L35`PgkFQ>U3AjNiTRjw`qM!{K|O5uUqM-zSO7py z)&N~=YjYEQU66^DCB40+0iBgD&;&?lWeu=2u+q1)09bAZzdttKZ+8gL z_j|{p1jGA-zO0m@FbWC^78VvB9^R{0uc)c1nVFe+cz6T^1SBLR6ciLRG&J<}_07%A zfk2>(iwhVG4hRT{h=@o?NJvjlFDNJ|D=Vw3t7~g(>*?tk9UYyXo?c#F-rCwaK0dy^ zy?rS0?yk5SoCF00+b1scR?%^4cVg0Zh4Lxv(93DPVQ*!&M;CX=oS*82K;?OoXzpJO zErlnPaWc5lh?s}zYfrYH(;ChMx86wH8Jt&WHoCGIN+erpI*;)lwzrN$MzI=?iwMn( z+}agf`7f+vlOweu>+!?9+mIS90dz7|F@TC=wk2 z&m*E@8lJj?0u<{oE!@MFj=`04oZ)f^X2Q%jqgtzV6*d-@=0@}f{e~aYI&3fo&-+<6 z0OW>77Ml83-ThW-@-AFTg}60V`vHNK`${z(Xc8j3AZ+)bagq6&lEa(pz(MGowNDvy z4!pG;R+4p10O{G+9{J-D;6YQ>UExMdQx$phpm=)em!hV}mQ-A37v|^`N#t4OdcpFS z!^Hv@BCIZwHy0Z!d}bdsaA)JFBi7fvFs_rk=$i;5=WowM2a9FMJP2+81U3rp`^PrU z0~ed6SG&Nukb(2e>6~)bPR!L$U{&el@NspXpoogs0dd5mcn)$m=nUP+oO~ZBk(wUo zGO^Yx;t*$!4Tk$!aIz5OJK^{l9{KT`Nk$Z1%fT|`9R^87PAj-u;0np$J(2#z{b{48 z`C4;D(0@RI-{x9T&~g1JYI%Otogwa8_QG<5lOGL@0`22z2+E*J_H7re#geL9QfBoh zULOw?s~20>xuU0w8%L0nqS+VR+@a6#>oUzhc*ye~>}H-)23v%+rb_Z+>cU5ttBt6F1he`2?iC|5{Hn7*G3(Tr+x(7w)2-{%u;IyddRnv3kb z#;4+9*^blWEw!C}sqp?u?0CV}di|gqUmRUfBmLlc*!o+sHgho%Nz7&Hq$2aTVGaWn+!FI>v<7*_ReFOR+sBtI&KKm&M1yTw0bu}LI=FI zC{bM|fmGbHx7G@q23q2U1rWfKXyGFn;iP^yvdpyW%F#-Ks7tBbh|7tsiutdDd40X4 zGbJ7@V=VR0c#&Jb+jpQ5*d4toJd1zD8xek3j#{?0;9~KZ zphmV_-2*LI0n9EswZvb4m+N}lT0PpW5PvKW6kY{3$N_~3Rwfbwh(&Zgx5V1e^K2Gm zdFi30OsmJduFsO-{VrV4-+%0RS9hoxlNjtC{$3qxCCU5rYvuclbYldn9W-(@ZSELA zU`Zm_U!QR8^oktqxd;yf-AG?eV$&k+YE-2cMz3ugh?d=kIof=Cp&Iu%tf@LSAn&%o zG-tY{wmm^`{>)^F`Mf;0k5C#RVc`z`++K!#jk}=-?+D(*yw$Qd1T6_>4$N<%l4O0S z>Tdes?E=i5M>9QzsKbKM>sqZI6*>Z-LRut!SapiZJ{p+^j|W-?Mkb%y*8J9$uEX>O z@@{QI`${g_%PZB>vybf-R3_U&eHQhw9{Yy}Q2TDwc059>bsOrCra2;uOv8%yAjKud z%U%H|?K>d?VQB&$FGWukiI$_qwCKDwXk! z%+qG&zHiE0FV|$sg3UTQs7KFc3(sBpCB8ZuiNy_c@y(eXZU6{B+N|=BMKkuz_Mba# zVIygkvy9Aivd#vCRHN~oFM$6)+}-VrH$|W9Yw7tvRk|lUl(?{rP?3O+_kSW40s_Jx z(eL5T4?(#<0RliE6H6l?ouh^MuKGMMjSJJg$wT<#IJZ`qWkbMdEx3Sx%3>B9Q)B+I zP&XT?)H9*0@p-7)s%;4~Ec?@e4KT5IBap-O!q#?S>4>$q!(FD zV7i@Ej{y6IhBW+T&3-&uR9;Wz%9JI;R|IWOy-L*zi?*F<&9od&C}Yzmi?R;q=%wW( zJYGn^uM8M;uqfTA_TEubP*Z4s5K&Oy<|v?VvMXTXM+P9Q{? z|7eVDZl`r63UTn^fASXO?}zq$Xr2W&Vwe3j9?53==Q<9Q2x^a3NxB#V&k>0WS&!)K z*Bidcf<(}VpVRr#N}v&h-QXX`F)l~W$_0N|xofS|+26S{>k1)b`tlCPIZFu2T=c~! zjjh)H@9>cX&DLEohj{|?-`xpPn2a_g@2H#EdA!yb1goH8u|9}Sqq*){b>~(1R``>y zzL~LwxCBjQ2@HJ^T0qQPlue85*}(kvh+|HuA0IM!it;F?MNwTF)@o<7gkUJ6b12KS zE>aJ{Z{!(`?{rvIf~#AJqf%;xEvEAzB|g?=&XA}NAJws3Y5>;d&Yu?{NH=wuwK<&m zfjeAfrE%hK<;dxS@e6ofroX83#8PhA3cAcDX-CaaHBIR!a;(*7D9pkdwm@Q_Uznyd zBo7_#^SE{A+xO;ANu%1hAz#$!dYvcE2*4YP$x_HEVjW=Ft(W`iR!BtU)||O*BSsT< z{ObFw{0=d4Qh5`qH%&|PdS2*Wr%a&NYiu)`Hj<3ta*g!kz=Wbrc>-=ytN{T&0*2nA zYUwvD8Q2StElcxIpgF(PXI&!-0I+Gg$ZGt1SMzGKXDq&2^`D{nROye40^1~uPVXK! z&R)3XMzrD~x9yT0JQhK8rZY6z%8L50c}g0iIiPMg~ixd4Fu4%cC;6t;*y{*Cp~0TG=xqi7f!%nnm* zQVYY^!)hd@TO=Gtj>ju>L(9oI3%(rPx{y4Kt^`P)Dy1=YMSvS^eN zdX%CUlw+g7oS;K?iAHn`h4;ct@&NDmZd6f-tOn2***ej4U>K&A!~*Br784NUsuNEr z>Ch*;I<$8Ev|%LSdzE=Bl{8O~sw{>MeUV|Nwwy*d(_DdTt!HaTCmXRRZPb`U`kjOH zNeLd8Btv6j(B*iTB+X8LApaq={LS9eDvR~qBK_8C)^m^XVaC8_84tG$z(=l9XRVWA zHf(Z}UNe3b3N)44OwNOqMJ{K}%Ke6?o$iBbajA6E1Wq$bEvg~=38yn8V}~|@O3e6u z4KF+9%4!)}ofmxESA3PB{El|cwkF8VE(>Ex*|kyJ&4lK)EEvxOaON9@_kC6J%5Qc~ z@hzBSK$A&Q&Jo$&t)$N1L@s-8-Tj*0`I_o2oi;#l`bJ$~xRn>qxU@wZ=wdm*2)^kZ zH{a*JT*?qL^1HimCD$!>F-)m+G~}U)B<^jztw@kF8YXZ!X_2j{T`QQiNjNQBg&9rd z+T874qx{;7{&5HSX^JabyzMd+Pq$HoFFKefF>s$w58LvKX;XcK8nv@V#RT74@^(8D0&9<-|(7Qp37^xHY@R9%Xj5n0zTgUN#3KC(?n zB4-tQ$!K&41ERRGm^ZZN=M$I9cZa>DP~r#~V*bHGvn&mZVxl_QNm?}6bwjhe>{jAD zPVIphadT>O-zvyCo|p)YoFtwsABkQ~U9R0aRDmjpU!fA?t|7|!5R51`pTSfsyu>VX zP#zfZB$jauO@ldl^WjmywYm)~xm^kfi$rs0sb$B1Vg>tL8U^sVob~-NlV%w)TMdbC zrC=->EeFIvdpNbvilvpYWF45vwCAWWm}`tdGbHO8KK3DUtsB`8tObVQQiTNSqLS(qq4eo`OC_nh zHWa~&j&Ikt_AkPfwAf_%ZKpB?_ zl~!AS+K+TkhQPLxSgVZElnj=G8GZ!8T<tdmT=4&i7M1r zGsDo-Ip{lBNF@Y|3~Ogq!olOM5ldQ2jb(fc-b7X3*+wi)>P$99+`0JQwIaoGTy{IPi-UsNJ+hf)v80n@IuhJ8W+E(T_9!1FPvuw~U z#Tu2T9-4D`W>CmL_COYQw)m@$F%h3lNdyldZ5_Ls^*NhJ z)e{{Cy)>$3&tW$jcR>?iRkF8(6S#5@7RMQsWa5wMKqVfK%6j|co?Y2Tx!V(+RzEeWV0 zcrFP}cpULaDqw3z96mYCKSVS}WBk_kR9aX(BJ`VOEY0LfK>_oaj)l;eP;&uAOVm-R zU-jMA7^C)y21fpxcU(T!Z6Cz{49BNbtTNH3wMR{Dhe#I~$Lu7E-ijCs#_b*YV&<8^ z_C^^_Kx7()WLCE=p~23Drp^Gz&5v@`}!=X{JC};K(rf zM%K*vWu@8%*g*3|#DsEIKp|d0xW*qqT`j=a+4Kk=Q`>^B+V*ViAoNWi1_$;ox6!A~fP*b`CYl;_63Mwlvg~#2NUOT+wT;snaEe6RWbOG%A6y5=ygLC`Kr#0d5C6T~yq~*!B?3&p!6LNrwXcbg|lr zWj#lOG*&jsP7to-=gW}TRoXjv^G3e1`grEvqnWHX#-17(-AVPSFFtGr=wPl-VAr8g z)_6)+SCT`5oO|&R!kH&;&ScN#xmJ4%M=?FzeFkMywG|hugn1q15()MF+k{!&qnSCV zzs?FrxsS2KK3BRLKQ3)_8);YsfOB@*&H-l>t|wFlPu}KnS?z_7GhdF=+Nq=)rdHTD z)a*KaFP~0*QGu1j9yNGvFy4&}8zeeQ7wO2y?=lBo=XbtU;3jG}fkNUTOp16EwOiPs zSW!n-u+&)iTqT?Q1mCv6V1#ad_iHrQrt#;JxR!v55ED9BjWEy;Hq zwEK=y2#B0L_CidROt9pZ}+}f(*vitPorU^4C`hUT;;0S2X7(lf#cY#b1+cg+@^2@ z=DTqt>ubJTEr>B%F5dfOG(6u9S2iyqb9>!V?ZS4WlB~^ ze?oPNYca|O8%`t^@z^bcOrqEK8Ep7e#nFWJrdd;3@(BHA=(C#IA}sB2B57|@;z5*; zM!u~$-A2zSm}|V~=Cz|!)=n%TeT9yyj~#_GkV(D5 z+O7xpwFV)-8<@S~)D=oZ+$ZNNOqm{v#lbG1i$SoJnTK9zo>vXWo=MgiuHb`JNftnt zU0l#2y&03ftElS(D-*wj7*FL&mJ`pwuBYZjx~Xa-EPlH;qb8~LA*pKCa(%j$b2;}p zO|;~@L3J=m0JVe+D^)O$REfHjV++rB6t>o9(iQJ?x&uid85aQ(%_S0dwSR2Pi^&&LQcPu2$A%1^!kcH zYc4$c+Kz{H2-gqE5@jWH!G9hoX(q3+y3hc6{7FFJ|&`18?$5@NmJWIx`3A|okhgZ+E_6dtUfu|$QrnD z=z^29ienC9f`Zf~&!qZI$~15lTVvu8qgS396d>S@6h_durQ+MR$B!q)0U?Etoyy*4 z+r-i(MAdL&z*QM~6Yq3zr0Lly19-T3@{^*%)(0FCi2Pv0MBqZvog471{PS?w+c|fT zP(MyQd!_E@-GwmH3FlRC*ihxMxemirwdQt8-P7)g9wOV)7J*uX>#91Mw8TGTwd!7m z8}pi63;9zS_pp)(sfcsK+{lsq8Mdl#kaSnjEkg4VyA-`GK0OI`=v*w>+NEP zHZnxRNF16X4W;;CSxS8ns>vGww_kJg+DGOTK%Rf|tt^%ODuCoApN83CT?9|uH4qL~ z;`i7G!D>}w%JJE=u#9am51pAYD^Rmx@YnC;tE}R*t7uWECEB}7*Q)P+qo=O`!l`N} zDz%$wwN`4%4Ihp;aHXi-M22b3lfCY0Cnb}I+ICKTl&CDKXp2faeljHQ<#E}=UN^yj zgs5^t9UeV_BWswS6fSHG%VP{NqY@1CsA$K+y0gtaRgJ5mYvUUQ%kkjDU)tD3T|(X z_78HZ@#4>3Ufj_i?k}rugf?!MsJaet&d;yg-9@Zi>nysieY*A%tBdy2lJZwYwLj*HR+1>1A%kffwUhL&!1UP2fEPE#zlM$(d-eqOs9o>WgM2Ugt<9KNkt8GRg{ zXWueWvf*NnK4-_=%#zt-S8M3V6qzfUWE#6&4FY*UjS)9--rT??ew2~ z40Ks%#lA{pF58X1!Ca~&s9PrOaImhJK-5$qN>F>6BW&!T%hYvr`*8$nE`HZn?p)#c6$nOejo%)Gdj@y^qsEn z5=_UCh@KITAcDhF>D5O)}1q5S(45OXoNY6vF5 zGcI=(8J2Mt^r0NhGrC%Tyt+&3is;L?v06&4`OO96FScWK6f4B$&Ai8#56Dfm#|f>^ z80t0{%`C4KwX}Lt&`4cQtG|j9n_$TEdd!lfRmhu(5vB}m$-Y?}R!5>@LAJGHhKBD& zlhNe2hQt%kCvWpNYHOr==35zdEbJIUY7z65;6E-Nm~15XMP~;rN3?G)%G086@!O9I zE{@O5Qv`I!j>BxNc~oe!c|j(i+l$x}XzLdV>(p=HA3|v$Ns2JWeJHIY`?pZ~NA3;) z(tQ|E2SrGRj51*QoL&2O^^F~nB(ju*2f{qlV|*rt*-|8tI33thLfsh~>G3Wp(}zy= z^1SXW!6nh)-MR>T@q#))I#^Xq4oYN7B9=>3+kw#9JIS9tmiJ{8jGWpes}7H4b99TW zsXC+fY7S6jqExAn0RUb2U48;1rb0+_RF5exs!Vy!DENInKcc{8Bia>Zc@a5{v;xy? zkue$lOfJwljY|AY>&DPCY^Ksu{jYaVSWCyl3M!tbqbCc#vria#5<5t(RhKt%2KQpZ z4+T+Ddr1ZMMgx;(S(`Q?U2k^pMS52jLl0%KL{KJpL_?+C;}LMj+bT1c*)}?6J8N_Z z2Q5q1x`!<$D<+$J0QQ}5q%Y5V+?g~rvUYV0{T_yNZtPE;Le?)WBPb0ct*UU;N+wbB zUVX^5fz;%x95^vtc7@Xibn>570S=?T4Pnz{Dd%5@PY zMQsjT?Tgoku(x1+;pe=9&Y*~7J6_bv2Kij*9c3@?SYPC6@MoAegtuf|9i}{zoW@B` zN9a$3DO3rf2rx!VAE#0f@(-Nhp{hB9#%lWGt*J&Ihoj;V6qNgY0`&LOCl3?V7P^)u zh5#Ul&ep)NKce3f%<%NXK93JzVB!ruEUYbMPmO7EGpnyEEd62Jv$@9L2xYm9yshMK z%S9s0^TFluySn&Y0i`)DiY)rC$$G?$D>kXQ>gZ}Rr@5vkDya$M2o6lanaNbO4wh7$ z+Mj)B*-^{~%hD(viGZyS@D69GxZGaI{LMhR z(Pij*`5~c_Xz@c&DL*qFfxUMh_)qu!$a3F%>`!^fNDEq78k!i{+5VB=1~Sp#=dhZ(wUVi^)2=DLyy$U84x<&vXy?_bGLf0DjFEzpa2W4w3Ydh-)C;UI^0YSPT zJK(>24*#FZv$Zm^1pxmKn*Zc;kd>ABznAw<^?T5=*0ltf|8M>NsjaZ?|EILD;#*pA z84>zF``0~A3l)1UTZx|5uT#?$MWy!V3fS1@e4`iR_gty~!yxNLQ{G-p$sOh7DVD_H z)p2t(e`z)aA~a43l0Pp&r$1Cv^kl0yc(P~^VCdjkAbJ*fES<=0;%nik`k$oLZrBY7@}90Qdhhw4*)i3deW(?-6^A!Y z%ykkQLWl@=82gT(;9|Zy{H6Z%^=kkF?JksR4ySTu-7~&nj(7g=BxAoc>9)g;g5$Qz z6cJ)NCqwZEkKmYFy&#H7!saB0<#(b-&(L1Jv|@*f?~HI?H?XYZwjAC-yCXpM=Y<-Fqs-oV6VdX~)|2iZ60M4Isd%rs6*L>gK zHGdt#KFnDE6ytkCzsz6%UGdj|{V>A)DK_^$`8nSGyWX$#{{sYnD%RuO6aRmM``@*G zh2CE)m`(84IPpXCcToJZIH?oSCN`fp1=0sMO_fAvqtZ>;*7y9;BYwF09}%!5r2qf` literal 0 HcmV?d00001 diff --git a/messytables/ods.py b/messytables/ods.py index 7b03d74..e9d56b4 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -6,17 +6,42 @@ from messytables.core import RowSet, TableSet, Cell from messytables.types import (StringType, DecimalType, - DateType) + DateType, BoolType, CurrencyType, + TimeType, PercentageType) -ODS_NAMESPACES_TAG_MATCH = re.compile(b"(]*>)", re.MULTILINE) -ODS_TABLE_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) +ODS_TABLE_MATCH = re.compile( + b".*?().*?", re.MULTILINE) ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') -ODS_ROW_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_ROW_MATCH = re.compile( + b".*?().*?", re.MULTILINE) + +NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" +NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" +NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0" +NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0" + +TABLE_CELL = 'table-cell' +VALUE_TYPE = 'value-type' +COLUMN_REPEAT = 'number-columns-repeated' + +ODS_VALUE_TOKEN = { + "float": "value", + "date": "date-value", + "time": "time-value", + "boolean": "boolean-value", + "percentage": "value", + "currency": "value" +} ODS_TYPES = { 'float': DecimalType(), - 'date': DateType(None), + 'date': DateType('%Y-%m-%d'), + 'boolean': BoolType(), + 'percentage': PercentageType(), + 'time': TimeType() } @@ -102,13 +127,13 @@ def __init__(self, sheet, window=None, namespace_tags=None): else: namespaces = { "dc": u"http://purl.org/dc/elements/1.1/", - "draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0", - "number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0", - "office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0", - "svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0", - "table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0", - "text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0", - "calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0", + "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0", + "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0", + "office": NS_OPENDOCUMENT_PTTN % u"office:1.0", + "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0", + "table": NS_OPENDOCUMENT_PTTN % u"table:1.0", + "text": NS_OPENDOCUMENT_PTTN % u"text:1.0", + "calcext": NS_CAL_PTTN % u"calcext:1.0", } ods_header = u""\ @@ -129,19 +154,61 @@ def raw(self, sample=False): block = self.namespace_tags[0] + row + self.namespace_tags[1] partial = io.BytesIO(block) - for action, elem in etree.iterparse(partial, ('end',)): - if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell': - cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type') - children = elem.getchildren() - if children: - c = Cell(children[0].text, - type=ODS_TYPES.get(cell_type, StringType())) - row_data.append(c) - - if not row_data: + for action, element in etree.iterparse(partial, ('end',)): + if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): + continue + + cell_type = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + repeat = element.attrib.get( + _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, + type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get( + _tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, + type=ODS_TYPES.get(cell_type, StringType())) + else: + cell = Cell('', type=StringType()) + if repeat: + number_of_repeat = int(repeat) + row_data += [cell] * number_of_repeat + else: + row_data.append(cell) + + empty_cells = [c for c in row_data if c.value == ''] + if len(empty_cells) == len(row_data): # ignore blank lines continue del partial yield row_data del rows + + +def _read_text_cell(element): + children = element.getchildren() + text_content = [] + for child in children: + if child.text: + text_content.append(child.text) + else: + text_content.append('') + if len(text_content) > 0: + cell_value = '\n'.join(text_content) + else: + cell_value = '' + return Cell(cell_value, type=StringType()) + + +def _tag(namespace, tag): + return '{%s}%s' % (namespace, tag) diff --git a/messytables/types.py b/messytables/types.py index 356811c..39ff098 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -103,6 +103,27 @@ def cast(self, value): return decimal.Decimal(value) +class PercentageType(DecimalType): + """ Decimal number, ``decimal.Decimal`` or float numbers. """ + guessing_weight = 0 + + def cast(self, value): + result = DecimalType.cast(self, value) + if result: + result = result/decimal.Decimal('100') + return result + + +class CurrencyType(DecimalType): + guessing_weight = 0 + result_type = decimal.Decimal + + def cast(self, value): + value_without_currency = value.split(' ')[0] + return DecimalType.cast(self, + value_without_currency) + + class FloatType(DecimalType): """ FloatType is deprecated """ pass @@ -134,6 +155,25 @@ def cast(self, value): raise ValueError +class TimeType(CellType): + result_type = datetime.time + + def cast(self, value): + if isinstance(value, self.result_type): + return value + if value in ('', None): + return None + hour = int(value[2:4]) + minute = int(value[5:7]) + second = int(value[8:10]) + if hour > 24: + return datetime.timedelta(hours=hour, + minutes=minute, + seconds=second) + else: + return datetime.time(hour, minute, second) + + class DateType(CellType): """ The date type is special in that it also includes a specific date format that is used to parse the date, additionally to the @@ -198,7 +238,8 @@ def cast(self, value): return parser.parse(value) -TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType] +TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType, + TimeType, CurrencyType, PercentageType] def type_guess(rows, types=TYPES, strict=False): diff --git a/test/test_read.py b/test/test_read.py index f4b73d1..c09a727 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest +from decimal import Decimal from . import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal @@ -22,6 +23,7 @@ import datetime stringy = type(u'') + class ReadCsvTest(unittest.TestCase): def test_utf8bom_lost(self): fh = horror_fobj('utf8bom.csv') @@ -188,7 +190,8 @@ def rows(skip_policy): row_set = table_set.tables[0] return row_set - second = lambda r: r[1].value + def second(row): + return row[1].value assert "goodbye" in list(map(second, rows(True))) assert " goodbye" in list(map(second, rows(False))) @@ -308,9 +311,9 @@ def test_read_large_ods(self): assert_equal(6, len(table_set.tables)) row_set = table_set.tables[0] row = next(row_set.raw()) - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) for row in row_set.sample: - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) def test_ods_version_4412(self): fh = horror_fobj('loffice-4.4.1.2.ods') @@ -334,6 +337,94 @@ def test_ods_read_past_blank_lines(self): assert_equal(rows[2][0], 'Jane') assert_equal(rows[3][0], 'Ian') + def test_ods_read_all_supported_formats(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = row_set_to_rows(row_set) + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0], "2014-11-11") + assert_equal(rows[2][0], "2001-01-01") + assert_equal(rows[3][0], '') + # time formats + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1], "PT11H12M12S") + assert_equal(rows[2][1], "PT00H00M12S") + assert_equal(rows[4][1], 'PT27H17M54S') + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], 'true') + assert_equal(rows[2][2], 'false') + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], '11.11') + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], '1 GBP') + assert_equal(rows[2][4], '-10000 GBP') + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], '2') + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], '3') + assert_equal(rows[4][6], '11') + # Scientifed not supported + assert_equal(rows[1][7], '100000') + # Fraction + assert_equal(rows[1][8], '1.25') + # Text + assert_equal(rows[1][9], "abc") + + def test_ods_read_all_supported_formats_casted(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = cast_row_set_to_rows(row_set) + date_format = "%d/%m/%Y" + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0].strftime(date_format), "11/11/2014") + assert_equal(rows[2][0].strftime(date_format), "01/01/2001") + assert_equal(rows[3][0], '') + # time formats + time_format = "%S:%M:%H" + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1].strftime(time_format), "12:12:11") + assert_equal(rows[2][1].strftime(time_format), "12:00:00") + assert_equal(rows[3][1], 0) + assert_equal(rows[4][1], datetime.timedelta(hours=27, + minutes=17, + seconds=54)) + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], True) + assert_equal(rows[2][2], False) + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], Decimal('11.11')) + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], Decimal('1')) + assert_equal(rows[2][4], Decimal('-10000')) + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], Decimal('0.02')) + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], 3) + assert_equal(rows[4][6], 11) + # Scientifed not supported + assert_equal(rows[1][7], 100000) + # Fraction + assert_equal(rows[1][8], 1.25) + # Text + assert_equal(rows[1][9], "abc") + + def row_set_to_rows(row_set): rows = [] for row in row_set: @@ -341,6 +432,13 @@ def row_set_to_rows(row_set): return rows +def cast_row_set_to_rows(row_set): + rows = [] + for row in row_set: + rows.append([cell.type.cast(cell.value) for cell in row]) + return rows + + class XlsxBackwardsCompatibilityTest(unittest.TestCase): def test_that_xlsx_is_handled_by_xls_table_set(self): """ @@ -573,8 +671,8 @@ def setUp(self): PDFTableSet(fh) except ImportError: # Optional library isn't installed. Skip the tests. - raise SkipTest("pdftables is not installed, skipping PDF tests") - + raise SkipTest( + "pdftables is not installed, skipping PDF tests") def test_read_simple_pdf(self): with horror_fobj('simple.pdf') as fh: From a01e9895e8fb546538992237e7e7cfe15c7be6c3 Mon Sep 17 00:00:00 2001 From: chfw Date: Tue, 7 Feb 2017 09:44:08 +0000 Subject: [PATCH 12/18] remove magic value and optimize empty row detection --- messytables/ods.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/messytables/ods.py b/messytables/ods.py index 3a45109..ea7c86e 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -26,6 +26,7 @@ TABLE_CELL = 'table-cell' VALUE_TYPE = 'value-type' COLUMN_REPEAT = 'number-columns-repeated' +EMPTY_CELL_VALUE = '' ODS_VALUE_TOKEN = { "float": "value", @@ -160,7 +161,7 @@ def raw(self, sample=False): continue cell = _read_cell(element) - if cell.value != '': + if empty_row is True and cell.value != EMPTY_CELL_VALUE: empty_row = False repeat = element.attrib.get( @@ -193,7 +194,7 @@ def _read_cell(element): value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) else: - cell = Cell('', type=StringType()) + cell = Cell(EMPTY_CELL_VALUE, type=StringType()) return cell @@ -205,11 +206,11 @@ def _read_text_cell(element): if child.text: text_content.append(child.text) else: - text_content.append('') + text_content.append(EMPTY_CELL_VALUE) if len(text_content) > 0: cell_value = '\n'.join(text_content) else: - cell_value = '' + cell_value = EMPTY_CELL_VALUE return Cell(cell_value, type=StringType()) From 385d229dc6d2f81be91f7d926ae33ba4a280d90a Mon Sep 17 00:00:00 2001 From: chfw Date: Tue, 7 Feb 2017 10:10:06 +0000 Subject: [PATCH 13/18] better comment for scientific notation --- test/test_read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_read.py b/test/test_read.py index bfaee5b..ec4dbdc 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -371,7 +371,7 @@ def test_ods_read_all_supported_formats(self): assert_equal(rows[0][6], "Int") assert_equal(rows[1][6], '3') assert_equal(rows[4][6], '11') - # Scientifed not supported + # Scientific value is used but its notation is not assert_equal(rows[1][7], '100000') # Fraction assert_equal(rows[1][8], '1.25') @@ -417,7 +417,7 @@ def test_ods_read_all_supported_formats_casted(self): assert_equal(rows[0][6], "Int") assert_equal(rows[1][6], 3) assert_equal(rows[4][6], 11) - # Scientifed not supported + # Scientific value is used but its notation is not assert_equal(rows[1][7], 100000) # Fraction assert_equal(rows[1][8], Decimal('1.25')) From edf6fef643990a07dcd986fe0c4bf554c75944c1 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Wed, 8 Feb 2017 13:40:37 +0000 Subject: [PATCH 14/18] Bump setup.py to 0.15.2 version for release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16fdb73..40fe7e0 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='messytables', - version='0.15.1', + version='0.15.2', description="Parse messy tabular data in various formats", long_description=long_desc, classifiers=[ From 0eb9555d888af2145d36ee15cbaceaf7a3559411 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Wed, 8 Feb 2017 13:42:27 +0000 Subject: [PATCH 15/18] Update CHANGELOG.md for 0.15.2 Add change not listed, and add date. --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62bead6..5e8f8cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -0.15.2 (unreleased) -* #: detect ods types: boolean, currency, time and percentage. support repeated columns +0.15.2 (8 February 2017) +* #165: detect ods types: boolean, currency, time and percentage. support repeated columns +* #160: Correct spelling of separator in source 0.15.1 (29 September 2016) * #158: Add CDFV2-unknown to MIMELOOKUP From a4cf5e57855955da1cff4eb26d0448250ef3897c Mon Sep 17 00:00:00 2001 From: Krzysiek Madejski Date: Fri, 24 Mar 2017 21:56:52 +0100 Subject: [PATCH 16/18] fixes #168 --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 40fe7e0..4f8f8ed 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,8 @@ 'python-dateutil>=1.5.0', 'lxml>=3.2', 'requests', - 'html5lib', + 'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301 + 'html5lib', 'json-table-schema>=0.2, <=0.2.1' ], extras_require={'pdf': ['pdftables>=0.0.4']}, From ba346e098abd30ea3baabc7791fe885f09f2aa95 Mon Sep 17 00:00:00 2001 From: Simeon Walker Date: Mon, 24 Apr 2017 15:40:59 +0100 Subject: [PATCH 17/18] Add another Excel mime type --- messytables/any.py | 1 + 1 file changed, 1 insertion(+) diff --git a/messytables/any.py b/messytables/any.py index c497391..fd9dfc5 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -25,6 +25,7 @@ 'text/plain': 'CSV', # could be TAB. 'application/CDFV2-corrupt': 'XLS', 'application/CDFV2-unknown': 'XLS', + 'application/CDFV2': 'XLS', 'application/vnd.oasis.opendocument.spreadsheet': 'ODS', 'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS', } From 93f49b1367793e7459e3dcb07793828f4e0218d4 Mon Sep 17 00:00:00 2001 From: Sam Hatchett Date: Tue, 2 May 2017 14:08:14 -0400 Subject: [PATCH 18/18] fixes documentation for headers_guess implementation uses column count compared to the column count mode (not row count) --- messytables/headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/messytables/headers.py b/messytables/headers.py index 4434618..a50ebc7 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -20,7 +20,7 @@ def column_count_modal(rows): def headers_guess(rows, tolerance=1): """ Guess the offset and names of the headers of the row set. This will attempt to locate the first row within ``tolerance`` - of the mode of the number of rows in the row set sample. + of the mode of the number of columns in the row set sample. The return value is a tuple of the offset of the header row and the names of the columns.