From 6678b04aa8408ec60c42826573520266c71a9033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Wed, 19 Aug 2020 14:31:03 +0200 Subject: [PATCH 1/2] Import documents: list documents that cannot be read --- .../text/widgets/owimportdocuments.py | 49 +++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 1df27390d..a6e8fee8c 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -13,6 +13,7 @@ from types import SimpleNamespace as namespace from concurrent.futures._base import TimeoutError +from typing import List, Optional from AnyQt.QtCore import Qt, QEvent, QFileInfo, QThread from AnyQt.QtCore import pyqtSlot as Slot @@ -24,6 +25,7 @@ QVBoxLayout, QLabel ) +from Orange.data import Table, Domain, StringVariable from Orange.widgets import widget, gui, settings from Orange.widgets.utils.filedialogs import RecentPath from Orange.widgets.utils.concurrent import ( @@ -40,6 +42,13 @@ from Orange.canvas.preview.previewbrowser import TextLabel +# domain for skipped images output +SKIPPED_DOMAIN = Domain([], metas=[ + StringVariable("name"), + StringVariable("path") +]) + + def prettifypath(path): home = os.path.expanduser("~/") if path.startswith(home): # case sensitivity! @@ -79,23 +88,21 @@ class OWImportDocuments(widget.OWWidget): class Outputs: data = Output("Corpus", Corpus) + skipped_documents = Output("Skipped documents", Table) #: list of recent paths - recent_paths = settings.Setting([]) # type: List[RecentPath] - currentPath = settings.Setting(None) + recent_paths: List[RecentPath] = settings.Setting([]) + currentPath: Optional[str] = settings.Setting(None) want_main_area = False resizing_enabled = False Modality = Qt.ApplicationModal - MaxRecentItems = 20 - class Warning(widget.OWWidget.Warning): read_error = widget.Msg("{} couldn't be read.") - def __init__(self): super().__init__() #: widget's runtime state @@ -103,7 +110,7 @@ def __init__(self): self.corpus = None self.n_text_categories = 0 self.n_text_data = 0 - self.n_skipped = 0 + self.skipped_documents = [] self.__invalidated = False self.__pendingTask = None @@ -169,7 +176,8 @@ def __init__(self): minimum=0, maximum=100 ) self.cancel_button = QPushButton( - "Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), + "Cancel", + icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), ) self.cancel_button.clicked.connect(self.cancel) @@ -286,7 +294,7 @@ def __updateInfo(self): elif self.__state == State.Done: nvalid = self.n_text_data ncategories = self.n_text_categories - n_skipped = self.n_skipped + n_skipped = len(self.skipped_documents) if ncategories < 2: text = "{} document{}".format(nvalid, "s" if nvalid != 1 else "") else: @@ -536,10 +544,13 @@ def __onRunFinished(self): if corpus.domain.class_var else 0 self.corpus = corpus - self.n_skipped = len(errors) + self.corpus.name = "Documents" + self.skipped_documents = errors if len(errors): - self.Warning.read_error("Some files" if len(errors) > 1 else "One file") + self.Warning.read_error( + "Some files" if len(errors) > 1 else "One file" + ) self.__setRuntimeState(state) self.commit() @@ -561,14 +572,23 @@ def __onReportProgress(self, arg): assert QThread.currentThread() is self.thread() if self.__state == State.Processing: self.pathlabel.setText(prettifypath(arg.lastpath)) - self.progress_widget.setValue(arg.progress) - self.progress_widget.setValue(100 * arg.progress) + self.progress_widget.setValue(int(100 * arg.progress)) def commit(self): """ Create and commit a Corpus from the collected text meta data. """ self.Outputs.data.send(self.corpus) + skipped_table = ( + Table.from_list( + SKIPPED_DOMAIN, + [[x, os.path.join(self.currentPath, x)] + for x in self.skipped_documents] + ) + if self.skipped_documents else None + ) + skipped_table.name = "Skipped documents" + self.Outputs.skipped_documents.send(skipped_table) def onDeleteWidget(self): self.cancel() @@ -615,8 +635,8 @@ def send_report(self): ('Number of documents', self.n_text_data)] if self.n_text_categories: items += [('Categories', self.n_text_categories)] - if self.n_skipped: - items += [('Number of skipped', self.n_skipped)] + if self.skipped_documents: + items += [('Number of skipped', len(self.skipped_documents))] self.report_items(items, ) @@ -646,5 +666,6 @@ def main(argv=sys.argv): w.onDeleteWidget() return 0 + if __name__ == "__main__": sys.exit(main()) From 49d24f12bbfc178d8c0d4f545fcbe92c789a5c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Thu, 20 Aug 2020 12:58:06 +0200 Subject: [PATCH 2/2] OWImportDocuments: add unittests --- .../text/widgets/tests/data/sample_docx.docx | Bin 0 -> 4199 bytes .../text/widgets/tests/data/sample_odt.odt | Bin 0 -> 8289 bytes .../text/widgets/tests/data/sample_pdf.pdf | Bin 0 -> 8079 bytes .../tests/data/sample_pdf_corrupted.pdf | Bin 0 -> 7455 bytes .../text/widgets/tests/data/sample_txt.txt | 1 + .../widgets/tests/test_owimportdocuments.py | 75 ++++++++++++++++++ 6 files changed, 76 insertions(+) create mode 100644 orangecontrib/text/widgets/tests/data/sample_docx.docx create mode 100644 orangecontrib/text/widgets/tests/data/sample_odt.odt create mode 100644 orangecontrib/text/widgets/tests/data/sample_pdf.pdf create mode 100644 orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf create mode 100644 orangecontrib/text/widgets/tests/data/sample_txt.txt create mode 100644 orangecontrib/text/widgets/tests/test_owimportdocuments.py diff --git a/orangecontrib/text/widgets/tests/data/sample_docx.docx b/orangecontrib/text/widgets/tests/data/sample_docx.docx new file mode 100644 index 0000000000000000000000000000000000000000..a5b6a3aefc20d3400a75a7a7cc99ddb72e3c3fd1 GIT binary patch literal 4199 zcmaJ^2UJsA76k(YkY-5eMXErgiUy<^iqdA|MDT zQUwGN1O=jmhyj_zKl9}~GoH8d*2`LNzq{@}`<%Ovp$>ooOh!#jO%`DUHX=I`$BEDO z{;r+@k`knQ#a%re0Guvv>j0v*`PLJ9>-oqozp4*E4j7a5X>)W0cm`2%I1i_7HoSxG zZw>vr%{A^d@&m1(8y^pUR)&E4NsTp2Awy=BBzYZPNj6(3r@YqAvJoqrk12EnX(iw4 z)%{c|EA3a3!*|8NxL?tUzpgmd?`17Y3rrLDt8|Qv%<%s*!bN-|*vH>R@-JFke4GQl zT)j~eAzq$;QIx7j+bzWui_hnOF^@50O2v4q`$BPK(s_+t=V%5*u7&~bScxokuk1%` zNlV(>6OwF4fnJH%PisXL7&VS#PWXyEc_s8XUaP%d-`IwxA}FetlbNDk-Mcr`P<&TL z@J+$(w3+XA#-}5m<(ZzEKKY>N;|+oV6&N5qYj$kEa`1Wv>}E%`m#s%EXZeT{2S@5+ zls9*rW_mjuv|W<+oLpH#g-{&sMW69y>fLNtIc~UTze?poVdlyzYkppFtkwwd9ZN8Y zjy*}K&R*hXng2)!=s$E^@$p8PIy!l}l9;GS8S*BQpn*T29U?4s7MAfJE3*oms1?}( z#JQ-Rp3ZmgWBNwOCNt)>tcNIs@!<2HzS^5tXi#I(DHvTg!a(BgigBKcjkH%dPPStZKvCjM03# zmu6};l!b9X5d+8n@>XXe?9Rz4S$iA;`FvO30{{?IzQ4CsREUnGqBJURDR2t3nU0r4&y|)StiKBt!xJEbpIvH{e<6txciea~Dt% z8xQG#+fyC87*+Qw>tYieGu2CxF+>HBZ|;Z~S_(azJ)%^EZO8 zeZt0#wlzH%AhT#=E@5z~IbZLQ(8gB3_O@ZaTyHA5n zDXs~Mqf3k8)DjORqBpSK7ZyPEcb(o^u+`a<5=)}-*) zU+$$YlMD1ez%FY%tK{QJ2uBqQHK#<;jV!pl zs?cN%6JHt=t(;e1DPl|sRBJ|vZQDVHdf9tcocNZI6>#4D*bV8q@zM6!V3;i9lRYq= zhn*MVEPN(pU8@G(p?gbGX0xyJgU|KY)8PQyna==1gpiuhj-rRc?j|{^;t>bW1-1@&r zQQKP<>XA1E-%cPoy{1@SmoeJ%!Bslt(t65#I20(p0wW*y)a?b?x@u(>B zG-xwix3e;-4bQ5fNT|+`)fUfe9*>)me}z)}*PE)D}TOg5<)0R)1Zts5Di4V-aGV@StkLJx9X zgGFyFMXlp1P?5$8`_MypxoBmFu!m2LzdL1!LESWL$i_wxDS@c-ffU89yTS=)POkPNR6RRQXw66GB{LKw37-WYYxKwLpoW7h|_K zi|)Qdol|DbScrUFHmO8VX$!elFVYU{<6nSsP=K_Z0zEbAs-H}VR&MS$bbYO*KSil! z&($P0ZGx(}%yA-t{9|hpMR+u>H?)~{s|lUX&;{VRUJ+EWLlY4K>bxlk>b^iIAHDMd z7oxXFU{A9W3xOM4YT;35D?JsA^;l#(*BaGT`VS*LS={8r}%nN2T1a zyZs~}`kbyZvU1;@Kc%Qay;jHPWejxgvq0`hqGmB6WhD9iIwMIuJcKlfAH>#=p!}0~ zn22qWs2f9nAKw5;M_*r3=g&(u^65Saj$Ob*IchD6%pXq}zNmFIu$HaKX=m5=vA)Mb zE%op)(o@3r0ju5SySeby?xGx>1r4`qePqVNY1xY>L<*p#*<5XqmuYkwBLQYuedzNs z2-i%tLUY~BbsWeBOg*?#f}4LV9aI>b--4(%^F?}vZARDAzX=qlR77B>qrb@AUIstw zjV?oHXOiFY?!iqTvRWaqN)N&$D}}e&#L+Nre5c3{=<&~?7SDMugDQb=-h_{j7M1s@ z!%~Z4KO0ck>1xRh96(uT53r~g<0jgLM*X6#?*Qs-PN^vq{5toC5>9XU+ue(C(V`Q~ zcavrnqA;o&&s_3jd5fju$}tLuuEA#qYtomj;k?$~Pq!T{JsS%{1TVdTe-H9$qP`xJ zkaO{(W^HTDTz!XMG_BJVfN+dNy9Jz{Ql3aV_aBW8sb>GC-PyB0!QF2 zQrgvy$DikPxSg%^w8!NP#{x2}zsxYZk#n}=NvDGeesNX_pPNsZKs8nkIsp3ytz1Qt zsA(@?tu)PC)?Cjr7K>CTiFOTI!Q&Vl#!h!=KgG13+xA4#_i0r}!HnEeQPg3yF0zugTp;j|T7g0-CbDhQm(E=m_U-G=QV_zs-~w)T1ul8StZ0_`ID!#des#gKV35m!2UBh?3zr zgrU@8tQ2ufr7D0K%os>`wyM8_k2d#z zu5AwdH~6R<_#J+uk2aq`Q0R&V)y8BSlN_QjO-Q6G{Es~P*!u@iw$U_wTpPZ+^4q%v{%=S+i%Yy|1M#fA2mv000C4%v7*d^gG#6XaE4f-FEvIz}mvv z6yj=UYGh{zwg4GHENpF9oNY{)ZH*i)9GPwHOl?eTK~C1DHV|fr=?jSRFJz|@^PqPy z0KnabcT1*j?qqFjV`Kq#WPyBdGTYfa4^&o^Mnxh(y1fKdRz^bgc3%kqzyN^vZdseIe}>83neT>t+W^_xK<>uC3u|zLrgiu%&x3D^1{m*0UwtSr zvUpFWm-AVYMz8X%;zHqg@nSp7&Q(-7G3-qv$RE$2%8+ zJ|;_*adc~e@<{-#iEM{kKZ?BVrHe`4ExjFViUvHeLPT5D=ssusW>Gi76#BO&$fWWz=k+uKl57lyrbtEbTC=_1elmN(m4 z`l@0o_QynNnsgr=MZdALK1d9@=lkj?qZRUov6B=;8wrZmv=a8Gp8*14&8En`Of<11 z3xOQjP;^G0r$cKVtgi8TUcRE@**+)4ZT^tSkQn?aWf41NlVpV1w)33@!k z{ycN2|IQ%b{HYNSZ;u>O0`+j6J7qSwNyekKay-?q#Aa{9r_Dppy9ZnE^e_pGO+|km zT#?BPt56+5eJ>g}sW8y*3@@i?fkuptX$2W`)u>o zM#T4v%jPc;`b^I^^&dYzulA~@OHHW`4%DH~Ey7=XUdCppLxb<_j@ECTw$rXR?7=>1 zJ~olK{vx_G?lpsLIjn~**P;7YH%N%BE!fV;#uWS)++7y=caOKp$H~~}Z@v8vWd^o2f|&jd z^}Bm?gcw1b9RJ?i@4RB?VEf#`)bam<{M~)tdFAi@k^aUf7S=}3O&wXpEFjiKc8-5> z=?o&GpC;x9Lr%M=3--5w1&#NyzSKS)gmd5rLc;mJtLy-;If<+^~WGOs$LoM$}I-I zI_BEL0aC@Yu_sbXJ4` z5lKSDEKb&567UXwL^RV>f$XG3xPB!Q3|`>NO)En(s#0z6uOf@|5(!0A==Fhk6MayW z@8EzQsqnU>?|tktY#8&-5HS405H>Wu7=V(suD{$+@>$n=>1C|B+h>rQ(^$vB5=D&R zTlUXN+jab|HyhuWhi>pc7FRe+fJFu~pI=05R2-5%l`g*faw@)>kHX!~cf4=EoC7zP!hLX+F&^Y@TbbvfJoy|48l-lAgoE>vcp%&UF}VN+-TIu<@kyV- zFzC4Na;X99nG{x;oTTa}AFFrAff$@HWB`L6$QTMxYj`#42wY z99%zS?wK?M@akUl$(5Eb-xTd};gCflcPfrGRCp1&hu5}{xI~) z;2ST;V-sieobU;OmaN)<#YiPo0f{o;V_U0J{!ntLzY+}LBKeR ziN@s%8I+Wxv=v3l!^(MqFBo;4C`L-Ri#E9tH?`1mt%-Aq0$a|O_-q+C@vhsx*vy?V zcVLpC_Zrc#K$=2WNT8BW`Mo~7Q;lx_1Z_FM#^;ZHTO2K8t8?fAQ_nI6b7g+rPC%9f53N}T+9hHE385@&#riU1;-&_Q zN($zfQ@o>kPT5pD1FSobPatg!Vwur`WsS;bg>_X7Ee39qifGN!bW>SvIaC~x0AhK~ zWB%cdV6izizw@eg9lS^LXp|-4R)`}9Cyv(e8AeT#OKFehCQb$e(}>&Zxx)$566Wu1 z92^(rpl+Xh8Q zXvs|M{(RhSZ2!SE z)=}Aywm#T0!Av9kj^W0KUuN4^e3(B7i)*~s${I0bIv50Li3YObPpnPV);D3kg-;Nq zMT6gF5F0N>;_e}YZES9~4L++5l~HcaLSJvtVWFM@C zzONlKfh9-3fyO(ztjwZTsV!HjHSc;VZEK?v(6Uz9pPsA>FLArb&se9YYKG?JGHh}r zSquq{tGc^oep43Q)$6Jm)yP%%POXD#H!oXSwtsO;#I({Fr~DfG?z24hv# zw8U`=sI9(1oVIeKGOP&p@C7X$;1DYm)(4}#rqZmJ^SKenJaY3&S5thFm;l)+HYRt# zY}IcuKOOwWnKu)!{z9XCxk$2@3%MA=F#DwZOe}x6QK503>Z<#F*#^<*#27;;f|C5h zI`5<44Xc5caDuuOQeBx)JS{mj_m7R^WwH!%dp2Q%?7RD6m`A1_o&EJHPiW@h&Z6Co;N zpvSt=P7A4jB%e^(A1XLX7R>Ekf$DH%7EvW?vxG9r)xD4;ItyRx!67q8hHeXQLAZ7> zKxXaDcze#G|KklsaD{AoLd~#*sC+-i2c~?fdg`=ofjGqUfqkLifqYPryUQX-Bhe+x za+o7;R&|0wt$TH_^Amn=_t6ql!b#Xv#21K+=xlM?LtDmC^I`6R<(|cc>AA}<>#x4# zO+HVNlLNs^vFnY{D!scF83&)VAfRh#A0>Ocz5^uK!5`lDXkL6%cYruQ$CoRBb1twA z<21Mebx6xSPdT*zP~&1i@MM-phxHWG0$nqt<-fP#57X?C^7@gn{7;560ht)eS%0NBO|W1+~2Cmz0u<)*i7_WCjl zN;e~sRcj*$^|VPW38d>Duo{?B;!ZBR>Q%MlZOR=8kxe?eFfM&j{^An;sjjWAp5Da5 z!ayc@FI_nL(^%naZJbAqv}G)7!S=m_LxarQ=QownCyxvgKGi$dmA08(R!>>Moe$A* zM`+ExEnt~@zc4OidX&)ivBR?R)&BLxpo^Q3a&u?=B$rD+{d;~`Q{6hjx}yCc9$hec zR;T|w4N1a`eoH9&KA~+N;ti}o%ASGq>ecn-(dxT!vl=`x2{!mW{AjjKCR8j57==lj z(B5SpV!avFnqj+e3~bp{_H=YykHnJWm_8u%!5)E?hi9z|kigO@zNcqFb@a=PfeS!J_4QyUl5S9*aS@O6Mn*`8v9c4re#7ft`6xr%Q zMh(KOBnES+XJM0`Oe1lONkvjQ5RO@GNzZ{O8hm+*QO4_>w@QZ7=(R38BCEExF@Qi+ z31Z2I0*rzYZD***De6(uK@cwt=SZ%#CJc8FN++^EUvhk8Nr$ugTT=mC3(w z^S%^;$R0@VNUV_O@*Ld56%Y13(lVm1My#W!@`^E^KmFS7#+$}>o{A&?T;SN0LVkHt zaq(rEa`t^%bQy<3;_SRV6X(4;X#$gMd}J zrI?3uYNZ6M9ma&QUlj)Ub1=<1&myR>?6?lr%}e&+v0H0=uGo^asI&Ikb=@Lv0u`{F z0(Ou<6+T~7vWOaYf_=n{G|l+;TIel zeO7a}DYIM>Po&(_3<$(^>mDxceG4xoSxXJ;2d!eIZr*~HB^79S(ywl4P19MlrNv{t zo~|=kc+Mu-CHVCPK~P?;clLWyp#_0n|*v= zDTw4Laj8SHe>iJH8^UAkET|fj>sOkat;ULu=y0tCs|vvZ#@ek-q*{MQ0@z^XAkb{K zHwxE%ODo1e-Pdmn0>=u_7!n)vKNej=F;$EFqSr#&}~GY7oczP zQvJJx+p&Z!JaI1vDptl~-_u;TqvB++gECCj3K5b}Im7(|=04lm&+**YP1e-#Bm%}X zvH24*y!)N}8LW5Lx35Q~WZhvZb8#-CXgyn-m1-ZAU~H@SJqV-zNsIc zHx=kBc1gH9+-l%(lc_s8D}DxpB+O(X&3HJyc9r|Ch%DRoW$MLDC+^N#{b7!3c~{Xlsc!Vg&mAIqhD~-|PV~<^v{|P2QMz%?F_c@#w?ewg)!-7z zzz@~MXgV}`zVaMCU~TW<29R>h)r(cBt*}w%rZwe1$Kl2cBAZ=XRb4SbUg-@sWG^Fu z-AfT}!*e8Zr}MaYr0u^YI1CL|QkN@_T2Lj+e0!O*U3P!M>`}iD%azkQ2h}oeBnz#_ zy(A2GzadzfT`K=+sUd%24_-ckF!x6F8qrx>rSM)7sPZx$vXBA!9tWHgI0~8Fn^(;h zESG~yC2b3jNrJWgP!UXPjlNb~+_mw0f?P`icU0RUC{za^f3N)e_I zqr25|Xmp=L5-Yaf*H=h>&d47;Ma@}foF)|buw1D5!yk0TB{Hg4&9>(S1W+|P-f>b%G{Af(aR(eP;dl-JlgVg{da(O#af5w(RpQSyA z;}xWI_a>_Y^t-{Zm#he$x)o1$|092fZ^FzP)7WeTLxZa`BM8?+ZI-zZ2%g;I7%nxy0gu z(xq&*@pdJx(JO8}B}y+7532IGfnRf~7tlk-p9QcCF~O?b$m%QYdX7ttid|jCWgFai zGhOwQ3i7QtNB{9A3@kR_&vm|Am;PkmtA5JzzoPzK(E9;-a@+hw>2YxS3{t;JU^7`|5uit-?02puK&#P>oeW_hUI4^`_DXgTJ{g)M)==e{#DujGt;l$vHuOz z52gFhEO%P>56k`y%l}fo|4jC)lh%Gi_OlZHAD-`H_-C2-ZdLTdh;DP$e}mGkGX8xK ze@Fhj(z=s@f0z#9&!X_ZaX%-@Uwg#F{B>3KZ{*KW^)8YAu;AOba$Dd3C$0V)^>g&N hi?}~5{?>87jlatBaJM530KmBYncfEgJ`vs>{SR34uyOzZ literal 0 HcmV?d00001 diff --git a/orangecontrib/text/widgets/tests/data/sample_pdf.pdf b/orangecontrib/text/widgets/tests/data/sample_pdf.pdf new file mode 100644 index 0000000000000000000000000000000000000000..47d89e492a99c4b27b3ea0bcdac1213a8664dee8 GIT binary patch literal 8079 zcmai32UJtbx)u?H2r69!AtFs$dO}B9=nA0-(hLx)1PDPoB2uJFlU|h~Qba^RdJ_cc zC`FMXy(7JdFY%mv&pG$Lx87cBXJ-HN&;0G1Su6ALX)B>cA!2Xy9#|2#k=LC%_G%Bq45CguftUFDIX}ZjPOh0!`dN4#st2cC2z7vtdj_s=<+KPnZb zUWpNQ6?MDpVmN`H)ofR|7Fm1h=$MxsfWuq;=?|$#4go;+oZNxHVC2!?GQ@wyhx|X{ z`!jwB5CR6n!GDi-R?^#(=LserX$XA6gAn1yeO=rJQ#R(77YvWIYS2Xws*t2d>v?>DRlEZk9xVU_iHv~fc;%!yAX9an|x0)o( zo^=l{Zk?kG+W58E$3=aP=uqj;5WC{_p-GOOW#eLFvp3-WrsPDW^T{tLz;|sIC{5AZ zgx#n5ngy+$d~`{Z33sr`Q!B4GyU~RA09ebmtPDEG?n_hyId!1b(^HSmPM$g5;xqFB zQ&nO_CW6f;CHEO3)lbXbknv$K)M<1m0z$T& z`S-7Kw#ZmXD0*_4b$V>&NBB|U^Ou22Snj5kKK#K<;**9AfzGA-Bj?Qn9KqF^->H1! zL0KpCP0#cO+1R_wJBHl0P=tx8s&jh`KM+Y-u)ugJ_oRtkc9uhK?qe`BmfoD|;Py13 z`)-D2V)F&glw$v_9m4np}^c6eZjGKPziQ0YZh*|zDnp!$K^m=oNet8(-u{7@H zPcMq zK?qFmV6L)uOz{t^?dc*Q}?dxTaQfy?h`vJ z!ajX7)$sEIy;CwW#xVvZ%TeZD%kW*tsH^7A=qI1h0R&xFp=&>C;#<~o@AQ83P;Qc9 zN~29T>4*voP)kb|6-iUOkEw2&7hn=#t9P$7-TlV&M5{><4=8xy?Y`5QOF4XiE9n$g zQdG)rFzB!7UxR`^?8}DJ>#+)**B_b1bUN>;(V}T%$|+Ac#TpBK8a4G_l*;Ar@VHttc=Y0Xe0+s!jcMK^SI z0xbiIOtqAGie~|ER9I(3=+!pvS-j2a%-xD=C0m9M5f=HnD|Ceqj&f$LZe%IA56@9! zJ3ty&&Zu6kuuLzf=Nh1)dVwuF|5UX1=1;IX?&wb!!1o zL$)C{6~kqKtGHA&XI~mD*7TGmyz`8h1&Bu-f^#bN^>mpP3kGG6Czg4$jy6Y_tz1dd zi3l&0&+72D(ncGO*}g5WyJ6JFvsxDr`V?;52P=<#E;6^k-}ZQfDTCqc&(vi2szBuz zCmo-J8=&3u71hhO4i2d-n|+N%fg#lHN13R8@2TvozsflYt4(hkSA?byGT9cgn+nnt z^dAFw1Rq=2v0e4zcY0uUx&vAg%y3EeKI6mw?QnQtS)56fb*IFgZcepE29>z)s^J5d za~56Sh&(H^WQ~tiqq?I8n$-w0VX(aKxEH~ZE79XecH3~ty zik$v1n<9_SFqI;YzOZWsyg7{v-n~Yf2w^Up+UG1s#S2yuoThK5pG73xwU1@tNIGS> zdF6dK)Qj?&NqOV(%uo~=TKyH>c0y3DQ# z`WLgxl)OVRdrIf%r>}Zz?4z!Van)xWMd$L|=u38Pr?w@anY+cx4NapfY~LDfq*SJe zeZA{6%(?vP_DNUw^0zxq!_wNj)Ml_>KR6^?+Pzno6WD|Tp>S9}GW8~2`5Y8x2n z=$WLM$Cxh?O=i!DXGL5z4gzW0md6v)3>?0(^rb{eZALK7r`ZyC#LI%7JiDnW30Y9e zWa1M*it9M%#Fd36n9eXRyX9S%+T4d8{+ND_No2m!o&Fl{i1Ft)2xh#O{na(ojovWt zOPE%q!8yJXy=O)yFLaxhcEq-q-Zgyj>T-N>P**8eHs5u2zU64W34QX+*0+1E)Ni)= zCkx)4s&d}0JqSZJK$ykf5wFNK;H`igo-8B64CLD z5wC5*#L}O6Zk<@NQpj^TG^IjW?3?w=ZK@R{lhXzju@+Io8s`A1 zV)b>K-bg=O_W@60+-6G64@6Riz2_s2~0o*R_c7ph-lW;L+& z@LDxDdPM+g;~}ZHwc9`5gx&5)?al(dT7t(vqY|klZ!d2tEh=x3J=>dQXIL;xa0$9&VDyq zrkz{*YlVz-72&%jjd{JpjB2~#wO`ZbzKhN2ENe2?MJ8OAk!G&u3tqk3FQ4KtDLd2l*0D!5Ejv>Zk zhe};DXzbyQY_}4|S#@c%P!(Z_ap(tiI)j{2#%#vQ0jnuzO9`22 z%_GN@{b#;;%bes`&0RGQ#Xc)Pks0#aZaUrAF2IE$xOMMsLTBwM*}h(<*Sn11 z<9qw#v~iPqzX1jK@)-u>%SzJMra4<4@u+v* zP1y|pGMQo0eq(&E^OL4)q%o&dXIZ>KjcDc11efVf`D2n}8QrUKRK2l!Hs75Ze;mkN>TF(DxOV)YR_?*KF;kfN+GhNg)O`K=yBorf zevIF&`za&Ct-eSblW^Dtr!FV}+i52BVB$d9>POsemxyn&hw?y{!-dzT(Y<3v2?Bt*ImR&w11EWPVxa!)0>$y!2XQ5oO%@cuqm_zL^`z+5G(OA2i;pF1^lQA*>~c zv$^l(4nO%i*M5i*Drq>3y)yo=6WRQJ0~IZ^+wHVsYdmi@+BfENaxA=+QsepS-O}6P z-uQa2mwX2sW!bl63Dx6bON#V6O-kiYFGQ=U6=wtGqIY4ZL`!q0jlaPsFVOF&QE;Os zhv_{+C}7ld9L@<8U33MIzVbvhPDJ1LVSjnbLpnir=N|vp6w$CDs~g}+iQw3;gffk= zELpEaM|c-kbEoEjh?&n2f4sno%eGhIH|{5^;gg=b$tMio=j2>_zOJXs)%W)MZVne6 z$a)?5(mnN~!T$k4`K`FeW704w9mwo%|L z%WPQ#V7tqCR&V5#aFwn`!_u&817?cHuy_8YGsL9LNk3;;umLUGeWZU$zRy-K>5)aZ z5X&6)QJzi^f|p4jD_YdYiJ6iv7?Lh&mfY9giUllVBX^~Q(k=+UX)u^Nz2>;;$%1S| z|7<99zG{+bUev7YY#0-y(TLH0IUC^EicOLmp2cOX_3mK!hbi!LsEY zb+&L!Qc?uXYN#GG27t2&_3)KHnze{*7K8R*=^Ub!*(^J(AF<^X5T-+8$~H}7LKi=1 z+Z%g{FkVo>1Q=JDOcRdBRE3IxkvsJCJIi7O5L;Ak(32->LCrBQwCziuTxx({yvmaN zI2RVf%wXw)t@3+iwh;u4@j%nGwKVs|TKwAgxF^#5*fK_sU#U4%GhU(2MUq7$#L_aX zlK2xt-PH+63S9}OAssoIyy=fCg9c{8c{sw_m=-0m58y~vmfes61@YJ@t=3QpuNzf* zDnR^WLTEeFQO8WZ>XkN`jIig=%Ju`F7+?%4f;8drIkMbPeEA}pb;AB+S!FGp&g$A?x*t19YcP{gRpYP4{1s`XF(*h?#m zOowrL+#o7}g41rz)3zlq&GX-7LEpzF6MZF-MwzNw9*1?leH0MVUvtHI(TfkkG{j(< z3k+~-TGeI>RmclL9jZe#99@%hDzfj?@&a>K_;beAOkQ7#G1{9kPYLvp8rsK-GR)wf zd1D7&+r-PtsKJtJ1tHt*9SYowk&N!urWDldeCVU190`g+W|hZ+Uw~S77-DH*v=m>7 zCwv?aLLZ1e8qi2XhrN6#yTkl5f%q$S_R=V$VKFS?S*v8*^n}KnGmE}v%4cH3U!CNJ zGGPsqjP&nah-(GL{eWoYva-*K_B; z*IDJ07DeZt^7m%0H6`R*K2ffWKKRD!lO^5`6+**PtC z=wObx&gsHA!S3{pYQHo3sOP(UJw7YC&-92(LtfG44k-tbaTgx8B`*KSt*0``zxdLt zWKg%Y8$1fhxiv8F|E?#mKuE&;*^>HL?`ND|w@270%Uyx-z48~Tuj<=Og^i*?27D;g z_oe3mO_Z8*0ItkX1+^R*s}`tSc;o8ZT@^LU*n^%nR@@+iMO|b;UC%2z)@2q0n10Sp zXc&j&sEMGkY|iKHb8mFFP5UiM0=jH;7eDZZqv}51PgCYF>xA5PRJORL&L~r+l%jZraQliVptn^B0S#^?RtE1+vh2`H37I9ba#HY@E%ok#_*wHHRQhN)+ z4H@Q*96o+ctJ%!&Rk^KA?99*Tp8BB=%PUFtLRk0yx^pX3_=8ZDkMHxR7Rfo6j)p#Q zo8NI8?E1WJAwX?+Vaf7H8I#UkdPiakdakpu+Gp6LeYZ=ayhF<}VuLPRpxfT4myiVX zlk;47^tGYFrw5X{{skGva3f;+jIT_7>Ch`CmAjWJn^@x)#288=*H#0C_q@KIHj4eC zuw5MJ#D9o*<0-i~rfwsbuCg%HmmFscq`fdMBecd}8wEFeCj44A`z@n>wUaYC}& zxW@L)ckYEA7p90Iz@>Q5_=*<*F+eNrN?~5(-%KUF4>^>5zj9VZZt}#VlgiOp8-yBQ>4}%lax_atl-L2QYrpX>XdPXubMHM}`AFxH$Fb-H#cOJil98r8Vz-7JOxKBZ6A7q+py+|j zJ@)nMAdshnwAlya#UCXLZ!Z|9inLh6&!?tDv~XgcrQRK}GyxOP-yd&g#h?^~osER; zH+i|zuUsxFUX^)aJn~6+*?0JZhib`Xy^?`Oyu|twx|W*KtzI5Kg!BAVOW)i=0>{~( zgg%CagqAa^fG2#7Gf5Q+#=!KRq1TiS4zwA0!b3NQ8C`Bkss!=L4&_0#y!BG9V~cs7 zw8WHD0HSHaxMFuR;DT&HMya}(H-m)|d1*D^ET7&7ly6jC3jQyb>g4Nk@=ZEI94Y?q zTXoc5x9WPH&NvX-2~SkQxmptJoQX~()`0YR6YGcrDaex_VNE*=905zTbHeN52zJ&0 zGOb|a3WSq7BuxbcCl6y$1RMqwg@Iu}2oxm_ln{rSg4BrEdv=!cc$<4TAQ%9WceTXf zi9m4)2mnMjM*a|mLXZHEBGy?IXJ=zeB+VlY>Jo8|20(Gr6!P*UCIbfrl6e3OfOM<> zFJeIauMqwuKA_^@|A+Q4jp@R-052-<`A~Hc8Q7&<^%j%)H{jn=NpZvjQ z`|IZv$J9+nBw?l} ze|ZIKzjL=R%xWEr^sXG}R!-sKQr%tR70*?9SB)?&sA(GmCEf7uR5jJ5RTQ63{=z0J zA6()n++uj(#v$?`Ao|m3(N{f#Ld~j4qHL2@To3gkKD~Ko=5YGXNotUARxWp|fV+Ik z+3S|G_2*51+&5#~LwjTPI@(E>64iHOE~v$+$nw)?xT^nz{*c(wd7$PyB+W0Ygl9xm zZTOZ~d41d!U=(7VSK!<~m87V7!!D~K+A1;a*-6-Tbn{|ys6f-6zc7E2(PUlaRh9|8q{ZrfQAZC#B~C=?J00Rs_WFc3=W z!^9W6!WAJaYekVj5Yl8!6V+mwr z!^sPPFgRByH-aV36$l|KV?bIsD?2QC!;lRiBoJay6apauMj#*%aR^Wx4iOVaLf{Y- z3P%kT(i5bpOWH#B?JRK^6$OwQkVqrvm`G<&4A2Tar%_<3H8{85-RI;@$4u`{M^pT3Zu~ zw-LnQMfG*B|0CJ|q6bMqD3W+g(m^GSRz{=2U<3*bMnS<~qy(vjlUfp7P*7O_xq4P^ zmbicE!4?0|L6UTT>yz~0r1?nXpD$@X5=MGSa*_%dMsAf!Eu6I0Z<8dyl}LoDppqn% ztRW5sL!fXt94rim3V^`^*Z-Ce4+72_00u(f0Mbf-K0qV_0Yd<-fq%+SUsgApgrhQvUx-kAz@S&Hp(U1&97kk4V6hj&=eWl)83aI8v%W7$+wZs(zyv08+zS zI{_i&s*^Uini9|$C=G|A5l{&wC{i2>0h64AD~Lm3XmNxB2?Iz)ag;3J|5qWySINmz z(H3Xv;OgcG1Sukw6y@c?;z%$Ar3gVQDZyY$Xk}%%1RNp`LnD>RAao^S3B=zeMIj`R M06sqDTWG-l0NWNc8UO$Q literal 0 HcmV?d00001 diff --git a/orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf b/orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1dfbf06e6ac2fa19b1faba00d0c365183e5f3315 GIT binary patch literal 7455 zcmai31yq#V_7@R^5l~tbVMI!5I%uh(6^0O`9AIb$7zXK5LRv~X6+|UOL_oR(Q5q!_ z3F$_p1o1!Pz3*M`yYH?4x7Ii3eEaOZ&yL?YYt7yQI?5Pvhy)xU(E6~ouJvVW4gd-S z16?c~05UQlEj+=NWCw(iDOeE3-id@Kf-p`v5?&c^GsE!b(iT#Ew{OMS*I?^42J8foX|{o^;CI;?jDSo9SAeEPd4M*E)>ab7qcO+7u~;kfUzoVj_I+)+uL;J7uD@ zEAgW4;vVPRj3x-PS{;g)V(N|^9P)Dk@C55W?IF)7E&wQ&Qzj4?jI#QRhWLkn$p7Kr zpZ-ID5HJ`H{@vSIDPPy~+i4S3qQ=khCJD~!$jD*?WbBub=iDFZ-PERIc%NU8F6aKJ z4>+Uw(pt0aeh9x%NQa7ZunmJf*3Nq_mRpId%dIuN5Bb8>s+x$B)d=P&9loFRP2&;l zI`m-@5fwbLyL!#BgFd)t{8M1n*R_E4-SsNb7}(=QIyOZPuY0cx72hPYe_%Pb2RU{> zdi5=x$#?PLGk&x|2a8@r%cZGx^Pd-k<|L1eyR#;qe7#@tIPj!!^l`Sj3o#)r-SM>_ zs79nZUxx`Z)6Sn>plgiTFzA2$uKW7dwGU?kGB@Vlv>B-hXNd6~YPVe13ME{91$XA2 zny2HO&U>te7aeBQAsh#ivW@pH3}hxQc7u_}l;Va4Z7vpggF#+jxTU%;(RsfjArqn}WK|Jn{vhN`z2 zw@33S7g{%Y=d2bhet(m%PCq2X$r|z7bJaM=sVD1N| zsltj)gjq~V?J>t_9GANy>&I*^_>}jIb4Rk4TRVx%c5)B8R5r=HMi}X-*X&dRgm1YD z?p@?=m9>^s^5!w`^4fY79Y8~Pvqib&0cg(OLPrcUf|v47{~Jp{Ai7%gb_uTK-Z zZ)VE(A|(7e%WfY(YmTl`Gg&a(W93bcbHuj-t8T}j#(Xhv0w zd~>*TTM#e@zrjT7Xb5|F_e@F@+(=*Jh2o=?an7to25dCiQP__2<0YCksZ??B89Shw z`6WSHw{Ao0V^xLjgJLJY^Rj+p!^dX<)yqCJmgRTKbFWm1b$y zO!_R-&e+J?>X~WcVwvi?!0BO7kj)A{v=N&bh)`~E;+dPq4d_{LV z#OijjnYIdF$t(blitLJxxY*7+OR!y?xmj7GY{&d2+VYX!3d57YgZx?RE4hlE!*jH_ zPLSq>6KWSLt+FZ@c?Rfc9^=ZHK4^_z5}bvWJ3p?~Fv==dkk`|FKlA(+*KMmO_g5CJ zmo*kL>2J*_>NrJEAI<04PrY-hxGCR%qWjE(PDsfr9RF0>_HfE=lvpivYxv3$NBeHhz#wSg-^Diu>6T^m=d+{^3!j( zb1Jr|Sm4YG8!W(=&V^5lpAHP#=ii1sfmS(!qWM01RM!a!OAwz11n*Mf@9j1#5eCW|Pbv517;T9*U%8N} z8y)pTA-B`lS_fk^X7{q9{)%xQ-)jBsi2HDpK3GMb%$%v zxIm9;Oy5SU(Os|o$%1cd)HkLZVHpL30-3>sTO5uT7a%#q%Uj)ka=Z*PTLRp!m#Bp8 zEA#s!ZHv9SB2|mM`XVnG^5-`#`1TraB1Cy?>mITnlq^_BbDO=Keh{5}(;lWXNF?I82K+*RwVC!)vF3OrRG!fB7V#E~B-7bb=-lWtCZQl5yNV=YrlI3!_P+F3>CI@?`Aj<^AG$pB-h-=JQji7Z z9997#BwE)sKe0R_*=&Yo*`x5X^yVJ)``77**c7%a-C55G&e$M9!!Q=7yie{q9*jnX zA0xG63{MG^>OU|xeXQ5Kv@P*%=~d%LpSRAB_v@?V%je&ooNql?Z^j%wu=DSpD+`!y z`N2-`q^X*>?+C@x3=n4tH_(;e0&e85Qd9eze0Q|x2AtY@XOofK%gb;qPB6Pl?LLb| zlcgB^e0M`oKh{PmhS5as#gy)OqWXDS=Y@(q>sHD0T}YW8o#rsX^S2p$A%F;plsXb7 zi6#1(h*%|WsR-&vNO~{eCIOq+%X+=ke4;KG&}5kh%xq@ubVVXzWBpWx6M^ z`rD?Uz%}zCoez_JExw<6_fl|-Cm!0Ql&(DCI~S2&sUq>&=IJ+@6(p<61`VkWQOl9& z2&v}qchohC>Md@MuN-OQ;QT`4aQYcf^095fVIaWua?8@THque=%Xgd8AKb-EGgOMr z#Y^8+BpEv=%j{N+Htq&IZLTlfolD^XD3`h%e&~--m2n(?L>uP0Qs?6N^zL`xSSgKw zpR=(-1p8anQm^Z(F1R&F&2D)s}{!3h(H|zB;&eH$GfYr>zx_hxu9oD@T3PcVhzVqpT9?AW3UJoS87CX~U##dX8XRZTI*cy;oVM^^Y{_C@lf5i9;l7MCcehyZ>HYR}+OGgp zDpy9Qx#LX@Qbo~FJ16*jvn*%81P+Hj?z=jKC%GY?(Rc0V!cZFwcP?5sux zHp*&f=pkkTJ1{U(^zPn$+D@@jmv%wTL=||7Zcgb-wo7Xav7dZ=A8yafHx}XsbG=lJ z9d6h5n*I1-qQuEWaD+LFdOlA-D4H`*zj&%RsC8fsp!~(aC#rwCsQ8!&DYW7?z)V-y z2y3}bqoEZ#7JMboqm*St8Uwz(G#_-Wn za!zK;$RTzAiO;^WNBP!s7cC-i4=RpihXTGeAMQ!W?96--;=vL3+Jjt+Dyy{1?+jN)1Q|eKPpssI=?}oZg?&fEk z_*T!?qy!#yX@BvK>(xA{*;qaK$di_E7{XiTYEl1W?J%%TKJfFH8O&mBGighDzG3~< z7129i$FJ7^kd@`tSfr0n{{9wDTT}$L*GlfeCW3S{4tPDzl3wJJ3IW(jCu*u5jgJAt zUxlciY0JMAmIQV#IJ|Vw1Fe0F5?!0d^o|)P3jyZlSjHp`T@*A-j0v4#@9S=`Rp*R< zYdUPUXpl_nJUGA7`R?a%9n7~wyO#R#{jq4@jkSn(JK2bi`oz2q$=|zq+yJ!^YIbxO`g_Zeeua%$2k}K|#+qny=NCoL7RZTy~aVLZ4C+46b=6(_sX?XqHLGH*WGH|BSAD7uwi>;3c9 z(#zrAqz0d-0{a`~dDrBKHRBRXN{rjh$`$ud$EmBAbZHHUnfnb1CJC*$ee+5A%zJM&YSMcr?TGoyoV>zvm-5%d0O;Eqv6!?HRn}t zc4QOgN8=OMi>5gi#VsnXR!1=+rVQQtF1OkQEz?HCj!Yj5I@p%K;Gw#}iAgmCI zR;{mS^F-rQQ={otBlMy10K8>{m%qZntYu7#1hoG`*ATtzX8HGq5j%b%Q3edQeA6sG zV)2cRgNctA%V|~YZIf!#Y2x9Snn(#SW}A_5ds%`A;*2c_y?0MNv?czrjzihKvyJdG z7unP97Qo`!n62D!)dA1UH$tKDUKqOe)|S2m%b$B*PGT*0t>X0sm0KdTk`(LRq}VmX zt*j!eNI$T&Z@VC=5i3!2 zsI~YxjnLOcmijM8jM5E402ld{+&t$=BJt5W4=om!YK7IP=ANWo)mppM8%dI3p%>-3 zyy73Pj^*n}jA%0#$k|kmJkDsFzlM*G6Po72YSZPprm{S85O`hqC`nE?4HnfL`Wj0= zEg+=C^&Fa98fOE2qs`NM^y6Sz$-(h;5p3Ifbj!Q$U9h$Cs;Wjg6fvs08YfzkVe`Wv z;q1yH>-R)`UJ#8?(Q%KKX}i*=7LQ)#LSHAOk^H5Q#yM)*Uf=8e`=}t~zvfEw;%9=v zbfhr4)68&MdbJiRHOOOOU7GJ0IHorLSWMrszbWyM0|bN8!kkx~<*M4!E3lg~V1gGr-{ zVS^2Wqk~`VSM6uQC+1474rYRf!6}(Niz+pG}BcUnRDq z4NEp@Rr%-X=*_BXn`>&HzBV4yhZYxvAo%TOviw9RdWH5b9bPtnG%@>>#MI~g+Owxa zz2Q)?tbAg0eR2MpL}ol+y~OjEvNf``S?il0(6zNlsTi{!iEG1-X6vN-iDcA3Xxu=~ zF4y{H5Xjq6#{7-R;@8rJm#0lK#9D3OOd09Xt=!lL88=6)OuNBZnS(V?ruEN<7NR6_;ih6*9tzWV8xaV32B zTH{MA0daJZJPAA5aAD3+;|x9Qi@_(7g_*VBT)*Bx>KCd{h5t*DlTyBLY~ zPSjsjPJM4zJP6}LASvVBt%&xnBo}fOm;8AZ=Zps_Do`F#EqhBm5l6CjA?V?W_BH@Y znb+1G2q#a-oQjGrUMAuQI1DHb1H*t2C<+afL_^I$>Li?#y_EvN)(H;;13(JyR(Jvl zh?ax^K$OiWAL3955&%-dxvJsqZS6>8Ir5?&3GZwOM3Y4*>dBR5JQPT&%VGiKD(k-r zwdlWG__GiVMT7sJGPGIzTS6=FjLNPb&07*Pm$VyC0{*fG`vxi&y53$8Q{Ego@@C@U{tgDx{yh zY&F}!WD4ZH8t)m=8^7DxLH?6ib2I+5dZMbFAY-<>#t-OM$!*<0b^jq5K{;gt3#xj< zzoOdb-Hs582*{3IJWVwPdQU{XADUp){DUnSl@9hqx zbZM}^9Tti)luLlD=--CA1O8zh;4kyY0>A8|m;@w;T+`m#-30i{2g;|Ciw7l2zik2e ztuyp%4<#J=_`BGW<6%p7|5rEwplhypf`S$0CNcR#7s$x?29V(4-mdsyNB z#RpgVg9l0G{l!n_gOlZv$Uk4QJQ7BJ$#gOc7)BXY$RnJr>(`cKyOqfeE@5RUD1`$J z1w){4I2~^a7qUMgN8zqv-BS{ zN#sB5Bgg+Ad}IWZbN+W(6dd{+ABl(~U+qK+DD~`p@Z?Z|ur4lSRQ*CP0HjW^aREXo zStlQEb!DIlPzDagAfS@UP$U`(0h5h`E25z=3>u+G#sE?YjgkZW|11>vD!W)I+2O4m z-94OvASI--l7a#ljRZqbN)U{)G7P4SQBi?Q!XaoF2B|^;p*smjB>hS$3L%LE2neWL H!vOvdT9)am literal 0 HcmV?d00001 diff --git a/orangecontrib/text/widgets/tests/data/sample_txt.txt b/orangecontrib/text/widgets/tests/data/sample_txt.txt new file mode 100644 index 000000000..2e500dafc --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/sample_txt.txt @@ -0,0 +1 @@ +This is a test txt file \ No newline at end of file diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py new file mode 100644 index 000000000..3966772d3 --- /dev/null +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -0,0 +1,75 @@ +import os +import unittest + +from Orange.widgets.tests.base import WidgetTest +from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments + + +class TestOWImportDocuments(WidgetTest): + def setUp(self) -> None: + self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) + path = os.path.join(os.path.dirname(__file__), "data") + self.widget.setCurrentPath(path) + self.widget.reload() + self.wait_until_finished() + + def test_current_path(self): + path = os.path.join(os.path.dirname(__file__), "data") + self.assertEqual(path, self.widget.currentPath) + + def test_output(self): + output = self.get_output(self.widget.Outputs.data) + self.assertEqual(4, len(output)) + self.assertEqual(3, len(output.domain.metas)) + names = output.get_column_view("name")[0] + self.assertListEqual( + ["sample_docx", "sample_odt", "sample_pdf", "sample_txt"], + sorted(names.tolist()), + ) + texts = output.get_column_view("content")[0] + self.assertListEqual( + [ + f"This is a test {x} file" + for x in ["docx", "odt", "pdf", "txt"] + ], + sorted([x.strip() for x in texts.tolist()]), + ) + self.assertEqual("content", output.text_features[0].name) + + skipped_output = self.get_output(self.widget.Outputs.skipped_documents) + self.assertEqual(1, len(skipped_output)) + self.assertEqual(2, len(skipped_output.domain.metas)) + names = skipped_output.get_column_view("name")[0] + self.assertListEqual( + ["sample_pdf_corrupted.pdf"], + sorted(names.tolist()), + ) + + def test_could_not_be_read_warning(self): + """ + sample_pdf_corrupted.pdf is corrupted file and cannot be loaded + correctly - widget must show the warning + """ + self.assertTrue(self.widget.Warning.read_error.is_shown()) + self.assertEqual( + "One file couldn't be read.", + str(self.widget.Warning.read_error), + ) + + def test_send_report(self): + self.widget.send_report() + + def test_info_box(self): + self.assertEqual( + "4 documents, 1 skipped", self.widget.info_area.text() + ) + + # empty widget + self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) + self.assertEqual( + "No document set selected", self.widget.info_area.text() + ) + + +if __name__ == "__main__": + unittest.main()