From 38322372eb670d17668756f8be882cd3d9ea834b Mon Sep 17 00:00:00 2001 From: Hendricks27 Date: Mon, 11 Nov 2024 18:10:34 -0600 Subject: [PATCH] 0.1.2 --- Docker/build.sh | 2 +- docs/build/doctrees/environment.pickle | Bin 31300 -> 31300 bytes docs/build/doctrees/filetype.doctree | Bin 7988 -> 7447 bytes docs/build/doctrees/process.doctree | Bin 25264 -> 24220 bytes docs/build/html/_sources/filetype.rst.txt | 6 +- docs/build/html/_sources/process.rst.txt | 3 - docs/build/html/process.html | 4 - docs/build/html/searchindex.js | 2 +- docs/source/process.rst | 3 - src/config.ini | 1 + src/gfa.py | 26 + src/longread.py | 642 ++++++++++++++++++++++ src/main.py | 25 +- src/mainL.py | 167 ++++++ src/mainLOLD.py | 578 +++++++++++++++++++ src/mgmp.py | 361 ++++++++++++ src/utility.py | 6 +- 17 files changed, 1798 insertions(+), 28 deletions(-) create mode 100644 src/longread.py create mode 100644 src/mainL.py create mode 100644 src/mainLOLD.py create mode 100644 src/mgmp.py diff --git a/Docker/build.sh b/Docker/build.sh index 6da6c76..c0015e3 100644 --- a/Docker/build.sh +++ b/Docker/build.sh @@ -1,7 +1,7 @@ #!/bin/bash -tag="V0.1.1" +tag="V0.1.2" docker build --platform linux/amd64 -t wenjin27/methylgrapher:latest -t wenjin27/methylgrapher:$tag ./ # docker push wenjin27/methylgrapher:latest diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index 8f2325c885ee3fb98ce14b3225205d0ba5197e2a..b5dcb5e19c88ae7ea7e8682a0d7e0003c83f8a82 100644 GIT binary patch delta 558 zcmX@|h4IK2#tnta(i~}-IjJR;1*ubdSTpldQY)tPuqWr|l_VyYOzu&3W}H9yg0hA5 zTqXvFtR+CAhZC$wuc)|WO6!!MDOvL-E2{*{&j-mb2NAPC#3~R0*0UDGnzwnXiXw9m zNKqI2kG|fgtJT=R3cJ|3XF_-&#a--+Rzi65z`W4S5Z-Js&-oyPHwVm<{sZDo=Fqso zIDhj)4G+f2OSF_H`)EZl&YSG0t+e^LRy`xcRsVtNA+C~OhPg@=#F{rbRYwD;yH7`! zaq?+X!O8VL!kf?Qs)IN}n>qAzAdEHxMdrz0t@D9~OH9@?@tFM7MrHFX6GO(yrFKG_ z@0xZ&7?tLY5C)HB9)!_jmB|Qn&3i-B$@NZ_laFgzPM&MyF?qGF%4SYmpy8`^1vj_Z zSpd~+-fut6SPT?A-ppX9dNYDU&zlwK$n4?(7EpLhR&naRE`fNHIW%rC z&fEM@!-J7={$xjOrO7^85tB8wls6yOs%M0_>OW9D#8nc^FjuL9SaT<*>SzFU_vy$o zPJU{mviZEOI^*P0JE6@S`Z*9rn}H%T(BSumrjzTPEP;kgOx82;n7mq7W%Dc(Lq;G& zaPuA0ZV02oya~eKw#{M??aOion0v(xM9KZq!kI9M-vXk!=D_DX!6S8Dj85sJf zXk=dvyKnfJ@Kx*#fC1p{_*n(GLv7*c>@4F0|tr! delta 410 zcmbPkwZ)FLfpw~;{6^N}GW8jpJ$%KL#U-h^@wutR#fj;uQzlQ*Y@AX%MI(bXgDrzS z7Ra!N3G_&XWR~QlDwO7>q!#65=A|l>}q(4AgH*kk1C*Gt0)CQQ-u^$^yd0N12jdYinfFm zw866)j2dGVZE0&oOid|*8k_W?R{Ka}Vv1B-HLXUIKEAZ3_uSbTXJ&&oY52fp_nbYy zbMBe{eay+v$bqxOyw$v?wY<~3(hQ@6-zBSIAS5k>(im!@W9S4LJ3qARpWp~0ZK{Tf zm}r|lMfM9sn+6d9)x$TE_h7@+N#;l%)CTb( zulPqOLA}l_iCz#QVM}c#m@`uPj2wkdy=S`2GGDV{sAkVI4WOH&`xKloL1kQIAa^1v0AYAS&xTmTMR?y@! zPc2QM&vA!=>{P_EY4k*?Prz+GdiV_9+l}yWbU(QaSEK9+Zn}duyJ<^{xtdn4S>vI* z>IGeq?@XSh$D2Od;@JDN^fQ$7kB4QM*d2LtxApmRA#Cb$=?{5sb2$7qflgOA?9447 zCTJLw8IbqHICD%c2@}S7vXOToW3s1$jw|!fc>b5j|IR+Gq+F$>UZ;yBb5ts%r}L*Z4>kzgMe%VqjBArBA zGq{pIkfxe&&ojTl+ZYe_1Vd^|1ze(Zq zg$NO;Vn&t@J--#grk6{f3X%gjN%0RZtuOQh{hAmWr_~p$N=qr&yC<e|omn@N8ST zk{+bz5?LvQ8Og*7S2Hs#q5e7{y`u<;j1z9V-rWv)lNw0|oXg5IALp&IgCo-h(UVid z4V-qR@hGKBlVmC7>tP*2N)IOAmzMhfCn>$pGv@P*bBx(Klx$o^5~~)Kuh%E*L=}++ zQTZv(Sq~>&HnIP|Eo`TS&5=0@mOC?y{7!|BG75IeGHTZ+=mu9-Bl%dsdN{0IB*0zz zqOkRPZL7&Q;`=%EH*8uZzdB({FQ_)$r5~WpaS?6aRD&z>Z+$U(MGlSA>WftssTAy0 zhM`Y~~l2a4^kI3pc=SC=U{4d3u&C%Z+;CfRN=6v%i;`%_z%fURHkMDL2X~19H-h zDF>94LYv3L9FFKS-WA$Btd2Ocpin;f~II2^wUhuogQwQyBW*K1*DoK|0~ss+Pj zAHlbCbF7Bva+hl%F|UMN@2bx`A(44q6JEGwvP1{?`LR(r#Pf5KjfJ>^!Y$kI;}#sk z_%6&w;4qx4p9SMEi_e|Sz#$ytEPSYK30_yRwz8&fePuOX?ipg;0_kSW;&Y1_cZzYB z7%_`mWN-_*Taeu>d<(j3sRKb+8XeBY^54=_T0x(uyD6og&^8u@rark$e7%s%#CtP4 zg9F~ZSw9YVQDzQ)RpD1sJ_O=)@TwqB2RCt+hXWqytjgrK#gE~M%#PxKCnX~|Z#)9z zdE+W$**M_d%@_{2@v=iW;I7D~V1IClivBQs7OVybTwClQ4!9;*5T73xBDOGOWmQ$p zx~iH=_-)}GQVQQKIxn3HfQFKE^3m=hJ2?Z6q6m0g7-e_zw*>Yfb_U-d_`fHwtIpaL z>;%|L_efXd(62DGpNB@0&%wSV(mq2I?E4BKf@5Gu(RS&c4EO~B4yY(r0J$1~@g2aM zJqErk-cI~sYe|Mg0)Z68@$v9tg;1a&818HZD#2b@OX36}MkeHG2%9wo<2_920GTGD zopN-QCb~fr?R)on(gIQ<2t_jCsD{v_AsFvc>I|?4>jh!0Od#-RnbLz+4Z(1^XjKW= zgJwZ!k_p)wLc4}wyhErxI3}Wx$k9v_-Jyy0y)T;rKfTKsY2^Dwc#V(w~~hJE=itXQ$|UlhiTg#Z8m literal 25264 zcmeHPeT>}3bw7RFo$jPNN#|H~Vo9Y{xRz|~outTL>Q#;^`3|n?(^-}hxv9hOc4wF5 zahEHSd%824T1MkS>xNZvSfl-9+aIUyo?*F@ z;rL;fl|935h~fI6KBNyn+1{^@_=BF|SuA$TAD(yYGIQNFbwQ8K=h!kmTHbiB9Z)gU z)bLENt?%?lR6(S^$sY=7oAWnYMuWBIS__&#s(GGcEVMlEL_BOVDlXAGT(&gVMI-(m zY-si1uPv2UQ~p@c%8L%GQlFMT>}fT+)z-J`x9i*V-TEedk3Tzg-erzEb%`}hqcK&6 z?4~ZYT+dP+<}%GG>r-{+=_}T(qcwHrOo3o`Dj;;K8L&L*xZq~*>CLSLeQO|UDXSe( zgTH;2HS9W59%)qAveBpolG@eT9tsLn9txmqk{i7p5r9$&j$A=l@pbU zL1NM{{Xxa3qL*tpta4CUv0F-ATLB4Wt5so&rnpUGk(G=_^l!Hf>@BsT+aaA zjB^%H>tc4rp*Hl#B;ea(itDi^m~x1+$q+2>OO5c5UuqJ?kL5@48GR2|p>5DWj%JSH z&A>?9;$3(K0Di=~%TN>V>7{z~oDOvgFyNa}gjohB=BB06Bn=;CD`O zS&d9J(C9FwX}cV7z^PF|t5GKCUAx?_Gp!No#ERnT)R)BJ!m25OQRUNdvo5AQ$lI7! z))ifIp$1SA(uLRn`pF0388l&v9*5QKC00pmg4S43V1n(k0pwN`nC!(so0^)Z2b@4l zpebd#O-jm_&VgZV+Kmb^ax$WFQVC4~RLOvjDw^fmiq>pe22>PUL1Sl0k|z?94E16@ zGO~I(uw@rM%@^m|Gn6tJUCXkSnyyPqj*sJeyx6B#LKZ zZ`KKJlg2E=V~%F23zl78jJ;j`iwUnqnRgiJd+33bYE?~n0*)YFqceuZ=Ap1k$!aP; zq16{E+Pn_pXEfK-$E|%x!Aq{%!$7aQv3==O7#jG_CquEUi&%(ZczIuFfnAUa@Y}bPNAIRdZ_Ofs(KF zxe#)%`UK;&@X3TZQ?Au*djj!?~EFE^h4i!m%r96 ziMm&dq3+8u)ZGTu30vzQ|k~K3WQ)GHr9LsKL+U5j=_wl2re3eyXGQ z_Xn{&ochG_OgN$3N>XmgH4;5ylGP$VJ8d6s%U-B6+{@Oi*mR*~m{{YMWcl0qStj%F zMr@(>Bo^vNNw=phuH0&qv|9~a>S)P4W!eigS#3d;k1P8ttg5vv@0fDJKPI~Yf5 z!nfejZ?UP&A^0KxZ>Nz)Z!Nk0SF~oKwcM9fLp#)p?rIqIi0MsNHQdA1P!i;C4-k%6 zER42=jz_oPR*GA(K$;Xh%nsxUq{mJ%;C z`|w%vO~|>X_x1UPaZ!mfDx4k)l9jrRK%v4sjj}tDHuo`7a&cDBlqWS2De*lqZotL& zl8UvzsHT~p6jX4@4D_vRH|tDNUApLR8hoXM3XfG1(3c9cp@RzGf8HCMNFKRlrM=9oRnj6y}sb!=D8qTW{TZ} zk5ATx(pE{ozHHZTP8iSbIjgl=C!pFvF@QEAbde*Q!u9u>U8&-zL&mjlQ=TYEo!=0d*pAV&E zheA%H3(|NVj~;+HfA+-8L8S%5?9^<$oIuQNho9@Q^B}W zx@0?xDza|~(b>oFH}vQfeg*ze7?dT`0`b^BvbgWUlY&(T%l`zHd&Teiuj{{|UtQH- zilTG2)mhE^# zWIKBFkW_u5FRC8zM%9s0-Dnu~R$UFg`*lhwklW8t$%oY!ALqOAaX|1< z*P0h45A-Weza+oF2ubx*eNp|fZdBg^Ot@vusi<{wRY7bMKB>%vNo?fkDXF~D7nM(R zqjG;~LG#MG>KacmWm^6nm+R3(58fw`k$-@s`Nh6y{&+W<_u@-Ijs#;?H%Z^szUX^0 zN#DBj4_^^faGSS|{KHK;lYIGg%Ri)EEBF|={nc9D>k-AQJQ24&WJ+6g(4yg zzNLhF^XNd|f}_-lf1PHNfq-nf7n58X4fmzc(BlJr8V}EA z;-T*}!Dq!f=RrLo<1=CeN5=CB9nVMrgfU@8FR7EiCZ3gtx=1Je7?la?dS?Ja;(B+I zw7t?7ZJ$ZdCSPzAl;591NY?g%G)$h)hRGVic&<488-mKFGgON7#|2e+_=sL9zZK&r4Y>y; zI5OP4)Z)CHX|D^qdF4X9(HebHjNohZ3f4%vd3w^#lb}`JA!qO1dtiVE2;d?Fur8PH zcVbdZAY(JW1s+O4qnFQ@0>>+v;ON`u^Tnd)Q2~OwwR=X<#?+I~Mln|#Ol1d-JDVn5UAOX1|zY@GD#^nI-mw}g{F6eBoJ zUQ4V*KVJ^Q_33A*H2Hras49Y}*JJ!7_$crRAtb-R2ubxn^hNd8x>23uQ$k2>j-Har zxBH^<>)oi#{|zA|Hva%g^Y{Cr`A@phoE4>nlxB64^bHRd@%kbQ_{}7J>keja6;yCt zUPmxW84X-Ia=4{y;wM{s#5Eb=#P~^m zCYDZ_p2Y^bD7HyAOQe57U@VVirZ>#T#R!hdzlUYMKJkq$CeBNWB}Q;k-b}nC?t8e` z>05rZnDz{&j`4~Zr{CdHw4V~x<>4nXBHw|CBfdJ&nA$YbBcGjGr8hHul!AfIi-njY z+`J%0@WpxyizNe{pGpQgNfckN9d7UJLxP0|qgTP`x&oSij7h5l0;%t5ey@Ng1%kIS zLC|+V^SW5vJXj|{C{+4cKTywW*nMQ@3+&g#<(n=K%4$!?ndXG9xeyo@RB=! zIIQ6jn=RIqnr2h%tQ|_GbT4Zn3c~0I{SWY>AQn$dVh5>R!-0aJAk3g=dzyu_dj4qC z2Zxby=8jJ4Zt%y;Er-g@sG_*cwx}g2GUIP?nMcPAP%~$zD1rl5py9DbrR@)WRI^&B zE)$(8^oN3jt?fB|`7DH}PYuW7}T1*|qI&n8JtVfCQB)C7{D- z__O6$_%-I)Wt9rn@Hz@Ofef0Rm9{@o#WYk@B6jnNKSsrQ8rUefOs6N>SNw4oRdSjt z&V%qe5}5GzpoLkCx2Ts4PgkR&Ie1<;AgrP@=WoLiah$hMEw;oU&W--4u36&epuZDE zYam4&!DzWrr$K+P!WLRJJZ}7w=1_48^uUquI$B2PJO(3{h(4o%gX1br(c4t{q3w^f zylUxzl561a-N~COa~#`Ib(F@in1dA{Qp;Z3-yR+*ma`1A$;@7l)pK~$Lma_|_`u7Dhd zpm9~O35gh$2oU;^Kd!Yr8#_ItCV!&=xQ$g>^`Y6{KnWOK#S?Z%|-)SywfQcq3*da52n z?FF)FINi^hr8^HDg*?ZdYB@DMwaDpzIGP^Hs}W=uQEtazUX?!s24PC~>J}b0;%N)e zqKc=tPjB(}0F!l9D5qdR4GIjb`a9f}hNms7qJlshWvy(2{x-L+LD4R@pw%t}bqoI; z{8UlMLBp{qDr>c;j1n(6(!`uV->{Tigc!ZpYNB8P)$HPhxI#0q3iO~HOQQ-5gmz(q z3!-gwMMRhTbk z6N%QKk{6mWTUGoRq8&#l0*3ZsY7G9~Zytq?n}_Lg9NWpvAHgLV3jw}iX@;=+!rYo^ zU9_$d&(E2kr&)i2E_dUDvUwL>?nKVY+((xSaF)#TbU8>Q9KfaJj}U#{N)rM@=`K#; zDyMGMyowI1=1ctgoBaAkex;|Z@+YkF*;o0@t9;hgmVO7#Lrm)1IEScJ@)U{`{Wgk& z{H6Y7^TQaezY$PL2ZsYH>C~h7ExOS0DD!)Cp#w(dF>Hn8M>kA`xS`#a0XMYs$yDh= zd&kTxbfLXR=4*7Jy(Z>|h->=p8*oi24)Z7JLLV5-Pt%1yW0`+I7y43RK1$M|n15w$I>WI*Tsq)^qyMnWw^{836C8E@Utj{rd zDm<8ux?-@526#{c4O9r1$CQ1s;yC+6lH*9$uxpopIX3Xx%LK=t5jJ^ky?kNS z?dG3iL@zgxBUkeWL&s!;znL@&*7T9NnKR@a1QiYOwhGDyo<0W8&-@0_A!L`Ug@nh~*@CWo?^-;$-@&*wr1nru zYMrMKdrR%!Eavw`MOS1k2PwJ-vqnR7o#RvT+c;blcmJCuKeCSbl~kw-=4zq`w&L-)LVL|@BeyX6IfnEVZv@NsVHv{8D=*CbiD}guSKq9b!A zBfrGnj7co|+8;&`*u~IWVmHjr-BVa<`?I7*nV|eqdn+cj&g}%frS=}Mb2EkIcQQ+U zlt0Tazqe!ZOMQ;S_jl$`hz|`>&WY#1wu)w+Genome Indexing

Description

This process involves transforming the genome graph from GFA format into two fully converted genome graphs: one depleted of C bases and another depleted of G bases. Additionally, if desired, you may include a spike-in genome in FASTA format to estimate the conversion rate in a single step further.

-
-

Note

-

It’s important to note that in the C-to-T genome graph, if both C and T segments are positioned identically—meaning they share the same parent and child segments, and each has only one parent and one child—the T segments are removed. Additionally, any associated links and paths are redirected to the corresponding C segment. This principle also applies in the G-to-A genome graph.

-

Example Usage

diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js index 6dbf2a8..4e9172d 100644 --- a/docs/build/html/searchindex.js +++ b/docs/build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Align": [[5, "align"]], "Citation": [[0, "citation"]], "Contact us": [[0, null]], "ConversionRate": [[5, "conversionrate"]], "Deduplication": [[5, "deduplication"]], "Description": [[5, "description"], [5, "id1"], [5, "id5"], [5, "id9"], [5, "id13"]], "Example Usage": [[5, "example-usage"], [5, "id2"], [5, "id6"], [5, "id10"], [5, "id14"]], "File Types": [[2, null]], "GAF: Graphical mApping Format": [[2, "gaf-graphical-mapping-format"]], "GFA: Graphical Fragment Assembly format": [[2, "gfa-graphical-fragment-assembly-format"]], "Genome Indexing": [[5, "genome-indexing"]], "Install via CONDA": [[4, "install-via-conda"]], "Install via PIP": [[4, "install-via-pip"]], "Installation": [[4, null]], "Main": [[5, "main"]], "MethylCall": [[5, "methylcall"]], "Optional arguments": [[5, "optional-arguments"], [5, "id4"], [5, "id8"], [5, "id12"]], "Prerequisites": [[4, "prerequisites"]], "Process": [[5, null]], "Reference:": [[2, "reference"]], "Required arguments": [[5, "required-arguments"], [5, "id3"], [5, "id7"], [5, "id11"], [5, "id15"]], "Source code": [[4, "source-code"]], "Source code and issue tracker": [[0, "source-code-and-issue-tracker"]], "Toy Example": [[1, null]], "Use Docker": [[4, "use-docker"]], "Welcome to methylGrapher\u2019s documentation!": [[3, null]], "methyl: Graph methylation file": [[2, "methyl-graph-methylation-file"]], "methylGrapher: Genome-Graph-Based DNA Methylation Analysis Using Whole Genome Bisulfite Sequencing": [[3, "methylgrapher-genome-graph-based-dna-methylation-analysis-using-whole-genome-bisulfite-sequencing"]]}, "docnames": ["contact", "example", "filetype", "index", "install", "process"], "envversion": {"sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["contact.rst", "example.rst", "filetype.rst", "index.rst", "install.rst", "process.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": 5, "0": [2, 5], "1": 5, "10": 4, "11": 4, "12": 4, "20": 5, "3": 4, "4096": 5, "9": 4, "A": 5, "For": 2, "If": 0, "It": 5, "The": [1, 2, 4, 5], "activ": 4, "ad": 4, "adapt": 4, "addition": 5, "against": 5, "align": [3, 4], "all": 1, "along": 1, "also": [1, 5], "an": 0, "analyz": 1, "ani": [0, 5], "anoth": 5, "appli": 5, "ar": [1, 4, 5], "assembli": 3, "associ": 5, "base": [2, 5], "bash": 1, "batch_siz": 5, "below": 1, "best": 4, "bioconda": 4, "blob": [1, 2], "both": [4, 5], "c": 5, "calcul": 2, "call": 5, "can": [0, 1], "cd": 1, "child": 5, "citat": 3, "clone": 1, "code": 3, "com": [0, 1, 2], "compris": 5, "conda": 3, "construct": 2, "contact": 3, "context": 2, "convers": 5, "conversionr": 3, "convert": 5, "coordin": 2, "core": 5, "correspond": 5, "coverag": 2, "creat": 4, "cytosin": 2, "data": 1, "dedupl": [3, 4], "default": 5, "deplet": 5, "design": 2, "desir": 5, "detail": [2, 5], "direct": 5, "directori": 5, "discard_multimap": 5, "doc": 2, "docker": 3, "document": 5, "each": [2, 5], "ensur": 4, "essenti": 5, "estim": 5, "exampl": 3, "example_b": 1, "execut": 5, "explan": 2, "extract": [2, 5], "facilit": 5, "fasta": 5, "fastq": [1, 5], "fastq1_file_path": 5, "fastq2_file_path": 5, "fastuniq": [4, 5], "file": [1, 3, 5], "final": 1, "find": 0, "firstli": 5, "folder": 1, "format": [1, 3, 5], "four": 5, "fq1": 5, "fq2": 5, "fragment": 3, "from": 5, "fulli": 5, "further": 5, "g": 5, "gaf": [3, 5], "galor": 4, "gener": [1, 5], "genom": [1, 2], "gfa": [1, 3, 5], "gfa1": 2, "gfa_file_path": 5, "gfatool": 2, "giraff": [4, 5], "git": 1, "github": [0, 1, 2], "glossari": 2, "graph": [1, 5], "graphic": 3, "ha": [4, 5], "have": 0, "here": 0, "how": 1, "html": 2, "http": [0, 1, 2], "hub": 4, "i": [1, 2, 4, 5], "id": 2, "ident": 5, "import": 5, "includ": [2, 5], "incorpor": 1, "index": 3, "index_prefix": 5, "instal": [1, 3], "involv": 5, "io": 2, "issu": 3, "its": 2, "lambda_phage_genome_path": 5, "lambdaphagefastafilepath": 5, "lastli": 5, "latest": 4, "lh3": 2, "link": 5, "list": 2, "lp": 5, "mai": 5, "main": [1, 3], "make_exampl": 1, "map": 3, "master": 2, "md": 2, "mean": 5, "methyl": [1, 5], "methylcal": 3, "methylgraph": [0, 1, 2, 4, 5], "minigraph": 2, "minimum_ident": 5, "minimum_mapq": 5, "more": 5, "must": 4, "n": 4, "next": 5, "note": 5, "number": 2, "one": 5, "onli": 5, "oper": 5, "our": 0, "output": [1, 2], "output_prefix": 5, "outputprefix": 5, "packag": 1, "pair": 2, "pangenom": 2, "parent": 5, "path": [4, 5], "percentag": 2, "perform": 4, "pip": 3, "pipelin": 1, "pleas": [0, 5], "posit": [2, 5], "prefix": 5, "preparegenom": 5, "preparegenomeoutputprefix": 5, "prerequisit": 3, "principl": 5, "process": 3, "provid": [2, 5], "py": 1, "python": 4, "python3": 4, "question": 0, "rate": 5, "ratio": 2, "read": 2, "recommend": 4, "redirect": 5, "refer": [3, 5], "rel": 2, "relat": 2, "remov": 5, "repo": 0, "result": 5, "rgfa": 2, "same": 5, "script": 1, "segment": [2, 5], "sh": 1, "share": 5, "show": 1, "simul": 1, "singl": 5, "sort": 5, "sourc": 3, "spec": 2, "specifi": 2, "spike": 5, "step": 5, "strand": 2, "submit": 0, "subsequ": 5, "sum": 2, "support": 2, "t": 5, "test": 4, "thank": 0, "thei": 5, "theworkingdir": 5, "thi": [1, 5], "threads_us": 5, "threadsus": 5, "toi": 3, "toolkit": 4, "total": 2, "tracker": 3, "transform": 5, "trim": 4, "twlab": [0, 1], "two": 5, "type": 3, "u": 3, "undergo": 5, "unmethyl": 2, "us": 1, "verifi": 1, "version": 4, "vg": [2, 4, 5], "via": 3, "whether": 2, "which": [1, 2], "within": 1, "work": 5, "work_dir": 5, "workflow": 1, "working_directori": 5, "would": 4, "y": 5, "you": [0, 1, 4, 5], "your": 4, "yourgfafilepath": 5}, "titles": ["Contact us", "Toy Example", "File Types", "Welcome to methylGrapher\u2019s documentation!", "Installation", "Process"], "titleterms": {"": 3, "align": 5, "analysi": 3, "argument": 5, "assembli": 2, "base": 3, "bisulfit": 3, "citat": 0, "code": [0, 4], "conda": 4, "contact": 0, "conversionr": 5, "dedupl": 5, "descript": 5, "dna": 3, "docker": 4, "document": 3, "exampl": [1, 5], "file": 2, "format": 2, "fragment": 2, "gaf": 2, "genom": [3, 5], "gfa": 2, "graph": [2, 3], "graphic": 2, "index": 5, "instal": 4, "issu": 0, "main": 5, "map": 2, "methyl": [2, 3], "methylcal": 5, "methylgraph": 3, "option": 5, "pip": 4, "prerequisit": 4, "process": 5, "refer": 2, "requir": 5, "sequenc": 3, "sourc": [0, 4], "toi": 1, "tracker": 0, "type": 2, "u": 0, "us": [3, 4], "usag": 5, "via": 4, "welcom": 3, "whole": 3}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Align": [[5, "align"]], "Citation": [[0, "citation"]], "Contact us": [[0, null]], "ConversionRate": [[5, "conversionrate"]], "Deduplication": [[5, "deduplication"]], "Description": [[5, "description"], [5, "id1"], [5, "id5"], [5, "id9"], [5, "id13"]], "Example Usage": [[5, "example-usage"], [5, "id2"], [5, "id6"], [5, "id10"], [5, "id14"]], "File Types": [[2, null]], "GAF: Graphical mApping Format": [[2, "gaf-graphical-mapping-format"]], "GFA: Graphical Fragment Assembly format": [[2, "gfa-graphical-fragment-assembly-format"]], "Genome Indexing": [[5, "genome-indexing"]], "Install via CONDA": [[4, "install-via-conda"]], "Install via PIP": [[4, "install-via-pip"]], "Installation": [[4, null]], "Main": [[5, "main"]], "MethylCall": [[5, "methylcall"]], "Optional arguments": [[5, "optional-arguments"], [5, "id4"], [5, "id8"], [5, "id12"]], "Prerequisites": [[4, "prerequisites"]], "Process": [[5, null]], "Reference:": [[2, "reference"]], "Required arguments": [[5, "required-arguments"], [5, "id3"], [5, "id7"], [5, "id11"], [5, "id15"]], "Source code": [[4, "source-code"]], "Source code and issue tracker": [[0, "source-code-and-issue-tracker"]], "Toy Example": [[1, null]], "Use Docker": [[4, "use-docker"]], "Welcome to methylGrapher\u2019s documentation!": [[3, null]], "methyl: Graph methylation file": [[2, "methyl-graph-methylation-file"]], "methylGrapher: Genome-Graph-Based DNA Methylation Analysis Using Whole Genome Bisulfite Sequencing": [[3, "methylgrapher-genome-graph-based-dna-methylation-analysis-using-whole-genome-bisulfite-sequencing"]]}, "docnames": ["contact", "example", "filetype", "index", "install", "process"], "envversion": {"sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["contact.rst", "example.rst", "filetype.rst", "index.rst", "install.rst", "process.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"0": [2, 5], "1": 5, "10": 4, "11": 4, "12": 4, "20": 5, "3": 4, "4096": 5, "9": 4, "For": 2, "If": 0, "The": [1, 2, 4, 5], "activ": 4, "ad": 4, "adapt": 4, "addition": 5, "against": 5, "align": [3, 4], "all": 1, "along": 1, "also": 1, "an": 0, "analyz": 1, "ani": 0, "anoth": 5, "ar": [1, 4, 5], "assembli": 3, "base": [2, 5], "bash": 1, "batch_siz": 5, "below": 1, "best": 4, "bioconda": 4, "blob": [1, 2], "both": 4, "c": 5, "calcul": 2, "call": 5, "can": [0, 1], "cd": 1, "citat": 3, "clone": 1, "code": 3, "com": [0, 1, 2], "compris": 5, "conda": 3, "construct": 2, "contact": 3, "context": 2, "convers": 5, "conversionr": 3, "convert": 5, "coordin": 2, "core": 5, "coverag": 2, "creat": 4, "cytosin": 2, "data": 1, "dedupl": [3, 4], "default": 5, "deplet": 5, "design": 2, "desir": 5, "detail": [2, 5], "direct": 5, "directori": 5, "discard_multimap": 5, "doc": 2, "docker": 3, "document": 5, "each": 2, "ensur": 4, "essenti": 5, "estim": 5, "exampl": 3, "example_b": 1, "execut": 5, "explan": 2, "extract": [2, 5], "facilit": 5, "fasta": 5, "fastq": [1, 5], "fastq1_file_path": 5, "fastq2_file_path": 5, "fastuniq": [4, 5], "file": [1, 3, 5], "final": 1, "find": 0, "firstli": 5, "folder": 1, "format": [1, 3, 5], "four": 5, "fq1": 5, "fq2": 5, "fragment": 3, "from": 5, "fulli": 5, "further": 5, "g": 5, "gaf": [3, 5], "galor": 4, "gener": [1, 5], "genom": [1, 2], "gfa": [1, 3, 5], "gfa1": 2, "gfa_file_path": 5, "gfatool": 2, "giraff": [4, 5], "git": 1, "github": [0, 1, 2], "glossari": 2, "graph": [1, 5], "graphic": 3, "ha": 4, "have": 0, "here": 0, "how": 1, "html": 2, "http": [0, 1, 2], "hub": 4, "i": [1, 2, 4, 5], "id": 2, "includ": [2, 5], "incorpor": 1, "index": 3, "index_prefix": 5, "instal": [1, 3], "involv": 5, "io": 2, "issu": 3, "its": 2, "lambda_phage_genome_path": 5, "lambdaphagefastafilepath": 5, "lastli": 5, "latest": 4, "lh3": 2, "list": 2, "lp": 5, "mai": 5, "main": [1, 3], "make_exampl": 1, "map": 3, "master": 2, "md": 2, "methyl": [1, 5], "methylcal": 3, "methylgraph": [0, 1, 2, 4, 5], "minigraph": 2, "minimum_ident": 5, "minimum_mapq": 5, "more": 5, "must": 4, "n": 4, "next": 5, "number": 2, "one": 5, "oper": 5, "our": 0, "output": [1, 2], "output_prefix": 5, "outputprefix": 5, "packag": 1, "pair": 2, "pangenom": 2, "path": 4, "percentag": 2, "perform": 4, "pip": 3, "pipelin": 1, "pleas": [0, 5], "posit": 2, "prefix": 5, "preparegenom": 5, "preparegenomeoutputprefix": 5, "prerequisit": 3, "process": 3, "provid": [2, 5], "py": 1, "python": 4, "python3": 4, "question": 0, "rate": 5, "ratio": 2, "read": 2, "recommend": 4, "refer": [3, 5], "rel": 2, "relat": 2, "repo": 0, "result": 5, "rgfa": 2, "script": 1, "segment": 2, "sh": 1, "show": 1, "simul": 1, "singl": 5, "sort": 5, "sourc": 3, "spec": 2, "specifi": 2, "spike": 5, "step": 5, "strand": 2, "submit": 0, "subsequ": 5, "sum": 2, "support": 2, "t": 5, "test": 4, "thank": 0, "theworkingdir": 5, "thi": [1, 5], "threads_us": 5, "threadsus": 5, "toi": 3, "toolkit": 4, "total": 2, "tracker": 3, "transform": 5, "trim": 4, "twlab": [0, 1], "two": 5, "type": 3, "u": 3, "undergo": 5, "unmethyl": 2, "us": 1, "verifi": 1, "version": 4, "vg": [2, 4, 5], "via": 3, "whether": 2, "which": [1, 2], "within": 1, "work": 5, "work_dir": 5, "workflow": 1, "working_directori": 5, "would": 4, "y": 5, "you": [0, 1, 4, 5], "your": 4, "yourgfafilepath": 5}, "titles": ["Contact us", "Toy Example", "File Types", "Welcome to methylGrapher\u2019s documentation!", "Installation", "Process"], "titleterms": {"": 3, "align": 5, "analysi": 3, "argument": 5, "assembli": 2, "base": 3, "bisulfit": 3, "citat": 0, "code": [0, 4], "conda": 4, "contact": 0, "conversionr": 5, "dedupl": 5, "descript": 5, "dna": 3, "docker": 4, "document": 3, "exampl": [1, 5], "file": 2, "format": 2, "fragment": 2, "gaf": 2, "genom": [3, 5], "gfa": 2, "graph": [2, 3], "graphic": 2, "index": 5, "instal": 4, "issu": 0, "main": 5, "map": 2, "methyl": [2, 3], "methylcal": 5, "methylgraph": 3, "option": 5, "pip": 4, "prerequisit": 4, "process": 5, "refer": 2, "requir": 5, "sequenc": 3, "sourc": [0, 4], "toi": 1, "tracker": 0, "type": 2, "u": 0, "us": [3, 4], "usag": 5, "via": 4, "welcom": 3, "whole": 3}}) \ No newline at end of file diff --git a/docs/source/process.rst b/docs/source/process.rst index f281181..51f995a 100644 --- a/docs/source/process.rst +++ b/docs/source/process.rst @@ -11,9 +11,6 @@ Description ~~~~~~~~~~~~~~~~~~~~~~ This process involves transforming the genome graph from GFA format into two fully converted genome graphs: one depleted of C bases and another depleted of G bases. Additionally, if desired, you may include a spike-in genome in FASTA format to estimate the conversion rate in a single step further. -.. note:: - It's important to note that in the C-to-T genome graph, if both C and T segments are positioned identically—meaning they share the same parent and child segments, and each has only one parent and one child—the T segments are removed. Additionally, any associated links and paths are redirected to the corresponding C segment. This principle also applies in the G-to-A genome graph. - Example Usage ~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: shell diff --git a/src/config.ini b/src/config.ini index f7b5e47..b1c56c1 100644 --- a/src/config.ini +++ b/src/config.ini @@ -1,6 +1,7 @@ [default] # Provide the path to the executable vg_path = vg +ga_path = GraphAligner fastuniq_path = fastuniq diff --git a/src/gfa.py b/src/gfa.py index cb44b7b..f5c2c7f 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -345,6 +345,32 @@ def get_sequences_by_segment_ID(self, segment_IDs): return res +# Store segment length in memory +class GraphicalFragmentAssemblySegmentLengthMemory(object): + + def __init__(self): + self.clear() + + def clear(self): + self._segment_length = {} + + def parse(self, gfa_file): + with open(gfa_file) as gfa_fh: + for l in gfa_fh: + if l[0] not in "S": + continue + + l = l.strip().split("\t") + + if l[0] == "S": + rt, segID, seq, *tags = l + # sequence, tag, links, parent_links_count + self._segment_length[segID] = len(seq) + + + def get_sequence_length_by_segment_ID(self, segment_ID): + return self._segment_length[segment_ID] + diff --git a/src/longread.py b/src/longread.py new file mode 100644 index 0000000..639bc7d --- /dev/null +++ b/src/longread.py @@ -0,0 +1,642 @@ + + +import os +import re +import sys +import time +import gzip +import argparse + +import gfa +import utility + + + + + + + + +utl = utility.Utility() + +cytosine_methylation_regex = re.compile(r"\+(\(.*\))\-(\(.*\))") +cigar_regex = re.compile(r"\d+[MID]") + + +def path_to_list(path_str): + # Example: <29983488>29983486<29983485<29983484 + res = [] + direction = True + segment_ID = "" + for c in path_str: + if c in "<>": + if c == "<": + direction = False + elif c == ">": + direction = True + else: + segment_ID += c + continue + + res.append((segment_ID, direction)) + segment_ID = "" + + res.append((segment_ID, direction)) + res.pop(0) + + return res + + +def debug_get_fasta_by_read_name(read_name, fasta_fp): + with open(fasta_fp) as f: + for l in f: + if l.startswith(">"): + if l[1:].strip() == read_name: + return f.readline().strip() + return None + +def sam_bam_line_reader(file_path): + lower_fp = file_path.lower() + if lower_fp.endswith(".sam"): + with open(file_path, 'r') as f: + for line in f: + yield line + elif lower_fp.endswith(".bam"): + samtool_cmd = utility.SystemExecute() + samtool_out, samtool_err = samtool_cmd.execute(f"samtools view -h {file_path}", ) + for line in samtool_out: + yield str(line, 'utf-8') + samtool_cmd.wait() + + + +def bam_sam_to_fasta(bsam_fp, fasta_fp): + # Explore sam file + + output_fasta_fh = None + if utl.isGzip(fasta_fp): + output_fasta_fh = gzip.open(fasta_fp, 'wt') + else: + output_fasta_fh = open(fasta_fp, 'w') + + + no_methylation_tag = 0 + read_line_count = 0 + + + durations = [0] * 10 + timestamps = [0] * 10 + next_report = 10 + + + for l in sam_bam_line_reader(bsam_fp): + # Skip header + if l.startswith('@'): + continue + + + if timestamps[0] == 0: + ts = time.time() + for i in range(10): + timestamps[i] = ts + + durations[0] += time.time() - timestamps[0] + + if sum(durations) > next_report: + print(durations) + next_report += 10 + + line = l.strip().split('\t') + basic_info = line[:11] + tags_list = line[11:] + tags = {} + for tag_str in tags_list: + tag = [tag_str[:4], tag_str[5:]] + if tag_str.startswith("Ml:B:C"): + tag = ["Ml:B:C", tag_str[7:]] + tags[tag[0]] = tag[1] + + read_name = basic_info[0] + read_seq = basic_info[9].upper() + + if 'Mm:Z' not in tags: + no_methylation_tag += 1 + continue + + if "Ml:B:C" not in tags: + no_methylation_tag += 1 + continue + + mm_tag = tags['Mm:Z'] + ml_tag = tags["Ml:B:C"] + + if mm_tag.endswith(";"): + mm_tag = mm_tag[:-1] + if ml_tag.endswith(";"): + ml_tag = ml_tag[:-1] + + mm_tag = mm_tag.split(",") + ml_tag = ml_tag.split(",") + + if mm_tag[0] != 'C+m': + # print(mm_tag[0], "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + continue + else: + mm_tag = mm_tag[1:] + + if len(ml_tag) in [0, 1]: + continue + + mm_tag = [int(x) for x in mm_tag] + ml_tag = [int(x) for x in ml_tag] + assert len(mm_tag) == len(ml_tag) + + cg_count = read_seq.count("C") + + cytosines_positions = [] + cpg_positions = [] + for i, nuc in enumerate(read_seq): + if nuc == "C": + cytosines_positions.append(i) + if i < len(read_seq) - 1 and read_seq[i + 1] == "G": + cpg_positions.append(i) + + methylated_cytosines = [] + unmethylated_cytosines = [] + nth_cytosine = 0 + for i in range(len(mm_tag)): + interval = mm_tag[i] + 1 + if nth_cytosine == 0: + nth_cytosine = interval - 1 + else: + nth_cytosine += interval + # print(nth_cytosine, interval) + + methylated = ml_tag[i] > 127 + if methylated: + methylated_cytosines.append(cytosines_positions[nth_cytosine]) + else: + unmethylated_cytosines.append(cytosines_positions[nth_cytosine]) + + cpg_covered_count = len( + set(cpg_positions).intersection(set(methylated_cytosines).union(set(unmethylated_cytosines)))) + if cpg_covered_count < len(cpg_positions) - 5: + # print(read_line_count, cpg_covered_count, len(cpg_positions)) + # continue + pass + + # print(read_name) + + # print(read_line_count) + # print(read_name) + # print(l) + # print(len(read_seq)) + # print(line) + # print(basic_info[:9]) + # print(tags) + # print(cg_count) + # print(len(mm_tag)) + # print(len(ml_tag)) + + # print(mm_tag[:40]) + # print(cytosines_positions[:80]) + # print() + # print(methylated_cytosines[:20]) + # print(cpg_positions[:20]) + + # print(len(methylated_cytosines), len(cpg_positions)) + # print(methylated_cytosines[-1], cpg_positions[-1]) + + # print('\n') + + methylated_cytosines = [str(x) for x in methylated_cytosines] + unmethylated_cytosines = [str(x) for x in unmethylated_cytosines] + met_str = f"+({','.join(methylated_cytosines)})-({','.join(unmethylated_cytosines)})" + fasta_entry = f">{read_name}{met_str}\n{read_seq}\n\n" + + # print(fasta_entry) + output_fasta_fh.write(fasta_entry) + + read_line_count += 1 + + + tnow = time.time() + durations[1] += tnow - timestamps[0] + timestamps[0] = tnow + + + + + output_fasta_fh.close() + return None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def generate_fp_within_wd(wd, file_name): + return os.path.join(wd, file_name) + +def generate_fasta_fp(wd): + return generate_fp_within_wd(wd, "input.fasta") + + +def extract_methylation_single_thread(gfa_fp, wd, threads=1, debug=False): + """ + Extract methylation information + :param wd: working directory + :param gfa: genome graph file path in GFA format + :return: methylation information + """ + + counter_pass = 0 + counter_issue_no_methylation = 0 + counter_issue = 0 + counter_total = 0 + + methylation_result = {} + + time_start = time.time() + + gfa_instance = gfa.GraphicalFragmentAssemblySegmentLengthMemory() + gfa_instance.parse(gfa_fp) + + if debug: + # Just in case the sequence is needed for debugging + gfa_seq_instance = gfa.GraphicalFragmentAssemblyMemorySegmentOptimized() + gfa_seq_instance.parse(gfa_fp) + + + + with open(generate_fp_within_wd(wd, "align.gaf")) as f: + for l in f: + + l = l.strip().split("\t") + # print(l) + + counter_total += 1 + if counter_total % 1000 == 0: + duration = time.time() - time_start + # TODO better logging + # print(counter_total, counter_pass, counter_issue_no_methylation, counter_issue, duration) + + # Read the graph alignment file (GAF) + read_name = l[0] + + query_len = int(l[1]) + query_start = int(l[2]) + query_end = int(l[3]) + + strand = l[4] + + path_str = l[5] + path_list = path_to_list(path_str) + + path_len = int(l[6]) + path_start = int(l[7]) + path_end = int(l[8]) + + matched_len = int(l[9]) + alignment_block_len = int(l[10]) + mapq = int(l[11]) + + tag_list = l[12:] + tags = {} + for tag_str in tag_list: + tname, tdt, tv = tag_str.split(":") + if tdt == "i": + tv = int(tv) + elif tdt == "f": + tv = float(tv) + tags[tname] = tv + # print(tname, tv) + + # Example: 42M1I5M1D23M1I1M1I1M1I13M1D41M1D35M1D4M1D + cigar = [] + cigar_str = tags["cg"] + for c in cigar_regex.findall(cigar_str): + cigar.append((int(c[:-1]), c[-1])) + + alignment_block_type = [] + block_start = 0 + block_end = 0 + for length, alignment_type in cigar: + block_end += length + + alignment_block_type.append((block_start, block_end, alignment_type)) + block_start = block_end + alignment_block_type.append((block_start, block_end, alignment_type)) + + + + # Sanity checks + + # make sure there is methylation information here, and there is no conflict + mc = cytosine_methylation_regex.findall(read_name) + if len(mc) != 1: + counter_issue_no_methylation += 1 + continue + + methylated_cytosines = mc[0][0][1:-1].split(",") + unmethylated_cytosines = mc[0][1][1:-1].split(",") + + if len(methylated_cytosines) + len(unmethylated_cytosines) == 0: + counter_issue_no_methylation += 1 + continue + + methylated = list(map(int, list(filter(lambda x: x != "", methylated_cytosines)))) + unmethylated = list(map(int, list(filter(lambda x: x != "", unmethylated_cytosines)))) + + + # Just to make sure the cigar interpretation is correct + cigar_freq = {} + for c in cigar: + if c[1] not in cigar_freq: + cigar_freq[c[1]] = 0 + cigar_freq[c[1]] += c[0] + assert path_end - path_start == cigar_freq["M"] + cigar_freq.get("D", 0) + assert query_end - query_start == cigar_freq["M"] + cigar_freq.get("I", 0) + + reconstructed_cigar = "" + for c in cigar: + reconstructed_cigar += str(c[0]) + c[1] + assert reconstructed_cigar == cigar_str + + + # if ">" in path_str: + # continue + + + # Just to make sure the path interpretation is correct + path_len_constructed = 0 + path_segment_length = {} + for segID, direction in path_list: + seq_length = gfa_instance.get_sequence_length_by_segment_ID(segID) + path_len_constructed += seq_length + path_segment_length[segID] = seq_length + assert path_len_constructed == path_len + + + # Debugging + # query_seq = debug_get_fasta_by_read_name(read_name, "./short.fasta") + query_seq = "" + path_seq = "" + + """ + for i, (segID, direction) in enumerate(path_list): + seq = gfa_instance.get_sequence_by_segment_ID(segID) + if not direction: + seq = seq_reverse_complement(seq) + path_seq += seq + """ + + + pos_in_read = -1 + pos_in_path = -1 + for alignment_block_len, alignment_block_type in cigar: + + for ai in range(alignment_block_len): + if alignment_block_type == "M": + pos_in_read += 1 + pos_in_path += 1 + + pos_in_abs_path = path_start + pos_in_path + pos_in_abs_read = query_start + pos_in_read + + met_flag = None + if pos_in_abs_read in methylated: + met_flag = "M" + elif pos_in_abs_read in unmethylated: + met_flag = "U" + + if met_flag is not None: + + corresponding_segment_ID = None + corresponding_segment_pos = pos_in_abs_path + for segID, direction in path_list: + if corresponding_segment_pos < path_segment_length[segID]: + corresponding_segment_ID = segID + if not direction: + # TODO need double check + corresponding_segment_pos = path_segment_length[segID] - corresponding_segment_pos - 1 + break + + corresponding_segment_pos -= path_segment_length[segID] + + if debug: + segment_seq = gfa_seq_instance.get_sequence_by_segment_ID(corresponding_segment_ID) + csp1 = corresponding_segment_pos + csp2 = corresponding_segment_pos + 2 + if csp2 > len(segment_seq): + csp2 = csp1 + 1 + + if not direction: + csp2 = corresponding_segment_pos + csp1 = corresponding_segment_pos - 2 + if csp1 < 0: + csp1 = 0 + + # This is to check whether I am aligning CG within the read to CG in the segment + print(path_seq[pos_in_abs_path: pos_in_abs_path+2], query_seq[pos_in_abs_read:pos_in_abs_read+2], segment_seq[csp1: csp2], met_flag) + + if corresponding_segment_ID not in methylation_result: + methylation_result[corresponding_segment_ID] = {} + if corresponding_segment_pos not in methylation_result[corresponding_segment_ID]: + dir_str = "-" if direction else "+" + methylation_result[corresponding_segment_ID][corresponding_segment_pos] = [dir_str, 0, 0] + methylation_result[corresponding_segment_ID][corresponding_segment_pos][2] += 1 + if met_flag == "M": + methylation_result[corresponding_segment_ID][corresponding_segment_pos][1] += 1 + + + elif alignment_block_type == "I": + pos_in_read += 1 + elif alignment_block_type == "D": + pos_in_path += 1 + else: + raise Exception("Unknown alignment block type") + + # print(path_len_constructed, path_len) + # print(cigar) + # print(alignment_block_type) + + # print(path_start, path_end, path_end - path_start) + # print(query_start, query_end, query_end-query_start) + # print(cigar_freq) + + # print(path_seq[433:533]) + # print(query_seq[:100]) + # print(path_str) + counter_pass += 1 + # break + + + + for segment_ID in sorted(methylation_result): + for pos in sorted(methylation_result[segment_ID]): + strand, met, cov = methylation_result[segment_ID][pos] + unmet = cov - met + l = f"{segment_ID}\t{pos}\t{strand}\t{met}\t{unmet}\t{cov}" + print(l) + return None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def prepare_fasta(basecall, wd, threads=1, debug=False): + """ + Prepare fasta file from long read base call BAM/SAM file + :param basecall: long read base call BAM/SAM file + :param wd: working directory + :return: fasta file + """ + + bam_sam_to_fasta(basecall, generate_fasta_fp(wd)) + return None + + +def align(gfa, wd, threads=1, debug=False): + """ + Align long reads to genome graph + :param wd: working directory + :param gfa: genome graph file path in GFA format + """ + + # TODO use non-default version of GraphAligner + cmd = f"GraphAligner -t {threads} --cigar-match-mismatch -g {gfa} -f {generate_fasta_fp(wd)} -a {generate_fp_within_wd(wd, 'align.gaf')} -x vg" + align_exe = utility.SystemExecute() + + # TODO + # align_exe.execute(cmd, stdout=generate_fp_within_wd(wd, 'align.log'), stderr=generate_fp_within_wd(wd, 'align.err')) + print(cmd) + + return None + +def extract_methylation(gfa_fp, wd, threads=1, debug=False): + """ + Extract methylation information + :param wd: working directory + :param gfa_fp: genome graph file path in GFA format + :return: methylation information + """ + + extract_methylation_single_thread(gfa_fp, wd, debug=debug) + + return None + + +def main(basecall, gfa_fp, wd, threads=1, debug=False): + """ + One step to run all + :param basecall: long read base call BAM/SAM file + :param gfa_fp: genome graph file path in GFA format + :param wd: working directory + :return: methylation information + """ + + prepare_fasta(basecall, wd, threads=threads, debug=debug) + align(gfa_fp, wd, threads=threads, debug=debug) + extract_methylation(gfa_fp, wd, threads=threads, debug=debug) + + return None + + + + + +if __name__ == "__main__": + + pass diff --git a/src/main.py b/src/main.py index 55eb3f9..2b48405 100644 --- a/src/main.py +++ b/src/main.py @@ -4,7 +4,7 @@ __copyright__ = "Copyright 2023-2024, Ting Wang Lab" __credits__ = ["Juan Macias"] __license__ = "MIT" -__version__ = "0.1.1" +__version__ = "0.1.2" __maintainer__ = "Wenjin Zhang" __email__ = "wenjin@wustl.edu" @@ -98,18 +98,23 @@ else: freport.write(f"Insert lambda phage genome into genome graph as segment: {lambda_segment_id}\n") - graph_trim_flag = kvargs.get("trim", "Y").lower() in "yestrue" - g = gfa.GraphicalFragmentAssemblyMemory() - g.parse(original_gfa_with_lambda, keep_link=True) - g.write_converted(gfa_c2t, "C", "T", SNV_trim=graph_trim_flag) + graph_trim_flag = kvargs.get("trim", "N").lower() in "yestrue" - del g - g = gfa.GraphicalFragmentAssemblyMemory() - g.parse(original_gfa_with_lambda, keep_link=True) - g.write_converted(gfa_g2a, "G", "A", SNV_trim=graph_trim_flag) + if graph_trim_flag: + g = gfa.GraphicalFragmentAssemblyMemory() + g.parse(original_gfa_with_lambda, keep_link=True) + g.write_converted(gfa_c2t, "C", "T", SNV_trim=graph_trim_flag) - del g + del g + + g = gfa.GraphicalFragmentAssemblyMemory() + g.parse(original_gfa_with_lambda, keep_link=True) + g.write_converted(gfa_g2a, "G", "A", SNV_trim=graph_trim_flag) + + del g + else: + utility.gfa_converter(original_gfa_with_lambda, prefix+".wl", compress=False) for gfa_file_path in [gfa_c2t, gfa_g2a]: index_prefix = gfa_file_path[:-4] diff --git a/src/mainL.py b/src/mainL.py new file mode 100644 index 0000000..2637bc1 --- /dev/null +++ b/src/mainL.py @@ -0,0 +1,167 @@ + + + +import os +import re +import sys +import argparse + + +import longread + + + + + + + + +if __name__ == "__main__": + + # Arugment parser + parser = argparse.ArgumentParser(description='methylGrapher for long reads') + + subparsers = parser.add_subparsers(dest='command', help='Sub-command help') + + + # main + parser_main = subparsers.add_parser('main', help='One step to run all') + parser_main.add_argument('-basecall', help='Long read base call BAM/SAM file', required=True) + parser_main.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_main.add_argument('-t', help='number of threads', default=1, type=int) + parser_main.add_argument('-debug', help='debug mode') + parser_main.add_argument('-wd', help='working directory', default="./") + + + # 1 prepare fasta + parser_prepare_fasta = subparsers.add_parser('prepare_fasta', help='Prepare fasta file') + parser_prepare_fasta.add_argument('-basecall', help='Long read base call BAM/SAM file', required=True) + parser_prepare_fasta.add_argument('-wd', help='working directory', default="./") + parser_prepare_fasta.add_argument('-t', help='number of threads', default=1, type=int) + parser_prepare_fasta.add_argument('-debug', help='debug mode') + + + # 2 align + parser_align = subparsers.add_parser('align', help='Align long reads to genome graph') + parser_align.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_align.add_argument('-wd', help='Long read fasta file', default="./") + parser_align.add_argument('-t', help='number of threads', default=1, type=int) + parser_align.add_argument('-debug', help='debug mode') + + # 3 methylation extraction + parser_extraction = subparsers.add_parser('extraction', help='Extract methylation information') + parser_extraction.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_extraction.add_argument('-wd', help='working directory', default="./") + parser_extraction.add_argument('-t', help='number of threads', default=1, type=int) + parser_extraction.add_argument('-debug', help='debug mode') + + + # TODO add function execution for subcommands + + + + args = parser.parse_args() + + if args.command == 'main': + print("Running methylGrapherL with main function", file=sys.stderr) + + + wd = "./" + thread = 1 + debug = False + basecall_fp = args.basecall + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + + + # Log parameters + print("Long read base call file path: ", basecall_fp, file=sys.stderr) + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + # Run main function + longread.main(basecall_fp, gfa_fp, wd) + + + elif args.command == 'prepare_fasta': + print("Prepare fasta", file=sys.stderr) + + wd = "./" + thread = 1 + debug = False + basecall_fp = args.basecall + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + # Log parameters + print("Long read base call file path: ", basecall_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + # Run prepare fasta function + longread.prepare_fasta(basecall_fp, wd) + + + + elif args.command == 'align': + print("Align") + + wd = "./" + thread = 1 + debug = False + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + + # Log parameters + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + + + elif args.command == 'extraction': + wd = "./" + thread = 1 + debug = False + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + # Log parameters + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + else: + print('No command provided') + + diff --git a/src/mainLOLD.py b/src/mainLOLD.py new file mode 100644 index 0000000..b8b1b4a --- /dev/null +++ b/src/mainLOLD.py @@ -0,0 +1,578 @@ + + + +import os +import re +import sys +import time + +import gfa +import mcall +import utility + + + + +class tmpHelp: + def __init__(self): + self._help_text = """ + Usage: main.py [options] + + Commands: + prepareGenome Prepare genome for methylation calling + callMethylation Call methylation from nanopore reads + + Options: + -h, --help Show this help message and exit + -t, --thread Number of threads to use + """ + + def help_text(self): + return self._help_text + + +help = tmpHelp() + + +utl = utility.Utility() + + + + + + +# Step1: SAM/BAM file to fasta file +# Step2: Alignment +# Step3: Methylation extraction + + + + + +cytosine_methylation_regex = re.compile(r"\+(\(.*\))\-(\(.*\))") + + +def path_to_list(path_str): + # Example: <29983488>29983486<29983485<29983484 + res = [] + direction = True + segment_ID = "" + for c in path_str: + if c in "<>": + if c == "<": + direction = False + elif c == ">": + direction = True + else: + segment_ID += c + continue + + res.append((segment_ID, direction)) + segment_ID = "" + + res.append((segment_ID, direction)) + res.pop(0) + + return res + + +def debug_get_fasta_by_read_name(read_name, fasta_fp): + with open(fasta_fp) as f: + for l in f: + if l.startswith(">"): + if l[1:].strip() == read_name: + return f.readline().strip() + return None + + +def seq_reverse_complement(seq): + return seq.translate(str.maketrans("ATCG", "TAGC"))[::-1] + + +gfa_fp = f"/scratch/wzhang/ref/graph/wgbs_bspg/bspg.wl.gfa" + + +# Store entire GFA in memory +class GraphicalFragmentAssemblyMemory(object): + + def __init__(self): + self.clear() + + def clear(self): + self._header = "" + self._segment = {} + self._walk = {} + + self._original_gfa_path = "" + + def parse(self, gfa_file): + self._original_gfa_path = gfa_file + # TODO when not keep_tag, keep_link, forbid to write GFA and access those two + for l in open(gfa_file): + if l[0] not in "HSL": + continue + + l = l.strip().split("\t") + + if l[0] == "H": + self._header = "\t".join(l[1:]) + + if l[0] == "S": + rt, segID, seq, *tags = l + # sequence, tag, links, parent_links_count + self._segment[segID] = seq + + else: + continue + + def get_sequence_by_segment_ID(self, segment_ID): + return self._segment[segment_ID] + + def get_sequence_length_by_segment_ID(self, segment_ID): + return len(self._segment[segment_ID]) + + def get_sequences_by_segment_ID(self, segment_IDs): + res = {} + for sID in segment_IDs: + res[sID] = self.get_sequence_by_segment_ID(sID) + return res + + def get_tag_by_segment_ID(self, segment_ID): + raise NotImplementedError + + def get_parent_link_count(self, segment_ID): + return self._segment[segment_ID][3] + + def get_parent_links(self, segment_ID): + raise NotImplementedError + + def get_child_links(self, segment_ID): + return self._segment[segment_ID][2] + + def all_segment_IDs(self): + return self._segment.keys() + + +gfa_instance = GraphicalFragmentAssemblyMemory() +gfa_instance.parse(gfa_fp) + + + + + + + + + + + + + +def bsam_to_fasta(bsam_fp, fasta_fp): + # Explore sam file + + output_fasta = open(fasta_fp, "w") + + no_methylation_tag = 0 + with open(bsam_fp) as f: + read_line_count = 0 + for l in f: + # Skip header + if l.startswith('@'): + continue + + line = l.strip().split('\t') + basic_info = line[:11] + tags_list = line[11:] + tags = {} + for tag_str in tags_list: + tag = [tag_str[:4], tag_str[5:]] + if tag_str.startswith("Ml:B:C"): + tag = ["Ml:B:C", tag_str[7:]] + tags[tag[0]] = tag[1] + + read_name = basic_info[0] + read_seq = basic_info[9].upper() + + if 'Mm:Z' not in tags: + no_methylation_tag += 1 + continue + + if "Ml:B:C" not in tags: + no_methylation_tag += 1 + continue + + mm_tag = tags['Mm:Z'] + ml_tag = tags["Ml:B:C"] + + if mm_tag.endswith(";"): + mm_tag = mm_tag[:-1] + if ml_tag.endswith(";"): + ml_tag = ml_tag[:-1] + + mm_tag = mm_tag.split(",") + ml_tag = ml_tag.split(",") + + if mm_tag[0] != 'C+m': + # print(mm_tag[0], "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + continue + else: + mm_tag = mm_tag[1:] + + if len(ml_tag) in [0, 1]: + continue + + mm_tag = [int(x) for x in mm_tag] + ml_tag = [int(x) for x in ml_tag] + assert len(mm_tag) == len(ml_tag) + + cg_count = read_seq.count("C") + + cytosines_positions = [] + cpg_positions = [] + for i, nuc in enumerate(read_seq): + if nuc == "C": + cytosines_positions.append(i) + if i < len(read_seq) - 1 and read_seq[i + 1] == "G": + cpg_positions.append(i) + + methylated_cytosines = [] + unmethylated_cytosines = [] + nth_cytosine = 0 + for i in range(len(mm_tag)): + interval = mm_tag[i] + 1 + if nth_cytosine == 0: + nth_cytosine = interval - 1 + else: + nth_cytosine += interval + # print(nth_cytosine, interval) + + methylated = ml_tag[i] > 127 + if methylated: + methylated_cytosines.append(cytosines_positions[nth_cytosine]) + else: + unmethylated_cytosines.append(cytosines_positions[nth_cytosine]) + + cpg_covered_count = len( + set(cpg_positions).intersection(set(methylated_cytosines).union(set(unmethylated_cytosines)))) + if cpg_covered_count < len(cpg_positions) - 5: + print(read_line_count, cpg_covered_count, len(cpg_positions)) + continue + # print(read_name) + + # print(read_line_count) + # print(read_name) + # print(l) + # print(len(read_seq)) + # print(line) + # print(basic_info[:9]) + # print(tags) + # print(cg_count) + # print(len(mm_tag)) + # print(len(ml_tag)) + + # print(mm_tag[:40]) + # print(cytosines_positions[:80]) + # print() + # print(methylated_cytosines[:20]) + # print(cpg_positions[:20]) + + # print(len(methylated_cytosines), len(cpg_positions)) + # print(methylated_cytosines[-1], cpg_positions[-1]) + + # print('\n') + + methylated_cytosines = [str(x) for x in methylated_cytosines] + unmethylated_cytosines = [str(x) for x in unmethylated_cytosines] + met_str = f"+({','.join(methylated_cytosines)})-({','.join(unmethylated_cytosines)})" + fasta_entry = f">{read_name}{met_str}\n{read_seq}\n\n" + + # print(fasta_entry) + output_fasta.write(fasta_entry) + + read_line_count += 1 + if read_line_count > 10: + pass + + output_fasta.close() + return None + + + + + +counter_pass = 0 +counter_issue_no_methylation = 0 +counter_issue = 0 +counter_total = 0 + +methylation_result = {} + +time_start = time.time() +with open(output_fp) as f: + for l in f: + + l = l.strip().split("\t") + # print(l) + + counter_total += 1 + if counter_total % 1000 == 0: + duration = time.time() - time_start + print(counter_total, counter_pass, counter_issue_no_methylation, counter_issue, duration) + + read_name = l[0] + + query_len = int(l[1]) + query_start = int(l[2]) + query_end = int(l[3]) + + strand = l[4] + + path_str = l[5] + # if ">" in path_str: + # continue + + path_len = int(l[6]) + path_start = int(l[7]) + path_end = int(l[8]) + + matched_len = int(l[9]) + alignment_block_len = int(l[10]) + mapq = int(l[11]) + + tag_list = l[12:] + tags = {} + + # print(tag_list) + + for tag_str in tag_list: + tname, tdt, tv = tag_str.split(":") + if tdt == "i": + tv = int(tv) + elif tdt == "f": + tv = float(tv) + tags[tname] = tv + # print(tname, tv) + + mc = cytosine_methylation_regex.findall(read_name) + if len(mc) != 1: + counter_issue_no_methylation += 1 + continue + + methylated_cytosines = mc[0][0][1:-1].split(",") + unmethylated_cytosines = mc[0][1][1:-1].split(",") + + if len(methylated_cytosines) + len(unmethylated_cytosines) == 0: + counter_issue_no_methylation += 1 + continue + + methylated = list(map(int, list(filter(lambda x: x != "", methylated_cytosines)))) + unmethylated = list(map(int, list(filter(lambda x: x != "", unmethylated_cytosines)))) + + # Example: 42M1I5M1D23M1I1M1I1M1I13M1D41M1D35M1D4M1D + cigar_str = tags["cg"] + cigar = [] + for c in re.findall(r"\d+[MID]", cigar_str): + cigar.append((int(c[:-1]), c[-1])) + + alignment_block_type = [] + block_start = 0 + block_end = 0 + for length, alignment_type in cigar: + block_end += length + + alignment_block_type.append((block_start, block_end, alignment_type)) + block_start = block_end + alignment_block_type.append((block_start, block_end, alignment_type)) + + """ + TODO uncomment + cigar_freq = {} + for c in cigar: + if c[1] not in cigar_freq: + cigar_freq[c[1]] = 0 + cigar_freq[c[1]] += c[0] + + assert path_end - path_start == cigar_freq["M"] + cigar_freq.get("D", 0) + assert query_end - query_start == cigar_freq["M"] + cigar_freq.get("I", 0) + + + # Just to make sure the cigar interpretation is correct + reconstructed_cigar = "" + for c in cigar: + reconstructed_cigar += str(c[0]) + c[1] + assert reconstructed_cigar == cigar_str + """ + + # Just to make sure the path interpretation is correct + path_len_constructed = 0 + path_list = path_to_list(path_str) + for segID, direction in path_list: + seq = gfa_instance.get_sequence_by_segment_ID(segID) + # print(segID, len(seq)) + path_len_constructed += len(seq) + # print(path_str) + assert path_len_constructed == path_len + + # query_seq = debug_get_fasta_by_read_name(read_name, "./short.fasta") + query_seq = "" + path_seq = "" + + """ + for i, (segID, direction) in enumerate(path_list): + seq = gfa_instance.get_sequence_by_segment_ID(segID) + if not direction: + seq = seq_reverse_complement(seq) + path_seq += seq + """ + + path_segment_length = {} + for i, (segID, direction) in enumerate(path_list): + seql = gfa_instance.get_sequence_length_by_segment_ID(segID) + path_segment_length[segID] = seql + + pos_in_read = -1 + pos_in_path = -1 + for alignment_block_len, alignment_block_type in cigar: + + for ai in range(alignment_block_len): + if alignment_block_type == "M": + pos_in_read += 1 + pos_in_path += 1 + + pos_in_abs_path = path_start + pos_in_path + pos_in_abs_read = query_start + pos_in_read + + met_flag = None + if pos_in_abs_read in methylated: + met_flag = "M" + elif pos_in_abs_read in unmethylated: + met_flag = "U" + + if met_flag is not None: + + corresponding_segment_ID = None + corresponding_segment_pos = pos_in_abs_path + for segID, direction in path_list: + if corresponding_segment_pos < path_segment_length[segID]: + corresponding_segment_ID = segID + if not direction: + # TODO need double check + corresponding_segment_pos = path_segment_length[ + segID] - corresponding_segment_pos - 1 + break + + corresponding_segment_pos -= path_segment_length[segID] + + segment_seq = gfa_instance.get_sequence_by_segment_ID(corresponding_segment_ID) + csp1 = corresponding_segment_pos + csp2 = corresponding_segment_pos + 2 + if csp2 > len(segment_seq): + csp2 = csp1 + 1 + + if not direction: + csp2 = corresponding_segment_pos + csp1 = corresponding_segment_pos - 2 + if csp1 < 0: + csp1 = 0 + # print(path_seq[pos_in_abs_path: pos_in_abs_path+2], query_seq[pos_in_abs_read:pos_in_abs_read+2], segment_seq[csp1: csp2], met_flag) + + if corresponding_segment_ID not in methylation_result: + methylation_result[corresponding_segment_ID] = {} + if corresponding_segment_pos not in methylation_result[corresponding_segment_ID]: + dir_str = "-" if direction else "+" + methylation_result[corresponding_segment_ID][corresponding_segment_pos] = [dir_str, 0, 0] + methylation_result[corresponding_segment_ID][corresponding_segment_pos][2] += 1 + if met_flag == "M": + methylation_result[corresponding_segment_ID][corresponding_segment_pos][1] += 1 + + + + + elif alignment_block_type == "I": + pos_in_read += 1 + elif alignment_block_type == "D": + pos_in_path += 1 + else: + raise Exception("Unknown alignment block type") + + # print(path_len_constructed, path_len) + # print(cigar) + # print(alignment_block_type) + + # print(path_start, path_end, path_end - path_start) + # print(query_start, query_end, query_end-query_start) + # print(cigar_freq) + + # print(path_seq[433:533]) + # print(query_seq[:100]) + # print(path_str) + counter_pass += 1 + # break + + + + +if __name__ == "__main__": + args = sys.argv + args.pop(0) + + ga_path = "GraphAligner" + thread = 1 + + # Find config file + try: + config = utility.ConfigParser("config.ini") + vg_path = config.get("default", "ga_path") + thread = config.get("default", "thread") + except: + pass + + try: + thread = int(thread) + except: + thread = 1 + + if len(args) == 0: + print("No command specified. Use 'help' for more information.") + print(help.help_text()) + sys.exit(0) + + command = args.pop(0) + command = command.lower() + if command not in ["help", "-h", "--help", "main"]: + print(f"Unknown command: {command}") + sys.exit(1) + + kvargs = {} + while len(args) > 0: + arg = args.pop(0) + if arg.startswith("-"): + key = arg[1:] + value = args.pop(0) + kvargs[key] = value + else: + print(f"Unknown argument: {arg}\n\n") + print(help.help_text()) + sys.exit(1) + + if "thread" in kvargs: + thread = int(kvargs["thread"]) + del kvargs["thread"] + if "t" in kvargs: + thread = int(kvargs["t"]) + del kvargs["t"] + + + if command in ["help", "-h", "--help"]: + print(help.help_text()) + sys.exit(0) + + + + if command == "preparegenome": + original_gfa_file_path = kvargs["gfa"] + prefix = kvargs["prefix"] + + fn_report = prefix + ".prepare.genome.report.txt" + freport = open(fn_report, "w") \ No newline at end of file diff --git a/src/mgmp.py b/src/mgmp.py new file mode 100644 index 0000000..650c187 --- /dev/null +++ b/src/mgmp.py @@ -0,0 +1,361 @@ + + +import os +import sys +import time +import multiprocessing + + + + +# write a framework for both single threaded and multi-threaded execution + +# 3 main class +# 1. Functions to execute +# 2. Worker class +# 3. Task class + + + + +# The single threaded execution is just one function after another +# The multi-processed execution is more complex + + + +def test_step1(iter): + + i = 0 + while i < 10000: + i += 1 + yield i + + +def test_step2(iter): + + for i in iter: + time.sleep(0.0001) + yield i**2 + + + + + +def worker_function(pid, name, function, input_queue, output_queue, batch_size=1024): + print(f"{name} worker {pid} started") + while True: + try: + tasks = input_queue.get() + if tasks is None: + break + + # print(pid, "tasks") + + result = [] + for r in function(tasks): + result.append(r) + + if len(result) >= batch_size: + output_queue.put(result) + result = [] + + if len(result) > 0: + output_queue.put(result) + + except Exception as e: + print(e) + break + + print(f"{name} worker {pid} finished") + + +def end_function(pid, last_queue): + print(f"End function {pid} started") + while True: + try: + tasks = last_queue.get() + if tasks is None: + break + + # print(pid, len(tasks)) + except Exception as e: + print(e) + break + print(f"End function {pid} finished") + + + +class TaskOLD: + + def __init__(self): + pass + + def run(self, threads=1): + maxsize = 100 + batch_size = 256 + + q0 = multiprocessing.Queue(maxsize=maxsize) + q1 = multiprocessing.Queue(maxsize=maxsize) + q2 = multiprocessing.Queue(maxsize=maxsize) + + + + pool1 = [] + for i in range(1): + p = multiprocessing.Process( + name=f"MGWorke1-{i}", + target=worker_function, + args=(i, "1", test_step1, q0, q1), + kwargs={"batch_size": batch_size} + ) + p.start() + pool1.append(p) + + pool2 = [] + for i in range(threads): + p = multiprocessing.Process( + name=f"MGWorker2-{i}", + target=worker_function, + args=(i, "2", test_step2, q1, q2), + kwargs={"batch_size": batch_size} + ) + p.start() + pool2.append(p) + + p = multiprocessing.Process( + name=f"MGEnd", + target=end_function, + args=(0, q2) + ) + p.start() + + + + + q0.put(1) + q0.put(None) + + for p in pool1: + p.join() + + for p in pool2: + q1.put(None) + + for p in pool2: + p.join() + + q2.put(None) + + + + print("Finished") + + return None + + + + +# TODO how to pass parameters +class Task: + + def __init__(self, steps, batch_size=1024, queue_max_size=100): + + # Example steps + steps_example = [ + {"name": "step1", "function": test_step1, "args": [], "kwargs": {}, "threads": 1}, + {"name": "step2", "function": test_step2, "args": [], "kwargs": {}, "threads": 2}, + ] + + self._steps = steps + self._batch_size = batch_size + self._queue_max_size = queue_max_size + + return + + def run(self): + threads = 10 + maxsize = 100 + batch_size = 256 + + + # N steps need N queues, but for formatting purposes, it need N+1 queues + q0 = multiprocessing.Queue(maxsize=self._queue_max_size) + q1 = multiprocessing.Queue(maxsize=self._queue_max_size) + q2 = multiprocessing.Queue(maxsize=self._queue_max_size) + + + # N steps need N pools + pool1 = [] + for i in range(1): + p = multiprocessing.Process( + name=f"MGWorke1-{i}", + target=worker_function, + args=(i, "1", test_step1, q0, q1), + kwargs={"batch_size": batch_size} + ) + p.start() + pool1.append(p) + + pool2 = [] + for i in range(3): + p = multiprocessing.Process( + name=f"MGWorker2-{i}", + target=worker_function, + args=(i, "2", test_step2, q1, q2), + kwargs={"batch_size": batch_size} + ) + p.start() + pool2.append(p) + + p = multiprocessing.Process( + name=f"MGEnd", + target=end_function, + args=(0, q2) + ) + p.start() + + + + + + # Let the steps begin + q0.put([1]) + q0.put(None) + + for p in pool1: + p.join() + + for p in pool2: + q1.put(None) + + for p in pool2: + p.join() + + q2.put(None) + + + + print("Finished") + + return None + + + +def myfunc(x, y, z): + print(x) + print(y) + print(z) + print(x,y,z) + return x + y + z + + +def test_args(a, b, c, *args, kw=3, **kwargs): + print(a) + print(b) + print(c) + print(kw) + print(args) + print(kwargs) + myfunc(*args) + return None + + + + +def to_worker_function(function, pid, name, input_queue, output_queue, *args, **kwargs): + + + def wrapper(function, pid, name, input_queue, output_queue, *args, **kwargs): + batch_size = kwargs.get("batch_size", 1024) + print(f"{name} worker {pid} started") + while True: + try: + tasks = input_queue.get() + if tasks is None: + break + + # print(pid, "tasks") + + result = [] + for r in function(tasks): + result.append(r) + + if len(result) >= batch_size: + output_queue.put(result) + result = [] + + if len(result) > 0: + output_queue.put(result) + + except Exception as e: + print(e) + break + + print(f"{name} worker {pid} finished") + + return wrapper + + + + + +test_args(1, 2, 3, 100, 200, 300, verbose=True, silent=False) + + +sys.exit(3) + +if __name__ == "__main__": + + + # Single threaded execution + + ts1 = time.time() + s1_gen = test_step1(lambda x: x) + s2_gen = test_step2(s1_gen) + for i in s2_gen: + pass + duration1 = time.time() - ts1 + + + # Multi-threaded execution + + ts2 = time.time() + task = Task(1) + task.run() + duration2 = time.time() - ts2 + + ts5 = time.time() + task = Task(1) + task.run() + duration5 = time.time() - ts5 + + ts10 = time.time() + task = Task(1) + task.run() + duration10 = time.time() - ts10 + + print(duration1, duration2, duration5, duration10) + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/utility.py b/src/utility.py index 0432a2c..c0d417f 100644 --- a/src/utility.py +++ b/src/utility.py @@ -131,8 +131,8 @@ def gfa_converter(input_gfa, output_file_prefix, compress=True, thread=1): l = l.strip().split('\t') if l[0] == 'S': - l[2] = l[2].replace(conversion[0], conversion[1]) - l[2] = l[2].replace(conversion[0].lower(), conversion[1].lower()) + sc = l[2].upper().replace(conversion[0], conversion[1]) + l[2] = sc newl = '\t'.join(l) + '\n' @@ -631,7 +631,7 @@ def header(self): """.strip() def version(self): - return "0.1.1" + return "0.1.2" def help_text_raw(self): return """