forked from mjhubisz/argweaver
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsmc2bed_all.py
116 lines (102 loc) · 3.53 KB
/
smc2bed_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import os
import sys
from io import BytesIO
from itertools import count
import pandas as pd
from argweavers.bin import require_executable
from argweavers.scripts.smc2bed import main as smc2bed
class MetavarFormatter(argparse.HelpFormatter):
def _get_default_metavar_for_optional(self, action):
return f"<{action.dest}>"
def _get_default_metavar_for_positional(self, action):
return f"<{action.dest}>"
def main(argv=None):
bgzip = require_executable("bgzip", "It is included in `samtools`")
tabix = require_executable("tabix", "It is included in `samtools`")
parser = argparse.ArgumentParser(
description="Convert all *.smc files from a single arg-sample run into "
"a bed.gz file which can be parsed by arg-summarize",
formatter_class=MetavarFormatter,
)
parser.add_argument(
"-s",
dest="startnum",
type=int,
default=0,
help="start with this MCMC rep of arg-sample (default: 0)",
)
parser.add_argument(
"-e",
dest="endnum",
type=int,
default=-1,
help="end with this MCMC rep (default: stop when no file exists)",
)
parser.add_argument(
"-i",
dest="interval",
type=int,
default=0,
help="sampling interval (will be auto-detected from output if not "
"specified)",
)
parser.add_argument(
"-r",
dest="region",
help="region (in format START-END, 1-based, inclusive) to pass to "
"smc2bed (default: run on all coordinates)",
)
parser.add_argument(
"baseout",
help="base name of arg-sample output (specified with arg-sample -o). "
"Creates a file called <baseout>.bed.gz.",
)
args = parser.parse_args(argv)
baseout = args.baseout
startnum = args.startnum
interval = args.interval
endnum = args.endnum
region = args.region
startfile = f"{baseout}.{startnum}.smc.gz"
if not os.path.exists(startfile):
sys.stderr.write(f"ERROR: {startfile} does not exist\n")
sys.exit(1)
if interval == 0:
for interval in range(1, 1001):
nextnum = startnum + interval
if os.path.exists(f"{baseout}.{nextnum}.smc.gz"):
break
else:
sys.stderr.write(
"ERROR detecting sampling interval; try specifying with -i\n"
)
sys.exit(1)
sys.stderr.write(f"starting at rep startnum={startnum}\n")
sys.stderr.write(f"using sampling interval={interval}\n")
num = startnum
regionarg = ["--region", region] if region else []
out = b""
for num in count(startnum, interval):
file = f"{baseout}.{num}.smc.gz"
if (endnum != -1 and num > endnum) or not os.path.exists(file):
num -= interval
sys.stderr.write(f"ended at sample={num}\n")
break
sys.stderr.write(f"{num} {file}\n")
out += smc2bed(
["--sample", str(num), *regionarg, file],
capture_output=True,
)
num += interval
bed = pd.read_table(
BytesIO(out), header=None, names=["chrom", "start", "end", "iter", "tree"]
)
bed = bed.sort_values(["chrom", "start", "end", "iter"])
out = bed.to_csv(index=False, header=False, sep="\t").encode("utf-8")
with open(f"{baseout}.bed.gz", "wb") as f:
bgzip(["-"], input=out, stdout=f)
tabix(["-p", "bed", f"{baseout}.bed.gz"])
sys.stderr.write(f"wrote and indexed {baseout}.bed.gz\n")
if __name__ == "__main__":
main()