import argparse
import subprocess
import os
import sys

###########################
# Argument parsing
###########################
# Gather input files and variables from command line
parser = argparse.ArgumentParser(description='Runs the holoflow preparegenomes pipeline.')
parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True)
parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True)
parser.add_argument('-c', help="config file", dest="config_file", required=False)
parser.add_argument('-l', help="pipeline log file", dest="log", required=False)
parser.add_argument('-t', help="threads", dest="threads", required=True)
args = parser.parse_args()

in_f = args.input_txt
path = args.work_dir
cores = args.threads

# retrieve current directory
file = os.path.dirname(sys.argv[0])
curr_dir = os.path.abspath(file)
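
# Example invocation based on the arguments defined above (paths are
# hypothetical; -c and -l are optional):
#   python preparegenomes.py -f input.txt -d /path/to/workdir -t 8 \
#       [-c config.yaml] [-l pipeline.log]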
# If the user does not specify a config file, provide default file in GitHub
if not (args.config_file):
    cpconfigCmd = 'cp '+curr_dir+'/workflows/preparegenomes/config.yaml '+path+'/config.yaml'
    subprocess.Popen(cpconfigCmd, shell=True).wait()
    config = path+'/config.yaml'
else:
    config = args.config_file
# If the user does not specify a log file, provide default path
if not (args.log):
    log = os.path.join(path, "Holoflow_preparegenomes.log")
else:
    log = args.log
# Load dependencies
loaddepCmd = 'module unload gcc && module load tools anaconda3/4.4.0'
subprocess.Popen(loaddepCmd, shell=True).wait()
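
# NOTE: the 'module' commands above assume an HPC environment with Environment
# Modules/Lmod and this exact anaconda3 module available; elsewhere they will
# fail without stopping the script, since their exit code is not checked.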
# Append current directory to .yaml config for standalone calling
# see preprocessing.py for verbose description
import ruamel.yaml
yaml = ruamel.yaml.YAML()
yaml.explicit_start = True
with open(str(config), 'r') as config_file:
    data = yaml.load(config_file)
    if data is None:
        data = {}

with open(str(config), 'w') as config_file:
    data['threads'] = str(cores)
    data['holopath'] = str(curr_dir)
    data['logpath'] = str(log)
    dump = yaml.dump(data, config_file)
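
# After this step, config.yaml holds entries like the following
# (values illustrative):
#   threads: '8'
#   holopath: /path/to/holoflow
#   logpath: /path/to/workdir/Holoflow_preparegenomes.log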
###########################
## Functions
###########################
###########################
###### PREPAREGENOMES FUNCTIONS
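
# input.txt is expected to hold one genome per line as three space-separated
# fields (format inferred from how refg[0], refg[1] and refg[2] are used below):
#   <genome_ID> <genome_path> <db_name>
# Lines starting with '#' are ignored; consecutive genomes sharing a db_name
# are merged into a single <db_name>.fna database.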
def set_up_preparegenomes(path, in_f):
    """Generate output file names from input.txt. Rename and move
    input files to where snakemake expects to find them if necessary."""
    db_dir = os.path.join(path, "PRG")
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    with open(in_f, 'r') as in_file:
        # Build desired output file names from input.txt
        ref_genomes_IDs = list()
        ref_genomes_paths = list()
        db_ID = ''
        db_paths = ''
        output_files = ''

        all_lines = in_file.readlines()  # Read input.txt lines
        # remove empty lines
        all_lines = map(lambda s: s.strip(), all_lines)
        lines = list(filter(None, list(all_lines)))
        last_file = lines[-1]

        for file in lines:
            if not (file.startswith('#')):
                refg = file.strip('\n').split(' ')  # Create a list of each line

                # If the db name changes, merge the genomes gathered so far
                # into the previous db before starting the new one
                if db_ID and not (refg[2] == db_ID):
                    # call merging function
                    db_paths += ''+merge_genomes(ref_genomes_IDs, ref_genomes_paths, db_ID)+' '
                    output_files += ''+path+'/'+db_ID+'.tar.gz '
                    ref_genomes_IDs = list()
                    ref_genomes_paths = list()

                # Save IDs for renaming and paths for merging
                ref_genomes_IDs.append(refg[0])
                ref_genomes_paths.append(refg[1])
                db_ID = refg[2]

                # At the end of the lines, with no new db name coming, also
                # merge the remaining genomes into the db
                if (file == last_file):
                    # call merging function
                    db_paths += ''+merge_genomes(ref_genomes_IDs, ref_genomes_paths, db_ID)+' '
                    output_files += ''+path+'/'+db_ID+'.tar.gz '
            else:
                pass

    return [db_paths, output_files]
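
# merge_genomes: stage each genome as <path>/PRG/<ID>.fna (gunzipping if
# needed), prefix every FASTA header with "<ID>_" via sed, then concatenate
# all staged genomes into <path>/PRG/<db_ID>.fna and return that path.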
def merge_genomes(refg_IDs, refg_Paths, db_ID):
    db_dir = os.path.join(path, "PRG")

    if not (os.path.exists(str(''+db_dir+'/'+db_ID+'.fna'))):
        for i in range(len(refg_Paths)):
            genome = refg_Paths[i]
            ID = refg_IDs[i]

            if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))):
                if genome.endswith('.gz'):  # uncompress genome for editing
                    # and save it in db_dir
                    uncompressCmd = 'ln -s '+genome+' '+db_dir+'/'+ID+'.fna.gz && gunzip -c '+db_dir+'/'+ID+'.fna.gz > '+db_dir+'/'+ID+'.fna'
                    subprocess.check_call(uncompressCmd, shell=True)

                    # edit ">" genome identifiers:
                    # find all lines starting with > and prepend ID_ to the info
                    editgenomeCmd = 'sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna'
                    subprocess.check_call(editgenomeCmd, shell=True)
                else:
                    # link into project dir and edit ">" genome identifiers
                    mvgenomeCmd = 'ln -s '+genome+' '+db_dir+'/'+ID+'.fna'
                    subprocess.check_call(mvgenomeCmd, shell=True)
                    editgenomeCmd = 'sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna'
                    subprocess.check_call(editgenomeCmd, shell=True)

        # define full db path and merge all reference genomes into it
        db_path = ''+db_dir+'/'+db_ID+'.fna'
        # concatenate all edited genomes found in db_dir
        mergeCmd = 'cd '+db_dir+' && cat *.fna > '+db_path+''
        subprocess.check_call(mergeCmd, shell=True)

        # remove the individual genomes, keeping only the merged db
        if os.path.exists(db_dir+"/"+ID+".fna"):
            rmCmd = 'cd '+db_dir+' && ls | grep -v "'+db_ID+'*" | xargs rm'
            subprocess.check_call(rmCmd, shell=True)
        else:
            pass

    else:  # the db file already exists
        # define full db path
        db_path = ''+db_dir+'/'+db_ID+'.fna'

    return (db_path)
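
# run_preparegenomes: writes DB_path into the config, then launches the
# Snakefile on the expected .tar.gz targets. The resulting command looks
# roughly like (illustrative):
#   snakemake -s <holopath>/workflows/preparegenomes/Snakefile -k \
#       <workdir>/<db_name>.tar.gz --configfile <workdir>/config.yaml --cores 8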
def run_preparegenomes(in_f, path, config, cores):
    """Run snakemake on shell"""

    # retrieve db_path
    path_out = set_up_preparegenomes(path, in_f)

    # Append db_path to config for Snakefile running
    yaml = ruamel.yaml.YAML()
    yaml.explicit_start = True
    with open(str(config), 'r') as config_file:
        data = yaml.load(config_file)

    with open(str(config), 'w') as config_file:
        data['DB_path'] = str(path_out[0]).strip()
        dump = yaml.dump(data, config_file)

    # get output files and Snakefile directory
    curr_dir = os.path.dirname(sys.argv[0])
    holopath = os.path.abspath(curr_dir)
    path_snkf = os.path.join(holopath, 'workflows/preparegenomes/Snakefile')

    # Run snakemake
    log_file = open(str(log), 'w+')
    log_file.write("Have a nice run!\n\t\tHOLOFLOW Preparegenomes starting")
    log_file.close()

    prg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+''
    subprocess.check_call(prg_snk_Cmd, shell=True)

    log_file = open(str(log), 'a+')
    log_file.write("\n\t\tHOLOFLOW Preparegenomes has finished :)")
    log_file.close()

    # Check how the run went: verify that every expected output file exists
    exist = list()
    for file in path_out[1].strip().split(" "):
        exist.append(os.path.isfile(file))

    if not all(exist):  # some output file is missing
        log_file = open(str(log), 'a+')
        log_file.write("Looks like something went wrong...\n\t\t")
        log_file.close()
###########################
#### Workflows running
###########################
# 0 # Preparegenomes workflow
run_preparegenomes(in_f, path, config, cores)