-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTelephage.sh
executable file
·259 lines (202 loc) · 11 KB
/
Telephage.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env bash
#Checking for dependencies
if ! command -v hmmscan > /dev/null; then
printf "\n HMMER3 not found in path. Exiting"
exit
fi
if ! command -v efetch > /dev/null; then
printf "\n Entrez-Direct tools not found in path. Exiting"
exit
fi
if ! command -v faidx > /dev/null; then
printf "\n Pyfaidx not found in path. Exiting"
exit
fi
if ! command -v taxonkit > /dev/null; then
printf "\n Taxonkit not found in path."
printf "\n Activating the included executable"
chmod +x taxonkit
fi
if ! command -v FastTree > /dev/null; then
printf "\n Fasttree not found in path."
printf "\n Activating the included executable"
chmod +x FastTreeDbl
fi
if ! command -v muscle > /dev/null; then
printf "\n Muscle not found in path."
printf "\n Activating the included executable"
chmod +x muscle
fi
#Binomica logo
echo " "
echo " "
echo " "
echo " ,@## %.. %#/ "
echo " &%/ %,. %#/ "
echo " *&%. %.. %#( "
echo " (%% %.. %#( "
echo " /%% %*. %#( "
echo " %% &#.,*,(###. "
echo " #% %,( "
echo " #%/ %,( "
echo " .%%( %,/ "
echo " #%%%%#%%%#%(./(*,**#( "
echo " #*, "
echo " .%,. "
echo " %,# "
echo " %,% "
echo " %#% "
echo " (%%% "
echo " (% "
echo " "
echo " Binomica Labs - TelePhage v. 0.01 "
echo " "
echo " Small Thoughtful Science "
echo " "
echo " "
echo " "
echo "##################################################"
#Checking for output directory and creating one
if [ -d "./TP_output" ]
then
echo "TP_output directory already exists. Exiting for now."
exit
else
mkdir TP_output
fi
#Checking for Taxonkit dependency directry .taxonkit/, creating one if not found
if [ -d ~/.taxonkit/ ]
then
echo " "
echo " "
echo " Taxonkit dump directory found. Processing. "
echo " "
echo " "
else
echo " "
echo " "
echo " Downloading and moving taxonkit dump files. "
echo " "
echo " "
mkdir ~/.taxonkit/
wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xvf taxdump.tar.gz
mv delnodes.dmp merged.dmp names.dmp nodes.dmp ~/.taxonkit/
rm citations.dmp division.dmp gc.prt gencode.dmp readme.txt taxdump.tar.gz
fi
#Checking for Pfam.hmm and other required files for Hmmscan step
#If the files are absent the script will attempt to download and hmmpress the file
FILE=Pfam-A.hmm
if [ -f "$FILE" ]; then
echo " "
echo " "
echo " $FILE found, proceeding with analysis "
echo " "
echo " "
else
echo " "
echo " "
echo " $FILE not found. Downloading matching version and activating hmmpress "
echo " "
echo " "
wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam34.0/Pfam-A.hmm.gz
gunzip Pfam-A.hmm.gz
hmmpress Pfam-A.hmm
fi
#Finding Pfam accession for input
input_protein="$1"
hmmscan --tblout "$1".pf Pfam-A.hmm "$1"
#Isolating the top scoring pfam accession number from HMMscan result
cat *.pf | grep -i "PF" | awk '{print $2}' | head -n 1 > "$1"_input_protein_pfam.txt
cp "$1"_input_protein_pfam.txt *.pf TP_output/
rm *.pf
#Screen preformatted phage pfam output for similar protein acccession numbers
input_pfam=$(<"$1"_input_protein_pfam.txt)
echo " "
echo "##################################################"
echo "## Top pfam hit for the input is $input_pfam ##"
echo "##################################################"
echo " "
zcat allphage-pf34.pfam.gz | grep -i $input_pfam | awk '{print $2}' > "$1"_similar_protein_accessions.acc && echo "Protein accessions list created" || echo "Protein accession list compilation failed"
cp "$1"_similar_protein_accessions.acc TP_output/
#Extracting full protein sequences associated with the accession numbers
#We're first parsing the accession groups into 200
#Multiple simultaneous download from NCBI is not recommended!
JoinIntoGroupsOf() {
xargs -n "$@" echo |
sed 's/ /,/g'
}
cat "$1"_similar_protein_accessions.acc | JoinIntoGroupsOf 200 | xargs -n 1 sh -c \
'efetch -email "[email protected]" -db protein -id $0 -format fasta' \
> "$1"_similar_proteins.seq && echo "Protein FASTA for matching accessions compiled" || \
echo "Protein FASTA compilation failed"
cp "$1"_similar_proteins.seq TP_output/
cat "$1"_similar_protein_accessions.acc | JoinIntoGroupsOf 200 | xargs -n 1 sh -c \
'efetch -email "[email protected]" -db protein -id $0 -format fasta_cds_na' \
> "$1"_similar_proteins_cds.seq && echo "Nucleotide FASTA for matching accessions compiled" || \
echo "Nucleotide FASTA compilation failed"
cp "$1"_similar_proteins_cds.seq TP_output/
epost -input "$1"_similar_protein_accessions.acc -db protein | efetch -format gb -mode xml | xtract -pattern \
GBSeq -element GBSeq_accession-version -block GBQualifier -if GBQualifier_name \
-equals product -element GBQualifier_value > \
"$1"_similar_proteins_annotation.txt && echo "Protein accessions annotation file compiled" || \
echo "Protein accessions annotation compilation failed"
cp "$1"_similar_proteins_annotation.txt TP_output/
#Quick warning message for the user before the more time consuming processes begin
echo " "
echo " "
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "~~( Stay awhile and listen; following steps will take some time )~~"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo " "
echo " "
echo " "
echo "##################################################"
echo "## Aligning the pfam matching protein sequences ##"
echo "##################################################"
echo " "
#Using muscle to align the pfam sharing protein sequences
#Choosing to use included binary or system call
if ! command -v muscle > /dev/null
then
./muscle -in "$1"_similar_proteins.seq > "$1"_similar_proteins.aln
else
muscle -in "$1"_similar_proteins.seq > "$1"_similar_proteins.aln
fi
cp "$1"_similar_proteins.aln TP_output/
echo " "
echo "##################################################"
echo "## Running FastTree on the protein alignment ##"
echo "##################################################"
echo " "
#Using FastTreeDbl to generate a default phylogenetic tree from the alignment
#Choosing to use included binary or system call
if ! command -v FastTree > /dev/null
then
./FastTreeDbl "$1"_similar_proteins.aln > "$1"_similar_proteins.tre
else
FastTree "$1"_similar_proteins.aln > "$1"_similar_proteins.tre
fi
cp "$1"_similar_proteins.tre TP_output/
#Screen output for the taxon id search delay"
echo " "
echo "##################################################"
echo "## Crunching taxon data ##"
echo "##################################################"
echo " "
#Linking the accession numbers to NCBI uid
zgrep -f "$1"_similar_protein_accessions.acc phage-prot.accession2taxid.gz | awk '{print $2"\t"$3}' > "$1"_similar_proteins_to_uids && echo "Protein accessions to taxonomic ID list completed" || echo "Protein accessions to taxonomic ID list failed"
cp "$1"_similar_proteins_to_uids TP_output/
#Screening input_proteins_to_uids file to isolate uid for taxonomy tagging
cat "$1"_similar_proteins_to_uids | awk '{print $2}' > "$1"_phages_with_similar_proteins.uid && echo "Phage taxonomic ID list completed" || echo "Phage taxonomic ID list compilation failed"
cp "$1"_phages_with_similar_proteins.uid TP_output/
#Using Taxonkit to link NCBI uid with taxonomic names for identified phages
#Also creating a temporary linear uid file to create taxonomic tree list
taxonkit lineage "$1"_phages_with_similar_proteins.uid > "$1"_phages-taxonomic-full.txt && echo "Phage taxonomic name list completed" || echo "Phage taxonomic name compilation failed"
taxonkit lineage -t "$1"_phages_with_similar_proteins.uid > "$1"_phages-taxonomic-lineage.txt && echo "Phage taxonomic lineage list completed" || echo "Phage taxonomic lineage compilation failed"
cp "$1"_phages-taxonomic-full.txt "$1"_phages-taxonomic-lineage.txt TP_output/
#Clean up for the working directory
rm "$1"_input_protein_pfam.txt "$1"_similar_protein_accessions.acc "$1"_similar_proteins.seq "$1"_similar_proteins_cds.seq "$1"_similar_proteins_annotation.txt "$1"_similar_proteins_to_uids "$1"_phages_with_similar_proteins.uid "$1"_phages-taxonomic-full.txt "$1"_phages-taxonomic-lineage.txt "$1"_similar_proteins.aln "$1"_similar_proteins.tre && echo "All TelePhage processes completed. Have a nice day!" || echo "TelePhage process terminated with error"
echo " "
echo " "
echo " "