-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTree_pipeline.sh
executable file
·38 lines (29 loc) · 1.38 KB
/
Tree_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
#
# Tree_pipeline.sh - build a sequential PHYLIP alignment from isolate FASTA files.
#
# Steps:
#   1. Run sort_fasta.py on every input FASTA file (orders the records by gene)
#      and store the result as <name>.sorted in $OUTPUT_DIR.
#   2. Run codon_from_fasta.py on $OUTPUT_DIR; the *.filtered files found there
#      afterwards are taken as the per-sample sequences.
#   3. Write a PHYLIP header ("<number of samples> <sequence length>") followed
#      by one "<sample> <sequence>" line per *.filtered file to $OUTPUT_FILE.
#
# Must be started from the directory that contains sort_fasta.py and
# codon_from_fasta.py; all paths below are relative to the working directory.

# Print an error message on stderr and abort the pipeline.
die() {
  printf 'Tree_pipeline: %s\n' "$*" >&2
  exit 1
}

# Set up the module system and load biopython. Failures here are not fatal on
# purpose: python may already be usable without the site-specific Lmod install.
unset -f module
. /tgac/software/testing/lmod/6.1/x86_64/lmod/lmod/init/profile
ml biopython

OUTPUT_FILE="PST130_concatinatelan.phy"
#FILES="../Isolates2015/*.fa"
FILES="incomplete_gene_tst/*.fa"
OUTPUT_DIR="sorted_incomplete_test"

# Create the output directory in the working dir the script was started from.
# -p keeps reruns from failing when the directory already exists.
mkdir -p -- "$OUTPUT_DIR" || die "cannot create output directory '$OUTPUT_DIR'"

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

# Find all the isolate FASTA files and order them by gene.
# $FILES is deliberately unquoted: it holds a glob pattern that must expand here.
for f in $FILES; do
  python ./sort_fasta.py -f "$f" > "$OUTPUT_DIR/$(basename -- "$f").sorted" \
    || die "sort_fasta.py failed on '$f'"
done

# With all the files ordered by gene and placed in the sorted folder, filter out
# which genes are relevant (have enough information).
# codon_from_fasta.py has multiple filtering parameters; the most relevant are
#   -m  minimum percentage of known bases in a sequence for acceptance
#   -s  minimum percentage of accepted samples
python ./codon_from_fasta.py -d "$OUTPUT_DIR" \
  || die "codon_from_fasta.py failed on '$OUTPUT_DIR'"

# Select all the gene-filtered sequences (one file per sample).
files=( "./$OUTPUT_DIR"/*.filtered )
(( ${#files[@]} > 0 )) || die "no *.filtered files found in '$OUTPUT_DIR'"

# Write the PHYLIP header: number of sequences, then the sequence length.
# The length is the character count of the first filtered file, so every
# filtered file is expected to hold one sequence of that same length.
seq_length=$(wc -m < "${files[0]}") || die "cannot read '${files[0]}'"
printf '%s %s\n' "${#files[@]}" "$seq_length" > "$OUTPUT_FILE" \
  || die "cannot write '$OUTPUT_FILE'"

# Append every sequence to generate a sequential PHYLIP file. The sample label
# is the part of the file name before the first underscore.
for filename in "${files[@]}"; do
  sample=$(basename -- "$filename")
  printf '%s ' "${sample%%_*}"
  cat -- "$filename"
  printf '\n'
done >> "$OUTPUT_FILE" || die "cannot append sequences to '$OUTPUT_FILE'"

# $OUTPUT_FILE is now ready to go into RAxML.