-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgoAna
executable file
·169 lines (156 loc) · 6.99 KB
/
goAna
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/bin/bash
#PBS -l nodes=1:ppn=4
GENOME="mm9";
GENE_ID_TYPE="SYMBOL"
ANNOTATION="GOTERM_BP_ALL"
PVALUE="0.05"
MAXCLASS=20
MINGENE=10
ALLOW_DUPLICATES=0
FIGWIDTH=10
FIGHEIGHT=20
#### usage ####
usage() {
echo Program: "goAna (perform gene ontology analysis on input gene list)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: [email protected]
echo "Usage: goAna -i <file> -o <dir> [OPTIONS]"
echo "Options:"
echo " -i <file> [input file containing gene list(s) (can be stdin)]"
echo " [single gene list format (default)]"
echo " => column 1: <gene>; column 2: <fold change> (optional)"
echo " [multiple gene list format (required with -c parameter)]"
echo " => column 1: <gene>; column 2: <gene> ... column N: <gene>"
echo " [multiple gene list format (required with -f parameter)]"
echo " => column 1: <gene>; column 2: <class>"
echo " -o <dir> [output directory]"
echo "[OPTIONS]"
echo " -g <string> [genome for which to perform the analysis (default: mm9)]"
echo " -d <string> [input gene id type (default: SYMBOL)]"
echo " [options: ENTREZID, PFAM, IPI, PROSITE, ACCNUM, ALIAS, ENZYME,]"
echo " [ MAP, PATH, PMID, REFSEQ, SYMBOL, UNIGENE, ENSEMBL, ]"
echo " [ ENSEMBLPROT, ENSEMBLTRANS, GENENAME, UNIPROT, GO, ]"
echo " [ EVIDENCE, ONTOLOGY, GOALL, EVIDENCEALL, ONTOLOGYALL,]"
echo " [ OMIM, UCSCKG ]"
echo " -a <string> [annotation type (default: GOTERM_BP_ALL)]"
echo " [if multiple separate then by a comma]"
echo " [options: DAVID, GOTERM_BP_ALL, GOTERM_MF_ALL, GOTERM_CC_ALL,]"
echo " KEGG_PATHWAY, DISEASE_ONTOLOGY, REACTOME_PATHWAY]"
echo " [run goAna.R -l for complete list of annotations available]"
echo " -p <float> [p-value (default: 0.05)]"
echo " -m <int> [maximum number of go classes to plot (default: 20)]"
echo " -n <int> [minimum genes in a go class (default: 10)]"
echo " -c [compare gene lists]"
echo " -f [compare gene lists using formula (gene~class)]"
echo " -u <int> [Plot top -m classes even if some are common between classes (0: No, 1: yes (default: 0))]"
echo " -w <int> [Width of output figure (default: 10)]"
echo " -t <int> [Height of output figure (default: 20)]"
echo " -b <file> [input file containing background gene list]"
echo " -r <file> [input file containing manually filtered GO categories to plot"]
echo " -q [plot count data in log]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:o:g:d:a:p:m:n:cfu:w:t:b:r:qh ARG; do
case "$ARG" in
i) GENEFILE=$OPTARG;;
o) OUTDIR=$OPTARG;;
g) GENOME=$OPTARG;;
d) GENE_ID_TYPE=$OPTARG;;
a) ANNOTATION=$OPTARG;;
p) PVALUE=$OPTARG;;
m) MAXCLASS=$OPTARG;;
n) MINGENE=$OPTARG;;
c) COMPARE_CLUSTER=1;;
f) COMPARE_CLUSTER_FORMULA=1;;
u) ALLOW_DUPLICATES=$OPTARG;;
w) FIGWIDTH=$OPTARG;;
t) FIGHEIGHT=$OPTARG;;
b) BKGFILE=$OPTARG;;
r) FTRRESFILE=$OPTARG;;
q) LOGCOUNT=1;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$GENEFILE" -o -z "$OUTDIR" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
<<"COMMENT1"
COMMENT1
echo -n "Create directory structure... "
if [ ! -d "$OUTDIR" ]; then
mkdir -p $OUTDIR
fi
echo "done"
echo -n "Populating files based on input genome, $GENOME (`date`).. "
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.simpleRepeat.gz"
GENOME_NAME="mmu"
GENOME_DB="org.Mm.eg.db"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg19.simpleRepeat.gz"
GENOME_NAME="hsa"
GENOME_DB="org.Hsa.eg.db"
else
echo "Presently the program only support analysis for mm9, hg19 or danRer7"
echo
usage
fi
echo "done"
## determine, if the input genes are from a file or stdin
echo -n "Create gene file depending on if the input is from file or STDIN (`date`).. "
if [ -f "$GENEFILE" ]; then
zless $GENEFILE | perl -ane '$line=(); foreach(@F) { chomp($_); $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/GENES_INTEREST.TXT
elif [ "$GENEFILE" == "stdin" ]; then
while read LINE; do echo ${LINE}; done | perl -ane '$line=(); foreach(@F) { chomp($_); $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/GENES_INTEREST.TXT
else
usage
fi
echo "done"
echo "Initiate GO analysis (`date`).. "
ADD_ARG=""
if [ ! -z "$BKGFILE" ]; then
ADD_ARG="-b $BKGFILE "
fi
if [ ! -z "$LOGCOUNT" ]; then
ADD_ARG="$ADD_ARG -q "
fi
if [ ! -z "$FTRRESFILE" ]; then
if [ ! -z "$COMPARE_CLUSTER" ]; then
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -p $PVALUE -m $MAXCLASS -n $MINGENE -c -s $OUTDIR/go_analysis_compareCluster.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG -r $FTRRESFILE
elif [ ! -z "$COMPARE_CLUSTER_FORMULA" ]; then
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -p $PVALUE -m $MAXCLASS -n $MINGENE -f -s $OUTDIR/go_analysis_compareClusterFormula.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG -r $FTRRESFILE
else
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -a $ANNO -p $PVALUE -m $MAXCLASS -n $MINGENE -s $OUTDIR/go_analysis_list.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG
fi
elif [ ! -z "$COMPARE_CLUSTER" ]; then
for ANNO in $(echo $ANNOTATION | sed 's/\,/ /g'); do
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -a $ANNO -p $PVALUE -n $MINGENE -c -s $OUTDIR/go_analysis_compareCluster.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG
done
elif [ ! -z "$COMPARE_CLUSTER_FORMULA" ]; then
for ANNO in $(echo $ANNOTATION | sed 's/\,/ /g'); do
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -a $ANNO -p $PVALUE -m $MAXCLASS -n $MINGENE -f -s $OUTDIR/go_analysis_compareClusterFormula.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG
done
else
for ANNO in $(echo $ANNOTATION | sed 's/\,/ /g'); do
goAna.R -i $OUTDIR/GENES_INTEREST.TXT -o $OUTDIR -e $GENOME_NAME -d $GENE_ID_TYPE -a $ANNO -p $PVALUE -m $MAXCLASS -n $MINGENE -s $OUTDIR/go_analysis_list.Rsession -u $ALLOW_DUPLICATES -w $FIGWIDTH -t $FIGHEIGHT $ADD_ARG
done
fi
echo "done"