-
Notifications
You must be signed in to change notification settings - Fork 0
/
delete.prepare.sh
executable file
·227 lines (195 loc) · 6.65 KB
/
delete.prepare.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/bin/bash
# Command-line front end: parses options and runs one of the tasks handled
# further down in this file (annotation, matrix, count, nanopore).
#
# Load the user's bash configuration and then activate the conda environment
# named "singlecell" before anything else runs.
# NOTE(review): presumably ~/.bashrc is sourced so that the `conda` shell
# function is defined in this non-interactive shell — confirm.
source ~/.bashrc
conda activate singlecell
## TODO: add parse options
## NOTE(review): getopt-based option parsing already exists below, so the TODO
## above may be stale — confirm and remove.
#######################################
# Print the command-line usage summary.
# Arguments: none
# Outputs:   the usage text on stdout
#######################################
showHelp() {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'EOF'
Usage: ./prepare.sh [OPTIONS]
Options:
  -h, --help            Show this help message
Mandatory arguments:
  -t, --task TASK       Task to run (nanopore, annotation, matrix, or count)
  -o, --output_dir      Output directory
annotation task-specific options:
  --reference           Gene annotation file in gtf format
  --bam                 bam file path, need to be sorted, filtered, and indexed before running
  --workers             number of processes
Nanopore task-specific options:
  --fastq               fastq file path
  --kit_name            multiome, 3prime, 5prime
  --kit_version         kit version
  --reference_nanopore  path to reference directory
matrix task-specific options:
  --bam                 bam file path, need to be sorted, filtered, and indexed before running
  --match               threshold to decide reads map to exons
  --workers             number of processes per job
  --jobs                number of jobs
  --job_index           index of jobs
  --cover_existing      whether to overwrite existing compatible matrix
count task-specific options:
  --novel_read_n        read number to support a novel isoform
  --workers             number of processes (default: 8)
EOF
}
#######################################
# Parse the command-line options into the global option variables.
# Globals (written): output_dir, task, reference, reference_nanopore, fastq,
#   kit_name, kit_version, bam, match, workers, job_index, jobs,
#   cover_existing, novel_read_n
# Arguments: the script's command-line arguments
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   exits 0 after printing help; exits 1 on invalid options
#######################################
parse_options() {
  local options
  # -a also accepts long options written with a single leading dash.
  # --cover_existing is declared WITH an argument (trailing colon): its case
  # arm consumes a value, and declaring it as a bare flag made that arm
  # swallow the following option (or the terminating "--", which left the
  # loop spinning forever).
  options=$(getopt \
    -l "help,task:,output_dir:,reference:,fastq:,kit_name:,kit_version:,reference_nanopore:,bam:,match:,workers:,job_index:,jobs:,cover_existing:,novel_read_n:" \
    -o "ht:o:" -a -- "$@") || exit 1
  # getopt emits a shell-quoted, normalised argument list; re-load it as the
  # positional parameters of this function.
  eval set -- "$options"
  while true; do
    case "$1" in
      -h | --help)
        showHelp
        exit 0
        ;;
      -o | --output_dir)
        output_dir=$2
        echo "set output directory as: $output_dir"
        shift 2
        ;;
      -t | --task)
        task=$2
        echo "running task of: $task"
        shift 2
        ;;
      --reference)
        reference=$2
        echo "use reference annotation file at: $reference"
        shift 2
        ;;
      --reference_nanopore)
        reference_nanopore=$2
        echo "use nanopore reference directory at: $reference_nanopore"
        shift 2
        ;;
      --fastq)
        fastq=$2
        shift 2
        ;;
      --kit_name)
        kit_name=$2
        shift 2
        ;;
      --kit_version)
        kit_version=$2
        shift 2
        ;;
      --bam)
        bam=$2
        shift 2
        ;;
      --match)
        match=$2
        shift 2
        ;;
      --workers)
        workers=$2
        shift 2
        ;;
      --job_index)
        job_index=$2
        shift 2
        ;;
      --jobs)
        jobs=$2
        shift 2
        ;;
      --cover_existing)
        # The matrix task treats 1 as "overwrite" and anything else as
        # "keep"; reject other values early instead of guessing.
        case "$2" in
          0 | 1)
            cover_existing=$2
            ;;
          *)
            echo "--cover_existing expects 0 or 1, got: $2" >&2
            exit 1
            ;;
        esac
        shift 2
        ;;
      --novel_read_n)
        novel_read_n=$2
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        # Defensive: getopt always terminates the list with "--", so this
        # is only reached if the list is malformed. Fail instead of looping.
        echo "unexpected argument: $1" >&2
        exit 1
        ;;
    esac
  done
}
parse_options "$@"
#---------------- validate the parsed options ----------------#
# -t/--task and -o/--output_dir are required for every task.
if [ -z "$task" ]; then
  echo "Task is missing." >&2
  exit 1
fi
if [ -z "$output_dir" ]; then
  echo "output_dir is missing." >&2
  exit 1
fi
# Each task has its own set of required options. An unrecognised task name is
# rejected here; otherwise none of the task sections below would match and the
# script would exit successfully without doing anything.
case "$task" in
  annotation)
    if [ -z "$reference" ] || [ -z "$bam" ]; then
      echo "annotation-specific option is missing" >&2
      exit 1
    fi
    ;;
  nanopore)
    if [ -z "$reference_nanopore" ] || [ -z "$fastq" ] ||
      [ -z "$kit_name" ] || [ -z "$kit_version" ]; then
      echo "nanopore-specific options are missing" >&2
      exit 1
    fi
    ;;
  matrix)
    if [ -z "$bam" ] || [ -z "$match" ] ||
      [ -z "$job_index" ] || [ -z "$jobs" ]; then
      echo "matrix-specific options are missing" >&2
      exit 1
    fi
    ;;
  count)
    if [ -z "$novel_read_n" ]; then
      echo "count-specific options are missing" >&2
      exit 1
    fi
    ;;
  *)
    echo "Unknown task: $task (expected nanopore, annotation, matrix, or count)" >&2
    exit 1
    ;;
esac
#----------------task annotation: prepare annotation file-----------------#
if [ "$task" = "annotation" ]; then
  # Directory containing this script (symlinks resolved); the Python entry
  # point is looked up relative to it.
  current_path="$(dirname "$(readlink -f "$0")")"
  # --workers is always forwarded below. Fall back to the default the count
  # task uses so an omitted option does not yield a dangling "--workers" flag.
  if [ -z "$workers" ]; then
    workers=8
  fi
  echo "preparing annotation file"
  mkdir -p "$output_dir/reference" || exit 1
  # Extract annotation information; stop with the tool's exit status on
  # failure rather than reporting success.
  python3 "$current_path/src/main_preprocessing.py" \
    --ref "$reference" --bam "$bam" --workers "$workers" \
    --target "$output_dir" --task annotation || exit $?
  # Listing is only used for the human-readable summary line.
  files=$(ls "$output_dir/reference")
  echo "task done, $files are in $output_dir/reference"
fi
#----------------task matrix: prepare compatible matrix -----------------#
if [ "$task" = "matrix" ]; then
  # Directory containing this script (symlinks resolved).
  current_path="$(dirname "$(readlink -f "$0")")"
  mkdir -p "$output_dir/compatible_matrix" || exit 1
  # Overwrite existing results when cover_existing is unset or equals 1;
  # any other value selects --cover_existing_false. The numeric test's
  # "integer expression expected" noise for non-numeric values is silenced;
  # such values simply fall through to the non-overwriting flag.
  overwrite_flag="--cover_existing_false"
  if [ -z "$cover_existing" ] || [ "$cover_existing" -eq 1 ] 2>/dev/null; then
    overwrite_flag="--cover_existing"
  fi
  # Single invocation for both cases; propagate a failure as the script's
  # exit status so job schedulers can see it.
  python3 "$current_path/src/main_preprocessing.py" --target "$output_dir" --task matrix \
    --bam "$bam" --match "$match" --geneinfo "$output_dir/reference/geneStructureInformation.pkl" \
    --job_index "$job_index" --total_jobs "$jobs" "$overwrite_flag" || exit $?
fi
#----------------task count: prepare count matrix -----------------#
if [ "$task" = "count" ]; then
  # Directory containing this script (symlinks resolved).
  current_path="$(dirname "$(readlink -f "$0")")"
  mkdir -p "$output_dir/count_matrix" || exit 1
  # Default number of worker processes when --workers was not given.
  if [ -z "$workers" ]; then
    workers=8
  fi
  # The count task needs a non-empty compatible_matrix directory. Report a
  # missing prerequisite on stderr with a non-zero status so callers can
  # detect it.
  if [ ! -d "$output_dir/compatible_matrix" ] ||
    [ -z "$(ls -A "$output_dir/compatible_matrix")" ]; then
    echo "no compatible matrix found, run task matrix first" >&2
    exit 1
  fi
  echo "generating count matrix"
  python3 "$current_path/src/main_preprocessing.py" --target "$output_dir" \
    --task count --novel_read_n "$novel_read_n" --workers "$workers" || exit $?
fi
#----------------nanopore workflow for alignment and tagging bam files --------------#
if [ "$task" = "nanopore" ]; then
  echo "running nanopore pipeline for tagged bam file"
  # The nextflow launcher is expected at ~/nextflow. Both the work directory
  # (-w) and --out_dir point at output_dir. Expansions are quoted so paths
  # containing spaces or glob characters are passed through intact.
  ~/nextflow run epi2me-labs/wf-singlecell \
    -w "$output_dir" -profile singularity \
    --matrix_min_genes 1 --fastq "$fastq" \
    --ref_genome_dir "$reference_nanopore" --out_dir "$output_dir" \
    --kit_name "$kit_name" --kit_version "$kit_version"
fi
# Author's note, presumably kit_name / kit_version pairs (confirm):
# multiome, v1; 3prime, v3