-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassembly.nf
261 lines (226 loc) · 8.96 KB
/
assembly.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
/*
Genome assembly pipeline
* Remove adapters from HiFi Fastq
* Convert FASTQ to FASTA (Used by TgsGapcloser in next pipeline)
* KMC K-mer analysis + GenomeScope2.0 for genome size estimation
* Genome assembly (hifiasm)
* Hi-C read processing
* Scaffolding (salsa2/pin_hic)
* Genearte Juicebox visualisation files
* BUSCO (contigs/scaffolds)
NOTE: Use the 'assembly_assessment' pipeline to finalise the genome with gap-closing,
followed by a range of QC assessments
*/
// Import pipeline modules
include { hifiadapterfilt } from '../nf-modules/hifiadapterfilt/2.0.0/hifiadapterfilt'
include { seqkit_fq2fa } from '../nf-modules/seqkit/2.1.0/seqkit_fq2fa'
include { kmc } from '../nf-modules/kmc/3.2.1/kmc'
include { genomescope } from '../nf-modules/genomescope/2.0/genomescope'
include { hifiasm } from '../nf-modules/hifiasm/0.16.1//hifiasm'
include { hifiasm_hic } from '../nf-modules/hifiasm/0.16.1/hifiasm-hic'
include { bwa_mem2_index } from "../nf-modules/bwa-mem2/2.2.1/bwa-mem2-index"
include { arima_map_filter_combine } from "../nf-modules/arima/1.0.0/arima_map_filter_combine"
include { arima_dedup_sort } from '../nf-modules/arima/1.0.0/arima_dedup_sort'
include { pin_hic } from '../nf-modules/pin_hic/3.0.0/pin_hic'
include { salsa2 } from '../nf-modules/salsa2/2.3/salsa2'
include { matlock_bam2 } from '../nf-modules/matlock/20181227/matlock_bam2'
include { assembly_visualiser as assembly_visualiser_pin_hic } from '../nf-modules/3d_dna/201008/assembly_visualiser'
include { assembly_visualiser as assembly_visualiser_salsa2 } from '../nf-modules/3d_dna/201008/assembly_visualiser'
include { busco as busco_contig } from '../nf-modules/busco/5.2.2/busco'
include { busco as busco_salsa2 } from '../nf-modules/busco/5.2.2/busco'
include { busco as busco_pin_hic } from '../nf-modules/busco/5.2.2/busco'
// Sub-workflow
workflow ASSEMBLY {
main:
// Output directory paths
out_filteredreads = params.outdir + '/adapter-removed-reads'
out_hifiasm = params.outdir + '/assembly-contigs/' + params.out_prefix
out_busco = params.outdir + '/qc/busco'
out_scaffold = params.outdir + '/assembly-scaffold'
out_genomescope = params.outdir + '/genome-size'
// HiFi Data
Channel
.fromFilePairs(
[ params.hifi.path, params.hifi.pattern].join('/'),
size: params.hifi.nfiles
)
.ifEmpty { exit 1, "Long-read fastq file channel is empty. Can't find the files..." }
.set { ch_hifi }
// HifiAdapterFilt: Remove adapters from HiFi
hifiadapterfilt(
ch_hifi,
out_filteredreads
)
// // FQ2FA: hifi reads
seqkit_fq2fa(
hifiadapterfilt.out.clean,
out_filteredreads
)
// Hi-C data + run assembly pipeline
if (params.containsKey("hic")) {
Channel
.fromFilePairs(
[params.hic.path, params.hic.pattern].join('/'),
size: params.hic.nfiles
)
.set { ch_hic }
// Hifiasm: assemble reads into contigs
hifiasm_hic(
hifiadapterfilt.out.clean,
ch_hic,
params.out_prefix,
out_hifiasm
)
// String to match on - not pretty but does the trick
switch(params.assembly) {
case 'all':
pattern = ['p_ctg', 'hap1', 'hap2' ]
break;
case 'primary':
pattern = [ 'p_ctg' ]
break;
case 'haplotype1':
pattern = [ 'hap1' ]
break;
case 'haplotype2':
pattern = [ 'hap2' ]
break;
case 'haplotypes':
pattern = [ 'hap1' ,'hap2' ]
break;
}
// Filter Hifiasm output channels for only the assemblies we're after
hifiasm_hic.out.contigs.flatten().filter{
pattern.any { val -> it.baseName.contains(val)}
}
.map { ctg ->
return tuple(ctg.baseName, ctg)
}
.set { ch_contigs }
// BUSCO: contig assemblies
busco_contig(
ch_contigs,
params.busco_db,
'contig',
out_busco
)
// BWA2: Index reference files
bwa_mem2_index(
ch_contigs
)
// Combine idx with hic-read channel
bwa_mem2_index.out.combine(ch_hic).set { ch_hap_idx_hic }
// Arima Hi-C processing: Align + filter hic reads
arima_map_filter_combine(ch_hap_idx_hic)
arima_dedup_sort(arima_map_filter_combine.out.bam)
// Maaaaaatlooooooock: Convert Hi-C to contig BAM file to 'merged_nodups.txt' file used by juicebox
matlock_bam2(arima_dedup_sort.out.hic_to_ctg_bam)
// Combine processed Hi-C reads with genome again - join on genome ID (first field)
ch_contigs.join(arima_dedup_sort.out.bam).set { ch_ref_hic }
// Hi-c scaffolding
switch(params.scaffolder) {
case 'all':
// Scaffold
pin_hic(ch_ref_hic, out_scaffold)
salsa2(ch_ref_hic, out_scaffold)
// BUSCO on scaffolds ---
busco_salsa2(
salsa2.out.scaffolds,
params.busco_db,
'scaffold-salsa2',
out_busco
)
busco_pin_hic(
pin_hic.out.scaffolds,
params.busco_db,
'scaffold-pin_hic',
out_busco
)
// Join scaffold agp with matlock output
pin_hic.out.juicebox.join(matlock_bam2.out).set { ch_pin_hic_juicebox }
salsa2.out.juicebox.join(matlock_bam2.out).set { ch_salsa2_juicebox }
// Create Juicebox files
assembly_visualiser_pin_hic(
ch_pin_hic_juicebox,
'pin_hic',
out_scaffold
)
assembly_visualiser_salsa2(
ch_salsa2_juicebox,
'salsa2',
out_scaffold
)
break;
case 'salsa2':
salsa2(ch_ref_hic, out_scaffold)
// BUSCO on scaffolds ---
busco_salsa2(
salsa2.out.scaffolds,
params.busco_db,
'scaffold-salsa2',
out_busco
)
// Generate Juicebox input files ---
salsa2.out.juicebox.join(matlock_bam2.out).set { ch_salsa2_juicebox }
assembly_visualiser_salsa2(
ch_salsa2_juicebox,
'salsa2',
out_scaffold
)
break;
case 'pin_hic':
pin_hic(ch_ref_hic, out_scaffold)
// BUSCO on scaffolds ---
busco_pin_hic(
pin_hic.out.scaffolds,
params.busco_db,
'scaffold-pin_hic',
out_busco
)
// Generate Juicebox input files ---
pin_hic.out.juicebox.join(matlock_bam2.out).set { ch_pin_hic_juicebox }
assembly_visualiser_pin_hic(
ch_pin_hic_juicebox,
'pin_hic',
out_scaffold
)
break;
}
/*
PIPELINE BREAK: The pipeline should break here for users to edit their genome assemblies.
- Might look at adding functionality to just continue with the pipeline? but feel
like users should always check and edit their assemblies in Juicebox to ensure no
obvious misjoins.
*/
} else {
// Hifiasm: assemble contigs
hifiasm(
ch_hifi,
params.out_prefix,
out_hifiasm
)
// BUSCO: contig primary assembly
busco_contig(
hifiasm.out.contigs,
params.busco_db,
'contig',
out_busco
)
// BUSCO plot: Generate a summary plot of the BUSCO results
// busco_plot(busco_contig.out.summary, params.outdir)
// Merqury: K-mer assessment - Compare genome to reads
// hifiasm.out.contigs.map { id, val -> return file(val)}
// .set { ch_contig }
// ch_contig.combine(ch_hifi).set { ch_contig_hifi }
// merqury(ch_contig_hifi, params.outdir)
// // QUAST: contig primary assembly
// quast(ch_contig, params.outdir)
// // Mosdepth - HIFI alignment and coverage
// hifiasm.out.fa.combine(ch_hifi).set { ch_contig_hifi_fq }
// minimap2_pb_hifi(ch_contig_hifi_fq, params.outdir)
// mosdepth(minimap2_pb_hifi.out, params.outdir)
}
// Genome size estimation
kmc(ch_hifi, params.out_prefix)
genomescope(kmc.out.histo, out_genomescope)
}