-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthofinder.nf
101 lines (85 loc) · 3.51 KB
/
orthofinder.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
Ortholog detection pipeline
This pipeline is designed to find orthologs between species with genome
assemblies and annotation files (in GFF3). The pipeline aims to use best practice
methods to obtain a high-quality ortholog geneset out the other end. The pipeline is
as follows:
1. GFF statistics (All isoforms + longest isoform performed by default) - AGAT agat_sp_statistics.pl
2. Filter for longest isoform per gene - AGAT agat_sp_keep_longest_isoform.pl
3. Extract CDS for longest isoform - AGAT agat_sp_extract_sequences.pl
* Protein
* Nucleotide
4. Ortholog detection - OrthoFinder
5. MSA to codon alignment
6. MSA statistics pre-cleaning - msaSummary
7. Clean MSA - Gblocks/CIalign
8. MSA statistics post-cleaning - msaSummary
Potential addition?
* Leave-one-out analysis to see which sample affects the results the most
*/
// Modules required by sub-workflow
include { agat_statistics } from '../nf-modules/agat/0.9.2/agat_statistics'
include { agat_longest_iso } from '../nf-modules/agat/0.9.2/agat_longest_iso'
include { agat_extract_seq } from '../nf-modules/agat/0.9.2/agat_extract_seq'
include { orthofinder } from '../nf-modules/orthofinder/2.5.2/orthofinder'
include { prot_to_codon_msa } from '../nf-modules/general/prot_to_codon_msa'
include { msaSummary as msaSummary_pre } from '../nf-modules/general/msaSummary'
include { msaSummary as msaSummary_post } from '../nf-modules/general/msaSummary'
include { clipkit } from '../nf-modules/clipkit/1.3.0/clipkit'
// Sub-workflow
workflow ORTHOFINDER {
main:
// Outprefix is used as a parent directory here, so adjust the outpath
def outdir = [params.outdir, params.out_prefix].join('/')
// Get GFF files
Channel
.fromFilePairs(
[params.gffs.path, params.gffs.pattern].join('/'),
size: params.gffs.nfiles,
)
.ifEmpty { exit 1, "Can't find GFF3 files." }
.set { ch_gffs }
// Get Genome files
Channel
.fromFilePairs(
[params.genomes.path, params.genomes.pattern].join('/'),
size: params.genomes.nfiles,
)
.ifEmpty { exit 1, "Can't find genome files." }
.set { ch_asm }
// Optional arguments for OrthoFinder
def tree = (params.containsKey('tree') && params.tree) ? "-s " + params.tree : ''
def trim_msa = (params.containsKey('trim_msa') && params.trim_msa) ? '' : '-z'
def stop_early = (params.containsKey('stop_early') && params.stop_early ) ? '-oa' : ''
// Statistics channel
ch_gffs.join(ch_asm).set { ch_statistics }
agat_statistics(ch_statistics, outdir)
// Keep longest isoform
agat_longest_iso(ch_gffs, outdir)
agat_longest_iso.out.longest.join(ch_asm).set { ch_longest }
// Extract longest seq
agat_extract_seq(ch_longest, outdir)
agat_extract_seq.out.protein.collect().set { ch_proteins }
// Find orthologs with OrthoFinder
orthofinder(
ch_proteins,
params.search_prog,
tree,
stop_early,
trim_msa,
outdir
)
// Generate codon MSA files from protein alignments
agat_extract_seq.out.nucleotide.collect().set { ch_nucleotide }
prot_to_codon_msa(
ch_nucleotide,
orthofinder.out.msa,
outdir
)
// Summarise gaps pre-trimming
msaSummary_pre(prot_to_codon_msa.out.msa_codon, 'pre', outdir)
// Clean alignments
clipkit(prot_to_codon_msa.out.msa_codon, outdir)
// Summarise gaps post-trimming
msaSummary_post(clipkit.out.clean, 'post', outdir)
}