-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMDF.R
118 lines (105 loc) · 4.64 KB
/
MDF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env Rscript
library(argparser)
parser <- argparser::arg_parser( 'Produces a fasta file allowing the percentage of missing data in each site specified in -p',
name = 'MDF.R')
parser <- add_argument(parser, 'input',
type='character',
nargs=1,
help='Path to the snpTable.tsv produced by MultiVCFAnalyzer or file.fasta. Note: if file.fasta is used as input, the flag --fasta must be included for proper parsing')
parser <- add_argument(parser, 'percentage',
type="integer",
nargs=1,
help='Partial deletion percentage')
parser <- add_argument(parser, 'output',
type='character',
nargs=1,
help='Path to output directory')
parser <- add_argument(parser, '--toExclude',
type = 'character',
help = 'Path to file containing genomes to exclude one per line',
default = 'None')
parser <- add_argument(parser, '--fasta',
type = 'logical',
help = 'Use when input is a fasta file',
flag = T, short = '-f')
argv <- parse_args(parser)
library(tidyverse)
library(seqinr)
excludingGenomes <- function(df, genometoExclude) {
print(genometoExclude)
toExclude <- read.delim(genometoExclude, header = F)
df_sampleRows <- df %>%
filter(!Genome %in% toExclude$V1)
print(paste("Provided genomes", toExclude$V1, "have been excluded", sep = " "))
return(df_sampleRows)
}
partialDeletionFasta <- function(input, inputdf, percentage, outputpath, genomeToExclude) {
if( genomeToExclude == "None"){
df <- inputdf
print("No genomes excluded")
} else {
df <- excludingGenomes(inputdf, genomeToExclude)
}
pd <- 100 - percentage
partialDelInfo <- df %>%
group_by(Position) %>%
summarise(missing=sum(Call== "N"), total=n()) %>%
mutate(percent = (missing/total)*100) %>%
filter(percent <= pd)
numPos=nrow(partialDelInfo)
print("Missing information calculated")
print(paste("Positions included in the alignment", numPos, sep = " "))
if( input == "fasta") {
snpTable_pd <- df %>%
filter(Position %in% partialDelInfo$Position) %>%
mutate(Call = as.character(Call)) %>%
mutate(Final_Call = if_else(Call == '?'| Call == '-' , 'N' , Call)) %>%
select(-Call) %>%
pivot_wider(names_from = Position, values_from = Final_Call)
} else {
snpTable_pd <- df %>%
filter(Position %in% partialDelInfo$Position) %>%
mutate(Call = as.character(Call), Ref = as.character(Ref)) %>%
mutate(Final_Call = if_else(Call == '.', Ref, Call)) %>%
select(-Ref, -Call) %>%
pivot_wider(names_from = Position, values_from = Final_Call)
}
outputTable <- paste(outputpath, percentage, "pd", numPos,"positions.tsv", sep = "_")
write_tsv(snpTable_pd, outputTable, col_names = F)
print("snpTable_pd saved")
outputFasta <- paste(outputpath, percentage, "pd", numPos, "positions.fasta", sep = "_")
system(paste("awk '{print \">\"$1; $1=\"\"; print $0}' OFS=", outputTable, ">", outputFasta))
write_tsv(snpTable_pd, outputTable, col_names = T)
print("Fasta_dp saved")
}
fastatoTibble <- function(fasta) {
fastaList <- seqinr::read.fasta(fasta, whole.header = T, forceDNAtolower = F)
fastaM <- as_tibble(matrix(unlist(fastaList),
nrow = length(fastaList),
byrow = T),
.name_repair = "universal")
fastaM$Genome <- names(fastaList)
df_sampleRows <- fastaM %>%
pivot_longer(-contains("Genome"), names_to = "Position",values_to = "Call")
return(df_sampleRows)
print("fastaToTibble Comple")
}
snpTabletoTibble <- function(snpTable) {
df <- read.delim(snpTable, stringsAsFactors = F, check.names = F)
df_sampleRows <- df %>%
pivot_longer(-contains(c("Position","Ref")), names_to = "Genome", values_to = "Call")
return(df_sampleRows)
print("snpTableToTibble complete")
}
inFasta=argv$fasta
print(inFasta)
print(argv$toExclude)
if( argv$fasta == T ){
print("Fasta as input, start run with fasta mode")
fastaT <- fastatoTibble(fasta = argv$input)
partialDeletionFasta(input = "fasta", inputdf = fastaT, percentage = argv$percentage, outputpath = argv$output, genomeToExclude = argv$toExclude)
} else {
print("SnpTable as input, start run with table mode")
snpTableT <- snpTabletoTibble(snpTable = argv$input)
partialDeletionFasta(input = "snpTable", inputdf = snpTableT, percentage = argv$percentage, outputpath = argv$output, genomeToExclude = argv$toExclude)
}