-
Notifications
You must be signed in to change notification settings - Fork 2
/
DupCheckBetweenSillys.GCL.r
146 lines (78 loc) · 6.46 KB
/
DupCheckBetweenSillys.GCL.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
DupCheckBetweenSillys.GCL=function(KeySillys,KeySillyIDs=NULL,BetweenSillys,loci,threshold){
##################################################################################################################################################################################################################################################################################################
#
# "KeySillyIDs" is a named list, same length and names as "KeySillys"
#
# e.g. KeySillys="SAUKE09QC"
#
# KeySillyIDs=list(SAUKE09QC=c("2587","3007"))
#
# ReWritten by JJ 12/30/2015
# Revised by KS on 06/20/2016 in order to accomodate multiple KeySillys for the output
##################################################################################################################################################################################################################################################################################################
while(!require(abind)){install.packages("abind")}
while(!require(outliers)){install.packages("outliers")}
keygcls=sapply(KeySillys,function(silly){assign(paste("new",silly,".gcl",sep=""),get(paste(silly,".gcl",sep=""),pos=1),pos=1)},simplify=FALSE)
ploidy=LocusControl$ploidy[loci]
nloci=length(loci)
if(!is.null(KeySillyIDs)){
names(KeySillyIDs)=KeySillys
KeySillyIDsALL=sapply(KeySillys,function(silly){rownames(keygcls[[silly]]$counts)},simplify=FALSE)
KeySillyIDs2Remove=sapply(KeySillys,function(silly){list(KeySillyIDsALL[[silly]][is.na(match(KeySillyIDsALL[[silly]],KeySillyIDs[[silly]]))])},simplify=FALSE)
EMPTY=sapply(KeySillys,function(silly){RemoveIDs.GCL(paste0("new",silly),KeySillyIDs2Remove[[silly]])})
}
newKeySillys=paste0("new",KeySillys)
newname=paste0(newKeySillys,collapse=".")
EMPTY=PoolCollections.GCL(newKeySillys,loci,IDs=NULL,newname=newname)
newkeygcl=get(paste0(newname,".gcl"))
newscores=newkeygcl$scores[,loci,,drop=FALSE]
newsillyvials=newkeygcl$attributes$SillySource
rownames(newscores)=newsillyvials
betweensillys=paste0("between",BetweenSillys)
betweenname=paste(betweensillys,collapse=".")
EMPTY=PoolCollections.GCL(BetweenSillys,loci,IDs=NULL,newname="betweenname")
betweengcl=get(paste0("betweenname",".gcl"))
betweenscores=betweengcl$scores[,loci,,drop=FALSE]
betweensillyvials=betweengcl$attributes$SillySource
rownames(betweenscores)=betweensillyvials
combs=expand.grid(Keysillyvial=newsillyvials,Betweensillyvial=betweensillyvials,stringsAsFactors=FALSE)
scores=abind(newscores,betweenscores,along=1)
scores[scores%in%c("0","Unk","XXX")]=NA
allsillyvials=rownames(scores)
scores.df=data.frame(t(sapply(allsillyvials,function(id){sapply(loci,function(locus){ifelse(is.na(scores[id,locus,1]),NA,paste(scores[id,locus,1:ploidy[locus]],collapse="."))})})),stringsAsFactors=FALSE)
duplicaterate=rowMeans(scores.df[combs[,1],]==scores.df[combs[,2],],na.rm=TRUE)
duplicatesummary <- sapply(unique(combs$Keysillyvial), function(KeySillyID) {
combs.log <- combs$Keysillyvial == KeySillyID
hist(duplicaterate[combs.log], breaks = seq(0, 1, 0.01), col = 8, xlab = "Duplicate Rate", main = KeySillyID)
abline(v = 0.9, col = "red", lwd = 2)
maxduplicaterate.log <- duplicaterate[combs.log] == max(duplicaterate[combs.log])
maxduplicaterate <- setNames(object = duplicaterate[combs.log][maxduplicaterate.log], nm = combs[combs.log, "Betweensillyvial"][maxduplicaterate.log])
text(x = maxduplicaterate[1], y = 5, labels = paste(names(maxduplicaterate), collapse = "_"), srt = 90, adj = 0)
keymissing <- apply(is.na(scores[combs[combs.log, "Keysillyvial"][maxduplicaterate.log], , 1, drop = FALSE]), 1, sum)
betweenmissing <- apply(is.na(scores[combs[combs.log, "Betweensillyvial"][maxduplicaterate.log], , 1, drop = FALSE]), 1, sum)
maxdups <- cbind(combs[combs.log, ][maxduplicaterate.log, ], Keymissing = keymissing, Betweenmissing = betweenmissing, DuplicateRate = maxduplicaterate, stringsAsFactors=FALSE,row.names=seq(sum(maxduplicaterate.log)))
thresholdduplicaterate.log <- duplicaterate[combs.log] > threshold
thresholdduplicaterate <- setNames(object = duplicaterate[combs.log][thresholdduplicaterate.log], nm = combs[combs.log, "Betweensillyvial"][thresholdduplicaterate.log])
keymissing <- apply(is.na(scores[combs[combs.log, "Keysillyvial"][thresholdduplicaterate.log], , 1, drop = FALSE]), 1, sum)
betweenmissing <- apply(is.na(scores[combs[combs.log, "Betweensillyvial"][thresholdduplicaterate.log], , 1, drop = FALSE]), 1, sum)
if(sum(thresholdduplicaterate.log) > 0) {
thresholddups <- cbind(combs[combs.log, ][thresholdduplicaterate.log, ], Keymissing = keymissing, Betweenmissing = betweenmissing, DuplicateRate = thresholdduplicaterate, stringsAsFactors=FALSE,row.names=seq(sum(thresholdduplicaterate.log)))
} else{
thresholddups <- NULL # paste("No Duplicates at a threshold of", threshold)
}
if(grep(pattern = "QC", x = combs[1, ]) == 1) {
projectduplicaterate.log <- combs$Keysillyvial == KeySillyID & combs$Betweensillyvial == gsub(pattern = "QC", replacement = "", x = KeySillyID)
} else {
projectduplicaterate.log <- combs$Keysillyvial == KeySillyID & combs$Betweensillyvial == gsub(pattern = "_", replacement = "QC_", x = KeySillyID)
}
projectduplicaterate <- setNames(object = duplicaterate[projectduplicaterate.log], nm = combs[projectduplicaterate.log, "Betweensillyvial"])
keymissing <- apply(is.na(scores[combs[projectduplicaterate.log, "Keysillyvial"], , 1, drop = FALSE]), 1, sum)
betweenmissing <- apply(is.na(scores[combs[projectduplicaterate.log, "Betweensillyvial"], , 1, drop = FALSE]), 1, sum)
projectdups <- cbind(combs[projectduplicaterate.log, ], Keymissing = keymissing, Betweenmissing = betweenmissing, DuplicateRate = projectduplicaterate, stringsAsFactors=FALSE,row.names=seq(sum(projectduplicaterate.log)))
# list(Threshold = thresholddups, MostSimilar = maxdups, Project = projectdups)
dplyr::bind_rows(thresholddups, maxdups, projectdups) %>%
dplyr::distinct(Keysillyvial, Betweensillyvial, .keep_all = TRUE)
}, simplify = FALSE)
duplicatesummary <- dplyr::bind_rows(duplicatesummary)
return(duplicatesummary)
}