-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmedianwindows.R
82 lines (63 loc) · 2.47 KB
/
medianwindows.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env Rscript
# E.Stolle Febr 2018
# take from table regions according to bed-windows and for columns take median and some other calc
# Rscript medianwindows.R inputfile.data.csv inputfile.win.bed
# runs with 80% of all CPU cores!! takes relatively long
args <- commandArgs(trailingOnly = TRUE)
print(args[1])
inputfile.data <- (args[1])
inputfile.win <- (args[2])
library(tidyverse)
library(parallel)
#library(foreach)
#library(doParallel)
data <- read_tsv(inputfile.data, col_names = FALSE)
wins <- read_tsv(inputfile.win, col_names = FALSE)
#wins <- read_tsv(inputfile,na = "nan")
colnames(data) <- c("SCF","POS","Ns","HMZs","HTZs","SAMPLES")
colnames(wins) <- c("ID","START","END")
############################
wins.n <-as.integer(nrow(wins))
# 285426 lines/windows from the bed file
#add index to windows/bed file
wins$INDEX <- as.integer(c(1:(nrow(wins))))
#for(i in 1:wins.n)
#writeLines(c(SCFv,STARTv,ENDv,BASES,MED1,MED2,MED3,MED4,RAT1,RAT2,RAT3,RAT4),sep="\t")
#https://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
no_cores <- detectCores() * 0.65
#no_cores <- 105
cl <- makeCluster(no_cores, type="FORK")
system.time(
z <- do.call(rbind.data.frame,
parLapply(cl,
unique(wins$INDEX),
function(i){
SCFv <- as.character(subset(wins, wins$INDEX == i, select=ID));
STARTv <- as.numeric(subset(wins, wins$INDEX == i, select=START));
ENDv <- as.numeric(subset(wins, wins$INDEX == i, select=END));
BASES=ENDv-STARTv;
subset.i <- subset(data, SCF == SCFv & POS >= STARTv & POS <= ENDv );
#calculate Medians
MED1 <- round(median(subset.i$Ns, na.rm = TRUE), digits = 4);
MED2 <- round(median(subset.i$HMZs, na.rm = TRUE), digits = 4);
MED3 <- round(median(subset.i$HTZs, na.rm = TRUE), digits = 4);
MED4 <- round(median(subset.i$SAMPLES, na.rm = TRUE), digits = 4);
# ratio per sample
RAT1 <- round(MED1/MED4, digits = 4);
RAT2 <- round(MED2/MED4, digits = 4);
RAT3 <- round(MED3/MED4, digits = 4);
#ratio HTZ/HMZ
RAT4 <- round(MED3/MED2, digits = 4);
return(c(i,SCFv,STARTv,ENDv,BASES,MED1,MED2,MED3,MED4,RAT1,RAT2,RAT3,RAT4))}
))
)
stopCluster(cl)
# user system elapsed
# 45.127 34.732 4944.883
#1.5hrs
#1000SNPs
#user system elapsed
#0.149 0.120 21.340
colnames(z) <- c("i","SCF","START","END","BASES","MED-N","MED-HMZ","MED-HTZ","MED-SAMPLES","RAT-NpSample","RAT-HMZpSample","RAT-HTZpSample","RAT-HTZpHMZ")
write.table(z, file = paste0(inputfile.win,".medians.csv"), sep='\t', row.names=FALSE, col.names=TRUE)
print("done")