-
Notifications
You must be signed in to change notification settings - Fork 0
/
01ProcessCounts.R
92 lines (73 loc) · 2.87 KB
/
01ProcessCounts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
rm(list = ls())
#Did you change it to your base location?
baseDir="~/LeadNPC/"
setwd(baseDir)
source(file = "bin/00base.R")
library("edgeR")
library(biomaRt)
setwd(paste0(baseDir,"Data/"))
samples<-read.table("allCountsHisat.txt",stringsAsFactors = F,header = T, sep = "\t")
x<-samples[,7:59]
#Cria dicionario para transcriptogramer ----
#acerta o nome das coluna de amostra
dic_transcriptogramer<-data.frame(PROBE=samples$Geneid)
#Ensemble gene ID passa a ser o probe associado ao peptide id
mart = useMart("ensembl", dataset="hsapiens_gene_ensembl")
#lista<-listAttributes(mart)
lista <- getBM(attributes = c("ensembl_gene_id", "ensembl_peptide_id"), mart = mart)
dic_transcriptogramer<-merge(dic_transcriptogramer,lista,
by.x="PROBE",by.y = "ensembl_gene_id")
dic_transcriptogramer<-subset(dic_transcriptogramer, ensembl_peptide_id != "")
dic_transcriptogramer<-subset(dic_transcriptogramer, ensembl_peptide_id != " ")
dic_transcriptogramer<-data.frame(ENSP=as.character(dic_transcriptogramer$ensembl_peptide_id),
PROBE = as.character(dic_transcriptogramer$PROBE),stringsAsFactors = F)
colnames(dic_transcriptogramer)<-c("ENSP","PROBE")
#acrescenta o prefixo 9606.
dic_transcriptogramer$ENSP<-gsub("ENSP","9606.ENSP",
dic_transcriptogramer$ENSP,fixed = T)
load(file = "phenodata_Orig.RData")
# processamento das amostras ----
#Corte de amostras com menos de 1 cpm
#ref: RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR
cpm <- cpm(samples[,7:59])
keep.exprs <-rowSums(cpm>1)>=3
sum(keep.exprs)
samples <- samples[keep.exprs,]
#mantém no nome das colunas de amostras somente o ensbl gene ID
colnames(samples)<-c(colnames(samples[,1:6]),substr(colnames(samples[,7:59]),1,10))
#Converte para logCPM
niente<-as.data.frame(cpm(samples[,c(7:59)],log = T))
rownames(niente)<-samples$Geneid
logCPM<-as.matrix(niente)
niente<-as.data.frame(cpm(samples[,c(7:59)],log = F))
rownames(niente)<-samples$Geneid
CPM<-as.matrix(niente)
save(dic_transcriptogramer,logCPM,pheno_data, file = "counts.RData")
save(CPM,file = "CPM.RData")
# suplemento ----
#plota diferença entre dataset filtrado e o sem filtro
library(RColorBrewer)
lcpm <- cpm(x, log=T)
nsamples <- ncol(x)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.6), las=2,
main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
cpm <- cpm(x, log=TRUE)
keep.exprs <-rowSums(cpm>1)>=3
x1 <- x[keep.exprs,]
lcpm <- cpm(x1, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.6), las=2,
main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2 : nsamples){
den <- density(lcpm[,i])
lines(den$x , den$y, col=col[i], lwd=2)
}