-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathstep0-get-expression-matrix.Rmd
221 lines (181 loc) · 7.3 KB
/
step0-get-expression-matrix.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
---
title: "step0-get-expression-matrix"
author: "[email protected]"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(message = F)
```
## 引言
因为作者在GEO上面公布了自己处理好的表达矩阵
所以自己下载sra文件,转fq格式,走cellranger流程并不是必须
这里简要展示多个10x的cellranger结果如何整合,值得注意的是,因为时间关系,我们这里并没有去用心探索最佳参数。
因为文章使用的是Seurat2.0,所以这里就展示Seurat2.0的用法
## 读取数据
```{r}
library(Seurat)
sce.10x <- Read10X(data.dir = '~/Documents/10x/output/four-PBMC-mtx/SRR7722939/')
sce1 <- CreateSeuratObject(raw.data = sce.10x,
min.cells = 60,
min.genes = 200,
project = "SRR7722939")
sce1
sce.10x <- Read10X(data.dir = '~/Documents/10x/output/four-PBMC-mtx/SRR7722940/')
sce2 <- CreateSeuratObject(raw.data = sce.10x,
min.cells = 60,
min.genes = 200,
project = "SRR7722940")
sce2
sce.10x <- Read10X(data.dir = '~/Documents/10x/output/four-PBMC-mtx/SRR7722941/')
sce3 <- CreateSeuratObject(raw.data = sce.10x,
min.cells = 60,
min.genes = 200,
project = "SRR7722941")
sce3
sce.10x <- Read10X(data.dir = '~/Documents/10x/output/four-PBMC-mtx/SRR7722942/')
sce4 <- CreateSeuratObject(raw.data = sce.10x,
min.cells = 60,
min.genes = 200,
project = "SRR7722942")
sce4
sce1;sce2;sce3;sce4
```
这四个数据分别对应了四个时间点:治疗前(Pre),治疗后早期day +27(Early),治疗后反应期day+37(Resp),治疗后复发+614 (AR)
timePoints
PBMC_ARD614 PBMC_EarlyD27 PBMC_Pre PBMC_RespD376
4516 1592 2082 4684
| GSM3330561 | PBMC Pre |
| GSM3330562 | PBMC Disc Early |
| GSM3330563 | PBMC Disc Resp |
| GSM3330564 | PBMC Disc AR |
```{r}
[email protected]$group <- "PBMC_Pre"
[email protected]$group <- "PBMC_EarlyD27"
[email protected]$group <- "PBMC_RespD376"
[email protected]$group <- "PBMC_ARD614"
head(colnames(sce1@data))
colnames(sce1@data) <- paste0("PBMC_Pre.",colnames(sce1@data))
head(colnames(sce1@data))
colnames(sce2@data) <- paste0("PBMC_EarlyD27.",colnames(sce2@data))
colnames(sce3@data) <- paste0("PBMC_RespD376.",colnames(sce3@data))
colnames(sce4@data) <- paste0("PBMC_ARD614.",colnames(sce4@data))
```
## Set up
```{r}
sce1 <- NormalizeData(sce1)
sce1 <- ScaleData(sce1, display.progress = F)
sce2 <- NormalizeData(sce2)
sce2 <- ScaleData(sce2, display.progress = F)
sce3 <- NormalizeData(sce3)
sce3 <- ScaleData(sce3, display.progress = F)
sce4 <- NormalizeData(sce4)
sce4 <- ScaleData(sce4, display.progress = F)
```
## Gene selection for input to CCA
```{r}
sce1 <- FindVariableGenes(sce1, do.plot = F)
sce2 <- FindVariableGenes(sce2, do.plot = F)
sce3 <- FindVariableGenes(sce3, do.plot = F)
sce4 <- FindVariableGenes(sce4, do.plot = F)
g.1 <- head(rownames([email protected]), 1000)
g.2 <- head(rownames([email protected]), 1000)
g.3 <- head(rownames([email protected]), 1000)
g.4 <- head(rownames([email protected]), 1000)
genes.use <- unique(c(g.1, g.2,g.3, g.4))
genes.use <- intersect(genes.use, rownames([email protected]))
genes.use <- intersect(genes.use, rownames([email protected]))
genes.use <- intersect(genes.use, rownames([email protected]))
genes.use <- intersect(genes.use, rownames([email protected]))
head(genes.use)
length(genes.use)
```
通过上面的操作,找到了4个数据集共同重要的基因,然后进行后续合并分析。
```{r}
head(colnames(sce1@data))
colnames(sce1@data) <- paste0("PBMC_Pre.",colnames(sce1@data))
colnames(sce2@data) <- paste0("PBMC_EarlyD27.",colnames(sce2@data))
colnames(sce3@data) <- paste0("PBMC_RespD376.",colnames(sce3@data))
colnames(sce4@data) <- paste0("PBMC_ARD614.",colnames(sce4@data))
head(colnames([email protected]))
colnames([email protected]) <- paste0("PBMC_Pre.",colnames(sce1@data))
colnames([email protected]) <- paste0("PBMC_EarlyD27.",colnames(sce2@data))
colnames([email protected]) <- paste0("PBMC_RespD376.",colnames(sce3@data))
colnames([email protected]) <- paste0("PBMC_ARD614.",colnames(sce4@data))
```
Duplicate cell names, please provide 'add.cell.id1' and/or 'add.cell.id2' for unique names
这一步耗时很夸张, 保守估计10分钟。
重点就是 RunMultiCCA 函数,做 10X 数据集的合并。
```{r}
start_time <- Sys.time()
sce.comb <- RunMultiCCA(list(sce1,sce2,sce3,sce4),
add.cell.ids=c("PBMC_Pre.","PBMC_EarlyD27.","PBMC_RespD376.","PBMC_ARD614."),
genes.use = genes.use,
num.cc = 30)
end_time <- Sys.time()
end_time - start_time
# save(sce.comb,file = 'patient1-PBMAC.sce.comb.Rdata')
```
visualize results of CCA plot CC1 versus CC2 and look at a violin plot
可以检查一下CCA, 类比PCA分析。
```{r}
p1 <- DimPlot(object = sce.comb,
reduction.use = "cca", group.by = "group",
pt.size = 0.5, do.return = TRUE)
p2 <- VlnPlot(object = sce.comb,
features.plot = "CC1",
group.by = "group",
do.return = TRUE)
plot_grid(p1, p2)
#ggsave('patient1-PBMAC.sce.comb_explore_CC.pdf')
PrintDim(object = sce.comb, reduction.type = "cca",
dims.print = 1:2,
genes.print = 10)
```
类比于PCA分析里面的JackStraw和PCElbowPlot查看挑选多个主成分合适。
耗时也很验证,五分钟左右。
```{r}
start_time <- Sys.time()
p3 <- MetageneBicorPlot(sce.comb, grouping.var = "group", dims.eval = 1:30,
display.progress = FALSE)
p3
end_time <- Sys.time()
end_time - start_time
# ggsave('patient1-PBMAC.sce.comb_MetageneBicorPlot.pdf')
DimHeatmap(object = sce.comb, reduction.type = "cca", cells.use = 500,
dim.use = 1:9, do.balanced = TRUE)
# ggsave('patient1-PBMAC.sce.comb_DimHeatmap.pdf')
```
有进度条报告测序运行情况,也需要五分钟。
```{r }
start_time <- Sys.time()
sce.comb <- AlignSubspace(sce.comb, reduction.type = "cca",
grouping.var = "group", verbose =F,
dims.align = 1:20)
end_time <- Sys.time()
end_time - start_time
p1 <- VlnPlot(object = sce.comb, features.plot = "ACC1", group.by = "group",
do.return = TRUE)
p2 <- VlnPlot(object = sce.comb, features.plot = "ACC2", group.by = "group",
do.return = TRUE)
plot_grid(p1, p2)
# ggsave('patient1-PBMAC.sce.comb_ACC.pdf')
```
## t-SNE and Clustering
```{r}
sce.comb <- RunTSNE(sce.comb, reduction.use = "cca.aligned", dims.use = 1:20,
do.fast = T)
sce.comb <- FindClusters(sce.comb, reduction.type = "cca.aligned",
resolution = 1, dims.use = 1:20,print.output = 0)
```
## Visualization
```{r}
p1 <- TSNEPlot(sce.comb, do.return = T, pt.size = 0.5, group.by = "group")
p2 <- TSNEPlot(sce.comb, do.label = T, do.return = T, pt.size = 0.5)
plot_grid(p1, p2)
# ggsave('patient1-PBMAC.sce.comb_tSNE.pdf')
table([email protected]$res.1,[email protected]$group)
# save(sce.comb,file = 'patient1-PBMAC.sce.comb.second.Rdata')
```