-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_genetic_mapping_rqtl2.R
314 lines (250 loc) · 13.6 KB
/
run_genetic_mapping_rqtl2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env Rscript
suppressMessages(library(qtl2))
suppressMessages(library(qtl2convert))
suppressMessages(library(stringr))
suppressMessages(library(tidyr))
suppressMessages(library(dplyr))
####################################################################
###### set inputs and working directory
setwd("~/DRiDO_microbiome_github/")
geno_probability_file <- c('../data/prob.8state.allele.qtl2_200131.Rdata') #obtain from FigShare XXXX
map_file <- c('../data/gm_uwisc_v1_filter_200129.pmap.csv') #from https://github.com/kbroman/MUGAarrays/tree/main/UWisc
query_variants_file <- c('../data/cc_variants.sqlite') #from: https://figshare.com/articles/dataset/SQLite_database_of_variants_in_Collaborative_Cross_founder_mouse_strains/5280229
query_genes_file <- c('../data/mouse_genes_mgi.sqlite') #from: https://figshare.com/articles/dataset/SQLite_database_with_MGI_mouse_gene_annotations_from_Mouse_Genome_Informatics_MGI_at_The_Jackson_Laboratory/5286019
pheno_data_file <- c('../data/all_genus_features_clr_mice_in_rows_n910x535_230816.edit.txt')
meta_data_file <- c('../data/jax_cr_genwave_191122.csv')
###### user to edit
focal_phenotype <- c('M10_Roseburia') #user to edit
output_base <- paste("../results/rqtl2/",focal_phenotype,sep="")
####################################################################
## initialize varaibles
##
n_reps <- 100 ## number of permutations to run to assess statistical significance of QTL peaks
model_type <- c('add') ##type of genetic model to run - additive
focal_additive_cov <- c('cm_DietGenwave') ##fixed effect covariates - diet + generation_wave
####################################################################
## LOAD.DATA_1 load genotype probability file
load(geno_probability_file)
# Edit name format from "Calico_Life_Sciences_Freund_MURGIGV01_20180621_DO.2D.4170_A1" to "DO-2D-4170".
print("rename mouse_ids")
mouse_IDs <- dimnames(apr$`1`)[[1]] %>% substr(48,57) %>% gsub(pattern = '[.]', replacement = '-')
if(nchar(mouse_IDs[1]) > 1) {
for(i in names(apr)){
dimnames(apr[[i]])[[1]] <- mouse_IDs
}
}
print("edit marker names")
for(i in names(apr)){
dimnames(apr[[i]])[[3]] <- dimnames(apr[[i]])[[3]] %>% gsub(pattern = '-', replacement = '.')
}
print('number of samples with genotype data')
print(length(mouse_IDs))
####################################################################
## LOAD.DATA_2 physical map data frame
physical_map <- read.csv(map_file, row.names = NULL, sep = ",", strip.white = TRUE)
### find markers in apr. edit physical map to include missing and remove extra markers from physical_map
lt <- list()
for(n in names(apr)){
apr_markers <- as.data.frame(dimnames(apr[[n]])[[3]])
colnames(apr_markers) <- c("marker")
##apr contains some markers not found on pmap. join on apr_markers
physical_map_chrm <- left_join(apr_markers, physical_map, by = c("marker"))
##left join leaves NAs to fill
physical_map_chrm$chr[is.na(physical_map_chrm$chr)] <- n ##fill chrm with n
physical_map_chrm$pos[is.na(physical_map_chrm$pos)] <- 0.0001 ##fill chrm with n
##store new dataframe
lt[[n]] <- physical_map_chrm
#print(paste(n, length(tmp)))
}
physical_map_edit <- do.call(rbind, lt)
print('physical_map')
print(dim(physical_map))
print('physical_map_edit')
print(dim(physical_map_edit))
# Convert map data frame to list
pmap <- physical_map_edit %>%
mutate(chr = factor(chr, levels = c(1:19, 'X'))) %>%
arrange(chr, pos) %>%
# Make sure the chromosoms are factored properly,
data.frame() %>% map_df_to_list(marker_column = 'marker' , pos_column = 'pos', chr_column= 'chr')
#convert pmap value from char to numeric. this is imp to use to keep markers as row names
for(n in names(pmap)){
pmap[[n]] <- as.list(as.data.frame(lapply(pmap[[n]], function(x) as.numeric(x))))
#make sure position is in mega_bases - make it compatibile with snp_scan --- cc_variants.sqlite data
#pmap[[n]] <- lapply(pmap[[n]], function(x) x /1000000)
#flatten pmap[[n]] from list to vector of numerals
pmap[[n]] <- unlist(pmap[[n]])
}
print('pmap')
print(class(pmap[[n]] ))
print(pmap[[n]][1:3])
####################################################################
## LOAD.DATA_3 Load Phenotype Data
#pheno_data_file <- user argument 'advia_results_rankZ_20180823_kwEDIT.csv'
#focal_phenotype <- user argument
print('load pheno_data')
pheno_data <- read.csv(pheno_data_file, sep = ",", strip.white = TRUE)
#print(head(pheno_data))
### create pheno_data data frame
pheno_data_df <- pheno_data %>%
select(focal_phenotype, MouseID)
pheno_data_df <- pheno_data_df %>%
na.omit()
pheno_data_df <- pheno_data_df %>% mutate(Diet=gsub("-....$","",MouseID))
pheno_data_df <- pheno_data_df %>% mutate(Diet=gsub("DO-","",Diet))
rownames(pheno_data_df) <- pheno_data_df$MouseID
### subset apr to animals with phenotype
list_sample_filter <- pheno_data_df$MouseID
apr_subset <- apr[list_sample_filter]
### filter samples from pheno data -
apr_mouse_ID <- as.data.frame(dimnames(apr_subset$`1`)[[1]])
colnames(apr_mouse_ID) <- c('MouseID')
pheno_data_df <- merge(pheno_data_df, apr_mouse_ID, by = 'MouseID')
print("focal_phenotype df ")
print(dim(pheno_data_df))
print(head(pheno_data_df))
####################################################################
## LOAD.DATA_4 Load Meta Data for these Mice
meta <- read.csv(meta_data_file, sep = ",", strip.white = TRUE)
print('load diet and generation wave data')
generation_tbl <- meta %>%
select(sample_id, gen) %>% ##edit from 191122
rename(Generation = gen, MouseID = sample_id)
generation_tbl <- generation_tbl[!duplicated(generation_tbl),]
print('merge phenotype with generation table')
print(head(generation_tbl))
print(paste("length generation_tbl$MouseID", length(generation_tbl$MouseID)))
print(paste("length pheno_data_df$MouseID", length(pheno_data_df$MouseID)))
pheno_data_df <- generation_tbl %>%
inner_join(pheno_data_df, by = 'MouseID')
rownames(pheno_data_df) <- pheno_data_df$MouseID
print(paste("length_post_join pheno_data_df$MouseID", length(pheno_data_df$MouseID)))
print(head(pheno_data_df))
pheno_data_df$Diet <- factor(droplevels(factor(pheno_data_df$Diet))) #use factor here to reset lelves in df
pheno_data_df$Generation <- factor(droplevels(factor(pheno_data_df$Generation))) #use factor here to reset lelves in df
#create pheno_data for qtl
pheno_data_qtl <- pheno_data_df %>%
select(focal_phenotype)
rownames(pheno_data_qtl) <- pheno_data_df$MouseID
#### 4B - Create covariate matrix
cm_DietGenwave <- model.matrix(~ Diet + Generation, data = pheno_data_df)
rownames(cm_DietGenwave) <- pheno_data_df$MouseID
cm_DietGenwave_stan <- as.matrix(cm_DietGenwave)
cm_DietGenwave <- as.matrix(cm_DietGenwave[,-1])
print(paste("dim(cm_DietGenwave)", dim(cm_DietGenwave)))
#put all cov into list , this will be accessible to focal_additive_cov
cov_list <- list(cm_DietGenwave)
names(cov_list) <- c('cm_DietGenwave')
print(paste("cov_list", names(cov_list)))
####################################################################
## Analysis #1 Estiamte Kinship - run with apr not apr subset
kinship_loco <- calc_kinship(apr_subset, type = 'loco', omit_x = FALSE, cores = 1)
####################################################################
## Analysis #2 Estiamte h2
for(c in names(pmap)){ #for all chromosomes
herit_kinbase <- est_herit(pheno_data_qtl, kinship_loco[[c]], addcovar = cov_list[[focal_additive_cov]],
weights = NULL, reml = FALSE, cores = 1)
print(paste('h2:', herit_kinbase, 'loco (leave one chrm out)',c))
}
####################################################################
## Analysis #2 Run additive qtl analysis
genome_scan_output_file <- paste(output_base,'.genome_wide.qtl.Rdata',sep='')
print(paste("run additive qtl model. covar", focal_additive_cov))
snp_scan_data <- scan1(genoprobs = apr_subset, pheno = pheno_data_qtl, kinship = kinship_loco,
addcovar = cov_list[[focal_additive_cov]], cores = 1)
# Save qtl scans
save(list = c('snp_scan_data'), file = genome_scan_output_file)
#plot png
png(paste(genome_scan_output_file,'.png',sep=''))
plot_scan1(snp_scan_data, map = pmap, type='p', cex=0.3, col=c("#775A99"),
main = paste0(genome_scan_output_file ))
dev.off()
##print to csv
qtl_csv <- paste(output_base,'.genome_wide.qtl.csv',sep='')
df_ieffects = as.data.frame(snp_scan_data)
df_ieffects$marker_name <- rownames(df_ieffects)
write.table(df_ieffects, file = qtl_csv, quote = FALSE, row.names = FALSE, sep = ',', append = FALSE)
####################################################################
####################################################################
## DATA.ANALYSIS_2B run perumutation analysis, find signif
alpha = 0.05
perm_thres = 1-alpha
permute_scan_file <- paste(output_base, '.permute.',n_reps,'.Rdata',sep='')
print('run additive permutation snp scan: ')
permute_scan <- scan1perm(genoprobs = apr_subset, pheno = pheno_data_qtl, kinship = kinship_loco,
addcovar = cov_list[[focal_additive_cov]], n_perm=n_reps, cores = 0)
lod_permute <- round(quantile(permute_scan,perm_thres,na.rm = TRUE), 3)
lod_permute_se <- round( (sd(permute_scan) / length(permute_scan)), 3)
print(paste("additive model", "permute reps",n_reps, "lod:", lod_permute, "se: ", lod_permute_se))
# Save permute_scan data
save(list = c('permute_scan'), file = permute_scan_file)
####################################################################
####################################################################
### DATA.ANALYSIS_3 print sig loci
sig_loci_additive <- list()
additive_threshold = lod_permute
## print significant loci
sig_loci_additive <-find_peaks(snp_scan_data, map = pmap, threshold = lod_permute, peakdrop = 1.5,drop=1.5)
print(sig_loci_additive)
## print mariginal loci
sig_loci_additive_90 <-find_peaks(snp_scan_data, map = pmap, threshold = 0.9*lod_permute, peakdrop = 1.5,drop=1.5)
print(sig_loci_additive_90)
####################################################################
#### DATA.ANALYSIS_4 - estimate genotype effect for all 8 parental alleles
allele_effects_file <- paste(output_base,'.genome_wide.allele_effects.Rdata',sep='')
allele_effects <- list()
print('calc linear model. parental allele_effects, additive model.')
for(c in names(pmap)){ #for all chromosomes
print(c)
## rqtl2 function for BLUP - scan1blup; for linear model - scan1coef
allele_effects[[c]] <- scan1coef(genoprobs = apr_subset[,c], ##do not do BLUP, need to comp
pheno = pheno_data_qtl,
kinship = kinship_loco[[c]],
addcovar = cov_list[[focal_additive_cov]],
se=TRUE,
cores = 1)
}
##save allele effects
save(list = c('allele_effects'), file = allele_effects_file)
####################################################################
### DATA.ANALYSIS_5 -- fine mapping using imputed snps
snpscan_effects_file <- paste(output_base,'.sig_loci.snpscan_effects.Rdata',sep='')
query_variants <- create_variant_query_func(dbfile = query_variants_file)
query_genes <- create_gene_query_func(dbfile = query_genes_file)
Mb_margin <- 5 #set fine mapping window - determines the number of imputed SNPs to test.
snp_scan_data <- list()
print('calc snp_scan additive model')
i <- 1
for(chrm in sig_loci_additive_90$chr %>% unique() %>% as.character()){
tmp <- sig_loci_additive_90 %>% filter(chr == chrm)
print(chrm)
print(tmp)
for(peak_pos in tmp$pos %>% unique() ) {
#peak_pos = as.numeric(sig_loci_additive_90$pos[i])
print(paste("chrm", chrm, "pos", peak_pos))
snp_scan_data[[chrm]] <- scan1snps(genoprobs = apr_subset,
map = pmap,
pheno = pheno_data_qtl,
kinship = kinship_loco[[chrm]],
addcovar = cov_list[[focal_additive_cov]],
query_func = query_variants,
chr = chrm,
start = peak_pos - Mb_margin,
end = peak_pos + Mb_margin,
keep_all_snps = TRUE)
i <- i + 1
##print to tsv
df_fine_lod <- as.data.frame(snp_scan_data[[chrm]]$lod)
df_fine_lod$snp_id <- rownames(df_fine_lod)
print(head(df_fine_lod))
df_fine_snp <- as.data.frame(snp_scan_data[[chrm]]$snpinfo)
print(head(df_fine_snp))
df_fine_merge <- merge(df_fine_lod, df_fine_snp, by=c('snp_id'))
print('head(df_fine_merge)')
print(head(df_fine_merge))
fine_map_file <- paste(output_base, '_chrm',chrm,'pos',peak_pos,'.fine_map.lod.tsv',sep='')
write.table(df_fine_merge, file = fine_map_file, quote = FALSE, row.names = FALSE, sep = "\t", append = FALSE)
}
}
##save snp_scan_data
save(list = c('snp_scan_data'), file = snpscan_effects_file)