-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_gene_dropping_script.R
225 lines (141 loc) · 7.44 KB
/
example_gene_dropping_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
### Example code for gene-dropping
## Clear workspace
rm(list = ls())
gc()
library(GeneDrop)
## Inspect the included files
# Pedigree file - A pedigree with 10000 individuals
head(GeneDropEx_ped)
dim(GeneDropEx_ped)
# Founder haplotypes file - 650 founder haplotypes for 30000 loci
dim(GeneDropEx_hap)
GeneDropEx_hap[1:10,1:10]
# Map Files - Map distance information for the loci
head(GeneDropEx_map)
# Extract map distances in cM
map_distances <- GeneDropEx_map[['cMdiff']]
# Find number of loci on each chromosome
chrom_loci_num <- unname(table(GeneDropEx_map[, 'Chr']))
# Extract the founder haplotypes
founder_haplotypes<-GeneDropEx_hap
### Add a unique '3' allele that we can track later
founder_haplotypes[255,1024]<-3
## Run fix_pedigree on the file to ensure it is in the correct order
# and that required columns are present
pedigree <- fix_pedigree(GeneDropEx_ped)
# Run est_cohort to estimated cohorts for individuals that don't have one
pedigree[, 'Cohort'] <- est_cohort(pedigree)
# Remove individuals that still don't have a cohort
pedigree <- pedigree[!is.na(pedigree[, 'Cohort']), ]
# Create vector containing founder information, here all
# individuals born before 1990 are considered founders
founders <- pedigree[pedigree[, 'Cohort'] < 1990, 'ID']
length(founders)
# Set a seed so the results are repeatable
set.seed(48298)
# Run complete_ped_links to randomly fill in missing pedigree links
ped_completed <- complete_ped_links(pedigree, founders)
# Perform gene-dropping
gene_drop_01 <- genedrop(
pedigree = ped_completed,
map_dist = map_distances,
chr_loci_num = chrom_loci_num,
found_hap = founder_haplotypes,
to_raw = TRUE
)
### Tracking alleles back to ancestors
# We can find founders that have the '3' allele at locus 1024 with the following code
# We double the number of founders because there are two haplotypes per individual.
# We use as.raw as the genotype matrix is stored as raw vectors
founder_3 <- unlist(sapply(1:(length(founders)*2), function(x)
{
if (get_genotype_matrix(gene_drop_01)[[x]][1024] == as.raw(3)){x}}))
# These row references for the genotype matrix can be converted into
# pedigree row references
id_ref = ifelse (founder_3 %% 2 != 0, (founder_3 + 1) / 2, founder_3 / 2)
## And finally we can extract the IDs
founder_id_3 = get_pedigree(gene_drop_01)[id_ref, 'ID']
# These founder individuals have a copy of the '3 allele, every '3' allele should be able to
# be traced back to these individuals
print(founder_id_3)
# Similarly we can find the individual furthest down the pedigree with the '3' allele
rows_allele_3 = tail(unlist(sapply(1:length(get_genotype_matrix(gene_drop_01)), function(x) {
if (get_genotype_matrix(gene_drop_01)[[x]][1024] == as.raw(3)) {x}})),1)
# And find the individuals that allele belongs to
id_ref = ifelse(rows_allele_3 %% 2 != 0, (rows_allele_3 + 1) / 2, (rows_allele_3 / 2))
id_track = get_pedigree(gene_drop_01)[id_ref, 'ID']
print(id_track)
# We can track the allele back to its original ancestor
# allele_track_back takes an ID and loci number,
# along with the gene-drop output
tr_3 <- allele_track_back(id_track, 1024, gene_drop_01)
# An object is returned which shows tracks the sire and the dam allele
# back to their original ancestor
# If this tracking has worked the allele should have been tracked back
# to one of the founders with the '3' allele
any(c(get_allele_dam(tr_3)[,'ID'],get_allele_sire(tr_3)[,'ID']) %in% founder_id_3)
# And every genotype position should have the same allele for both the dam:
id_ref <- id_ref(gene_drop_01, get_allele_dam(tr_3)[, 'ID'])
row_ref = mapply(function(x, y)
c(x * 2 - 1, x * 2)[y] , id_ref, as.numeric(get_allele_dam(tr_3)[, 'Hap']))
unlist(lapply(get_genotype_matrix(gene_drop_01)[row_ref], function(x)
as.numeric(x[1024])))
# And the sire
id_ref <- id_ref(gene_drop_01, get_allele_sire(tr_3)[, 'ID'])
row_ref = mapply(function(x, y)
c(x * 2 - 1, x * 2)[y] , id_ref , as.numeric(get_allele_sire(tr_3)[, 'Hap']))
unlist(lapply(get_genotype_matrix(gene_drop_01)[row_ref], function(x)
as.numeric(x[1024])))
# allele_track_back can also take vectors of IDs and loci for the first two arguments
# and returns a list of objects for each combination e.g.
t_multi <- allele_track_back(c('1',id_track), c(1024, 256), gene_drop_01)
### Tracking alleles through descendants
# Use the allele_track_forw to find all the descendants from the first founder with a '3'
# each allele at locus 1024
tr_3_for <- allele_track_forw(founder_id_3[3], 1024, gene_drop_01)
# The following output should produce all 3's as output
# It takes the ID info from the descendants with the first allele at the locus 1024
# and finds the relevant row references to extract the alleles of the individuals
# from the gene-drop genotype matrix
id_ref <- id_ref(gene_drop_01, get_allele_dam(tr_3_for)[,'ID'])
row_ref = mapply(function(x, y) c(x * 2 - 1, x * 2)[y],
id_ref, as.numeric(get_allele_dam(tr_3_for)[, 'Hap']))
unlist(lapply(get_genotype_matrix(gene_drop_01)[row_ref], function(x)
as.numeric(x[1024])))
# All the alleles tracked back from the sre should be the same too
id_ref <- id_ref(gene_drop_01, get_allele_sire(tr_3_for)[,'ID'])
row_ref = mapply(function(x, y) c(x * 2 - 1, x * 2)[y],
id_ref, as.numeric(get_allele_sire(tr_3_for)[, 'Hap']))
unlist(lapply(get_genotype_matrix(gene_drop_01)[row_ref], function(x)
as.numeric(x[1024])))
# allele_track_forw can also take vectors of IDs and loci for the first two arguments
# and returns an output list of allele_track_objects which contains all combinations
# e.g.
tr_3_for_2 <- allele_track_forw(c(founder_id_3[1], founder_id_3[1]), c(100, 1024), gene_drop_01)
# ID's and genotypes of interest can be extracted using extract_genotype_mat()
## To investigate the change in the '3' allele over time
extracted_loci <- extract_genotype_mat(gene_drop_01, ids='ALL', loci=c(1024))
extracted_loci_info<-cbind(extracted_loci, Cohort = get_pedigree(gene_drop_01)[,'Cohort'])
allele_count<-rbind(t(table(extracted_loci_info[,'L_1024_01'],extracted_loci_info[,'Cohort'])))
cohort_size <- cbind(table(get_pedigree(gene_drop_01)[,'Cohort']))
matplot(apply(allele_count, 2, function(x) x/cohort_size), type='l',
ylab = "Allele Frequency", xlab="Years Passed")
# You can plot the route of descent of the alleles at a given locus for an individual of interest
# using plot_allele_desc, by default background is set to TRUE, this plots the whole pedigree in light grey
# but can take a while depending on the size of the pedigree.
plot_allele_desc(
gene_drop_object = gene_drop_01,
id = founder_id_3[3],
loci = 1024,
background = FALSE,
)
# There several options for exporting the genotype matrix but they will all take a while
# You can write a text file with either two lines per individual (format_short = TRUE) or alternatively 1 line per individual
# with two columns per loci (format_short = FALSE). Remember that ehe resulting file is likely to be large.
write_file(gene_drop_01, filename = "GeneDrop_out", format_short = FALSE)
# Alternatively you can write binary plink files from the output, by default the map distances for the .bim file will be left blank,
# but they can be specified. Note that this won't work using the gene-drop object generated by this script because the additional 3 allele
# means the 1024 locus is not biallelic
write_bed( gene_drop_01, filename = "GeneDrop_out", cM_positions = GeneDropEx_map[,'cMPosition'],
gen_positions = GeneDropEx_map[,'GenomicPosition'],
)