-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata-prep.Rmd
196 lines (166 loc) · 7.4 KB
/
data-prep.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
---
title: "R Notebook"
output: html_notebook
---
Prepping data to add to SRA
```{r setup, include=FALSE}
# load libraries
# pacman::p_load(tidyverse, clownfish)
library(tidyverse)
# load functions
source("~/Documents/clownfishr/R/gen-data-funs.R")
source("~/Documents/clownfishr/R/field-data-funs.R")
source("~/Documents/clownfishr/R/lab-data-funs.R")
source("~/Documents/clownfish-pkg/R/funs-lab-meta.R")
# because the myconfig db connection is still not working
source("~/db-connections.R")
# connect to data
leyte <- read_db("Leyte")
lab <- read_db("Laboratory")
```
# Import the template
The first 11 rows are instructions
```{r message=FALSE}
template <- read_tsv("Model.organism.animal.1.0.tsv", skip = 11)
```
# Import the data
```{r}
ligation_ids <- read_genepop("~/Documents/genomics_pinskylab/data/seq33_03_baits_only_SNPs.gen") %>%
# choose only the names column
select(sample) %>%
# remove APCL_ from the ligation_id
mutate(ligation_id = str_extract(sample, "L\\d+")) %>%
# remove the names column
select(-sample)
# bring in fish data from database
fish <- fish_anem_dive() %>%
filter(!is.na(sample_id)) %>%
select(sample_id, sex, date)
#what are the sample_ids for these ligation_ids
sample_ids <- samp_from_lig(ligation_ids) %>%
rename(sample_name = sample_id) %>%
# Add the organism
mutate(organism = "Amphiprion clarkii",
isolate = "not applicable",
tissue = "tail fin") %>%
left_join(fish, by = c("sample_name" = "sample_id")) %>%
rename(collection_date = date)
```
# SRA did not like repeat sample_ids. Removing sample_ids and keeping only ligation ids
```{r}
lig_only <- sample_ids %>%
select(-sample_name) %>%
rename(sample_name = ligation_id)
```
# Apparently there are also duplicate ligation_ids
```{r}
dups <- sample_ids %>%
group_by(ligation_id) %>%
count() %>%
filter(n > 1)
```
# According to this code, there are not duplicates. However, the wording looks like maybe they want unique individuals, as in individuals that were not collected on the same day.
"Your table upload failed because multiple BioSamples cannot have identical attributes. You should have one BioSample for each specimen, and each of your BioSamples must have differentiating information (excluding sample name, title, bioproject accession and description). This check was implemented to encourage submitters to include distinguishing information in their samples. If the distinguishing information is in the sample name, title or description, please recode it into an appropriate attribute, either one of the predefined attributes or a custom attribute you define. If it is necessary to represent true biological replicates as separate BioSamples, you might add an 'aliquot' or 'replicate' attribute, e.g., 'replicate = biological replicate 1', as appropriate. Note that multiple assay types, e.g., RNA-seq and ChIP-seq data may reference the same BioSample if appropriate."
# To get past this, I am going to include ligation_id in sample name and also include a column called ligation_id (a custom attribute).
```{r}
samp_lig <- sample_ids %>%
mutate(sample_name = str_c(sample_name, ligation_id, sep = "_"))
```
# SRA requires that we change M to male and F to female and that we define age, development stage
```{r}
samp_lig <- samp_lig %>%
mutate(sex = ifelse(sex == "M", "male", sex),
sex = ifelse(sex == "F", "female", sex),
sex = ifelse(sex == "J", "not applicable", sex),
dev_stage = ifelse(sex == "not applicable", "juvenile", NA),
dev_stage = ifelse(sex != "not applicable", "adult", dev_stage)) %>%
# remove any non_apcl samples
filter(grepl("APCL", sample_name)) %>%
arrange(ligation_id)
```
# SRA requires that these files are 1000 lines or less
```{r}
table1 <- samp_lig %>%
slice(1:999)
table2 <- samp_lig %>%
slice(1000:1998)
table3 <- samp_lig %>%
slice(1998:2881)
```
# Export the data in tab delimited format
```{r}
write_tsv(table1, "amphiprion-clarkii-table1.tsv")
write_tsv(table2, "amphiprion-clarkii-table2.tsv")
write_tsv(table3, "amphiprion-clarkii-table3.tsv")
```
# Log on to https://submit.ncbi.nlm.nih.gov/subs/sra/SUB4607463/attributes and upload the table, then hit continue.
# The next step is to add metadata
View the SRA_metadata_PADE.xlsx file in excel
```{r}
# template2 <- readxl::read_xlsx("SRA_metadata_PADE.xlsx")
```
Some samples have a different name and need to be replaced
```{r}
# open the list of files that have weird names
replacements <- read_lines("list.txt") %>%
str_extract(., "APCL_L\\d+.F-RG.bam") %>%
tibble() %>%
rename(sample = ".") %>%
mutate(replace = str_c(str_extract(.,"APCL_L\\d+"), "-RG.bam"))
```
# create table for clarkii data
```{r}
metadata <- samp_lig %>%
rename(library_ID = ligation_id) %>%
mutate(title = "ddRADseq of Amphiprion clarkii",
library_strategy = "RAD-Seq",
library_source = "GENOMIC",
library_selection = "Reduced Representation",
library_layout = "single",
platform = "Illumina",
instrument_model = "Illumina HiSeq 2500",
design_description = "ddRADseq using PstI and MluCI, and size selected to 375 ± 38 bp",
filetype = "bam",
filename = str_c("APCL_", library_ID, "-RG.bam", sep = ""),
assembly = NA
) %>%
select(-organism:-dev_stage) %>%
# fix names of regeno files
mutate(filename = ifelse(filename == "APCL_L0306-RG.bam", "APCL_L0306.L3598-RG.bam", filename),
filename = ifelse(filename == "APCL_L0308-RG.bam", "APCL_L0308.L3599-RG.bam", filename),
filename = ifelse(filename == "APCL_L0309-RG.bam", "APCL_L0309.L3601-RG.bam", filename),
filename = ifelse(filename == "APCL_L0814-RG.bam", "APCL_L0814.L3644-RG.bam", filename),
filename = ifelse(filename == "APCL_L1188-RG.bam", "APCL_L1188.L3266.L3352-RG.bam", filename),
filename = ifelse(filename == "APCL_L2324-RG.bam", "APCL_L2324.L3400-RG.bam", filename)) %>%
mutate(filename = ifelse(filename %in% replacements$replace, str_c("APCL_", library_ID, "-RG.bam"), filename))
```
# SRA requires that these files are 1000 lines or less
```{r}
metadata1 <- metadata %>%
slice(1:999)
metadata2 <- metadata %>%
slice(1000:1998)
metadata3 <- metadata %>%
slice(1998:2886)
```
# Export the meta data tables
```{r}
write_tsv(metadata1, "amphiprion-clarkii-metadata1.tsv")
write_tsv(metadata2, "amphiprion-clarkii-metadata2.tsv")
write_tsv(metadata3, "amphiprion-clarkii-metadata3.tsv")
```
# Sending the bam files by FTP - COMPRESS Folder INTO TAR!!!
I logged into the FTP using Fetch following the onscreen instructions and started dragging over bam files.
However, the bam.bai files are also intertwined, it was taking hours, and was timing out before completing all of the transfers.
I copied all of the bam files into a bam files folder in the data > apcl > all_samples > 20181127 > bam\ files.
Maybe I can separate them into folders on my end and then just drag and drop the folder for submission. The folder transfer will take hours but hopefully it won't time out.
# The file names don't match the metadata
hand edit the metadata table because it is just a few regenotypes
# The first submission is called SUB4607463
There is an error of undefined origin, waiting for an email response
# The second submission is called SUB6249894
- Mark no for biosample
- upload table_2 for biosample attributes
- transferring files starting 11:20am on Monday, 9-9-2019
```{r}
```