-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_prism_V10.R
153 lines (134 loc) · 5.44 KB
/
parse_prism_V10.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
prism_tables <- function(path) {
require(jsonlite)
getExtension <- function(file){
ex <- strsplit(basename(file), split="\\.")[[1]]
return(ex[-1])
}
stopifnot(getExtension(path) == "prism")
temp_dir <- tempdir()
unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
sheet_ids <- top_level_json$sheets$data
sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
return(as.vector(sheet_names))
}
extract_y_colnames <- function(path, table = 1) {
require(jsonlite)
require(vroom)
getExtension <- function(file){
ex <- strsplit(basename(file), split="\\.")[[1]]
return(ex[-1])
}
stopifnot(getExtension(path) == "prism")
temp_dir <- tempdir()
unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
sheet_ids <- top_level_json$sheets$data
sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
if(is.numeric(table)) table <- unname(sheet_names[table])
stopifnot(table %in% sheet_names)
table_sheet_id <- names(top_level_json$sheetAttributesMap[sheet_ids])[sheet_names %in% table]
sheet_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "sheets", value = TRUE) |>
grep(pattern = table_sheet_id, value = TRUE) |>
grep(pattern = "sheet.json", value = TRUE) |>
fromJSON()
stopifnot(identical(sheet_json$title, table))
yDataSets <- sheet_json$table$dataSets
search_y <- function(x) {
list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "sets", value = TRUE) |>
grep(pattern = x, value = TRUE)
}
extract_title <- function(set_json) {
if("string" %in% names(set_json$title)) {
set_json$title$string
} else set_json$title
}
y_colnames <- lapply(yDataSets, search_y) |>
lapply(FUN = fromJSON) |>
lapply(FUN = extract_title) |>
unlist()
y_has_replicates <- sheet_json$table$dataFormat == "y_replicates"
y_replicates <- sheet_json$table$replicatesCount
if(y_has_replicates) {
y_colnames <- make.unique(rep(y_colnames, each = y_replicates))
}
return(y_colnames)
}
read_prism <- function(path, table = 1, use_colnames = FALSE, ...) {
require(jsonlite)
require(vroom)
getExtension <- function(file){
ex <- strsplit(basename(file), split="\\.")[[1]]
return(ex[-1])
}
stopifnot(getExtension(path) == "prism")
temp_dir <- tempdir()
unzip(zipfile = path, exdir = paste0(temp_dir, "/prism_unzip"))
top_level_json <- fromJSON(paste0(temp_dir, "/prism_unzip/document.json"))
sheet_ids <- top_level_json$sheets$data
sheet_names <- unlist(top_level_json$sheetAttributesMap[sheet_ids])
if(is.numeric(table)) table <- unname(sheet_names[table])
stopifnot(table %in% sheet_names)
table_sheet_id <- names(top_level_json$sheetAttributesMap[sheet_ids])[sheet_names %in% table]
sheet_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "sheets", value = TRUE) |>
grep(pattern = table_sheet_id, value = TRUE) |>
grep(pattern = "sheet.json", value = TRUE) |>
fromJSON()
stopifnot(identical(sheet_json$title, table))
table_id <- sheet_json$table$uid
if(!is.null(sheet_json$table$xDataSet)) {
xDataSet <- sheet_json$table$xDataSet
x_colname <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "sets", value = TRUE) |>
grep(pattern = xDataSet, value = TRUE) |>
fromJSON() |>
_$title
} else x_colname <- NULL
yDataSets <- sheet_json$table$dataSets
search_y <- function(x) {
list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "sets", value = TRUE) |>
grep(pattern = x, value = TRUE)
}
extract_title <- function(set_json) {
if("string" %in% names(set_json$title)) {
set_json$title$string
} else set_json$title
}
y_colnames <- lapply(yDataSets, search_y) |>
lapply(FUN = fromJSON) |>
lapply(FUN = extract_title) |>
unlist()
y_has_replicates <- sheet_json$table$dataFormat == "y_replicates"
y_replicates <- sheet_json$table$replicatesCount
if(y_has_replicates) {
y_colnames <- make.unique(rep(y_colnames, each = y_replicates))
}
table_data <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "tables", value = TRUE) |>
grep(pattern = table_id, value = TRUE) |>
grep(pattern = "data.csv", value = TRUE) |>
vroom(col_names = FALSE, ...)
dims_json <- list.files(temp_dir, recursive = TRUE, full.names = TRUE) |>
grep(pattern = "data", value = TRUE) |>
grep(pattern = "tables", value = TRUE) |>
grep(pattern = table_id, value = TRUE) |>
grep(pattern = "content.json", value = TRUE) |>
fromJSON()
dims_json <- c(dims_json$numberOfRows, dims_json$numberOfColumns)
stopifnot(identical(dim(table_data), dims_json))
if(use_colnames) {
new_colnames <- c(x_colname, y_colnames)
colnames(table_data) <- c(setdiff(colnames(table_data), tail(colnames(table_data), n = length(new_colnames))), new_colnames)
}
return(table_data)
}