-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_graph_input.R
201 lines (130 loc) · 5.22 KB
/
create_graph_input.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
library("rlist")
library("purrr")
library("stringr")
#TODO check if line is not commented
NotDomain <- c("VAL","MASTER","SDTM","ADAM","ARCHIVE")
#NotDomain <- c()
# -------------------- read in the .txt file with paths --------------------------
assign_paths <- function(phrase, fileName){
path_file <- paste(getwd(), fileName, sep="")
con <- file(path_file, encoding = "UTF-8")
for (line in readLines(con, warn = FALSE)) {
if (startsWith(line, phrase)) {
path <- unlist(strsplit(line, "<-"))[-1]
close(con)
return(path)
}
}
}
# ----------------------- make sure paths are correct ------------------------------
paths_txt <- "/paths.txt" #"paths_example.txt"
# Dataset paths
path_raw_data <- assign_paths("RAW_DATA",paths_txt)
path_sdtm_data <- assign_paths("SDTM_DATA", paths_txt)
path_adam_data <- assign_paths("ADAM_DATA", paths_txt)
# Program paths
path_sdtm_program <- assign_paths("SDTM_PROGRAM", paths_txt)
path_adam_program <- assign_paths("ADAM_PROGRAM", paths_txt)
# ------------------- create the graph input dataset --------------------------------
### 1. Read in all datasets that are in RAW folder
datasets_f <- function(path, pattern){
clean <- function(x){
x <- paste(pattern,str_replace(x, ".sas7bdat",""), sep="")
invisible(toupper(x))
}
datasets_list <- list.files(path, pattern="*.sas7bdat")
clean_list <- lapply(datasets_list, function(x) clean(x))
invisible(clean_list)
}
### 2. Create a list of all programs in validation folder in SDTM /ADaM folders
program_list_sdtm <- list.files(path_sdtm_program, pattern="*.sas")
program_list_adam <- list.files(path_adam_program, pattern="*.sas")
### 3. Return datasets present in the validation programs
processFile <- function(path, fileName, domain, data_patterns) {
con = file(paste(path, fileName, sep=""))
lines <- toupper(readLines(con, warn = FALSE))
list_matches <- c()
for (line in lines) {
for (pattern in data_patterns) {
data_ = regmatches(line, gregexpr(pattern, line))
if (!is.na(data_[[1]][1])){
list_matches <- list.append(list_matches, data_)
}
}
}
final <- unique(unlist(list_matches, use.names = FALSE))
domain_col <- rep(c(domain), times = length(final))
df <- as.data.frame(list(list(final), domain_col))
colnames(df) <- c("from", "to")
df <- subset(df, as.character(from) != as.character(to))
close(con)
return(df)
}
### 4. Create a dataframe containing all results
# - different colours for different levels
# - must contain two non-empy columns: "source", "target"
create_dataframe <- function(program_list, program_path, pattern){
lista = c()
df_all <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(df_all) <- c("from", "to")
data_patterns <- c("RAW\\.[A-Z0-9]+", "SDTM\\.[A-Z0-9]+", "ADAM\\.[A-Z0-9]+")
for (file in program_list){
print(strrep("-",25))
print(file)
get_domain <- function(x){
domain <- unlist(strsplit(x, "_"))
domain <- regmatches(domain, gregexpr("[A-Za-z0-9]+\\.sas", domain))[-1]
domain <- gsub(".sas","",domain)
domain <- toupper(domain[-1])
invisible(domain)
}
domain <- get_domain(file)
if (any(domain %in% NotDomain)) { }
else if (length(domain) > 0) {
domain <- paste(pattern, domain, sep="")
df <- processFile(program_path, file, domain, data_patterns)
print(strrep("-",25))
print(df)
df_all <- rbind(df_all, df)
}
}
invisible(df_all)
}
#data_patterns <- c("RAW\\.[A-Z0-9]+", "SDTM\\.[A-Z0-9]+")
sdtm_df <- create_dataframe(program_list_sdtm, path_sdtm_program, "SDTM.")
adam_df <- create_dataframe(program_list_adam, path_adam_program, "ADAM.")
df_all <- rbind(sdtm_df, adam_df)
### 4b. Add groups to have different colours
check_group <- function(x){
x_split <- unlist(strsplit(x[1][1], "[.]"))
type <- x_split[1]
if (type %in% c("RAW","SDTM","ADAM")){ return (type)}
}
### 5. Get unique list of names of all datasets found
get_all_names <- function(){
source_in_list <- as.list(levels(df_all$from))
target_in_list <- as.list(levels(df_all$to))
every_dataset <- unique(c(source_in_list, target_in_list))
s <- sort(unlist(every_dataset))
df <- data.frame(matrix(unlist(s), nrow=length(s), byrow=T))
colnames(df) <- c("id")
invisible(df)
}
unique <- get_all_names()
unique$group <- apply(unique, 1, FUN = function(id) check_group(id))
### 6. Check if all exisitng datasets are used in programs
# list of all datasets present in study folders
raw_list <- datasets_f(path_raw_data, "RAW.")
sdtm_list <- datasets_f(path_sdtm_data, "SDTM.")
adam_list <- datasets_f(path_adam_data, "ADAM.")
# -------------- write the output into a csv file -------------------------
write.csv(df_all, 'dataframe.csv', row.names=FALSE)
# unique names of all datasets to pu in the drop-down list in Shiny
write.csv(unique, 'unique_names.csv', row.names=FALSE)
# not used datasets
#write.csv(not_used,'not_used.csv', row.names=FALSE)
# TODO
# create a second dataframe to be saved for programmer to see (csv)
# SDTM.VS | RAW.VS, RAW.DM, SDTM.DM, ...
# get an input dataset with names of programmers on main/QC side
# to connect that info later