-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNewDataSet.R
137 lines (111 loc) · 3.51 KB
/
NewDataSet.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#requirements
require(tidyverse) #tidy programming
require(lubridate) #easy working with datetimes
require(splitstackshape) #working with checkboxes
# wrapper function to unzip a file and place the result in a directory "data"
Unzip_Data = function(zip_file){
unzip(zipfile = zip_file, overwrite = TRUE, exdir = "./data")
list.files("./data")%>%
.[1]%>%
return()
}
#adds a zero to single digit values for time parsing
Add_Zero = function(string){
string=string%>%
as.character()%>%
as.numeric()
ifelse(string < 10, paste0("0",string), string)%>%
return()
}
#split a column with many checked items into a matrix with each item checked
Split_CheckBoxes = function(column, name){
#first, get all the items
column = as.character(column)
categories = NULL
for(line in 1:length(column)){
items = strsplit(column[line], split=",")%>%unlist()
categories = c(categories,items)
}
categories = unique(categories)
#create an empty dataframe
toReturn = data.frame(matrix(nrow = length(column),
ncol = length(categories)))
#loop by row of original data, and then by column of categories,
#checking if each row has the category
for(line in 1:length(column)){
for(x in 1:length(categories)){
toReturn[line,x] = ifelse(grepl(categories[x],column[line]), 1,0)
}
}
names(toReturn) = paste0(name,".",gsub(" ", "_", categories))
return(toReturn)
}
# Load and clean a file of call logs from Qualtrics
Load_Data = function(f){
tryCatch({
#read in the dataset
#f = list.files("./data")[1]
dat = qualtRics::readSurvey(f)%>%
as.tibble()
#special case
if(nrow(dat) < 2){
return(NULL)
}
#remove incomplete rows from qualtrics
dat = dat%>%
filter(Finished == "True" | Finished == "TRUE")
#convert datetimes for call start and end
dat = dat%>%
mutate(call_start = mdy_hm(paste0(
as.character(.$start_date), " ",
Add_Zero(.$start_time_1), ":",
Add_Zero(.$start_time_2), " ",
toupper(as.character(.$start_time_3)))),
call_end = mdy_hm(paste0(
as.character(.$end_date), " ",
Add_Zero(.$end_time_1), ":",
Add_Zero(.$end_time_2), " ",
toupper(as.character(.$end_time_3)))))%>%
select(-c(start_date:end_time_3))
#record when log was submitted
dat = dat%>%
mutate(Recorded = ymd_hms(.$RecordedDate))%>%
select(-RecordedDate)
#only take columns from relevant call info onwards
dat = dat%>%
select(primary:couns_opinion, call_start:Recorded)
#refactor
dat = dat%>%
mutate(body = as.character(body),
opinion = as.character(couns_opinion)) %>%
select(-couns_opinion)
# expand grid of checkbox factors
checkboxes = c("primary_issues", "secondary_issues", "skills", "referrals", "marketing")
for(x in checkboxes) {
dat = concat.split.expanded(dat, x, type = "character", drop = TRUE) %>%
as.tibble()
}
return(dat)
},
error = function(err){
print("Load data error! Check that this is the correct file type!")
return(NULL)
})
}
#get the earliest call start time
getMin = function(df){
df%>%
select(call_start)%>%
slice(which.min(call_start))%>%
.$call_start%>%
floor_date("month")%>%
return()
}
getMax = function(df){
df%>%
select(call_start)%>%
slice(which.max(call_start))%>%
.$call_start%>%
ceiling_date("month")%>%
return()
}