.Rhistory

# Affinity analysis ----
# Split the transactions into the segment of interest and a baseline made up
# of everybody else, then compare what each group buys.
# Segment of interest: mainstream young singles/couples.
segment1 <- data[
  PREMIUM_CUSTOMER == "Mainstream" & LIFESTAGE == "YOUNG SINGLES/COUPLES"
]
# Baseline: the complement of the condition above (written via De Morgan).
Other <- data[
  PREMIUM_CUSTOMER != "Mainstream" | LIFESTAGE != "YOUNG SINGLES/COUPLES"
]
# Total units bought by the segment and by the baseline.
quantity_segment1 <- sum(segment1[["PROD_QTY"]])
quantity_other <- sum(Other[["PROD_QTY"]])
# Share of each group's units that went to every brand.
quantity_segment1_byBrand <- segment1[
  , list(TargetSeg = sum(PROD_QTY) / quantity_segment1),
  by = "BRAND"
]
quantity_Other_byBrand <- Other[
  , list(otherSegs = sum(PROD_QTY) / quantity_other),
  by = "BRAND"
]
# Affinity = segment share / baseline share; values above 1 mean the segment
# buys the brand relatively more often than the rest of the population.
brand_proportions <- merge(
  quantity_segment1_byBrand,
  quantity_Other_byBrand,
  by = "BRAND"
)
brand_proportions[, AffinityToBrand := TargetSeg / otherSegs]
brand_proportions[order(-AffinityToBrand)]
#### Preferred pack size compared to the rest of the population
# Same affinity calculation as for brands, grouped by pack size instead.
# Share of the segment's units bought in each pack size.
quantity_segment1_byPackSize <- segment1[
  , list(TargetSeg = sum(PROD_QTY) / quantity_segment1),
  by = "PACK_SIZE"
]
# Share of the baseline's units bought in each pack size.
quantity_Other_byPackSize <- Other[
  , list(otherSegs = sum(PROD_QTY) / quantity_other),
  by = "PACK_SIZE"
]
# Affinity = segment share / baseline share, listed from highest to lowest.
packSize_proportions <- merge(
  quantity_segment1_byPackSize,
  quantity_Other_byPackSize,
  by = "PACK_SIZE"
)
packSize_proportions[, AffinityToPack := TargetSeg / otherSegs]
packSize_proportions[order(-AffinityToPack)]
# Which products are sold in the 270 pack size?
data[PACK_SIZE == 270, unique(PROD_NAME)]
# Chunk defaults: echo the code and declare the custom `linewidth` option that
# the output hook below reads.
knitr::opts_chunk$set(echo = TRUE, linewidth = 80)
library(knitr)
# Keep the current output hook so the wrapper can delegate to it.
hook_output <- knit_hooks$get("output")
# Output hook that wraps chunk output to `linewidth` characters before
# handing it to the previous hook.
knit_hooks$set(output = function(x, options) {
  n <- options$linewidth
  # Without a linewidth option the output is passed through untouched.
  if (is.null(n)) {
    return(hook_output(x, options))
  }
  x <- knitr:::split_lines(x)
  # Re-wrap only when at least one line is wider than n.
  if (any(nchar(x) > n)) {
    x <- strwrap(x, width = n)
  }
  hook_output(paste(x, collapse = "\n"), options)
})
library(data.table)
library(ggplot2)
library(tidyr)
# Load the transaction data used for the trial-store analysis.
data <- fread(file = "../data/QVI_data.csv")
#### Set themes for plots
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
#### Calculate these measures over time for each store
#### Add a new month ID column in the data with the format yyyymm.
# The month ID is stored as a number because every later filter compares it
# with numeric literals (e.g. YEARMONTH < 201902) and the date reconstruction
# uses %/% and %%; a character column would only work through implicit
# string coercion.
# NOTE(review): assumes fread parsed DATE as a date class so that format()
# honours "%Y%m" -- confirm against the installed data.table version.
data[, YEARMONTH := as.numeric(format(DATE, "%Y%m"))]
#### Monthly metrics per store
# One row per store and month with: total sales, number of distinct customers,
# transactions per customer, chips (units) per customer and average price per
# unit. uniqueN() counts distinct values within each group.
measureOverTime <- data[
  ,
  list(
    totSales = sum(TOT_SALES),
    nCustomers = uniqueN(LYLTY_CARD_NBR),
    nTxnPerCust = uniqueN(TXN_ID) / uniqueN(LYLTY_CARD_NBR),
    nChipsPerCust = sum(PROD_QTY) / uniqueN(LYLTY_CARD_NBR),
    avgPricePerUnit = sum(TOT_SALES) / sum(PROD_QTY)
  ),
  by = c("STORE_NBR", "YEARMONTH")
][order(STORE_NBR, YEARMONTH)]
#### Stores with a full observation period
# Keep the stores that have a row for each of the 12 months on record.
storesWithFullObs <- measureOverTime[
  , list(N = .N),
  by = "STORE_NBR"
][N == 12, unique(STORE_NBR)]
#### Pre-trial period
# Months before 201902, restricted to the fully observed stores.
preTrialMeasures <- measureOverTime[
  STORE_NBR %in% storesWithFullObs & YEARMONTH < 201902
]
#### Correlation of a metric between the trial store and every store
#
# inputTable      data.table with a STORE_NBR column and the metric column,
#                 one row per store and period.
# metricCol       quoted column name of the metric, e.g. quote(totSales).
# storeComparison store number of the trial store.
#
# Returns a data.table with numeric columns Store1 (the trial store), Store2
# (the compared store) and corr_measure (cor() of the two metric series), one
# row per distinct store in inputTable (the trial store included).
# cor() pairs the values by position, so every store is expected to have the
# same number of rows, in the same period order, as the trial store.
calculateCorrelation <- function(inputTable, metricCol, storeComparison) {
  # Pull the needed columns once instead of filtering the table per store.
  storeIds <- inputTable[, STORE_NBR]
  metricValues <- inputTable[, eval(metricCol)]
  storeNumbers <- unique(storeIds)
  # Nothing to compare: return an empty table with the usual schema.
  if (length(storeNumbers) == 0L) {
    return(data.table(Store1 = numeric(), Store2 = numeric(),
                      corr_measure = numeric()))
  }
  # The trial store's series is the same for every comparison.
  trialValues <- metricValues[which(storeIds == storeComparison)]
  corrValues <- vapply(
    storeNumbers,
    function(storeNbr) {
      cor(metricValues[which(storeIds == storeNbr)], trialValues)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  # Built in one go rather than appended row by row inside a loop.
  data.table(
    Store1 = as.numeric(storeComparison),
    Store2 = as.numeric(storeNumbers),
    corr_measure = corrValues
  )
}
#### Standardised magnitude distance of a metric between the trial store and
#### every store
#
# inputTable      data.table with STORE_NBR, YEARMONTH and the metric column.
# metricCol       quoted column name of the metric, e.g. quote(totSales).
# storeComparison store number of the trial store.
#
# Returns a data.table with Store1 (the trial store), Store2 (the compared
# store) and mag_measure: the mean over months of
# 1 - (distance - min distance) / (max distance - min distance), so 1 means
# closest to the trial store and 0 means furthest.
# Values are paired with the trial store's months by position, so every store
# is expected to have the same months, in the same order, as the trial store.
calculateMagnitudeDistance <- function(inputTable, metricCol, storeComparison) {
  # Pull the needed columns once instead of filtering the table per store.
  storeIds <- inputTable[, STORE_NBR]
  months <- inputTable[, YEARMONTH]
  metricValues <- inputTable[, eval(metricCol)]
  trialRows <- which(storeIds == storeComparison)
  trialMonths <- months[trialRows]
  trialValues <- metricValues[trialRows]
  # One table of absolute monthly differences per store, bound once at the
  # end instead of growing a table inside a loop.
  perStore <- lapply(unique(storeIds), function(storeNbr) {
    data.table(
      Store1 = as.numeric(storeComparison),
      Store2 = as.numeric(storeNbr),
      YEARMONTH = trialMonths,
      measure = abs(trialValues - metricValues[which(storeIds == storeNbr)])
    )
  })
  calcDistTable <- rbindlist(perStore)
  # No stores at all: return an empty table with the usual schema.
  if (nrow(calcDistTable) == 0L) {
    return(data.table(Store1 = numeric(), Store2 = numeric(),
                      mag_measure = numeric()))
  }
  #### Standardize the magnitude distance so that the measure ranges from 0 to 1
  minMaxDist <- calcDistTable[
    , .(minDist = min(measure), maxDist = max(measure)),
    by = c("Store1", "YEARMONTH")
  ]
  distTable <- merge(calcDistTable, minMaxDist, by = c("Store1", "YEARMONTH"))
  distTable[, distRange := maxDist - minDist]
  # When every store is equally distant in a month the range is 0 and the
  # ratio would be 0/0 = NaN; treat that month as a perfect match (1) for all.
  distTable[, magnitudeMeasure := ifelse(distRange > 0,
                                         1 - (measure - minDist) / distRange,
                                         1)]
  distTable[, .(mag_measure = mean(magnitudeMeasure)), by = .(Store1, Store2)]
}
#### Control store selection for trial store 77
# Correlation and magnitude distance of every candidate store against the
# trial store, for total sales and for number of customers.
trial_store <- 77
corr_nSales <- calculateCorrelation(preTrialMeasures, quote(totSales), trial_store)
corr_nCustomers <- calculateCorrelation(preTrialMeasures, quote(nCustomers), trial_store)
# Higher magnitude measure = closer to the trial store.
magnitude_nSales <- calculateMagnitudeDistance(preTrialMeasures, quote(totSales), trial_store)
magnitude_nCustomers <- calculateMagnitudeDistance(preTrialMeasures, quote(nCustomers), trial_store)
#### Combined score per driver: weighted average of correlation and magnitude.
# corr_weight = 0.5 gives a simple average of the two.
corr_weight <- 0.5
score_nSales <- merge(corr_nSales, magnitude_nSales, by = c("Store1", "Store2"))
score_nSales[, scoreNSales := corr_weight * corr_measure +
               (1 - corr_weight) * mag_measure]
score_nCustomers <- merge(corr_nCustomers, magnitude_nCustomers,
                          by = c("Store1", "Store2"))
score_nCustomers[, scoreNCust := corr_weight * corr_measure +
                   (1 - corr_weight) * mag_measure]
#### Combine the sales and customer scores into one table and average them.
score_Control <- merge(score_nSales, score_nCustomers, by = c("Store1", "Store2"))
score_Control[, finalControlScore := scoreNSales * 0.5 + scoreNCust * 0.5]
#### Select the control store: the best-scoring store other than the trial
#### store itself. The trial store is excluded explicitly rather than by
#### taking the second row, which would pick the wrong store whenever the
#### trial store does not rank first (e.g. its own score is NA).
control_store <- score_Control[Store2 != trial_store][order(-finalControlScore)][1, Store2]
control_store
#### Visual checks on trends based on the drivers
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise overwrite totSales in measureOverTime itself.
measureOverTimeSales <- copy(measureOverTime)
# Each row of measureOverTimeSales holds the monthly metrics of one store.
# Label every store as "Trial", "Control" or "Other stores".
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                  ifelse(STORE_NBR == control_store, "Control",
                                                         "Other stores"))]
# Average the monthly sales within each store type.
pastSales <- pastSales[, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")]
# Turn the yyyymm month ID into the first day of that month, then keep the
# months before 201903.
pastSales <- pastSales[, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                                           as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                                     "%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastSales, aes(x = TransactionMonth, y = totSales, color = Store_type)) +
  geom_line() +
  labs(x = "Month of operation", y = "Total sales", title = "Total sales by month")
#### Conduct visual checks on customer count trends by comparing the
#### trial store to the control store and other stores.
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise add columns to measureOverTime itself.
measureOverTimeCusts <- copy(measureOverTime)
# Label the stores, average the monthly customer count within each store
# type, convert the yyyymm month ID to a date and keep months before 201903.
pastCustomers <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                      ifelse(STORE_NBR == control_store, "Control",
                                                             "Other stores"))
][, totCust := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][YEARMONTH < 201903, ]
ggplot(pastCustomers, aes(TransactionMonth, totCust, color = Store_type)) +
  geom_line(aes(linetype = Store_type)) +
  labs(x = "Month of operation", y = "Total customers", title = "Total customers by month")
#### Scaling factor: pre-trial sales of the trial store divided by pre-trial
#### sales of the control store.
scalingFactorForControlSales <-
  preTrialMeasures[STORE_NBR == trial_store & YEARMONTH < 201902, sum(totSales)] /
  preTrialMeasures[STORE_NBR == control_store & YEARMONTH < 201902, sum(totSales)]
#### Apply the scaling factor to the control store's monthly sales.
measureOverTimeSales <- measureOverTime
scaledControlSales <- measureOverTimeSales[STORE_NBR == control_store]
scaledControlSales[, controlSales := totSales * scalingFactorForControlSales]
#### Absolute percentage difference between scaled control sales and trial
#### sales, month by month.
percentageDiff <- merge(
  scaledControlSales[, .(YEARMONTH, controlSales)],
  measureOverTime[STORE_NBR == trial_store, .(YEARMONTH, totSales)],
  by = "YEARMONTH"
)
percentageDiff[, percentDiff := abs(totSales - controlSales) / controlSales]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902][["percentDiff"]])
#### 8 pre-trial months, hence 8 - 1 = 7 degrees of freedom.
degreesOfFreedom <- 7
#### t-value per month: (x - u) / standard deviation, where x is the observed
#### percentage difference and u = 0 is the difference under the null
#### hypothesis. It measures how far the observed difference is from the
#### null mean.
percentageDiff[, tValue := (percentDiff - 0) / stdDev]
percentageDiff[, TransactionMonth := as.Date(
  paste(as.numeric(YEARMONTH) %/% 100, as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
  "%Y-%m-%d"
)]
# Show the t-values for the months 201902 to 201904.
percentageDiff[YEARMONTH > 201901 & YEARMONTH < 201905, .(TransactionMonth, tValue)]
#### 95th percentile of the t distribution (the null distribution), used as
#### the threshold for statistical significance.
qt(0.95, df = degreesOfFreedom)
#### Trial and control store total sales
# Add Store_type, the per-type mean of totSales and TransactionMonth, then
# keep only the trial and control stores.
# copy() is required: measureOverTimeSales is an alias of measureOverTime, so
# `:=` on it would overwrite totSales in measureOverTime as well.
pastSales <- copy(measureOverTimeSales)[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                        ifelse(STORE_NBR == control_store, "Control",
                                                               "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
#### Control store 95th percentile (control sales plus two standard deviations;
#### stdDev is a percentage, hence the multiplicative form)
pastSales_Controls95 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 + stdDev * 2)][
  , Store_type := "Control 95th % confidence interval"]
#### Control store 5th percentile (control sales minus two standard deviations)
pastSales_Controls5 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 - stdDev * 2)][
  , Store_type := "Control 5th % confidence interval"]
trialAssessment <- rbind(pastSales, pastSales_Controls95, pastSales_Controls5)
#### Plot everything in one graph; the rectangle marks months 201902-201904.
ggplot(trialAssessment, aes(TransactionMonth, totSales, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total sales", title = "Total sales by month")
#### Same steps as for total sales, now for the number of customers.
#### Scaling factor: pre-trial customers of the trial store divided by
#### pre-trial customers of the control store.
scalingFactorForControlCust <-
  preTrialMeasures[STORE_NBR == trial_store & YEARMONTH < 201902, sum(nCustomers)] /
  preTrialMeasures[STORE_NBR == control_store & YEARMONTH < 201902, sum(nCustomers)]
#### Apply the scaling factor to the control store's customer counts.
measureOverTimeCusts <- measureOverTime
scaledControlCustomers <- measureOverTimeCusts[STORE_NBR == control_store]
scaledControlCustomers[, controlCustomers := nCustomers * scalingFactorForControlCust]
scaledControlCustomers[, Store_type := ifelse(
  STORE_NBR == trial_store, "Trial",
  ifelse(STORE_NBR == control_store, "Control", "Other stores")
)]
#### Absolute percentage difference between scaled control customers and
#### trial customers, month by month.
percentageDiff <- merge(
  scaledControlCustomers[, c("controlCustomers", "YEARMONTH")],
  measureOverTimeCusts[STORE_NBR == trial_store, c("nCustomers", "YEARMONTH")],
  by = "YEARMONTH"
)
percentageDiff[, percentDiff := abs(controlCustomers - nCustomers) / controlCustomers]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902][["percentDiff"]])
degreesOfFreedom <- 7
#### Trial and control store number of customers
# copy() is required: measureOverTimeCusts is an alias of measureOverTime, so
# `:=` on it would add columns to measureOverTime as well.
pastCustomers <- copy(measureOverTimeCusts)[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                            ifelse(STORE_NBR == control_store, "Control",
                                                                   "Other stores"))
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][, nCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][Store_type %in% c("Trial", "Control"), ]
#### Control store 95th percentile (control customers plus two standard
#### deviations; stdDev is a percentage, hence the multiplicative form)
pastCustomers_Controls95 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 + stdDev * 2)][
  , Store_type := "Control 95th % confidence interval"]
#### Control store 5th percentile
pastCustomers_Controls5 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 - stdDev * 2)][
  , Store_type := "Control 5th % confidence interval"]
trialAssessment <- rbind(pastCustomers, pastCustomers_Controls95, pastCustomers_Controls5)
#### Plot everything into one graph.
#### geom_rect draws a rectangle that marks months 201902-201904.
ggplot(trialAssessment, aes(x = TransactionMonth, y = nCusts, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total customers", title = "Total customers by month")
#### Recalculate the monthly store metrics, as for the first trial store.
measureOverTime <- data[, .(totSales = sum(TOT_SALES),
                            nCustomers = uniqueN(LYLTY_CARD_NBR),
                            nTxnPerCust = uniqueN(TXN_ID) / uniqueN(LYLTY_CARD_NBR),
                            nChipsPerCust = sum(PROD_QTY) / uniqueN(LYLTY_CARD_NBR),
                            avgPricePerUnit = sum(TOT_SALES) / sum(PROD_QTY)),
                        by = .(STORE_NBR, YEARMONTH)][order(STORE_NBR, YEARMONTH)]
#### Correlation and magnitude distance of each potential control store
#### against trial store 86.
trial_store <- 86
corr_nSales <- calculateCorrelation(preTrialMeasures, quote(totSales), trial_store)
corr_nCustomers <- calculateCorrelation(preTrialMeasures, quote(nCustomers), trial_store)
magnitude_nSales <- calculateMagnitudeDistance(preTrialMeasures, quote(totSales), trial_store)
magnitude_nCustomers <- calculateMagnitudeDistance(preTrialMeasures, quote(nCustomers), trial_store)
#### Combined score per driver: weighted average of correlation and magnitude.
corr_weight <- 0.5
score_nSales <- merge(corr_nSales, magnitude_nSales, by = c("Store1", "Store2"))
score_nSales[, scoreNSales := corr_measure * corr_weight + mag_measure * (1 - corr_weight)]
score_nCustomers <- merge(corr_nCustomers, magnitude_nCustomers, by = c("Store1", "Store2"))
score_nCustomers[, scoreNCust := corr_measure * corr_weight + mag_measure * (1 - corr_weight)]
#### Combine the scores across the drivers using a simple average.
score_Control <- merge(score_nSales, score_nCustomers, by = c("Store1", "Store2"))
score_Control[, finalControlScore := scoreNSales * 0.5 + scoreNCust * 0.5]
#### Select the control store for trial store 86: the best-scoring store
#### other than the trial store itself. The trial store is excluded
#### explicitly rather than by taking the second row, which would pick the
#### wrong store whenever the trial store does not rank first.
control_store <- score_Control[Store1 == trial_store & Store2 != trial_store
][order(-finalControlScore)][1, Store2]
control_store
#### Conduct visual checks on trends based on the drivers
# control_store <- 227
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise overwrite totSales in measureOverTime itself.
measureOverTimeSales <- copy(measureOverTime)
# Label the stores, average monthly sales within each store type, convert the
# yyyymm month ID to a date and keep months before 201903.
# The date separator and format use the plain ASCII hyphen "-" (the previous
# text contained the Unicode hyphen U+2010 in both places).
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                  ifelse(STORE_NBR == control_store, "Control",
                                                         "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1,
                                      sep = "-"), "%Y-%m-%d")
][YEARMONTH < 201903, ]
ggplot(pastSales, aes(TransactionMonth, totSales, color = Store_type)) +
  geom_line(aes(linetype = Store_type)) +
  labs(x = "Month of operation", y = "Total sales", title = "Total sales by month")
#### Conduct visual checks on trends based on the drivers (customer counts)
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise add columns to measureOverTime itself.
measureOverTimeCusts <- copy(measureOverTime)
# Label the stores, average the monthly customer count within each store
# type, convert the yyyymm month ID to a date and keep months before 201903.
# The date separator and format use the plain ASCII hyphen "-" (the previous
# text contained the Unicode hyphen U+2010 in both places).
pastCustomers <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                      ifelse(STORE_NBR == control_store, "Control",
                                                             "Other stores"))
][, numberCustomers := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1,
                                      sep = "-"), "%Y-%m-%d")
][YEARMONTH < 201903, ]
ggplot(pastCustomers, aes(TransactionMonth, numberCustomers, color = Store_type)) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total number of customers", title = "Total number of customers by month")
#### Scaling factor: pre-trial sales of the trial store divided by pre-trial
#### sales of the control store.
scalingFactorForControlSales <-
  preTrialMeasures[STORE_NBR == trial_store & YEARMONTH < 201902, sum(totSales)] /
  preTrialMeasures[STORE_NBR == control_store & YEARMONTH < 201902, sum(totSales)]
#### Apply the scaling factor to the control store's monthly sales.
measureOverTimeSales <- measureOverTime
scaledControlSales <- measureOverTimeSales[STORE_NBR == control_store]
scaledControlSales[, controlSales := totSales * scalingFactorForControlSales]
#### Absolute percentage difference between scaled control sales and trial
#### sales, month by month.
percentageDiff <- merge(
  scaledControlSales[, .(controlSales, YEARMONTH)],
  measureOverTimeSales[STORE_NBR == trial_store, .(totSales, YEARMONTH)],
  by = "YEARMONTH"
)
percentageDiff[, percentDiff := abs(controlSales - totSales) / controlSales]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902][["percentDiff"]])
degreesOfFreedom <- 7
#### Trial and control store total sales
#### Sales by store type and month; only the trial and control store are kept.
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise overwrite totSales in measureOverTime itself.
measureOverTimeSales <- copy(measureOverTime)
# The date separator and format use the plain ASCII hyphen "-" (the previous
# text contained the Unicode hyphen U+2010 in both places).
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                  ifelse(STORE_NBR == control_store, "Control",
                                                         "Other stores"))
][, totSales := mean(totSales), by = c("Store_type", "YEARMONTH")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1,
                                      sep = "-"), "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
#### 5th and 95th percentile for control store sales, approximated by two
#### standard deviations away from the mean. stdDev is a standard deviation
#### of percentages, not of dollar sales, hence the multiplicative form.
pastSales_Controls95 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 + 2 * stdDev)][
  , Store_type := "Control 95th % confidence interval"]
pastSales_Controls5 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 - 2 * stdDev)][
  , Store_type := "Control 5th % confidence interval"]
#### Combine pastSales, pastSales_Controls95 and pastSales_Controls5.
trialAssessment <- rbind(pastSales, pastSales_Controls95, pastSales_Controls5)
#### Plot everything in one graph; the rectangle marks months 201902-201904.
ggplot(trialAssessment, aes(TransactionMonth, totSales, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total sales", title = "Total sales by month")
#### Same steps as for total sales, now for the number of customers.
#### Scaling factor: pre-trial customers of the trial store divided by
#### pre-trial customers of the control store.
scalingFactorForControlCust <-
  preTrialMeasures[STORE_NBR == trial_store & YEARMONTH < 201902, sum(nCustomers)] /
  preTrialMeasures[STORE_NBR == control_store & YEARMONTH < 201902, sum(nCustomers)]
#### Apply the scaling factor to the control store's customer counts.
measureOverTimeCusts <- measureOverTime
scaledControlCustomers <- measureOverTimeCusts[STORE_NBR == control_store, ][
  , controlCustomers := nCustomers * scalingFactorForControlCust][
  , Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                  ifelse(STORE_NBR == control_store, "Control", "Other stores"))]
#### Absolute percentage difference between scaled control customers and
#### trial customers. The column is named percentDiff, as in the other
#### sections, so it no longer shares its name with the table.
percentageDiff <- merge(scaledControlCustomers[, c("YEARMONTH", "controlCustomers")],
                        measureOverTime[STORE_NBR == trial_store, c("nCustomers", "YEARMONTH")],
                        by = "YEARMONTH")[
  , percentDiff := abs(controlCustomers - nCustomers) / controlCustomers]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902, percentDiff])
degreesOfFreedom <- 7
#### Trial and control store number of customers
# copy() is required: measureOverTimeCusts is an alias of measureOverTime, so
# `:=` on it would add columns to measureOverTime as well.
# The Trial/Control filter is applied once, at the end of the chain.
pastCustomers <- copy(measureOverTimeCusts)[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                            ifelse(STORE_NBR == control_store, "Control",
                                                                   "Other stores"))
][, nCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
#### Control store 95th percentile (control customers plus two standard
#### deviations; stdDev is a percentage, hence the multiplicative form)
pastCustomers_Controls95 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 + stdDev * 2)][
  , Store_type := "Control 95th % confidence interval"]
#### Control store 5th percentile
pastCustomers_Controls5 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 - stdDev * 2)][
  , Store_type := "Control 5th % confidence interval"]
trialAssessment <- rbind(pastCustomers, pastCustomers_Controls95, pastCustomers_Controls5)
#### Plot everything in one graph; the rectangle marks months 201902-201904.
# The title is a single line (it previously contained an embedded line break).
ggplot(trialAssessment, aes(TransactionMonth, nCusts, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total number of customers",
       title = "Total number of customers by month")
#### Conduct the analysis on trial store 88.
# Recalculate the monthly store metrics.
measureOverTime <- data[, .(totSales = sum(TOT_SALES),
                            nCustomers = uniqueN(LYLTY_CARD_NBR),
                            nTxnPerCust = uniqueN(TXN_ID) / uniqueN(LYLTY_CARD_NBR),
                            nChipsPerCust = sum(PROD_QTY) / uniqueN(LYLTY_CARD_NBR),
                            avgPricePerUnit = sum(TOT_SALES) / sum(PROD_QTY)),
                        by = .(STORE_NBR, YEARMONTH)][order(STORE_NBR, YEARMONTH)]
#### Correlation of the sales and number of customers of each potential
#### control store with the trial store.
trial_store <- 88
corr_nSales <- calculateCorrelation(inputTable = preTrialMeasures,
                                    metricCol = quote(totSales),
                                    storeComparison = trial_store)
corr_nCustomers <- calculateCorrelation(inputTable = preTrialMeasures,
                                        metricCol = quote(nCustomers),
                                        storeComparison = trial_store)
#### Magnitude distance of the sales and number of customers of each
#### potential control store to the trial store.
magnitude_nSales <- calculateMagnitudeDistance(preTrialMeasures, quote(totSales), trial_store)
magnitude_nCustomers <- calculateMagnitudeDistance(preTrialMeasures, quote(nCustomers), trial_store)
#### Combined score per driver: weighted average of correlation and magnitude.
corr_weight <- 0.5
score_nSales <- merge(corr_nSales, magnitude_nSales, by = c("Store1", "Store2"))
score_nSales[, scoreNSales := corr_weight * corr_measure + (1 - corr_weight) * mag_measure]
score_nCustomers <- merge(corr_nCustomers, magnitude_nCustomers, by = c("Store1", "Store2"))
score_nCustomers[, scoreNCust := corr_weight * corr_measure + (1 - corr_weight) * mag_measure]
#### Merge the sales and customer scores and compute the final combined score.
score_Control <- merge(score_nSales, score_nCustomers, by = c("Store1", "Store2"))
score_Control[, finalControlScore := scoreNSales * 0.5 + scoreNCust * 0.5]
#### Select the control store for trial store 88: the best-scoring store
#### other than the trial store itself. The trial store is excluded
#### explicitly rather than by taking the second row, which would pick the
#### wrong store whenever the trial store does not rank first.
control_store <- score_Control[Store2 != trial_store][order(-finalControlScore)][1, Store2]
control_store
#### Visual checks on trends based on the drivers
#### Monthly total sales of the trial store compared with the control store
#### and the other stores, for months before 201903.
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise overwrite totSales in measureOverTime itself.
measureOverTimeSales <- copy(measureOverTime)
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                  ifelse(STORE_NBR == control_store, "Control",
                                                         "Other stores"))]
# Average the monthly sales within each store type.
pastSales <- pastSales[, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")]
# Convert the yyyymm month ID to the first day of that month.
pastSales <- pastSales[, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                                           as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                                     "%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastSales, aes(TransactionMonth, totSales, color = Store_type)) +
  geom_line(aes(linetype = Store_type)) +
  geom_point() +
  labs(x="Month of operation",y="Total sales",title="Total sales by month")
#### Visual checks on trends based on the drivers
#### Monthly customer counts of the trial store compared with the control
#### store and the other stores, for months before 201903.
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise add columns to measureOverTime itself.
measureOverTimeCusts <- copy(measureOverTime)
pastCustomers <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                      ifelse(STORE_NBR == control_store, "Control",
                                                             "Other stores"))]
# Average the monthly customer count within each store type.
pastCustomers <- pastCustomers[, totCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")]
# Convert the yyyymm month ID to the first day of that month.
pastCustomers <- pastCustomers[, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                                                   as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                                             "%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastCustomers, aes(TransactionMonth, totCusts, color = Store_type)) +
  geom_line(aes(linetype = Store_type)) +
  geom_point() +
  labs(x="Month of operation",y="Total customers",title="Total customers by month")
#### Scaling factor: pre-trial sales of the trial store divided by pre-trial
#### sales of the control store (preTrialMeasures only holds pre-trial months).
scalingFactorForControlSales <- sum(preTrialMeasures[STORE_NBR == trial_store][["totSales"]]) /
  sum(preTrialMeasures[STORE_NBR == control_store][["totSales"]])
#### Apply the scaling factor to the control store's monthly sales.
measureOverTimeSales <- measureOverTime
scaledControlSales <- measureOverTimeSales[STORE_NBR == control_store]
scaledControlSales[, controlSales := scalingFactorForControlSales * totSales]
#### Absolute percentage difference between scaled control sales and trial
#### sales, month by month.
percentageDiff <- merge(
  scaledControlSales[, .(controlSales, YEARMONTH)],
  measureOverTime[STORE_NBR == trial_store, .(totSales, YEARMONTH)],
  by = "YEARMONTH"
)
percentageDiff[, percentDiff := abs(totSales - controlSales) / controlSales]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902][["percentDiff"]])
degreesOfFreedom <- 7
#### Trial and control store total sales
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise overwrite totSales in measureOverTime itself.
measureOverTimeSales <- copy(measureOverTime)
# Label the stores, average monthly sales within each store type, convert the
# yyyymm month ID to a date and keep only the trial and control stores.
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                  ifelse(STORE_NBR == control_store, "Control",
                                                         "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
#### Control store 95th percentile (control sales plus two standard deviations;
#### stdDev is a percentage, hence the multiplicative form)
pastSales_Controls95 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 + stdDev * 2)][
  , Store_type := "Control 95th % confidence interval"]
#### Control store 5th percentile
pastSales_Controls5 <- pastSales[Store_type == "Control", ][
  , totSales := totSales * (1 - stdDev * 2)][
  , Store_type := "Control 5th % confidence interval"]
trialAssessment <- rbind(pastSales, pastSales_Controls95, pastSales_Controls5)
#### Plot everything in one graph; the rectangle marks months 201902-201904.
ggplot(trialAssessment, aes(TransactionMonth, totSales, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total sales", title = "Total sales by month")
#### Same steps as for total sales, now for the number of customers.
#### Scaling factor: pre-trial customers of the trial store divided by
#### pre-trial customers of the control store.
scalingFactorForControlCust <-
  preTrialMeasures[STORE_NBR == trial_store & YEARMONTH < 201902, sum(nCustomers)] /
  preTrialMeasures[STORE_NBR == control_store & YEARMONTH < 201902, sum(nCustomers)]
#### Apply the scaling factor to the control store's customer counts.
measureOverTimeCusts <- measureOverTime
scaledControlCustomers <- measureOverTimeCusts[STORE_NBR == control_store]
scaledControlCustomers[, controlCustomers := nCustomers * scalingFactorForControlCust]
scaledControlCustomers[, Store_type := ifelse(
  STORE_NBR == trial_store, "Trial",
  ifelse(STORE_NBR == control_store, "Control", "Other stores")
)]
#### Absolute percentage difference between scaled control customers and
#### trial customers, month by month.
percentageDiff <- merge(
  scaledControlCustomers[, .(YEARMONTH, controlCustomers)],
  measureOverTime[STORE_NBR == trial_store, .(nCustomers, YEARMONTH)],
  by = "YEARMONTH"
)
percentageDiff[, percentDiff := abs(controlCustomers - nCustomers) / controlCustomers]
#### The null hypothesis is that the trial period behaves like the pre-trial
#### period, so the standard deviation is taken from the pre-trial
#### percentage differences.
stdDev <- sd(percentageDiff[YEARMONTH < 201902][["percentDiff"]])
# 8 pre-trial months, hence 8 - 1 = 7 degrees of freedom.
degreesOfFreedom <- 7
#### Trial and control store number of customers
# Work on a copy: a plain `<-` only aliases a data.table, so the `:=` calls
# below would otherwise add columns to measureOverTime itself.
measureOverTimeCusts <- copy(measureOverTime)
# Label the stores, average the monthly customer count within each store
# type, convert the yyyymm month ID to a date and keep only the trial and
# control stores.
pastCustomers <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store, "Trial",
                                                      ifelse(STORE_NBR == control_store, "Control",
                                                             "Other stores"))
][, nCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(as.numeric(YEARMONTH) %/% 100,
                                      as.numeric(YEARMONTH) %% 100, 1, sep = "-"),
                                "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
#### Control store 95th percentile (control customers plus two standard
#### deviations; stdDev is a percentage, hence the multiplicative form)
pastCustomers_Controls95 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 + stdDev * 2)][
  , Store_type := "Control 95th % confidence"]
#### Control store 5th percentile
pastCustomers_Controls5 <- pastCustomers[Store_type == "Control", ][
  , nCusts := nCusts * (1 - stdDev * 2)][
  , Store_type := "Control 5th % confidence"]
#### Combine pastCustomers, pastCustomers_Controls95 and pastCustomers_Controls5
trialAssessment <- rbind(pastCustomers, pastCustomers_Controls95, pastCustomers_Controls5)
#### Plot everything in one graph; the rectangle marks months 201902-201904.
ggplot(trialAssessment, aes(TransactionMonth, nCusts, color = Store_type)) +
  geom_rect(data = trialAssessment[YEARMONTH > 201901 & YEARMONTH < 201905, ],
            aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
                ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
  geom_line() +
  geom_point() +
  labs(x = "Month of operation", y = "Total customers", title = "Total customers by month")
library(knitr)
library(kableExtra)
# Summary of which metrics differed significantly for each trial store.
# check.names = FALSE keeps the header "Trial Stores" exactly as written;
# by default data.frame() would turn it into the syntactic name
# "Trial.Stores", which is what the rendered table would then show.
results_table <- data.frame(
  "Trial Stores" = c("77", "86", "88"),
  "Sales" = c("Yes", "No", "Yes"),
  "Customers" = c("Yes", "Yes", "No"),
  check.names = FALSE
)
# Create the table using kable
table_output <- kable(results_table, format = "html")
# Apply styling using kable_styling
table_output <- kable_styling(table_output, bootstrap_options = "striped")
# Add a header row with the title, spanning all three columns
table_output <- add_header_above(
  table_output,
  header = c("Summary of Significant Difference in our Metrics" = 3)
)
table_output