ML_Predicting-Vehicle-Fuel-Efficiency.Rmd

---
title: "ML_Predicting-Vehicle-Fuel-Efficiency"
author: "Reinp"
date: "`r Sys.Date()`"
output:
  pdf_document: default
  html_document: 
    keep_md: yes
  word_document: default
---

# R Programming

## Set Chunk requirements

```{r setup, include=TRUE}
# Global chunk defaults for the whole document:
#   echo    = TRUE  -> the code of every chunk is shown in the final document
#                      (echo = FALSE would hide the code but still display any
#                      results/output)
#   message = FALSE -> R messages are left out of the final document
#   warning = FALSE -> R warnings are left out of the final document
# For reference: a chunk with include = FALSE is still evaluated, but neither
# its code nor its output is displayed.
knitr::opts_chunk$set(warning = FALSE, message = FALSE, echo = TRUE)
```


## Load Relevant Packages and Data Set

```{r}
library(tidyverse)
## tidyverse attaches readr, ggplot2, dplyr, forcats, tibble, tidyr, purrr, stringr

## Read the data set.
## The path is relative: when the document is knitted, chunks are evaluated in
## the directory that holds this .Rmd, so cars2020.csv is expected to sit next
## to it (when running interactively, open the project at the repository root).
## No setwd() is used, so the document does not depend on one machine's drive
## layout.
##
## stringsAsFactors = TRUE turns every text column into a factor right here,
## before the data is split further down. The training and testing sets then
## share one set of levels per column, and the modelling functions receive
## categorical predictors as factors instead of raw character vectors.
cars2020 <- read.csv("cars2020.csv", stringsAsFactors = TRUE)

## attach() is intentionally not used: the calls in this document pass the data
## frame explicitly, and attached copies of the columns would go stale as soon
## as the data frame is modified (e.g. when a column is renamed).

## View() opens a data viewer, which needs an interactive session; it is
## skipped while the document is being knitted.
if (interactive()) {
  View(cars2020)
}

```

## Structure of the Data

```{r structure of data}

# cars2020: per the original author's note, the input data consists of
# 1,164 observations of 14 variables.

# First and last rows of the data
head(cars2020)

tail(cars2020)

# How many variables and observations are there?
ncol(cars2020)

nrow(cars2020)

# Learn more about the dataset.
# help() and ?? look up / search documentation, which only makes sense in an
# interactive session, so they are skipped while knitting. The results are
# printed explicitly because values are not auto-printed inside an `if` block.
# help.search("cars2020") is the function form of `??cars2020`.
if (interactive()) {
  print(help("cars2020"))
  print(help.search("cars2020"))
}
str(cars2020)
class(cars2020)
typeof(cars2020)
length(cars2020)
names(cars2020) # display variable names
# attributes(cars2020) # names(cars2020), class(cars2020), row.names(cars2020)
```

## Missing data

```{R}

# Flag the rows that contain at least one missing value, then list their
# positions (an empty result means every row is complete).
rows_with_na <- !complete.cases(cars2020)
which(rows_with_na)


```

## Descriptive Statistics

```{R}
library(knitr)
library(mosaic)
library(psych)


# Rename the first column to "car_make" by position, using base R.
names(cars2020)[1] <- "car_make" # rename a column by index with base R functions
# Alternative, renaming by the column's current name instead of its position:
#names(cars2020)[names(cars2020) == "ï..make"] <- "car_make"

# Summary statistics
summary(cars2020) ## summarizes every column of the dataset


# 1. Dollar sign syntax

# Frequency table of each categorical variable
table(cars2020$car_make) 
table(cars2020$transmission)
table(cars2020$drive)
table(cars2020$class)
table(cars2020$sidi)
table(cars2020$aspiration)
table(cars2020$fuelType1)
table(cars2020$atvType)
table(cars2020$startStop)


# Summary of each of these four variables
summary(cars2020$mpg)
summary(cars2020$gears)
summary(cars2020$displ)
summary(cars2020$cylinders)


# 2. Formula syntax

## One categorical variable: counts per level

tally(~car_make, data=cars2020)
tally(~transmission, data=cars2020)
tally(~drive, data=cars2020)
tally(~class, data=cars2020)
tally(~sidi, data=cars2020)
tally(~aspiration, data=cars2020)
tally(~fuelType1, data=cars2020)
tally(~atvType, data=cars2020)
tally(~startStop, data=cars2020)

## Two categorical variables: car_make cross-tabulated against a second variable

tally(car_make~transmission, data=cars2020)
tally(car_make~drive, data=cars2020)
tally(car_make~class, data=cars2020)
tally(car_make~sidi, data=cars2020)
tally(car_make~aspiration, data=cars2020)
tally(car_make~fuelType1, data=cars2020)
tally(car_make~startStop, data=cars2020)

# Bind three of the cross-tabulations side by side and print them as one table
# with a grouped header row.
# NOTE(review): the header spans (1 blank + 3 + 2 + 2) and the 7-character
# `align` string assume transmission has 3 levels and sidi / startStop have
# 2 levels each -- confirm against the data.
library(kableExtra)
kable(cbind(tally(car_make~transmission, data=cars2020), tally(car_make~sidi,
 data=cars2020), tally(car_make~startStop, data=cars2020)), align = "cccrrrr", 
  caption = "Group Rows")%>%
  add_header_above(c(" ", "Transmission" = 3, "Spark Ignited Direct Ignition" = 2,
                     "start-stop technology" = 2))

# Presumably the output formats kable() can produce:
# latex, html, markdown, pandoc, and rst


## One continuous variable

# Selected statistics only, then the full favstats() output
favstats(~mpg, data=cars2020)[c("max", "mean","sd", "n")]
favstats(~mpg, data=cars2020)

favstats(~gears, data=cars2020)
favstats(~displ, data=cars2020)
favstats(~cylinders, data=cars2020)

## One continuous variable by one categorical variable

favstats(mpg~ car_make, data=cars2020)
favstats(mpg~ transmission, data=cars2020)
favstats(mpg~ drive, data=cars2020)
favstats(mpg~ class, data=cars2020)
favstats(mpg~ aspiration, data=cars2020)
favstats(mpg~ fuelType1, data=cars2020)
favstats(mpg~ atvType, data=cars2020)

## One continuous variable by two categorical variables

favstats(mpg~ car_make+transmission, data=cars2020)
# Further combinations, left disabled by the original author:
#favstats(mpg~ car_make+aspiration, data=cars2020)
#favstats(mpg~ car_make+fuelType1, data=cars2020)


```

## Splitting the data for training and testing

```{r}
library(rsample)
# Split the data in an 80:20 ratio (training:testing).
#
# The outcome variable, mpg, is used to stratify the split. This is done to
# ensure that the distribution of the outcome is comparable in both data sets.
# The seed is fixed before the split is drawn.
#
# (initial_time_split() takes the first `prop` samples for training, instead of
# a random selection.)
set.seed(1729)
split <- initial_split(data = cars2020, strata = mpg, prop = 0.8)

test <- testing(split)
train <- training(split)


```

## Checking the distribution of mpg in the training and testing sets

```{r}
# Tag every row with the set it came from, then stack the two sets so both
# distributions can be drawn on one plot.
cars_recon <- bind_rows(
  train %>% mutate(Data = "Training"),
  test %>% mutate(Data = "Testing")
)

mpg_title <- "Comparing MPG distributions in train and test data sets"

# Filled density curves, one per set
ggplot(cars_recon, aes(x = mpg)) +
  geom_density(aes(fill = Data), alpha = 0.4) +
  labs(title = mpg_title)

# Outlined density curves, one per set
ggplot(cars_recon, aes(x = mpg)) +
  geom_density(aes(colour = Data), alpha = 0.4) +
  labs(title = mpg_title)
```

## Exploratory Data Analysis (EDA)

```{r}

library(dlookr)


# 1. Descriptive statistics for the numerical data.
#    The dlookr version of describe() is named explicitly; it is the one in
#    effect here because dlookr is the most recently attached package.
dlookr::describe(cars2020)

# 2. eda_report()
#
# eda_report() performs EDA on all variables of a data frame, or of an object
# that inherits from data frame (tbl_df, tbl, etc.).
#
# It creates the EDA report in one of two forms: a pdf file based on LaTeX, or
# an html file. Both calls are left disabled here.


##eda_report(cars2020, target = mpg , output_file = "EDACars2020.pdf") #pdf

##eda_report(cars2020, target=mpg, output_format="html", output_file="EDACars2020.html")


```

### EDA when the target variable is numerical

#### 1. Cases where the predictor is a numeric variable

```{r}

# Declare fuel efficiency (mpg) as the target variable.
num <- cars2020 %>% target_by(mpg)

# General relationship between the target, fuel efficiency (mpg), and the
# predictor, engine size (displacement). The result shown is a simple
# regression model of the target ~ predictor relation.
num_num <- num %>% relate(displ)
num_num

summary(num_num)

# Visualize the relationship between the target variable and the predictor.
plot(num_num)

# The relationship between 'mpg' and 'displ' is represented as scatter plots.
# The plot on the left shows the scatter plot of 'mpg' and 'displ' together
# with the regression line and its confidence interval.
#
# The plot on the right shows the relationship between the original data and
# the values predicted by the linear model. If there is a linear relationship
# between the two variables, the observations converge on the red diagonal.
#
# With a large number of observations the scatter plot is drawn as overlapping
# points, which makes the relationship hard to judge and slow to render. A
# hexabin plot solves this: in plot(), the hex_thres argument sets the number
# of observations above which a hexabin plot is drawn instead.


ggplot(train, aes(x = displ, y = mpg)) +
  geom_point(alpha = 0.25) +
  geom_smooth() +
  labs(
    x = "Engine displace (L)",
    y = "Miles per gallon",
    title = "Fuel-efficiency vs Engine Size (displacement)"
  )

```

#### 2. Cases where the predictor is a categorical variable

```{r}


# Difference in the distribution of fuel efficiency for each transmission type.
# The result shown is a one-way ANOVA of the target ~ predictor relation.

# Store the predictor as a factor before relating it to the target.
cars2020[["transmission"]] <- factor(cars2020[["transmission"]])
num1 <- cars2020 %>% target_by(mpg)

num_cat <- num1 %>% relate(transmission)
num_cat

summary(num_cat)

plot(num_cat)

# Troubleshooting note: the error "object of type 'closure' is not subsettable"
# happens when a function is treated like a list, vector, or data frame.
# To fix it, treat the function like a function.


ggplot(train, aes(x = transmission, y = mpg)) +
  geom_boxplot() +
  labs(
    x = "Transmission Type",
    y = "Miles per gallon",
    title = "Transmission type and fuel efficiency of 2020 cars"
  )


```

### EDA when the target variable is categorical

#### 1. Cases where the predictor is a numeric variable
```{r}

# Declare transmission as the target variable.
categ <- cars2020 %>% target_by(transmission)

# Descriptive statistics of mpg are shown for each level of the target.
cat_num <- categ %>% relate(mpg)
cat_num

summary(cat_num)

plot(cat_num)

ggplot(train, aes(x = mpg)) +
  geom_density(aes(fill = transmission), alpha = 0.4) +
  labs(title = "Comparing Transmission type distributions and MPG")
```

#### 2. Cases where the predictor is a categorical variable

```{r}
# Store the predictor as a factor, then declare transmission as the target.
cars2020[["drive"]] <- factor(cars2020[["drive"]])
categ1 <- cars2020 %>% target_by(transmission)

# The contingency table of the two variables is shown, and an independence
# test is performed on that table.
cat_cat <- categ1 %>% relate(drive)
cat_cat

summary(cat_cat)

plot(cat_cat)


# Visualize the relationship between the target variable and the predictor
# with mosaic plots.

library(ggmosaic)

# Transmission within drivetrain, filled by transmission
ggplot(train) +
  geom_mosaic(aes(x = product(transmission, drive), fill = transmission)) +
  labs(
    title = "Transmission type by drivetrain - 1 ~ W(fill=Y) + Y + X",
    subtitle = "f(Transmission type , drivetrain)",
    x = "Drivetrain",
    y = "Transmission Type"
  )

# Drivetrain only, filled by drivetrain
ggplot(train) +
  geom_mosaic(aes(x = product(drive), fill = drive)) +
  labs(
    title = "drivetrain - 1 ~ W(fill=X) + X",
    subtitle = "f(drivetrain)",
    x = "Drivetrain"
  )

# Drivetrain only, filled by transmission
ggplot(train) +
  geom_mosaic(aes(x = product(drive), fill = transmission)) +
  labs(
    title = "drivetrain - 1 ~ W(fill=Y) + X",
    subtitle = "f(drivetrain)",
    x = "Drivetrain"
  )

# Transmission within drivetrain, conditioned on sidi: one facet row per sidi
# value, with the axes flipped
ggplot(train) +
  geom_mosaic(aes(
    x = product(transmission, drive),
    fill = transmission,
    conds = product(sidi)
  )) +
  labs(
    title = "Transmission type by drivetrain - 1 ~ W(fill=Y) + Y|Z + X",
    subtitle = "f(Transmission type|sidi , drivetrain)",
    x = "y",
    y = "drivetrain"
  ) +
  facet_grid(sidi ~ .) +
  coord_flip()

```

### EDA: MPG, displacement and transmission type
```{r}

# How the scatter plot of fuel efficiency (mpg) vs engine size (displacement)
# differs across transmission types: one panel per transmission type.

ggplot(train, aes(x = displ, y = mpg)) +
  geom_point(alpha = 0.25) +
  geom_smooth() +
  facet_wrap(~transmission) +
  labs(
    x = "Engine displace (L)",
    y = "Miles per gallon",
    title = "Fuel-efficiency vs Engine Size (displacement)"
  )

```


## Finding a fitting distribution for the mpg variable

```{r}

# Compare mpg against several candidate distributions with Q-Q plots.
# Every call below uses mpg + 1 rather than mpg; per the original author's
# note, the shift is there so that distributions which must be non-zero can
# make sense of the data.
library(car) # qqp()
library(MASS) # fitdistr()

qqp(cars2020$mpg+1, "norm", main="Q-Q Plot ~ mpg+1 Normal model")

qqp(cars2020$mpg+1, "lnorm", main="Q-Q Plot ~ mpg+1 LogNormal model") # "lnorm" is the lognormal

qqp(cars2020$mpg+1, "exp", main="Q-Q Plot ~ mpg+1 Exponential model")


# qqp() requires estimates of the parameters of the negative binomial, Poisson
# and gamma distributions. These estimates can be generated with fitdistr().

# Negative binomial and gamma distributions can only handle positive numbers.

# The Poisson distribution can only handle positive whole numbers.

# Binomial and Poisson distributions differ from the others because they are
# discrete rather than continuous, which means they quantify distinct,
# countable events or the probability of these events.

# Poisson: estimate lambda, then pass it on to the Q-Q plot.
pois <- fitdistr(cars2020$mpg+1, "Poisson")
qqp(cars2020$mpg+1, "pois", lambda=pois$estimate, main="Q-Q Plot ~ mpg+1 Poisson model")


# Gamma: fitted from the starting values shape = 1, rate = 0.1 with a lower
# bound of 0.4; the first and second estimates are used as shape and rate.
# NOTE(review): the result is stored under the same name as the base function
# gamma(); a more specific name would avoid confusion.
gamma <- fitdistr(cars2020$mpg+1, "gamma",
list(shape = 1, rate = 0.1), lower = 0.4)
qqp(cars2020$mpg+1, "gamma", shape = gamma$estimate[[1]], rate =
gamma$estimate[[2]], main="Q-Q Plot ~ mpg+1 Gamma model")


# Weibull: the first and second estimates are used as shape and scale.
weibull <- fitdistr(cars2020$mpg+1, "weibull")
qqp(cars2020$mpg+1, "weibull", shape = weibull$estimate[[1]],
scale=weibull$estimate[[2]], main="Q-Q Plot ~ mpg+1 Weibull model")


```

## Creating a linear model

### 1. Ordinary Least Squares


```{r}
# Leave the two identifier columns out of the modelling data.
id_cols <- c("car_make", "model")
is_predictor_or_outcome <- !(names(train) %in% id_cols)
train1 <- train[, is_predictor_or_outcome]


# Fit a linear model by ordinary least squares (OLS) with lm(), using every
# remaining column as a predictor of mpg.
ols_model <- lm(formula = mpg ~ ., data = train1)
ols_model

summary(ols_model)

# Several diagnostic graphs for the fitted model
plot(ols_model)

```

### 2. Alternate Ordinary Least Squares
```{r}

# This alternative model differs from the previous one only in its outcome,
# log(mpg) instead of mpg. The choice is suggested by the lognormal nature of
# the distribution of mpg.


ols_log_model <- lm(formula = log(mpg) ~ ., data = train1)
summary(ols_log_model)
```

## Decision Tree models
```{r}

# Fit decision tree models. Each tree is formulated with the same outcome as
# one of the OLS models: mpg and log(mpg).

library(rpart)
library(rpart.plot)

# Tree for mpg
dt_model <- rpart(formula = mpg ~ ., data = train1)
rpart.plot(dt_model)


# Tree for log(mpg)
dt_log_model <- rpart(formula = log(mpg) ~ ., data = train1)
rpart.plot(dt_log_model)

# Note: the values displayed at the nodes of this second tree represent the
# predicted log of the mpg variable. Use exp() to convert them to
# interpretable fuel-efficiency values.


```

## Random Forest models

```{r}
# Two random forest models are fitted, one for each of the outcomes:
# mpg and log(mpg).

library(randomForest)
# A seed is set immediately before each fit.
set.seed(2001)
rf_model <- randomForest(mpg~., data = train1)
set.seed(99)
rf_log_model <- randomForest(log(mpg)~., data = train1) 
```

## Performance of the Models on Training Data

### Collating model estimates

```{r}
# Add the estimates of the 6 models to a data frame that also contains the
# actual mpg values and the model predictors. Estimates from the log(mpg)
# models are passed through exp() so that every column is in mpg units.

train_results <- train1 %>%
  mutate(
    ols = predict(ols_model, newdata = train1),
    ols_log = exp(predict(ols_log_model, newdata = train1)),
    dt = predict(dt_model, newdata = train1),
    dt_log = exp(predict(dt_log_model, newdata = train1)),
    rf = predict(rf_model, newdata = train1),
    rf_log = exp(predict(rf_log_model, newdata = train1))
  )

```

### Visualizing model performance

```{r}
# Visualize model performance by graphing the model estimates against the
# actual values of the fuel efficiency.

# Reshape the data to long form (one row per observation and model) so that it
# can be used easily with 'ggplot2'.


library(tidyr)
train_results_long <- train_results %>%
  pivot_longer(cols = ols:rf_log, names_to = "method", values_to = "estimate")

#train_results_long1 <- train_results_long[,c('mpg', 'method','estimate')]

head(train_results_long)
tail(train_results_long)

# Model estimates vs the actual MPG values, one panel per model

ggplot(train_results_long, aes(x = mpg, y = estimate)) +
  geom_point(shape = 21, colour = "blue") +
  geom_abline(intercept = 0, slope = 1) +
  facet_wrap(~method, ncol = 2) +
  xlim(0, 60) +
  ylim(0, 60) +
  theme_minimal()


# Points on the diagonal line correspond to cars for which the model estimate
# and the actual mpg value are very close. Points above or below the line
# correspond to cars for which the model overestimates or underestimates the
# mpg, respectively.


# Inspecting these plots visually, it seems that the random forest models fit
# more closely than the OLS models, which in turn fit more closely than the
# decision tree models.
```

### Getting model metrics

```{r}

# yardstick::metrics() can be used to get summary statistics for a model.
# It requires the model estimates to be available as a column of a data frame
# that also contains a column of the corresponding truth values.


# The three reported metrics are:
# 1. Root mean squared error
# 2. R2
# 3. Mean absolute error

library(yardstick)

# OLS on mpg
metric_ols <- train_results %>% metrics(truth = mpg, estimate = ols)
metric_ols

# OLS on log(mpg)
metric_ols_log <- train_results %>% metrics(truth = mpg, estimate = ols_log)
metric_ols_log

# Decision tree on mpg
metric_dt <- train_results %>% metrics(truth = mpg, estimate = dt)
metric_dt

# Decision tree on log(mpg)
metric_dt_log <- train_results %>% metrics(truth = mpg, estimate = dt_log)
metric_dt_log

# Random forest on mpg
metric_rf <- train_results %>% metrics(truth = mpg, estimate = rf)
metric_rf

# Random forest on log(mpg)
metric_rf_log <- train_results %>% metrics(truth = mpg, estimate = rf_log)
metric_rf_log

```

### Comparing models using the metrics (RMSE, R2, MAE)


```{r}

# Compute the yardstick::metrics() summary for one model's estimates and label
# the resulting rows with the model's name.
#
# Arguments:
#   fit_name  string; name of the column of `in_df` holding the model estimates
#   in_df     data frame containing both the estimate and the truth column
#   truth     string; name of the column of `in_df` holding the true values
#
# Returns the data frame produced by yardstick::metrics(), with an extra `fit`
# column set to `fit_name`.
metrics2 <- function(fit_name, in_df, truth) {
  # yardstick::metrics() takes (data, truth, estimate). Naming the arguments
  # keeps the truth and estimate columns from being swapped, which a purely
  # positional call in (fit_name, truth) order would do.
  # The column names arrive as strings, so each one is turned into a symbol and
  # unquoted with `!!`; the selection then always refers to the named column
  # and never to a column of `in_df` that happens to be called "truth" or
  # "fit_name".
  out_df <- yardstick::metrics(
    in_df,
    truth = !!as.name(truth),
    estimate = !!as.name(fit_name)
  )
  out_df$fit <- fit_name
  out_df
}

# Collect one metric for several models into a single comparison table.
#
# Arguments:
#   in_df      data frame holding the truth column and one estimate column per
#              model
#   fit_names  character vector with the names of the estimate columns
#   in_truth   string; name of the truth column
#   metric     string; the value of `.metric` to keep (this file uses "rmse",
#              "rsq" and "mae")
#   prefix     string pasted in front of `metric` to name the value column,
#              e.g. prefix = "train" and metric = "rmse" give "train rmse"
#
# Returns one row per model, sorted by `fit`, with the metric values in a
# column named paste(prefix, metric).
comp_models <- function(in_df, fit_names, in_truth, metric, prefix = "") {
  # One block of metric rows per model, stacked into a single data frame.
  all_metrics <- purrr::map_df(
    fit_names, metrics2,
    in_df = in_df, truth = in_truth
  )

  out_df <- all_metrics %>%
    dplyr::filter(.metric == metric) %>%
    dplyr::arrange(fit, .estimate)

  # Rename the value column by name rather than by position: position 2 of the
  # metrics output is `.estimator`, not `.estimate`, so a positional rename
  # would label the wrong column and leave the values unlabelled.
  names(out_df)[names(out_df) == ".estimate"] <- paste(prefix, metric)
  out_df
}

# Use comp_models() to get each metric for all of the models fitted on the
# training data.

model_names <- c("ols", "ols_log", "dt", "dt_log", "rf", "rf_log")

# Root mean squared error
train_rmse <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rmse",
  prefix = "train"
)

train_rmse

# R squared
train_rsq <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rsq",
  prefix = "train"
)

train_rsq


# Mean absolute error
train_mae <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "mae",
  prefix = "train"
)

train_mae


```

## Performance of the Models on Testing data

### Collating model estimates

```{r}

# See how the models do on the unseen testing data: augment the testing data
# frame with the estimates of each of the 6 models, in the same way as was done
# for the training data. Estimates from the log(mpg) models are passed through
# exp() so that every column is in mpg units.

test_results <- test %>%
  mutate(
    ols = predict(ols_model, newdata = test),
    ols_log = exp(predict(ols_log_model, newdata = test)),
    dt = predict(dt_model, newdata = test),
    dt_log = exp(predict(dt_log_model, newdata = test)),
    rf = predict(rf_model, newdata = test),
    rf_log = exp(predict(rf_log_model, newdata = test))
  )

```

### Visualizing model performance

```{r}

# Long form: one row per observation and model
test_results_long <- test_results %>%
  pivot_longer(cols = ols:rf_log, names_to = "method", values_to = "estimate")

# Model estimates vs the actual MPG values, one panel per model

ggplot(test_results_long, aes(x = mpg, y = estimate)) +
  geom_point(shape = 21, colour = "red") +
  geom_abline(intercept = 0, slope = 1) +
  facet_wrap(~method, ncol = 2) +
  xlim(0, 60) +
  ylim(0, 60) +
  theme_minimal()

```

### Comparing models using the metrics (RMSE, R2, MAE)

```{r}
# Root mean squared error on the testing data
test_rmse <- comp_models(
  in_df = test_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rmse",
  prefix = "test"
)

test_rmse

# R squared on the testing data
test_rsq <- comp_models(
  in_df = test_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rsq",
  prefix = "test"
)

test_rsq


# Mean absolute error on the testing data
test_mae <- comp_models(
  in_df = test_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "mae",
  prefix = "test"
)

test_mae

# Put the training and testing tables side by side, matched on the model name,
# to see how the metric values changed collectively when moving the models
# from the training data to the testing data.

train_rmse %>% inner_join(test_rmse, by = "fit")
train_rsq %>% inner_join(test_rsq, by = "fit")
train_mae %>% inner_join(test_mae, by = "fit")

```


## Residuals of the Models

### Training data

```{r}
# Add the residuals of the 6 models to the data frame that also contains the
# actual mpg values and the model predictors. Each residual is computed as the
# model estimate minus the actual mpg and stored as <model>_residual.

train_residuals <- train_results %>%
  mutate(across(ols:rf_log, ~ .x - mpg, .names = "{.col}_residual"))


# Long form: one row per observation and model
train_residuals_long <- train_residuals %>%
  pivot_longer(
    cols = ols_residual:rf_log_residual,
    names_to = "method1",
    values_to = "residual"
  )


# Model residuals vs the actual MPG values, one panel per model, with a dashed
# reference line at zero

ggplot(train_residuals_long, aes(x = mpg, y = residual)) +
  geom_point(shape = 21, colour = "green") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(~method1, ncol = 2) +
  xlim(0, 60) +
  ylim(-30, 25) +
  theme_minimal()


```


### Testing data

```{r}

# Add the residuals of the 6 models to the data frame that also contains the
# actual mpg values and the model predictors. Each residual is computed as the
# model estimate minus the actual mpg and stored as <model>_residual.

test_residuals <- test_results %>%
  mutate(across(ols:rf_log, ~ .x - mpg, .names = "{.col}_residual"))


# Long form: one row per observation and model
test_residuals_long <- test_residuals %>%
  pivot_longer(
    cols = ols_residual:rf_log_residual,
    names_to = "method1",
    values_to = "residual"
  )


# Model residuals vs the actual MPG values, one panel per model, with a dashed
# reference line at zero

ggplot(test_residuals_long, aes(x = mpg, y = residual)) +
  geom_point(shape = 21, colour = "orange") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(~method1, ncol = 2) +
  xlim(0, 60) +
  ylim(-30, 25) +
  theme_minimal()

```