AKS_ICP_HW2.Rmd

---
title: "International Climate Policy - Homework 2\nWorking document (R Markdown)"
author: "Amartya Kumar Sinha"
date: "2024-02-01"
output:
  html_document: default
  word_document: default
  pdf_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
setwd("C:/Users/Amartya Kumar Sinha/OneDrive - The University of Chicago/IntlClimatePolicy_PPHA39930/Assignments/Assn2_IntlClimatePolicy") 
library(dplyr)
library(ggplot2)
library (lfe)
```

```{r}
rep_nomiss <- read.csv("icp_indiv_2_dg2011_rep_nomiss.csv")
ave_temp <- read.csv("icp_indiv_2_county_avetemp.csv")
```

# Question 1
### Taking population weighted average across all temperature variables for the entire country

```{r}
# Creating a new dataframe where each row is a temperature range and its population-weighted average
temp_vars <- grep("tday", names(rep_nomiss), value = TRUE)

pop_weighted_avg <- sapply(temp_vars, function(x) weighted.mean(rep_nomiss[[x]], rep_nomiss$population, na.rm = TRUE))
```

# 1. a) 
### Plotting histogram as required

```{r}
labels <- c("<10", "10–20", "20–30", "30–40", "40–50", "50–60", "60–70", "70–80", "80–90", ">90")

barplot(pop_weighted_avg, 
        main = "Population-Weighted Average Number of Days in \nEach Temperature Range (1968-2002 average)", 
        ylab = "", 
        xlab = "Annual distribution of daily mean temperatures (F)", 
        ylim = c(0, 80), 
        names.arg = labels, 
        las = 2, col = "lightyellow")

abline(h = seq(0, max(pop_weighted_avg), by = 10), col = "gray", lty = "dotted")
```
# 1. b) and c)
### Answering questions as required.
```{r}
# Calculating population-weighted average number of days above 90◦ F per year across the US
pop_weighted_avg_above_90 <- weighted.mean(rep_nomiss$tday_gt90, rep_nomiss$population, na.rm = TRUE)

# Finding county with the highest number of days above 90◦ F per year
highest_days_above_90 <- rep_nomiss[which.max(rep_nomiss$tday_gt90), "county"]
highest_days_above_90_count <- max(rep_nomiss$tday_gt90, na.rm = TRUE)

# Calculating the average number of days above 90◦F per year for each county over the sample period (1968-2002)
avg_days_above_90 <- aggregate(tday_gt90 ~ county, rep_nomiss, mean, na.rm = TRUE)

# Finding number of counties that have, on average over the sample period (1968-2002), experienced zero days above 90◦F per year
num_counties_zero_days_above_90 <- sum(avg_days_above_90$tday_gt90 == 0)

# Calculating the total number of counties
total_counties <- length(unique(rep_nomiss$county))

# Calculating the percentage of counties that have, on average over the sample period (1968-2002), experienced zero days above 90◦F per year
percentage_zero_days_above_90 <- (num_counties_zero_days_above_90 / total_counties) * 100

print(paste("The population-weighted average number of days above 90◦ F per year across the US is", round(pop_weighted_avg_above_90, 2)))
print(paste("The county with the highest number of days above 90◦ F per year is", highest_days_above_90, "with", round(highest_days_above_90_count, 2), "days"))
print(paste("The number of counties that have, on average over the sample period (1968-2002), experienced zero days above 90◦F per year is", num_counties_zero_days_above_90, "out of a total of", total_counties, "counties in the dataset, or about", round(percentage_zero_days_above_90, 2), "%"))
```
# Question 2
# 2. a)

```{r}
print(paste("The national average over-65 mortality rate is", round(mean(rep_nomiss$cruderate, na.rm = TRUE), 2), "deaths per 100,000 population"))
print(paste("The total number of deaths from 1968-2002 is", sum(rep_nomiss$deaths, na.rm = TRUE)))
```
# Question 3
Merging given datasets and creating variables as instructed
```{r}
# Merging dataframes on 'countycode'
merged_df <- merge(rep_nomiss, ave_temp, by = "countycode")

# Calculating sum of all days over 70◦ F and 80◦ F for each county
merged_df$hotdays <- rowSums(merged_df[,grep("tday_70_80|tday_80_90|tday_gt90", names(merged_df))], na.rm = TRUE)
merged_df$hotterdays <- rowSums(merged_df[,grep("tday_80_90|tday_gt90", names(merged_df))], na.rm = TRUE)

# Calculating  average mortality rate, hotdays, and hotterdays for each county
avg_by_county <- aggregate(cbind(cruderate, hotdays, hotterdays) ~ countycode, merged_df, mean, na.rm = TRUE)

# Checking first few rows of the new dataframe
print(head(avg_by_county))

```
# 3. a)
### Plotting figure as required
```{r}
avg_by_county <- merge(avg_by_county, ave_temp, by = "countycode")

model <- lm(cruderate ~ normal_1981_2010, data = avg_by_county)

slope <- coef(model)[2]

ggplot(avg_by_county, aes(x = normal_1981_2010, y = cruderate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Relationship between average county temperatures and over-65 mortality rates") +
  xlab("Average county temperatures (1981-2010)") +
  ylab("Over 65 Mortality Rates") +
  annotate("text", x = min(avg_by_county$normal_1981_2010), y = max(avg_by_county$cruderate), label = paste("Slope =", round(slope, 2)), hjust = 0, vjust = 1)
```
# 3. b)
### Plotting graphs as instructed: hot days first, and then hotter days
```{r}
model_hotdays <- lm(cruderate ~ hotdays, data = avg_by_county)

slope_hotdays <- coef(model_hotdays)[2]

rsq <- summary(model_hotdays)$r.squared

ggplot(avg_by_county, aes(x = hotdays, y = cruderate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Relationship between hot days and over-65 mortality rates") +
  xlab("Hot Days") +
  ylab("Over-65 Mortality Rates") +
  annotate("text", x = min(avg_by_county$hotdays), y = max(avg_by_county$cruderate), label = paste("Slope =", round(slope_hotdays, 2)), hjust = 0, vjust = 3) +
  annotate("text", x = min(avg_by_county$hotdays), y = max(avg_by_county$cruderate) - 10, label = paste("R-squared =", round(rsq, 2)), hjust = 0, vjust = 1)
```

```{r}
model_hotterdays <- lm(cruderate ~ hotterdays, data = avg_by_county)

slope_hotterdays <- coef(model_hotterdays)[2]

rsq <- summary(model_hotterdays)$r.squared

ggplot(avg_by_county, aes(x = hotterdays, y = cruderate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Relationship between hotter days and over-65 mortality rates") +
  xlab("Hotter Days") +
  ylab("Over-65 Mortality Rates") +
  annotate("text", x = min(avg_by_county$hotterdays), y = max(avg_by_county$cruderate), label = paste("Slope =", round(slope_hotterdays, 2)), hjust = 0, vjust = 3) +
  annotate("text", x = min(avg_by_county$hotterdays), y = max(avg_by_county$cruderate) - 10, label = paste("R-squared =", round(rsq, 2)), hjust = 0, vjust = 1)
```
# Question 4
### Data reload not needed since the dataframe "merged_df" exists consisting of the original two datasets merged together and the columns for hot days and hotter days are appended to it
```{r}
selected_counties <- c("Mobile County, AL", "Cook County, IL", "Los Angeles County, CA", "Miami-Dade County, FL")

# Creating new dataframe with selected counties
subset_q4 <- merged_df[merged_df$county.x %in% selected_counties, ]

# Inspecting subsetted dataframe
print(head(subset_q4))
```
# 4. a)
```{r}
model_hotterdays <- lm(cruderate ~ hotterdays, data = subset_q4)

slope_hotterdays <- coef(model_hotterdays)[2]

ggplot(subset_q4, aes(x = hotterdays, y = cruderate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Relationship between hotter days and over 65 mortality rates") +
  xlab("Hotter Days") +
  ylab("Over 65 Mortality Rates") +
  annotate("text", x = min(subset_q4$hotterdays), y = max(subset_q4$cruderate), label = paste("Slope =", round(slope_hotterdays, 2)), hjust = 0, vjust = 1)
```
# 4. b)
```{r}
ggplot(subset_q4, aes(x = hotterdays, y = cruderate, color = county.x)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("County-based relationship between hotter days and over 65 mortality") +
  xlab("Hotter Days") +
  ylab("Over-65 Mortality Rates") +
  theme(legend.position = "bottom")
```
# Question 5
```{r}
dg2011 <- felm (cruderate 
                ~ tday_lt10 + tday_10_20 + tday_20_30 + tday_30_40 + tday_40_50 
                + tday_50_60 + tday_70_80 + tday_80_90 + tday_gt90
                + prec_10_15 + prec_15_20 + prec_20_25 + prec_25_30 + prec_30_35
                + prec_35_40 + prec_40_45 + prec_45_50 + prec_50_55 + prec_55_60
                + prec_gt60
                | countycode + ssyy | 0 | countycode,
                data = merged_df, weight = merged_df$population)
summary(dg2011)
```
# 5. a)
```{r}
# Creating a dataframe with order of temperature bins, with the value of zero for the temperature bin of 60-70F
tempbins <- data.frame(
  temperature_bins = factor(c("<10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", ">90"),
                            levels = c("<10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", ">90")),
  estimates = c(3.7239, 2.6755, 3.5811, 1.8146, 1.4530, 0.2559, 0, 0.8364, 1.3660, 5.3466),
  std_errors = c(1.3839, 1.1061, 0.8509, 0.6684, 0.5699, 0.3271, 0, 0.6805, 0.8861, 1.3828)
)

ggplot(tempbins, aes(x = temperature_bins, y = estimates)) +
  geom_point() +
  geom_errorbar(aes(ymin = estimates - 2*std_errors, ymax = estimates + 2*std_errors), width = 0.2) +
  geom_line(aes(group = 1)) +
  geom_text(aes(label = estimates), vjust = -1.5, hjust = 0.5) +
  labs(x = "Temperature bins (F)",
       y = "Over-65 mortality rate",
       title = "Estimated impact of a day in 9 daily mean temperature (F) bins on\nannual over-65 mortality rate, relative to a day in the 60-70F bin\n(including +2 and -2 standard errors)")
```