Merge pull request #72 from rodekruis/fix/people-affected
Fix people affected
aklilu authored Sep 28, 2021
2 parents 28d89d9 + 4d41f9c commit cb645f1
Showing 1 changed file: analysis/people_affected/people_affected.md (52 additions, 60 deletions).

[...]

```python
from pathlib import Path  # needed for INPUT_DIR below; may sit in the collapsed lines above
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import curve_fit
import matplotlib as mpl
from statsmodels.stats import diagnostic

INPUT_DIR = Path("../../data/")

INPUT_DIR = Path("../../IBF-Typhoon-model/data")
OUTPUT_DIR = Path("output/")

DROMIC_REPORT_FILENAME = "2014 - 20 DROMIC Reports_PeopleAffected_DamagedHouses_Consolidated.xlsx"
[...]

```python
df_combined = df_combined[~df_combined['location'].isin(total_cnames)]
```

```python
def fit_line(x, y):

    def linear_function(x, m, b):
        return m * x + b

    results = curve_fit(linear_function, x, y)
    m_fit, b_fit = results[0][0], results[0][1]
    y_fit = m_fit * x + b_fit

    print(f'Best fit slope & y-int: {m_fit:.1f}, {b_fit:.1f}')

    # Report correlation results
    for stat in ['pearsonr', 'spearmanr']:
        print(f'\nResults for {stat}')
        result = getattr(stats, stat)(x, y)
        print(f'Correlation: {result[0]:.2f}, p-value: {result[1]:.2e}')

    # Analyze residuals
    res = y - y_fit

    #fig, ax = plt.subplots()
    #ax.hist(res, bins=100)
    #ax.set_xlabel('Residuals')

    # Test the residuals for normality
    print('\nResult for shapiro')
    result = stats.shapiro(res)
    print(f'Statistic: {result[0]:.2f}, p-value: {result[1]:.2e}')

    # Test the residuals for heteroscedasticity
    for stat in ['het_white', 'het_breuschpagan']:
        print(f'\nResult for {stat}')
        result = getattr(diagnostic, stat)(res, np.array([np.ones(len(x)), x]).T)
        print(result)

    return m_fit, b_fit
```

```python

# Limit events to where there are at least 100 damaged houses / people affected
# (in reality setting this to 0 has little effect, as the fit is dominated by
# the large numbers)
houses_thresh = 100
people_thresh = 100
df_to_fit = df_combined[(df_combined['npeople'] >= people_thresh) &
                        (df_combined['nhouses'] >= houses_thresh)]

x = df_to_fit['nhouses']
y = df_to_fit['npeople']

print('For linear fit:')
results_linear = fit_line(x, y)
```
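
`fit_line` returns only the best-fit slope and intercept, but `curve_fit` also provides a covariance matrix for the fitted parameters. A minimal sketch of how the uncertainties behind the rounding below would be obtained with a direct call (same model, written inline):

```python
# Sketch: standard errors on slope and intercept from curve_fit's
# covariance matrix (fit_line discards this output).
popt, pcov = curve_fit(lambda x, m, b: m * x + b, x, y)
m_err, b_err = np.sqrt(np.diag(pcov))
print(f'slope = {popt[0]:.1f} +/- {m_err:.1f}')
print(f'y-int = {popt[1]:.0f} +/- {b_err:.0f}')
```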

The best-fit line has a slope of ~25 (rounded, given the uncertainties), which means that
for each severely damaged house there are about 25 people affected.
Even though the average household size is around 5, the higher number can be explained because we
expect that, for each severely damaged house, more people will be affected than
just those residing in the building itself.
Both Pearson's and Spearman's show a moderate correlation (>0.5) and reject the null hypothesis that there is no correlation.

With Shapiro we do reject the null hypothesis that the residuals are normally distributed.

For the last two tests, the null hypothesis that homoscedasticity is present is rejected, so we find heteroscedasticity. It might therefore be worthwhile fitting an exponential.
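
Since `fit_line` only prints the raw tuples from the statsmodels diagnostics, here is a short sketch of unpacking one of them outside the function; both `het_white` and `het_breuschpagan` return (LM statistic, LM p-value, F statistic, F p-value):

```python
# Sketch: recompute the linear-fit residuals from the returned
# coefficients and unpack the White test result.
m_fit, b_fit = results_linear
res = y - (m_fit * x + b_fit)
exog = np.array([np.ones(len(x)), x]).T  # constant + regressor
lm_stat, lm_pvalue, f_stat, f_pvalue = diagnostic.het_white(res, exog)
print(f'White test LM p-value: {lm_pvalue:.2e}')  # small value => heteroscedasticity
```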

```python
# Try again but for log-log
results_log = fit_line(np.log(x), np.log(y))
```

Pearson's correlation is slightly better, also rejecting the null hypothesis of no correlation. Spearman's is unchanged, as it should be, since it depends only on the ranks.

As previously, with Shapiro we do reject the null hypothesis that the residuals are normally distributed.

However, we are not able to reject the null hypothesis of homoscedasticity.

Thus an exponential of the form:
```
n_people = e ^ 5.7 x n_houses ^ 0.7
```
is probably a better fit, but maybe less practical for interpretation.
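
A minimal sketch of how the coefficients returned by the log-log `fit_line` call map onto that power law (the 1,000-house input is just an illustrative value):

```python
# The log-log fit gives log(n_people) = m * log(n_houses) + b,
# i.e. n_people = e^b * n_houses^m.
m_log, b_log = results_log
n_houses = 1000  # illustrative input
n_people = np.exp(b_log) * n_houses ** m_log
print(f'{n_houses} damaged houses -> ~{n_people:,.0f} people affected')
```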

```python
# Plot the data and best-fit line

x_line = np.linspace(100, x.max(), 100)
y_linear = results_linear[0] * x_line + results_linear[1]
y_log = np.exp(results_log[1]) * x_line ** results_log[0]

def plot_people_vs_houses(df, log=False, xlims=None, ylims=None):
    x = df['nhouses']
    y = df['npeople']
    fig, ax = plt.subplots()
    # ... (some lines collapsed in the diff view)
    ax.plot(x, y, '.', alpha=0.3, c='Grey')
    x_line = np.linspace(100, x.max(), 100)
    ax.plot(x_line, y_linear, label='linear fit')
    ax.plot(x_line, y_log, label='exponential fit')
    if log:
        ax.set_xscale('log')
    # ... (rest of the function and its invocation collapsed in the diff view)
```

Some things to note:

[...]

```python
# Plot the scaling factor -- i.e. the number you need to multiply the number of
# totally damaged houses to get the people affected

y1 = results_linear[0] * x_line + results_linear[1]
y3 = np.exp(results_log[1]) * x_line ** results_log[0]
fig, ax = plt.subplots()
ax.plot(x_line, y_linear / x_line, label='linear fit')
ax.plot(x_line, y_log / x_line, label='exponential fit')
ax.set_ylim(0, 50)
ax.legend()
# ...
ax.grid(which='minor')
```
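
As a spot check on the plot, a sketch of the implied multiplier at two damage scales, reusing the fit expressions above (the printed values depend on the actual fit):

```python
# Sketch: implied people-per-house multiplier at two damage scales.
for n in (100, 10_000):
    linear = (results_linear[0] * n + results_linear[1]) / n
    power = np.exp(results_log[1]) * n ** results_log[0] / n
    print(f'{n} houses: linear ~{linear:.0f}x, exponential ~{power:.0f}x')
```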

## Summary

We perform a linear fit and obtain a slope of ~25, meaning that for a single totally damaged
house we expect about 25 affected people. This is more than a typical household size of 5;
however, this is not unexpected, since by the time houses are severely damaged, we expect
more people than just the residents of these houses in the nearby community to be affected.
[...]

```python
ratio = df_combined['npeople'] / df_combined['nfam']
ratio = ratio[ratio < 100]
plt.hist(ratio, bins=np.arange(0.0, 10.0, 0.1))
```
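
To pull a headline number out of that distribution rather than eyeballing the histogram, a one-line sketch reusing the same `ratio` series:

```python
# Median is robust to the long tail clipped above.
print(f'Median people per family: {ratio.median():.1f}')
```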
