Skip to content

Commit

Permalink
Merge pull request #183 from knaaptime/optim
Browse files Browse the repository at this point in the history
performance enhancements
  • Loading branch information
knaaptime authored Aug 9, 2021
2 parents 33dc101 + 28d47f9 commit d5d9648
Show file tree
Hide file tree
Showing 11 changed files with 90 additions and 42 deletions.
2 changes: 2 additions & 0 deletions .ci/37.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies:
- seaborn
- tqdm
- urbanaccess
- rvlib
- numba
# testing, etc
- codecov
- pytest
Expand Down
2 changes: 2 additions & 0 deletions .ci/38.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies:
- seaborn
- tqdm
- urbanaccess
- rvlib
- numba
# testing, etc
- codecov
- pytest
Expand Down
2 changes: 2 additions & 0 deletions .ci/39.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ dependencies:
- seaborn
- tqdm
- urbanaccess
- numba
# testing, etc
- codecov
- pytest
- pytest-mpl
- pytest-cov
- twine
- rvlib
# docs
- ipywidgets
- nbsphinx
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ dependencies:
- seaborn
- tqdm
- urbanaccess
- rvlib
- numba
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ scipy
seaborn
tqdm
urbanaccess
rvlib
numba
3 changes: 2 additions & 1 deletion segregation/singlegroup/density_corrected_dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import geopandas as gpd
import numpy as np
import pandas as pd
from rvlib import Normal
from scipy.optimize import minimize
from scipy.stats import norm

from .._base import SingleGroupIndex, SpatialImplicitIndex

Expand Down Expand Up @@ -57,6 +57,7 @@ def _density_corrected_dissim(data, group_pop_var, total_pop_var, xtol=1e-5):
# Constructing function that returns $n(\hat{\theta}_j)$
def return_optimal_theta(theta_j):
def fold_norm(x):
norm = Normal(0, 1)
y = (-1) * (norm.pdf(x - theta_j) + norm.pdf(x + theta_j))
return y

Expand Down
2 changes: 0 additions & 2 deletions segregation/singlegroup/gini.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import geopandas as gpd
import numpy as np
import pandas as pd

from .._base import SingleGroupIndex, SpatialImplicitIndex

Expand Down Expand Up @@ -111,7 +110,6 @@ def __init__(
**kwargs
):
"""Init."""

SingleGroupIndex.__init__(self, data, group_pop_var, total_pop_var)
if any([w, network, distance]):
SpatialImplicitIndex.__init__(
Expand Down
37 changes: 24 additions & 13 deletions segregation/singlegroup/modified_dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

import geopandas as gpd
import numpy as np

import pandas as pd
from .._base import SingleGroupIndex, SpatialImplicitIndex
from .dissim import _dissim
from joblib import Parallel, delayed
import multiprocessing


def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):
def _modified_dissim(
data, group_pop_var, total_pop_var, iterations=500, n_jobs=-1, backend="threading"
):
"""Calculate Modified Dissimilarity index.
Parameters
Expand Down Expand Up @@ -38,6 +42,7 @@ def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):
Reference: :cite:`carrington1997measuring`.
"""
n_jobs = multiprocessing.cpu_count()
if type(iterations) is not int:
raise TypeError("iterations must be an integer")

Expand All @@ -51,20 +56,24 @@ def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):

p_null = x.sum() / t.sum()

Ds = np.empty(iterations)

for i in np.array(range(iterations)):
def _gen_estimate(i):
data = i[0]
n = i[1]
p = i[2]

freq_sim = np.random.binomial(
n=np.array([t.tolist()]),
p=np.array([[p_null] * data.shape[0]]),
size=(1, data.shape[0]),
).tolist()[0]
freq_sim = np.random.binomial(n=n, p=p, size=(1, data.shape[0]),).tolist()[0]
data[group_pop_var] = freq_sim
# data = data.assign(group_pop_var=freq_sim)
aux = _dissim(data, group_pop_var, total_pop_var)[0]
Ds[i] = aux
return aux

Ds = np.array(
Parallel(n_jobs=n_jobs, backend=backend)(
delayed(_gen_estimate)(
(data, np.array([t.tolist()]), np.array([[p_null] * data.shape[0]]))
)
for i in range(iterations)
)
)
D_star = Ds.mean()

if D >= D_star:
Expand Down Expand Up @@ -129,6 +138,8 @@ def __init__(
decay="linear",
function="triangular",
precompute=None,
n_jobs=-1,
backend="threading",
**kwargs
):
"""Init."""
Expand All @@ -139,7 +150,7 @@ def __init__(
self, w, network, distance, decay, function, precompute
)
aux = _modified_dissim(
self.data, self.group_pop_var, self.total_pop_var, iterations
self.data, self.group_pop_var, self.total_pop_var, iterations, backend=backend
)

self.statistic = aux[0]
Expand Down
46 changes: 32 additions & 14 deletions segregation/singlegroup/modified_gini.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,17 @@

import geopandas as gpd
import numpy as np

import pandas as pd
from .._base import SingleGroupIndex, SpatialImplicitIndex
from .gini import _gini_seg
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import multiprocessing


def _modified_gini(data, group_pop_var, total_pop_var, iterations=500):
def _modified_gini(
data, group_pop_var, total_pop_var, iterations=500, backend='threading'
):
"""Calculate Modified Gini index.
Parameters
Expand Down Expand Up @@ -37,27 +42,35 @@ def _modified_gini(data, group_pop_var, total_pop_var, iterations=500):
Reference: :cite:`carrington1997measuring`.
"""
n_jobs = multiprocessing.cpu_count()

D = _gini_seg(data, group_pop_var, total_pop_var)[0]

x = np.array(data[group_pop_var].astype(int))
t = np.array(data[total_pop_var].astype(int))
x = data[group_pop_var].values
t = data[total_pop_var].values.astype(int)

p_null = x.sum() / t.sum()

Ds = np.empty(iterations)
# Ds = np.empty(iterations)

for i in np.array(range(iterations)):
def _gen_estimate(i):
data = i[0]
n = i[1]
p = i[2]

freq_sim = np.random.binomial(
n=np.array([t.tolist()]),
p=np.array([[p_null] * data.shape[0]]),
size=(1, data.shape[0]),
).tolist()[0]
freq_sim = np.random.binomial(n=n, p=p, size=(1, data.shape[0]),).tolist()[0]
data[group_pop_var] = freq_sim
# data = data.assign(group_pop_var=freq_sim)
aux = _gini_seg(data, group_pop_var, total_pop_var)[0]
Ds[i] = aux
return aux

Ds = pd.Series(
Parallel(n_jobs=n_jobs, backend=backend)(
delayed(_gen_estimate)(
(data, np.array([t.tolist()]), np.array([[p_null] * data.shape[0]]))
)
for i in range(iterations)
)
)

D_star = Ds.mean()

Expand Down Expand Up @@ -123,6 +136,7 @@ def __init__(
decay="linear",
function="triangular",
precompute=None,
backend='threading',
**kwargs
):
"""Init."""
Expand All @@ -133,7 +147,11 @@ def __init__(
self, w, network, distance, decay, function, precompute
)
aux = _modified_gini(
self.data, self.group_pop_var, self.total_pop_var, iterations
self.data,
self.group_pop_var,
self.total_pop_var,
iterations,
backend=backend
)

self.statistic = aux[0]
Expand Down
33 changes: 22 additions & 11 deletions segregation/singlegroup/spatial_prox_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
from libpysal.weights import Queen
from scipy.sparse.csgraph import floyd_warshall

from numba import njit
from .._base import SingleGroupIndex, SpatialExplicitIndex


Expand Down Expand Up @@ -45,20 +45,31 @@ def _spatial_prox_profile(data, group_pop_var, total_pop_var, w, m):
if not w:
w = Queen.from_dataframe(data)
delta = floyd_warshall(csgraph=w.sparse, directed=False)
group_vals = data[group_pop_var].to_numpy()
total_vals = data[total_pop_var].to_numpy()

grid = np.linspace(0, 1, m)

def calculate_etat(t):
g_t_i = np.where(data[group_pop_var] / data[total_pop_var] >= t, True, False)
k = g_t_i.sum()
@njit(fastmath=True, error_model="numpy")
def calc(grid):
def calculate_etat(t):
g_t_i = np.where(np.divide(group_vals, total_vals) >= t, True, False)
k = g_t_i.sum()

# i and j range only over the subset of units that meet the threshold t, as in eta_t of Hong (2014).
sub_delta_ij = delta[g_t_i, :][:, g_t_i]
# i and j range only over the subset of units that meet the threshold t, as in eta_t of Hong (2014).
sub_delta_ij = delta[g_t_i, :][:, g_t_i]

den = sub_delta_ij.sum()
eta_t = (k ** 2 - k) / den
return eta_t
den = sub_delta_ij.sum()
eta_t = (k ** 2 - k) / den
return eta_t

grid = np.linspace(0, 1, m)
aux = np.array(list(map(calculate_etat, grid)))
results = np.empty(len(grid))
for i, est in enumerate(grid):
aux = calculate_etat(est)
results[i] = aux
return results

aux = calc(grid)
aux[aux == np.inf] = 0
aux[aux == -np.inf] = 0
curve = np.nan_to_num(aux, 0)
Expand Down
1 change: 0 additions & 1 deletion segregation/singlegroup/spatial_proximity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
__author__ = "Renan X. Cortes <[email protected]>, Sergio J. Rey <[email protected]> and Elijah Knaap <[email protected]>"

import numpy as np
import pandas as pd
from ..util import generate_distance_matrix

from .._base import SingleGroupIndex, SpatialExplicitIndex
Expand Down

0 comments on commit d5d9648

Please sign in to comment.