diff --git a/.ci/37.yml b/.ci/37.yml
index b402f785..ea3c409e 100644
--- a/.ci/37.yml
+++ b/.ci/37.yml
@@ -20,6 +20,8 @@ dependencies:
   - seaborn
   - tqdm
   - urbanaccess
+  - rvlib
+  - numba
   # testing, etc
   - codecov
   - pytest
diff --git a/.ci/38.yml b/.ci/38.yml
index 0aab2b69..a6748ea8 100644
--- a/.ci/38.yml
+++ b/.ci/38.yml
@@ -20,6 +20,8 @@ dependencies:
   - seaborn
   - tqdm
   - urbanaccess
+  - rvlib
+  - numba
   # testing, etc
   - codecov
   - pytest
diff --git a/.ci/39.yml b/.ci/39.yml
index dd30ffda..a81b97ee 100644
--- a/.ci/39.yml
+++ b/.ci/39.yml
@@ -20,12 +20,14 @@ dependencies:
   - seaborn
   - tqdm
   - urbanaccess
+  - numba
   # testing, etc
   - codecov
   - pytest
   - pytest-mpl
   - pytest-cov
   - twine
+  - rvlib
   # docs
   - ipywidgets
   - nbsphinx
diff --git a/environment.yml b/environment.yml
index 12dfb511..9215268b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -21,3 +21,5 @@ dependencies:
   - seaborn
   - tqdm
   - urbanaccess
+  - rvlib
+  - numba
diff --git a/requirements.txt b/requirements.txt
index e57e4e80..d58acb24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,5 @@ scipy
 seaborn
 tqdm
 urbanaccess
+rvlib
+numba
diff --git a/segregation/singlegroup/density_corrected_dissim.py b/segregation/singlegroup/density_corrected_dissim.py
index 3268bada..da3797e6 100644
--- a/segregation/singlegroup/density_corrected_dissim.py
+++ b/segregation/singlegroup/density_corrected_dissim.py
@@ -5,8 +5,8 @@
 import geopandas as gpd
 import numpy as np
 import pandas as pd
+from rvlib import Normal
 from scipy.optimize import minimize
-from scipy.stats import norm
 
 from .._base import SingleGroupIndex, SpatialImplicitIndex
 
@@ -57,6 +57,7 @@ def _density_corrected_dissim(data, group_pop_var, total_pop_var, xtol=1e-5):
     # Constructing function that returns $n(\hat{\theta}_j)$
     def return_optimal_theta(theta_j):
+        norm = Normal(0, 1)
         def fold_norm(x):
             y = (-1) * (norm.pdf(x - theta_j) + norm.pdf(x + theta_j))
             return y
 
diff --git a/segregation/singlegroup/gini.py b/segregation/singlegroup/gini.py
index 37f4a458..6ebe9afd 100644
--- a/segregation/singlegroup/gini.py
+++ b/segregation/singlegroup/gini.py
@@ -4,7 +4,6 @@
 
 import geopandas as gpd
 import numpy as np
-import pandas as pd
 
 from .._base import SingleGroupIndex, SpatialImplicitIndex
 
@@ -111,7 +110,6 @@ def __init__(
         **kwargs
     ):
         """Init."""
-        SingleGroupIndex.__init__(self, data, group_pop_var, total_pop_var)
         SingleGroupIndex.__init__(self, data, group_pop_var, total_pop_var)
         if any([w, network, distance]):
             SpatialImplicitIndex.__init__(
diff --git a/segregation/singlegroup/modified_dissim.py b/segregation/singlegroup/modified_dissim.py
index 974a67fb..d6a01989 100644
--- a/segregation/singlegroup/modified_dissim.py
+++ b/segregation/singlegroup/modified_dissim.py
@@ -4,12 +4,16 @@
 
 import geopandas as gpd
 import numpy as np
-
+import pandas as pd
 from .._base import SingleGroupIndex, SpatialImplicitIndex
 from .dissim import _dissim
+from joblib import Parallel, delayed
+import multiprocessing
 
 
-def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):
+def _modified_dissim(
+    data, group_pop_var, total_pop_var, iterations=500, n_jobs=-1, backend="threading"
+):
     """Calculate Modified Dissimilarity index.
 
     Parameters
@@ -38,6 +42,7 @@
     Reference: :cite:`carrington1997measuring`.
 
     """
+    n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
     if type(iterations) is not int:
         raise TypeError("iterations must be an integer")
 
@@ -51,20 +56,24 @@
 
     p_null = x.sum() / t.sum()
 
-    Ds = np.empty(iterations)
-
-    for i in np.array(range(iterations)):
+    def _gen_estimate(i):
+        data = i[0].copy()
+        n = i[1]
+        p = i[2]
 
-        freq_sim = np.random.binomial(
-            n=np.array([t.tolist()]),
-            p=np.array([[p_null] * data.shape[0]]),
-            size=(1, data.shape[0]),
-        ).tolist()[0]
+        freq_sim = np.random.binomial(n=n, p=p, size=(1, data.shape[0]),).tolist()[0]
         data[group_pop_var] = freq_sim
-        # data = data.assign(group_pop_var=freq_sim)
         aux = _dissim(data, group_pop_var, total_pop_var)[0]
-        Ds[i] = aux
+        return aux
 
+    Ds = np.array(
+        Parallel(n_jobs=n_jobs, backend=backend)(
+            delayed(_gen_estimate)(
+                (data, np.array([t.tolist()]), np.array([[p_null] * data.shape[0]]))
+            )
+            for i in range(iterations)
+        )
+    )
     D_star = Ds.mean()
 
     if D >= D_star:
@@ -129,6 +138,8 @@ def __init__(
         decay="linear",
         function="triangular",
         precompute=None,
+        n_jobs=-1,
+        backend="threading",
         **kwargs
     ):
         """Init."""
@@ -139,7 +150,7 @@ def __init__(
             self, w, network, distance, decay, function, precompute
         )
         aux = _modified_dissim(
-            self.data, self.group_pop_var, self.total_pop_var, iterations
+            self.data, self.group_pop_var, self.total_pop_var, iterations, n_jobs=n_jobs, backend=backend
         )
         self.statistic = aux[0]
         self.core_data = aux[1]
diff --git a/segregation/singlegroup/modified_gini.py b/segregation/singlegroup/modified_gini.py
index 84fc4d06..0e5de78c 100644
--- a/segregation/singlegroup/modified_gini.py
+++ b/segregation/singlegroup/modified_gini.py
@@ -4,12 +4,17 @@
 
 import geopandas as gpd
 import numpy as np
-
+import pandas as pd
 from .._base import SingleGroupIndex, SpatialImplicitIndex
 from .gini import _gini_seg
+from tqdm.auto import tqdm
+from joblib import Parallel, delayed
+import multiprocessing
 
 
-def _modified_gini(data, group_pop_var, total_pop_var, iterations=500):
+def _modified_gini(
+    data, group_pop_var, total_pop_var, iterations=500, n_jobs=-1, backend='threading'
+):
     """Calculate Modified Gini index.
 
     Parameters
@@ -37,27 +42,35 @@
     Reference: :cite:`carrington1997measuring`.
 
     """
+    n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
     D = _gini_seg(data, group_pop_var, total_pop_var)[0]
 
-    x = np.array(data[group_pop_var].astype(int))
-    t = np.array(data[total_pop_var].astype(int))
+    x = data[group_pop_var].values
+    t = data[total_pop_var].values.astype(int)
 
     p_null = x.sum() / t.sum()
 
-    Ds = np.empty(iterations)
+    # Ds = np.empty(iterations)
 
-    for i in np.array(range(iterations)):
+    def _gen_estimate(i):
+        data = i[0].copy()
+        n = i[1]
+        p = i[2]
 
-        freq_sim = np.random.binomial(
-            n=np.array([t.tolist()]),
-            p=np.array([[p_null] * data.shape[0]]),
-            size=(1, data.shape[0]),
-        ).tolist()[0]
+        freq_sim = np.random.binomial(n=n, p=p, size=(1, data.shape[0]),).tolist()[0]
         data[group_pop_var] = freq_sim
-        # data = data.assign(group_pop_var=freq_sim)
         aux = _gini_seg(data, group_pop_var, total_pop_var)[0]
-        Ds[i] = aux
+        return aux
+
+    Ds = pd.Series(
+        Parallel(n_jobs=n_jobs, backend=backend)(
+            delayed(_gen_estimate)(
+                (data, np.array([t.tolist()]), np.array([[p_null] * data.shape[0]]))
+            )
+            for i in range(iterations)
+        )
+    )
 
     D_star = Ds.mean()
 
     if D >= D_star:
@@ -123,6 +136,7 @@ def __init__(
         decay="linear",
         function="triangular",
         precompute=None,
+        backend='threading',
         **kwargs
     ):
         """Init."""
@@ -133,7 +147,11 @@ def __init__(
             self, w, network, distance, decay, function, precompute
         )
         aux = _modified_gini(
-            self.data, self.group_pop_var, self.total_pop_var, iterations
+            self.data,
+            self.group_pop_var,
+            self.total_pop_var,
+            iterations,
+            backend=backend
         )
         self.statistic = aux[0]
         self.core_data = aux[1]
diff --git a/segregation/singlegroup/spatial_prox_profile.py b/segregation/singlegroup/spatial_prox_profile.py
index 330eeea1..46cd364a 100644
--- a/segregation/singlegroup/spatial_prox_profile.py
+++ b/segregation/singlegroup/spatial_prox_profile.py
@@ -5,7 +5,7 @@
 import numpy as np
 from libpysal.weights import Queen
 from scipy.sparse.csgraph import floyd_warshall
-
+from numba import njit
 from .._base import SingleGroupIndex, SpatialExplicitIndex
 
 
@@ -45,20 +45,31 @@ def _spatial_prox_profile(data, group_pop_var, total_pop_var, w, m):
     if not w:
         w = Queen.from_dataframe(data)
     delta = floyd_warshall(csgraph=w.sparse, directed=False)
+    group_vals = data[group_pop_var].to_numpy()
+    total_vals = data[total_pop_var].to_numpy()
+
+    grid = np.linspace(0, 1, m)
 
-    def calculate_etat(t):
-        g_t_i = np.where(data[group_pop_var] / data[total_pop_var] >= t, True, False)
-        k = g_t_i.sum()
+    @njit(fastmath=True, error_model="numpy")
+    def calc(grid):
+        def calculate_etat(t):
+            g_t_i = np.where(np.divide(group_vals, total_vals) >= t, True, False)
+            k = g_t_i.sum()
 
-        # i and j only varies in the units subset within the threshold in eta_t of Hong (2014).
-        sub_delta_ij = delta[g_t_i, :][:, g_t_i]
+            # i and j only varies in the units subset within the threshold in eta_t of Hong (2014).
+            sub_delta_ij = delta[g_t_i, :][:, g_t_i]
 
-        den = sub_delta_ij.sum()
-        eta_t = (k ** 2 - k) / den
-        return eta_t
+            den = sub_delta_ij.sum()
+            eta_t = (k ** 2 - k) / den
+            return eta_t
 
-    grid = np.linspace(0, 1, m)
-    aux = np.array(list(map(calculate_etat, grid)))
+        results = np.empty(len(grid))
+        for i, est in enumerate(grid):
+            aux = calculate_etat(est)
+            results[i] = aux
+        return results
+
+    aux = calc(grid)
     aux[aux == np.inf] = 0
     aux[aux == -np.inf] = 0
     curve = np.nan_to_num(aux, 0)
diff --git a/segregation/singlegroup/spatial_proximity.py b/segregation/singlegroup/spatial_proximity.py
index 6ef5d393..7fcf884d 100644
--- a/segregation/singlegroup/spatial_proximity.py
+++ b/segregation/singlegroup/spatial_proximity.py
@@ -3,7 +3,6 @@
 __author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>"
 
 import numpy as np
-import pandas as pd
 
 from ..util import generate_distance_matrix
 from .._base import SingleGroupIndex, SpatialExplicitIndex