diff --git a/esda/crand.py b/esda/crand.py
index aa82d7f4..0fea9599 100644
--- a/esda/crand.py
+++ b/esda/crand.py
@@ -103,7 +103,7 @@ def crand(
     scaling : float
         Scaling value to apply to every local statistic
     seed : None/int
-        Seed to ensure reproducibility of conditional randomizations
+        Seed to ensure reproducibility of conditional randomizations
 
     Returns
     -------
@@ -113,8 +113,8 @@ def crand(
         If keep=True, (N, permutations) array with simulated values of stat_func under
         the null of spatial randomness; else, empty (1, 1) array
     """
-    cardinalities = np.array((w.sparse != 0).sum(1)).flatten()
-    max_card = cardinalities.max()
+    adj_matrix = w.sparse
+
     n = len(z)
 
     if z.ndim == 2:
         if z.shape[1] == 2:
@@ -141,6 +141,26 @@
     # parallelise over permutations?
     if seed is None:
         seed = np.random.randint(12345, 12345000)
+
+    # we need to be careful to shuffle only *other* sites, not
+    # the self-site. This means we need to:
+    ### extract the self-weight, if any
+    self_weights = adj_matrix.diagonal()
+    ### force the self-site weight to zero
+    with warnings.catch_warnings():
+        # massive changes to sparsity incur a cost, but it's not
+        # large for simply changing the diag
+        warnings.simplefilter("ignore")
+        adj_matrix.setdiag(0)
+        adj_matrix.eliminate_zeros()
+    ### extract the weights from a now no-self-weighted adj_matrix
+    other_weights = adj_matrix.data
+    ### use the non-self weight as the cardinality, since
+    ### this is the set we have to randomize.
+    ### if there is a self-neighbor, we need to *not* shuffle the
+    ### self neighbor, since conditional randomization conditions on site i.
+    cardinalities = np.array((adj_matrix != 0).sum(1)).flatten()
+    max_card = cardinalities.max()
 
     permuted_ids = vec_permutations(max_card, n, permutations, seed)
 
     if n_jobs != 1:
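To make the diagonal handling above concrete, here is a standalone sketch (not part of the patch; the matrix values are hypothetical) of what crand() now does before building the permutation table: the diagonal is saved as self_weights, stripped from the matrix, and the cardinalities are counted over neighbors only.

```python
# Minimal sketch of the self-weight extraction above: split a weights matrix
# into a diagonal ("self") part and an off-diagonal ("other") part.
import numpy as np
from scipy import sparse

adj_matrix = sparse.csr_matrix(
    np.array(
        [
            [0.5, 1.0, 0.0],  # site 0 carries a self-weight of 0.5
            [1.0, 0.0, 1.0],
            [0.0, 1.0, 0.0],
        ]
    )
)

self_weights = adj_matrix.diagonal()  # -> [0.5, 0.0, 0.0]
# setdiag(0) may emit a SparseEfficiencyWarning; the patch suppresses it
adj_matrix.setdiag(0)
adj_matrix.eliminate_zeros()  # drop the explicit zeros left on the diagonal

other_weights = adj_matrix.data  # flat buffer of off-diagonal weights only
cardinalities = np.asarray((adj_matrix != 0).sum(1)).flatten()  # -> [1, 2, 1]

assert other_weights.sum() == 4.0  # only neighbor weights remain
assert cardinalities.max() == 2  # the permutation table needs 2 draws per site
```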
@@ -162,7 +182,8 @@
             z,  # all z, for serial this is also the entire data
             observed,  # observed statistics
             cardinalities,  # cardinalities conforming to chunked z
-            w.sparse.data,  # flat weights buffer
+            self_weights,  # n-length vector containing the self-weights.
+            other_weights,  # flat weights buffer
             permuted_ids,  # permuted ids
             scaling,  # scaling applied to all statistics
             keep,  # whether or not to keep the local statistics
@@ -171,12 +192,15 @@
     else:
         if n_jobs == -1:
             n_jobs = os.cpu_count()
+        if n_jobs > len(z):
+            n_jobs = len(z)
         # Parallel implementation
         larger, rlocals = parallel_crand(
             z,
             observed,
             cardinalities,
-            w.sparse.data,
+            self_weights,
+            other_weights,
             permuted_ids,
             scaling,
             n_jobs,
@@ -198,7 +222,8 @@ def compute_chunk(
     z: np.ndarray,
     observed: np.ndarray,
     cardinalities: np.ndarray,
-    weights: np.ndarray,
+    self_weights: np.ndarray,
+    other_weights: np.ndarray,
     permuted_ids: np.ndarray,
     scaling: np.float64,
     keep: bool,
@@ -212,18 +237,22 @@ def compute_chunk(
     ---------
     chunk_start : int
         Starting index for the chunk of input. Should be zero if z_chunk == z.
-    z_chunk : numpy.ndarray
+    z_chunk : numpy.ndarray
        (n_chunk,) array containing the chunk of standardised observed values.
     z : ndarray
         2D array with N rows with standardised observed values
     observed : ndarray
         (n_chunk,) array containing observed values for the chunk
     cardinalities : ndarray
-        (n_chunk,) array containing the cardinalities for each element.
-    weights : ndarray
-        Array containing the weights within the chunk in a flat format (ie. as
-        obtained from the `values` attribute of a CSR sparse representation of
-        the original W. This is as long as the sum of `cardinalities`
+        (n_chunk,) array containing the cardinalities for each element.
+    self_weights : ndarray of shape (n,)
+        Array containing the self-weights for each observation. In most cases, this
+        will be zero. But, in some cases (e.g. Gi-star or kernel weights), this will
+        be nonzero.
+    other_weights : ndarray
+        Array containing the weights of all other sites in the computation
+        other than site i. If self_weights is zero, this has as many entries
+        as the sum of `cardinalities`.
     permuted_ids : ndarray
         (permutations, max_cardinality) array with indices of permuted ids
         to use to construct random realizations of the statistic
@@ -254,8 +283,11 @@ def compute_chunk(
 
     for i in range(chunk_n):
         cardinality = cardinalities[i]
-        ### this chomps the first `cardinality` weights off of `weights`
-        weights_i = weights[wloc : (wloc + cardinality)]
+        # we need to fix the self-weight to the first position
+        weights_i = np.zeros(cardinality + 1, dtype=other_weights.dtype)
+        weights_i[0] = self_weights[i]
+        ### this chomps the next `cardinality` weights off of `weights`
+        weights_i[1:] = other_weights[wloc : (wloc + cardinality)]
         wloc += cardinality
         z_chunk_i = z_chunk[i]
         mask[chunk_start + i] = False
@@ -264,7 +296,9 @@ def compute_chunk(
         ]
         rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
         if keep:
-            rlocals[i,] = rstats
+            rlocals[
+                i,
+            ] = rstats
         larger[i] = np.sum(rstats >= observed[i])
 
     return larger, rlocals
@@ -284,7 +318,7 @@ def build_weights_offsets(cardinalities: np.ndarray, n_chunks: int):
     Parameters
     ----------
     cardinalities : ndarray
-        (n_chunk,) array containing the cardinalities for each element.
+        (n_chunk,) array containing the cardinalities for each element.
     n_chunks : int
         Number of chunks to split the weights into
 
@@ -312,7 +346,8 @@ def chunk_generator(
     z: np.ndarray,
     observed: np.ndarray,
     cardinalities: np.ndarray,
-    weights: np.ndarray,
+    self_weights: np.ndarray,
+    other_weights: np.ndarray,
     w_boundary_points: np.ndarray,
 ):
     """
@@ -331,7 +366,7 @@ def chunk_generator(
     observed : ndarray
         (N,) array with observed values
     cardinalities : ndarray
-        (N,) array containing the cardinalities for each element.
+        (N,) array containing the cardinalities for each element.
     weights : ndarray
         Array containing the weights within the chunk in a flat format (ie. as
         obtained from the `values` attribute of a CSR sparse representation of
@@ -344,14 +379,14 @@ def chunk_generator(
     ------
     start : int
         Starting index for the chunk of input. Should be zero if z_chunk == z.
-    z_chunk : numpy.ndarray
+    z_chunk : numpy.ndarray
        (n_chunk,) array containing the chunk of standardised observed values.
     z : ndarray
         2D array with N rows with standardised observed values
     observed_chunk : ndarray
         (n_chunk,) array containing observed values for the chunk
     cardinalities_chunk : ndarray
-        (n_chunk,) array containing the cardinalities for each element.
+        (n_chunk,) array containing the cardinalities for each element.
     weights_chunk : ndarray
         Array containing the weights within the chunk in a flat format (ie.
         as obtained from the `values` attribute of a CSR sparse representation of
@@ -361,15 +396,17 @@ def chunk_generator(
     for i in range(n_jobs):
         start = starts[i]
         z_chunk = z[start : (start + chunk_size)]
+        self_weights_chunk = self_weights[start : (start + chunk_size)]
         observed_chunk = observed[start : (start + chunk_size)]
         cardinalities_chunk = cardinalities[start : (start + chunk_size)]
-        w_chunk = weights[w_boundary_points[i] : w_boundary_points[i + 1]]
+        w_chunk = other_weights[w_boundary_points[i] : w_boundary_points[i + 1]]
         yield (
             start,
             z_chunk,
             z,
             observed_chunk,
             cardinalities_chunk,
+            self_weights_chunk,
             w_chunk,
         )
 
@@ -378,7 +415,8 @@ def parallel_crand(
     z: np.ndarray,
     observed: np.ndarray,
     cardinalities: np.ndarray,
-    weights: np.ndarray,
+    self_weights: np.ndarray,
+    other_weights: np.ndarray,
     permuted_ids: np.ndarray,
     scaling: np.float64,
     n_jobs: int,
@@ -396,11 +434,15 @@ def parallel_crand(
     observed : ndarray
         (N,) array with observed values
     cardinalities : ndarray
-        (N,) array containing the cardinalities for each element.
-    weights : ndarray
-        Array containing the weights in a flat format (ie. as
-        obtained from the `values` attribute of a CSR sparse representation of
-        the original W. This is as long as the sum of `cardinalities`
+        (N,) array containing the cardinalities for each element.
+    self_weights : ndarray of shape (n,)
+        Array containing the self-weights for each observation. In most cases, this
+        will be zero. But, in some cases (e.g. Gi-star or kernel weights), this will
+        be nonzero.
+    other_weights : ndarray
+        Array containing the weights of all other sites in the computation
+        other than site i. If self_weights is zero, this has as many entries
+        as the sum of `cardinalities`.
     permuted_ids : ndarray
         (permutations, max_cardinality) array with indices of permuted ids
         to use to construct random realizations of the statistic
@@ -454,7 +496,14 @@ def parallel_crand(
     # construct chunks using a generator
     chunks = chunk_generator(
-        n_jobs, starts, z, observed, cardinalities, weights, w_boundary_points,
+        n_jobs,
+        starts,
+        z,
+        observed,
+        cardinalities,
+        self_weights,
+        other_weights,
+        w_boundary_points,
     )
 
     with parallel_backend("loky", inner_max_num_threads=1):
@@ -463,8 +512,8 @@ def parallel_crand(
             for pars in chunks
         )
     larger, rlocals = zip(*worker_out)
-    larger = np.hstack(larger).flatten()
-    rlocals = np.row_stack(rlocals)
+    larger = np.hstack(larger).squeeze()
+    rlocals = np.row_stack(rlocals).squeeze()
     return larger, rlocals
 
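The reason crand() keeps self_weights and other_weights separate is the per-site buffer that compute_chunk() hands to every stat_func: slot 0 holds the self-weight and slots 1 onward hold the neighbor weights, so each kernel can unpack `self_weight = weights_i[0]; other_weights = weights_i[1:]`. A minimal sketch, reusing the toy numbers from the previous example:

```python
# Sketch of the per-site buffer compute_chunk() assembles for each stat_func.
import numpy as np

self_weights = np.array([0.5, 0.0, 0.0])  # from the diagonal, as above
other_weights = np.array([1.0, 1.0, 1.0, 1.0])  # flat CSR data, diagonal removed
cardinalities = np.array([1, 2, 1])

wloc = 0
for i, cardinality in enumerate(cardinalities):
    weights_i = np.zeros(cardinality + 1, dtype=other_weights.dtype)
    weights_i[0] = self_weights[i]  # self-weight pinned to the first position
    weights_i[1:] = other_weights[wloc : (wloc + cardinality)]
    wloc += cardinality
    print(i, weights_i)
# 0 [0.5 1. ]
# 1 [0.  1.  1. ]
# 2 [0.  1. ]
```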
diff --git a/esda/geary_local.py b/esda/geary_local.py
index f84ebc1b..e34df518 100644
--- a/esda/geary_local.py
+++ b/esda/geary_local.py
@@ -5,20 +5,23 @@
 from scipy import stats
 from sklearn.base import BaseEstimator
 import libpysal as lp
-from esda.crand import (
-    crand as _crand_plus,
-    njit as _njit,
-    _prepare_univariate
-)
+from esda.crand import crand as _crand_plus, njit as _njit, _prepare_univariate
 
 
 class Geary_Local(BaseEstimator):
 
     """Local Geary - Univariate"""
 
-    def __init__(self, connectivity=None, labels=False, sig=0.05,
-                 permutations=999, n_jobs=1, keep_simulations=True,
-                 seed=None):
+    def __init__(
+        self,
+        connectivity=None,
+        labels=False,
+        sig=0.05,
+        permutations=999,
+        n_jobs=1,
+        keep_simulations=True,
+        seed=None,
+    ):
         """
         Initialize a Local_Geary estimator
 
         Arguments
@@ -110,8 +113,8 @@ def fit(self, x):
         x = np.asarray(x).flatten()
 
         w = self.connectivity
-        w.transform = 'r'
-
+        w.transform = "r"
+
         permutations = self.permutations
         sig = self.sig
         keep_simulations = self.keep_simulations
@@ -122,13 +125,13 @@ def fit(self, x):
 
         if permutations:
             self.p_sim, self.rlocalG = _crand_plus(
-                z=(x - np.mean(x))/np.std(x),
+                z=(x - np.mean(x)) / np.std(x),
                 w=w,
                 observed=self.localG,
                 permutations=permutations,
                 keep=keep_simulations,
                 n_jobs=n_jobs,
-                stat_func=_local_geary
+                stat_func=_local_geary,
             )
 
         if self.labels:
@@ -137,16 +140,11 @@ def fit(self, x):
             # Create empty vector to fill
             self.labs = np.empty(len(x)) * np.nan
             # Outliers
-            self.labs[(self.localG < Eij_mean) &
-                      (x > x_mean) &
-                      (self.p_sim <= sig)] = 1
+            self.labs[(self.localG < Eij_mean) & (x > x_mean) & (self.p_sim <= sig)] = 1
             # Clusters
-            self.labs[(self.localG < Eij_mean) &
-                      (x < x_mean) &
-                      (self.p_sim <= sig)] = 2
+            self.labs[(self.localG < Eij_mean) & (x < x_mean) & (self.p_sim <= sig)] = 2
             # Other
-            self.labs[(self.localG > Eij_mean) &
-                      (self.p_sim <= sig)] = 3
+            self.labs[(self.localG > Eij_mean) & (self.p_sim <= sig)] = 3
             # Non-significant
             self.labs[self.p_sim > sig] = 4
 
@@ -155,22 +153,23 @@ def fit(self, x):
     @staticmethod
     def _statistic(x, w):
         # Calculate z-scores for x
-        zscore_x = (x - np.mean(x))/np.std(x)
+        zscore_x = (x - np.mean(x)) / np.std(x)
         # Create focal (xi) and neighbor (zi) values
         adj_list = w.to_adjlist(remove_symmetric=False)
         zseries = pd.Series(zscore_x, index=w.id_order)
         zi = zseries.loc[adj_list.focal].values
         zj = zseries.loc[adj_list.neighbor].values
         # Carry out local Geary calculation
-        gs = adj_list.weight.values * (zi-zj)**2
+        gs = adj_list.weight.values * (zi - zj) ** 2
         # Reorganize data
         adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
-        adj_list_gs.columns = ['gs', 'ID']
-        adj_list_gs = adj_list_gs.groupby(by='ID').sum()
+        adj_list_gs.columns = ["gs", "ID"]
+        adj_list_gs = adj_list_gs.groupby(by="ID").sum()
         localG = adj_list_gs.gs.values
-        return (localG)
+        return localG
+
 
 # --------------------------------------------------------------
 # Conditional Randomization Function Implementations
@@ -178,7 +177,10 @@ def _statistic(x, w):
 
 # Note: does not use the scaling parameter
 
+
 @_njit(fastmath=True)
 def _local_geary(i, z, permuted_ids, weights_i, scaling):
-    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-    return (zi-zrand)**2 @ weights_i
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return (zi - zrand) ** 2 @ other_weights
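Note why `_local_geary` can discard the self-weight entirely: the local Geary term at j = i is w_ii (z_i - z_i)^2, which is identically zero, so only the neighbor weights matter. A pure-numpy sketch (hypothetical data, not esda's API) of one conditional draw:

```python
# One conditional permutation of the local Geary kernel: square differences
# between z_i and shuffled *other* sites, weighted by the neighbor weights.
import numpy as np

rng = np.random.default_rng(0)
z = rng.standard_normal(10)
i = 3
other_weights = np.array([1.0, 1.0, 1.0])  # three neighbors, binary weights

others = np.delete(np.arange(z.size), i)  # condition on site i: never shuffle it
draw = rng.choice(others, size=other_weights.size, replace=False)
rstat = (z[i] - z[draw]) ** 2 @ other_weights  # self term would add exactly 0
print(rstat)
```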
diff --git a/esda/getisord.py b/esda/getisord.py
index c04a5df1..4a921a99 100644
--- a/esda/getisord.py
+++ b/esda/getisord.py
@@ -4,9 +4,12 @@
 __author__ = "Sergio J. Rey, Myunghwa Hwang"
 __all__ = ["G", "G_Local"]
 
+import warnings
 from libpysal.common import np, stats
 from libpysal.weights.spatial_lag import lag_spatial as slag
+from libpysal.weights.util import fill_diagonal
 from .tabular import _univariate_handler
+from .crand import njit as _njit, crand as _crand_plus, _prepare_univariate
 
 PERMUTATIONS = 999
 
@@ -220,7 +223,7 @@ def by_col(
             outvals=outvals,
             stat=cls,
             swapname=cls.__name__.lower(),
-            **stat_kws
+            **stat_kws,
         )
 
 
@@ -240,8 +243,12 @@ class G_Local(object):
     permutations : int
         the number of random permutations for calculating
         pseudo p values
-    star : boolean
+    star : boolean or float
         whether or not to include focal observation in sums (default: False)
+        if the row-transformed weight is provided, then this is the default
+        value to use within the spatial lag. Generally, weights should be
+        provided in binary form, and standardization/self-weighting will be
+        handled by the function itself.
 
     Attributes
     ----------
@@ -382,19 +389,32 @@ def __init__(
         permutations=PERMUTATIONS,
         star=False,
         keep_simulations=True,
+        n_jobs=-1,
+        seed=None,
     ):
         y = np.asarray(y).flatten()
         self.n = len(y)
         self.y = y
+        w, star = _infer_star_and_structure_w(w, star, transform)
+        w.transform = transform
+        self.w_transform = transform
         self.w = w
-        self.w_original = w.transform
-        self.w.transform = self.w_transform = transform.lower()
         self.permutations = permutations
         self.star = star
         self.calc()
-        self.p_norm = np.array([1 - stats.norm.cdf(np.abs(i)) for i in self.Zs])
+        self.p_norm = 1 - stats.norm.cdf(np.abs(self.Zs))
         if permutations:
-            self.__crand(keep_simulations)
+            self.p_sim, self.rGs = _crand_plus(
+                y,
+                w,
+                self.Gs,
+                permutations,
+                keep_simulations,
+                n_jobs=n_jobs,
+                stat_func=_g_local_star_crand if star else _g_local_crand,
+                scaling=y.sum(),
+                seed=seed,
+            )
             if keep_simulations:
                 self.sim = sim = self.rGs.T
                 self.EG_sim = sim.mean(axis=0)
@@ -442,41 +462,43 @@ def __getCardinalities(self):
         return self.wc
 
     def calc(self):
+        w = self.w
+        W = w.sparse
+
+        self.y_sum = self.y.sum()
+
         y = self.y
-        y2 = y * y
-        self.y_sum = y_sum = sum(y)
-        y2_sum = sum(y2)
-
-        if not self.star:
-            yl = 1.0 * slag(self.w, y)
-            ydi = y_sum - y
-            self.Gs = yl / ydi
-            N = self.n - 1
-            yl_mean = ydi / N
-            s2 = (y2_sum - y2) / N - (yl_mean) ** 2
-        else:
-            self.w.transform = "B"
-            yl = 1.0 * slag(self.w, y)
-            yl += y
-            if self.w_transform == "r":
-                yl = yl / (self.__getCardinalities() + 1.0)
-            self.Gs = yl / y_sum
-            N = self.n
-            yl_mean = y.mean()
-            s2 = y.var()
-
-        EGs_num, VGs_num = 1.0, 1.0
-        if self.w_transform == "b":
-            W = self.__getCardinalities()
-            W += self.star
-            EGs_num = W * 1.0
-            VGs_num = (W * (1.0 * N - W)) / (1.0 * N - 1)
-
-        self.EGs = (EGs_num * 1.0) / N
-        self.VGs = (VGs_num) * (1.0 / (N ** 2)) * ((s2 * 1.0) / (yl_mean ** 2))
-        self.Zs = (self.Gs - self.EGs) / np.sqrt(self.VGs)
-
-        self.w.transform = self.w_original
+        remove_self = not self.star
+        N = self.w.n - remove_self
+
+        statistic = (W @ y) / (y.sum() - y * remove_self)
+
+        # ----------------------------------------------------#
+        # compute moments necessary for analytical inference  #
+        # ----------------------------------------------------#
+
+        empirical_mean = (y.sum() - y * remove_self) / N
+        # variance looks complex, yes, but it obtains from E[x^2] - E[x]^2.
+        # So, break it down to allow subtraction of the self-neighbor.
+        mean_of_squares = ((y ** 2).sum() - (y ** 2) * remove_self) / N
+        empirical_variance = mean_of_squares - empirical_mean ** 2
+
+        # Since we have corrected the diagonal, this should work
+        cardinality = np.asarray(W.sum(axis=1)).squeeze()
+        expected_value = cardinality / N
+        expected_variance = (
+            cardinality
+            * (N - cardinality)
+            / (N - 1)
+            * (1 / N ** 2)
+            * (empirical_variance / (empirical_mean ** 2))
+        )
+        z_scores = (statistic - expected_value) / np.sqrt(expected_variance)
+
+        self.Gs = statistic
+        self.EGs = expected_value
+        self.VGs = expected_variance
+        self.Zs = z_scores
 
     @property
     def _statistic(self):
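In symbols, the rewritten calc() evaluates the following, where W_i = Σ_j w_ij is the row sum of the (diagonal-corrected) weights, N = n - 1 with sums over j ≠ i for Gi (remove_self = 1), and N = n with unrestricted sums for Gi* (remove_self = 0):

```latex
G_i = \frac{\sum_j w_{ij}\, y_j}{\sum_{j \neq i} y_j}, \qquad
\bar{y}_{(i)} = \frac{1}{N}\sum_{j \neq i} y_j, \qquad
s_{(i)}^2 = \frac{1}{N}\sum_{j \neq i} y_j^2 - \bar{y}_{(i)}^2, \\[4pt]
\mathrm{E}[G_i] = \frac{W_i}{N}, \qquad
\mathrm{Var}[G_i] = \frac{W_i (N - W_i)}{N - 1}
  \cdot \frac{1}{N^2}
  \cdot \frac{s_{(i)}^2}{\bar{y}_{(i)}^2}, \qquad
Z_i = \frac{G_i - \mathrm{E}[G_i]}{\sqrt{\mathrm{Var}[G_i]}}
```

This is a direct transcription of the code above: `empirical_mean`, `mean_of_squares`, and `empirical_variance` implement the (i)-subscripted moments, and the self terms drop out automatically in the star case.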
+ f"Recieved: {transform}" + ) + adj_matrix = weights.sparse + diagonal = adj_matrix.diagonal() + zero_diagonal = (diagonal == 0).all() + + # Gi has a zero diagonal, Gi* has a nonzero diagonal + star = (not zero_diagonal) if star is None else star + + # Want zero diagonal but do not have it + if (not zero_diagonal) & (star is False): + weights = fill_diagonal(weights, 0) + # Want nonzero diagonal and have it + elif (not zero_diagonal) & (star is True): + weights = weights + # Want zero diagonal and have it + elif zero_diagonal & (star is False): + weights = weights + # Want nonzero diagonal and do not have it + elif zero_diagonal & (star is True): + # if the input is binary or requested transform is binary, + # set the diagonal to 1. + if transform.lower() == "b" or weights.transform.lower() == "b": + weights = fill_diagonal(weights, 1) + # if we know the target is row-standardized, use the row max + # this works successfully for effectively binary but "O"-transformed input + elif transform.lower() == "r": + # This warning is presented in the documentation as well + warnings.warn( + "Gi* requested, but (a) weights are already row-standardized," + " (b) no weights are on the diagonal, and" + " (c) no default value supplied to star. Assuming that the" + " self-weight is equivalent to the maximum weight in the" + " row. To use a different default (like, .5), set `star=.5`," + " or use libpysal.weights.fill_diagonal() to set the diagonal" + " values of your weights matrix and use `star=None` in Gi_Local." + ) + weights = fill_diagonal( + weights, np.asarray(adj_matrix.max(axis=1).todense()).flatten() + ) + else: # star was something else, so try to fill the weights with it + try: + weights = fill_diagonal(weights, star) + except: + raise TypeError( + f"Type of star ({type(star)}) not understood." + f" Must be an integer, boolean, float, or numpy.ndarray." 
 # --------------------------------------------------------------
 # Conditional Randomization Function Implementations
 # --------------------------------------------------------------
 
-# TODO: Flesh these out correctly and implement for Gi stats
-# @_njit(fastmath=True)
-# def _gi_crand(i, z, permuted_ids, weights_i, scaling):
-#     zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-#     return (zrand * weights_i).sum(axis=1) / (scaling - zi)
-
-# @_njit(fastmath=True)
-# def _gistar_crand(i, z, permuted_ids, weights_i, scaling):
-#     zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-#     return ((zrand * weights_i).sum(axis=1)) / scaling
+@_njit(fastmath=True)
+def _g_local_crand(i, z, permuted_ids, weights_i, scaling):
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return (zrand @ other_weights) / (scaling - zi)
+
+
+@_njit(fastmath=True)
+def _g_local_star_crand(i, z, permuted_ids, weights_i, scaling):
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return (zrand @ other_weights + self_weight * zi) / scaling
+
+
+if __name__ == "__main__":
+    import geopandas, numpy, esda, importlib
+    import matplotlib.pyplot as plt
+    from libpysal import weights, examples
+
+    df = geopandas.read_file(examples.get_path("NAT.shp"))
+
+    w = weights.Rook.from_dataframe(df)
+
+    for transform in ("r", "b"):
+        for star in (True, False):
+            test = esda.getisord.G_Local(df.GI89, w, transform=transform, star=star)
+            out = test._calc2()
+            (
+                statistic,
+                expected_value,
+                expected_variance,
+                z_scores,
+                empirical_mean,
+                empirical_variance,
+            ) = out
+
+            numpy.testing.assert_allclose(statistic, test.Gs)
+            numpy.testing.assert_allclose(expected_value, test.EGs)
+            numpy.testing.assert_allclose(expected_variance, test.VGs)
+            numpy.testing.assert_allclose(z_scores, test.Zs)
+            numpy.testing.assert_allclose(empirical_mean, test.yl_mean)
+            numpy.testing.assert_allclose(empirical_variance, test.s2)
+
+    # Also check that the None configuration works
+    test = esda.getisord.G_Local(df.GI89, w, star=None)
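A sanity sketch of the two kernels just added (hypothetical data, no permutation): plugging the observed neighbor values into the Gi and Gi* formulas reproduces the closed-form statistics, since `scaling` is `y.sum()`:

```python
# With the *observed* neighbors, the permutation kernels collapse to Gi / Gi*.
import numpy as np

y = np.array([2.0, 3.0, 5.0, 8.0])
w_i = np.array([1.0, 1.0])  # binary weights: site 0 neighbors sites 1 and 2
i, scaling = 0, y.sum()

gi = (y[[1, 2]] @ w_i) / (scaling - y[i])  # _g_local_crand with true neighbors
gi_star = (y[[1, 2]] @ w_i + 1.0 * y[i]) / scaling  # self-weight of 1 for Gi*
print(gi, gi_star)  # 0.5 and 0.5555...
```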
diff --git a/esda/join_counts_local.py b/esda/join_counts_local.py
index d515af3a..7944f5f1 100644
--- a/esda/join_counts_local.py
+++ b/esda/join_counts_local.py
@@ -2,11 +2,7 @@
 import pandas as pd
 from sklearn.base import BaseEstimator
 from libpysal import weights
-from esda.crand import (
-    crand as _crand_plus,
-    njit as _njit,
-    _prepare_univariate
-)
+from esda.crand import crand as _crand_plus, njit as _njit, _prepare_univariate
 
 PERMUTATIONS = 999
 
@@ -16,8 +12,14 @@ class Join_Counts_Local(BaseEstimator):
 
     """Univariate Local Join Count Statistic"""
 
-    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
-                 keep_simulations=True, seed=None):
+    def __init__(
+        self,
+        connectivity=None,
+        permutations=PERMUTATIONS,
+        n_jobs=1,
+        keep_simulations=True,
+        seed=None,
+    ):
         """
         Initialize a Local_Join_Count estimator
 
         Arguments
@@ -31,18 +33,18 @@ def __init__(
             p_values
         n_jobs : int
             Number of cores to be used in the conditional randomisation. If -1,
-            all available cores are used.
+            all available cores are used.
         keep_simulations : Boolean (default=True)
-            If True, the entire matrix of replications under the null
-            is stored in memory and accessible; otherwise, replications
+            If True, the entire matrix of replications under the null
+            is stored in memory and accessible; otherwise, replications
             are not saved
         seed : None/int
-            Seed to ensure reproducibility of conditional randomizations.
-            Must be set here, and not outside of the function, since numba
-            does not correctly interpret external seeds
-            nor numpy.random.RandomState instances.
-
+            Seed to ensure reproducibility of conditional randomizations.
+            Must be set here, and not outside of the function, since numba
+            does not correctly interpret external seeds
+            nor numpy.random.RandomState instances.
+
         Attributes
         ----------
         LJC : numpy array
@@ -98,36 +100,36 @@ def fit(self, y, n_jobs=1, permutations=999):
         """
         # Need to ensure that the np.array() are of
         # dtype='float' for numba
-        y = np.array(y, dtype='float')
+        y = np.array(y, dtype="float")
 
         w = self.connectivity
         # Fill the diagonal with 0s
         w = weights.util.fill_diagonal(w, val=0)
-        w.transform = 'b'
-
+        w.transform = "b"
+
         keep_simulations = self.keep_simulations
         n_jobs = self.n_jobs
         seed = self.seed
-
+
         self.y = y
         self.n = len(y)
         self.w = w
 
         self.LJC = self._statistic(y, w)
-
+
         if permutations:
             self.p_sim, self.rjoins = _crand_plus(
-                z=self.y,
-                w=self.w,
+                z=self.y,
+                w=self.w,
                 observed=self.LJC,
-                permutations=permutations,
-                keep=keep_simulations,
+                permutations=permutations,
+                keep=keep_simulations,
                 n_jobs=n_jobs,
-                stat_func=_ljc_uni
+                stat_func=_ljc_uni,
             )
             # Set p-values for those with LJC of 0 to NaN
-            self.p_sim[self.LJC == 0] = 'NaN'
-
+            self.p_sim[self.LJC == 0] = "NaN"
+
         return self
 
     @staticmethod
     def _statistic(y, w):
@@ -139,12 +141,14 @@ def _statistic(y, w):
         focal = zseries.loc[adj_list.focal].values
         neighbor = zseries.loc[adj_list.neighbor].values
         LJC = (focal == 1) & (neighbor == 1)
-        adj_list_LJC = pd.DataFrame(adj_list.focal.values,
-                                    LJC.astype('uint8')).reset_index()
-        adj_list_LJC.columns = ['LJC', 'ID']
-        adj_list_LJC = adj_list_LJC.groupby(by='ID').sum()
-        LJC = np.array(adj_list_LJC.LJC.values, dtype='float')
-        return (LJC)
+        adj_list_LJC = pd.DataFrame(
+            adj_list.focal.values, LJC.astype("uint8")
+        ).reset_index()
+        adj_list_LJC.columns = ["LJC", "ID"]
+        adj_list_LJC = adj_list_LJC.groupby(by="ID").sum()
+        LJC = np.array(adj_list_LJC.LJC.values, dtype="float")
+        return LJC
+
 
 # --------------------------------------------------------------
 # Conditional Randomization Function Implementations
@@ -152,7 +156,10 @@ def _statistic(y, w):
 
 # Note: scaling not used
 
+
 @_njit(fastmath=True)
 def _ljc_uni(i, z, permuted_ids, weights_i, scaling):
-    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-    return zi * (zrand @ weights_i)
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return zi * (zrand @ other_weights)
diff --git a/esda/join_counts_local_bv.py b/esda/join_counts_local_bv.py
index bb57f091..3aed8be7 100644
--- a/esda/join_counts_local_bv.py
+++ b/esda/join_counts_local_bv.py
@@ -8,7 +8,7 @@
     crand as _crand_plus,
     njit as _njit,
     _prepare_univariate,
-    _prepare_bivariate
+    _prepare_bivariate,
 )
 
 
@@ -19,8 +19,14 @@ class Join_Counts_Local_BV(BaseEstimator):
 
     """Bivariate Local Join Count Statistic"""
 
-    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
-                 keep_simulations=True, seed=None):
+    def __init__(
+        self,
+        connectivity=None,
+        permutations=PERMUTATIONS,
+        n_jobs=1,
+        keep_simulations=True,
+        seed=None,
+    ):
         """
         Initialize a Local_Join_Counts_BV estimator
 
         Arguments
@@ -34,18 +40,18 @@ def __init__(
             p_values
         n_jobs : int
             Number of cores to be used in the conditional randomisation. If -1,
-            all available cores are used.
+            all available cores are used.
         keep_simulations : Boolean (default=True)
-            If True, the entire matrix of replications under the null
-            is stored in memory and accessible; otherwise, replications
+            If True, the entire matrix of replications under the null
+            is stored in memory and accessible; otherwise, replications
             are not saved
         seed : None/int
-            Seed to ensure reproducibility of conditional randomizations.
-            Must be set here, and not outside of the function, since numba
-            does not correctly interpret external seeds
-            nor numpy.random.RandomState instances.
-
+            Seed to ensure reproducibility of conditional randomizations.
+            Must be set here, and not outside of the function, since numba
+            does not correctly interpret external seeds
+            nor numpy.random.RandomState instances.
+
         """
 
         self.connectivity = connectivity
@@ -109,20 +115,20 @@ def fit(self, x, z, case="CLC", n_jobs=1, permutations=999):
         """
         # Need to ensure that the np.array() are of
         # dtype='float' for numba
-        x = np.array(x, dtype='float')
-        z = np.array(z, dtype='float')
+        x = np.array(x, dtype="float")
+        z = np.array(z, dtype="float")
 
         w = self.connectivity
         # Fill the diagonal with 0s
         w = weights.util.fill_diagonal(w, val=0)
-        w.transform = 'b'
+        w.transform = "b"
 
         self.x = x
         self.z = z
         self.n = len(x)
         self.w = w
         self.case = case
-
+
         keep_simulations = self.keep_simulations
         n_jobs = self.n_jobs
         seed = self.seed
@@ -133,30 +139,32 @@ def fit(self, x, z, case="CLC", n_jobs=1, permutations=999):
         if case == "BJC":
             self.p_sim, self.rjoins = _crand_plus(
                 z=np.column_stack((x, z)),
-                w=self.w,
+                w=self.w,
                 observed=self.LJC,
-                permutations=permutations,
-                keep=True,
+                permutations=permutations,
+                keep=True,
                 n_jobs=n_jobs,
-                stat_func=_ljc_bv_case1
+                stat_func=_ljc_bv_case1,
             )
             # Set p-values for those with LJC of 0 to NaN
-            self.p_sim[self.LJC == 0] = 'NaN'
+            self.p_sim[self.LJC == 0] = "NaN"
         elif case == "CLC":
             self.p_sim, self.rjoins = _crand_plus(
                 z=np.column_stack((x, z)),
-                w=self.w,
+                w=self.w,
                 observed=self.LJC,
-                permutations=permutations,
-                keep=True,
+                permutations=permutations,
+                keep=True,
                 n_jobs=n_jobs,
-                stat_func=_ljc_bv_case2
+                stat_func=_ljc_bv_case2,
             )
             # Set p-values for those with LJC of 0 to NaN
-            self.p_sim[self.LJC == 0] = 'NaN'
+            self.p_sim[self.LJC == 0] = "NaN"
         else:
-            raise NotImplementedError(f'The requested LJC method ({case}) \
-                is not currently supported!')
+            raise NotImplementedError(
+                f"The requested LJC method ({case}) \
+                is not currently supported!"
+            )
 
         return self
 
@@ -180,24 +188,31 @@ def _statistic(x, z, w, case):
         neighbor_z = zseries_z.loc[adj_list.neighbor].values
 
         if case == "BJC":
-            BJC = (focal_x == 1) & (focal_z == 0) & \
-                  (neighbor_x == 0) & (neighbor_z == 1)
-            adj_list_BJC = pd.DataFrame(adj_list.focal.values,
-                                        BJC.astype('uint8')).reset_index()
-            adj_list_BJC.columns = ['BJC', 'ID']
-            adj_list_BJC = adj_list_BJC.groupby(by='ID').sum()
-            return (np.array(adj_list_BJC.BJC.values, dtype='float'))
+            BJC = (
+                (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1)
+            )
+            adj_list_BJC = pd.DataFrame(
+                adj_list.focal.values, BJC.astype("uint8")
+            ).reset_index()
+            adj_list_BJC.columns = ["BJC", "ID"]
+            adj_list_BJC = adj_list_BJC.groupby(by="ID").sum()
+            return np.array(adj_list_BJC.BJC.values, dtype="float")
         elif case == "CLC":
-            CLC = (focal_x == 1) & (focal_z == 1) & \
-                  (neighbor_x == 1) & (neighbor_z == 1)
-            adj_list_CLC = pd.DataFrame(adj_list.focal.values,
-                                        CLC.astype('uint8')).reset_index()
-            adj_list_CLC.columns = ['CLC', 'ID']
-            adj_list_CLC = adj_list_CLC.groupby(by='ID').sum()
-            return (np.array(adj_list_CLC.CLC.values, dtype='float'))
+            CLC = (
+                (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1)
+            )
+            adj_list_CLC = pd.DataFrame(
+                adj_list.focal.values, CLC.astype("uint8")
+            ).reset_index()
+            adj_list_CLC.columns = ["CLC", "ID"]
+            adj_list_CLC = adj_list_CLC.groupby(by="ID").sum()
+            return np.array(adj_list_CLC.CLC.values, dtype="float")
         else:
-            raise NotImplementedError(f'The requested LJC method ({case}) \
-                is not currently supported!')
+            raise NotImplementedError(
+                f"The requested LJC method ({case}) \
+                is not currently supported!"
+            )
+
 
 # --------------------------------------------------------------
 # Conditional Randomization Function Implementations
@@ -205,17 +220,23 @@ def _statistic(x, z, w, case):
 
 # Note: scaling not used
 
+
 @_njit(fastmath=True)
 def _ljc_bv_case1(i, z, permuted_ids, weights_i, scaling):
     zx = z[:, 0]
     zy = z[:, 1]
-    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, weights_i)
-    return zx[i] * (zyrand @ weights_i)
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, other_weights)
+    return zx[i] * (zyrand @ other_weights)
+
 
 @_njit(fastmath=True)
 def _ljc_bv_case2(i, z, permuted_ids, weights_i, scaling):
     zx = z[:, 0]
     zy = z[:, 1]
-    zxi, zxrand, zyi, zyrand = _prepare_bivariate(i, z, permuted_ids, weights_i)
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zxi, zxrand, zyi, zyrand = _prepare_bivariate(i, z, permuted_ids, other_weights)
     zf = zxrand * zyrand
-    return zy[i] * (zf @ other_weights)
+    return zy[i] * (zf @ other_weights)
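The two cases differ only in which focal/neighbor pattern they count. A toy illustration of the BJC and CLC predicates used in `_statistic` above, with hypothetical adjacency-list values:

```python
# BJC counts x=1,z=0 sites whose neighbor has x=0,z=1;
# CLC counts x=1,z=1 sites whose neighbor is also x=1,z=1.
import numpy as np

focal_x = np.array([1, 1, 0])
focal_z = np.array([0, 1, 1])
neighbor_x = np.array([0, 1, 1])
neighbor_z = np.array([1, 1, 0])

bjc = (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1)
clc = (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1)
print(bjc.astype(int), clc.astype(int))  # [1 0 0] [0 1 0]
```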
diff --git a/esda/join_counts_local_mv.py b/esda/join_counts_local_mv.py
index 034d6ebe..5cd24b5b 100644
--- a/esda/join_counts_local_mv.py
+++ b/esda/join_counts_local_mv.py
@@ -3,11 +3,7 @@
 from scipy import sparse
 from sklearn.base import BaseEstimator
 from libpysal import weights
-from esda.crand import (
-    crand as _crand_plus,
-    njit as _njit,
-    _prepare_univariate
-)
+from esda.crand import crand as _crand_plus, njit as _njit, _prepare_univariate
 
 PERMUTATIONS = 999
 
@@ -17,8 +13,14 @@ class Join_Counts_Local_MV(BaseEstimator):
 
     """Multivariate Local Join Count Statistic"""
 
-    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
-                 keep_simulations=True, seed=None):
+    def __init__(
+        self,
+        connectivity=None,
+        permutations=PERMUTATIONS,
+        n_jobs=1,
+        keep_simulations=True,
+        seed=None,
+    ):
         """
         Initialize a Local_Join_Counts_MV estimator
 
         Arguments
@@ -32,18 +34,18 @@ def __init__(
             p_values
         n_jobs : int
             Number of cores to be used in the conditional randomisation. If -1,
-            all available cores are used.
+            all available cores are used.
         keep_simulations : Boolean (default=True)
-            If True, the entire matrix of replications under the null
-            is stored in memory and accessible; otherwise, replications
+            If True, the entire matrix of replications under the null
+            is stored in memory and accessible; otherwise, replications
             are not saved
         seed : None/int
-            Seed to ensure reproducibility of conditional randomizations.
-            Must be set here, and not outside of the function, since numba
-            does not correctly interpret external seeds
-            nor numpy.random.RandomState instances.
-
+            Seed to ensure reproducibility of conditional randomizations.
+            Must be set here, and not outside of the function, since numba
+            does not correctly interpret external seeds
+            nor numpy.random.RandomState instances.
+
         """
 
         self.connectivity = connectivity
@@ -98,37 +100,36 @@ def fit(self, variables, n_jobs=1, permutations=999):
         w = self.connectivity
         # Fill the diagonal with 0s
         w = weights.util.fill_diagonal(w, val=0)
-        w.transform = 'b'
+        w.transform = "b"
 
         self.n = len(variables[0])
         self.w = w
 
-        self.variables = np.array(variables, dtype='float')
-
+        self.variables = np.array(variables, dtype="float")
+
         keep_simulations = self.keep_simulations
         n_jobs = self.n_jobs
         seed = self.seed
 
-        # Need to ensure that the product is an
+        # Need to ensure that the product is an
         # np.array() of dtype='float' for numba
-        self.ext = np.array(np.prod(np.vstack(variables), axis=0),
-                            dtype='float')
+        self.ext = np.array(np.prod(np.vstack(variables), axis=0), dtype="float")
 
         self.LJC = self._statistic(variables, w)
 
         if permutations:
             self.p_sim, self.rjoins = _crand_plus(
-                z=self.ext,
-                w=self.w,
+                z=self.ext,
+                w=self.w,
                 observed=self.LJC,
-                permutations=permutations,
-                keep=True,
+                permutations=permutations,
+                keep=True,
                 n_jobs=n_jobs,
-                stat_func=_ljc_mv
+                stat_func=_ljc_mv,
             )
             # Set p-values for those with LJC of 0 to NaN
-            self.p_sim[self.LJC == 0] = 'NaN'
-
+            self.p_sim[self.LJC == 0] = "NaN"
+
         return self
 
     @staticmethod
@@ -140,32 +141,30 @@ def _statistic(variables, w):
         # The zseries
         zseries = [pd.Series(i, index=w.id_order) for i in variables]
         # The focal values
-        focal = [zseries[i].loc[adj_list.focal].values for
-                 i in range(len(variables))]
+        focal = [zseries[i].loc[adj_list.focal].values for i in range(len(variables))]
         # The neighbor values
-        neighbor = [zseries[i].loc[adj_list.neighbor].values for
-                    i in range(len(variables))]
+        neighbor = [
+            zseries[i].loc[adj_list.neighbor].values for i in range(len(variables))
+        ]
 
         # Find instances where all surrounding
         # focal and neighbor values == 1
-        focal_all = np.array(np.all(np.dstack(focal) == 1,
-                                    axis=2))
-        neighbor_all = np.array(np.all(np.dstack(neighbor) == 1,
-                                       axis=2))
+        focal_all = np.array(np.all(np.dstack(focal) == 1, axis=2))
+        neighbor_all = np.array(np.all(np.dstack(neighbor) == 1, axis=2))
         MCLC = (focal_all == True) & (neighbor_all == True)
-        # Convert list of True/False to boolean array
+        # Convert list of True/False to boolean array
         # and unlist (necessary for building pd.DF)
-        MCLC = list(MCLC*1)
+        MCLC = list(MCLC * 1)
 
         # Create a df that uses the adjacency list
         # focal values and the BBs counts
-        adj_list_MCLC = pd.DataFrame(adj_list.focal.values,
-                                     MCLC).reset_index()
+        adj_list_MCLC = pd.DataFrame(adj_list.focal.values, MCLC).reset_index()
         # Temporarily rename the columns
-        adj_list_MCLC.columns = ['MCLC', 'ID']
-        adj_list_MCLC = adj_list_MCLC.groupby(by='ID').sum()
+        adj_list_MCLC.columns = ["MCLC", "ID"]
+        adj_list_MCLC = adj_list_MCLC.groupby(by="ID").sum()
+
+        return np.array(adj_list_MCLC.MCLC.values, dtype="float")
 
-        return (np.array(adj_list_MCLC.MCLC.values, dtype='float'))
 
 # --------------------------------------------------------------
 # Conditional Randomization Function Implementations
@@ -173,7 +172,10 @@ def _statistic(variables, w):
 
 # Note: scaling not used
 
+
 @_njit(fastmath=True)
 def _ljc_mv(i, z, permuted_ids, weights_i, scaling):
-    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-    return zi * (zrand @ weights_i)
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return zi * (zrand @ other_weights)
diff --git a/esda/moran.py b/esda/moran.py
index afc9fcd9..332e5617 100644
--- a/esda/moran.py
+++ b/esda/moran.py
@@ -1781,13 +1781,17 @@ def _wikh_slow(W, sokal_correction=False):
 
 @_njit(fastmath=True)
 def _moran_local_bv_crand(i, z, permuted_ids, weights_i, scaling):
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
     zx = z[:, 0]
     zy = z[:, 1]
-    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, weights_i)
-    return zx[i] * (zyrand @ weights_i) * scaling
+    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, other_weights)
+    return zx[i] * (zyrand @ other_weights + self_weight * zyi) * scaling
 
 
 @_njit(fastmath=True)
 def _moran_local_crand(i, z, permuted_ids, weights_i, scaling):
-    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
-    return zi * (zrand @ weights_i) * scaling
+    self_weight = weights_i[0]
+    other_weights = weights_i[1:]
+    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
+    return zi * (zrand @ other_weights + self_weight * zi) * scaling
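A pure-numpy sketch of the corrected local Moran kernel (hypothetical values, not esda's API): the self-weight in slot 0 of weights_i now multiplies z_i itself rather than a shuffled neighbor, which is exactly what conditioning on site i requires:

```python
# One conditional draw of the corrected local Moran kernel.
import numpy as np

z = np.array([1.2, -0.4, 0.3, -1.1])
weights_i = np.array([0.5, 1.0, 1.0])  # [self-weight, two neighbor weights]
i, scaling = 0, 1.0

self_weight, other_weights = weights_i[0], weights_i[1:]
draw = np.array([2, 3])  # hypothetical permuted ids, never including i
rstat = z[i] * (z[draw] @ other_weights + self_weight * z[i]) * scaling
print(rstat)
```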
diff --git a/esda/tests/test_getisord.py b/esda/tests/test_getisord.py
index 2ee5ffbc..35ed3a42 100644
--- a/esda/tests/test_getisord.py
+++ b/esda/tests/test_getisord.py
@@ -3,9 +3,9 @@
 from .. import getisord
 from libpysal.weights.distance import DistanceBand
-from libpysal.common import pandas
+from libpysal.common import pandas, RTOL, ATOL
 
-POINTS = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]
+POINTS = np.array([(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)])
 W = DistanceBand(POINTS, threshold=15)
 Y = np.array([2, 3, 3.2, 5, 8, 7])
 
@@ -20,8 +20,8 @@ def setUp(self):
 
     def test_G(self):
         g = getisord.G(self.y, self.w)
-        self.assertAlmostEqual(g.G, 0.55709779, places=8)
-        self.assertAlmostEqual(g.p_norm, 0.1729, places=4)
+        np.testing.assert_allclose(g.G, 0.55709779, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(g.p_norm, 0.172936, rtol=RTOL, atol=ATOL)
 
     @unittest.skipIf(PANDAS_EXTINCT, "missing pandas")
     def test_by_col(self):
@@ -34,8 +34,8 @@ def test_by_col(self):
         this_pval = np.unique(r1.y_p_sim.values)
         np.random.seed(12345)
         stat = getisord.G(self.y, self.w)
-        self.assertAlmostEqual(this_getisord, stat._statistic)
-        self.assertAlmostEqual(this_pval, stat.p_sim)
+        np.testing.assert_allclose(this_getisord, stat._statistic)
+        np.testing.assert_allclose(this_pval, stat.p_sim)
 
 
 class G_Local_Tester(unittest.TestCase):
@@ -45,35 +45,33 @@ def setUp(self):
         np.random.seed(10)
 
     def test_G_Local_Binary(self):
-        lg = getisord.G_Local(self.y, self.w, transform="B")
-        self.assertAlmostEqual(lg.Zs[0], -1.0136729, places=7)
-        self.assertAlmostEqual(lg.p_sim[0], 0.10100000000000001, places=7)
-        self.assertAlmostEqual(lg.p_z_sim[0], 0.154373052, places=7)
+        lg = getisord.G_Local(self.y, self.w, transform="B", seed=10)
+        np.testing.assert_allclose(lg.Zs[0], -1.0136729, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(lg.p_sim[0], 0.102, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(lg.p_z_sim[0], 0.153923, rtol=RTOL, atol=ATOL)
 
     def test_G_Local_Row_Standardized(self):
-        lg = getisord.G_Local(self.y, self.w, transform="R")
-        self.assertAlmostEqual(lg.Zs[0], -0.62074534, places=7)
-        self.assertAlmostEqual(lg.p_sim[0], 0.10100000000000001, places=7)
+        lg = getisord.G_Local(self.y, self.w, transform="R", seed=10)
+        np.testing.assert_allclose(lg.Zs[0], -0.62074534, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(lg.p_sim[0], 0.102, rtol=RTOL, atol=ATOL)
 
     def test_G_star_Local_Binary(self):
-        lg = getisord.G_Local(self.y, self.w, transform="B", star=True)
-        self.assertAlmostEqual(lg.Zs[0], -1.39727626, places=8)
-        self.assertAlmostEqual(lg.p_sim[0], 0.10100000000000001, places=7)
+        lg = getisord.G_Local(self.y, self.w, transform="B", star=True, seed=10)
+        np.testing.assert_allclose(lg.Zs[0], -1.39727626, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(lg.p_sim[0], 0.102, rtol=RTOL, atol=ATOL)
 
     def test_G_star_Row_Standardized(self):
-        lg = getisord.G_Local(self.y, self.w, transform="R", star=True)
-        self.assertAlmostEqual(lg.Zs[0], -0.62488094, places=8)
-        self.assertAlmostEqual(lg.p_sim[0], 0.10100000000000001, places=7)
+        lg = getisord.G_Local(self.y, self.w, transform="R", star=True, seed=10)
+        np.testing.assert_allclose(lg.Zs[0], -0.62488094, rtol=RTOL, atol=ATOL)
+        np.testing.assert_allclose(lg.p_sim[0], 0.102, rtol=RTOL, atol=ATOL)
 
     @unittest.skipIf(PANDAS_EXTINCT, "missing pandas")
     def test_by_col(self):
         import pandas as pd
 
         df = pd.DataFrame(self.y, columns=["y"])
-        np.random.seed(12345)
-        r1 = getisord.G_Local.by_col(df, ["y"], w=self.w)
-        np.random.seed(12345)
-        stat = getisord.G_Local(self.y, self.w)
+        r1 = getisord.G_Local.by_col(df, ["y"], w=self.w, seed=12345)
+        stat = getisord.G_Local(self.y, self.w, seed=12345)
         np.testing.assert_allclose(r1.y_g_local.values, stat.Gs)
         np.testing.assert_allclose(r1.y_p_sim, stat.p_sim)
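The switch from `np.random.seed` to an explicit `seed=` argument is what lets these tests pin exact values: numba-compiled kernels do not see numpy's global state, so reproducibility has to flow through the seeded permutation table. A quick usage sketch (not part of the patch) with the same fixtures as above:

```python
# Two seeded runs draw identical conditional permutations, so their
# pseudo p-values match exactly.
import numpy as np
from libpysal.weights.distance import DistanceBand
from esda import getisord

points = np.array([(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)])
w = DistanceBand(points, threshold=15)
y = np.array([2, 3, 3.2, 5, 8, 7])

a = getisord.G_Local(y, w, transform="B", seed=10)
b = getisord.G_Local(y, w, transform="B", seed=10)
np.testing.assert_allclose(a.p_sim, b.p_sim)
```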