Merge pull request #133 from ljwolf/getisord
add the start on new calculation method for getisord statistics
sjsrey authored Jul 26, 2021
2 parents 84bb359 + c423b96 commit 9ab060c
Showing 8 changed files with 460 additions and 262 deletions.
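The heart of the change is in esda/crand.py: the self-weight w_ii is split off from the neighbor weights before conditional randomization, so that site i is never shuffled along with its own neighbors. A minimal sketch of that separation on a toy scipy.sparse matrix (the matrix values here are made up for illustration):

import numpy as np
from scipy import sparse

# toy 3x3 weights matrix with a self-weight at site 0, as in a Gi-star setup
A = sparse.csr_matrix(np.array([
    [0.5, 1.0, 0.0],
    [1.0, 0.0, 1.0],
    [0.0, 1.0, 0.0],
]))

self_weights = A.diagonal()   # (n,) vector of self-weights, often all zero
A.setdiag(0)                  # condition on site i: drop w_ii from the graph...
A.eliminate_zeros()           # ...and purge the explicit zeros it leaves behind
other_weights = A.data        # flat buffer of neighbor weights only
cardinalities = np.asarray((A != 0).sum(1)).flatten()  # neighbors per site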
109 changes: 79 additions & 30 deletions esda/crand.py
@@ -103,7 +103,7 @@ def crand(
scaling : float
Scaling value to apply to every local statistic
seed : None/int
Seed to ensure reproducibility of conditional randomizations
Returns
-------
@@ -113,8 +113,8 @@
If keep=True, (N, permutations) array with simulated values of stat_func under the
null of spatial randomness; else, empty (1, 1) array
"""
adj_matrix = w.sparse

n = len(z)
if z.ndim == 2:
if z.shape[1] == 2:
@@ -141,6 +141,26 @@
# parallelise over permutations?
if seed is None:
seed = np.random.randint(12345, 12345000)

# we need to be careful to shuffle only *other* sites, not
# the self-site. This means we need to:
### extract the self-weight, if any
self_weights = adj_matrix.diagonal()
### force the self-site weight to zero
with warnings.catch_warnings():
# massive changes to sparsity incur a cost, but it's not
# large for simply changing the diag
warnings.simplefilter("ignore")
adj_matrix.setdiag(0)
adj_matrix.eliminate_zeros()
### extract the weights from a now no-self-weighted adj_matrix
other_weights = adj_matrix.data
### use the non-self weight as the cardinality, since
### this is the set we have to randomize.
### if there is a self-neighbor, we need to *not* shuffle the
### self neighbor, since conditional randomization conditions on site i.
cardinalities = np.array((adj_matrix != 0).sum(1)).flatten()
max_card = cardinalities.max()
permuted_ids = vec_permutations(max_card, n, permutations, seed)
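# NOTE: conditional randomization holds z_i fixed and fills site i's
# neighborhood from the other n - 1 sites; as used below, each row of
# permuted_ids is a shuffled run of max_card positions into that pool of
# n - 1 values (site i masked out), from which each site slices off its
# own `cardinality` of entries.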

if n_jobs != 1:
@@ -162,7 +182,8 @@
z, # all z, for serial this is also the entire data
observed, # observed statistics
cardinalities, # cardinalities conforming to chunked z
self_weights, # n-length vector containing the self-weights.
other_weights, # flat weights buffer
permuted_ids, # permuted ids
scaling, # scaling applied to all statistics
keep, # whether or not to keep the local statistics
@@ -171,12 +192,15 @@
else:
if n_jobs == -1:
n_jobs = os.cpu_count()
if n_jobs > len(z):
n_jobs = len(z)
# Parallel implementation
larger, rlocals = parallel_crand(
z,
observed,
cardinalities,
self_weights,
other_weights,
permuted_ids,
scaling,
n_jobs,
@@ -198,7 +222,8 @@ def compute_chunk(
z: np.ndarray,
observed: np.ndarray,
cardinalities: np.ndarray,
self_weights: np.ndarray,
other_weights: np.ndarray,
permuted_ids: np.ndarray,
scaling: np.float64,
keep: bool,
@@ -212,18 +237,22 @@ def compute_chunk(
---------
chunk_start : int
Starting index for the chunk of input. Should be zero if z_chunk == z.
z_chunk : numpy.ndarray
(n_chunk,) array containing the chunk of standardised observed values.
z : ndarray
2D array with N rows with standardised observed values
observed : ndarray
(n_chunk,) array containing observed values for the chunk
cardinalities : ndarray
(n_chunk,) array containing the cardinalities for each element.
self_weights : ndarray of shape (n,)
Array containing the self-weight for each observation. This is zero in
most cases, but can be nonzero for some statistics (e.g. Gi-star) or
for kernel weights.
other_weights : ndarray
Flat array containing, for each site, the weights of its neighbors
(excluding the self-weight). If self_weights is zero, this has as many
entries as the sum of `cardinalities`.
permuted_ids : ndarray
(permutations, max_cardinality) array with indices of permuted
ids to use to construct random realizations of the statistic
@@ -254,8 +283,11 @@ def compute_chunk(

for i in range(chunk_n):
cardinality = cardinalities[i]
# we need to fix the self-weight to the first position
weights_i = np.zeros(cardinality + 1, dtype=other_weights.dtype)
weights_i[0] = self_weights[i]
### this chomps the next `cardinality` weights off of `other_weights`
weights_i[1:] = other_weights[wloc : (wloc + cardinality)]
wloc += cardinality
z_chunk_i = z_chunk[i]
mask[chunk_start + i] = False
@@ -264,7 +296,9 @@
]
rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling)
if keep:
rlocals[
i,
] = rstats
larger[i] = np.sum(rstats >= observed[i])
return larger, rlocals
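Each call to stat_func now receives a weights vector with the self-weight pinned to position 0 and the site's `cardinality` neighbor weights behind it. A sketch of that convention, with hypothetical values:

import numpy as np

self_weight_i = 0.5                              # w_ii; zero for most statistics
neighbor_weights_i = np.array([1.0, 1.0, 0.25])  # this site's cardinality = 3

weights_i = np.zeros(len(neighbor_weights_i) + 1)
weights_i[0] = self_weight_i        # the self-weight always sits in position 0
weights_i[1:] = neighbor_weights_i  # the neighbor weights follow it
# stat_func implementations then unpack it as:
#     self_weight, other_weights = weights_i[0], weights_i[1:]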

@@ -284,7 +318,7 @@ def build_weights_offsets(cardinalities: np.ndarray, n_chunks: int):
Parameters
----------
cardinalities : ndarray
(n_chunk,) array containing the cardinalities for each element.
n_chunks : int
Number of chunks to split the weights into
@@ -312,7 +346,8 @@ def chunk_generator(
z: np.ndarray,
observed: np.ndarray,
cardinalities: np.ndarray,
self_weights: np.ndarray,
other_weights: np.ndarray,
w_boundary_points: np.ndarray,
):
"""
@@ -331,7 +366,7 @@
observed : ndarray
(N,) array with observed values
cardinalities : ndarray
(N,) array containing the cardinalities for each element.
weights : ndarray
Array containing the weights within the chunk in a flat format (ie. as
obtained from the `values` attribute of a CSR sparse representation of
@@ -344,14 +379,14 @@
------
start : int
Starting index for the chunk of input. Should be zero if z_chunk == z.
z_chunk : numpy.ndarray
(n_chunk,) array containing the chunk of standardised observed values.
z : ndarray
2D array with N rows with standardised observed values
observed_chunk : ndarray
(n_chunk,) array containing observed values for the chunk
cardinalities_chunk : ndarray
(n_chunk,) array containing the cardinalities for each element.
weights_chunk : ndarray
Array containing the weights within the chunk in a flat format (ie. as
obtained from the `values` attribute of a CSR sparse representation of
@@ -361,15 +396,17 @@
for i in range(n_jobs):
start = starts[i]
z_chunk = z[start : (start + chunk_size)]
self_weights_chunk = self_weights[start : (start + chunk_size)]
observed_chunk = observed[start : (start + chunk_size)]
cardinalities_chunk = cardinalities[start : (start + chunk_size)]
w_chunk = other_weights[w_boundary_points[i] : w_boundary_points[i + 1]]
yield (
start,
z_chunk,
z,
observed_chunk,
cardinalities_chunk,
self_weights_chunk,
w_chunk,
)

@@ -378,7 +415,8 @@ def parallel_crand(
z: np.ndarray,
observed: np.ndarray,
cardinalities: np.ndarray,
self_weights: np.ndarray,
other_weights: np.ndarray,
permuted_ids: np.ndarray,
scaling: np.float64,
n_jobs: int,
@@ -396,11 +434,15 @@
observed : ndarray
(N,) array with observed values
cardinalities : ndarray
(N,) array containing the cardinalities for each element.
self_weights : ndarray of shape (n,)
Array containing the self-weight for each observation. This is zero in
most cases, but can be nonzero for some statistics (e.g. Gi-star) or
for kernel weights.
other_weights : ndarray
Flat array containing, for each site, the weights of its neighbors
(excluding the self-weight). If self_weights is zero, this has as many
entries as the sum of `cardinalities`.
permuted_ids : ndarray
(permutations, max_cardinality) array with indices of permuted
ids to use to construct random realizations of the statistic
@@ -454,7 +496,14 @@

# construct chunks using a generator
chunks = chunk_generator(
n_jobs,
starts,
z,
observed,
cardinalities,
self_weights,
other_weights,
w_boundary_points,
)

with parallel_backend("loky", inner_max_num_threads=1):
@@ -463,8 +512,8 @@
for pars in chunks
)
larger, rlocals = zip(*worker_out)
larger = np.hstack(larger).squeeze()
rlocals = np.row_stack(rlocals).squeeze()
return larger, rlocals


56 changes: 29 additions & 27 deletions esda/geary_local.py
@@ -5,20 +5,23 @@
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import crand as _crand_plus, njit as _njit, _prepare_univariate


class Geary_Local(BaseEstimator):

"""Local Geary - Univariate"""

def __init__(
self,
connectivity=None,
labels=False,
sig=0.05,
permutations=999,
n_jobs=1,
keep_simulations=True,
seed=None,
):
"""
Initialize a Local_Geary estimator
Arguments
@@ -110,8 +113,8 @@ def fit(self, x):
x = np.asarray(x).flatten()

w = self.connectivity
w.transform = "r"

permutations = self.permutations
sig = self.sig
keep_simulations = self.keep_simulations
@@ -122,13 +125,13 @@ def fit(self, x):

if permutations:
self.p_sim, self.rlocalG = _crand_plus(
z=(x - np.mean(x)) / np.std(x),
w=w,
observed=self.localG,
permutations=permutations,
keep=keep_simulations,
n_jobs=n_jobs,
stat_func=_local_geary,
)

if self.labels:
@@ -137,16 +140,11 @@ def fit(self, x):
# Create empty vector to fill
self.labs = np.empty(len(x)) * np.nan
# Outliers
self.labs[(self.localG < Eij_mean) & (x > x_mean) & (self.p_sim <= sig)] = 1
# Clusters
self.labs[(self.localG < Eij_mean) & (x < x_mean) & (self.p_sim <= sig)] = 2
# Other
self.labs[(self.localG > Eij_mean) & (self.p_sim <= sig)] = 3
# Non-significant
self.labs[self.p_sim > sig] = 4

@@ -155,30 +153,34 @@
@staticmethod
def _statistic(x, w):
# Calculate z-scores for x
zscore_x = (x - np.mean(x)) / np.std(x)
# Create focal (xi) and neighbor (zi) values
adj_list = w.to_adjlist(remove_symmetric=False)
zseries = pd.Series(zscore_x, index=w.id_order)
zi = zseries.loc[adj_list.focal].values
zj = zseries.loc[adj_list.neighbor].values
# Carry out local Geary calculation
gs = adj_list.weight.values * (zi - zj) ** 2
# Reorganize data
adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
adj_list_gs.columns = ["gs", "ID"]
adj_list_gs = adj_list_gs.groupby(by="ID").sum()

localG = adj_list_gs.gs.values

return localG


# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not use the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
self_weight = weights_i[0]
other_weights = weights_i[1:]
zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
return (zi - zrand) ** 2 @ other_weights
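Note that self_weight is split off but never enters the sum: for the local Geary the self term is (z_i - z_i)^2 * w_ii = 0 under conditional randomization, so only the neighbor weights contribute.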
