Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Theil doc #90

Merged
merged 2 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
352 changes: 267 additions & 85 deletions docs/user-guide/measure/theil.ipynb

Large diffs are not rendered by default.

265 changes: 143 additions & 122 deletions inequality/theil.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
"""
Theil Inequality metrics
"""
"""Theil Inequality metrics"""

__author__ = "Sergio J. Rey <srey@asu.edu>"
__author__ = "Sergio J. Rey <srey@sdsu.edu>"

import warnings

import numpy as np
import pandas as pd
import numpy

__all__ = ["Theil", "TheilD", "TheilDSim"]

SMALL = np.finfo("float").tiny
SMALL = numpy.finfo("float").tiny


class Theil:
Expand All @@ -28,181 +23,207 @@ class Theil:

Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).

column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.

Attributes
----------

T : numpy.array
Theil's *T* index.
An array in the shape :math:`(t,)` or :math:`(1,)`
containing Theil's *T* for each column of ``y``.

Notes
-----
The old API (nxT arrays) is deprecated and will be removed in the future.
Use nx1 sequences or DataFrames with a single column instead.
This computation involves natural logs. To prevent ``ln[0]`` from occurring, a
small value is added to each element of ``y`` before beginning the computation.

Examples
--------

>>> import libpysal
>>> import numpy
>>> from inequality.theil import Theil

>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> theil_y = Theil(y)

>>> theil_y.T
array([0.20894344, 0.15222451, 0.10472941, 0.10194725, 0.09560113,
0.10511256, 0.10660832])

"""

def __init__(self, y, column=None):
# Deprecation warning for old API
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
warnings.warn(
"The nxT input format is deprecated. In future versions, "
"please provide nx1 sequences or a DataFrame with a single column.",
DeprecationWarning,
stacklevel=2,
)
# Old API behavior
n = y.shape[0]
else:
# New API: Handle sequence or DataFrame
if isinstance(y, pd.DataFrame):
if column is None:
raise ValueError("For DataFrame input, `column` must be specified.")
y = y[column].values
elif isinstance(y, list | np.ndarray):
y = np.asarray(y)
else:
raise TypeError("Input must be an array, list, or DataFrame.")
n = len(y)

# Calculation
def __init__(self, y):
n = len(y)
y = y + SMALL * (y == 0) # can't have 0 values
yt = y.sum(axis=0)
s = y / (yt * 1.0)
lns = np.log(n * s)
lns = numpy.log(n * s)
slns = s * lns
self.T = sum(slns)
t = sum(slns)
self.T = t


class TheilD:
"""
Decomposition of Theil's *T* based on partitioning of
"""Decomposition of Theil's *T* based on partitioning of
observations into exhaustive and mutually exclusive groups.

Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).

partition : array-like, DataFrame, or sequence
Partition indicating group membership.
If using a DataFrame, specify the column using `partition_col`.

column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.

partition_col : str, optional
If `partition` is a DataFrame, specify the column to be used.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.
partition : numpy.array
An array in the shape :math:`(n,)` of elements indicating which partition
each observation belongs to. These are assumed to be exhaustive.

Attributes
----------

T : numpy.array
Global Theil's *T*.
An array in the shape :math:`(t,)` or :math:`(1,)`
containing the global inequality *T*.
bg : numpy.array
Between-group inequality.
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing between group inequality.
wg : numpy.array
Within-group inequality.
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing within group inequality.

"""
Examples
--------

def __init__(self, y, partition, column=None, partition_col=None):
# Deprecation warning for old API
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
warnings.warn(
"The nxT input format is deprecated. In future versions, "
"please provide nx1 sequences or a DataFrame with a single column.",
DeprecationWarning,
stacklevel=2,
)
else:
# New API: Handle sequence or DataFrame
if isinstance(y, pd.DataFrame):
if column is None:
raise ValueError("For DataFrame input, `column` must be specified.")
y = y[column].values
elif isinstance(y, list | np.ndarray):
y = np.asarray(y)
else:
raise TypeError("Input must be an array, list, or DataFrame.")

# Handle partition similarly
if isinstance(partition, pd.DataFrame):
if partition_col is None:
raise ValueError(
"For DataFrame input, `partition_col` must be specified."
)
partition = partition[partition_col].values
elif isinstance(partition, list | np.ndarray):
partition = np.asarray(partition)
else:
raise TypeError("Partition must be an array, list, or DataFrame.")
>>> import libpysal
>>> import numpy
>>> from inequality.theil import TheilD

groups = np.unique(partition)
t = Theil(y).T
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> regimes = numpy.array(f.by_col('hanson98'))
>>> theil_d = TheilD(y, regimes)

>>> theil_d.bg
array([0.0345889 , 0.02816853, 0.05260921, 0.05931219, 0.03205257,
0.02963731, 0.03635872])

>>> theil_d.wg
array([0.17435454, 0.12405598, 0.0521202 , 0.04263506, 0.06354856,
0.07547525, 0.0702496 ])

"""

def __init__(self, y, partition):
groups = numpy.unique(partition)
T = Theil(y).T # noqa N806
ytot = y.sum(axis=0)

# Group totals
gtot = np.array([y[partition == gid].sum(axis=0) for gid in groups])
# group totals
gtot = numpy.array([y[partition == gid].sum(axis=0) for gid in groups])

if ytot.size == 1:
if ytot.size == 1: # y is 1-d
sg = gtot / (ytot * 1.0)
sg.shape = (sg.size, 1)
else:
sg = np.dot(gtot, np.diag(1.0 / ytot))
ng = np.array([sum(partition == gid) for gid in groups])
sg = numpy.dot(gtot, numpy.diag(1.0 / ytot))
ng = numpy.array([sum(partition == gid) for gid in groups])
ng.shape = (ng.size,) # ensure ng is 1-d
# Between group inequality
n = y.shape[0]
# between group inequality
sg = sg + (sg == 0) # handle case when a partition has 0 for sum
bg = np.multiply(sg, np.log(np.dot(np.diag(len(y) * 1.0 / ng), sg))).sum(axis=0)
bg = numpy.multiply(sg, numpy.log(numpy.dot(numpy.diag(n * 1.0 / ng), sg))).sum(
axis=0
)

self.T = t
self.T = T
self.bg = bg
self.wg = t - bg
self.wg = T - bg


class TheilDSim:
"""
Random permutation-based inference on Theil's inequality decomposition.
"""Random permutation based inference on Theil's inequality decomposition.
Provides for computationally based inference regarding the inequality
decomposition using random spatial permutations.
See :cite:`rey_interregional_2010`.

Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).

partition : array-like, DataFrame, or sequence
Partition indicating group membership.
If using a DataFrame, specify the column using `partition_col`.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.
partition : numpy.array
An array in the shape :math:`(n,)` of elements indicating which partition
each observation belongs to. These are assumed to be exhaustive.
permutations : int
The number of random spatial permutations for computationally
based inference on the decomposition.

Attributes
----------

observed : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing a ``TheilD`` instance for the observed data.
bg : numpy.array
An array in the shape ``(permutations+1, t)``
representing between group inequality.
bg_pvalue : numpy.array
An array in the shape :math:`(t,1)` representing the :math:`p`-value
for the between group measure. Measures the percentage of the realized
values that were greater than or equal to the observed ``bg`` value.
Includes the observed value.
wg : numpy.array
An array in the shape ``(permutations+1)``
representing within group inequality. Depending on the
shape of ``y``, the array may be 1- or 2-dimensional.

Examples
--------

permutations : int, optional
Number of random permutations for inference (default: 99).
>>> import libpysal
>>> import numpy
>>> from inequality.theil import TheilDSim

column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> regimes = numpy.array(f.by_col('hanson98'))
>>> numpy.random.seed(10)
>>> theil_ds = TheilDSim(y, regimes, 999)

partition_col : str, optional
If `partition` is a DataFrame, specify the column to be used.
>>> theil_ds.bg_pvalue
array([0.4 , 0.344, 0.001, 0.001, 0.034, 0.072, 0.032])

"""

def __init__(self, y, partition, permutations=99, column=None, partition_col=None):
observed = TheilD(y, partition, column=column, partition_col=partition_col)
def __init__(self, y, partition, permutations=99):
observed = TheilD(y, partition)
bg_ct = observed.bg == observed.bg # already have one extreme value
bg_ct = bg_ct * 1.0
results = [observed]
for _ in range(permutations):
yp = np.random.permutation(y)
yp = numpy.random.permutation(y)
t = TheilD(yp, partition)
bg_ct += 1.0 * t.bg >= observed.bg
results.append(t)
self.results = results
self.T = observed.T
self.bg_pvalue = bg_ct / (permutations * 1.0 + 1)
self.bg = np.array([r.bg for r in results])
self.wg = np.array([r.wg for r in results])
self.bg = numpy.array([r.bg for r in results])
self.wg = numpy.array([r.wg for r in results])
Loading