# GLANCE job: installs the `glance` extra and runs its unit tests on Python 3.10.
  build-glance-on-py310:
    # The type of runner that the job will run on
    runs-on: "${{ matrix.os }}"
    strategy:
      matrix:
        #os: [ubuntu-18.04, ubuntu-latest, macos-latest, windows-latest]
        os: [ubuntu-20.04, macos-latest, windows-latest]
        python-version: ["3.10"]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - name: Step 1 - checkout aix360 repository
        uses: actions/checkout@v3

      - name: Step 2 - set up python version
        uses: actions/setup-python@v4
        with:
          python-version: "${{ matrix.python-version }}"

      - name: Step 3 - upgrade setuptools
        run: pip3 install pytest nbmake wheel --upgrade setuptools

      # The step name previously mentioned "dipvae" (copy-paste from another job);
      # this job installs the `glance` extra.
      - name: Step 4 - Install aix360 with glance algorithm related dependencies
        run: pip3 install .[glance]

      - name: Step 5 - Test Base
        run: pytest ./tests/glance/test_base.py

      - name: Step 6 - Test Counterfactual Costs
        run: pytest ./tests/glance/test_counterfactual_costs.py

      - name: Step 7 - Test Counterfactual Tree
        run: pytest ./tests/glance/test_counterfactual_tree.py

      - name: Step 8 - Test Iterative Merges
        run: pytest ./tests/glance/test_iterative_merges.py

      - name: Step 9 - Test KMeans
        run: pytest ./tests/glance/test_KMeans.py

      - name: Step 10 - Test Local Cfs
        run: pytest ./tests/glance/test_local_cfs.py

      - name: Step 11 - Test Node
        run: pytest ./tests/glance/test_node.py

      - name: Step 12 - Test Phase2
        run: pytest ./tests/glance/test_phase2.py

      - name: Step 13 - Test Utils
        run: pytest ./tests/glance/test_utils.py
class ClusteringMethod(ABC):
    """
    Interface every clustering backend must implement.

    Subclasses provide `fit` and `predict`; the class cannot be instantiated
    until both are overridden.
    """

    def __init__(self):
        """Create the clustering method; the base class keeps no state."""

    @abstractmethod
    def fit(self, data: pd.DataFrame):
        """
        Fit the clustering model.

        Parameters:
        ----------
        data : pd.DataFrame
            Input data the model is fitted on.
        """

    @abstractmethod
    def predict(self, instances: pd.DataFrame) -> np.ndarray:
        """
        Assign a cluster label to every instance.

        Parameters:
        ----------
        instances : pd.DataFrame
            Instances to label.

        Returns:
        -------
        np.ndarray
            One cluster label per instance.
        """


class LocalCounterfactualMethod(ABC):
    """
    Interface for methods that produce counterfactuals for individual instances.
    """

    def __init__(self):
        """Create the local counterfactual method; the base class keeps no state."""

    @abstractmethod
    def fit(self, **kwargs):
        """
        Fit the counterfactual method.

        Parameters:
        ----------
        **kwargs
            Implementation-specific fitting arguments.
        """

    @abstractmethod
    def explain_instances(
        self, instances: pd.DataFrame, num_counterfactuals: int
    ) -> pd.DataFrame:
        """
        Generate local counterfactuals for the given instances.

        Parameters:
        ----------
        instances : pd.DataFrame
            Instances for which counterfactuals are requested.
        num_counterfactuals : int
            Number of counterfactuals to generate per instance.

        Returns:
        -------
        pd.DataFrame
            The counterfactual instances.
        """


class GlobalCounterfactualMethod(ABC):
    """
    Interface for methods that produce counterfactuals for a whole group.
    """

    def __init__(self, **kwargs):
        """
        Create the global counterfactual method.

        Parameters:
        ----------
        **kwargs
            Accepted for subclass convenience; the base class ignores them.
        """

    @abstractmethod
    def fit(self, X, y, **kwargs):
        """
        Fit the counterfactual method.

        Parameters:
        ----------
        X
            Feature data passed by the caller.
        y
            Target data passed by the caller.
        **kwargs
            Implementation-specific fitting arguments.
        """

    @abstractmethod
    def explain_group(self, instances: pd.DataFrame) -> pd.DataFrame:
        """
        Generate global counterfactuals for a group of instances.

        Parameters:
        ----------
        instances : pd.DataFrame
            Group of instances to explain.
            NOTE(review): earlier documentation described this argument as
            optional (None meaning "all affected instances"), but the abstract
            signature declares no default - confirm against implementations.

        Returns:
        -------
        pd.DataFrame
            The counterfactual instances.
        """
class KMeansMethod(ClusteringMethod):
    """
    Clustering method backed by scikit-learn's KMeans.

    The estimator is always created from `num_clusters` and `random_seed`, so
    `model` reflects the requested configuration even before `fit` is called.
    """

    def __init__(self, num_clusters, random_seed):
        """
        Initializes the KMeansMethod class.

        Parameters:
        ----------
        num_clusters : int
            The number of clusters to form as well as the number of centroids to generate.
        random_seed : int
            A seed for the random number generator to ensure reproducibility.
        """
        super().__init__()
        self.num_clusters = num_clusters
        self.random_seed = random_seed
        # Previously an unconfigured `KMeans()` placeholder was stored here; it
        # ignored both constructor arguments until `fit` replaced it.
        self.model = self._build_model()

    def _build_model(self):
        """Return a fresh, unfitted KMeans estimator with this object's settings."""
        return KMeans(
            n_clusters=self.num_clusters, n_init=10, random_state=self.random_seed
        )

    def fit(self, data):
        """
        Fits the KMeans model on the provided dataset.

        A new estimator is built on every call, so repeated fits do not reuse
        state from an earlier fit.

        Parameters:
        ----------
        data : array-like or sparse matrix, shape (n_samples, n_features)
            Training instances to cluster.

        Returns:
        -------
        None
        """
        self.model = self._build_model()
        self.model.fit(data)

    def predict(self, instances):
        """
        Predicts the nearest cluster each sample in the provided data belongs to.

        Parameters:
        ----------
        instances : array-like or sparse matrix, shape (n_samples, n_features)
            New data to predict.

        Returns:
        -------
        labels : array, shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        return self.model.predict(instances)
def build_dist_func_dataframe(
    X: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
    n_bins: int = 10,
) -> Callable[[pd.DataFrame, pd.DataFrame], pd.Series]:
    """
    Build a row-wise distance function over numerical and categorical columns.

    Every numerical column is rescaled by its "bin width", i.e. the value range
    observed in `X` divided by `n_bins`; values are scaled, not discretised.
    The returned function sums the absolute differences of the rescaled
    numerical columns and adds the number of mismatching categorical columns.

    Parameters:
    ----------
    X : pd.DataFrame
        The reference DataFrame used to determine the bin width of each numerical column.
    numerical_columns : List[str]
        List of column names in `X` that contain numerical features.
    categorical_columns : List[str]
        List of column names in `X` that contain categorical features.
    n_bins : int, optional
        The number of bins the observed range of a numerical column is divided into,
        by default 10. Must be positive.

    Returns:
    -------
    Callable[[pd.DataFrame, pd.DataFrame], pd.Series]
        A function taking two DataFrames (`X1`, `X2`) and returning one distance
        per row. The frames are combined with ordinary pandas arithmetic, so
        they are expected to carry the same index.

    Raises:
    ------
    ValueError
        If `n_bins` is not positive.
    """
    if n_bins <= 0:
        raise ValueError(f"n_bins must be a positive integer, got {n_bins!r}.")

    numerical_columns = list(numerical_columns)
    categorical_columns = list(categorical_columns)

    feat_intervals = {}
    for col in numerical_columns:
        width = (X[col].max() - X[col].min()) / n_bins
        # A feature that is constant (or empty) in the reference data has a
        # zero/NaN width. Dividing by it yields inf/NaN distances that the row
        # sum would silently drop, so such features are left unscaled instead.
        feat_intervals[col] = width if np.isfinite(width) and width > 0 else 1.0

    def bin_numericals(instances: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame is never modified.
        ret = instances.copy()
        for col in numerical_columns:
            ret[col] = ret[col] / feat_intervals[col]
        return ret

    def dist_f(X1: pd.DataFrame, X2: pd.DataFrame) -> pd.Series:
        X1 = bin_numericals(X1)
        X2 = bin_numericals(X2)

        ret = (X1[numerical_columns] - X2[numerical_columns]).abs().sum(axis="columns")
        ret += (
            (X1[categorical_columns] != X2[categorical_columns])
            .astype(int)
            .sum(axis="columns")
        )

        return ret

    return dist_f
Callable +from ..base import GlobalCounterfactualMethod, LocalCounterfactualMethod +from ..iterative_merges.iterative_merges import C_GLANCE, _select_action_max_eff +import pandas as pd +from ..utils.metadata_requests import _decide_local_cf_method +from ..utils.centroid import centroid_pandas +from ..utils.action import extract_actions_pandas, apply_action_pandas +from sklearn.inspection import permutation_importance +from ..iterative_merges.iterative_merges import cumulative +from ..counterfactual_costs import build_dist_func_dataframe +from .node import Node +import numpy as np +from tqdm import tqdm + + +class T_GLANCE: + """ + A class to generate counterfactual explanations using a decision tree-like structure. + + This class allows users to create a tree structure for counterfactual generation, + optimizing effectiveness and cost based on specified features. It supports both local + and global methods for generating counterfactuals. + + Attributes: + ---------- + model : Any + The predictive model used for generating counterfactuals. + split_features : Union[List, int] + Features to split the tree. Can be a list of feature names or an integer specifying + the number of top features to use based on permutation importance. + partition_counterfactuals : int + The number of partitions to create for counterfactuals. + child_count : int + The number of children each node can have. + global_method : Union[GlobalCounterfactualMethod, str] + The global counterfactual generation method to use. + local_method : Union[LocalCounterfactualMethod, str] + The local counterfactual generation method to use. + num_local_counterfactuals : int + The number of local counterfactuals to generate. + node : Node + The root node of the counterfactual tree. + node_instances : pd.DataFrame + The instances that were used to build the counterfactual tree. + dist_func_dataframe : Callable + A distance function for calculating distances between instances. 
+ + Methods: + ------- + fit(X, y, train_dataset=None, feat_to_vary="all", random_seed=13, numeric_features_names=None, categorical_features_names=None): + Fits the counterfactual tree to the provided data. + + _local_group_eff_cost(instances): + Calculates the effectiveness and cost of local counterfactuals for a group of instances. + + _group_eff_cost(instances): + Calculates the effectiveness and cost of counterfactuals for a group of instances, + utilizing local or global methods. + + partition_group(instances): + Partitions the group of instances into a tree structure based on the specified features. + + cumulative_leaf_actions(): + Computes the total effectiveness and cost of actions taken from leaf nodes of the tree. + + """ + + def __init__( + self, + model: Any, + split_features: Union[List, int] = None, + partition_counterfactuals: int = None, + child_count: int = 2, + global_method: Union[GlobalCounterfactualMethod, str] = None, + local_method: Union[LocalCounterfactualMethod, str] = None, + num_local_counterfactuals: int = 100, + ): + """ + Initializes the CounterfactualTree instance. + + Parameters: + ---------- + model : Any + The predictive model to use for generating counterfactuals. + split_features : Union[List, int], optional + Features to split the tree. If None, uses permutation importance to select. + If an integer, selects the top N features. + partition_counterfactuals : int, optional + Number of partitions for counterfactual generation. + child_count : int, optional + Number of children for each node in the tree. Default is 2. + global_method : Union[GlobalCounterfactualMethod, str], optional + The global counterfactual generation method to use. + local_method : Union[LocalCounterfactualMethod, str], optional + The local counterfactual generation method to use. + num_local_counterfactuals : int, optional + Number of local counterfactuals to generate. Default is 100. 
+ """ + self.model = model + self.split_features = split_features + self.partition_counterfactuals = partition_counterfactuals + self.child_count = child_count + self.global_method = global_method + self.local_method = local_method + self.num_local_counterfactuals = num_local_counterfactuals + + def fit( + self, + X: pd.DataFrame, + y: pd.Series, + train_dataset: Optional[pd.DataFrame] = None, + feat_to_vary: Optional[Union[List[str], str]] = "all", + random_seed: int = 13, + numeric_features_names: Optional[List[str]] = None, + categorical_features_names: Optional[List[str]] = None, + ): + """ + Fits the counterfactual tree to the provided data. + + Parameters: + ---------- + X : pd.DataFrame + Features of the dataset. + y : pd.Series + Target variable. + train_dataset : Optional[pd.DataFrame], optional + The training dataset to use for local counterfactual generation methods. + feat_to_vary : Optional[Union[List[str], str]], optional + Features to vary in counterfactual generation. Default is "all". + random_seed : int, optional + Random seed for reproducibility. Default is 13. + numeric_features_names : Optional[List[str]], optional + List of numeric feature names. If None, they will be inferred from X. + categorical_features_names : Optional[List[str]], optional + List of categorical feature names. If None, they will be inferred from X. 
+ """ + if self.split_features == None: + perm_importance = permutation_importance( + self.model, X, y, n_repeats=30, random_state=42 + ) + + feature_names = X.columns + + mean_importance = perm_importance.importances_mean + top_indices = mean_importance.argsort()[-2:][::-1] + top_features = feature_names[top_indices] + + self.split_features = list(top_features) + elif isinstance(self.split_features, int): + perm_importance = permutation_importance( + self.model, X, y, n_repeats=30, random_state=42 + ) + + feature_names = X.columns + + mean_importance = perm_importance.importances_mean + top_indices = mean_importance.argsort()[-self.split_features:][::-1] + top_features = feature_names[top_indices] + + self.split_features = list(top_features) + + self.split_values = _get_split_values(X, self.split_features, self.child_count) + + if numeric_features_names is None: + if categorical_features_names is None: + numeric_features_names = X.select_dtypes( + include=["number"] + ).columns.tolist() + else: + numeric_features_names = X.columns.difference( + categorical_features_names + ).tolist() + if categorical_features_names is None: + categorical_features_names = X.columns.difference( + numeric_features_names + ).tolist() + + self.numerical_features_names = numeric_features_names + self.categorical_features_names = categorical_features_names + self.X = X + self.y = y + self.train_dataset = train_dataset + self.random_seed = random_seed + self.feat_to_vary = feat_to_vary + self.dist_func_dataframe = build_dist_func_dataframe( + self.X, self.numerical_features_names, self.categorical_features_names + ) + + if self.local_method == None: + backup = "Dice" + else: + backup = self.local_method + self.cf_generator_backup = _decide_local_cf_method( + method=backup, + model=self.model, + train_dataset=self.train_dataset, + numeric_features_names=self.numerical_features_names, + categorical_features_names=self.categorical_features_names, + feat_to_vary=self.feat_to_vary, + 
random_seed=random_seed, + ) + + if self.global_method == None and self.local_method == None: + self.generation_method = "Global-IM" + if self.partition_counterfactuals == None: + self.partition_counterfactuals = 3 + self.cf_generator = C_GLANCE( + self.model, final_clusters=self.partition_counterfactuals, verbose=False + ) + if self.train_dataset is None: + raise ValueError( + "You need to pass train_dataset for Dice if you want default Iterative merges." + ) + self.cf_generator.fit(X, y, self.train_dataset) + elif self.global_method != None: + self.generation_method = "Global" + if self.partition_counterfactuals == None: + self.partition_counterfactuals = 3 + self.cf_generator = self.global_method + else: + self.generation_method = "Local" + if self.partition_counterfactuals == None: + self.partition_counterfactuals = 1 + + def _local_group_eff_cost(self, instances): + """ + Calculates the effectiveness and cost of local counterfactuals for a group of instances. + + Parameters: + ---------- + instances : pd.DataFrame + The group of instances to analyze. + + Returns: + ------- + Tuple[float, float, List[Any]] + A tuple containing the effectiveness, cost, and list of actions. 
+ """ + centroid = centroid_pandas( + instances, + self.numerical_features_names, + self.categorical_features_names, + ) + cfs = self.cf_generator_backup.explain_instances( + centroid, + self.num_local_counterfactuals, + ) + if cfs.shape[0] == 0: + return 0, 0, [] + + actions = extract_actions_pandas( + X=pd.concat([centroid] * cfs.shape[0]).set_index( + cfs.index + ), + cfs=cfs, + categorical_features=self.categorical_features_names, + numerical_features=self.numerical_features_names, + categorical_no_action_token="-", + ) + # actions = [action for _, action in actions.iterrows()] + actions_info = _select_action_max_eff( + self.model, + instances, + actions, + self.dist_func_dataframe, + self.numerical_features_names, + self.categorical_features_names, + self.partition_counterfactuals, + ) + if type(actions_info) is not list: + actions_info = [actions_info] + actions = [action for _, _, action in actions_info] + eff, cost = cumulative( + self.model, + instances, + actions, + self.dist_func_dataframe, + self.numerical_features_names, + self.categorical_features_names, + "-", + ) + return eff, cost, actions + + def _group_eff_cost( + self, + instances, + ): + """ + Calculates the effectiveness and cost of counterfactuals for a group of instances, + utilizing local or global methods. + + Parameters: + ---------- + instances : pd.DataFrame + The group of instances to analyze. + + Returns: + ------- + Tuple[float, float, List[Any]] + A tuple containing the effectiveness, cost, and list of actions. 
+ """ + + if self.generation_method == "Local": + return self._local_group_eff_cost(instances) + elif self.generation_method == "Global-IM": + clusters = min(100, len(instances)) + if clusters < self.partition_counterfactuals: + return self._local_group_eff_cost(instances) + else: + self.cf_generator.initial_clusters = clusters + eff, cost = self.cf_generator.explain_group(instances) + actions = self.cf_generator.global_actions() + elif self.generation_method == 'Global': + eff, cost = self.cf_generator.explain_group(instances) + actions = self.cf_generator.global_actions() + else: + raise ValueError("Generation method does not exist") + + + return eff, cost, actions + + def partition_group(self, instances: pd.DataFrame): + """ + Partitions the group of instances into a tree structure based on the specified features. + + Parameters: + ---------- + instances : pd.DataFrame + The group of instances to partition. + + Returns: + ------- + Node + The root node of the partitioned tree. + """ + + def _partition_group( + group, split_features, eff_prec=None, cost_prec=None, actions_prec=None + ): + + if eff_prec == None: + eff_node, cost_node, actions = self._group_eff_cost(group) + else: + eff_node, cost_node, actions = eff_prec, cost_prec, actions_prec + + node = Node( + effectiveness=eff_node, cost=cost_node, actions=actions, size=len(group) + ) + possible_splits = [] + + for feature in split_features: + eff_children, cost_children = 0, 0 + children_info = [] + + for feature_split_values in self.split_values[feature]: + split_df = group[group[feature].isin(feature_split_values)] + + if not split_df.empty: + eff_child, cost_child, actions = self._group_eff_cost(split_df) + eff_children += eff_child + cost_children += cost_child + children_info.append( + ( + feature_split_values, + split_df, + eff_child, + cost_child, + actions, + ) + ) + + possible_splits.append( + (feature, eff_children, cost_children, children_info) + ) + + if len(possible_splits) == 0: + return node + 
+ possible_splits = sorted(possible_splits, key=lambda x: -x[1]) + node.split_feature = possible_splits[0][0] + split_features.remove(node.split_feature) + child_info = possible_splits[0][3] + + for child in child_info: + child_node = _partition_group( + child[1], split_features, child[2], child[3], child[4] + ) + + node.add_child(child[0], child_node) + + return node + + self.node = _partition_group(instances, self.split_features) + self.node_instances = instances + return self.node + + def cumulative_leaf_actions(self): + """ + Computes the total effectiveness and cost of actions taken from leaf nodes of the tree. + + Returns: + ------- + Tuple[float, float, int] + A tuple containing the total effectiveness, total cost, and the number of actions taken. + """ + eff, cost = cumulative( + self.model, + self.node_instances, + self.node.return_leafs_actions(), + self.dist_func_dataframe, + self.numerical_features_names, + self.categorical_features_names, + categorical_no_action_token="-", + ) + + print(f"\nTOTAL EFFECTIVENESS: {eff / self.node_instances.shape[0]:.2%}") + print(f"\nTOTAL COST: {(cost / eff if eff > 0 else 0):.2f}") + + return eff, cost, len(self.node.return_leafs_actions()) + + +def _split_list(lst, n): + """ + Splits a list into n approximately equal parts. + + Parameters: + ---------- + lst : list + The list to split. + n : int + The number of parts to split the list into. + + Returns: + ------- + Generator + A generator yielding the split parts of the list. + """ + k, m = divmod(len(lst), n) + return (lst[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + + +def _get_split_values(X, split_features, child_count): + """ + Generates split values for the specified features. + + Parameters: + ---------- + X : pd.DataFrame + The dataset containing features. + split_features : List[str] + The list of features to create split values for. + child_count : int + The number of child splits to create for each feature. 
class Node:
    """
    A class representing a node in a decision tree structure.

    Each node can have child nodes, actions associated with it, and metrics
    such as effectiveness and cost. This class provides methods to add child
    nodes, retrieve actions from leaf nodes, and visualize the tree structure.

    Attributes:
    ----------
    split_feature : str or None
        The feature used to split the data at this node. Default is None.
    actions : list or None
        A list of actions associated with this node. Default is None.
    effectiveness : float
        The effectiveness of the actions taken at this node. Default is 0.
    cost : float
        The total cost associated with the actions at this node. Default is 0.
    size : int
        The number of instances or data points at this node. Default is 0.
    children : dict
        A dictionary mapping from subgroup values (as tuples) to child nodes.
    """

    def __init__(
        self,
        split_feature=None,
        actions=None,
        effectiveness=0,
        cost=0,
        size=0,
    ):
        """
        Initializes a new Node instance.

        Parameters:
        ----------
        split_feature : str or None
            The feature used to split the data at this node. Default is None.
        actions : list or None
            A list of actions associated with this node. Default is None.
        effectiveness : float
            The effectiveness of the actions taken at this node. Default is 0.
        cost : float
            The total cost associated with the actions at this node. Default is 0.
        size : int
            The number of instances or data points at this node. Default is 0.
        """
        self.split_feature = split_feature
        self.effectiveness = effectiveness
        self.cost = cost
        self.size = size
        self.children = {}
        self.actions = actions

    def add_child(self, subgroup, child_node):
        """
        Adds a child node to this node.

        Parameters:
        ----------
        subgroup : iterable
            The feature values routed to the child; stored as a tuple key.
        child_node : Node
            The child node to be added.
        """
        self.children[tuple(subgroup)] = child_node

    def return_leafs_actions(self):
        """
        Returns all actions from the leaf nodes in the subtree rooted at this node.

        Leaves are visited depth-first in child insertion order. A leaf whose
        `actions` is None contributes nothing.

        Returns:
        -------
        list
            A flattened list of actions from the leaf nodes.
        """
        collected = []

        def visit(node):
            if not node.children:
                if node.actions is not None:
                    collected.extend(node.actions)
                return
            for child_node in node.children.values():
                visit(child_node)

        visit(self)
        return collected

    def _child_labels(self, numeric_features):
        """
        Return a `{subgroup: label}` mapping describing each child's branch.

        For a numeric split feature the children are ordered by their largest
        value: the lowest child is labelled `<= upper`, the highest
        `> previous upper`, and any child in between `(previous upper, upper]`.
        For other features the label is the subgroup itself, unwrapped when it
        holds a single value.
        """
        if self.split_feature not in numeric_features:
            return {
                value: (value if len(value) > 1 else value[0])
                for value in self.children
            }

        ordered = sorted(self.children, key=max)
        labels = {}
        previous_upper = None
        for position, value in enumerate(ordered):
            upper = max(value)
            if previous_upper is None:
                labels[value] = f"<= {upper}"
            elif position == len(ordered) - 1:
                labels[value] = f"> {previous_upper}"
            else:
                labels[value] = f"({previous_upper}, {upper}]"
            previous_upper = upper
        return labels

    def to_igraph(self, numeric_features=None):
        """
        Converts the tree structure to an igraph object for visualization.

        As a side effect every node of the subtree receives the attributes
        `id` (pre-order index), `data_feat` and `data_val`.

        Parameters:
        ----------
        numeric_features : list, optional
            A list of numeric feature names used for processing node labels.
            None is treated as an empty list.

        Returns:
        -------
        ig.Graph
            An igraph object representing the tree structure.
        """
        import igraph as ig

        numeric_features = [] if numeric_features is None else numeric_features

        def assign_ids(node, next_id):
            # Pre-order numbering; vertices are later added in the same order,
            # so a node's id equals its vertex index in the graph.
            node.id = next_id
            next_id += 1
            for child_node in node.children.values():
                next_id = assign_ids(child_node, next_id)
            return next_id

        assign_ids(self, 0)

        def annotate(node):
            labels = node._child_labels(numeric_features)
            for value, child_node in node.children.items():
                child_node.data_feat = node.split_feature
                child_node.data_val = labels[value]
                annotate(child_node)

        annotate(self)
        self.data_feat = "all"
        self.data_val = "-"

        graph = ig.Graph(directed=True)

        def add_nodes(node):
            size = node.size
            num_flipped = node.effectiveness
            # Guard the ratios so empty or ineffective nodes can still be drawn.
            eff = num_flipped / size if size else 0
            cost = node.cost / num_flipped if num_flipped else 0

            # The f-string `=` specifier prints the variable names, so the
            # locals above must keep the names eff / cost / size.
            label = f"{eff=:.2%}\n{cost=:.2f}\n{size=}\n"
            for action in node.actions or []:
                # Keep only the features the action changes ("-" marks "no change").
                changed = action[action != "-"].to_dict()
                rendered = {
                    k: (round(v, 3) if k in numeric_features else v)
                    for k, v in changed.items()
                }
                label += f"{rendered}\n"

            graph.add_vertex(
                node.id,
                label=label,
            )
            for child in node.children.values():
                add_nodes(child)

        add_nodes(self)

        def add_edges(node):
            for child in node.children.values():
                graph.add_edge(
                    node.id, child.id, label=f"{node.split_feature} {child.data_val}"
                )
                add_edges(child)

        add_edges(self)

        return graph

    def display_igraph_jupyter(self, numeric_features=None):
        """
        Displays the tree structure in a Jupyter notebook using matplotlib and igraph.

        Parameters:
        ----------
        numeric_features : list, optional
            A list of numeric feature names used for processing node labels.
            None is treated as an empty list.
        """
        import igraph as ig
        import matplotlib.pyplot as plt

        g = self.to_igraph(numeric_features=numeric_features)
        fig, ax = plt.subplots()
        fig.set_figheight(10)
        fig.set_figwidth(40)
        vertex_labels = g.vs["label"]
        edge_labels = g.es["label"]
        ig.plot(
            g,
            target=ax,
            layout="reingold_tilford",
            vertex_size=55,
            vertex_label=vertex_labels,
            edge_label=edge_labels,
            vertex_label_size=8.0,
        )
        ax.invert_yaxis()
class C_GLANCE(GlobalCounterfactualMethod):
    """
    A class for generating global counterfactual explanations using an iterative merging approach.

    The instances to explain are split into ``initial_clusters`` clusters,
    explanations are generated from the cluster centroids, and clusters are
    merged two at a time until at most ``final_clusters`` remain. One action
    is then selected per remaining cluster.

    Attributes:
    ----------
    model : Any
        The predictive model used for generating counterfactuals.
    initial_clusters : int
        The initial number of clusters to form.
    final_clusters : int
        The target number of clusters after merging.
    num_local_counterfactuals : int
        The number of local counterfactuals to generate for each cluster.
    heuristic_weights : Tuple[float, float]
        Weights used in the heuristic for merging clusters.
    alternative_merges : bool
        If True, allows alternative merging strategies.
    random_seed : int
        Seed for random number generation.
    verbose : bool
        If True, enables verbose output during processing.
    final_clustering : Optional[Dict[int, pd.DataFrame]]
        The final clustering of instances after merging. ``None`` until
        ``explain_group`` has run.
    cluster_results : Optional[Dict[int, Dict[str, Any]]]
        Per-cluster action, effectiveness, cost and size. ``None`` until
        ``explain_group`` has run.
    """

    def __init__(
        self,
        model: Any,
        initial_clusters: int = 100,
        final_clusters: int = 10,
        num_local_counterfactuals: int = 5,
        heuristic_weights: Tuple[float, float] = (0.5, 0.5),
        alternative_merges: bool = True,
        random_seed: int = 13,
        verbose=True,
    ) -> None:
        """
        Initializes the C_GLANCE instance.

        Parameters:
        ----------
        model : Any
            The predictive model used for generating counterfactuals.
        initial_clusters : int, optional
            The initial number of clusters to form. Default is 100.
        final_clusters : int, optional
            The target number of clusters after merging. Default is 10.
        num_local_counterfactuals : int, optional
            The number of local counterfactuals to generate for each cluster. Default is 5.
        heuristic_weights : Tuple[float, float], optional
            Weights used in the heuristic for merging clusters. Default is (0.5, 0.5).
        alternative_merges : bool, optional
            If True, allows alternative merging strategies. Default is True.
        random_seed : int, optional
            Seed for random number generation. Default is 13.
        verbose : bool, optional
            If True, enables verbose output during processing. Default is True.
        """
        super().__init__()
        self.model = model
        self.initial_clusters = initial_clusters
        self.final_clusters = final_clusters
        self.num_local_counterfactuals = num_local_counterfactuals
        self.heuristic_weights = heuristic_weights
        self.alternative_merges = alternative_merges
        self.random_seed = random_seed
        self.verbose = verbose
        self.final_clustering = None
        # ``explain_group`` and ``global_actions`` use ``cluster_results``;
        # initialise that name so the attribute always exists.
        self.cluster_results = None
        # Legacy misspelled attribute, kept so existing readers do not break.
        self.clusters_results = None

    def _set_features_names(
        self,
        X: pd.DataFrame,
        numerical_names: Optional[List[str]],
        categorical_names: Optional[List[str]]
    ) -> Tuple[List[str], List[str]]:
        """
        Sets the feature names for numerical and categorical features.

        Parameters:
        ----------
        X : pd.DataFrame
            The dataset to analyze.
        numerical_names : Optional[List[str]]
            List of numerical feature names. If None, they are inferred from X.
        categorical_names : Optional[List[str]]
            List of categorical feature names. If None, they are inferred from X.

        Returns:
        -------
        Tuple[List[str], List[str]]
            A tuple containing lists of numerical and categorical feature names.
        """
        if numerical_names is None and categorical_names is None:
            # Nothing given: numeric dtypes are numerical, the rest categorical.
            numerical_names = X.select_dtypes(include=["number"]).columns.tolist()
            categorical_names = X.columns.difference(numerical_names).tolist()
        elif numerical_names is None:
            # Only categorical names given: numerical is the complement.
            numerical_names = X.columns.difference(categorical_names).tolist()
        elif categorical_names is None:
            # Only numerical names given: categorical is the complement.
            categorical_names = X.columns.difference(numerical_names).tolist()

        return numerical_names, categorical_names

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        train_dataset: pd.DataFrame,
        feat_to_vary: Optional[Union[List[str], str]] = "all",
        numeric_features_names: Optional[List[str]] = None,
        categorical_features_names: Optional[List[str]] = None,
        clustering_method: Union[ClusteringMethod, Literal["KMeans"]] = "KMeans",
        cf_generator: Union[
            LocalCounterfactualMethod,
            Literal["Dice", "NearestNeighbors", "RandomSampling"]
        ] = "Dice",
        cluster_action_choice_algo: Literal["max-eff", "mean-act", "low-cost"] = "max-eff",
        nns__n_scalars: Optional[int] = None,
        rs__n_most_important: Optional[int] = None,
        rs__n_categorical_most_frequent: Optional[int] = None,
        lowcost__action_threshold: Optional[float] = None,
        lowcost__num_low_cost: Optional[int] = None,
        min_cost_eff_thres__effectiveness_threshold: Optional[float] = None,
        min_cost_eff_thres_combinations__num_min_cost: Optional[int] = None,
        eff_thres_hybrid__max_n_actions_full_combinations: Optional[int] = None,
    ) -> "C_GLANCE":
        """
        Fits the clustering and counterfactual generation model to the provided dataset.

        Parameters:
        ----------
        X : pd.DataFrame
            Features of the dataset.
        y : pd.Series
            Target variable.
        train_dataset : pd.DataFrame
            The training dataset used for local counterfactual generation methods.
        feat_to_vary : Optional[Union[List[str], str]], optional
            Features to vary in counterfactual generation. Default is "all".
        numeric_features_names : Optional[List[str]], optional
            List of numeric feature names. If None, they are inferred from X.
        categorical_features_names : Optional[List[str]], optional
            List of categorical feature names. If None, they are inferred from X.
        clustering_method : Union[ClusteringMethod, Literal["KMeans"]], optional
            The clustering method to use. Default is "KMeans".
        cf_generator : Union[LocalCounterfactualMethod, Literal["Dice", "NearestNeighbors", "RandomSampling"]], optional
            The local counterfactual generation method to use. Default is "Dice".
        cluster_action_choice_algo : Literal["max-eff", "mean-act", "low-cost"], optional
            The algorithm for selecting actions from clusters. Default is "max-eff".
        nns__n_scalars : Optional[int], optional
            Number of scalars for nearest neighbors. Falls back to 1000 when None.
        rs__n_most_important : Optional[int], optional
            Number of most important features for random sampling. Falls back
            to the number of columns of X when None.
        rs__n_categorical_most_frequent : Optional[int], optional
            Number of most frequent categorical values for random sampling.
            Falls back to 20 when None.
        lowcost__action_threshold : Optional[float], optional
            Action threshold for the low-cost method. Falls back to 1.5 when None.
        lowcost__num_low_cost : Optional[int], optional
            Number of low-cost actions to consider. Falls back to 20 when None.
        min_cost_eff_thres__effectiveness_threshold : Optional[float], optional
            Effectiveness threshold for minimum cost methods. Falls back to 0.1 when None.
        min_cost_eff_thres_combinations__num_min_cost : Optional[int], optional
            Number of minimum cost combinations to evaluate. Stored as given.
        eff_thres_hybrid__max_n_actions_full_combinations : Optional[int], optional
            Maximum number of actions for full combinations in hybrid
            thresholding. Falls back to 50 when None.

        Returns:
        -------
        C_GLANCE
            Returns the fitted instance.
        """
        self.numerical_features_names, self.categorical_features_names = self._set_features_names(
            X=X,
            numerical_names=numeric_features_names,
            categorical_names=categorical_features_names,
        )

        self.X = X
        self.y = y
        self.train_dataset = train_dataset
        self.clustering_method_ = clustering_method
        self.cluster_action_choice_algo = cluster_action_choice_algo

        # Optional tuning knobs: use the caller's value, else the default.
        self.action_threshold = (
            1.5 if lowcost__action_threshold is None else lowcost__action_threshold
        )
        self.num_low_cost = 20 if lowcost__num_low_cost is None else lowcost__num_low_cost
        self.effectiveness_threshold = (
            0.1
            if min_cost_eff_thres__effectiveness_threshold is None
            else min_cost_eff_thres__effectiveness_threshold
        )
        self.min_cost_eff_thres_combinations__num_min_cost = (
            min_cost_eff_thres_combinations__num_min_cost
        )
        # The previous condition was inverted: it kept None when no value was
        # given and replaced every user-supplied value with 50.
        self.eff_thres_hybrid__max_n_actions_full_combinations = (
            50
            if eff_thres_hybrid__max_n_actions_full_combinations is None
            else eff_thres_hybrid__max_n_actions_full_combinations
        )
        self.n_scalars = 1000 if nns__n_scalars is None else nns__n_scalars
        self.n_most_important = (
            len(X.columns) if rs__n_most_important is None else rs__n_most_important
        )
        self.n_categorical_most_frequent = (
            20 if rs__n_categorical_most_frequent is None else rs__n_categorical_most_frequent
        )

        self.cf_generator = _decide_local_cf_method(
            method=cf_generator,
            model=self.model,
            train_dataset=self.train_dataset,
            numeric_features_names=self.numerical_features_names,
            categorical_features_names=self.categorical_features_names,
            feat_to_vary=feat_to_vary,
            random_seed=self.random_seed,
            n_scalars=self.n_scalars,
            n_most_important=self.n_most_important,
            n_categorical_most_frequent=self.n_categorical_most_frequent,
        )

        self.dist_func_dataframe = build_dist_func_dataframe(
            X=X,
            numerical_columns=self.numerical_features_names,
            categorical_columns=self.categorical_features_names,
        )
        return self

    def explain_group(
        self, instances: pd.DataFrame
    ) -> Tuple[int, float]:
        """
        Explains the group of instances by generating counterfactuals based on clustering.

        Side effects: may lower ``self.initial_clusters`` to the number of
        instances, and stores ``self.clustering_method``,
        ``self.final_clustering`` and ``self.cluster_results``.

        Parameters:
        ----------
        instances : pd.DataFrame
            The group of instances to explain.

        Returns:
        -------
        Tuple[int, float]
            The values returned by ``cumulative`` for the selected actions:
            the number of instances that obtain recourse and the summed cost
            over those instances.
        """
        n_instances = instances.shape[0]
        if self.initial_clusters > n_instances:
            warnings.warn(
                "Requested number of initial clusters is larger than the number of instances to explain. Setting to number of instances."
            )
            self.initial_clusters = n_instances

        self.clustering_method = _decide_cluster_method(
            self.clustering_method_, self.initial_clusters, self.random_seed
        )

        clusters = _generate_clusters(
            instances=instances,
            num_clusters=self.initial_clusters,
            categorical_features_names=self.categorical_features_names,
            clustering_method=self.clustering_method,
        )

        cluster_centroids = {
            i: centroid_pandas(
                X=members,
                numerical_columns=self.numerical_features_names,
                categorical_columns=self.categorical_features_names,
            )
            for i, members in clusters.items()
        }

        cluster_explanations, cluster_expl_actions, explanations_centroid = (
            generate_cluster_centroid_explanations(
                cluster_centroids=cluster_centroids,
                cf_generator=self.cf_generator,
                num_local_counterfactuals=self.num_local_counterfactuals,
                numerical_features_names=self.numerical_features_names,
                categorical_features_names=self.categorical_features_names,
            )
        )
        # Drop clusters for which no explanation was produced.
        clusters = {i: c for i, c in clusters.items() if i in cluster_explanations}
        cluster_centroids = {
            i: c for i, c in cluster_centroids.items() if i in cluster_explanations
        }

        # Merge two clusters at a time until the target count is reached.
        while len(clusters) > self.final_clusters:
            cluster1, cluster2 = _find_candidate_clusters(
                clusters=clusters,
                cluster_centroids=cluster_centroids,
                explanations_centroid=explanations_centroid,
                heuristic_weights=self.heuristic_weights,
                dist_func_dataframe=self.dist_func_dataframe,
            )

            _merge_clusters(
                cluster1=cluster1,
                cluster2=cluster2,
                clusters=clusters,
                cluster_explanations=cluster_explanations,
                cluster_centroids=cluster_centroids,
                cluster_expl_actions=cluster_expl_actions,
                explanations_centroid=explanations_centroid,
                numerical_features_names=self.numerical_features_names,
                categorical_features_names=self.categorical_features_names,
            )

        # Only the per-cluster statistics are used here; the overall numbers
        # are recomputed below with ``cumulative``.
        clusters_res, _, _ = cluster_results(
            model=self.model,
            instances=instances,
            clusters=clusters,
            cluster_expl_actions=cluster_expl_actions,
            dist_func_dataframe=self.dist_func_dataframe,
            numerical_features_names=self.numerical_features_names,
            categorical_features_names=self.categorical_features_names,
            cluster_action_choice_algo=self.cluster_action_choice_algo,
            action_threshold=self.action_threshold,
            num_low_cost=self.num_low_cost,
            effectiveness_threshold=self.effectiveness_threshold,
            num_min_cost=self.min_cost_eff_thres_combinations__num_min_cost,
            max_n_actions_full_combinations=self.eff_thres_hybrid__max_n_actions_full_combinations,
        )

        for i, stats in clusters_res.items():
            stats["size"] = clusters[i].shape[0]

        if self.verbose:
            format_glance_output(
                cluster_stats=clusters_res,
                categorical_columns=self.categorical_features_names,
            )

        eff, cost = cumulative(
            self.model,
            instances,
            [stats["action"] for stats in clusters_res.values()],
            self.dist_func_dataframe,
            self.numerical_features_names,
            self.categorical_features_names,
            "-",
        )
        if self.verbose:
            # Guard both ratios: with no instances or no recourse the plain
            # divisions fail or emit a warning.
            eff_ratio = eff / n_instances if n_instances else 0.0
            mean_cost = cost / eff if eff else float("nan")
            print(f"{Style.BRIGHT}TOTAL EFFECTIVENESS:{Style.RESET_ALL} {Fore.GREEN}{eff_ratio:.2%}{Fore.RESET}")
            print(f"{Style.BRIGHT}TOTAL COST:{Style.RESET_ALL} {Fore.MAGENTA}{mean_cost:.2f}{Fore.RESET}")

        self.final_clustering = clusters
        self.cluster_results = clusters_res

        return eff, cost

    def global_actions(self):
        """
        Retrieves the action selected for each final cluster.

        Returns:
        -------
        list
            The ``"action"`` entry of every cluster in ``self.cluster_results``.

        Raises:
        ------
        AttributeError
            If ``explain_group`` has not been called yet.
        """
        if self.cluster_results is None:
            # Same exception type as before, with an actionable message.
            raise AttributeError(
                "No cluster results available; call explain_group() first."
            )
        return [stats["action"] for stats in self.cluster_results.values()]
def cumulative(
    model,
    instances,
    actions,
    dist_func_dataframe,
    numeric_features_names,
    categorical_features_names,
    categorical_no_action_token,
):
    """
    Computes the cumulative effectiveness and cost of applying a set of actions
    to a given set of instances using a predictive model.

    Every action is applied to every instance. For each instance the cheapest
    action whose prediction is non-zero is kept.

    Parameters:
    ----------
    model : Any
        A predictive model with a ``predict`` method.
    instances : pd.DataFrame
        The instances to which the actions are applied.
    actions : list
        The actions to evaluate; each one is passed to ``apply_action_pandas``.
    dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series]
        Returns the per-row distance between two DataFrames.
    numeric_features_names : List[str]
        Names of the numeric features.
    categorical_features_names : List[str]
        Names of the categorical features.
    categorical_no_action_token : Any
        Token representing "no change" for categorical features.

    Returns:
    -------
    Tuple[int, float]
        - effectiveness: number of instances for which at least one action
          yields a non-zero prediction.
        - cost: sum, over those instances, of the minimum cost among the
          actions that work for them.
        ``(0, 0.0)`` is returned when ``actions`` is empty.
    """
    costs = []
    for action in actions:
        applied_df = apply_action_pandas(
            instances,
            action,
            numeric_features_names,
            categorical_features_names,
            categorical_no_action_token,
        )
        predictions = model.predict(applied_df)
        # Indices are reset so the distance is computed positionally.
        cur_costs = dist_func_dataframe(
            instances.reset_index(drop=True), applied_df.reset_index(drop=True)
        )
        # An action that does not flip an instance is unusable for it.
        cur_costs[predictions == 0] = np.inf
        costs.append(cur_costs)

    if not costs:
        return 0, 0.0

    # One column per action; the row-wise minimum is the best action per instance.
    final_costs = np.column_stack(costs).min(axis=1)
    reached = final_costs != np.inf
    effectiveness = reached.sum()
    cost = final_costs[reached].sum()

    return effectiveness, cost
+ final_costs = np.column_stack(costs).min(axis=1) + effectiveness = (final_costs != np.inf).sum() + cost = final_costs[final_costs != np.inf].sum() + + return effectiveness, cost + + +def action_fake_cost( + action: pd.Series, + numerical_features_names: List[str], + categorical_features_names: List[str], +): + return ( + action[numerical_features_names].sum() + + (action[categorical_features_names] != "-").sum() + ) + + +def _select_action_low_cost( + model: Any, + instances: pd.DataFrame, + cluster_instances: pd.DataFrame, + candidate_actions: pd.DataFrame, + dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series], + numerical_features_names: List[str], + categorical_features_names: List[str], + action_threshold: int, + num_low_cost: int, + inv_total_clusters: int, +): + """ + Selects the action with the lowest cost that flips a sufficient number of instances + in the given dataset, based on a predictive model. + + This function evaluates candidate actions, applies them to the provided instances, + and calculates the number of predictions that were flipped as a result. It returns + the action that results in the lowest recourse cost while also meeting a specified + threshold of flipped predictions. + + Parameters: + ---------- + model : Any + A machine learning model used for making predictions. + + instances : pd.DataFrame + A DataFrame containing the instances for which counterfactuals are being generated. + + cluster_instances : pd.DataFrame + A DataFrame containing instances from a specific cluster used for evaluating actions. + + candidate_actions : pd.DataFrame + A DataFrame containing potential actions to apply to the instances. + + dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series] + A function that computes the distance or cost between two DataFrames. + + numerical_features_names : List[str] + A list of names for the numerical features in the instances. 
+ + categorical_features_names : List[str] + A list of names for the categorical features in the instances. + + action_threshold : int + The minimum ratio of flipped predictions to total instances required to consider + an action effective. + + num_low_cost : int + The maximum number of low-cost actions to evaluate. + + inv_total_clusters : int + The inverse of the total number of clusters used for normalization. + + Returns: + ------- + Tuple[int, float, pd.Series] + A tuple containing: + - The number of predictions flipped. + - The minimum recourse cost associated with the best action. + - The best action selected from the candidate actions. + + Raises: + ------ + ValueError + If no actions are found that meet the effectiveness threshold. + """ + actions_list = [action for _, action in candidate_actions.iterrows()] + actions_list.sort( + key=lambda action: action_fake_cost( + action, numerical_features_names, categorical_features_names + ) + ) + cf_list = [] + for action in actions_list[: min(num_low_cost, len(actions_list))]: + cfs = apply_action_pandas( + X=instances, + action=action, + numerical_columns=numerical_features_names, + categorical_columns=categorical_features_names, + categorical_no_action_token="-", + ) + predictions: np.ndarray = model.predict(cfs) + n_flipped = predictions.sum() + + if n_flipped > (action_threshold * inv_total_clusters) * len(instances): + cfs = apply_action_pandas( + X=cluster_instances, + action=action, + numerical_columns=numerical_features_names, + categorical_columns=categorical_features_names, + categorical_no_action_token="-", + ) + predictions: np.ndarray = model.predict(cfs) + n_flipped = predictions.sum() + factuals_flipped = cluster_instances[predictions == 1] + cfs_flipped = cfs[predictions == 1] + recourse_cost_sum = dist_func_dataframe(factuals_flipped, cfs_flipped).sum() + cf_list.append((n_flipped, recourse_cost_sum, action)) + + if len(cf_list) == 0: + raise ValueError( + "Change action_threshold. 
def actions_cumulative_eff_cost(
    model: Any,
    X: pd.DataFrame,
    actions_with_costs: List[Tuple[pd.Series, float]],
    dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series],
    numerical_columns: List[str],
    categorical_columns: List[str],
    categorical_no_action_token: Any,
) -> Tuple[float, float]:
    """
    Evaluates the cumulative effectiveness and cost of applying a sequence of
    actions to a dataset using a predictive model.

    Actions are tried in ascending order of their attached cost. Each action
    is applied only to the instances that no earlier action has flipped, so
    every instance is counted at most once.

    Parameters:
    ----------
    model : Any
        A model with a ``predict`` method.
    X : pd.DataFrame
        The instances to which actions are applied. Not modified.
    actions_with_costs : List[Tuple[pd.Series, float]]
        ``(action, cost)`` pairs; the cost is used only for ordering.
    dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series]
        Returns the per-row distance between two DataFrames.
    numerical_columns : List[str]
        Names of the numerical columns.
    categorical_columns : List[str]
        Names of the categorical columns.
    categorical_no_action_token : Any
        Token representing "no change" for categorical features.

    Returns:
    -------
    Tuple[float, float]
        - The total number of predictions flipped across all actions.
        - The total recourse cost of the flipped instances.
    """
    remaining = X.copy()
    n_flipped_total = 0
    recourse_cost_sum = 0
    for action, _old_cost in sorted(actions_with_costs, key=lambda t: t[1]):
        if remaining.empty:
            # Nothing left to flip; also avoids calling ``predict`` on an
            # empty frame, which many estimators reject.
            break
        cfs = apply_action_pandas(
            X=remaining,
            action=action,
            numerical_columns=numerical_columns,
            categorical_columns=categorical_columns,
            categorical_no_action_token=categorical_no_action_token,
        )
        predictions = model.predict(cfs)
        flipped = predictions == 1
        n_flipped_total += predictions.sum()
        recourse_cost_sum += dist_func_dataframe(remaining[flipped], cfs[flipped]).sum()
        # Only still-unflipped instances go on to the next action.
        remaining = remaining[predictions == 0]

    return n_flipped_total, recourse_cost_sum
+ - The total recourse cost incurred from applying the actions. + """ + X = X.copy() + actions_with_costs = sorted(actions_with_costs, key=lambda t: t[1]) + n_flipped_total = 0 + recourse_cost_sum = 0 + for action, _old_cost in actions_with_costs: + cfs = apply_action_pandas( + X=X, + action=action, + numerical_columns=numerical_columns, + categorical_columns=categorical_columns, + categorical_no_action_token=categorical_no_action_token, + ) + predictions: np.ndarray = model.predict(cfs) + n_flipped_total += predictions.sum() + factuals_flipped = X[predictions == 1] + cfs_flipped = cfs[predictions == 1] + recourse_cost_sum += dist_func_dataframe(factuals_flipped, cfs_flipped).sum() + X = X[predictions == 0] + + return n_flipped_total, recourse_cost_sum + +def _select_action_max_eff( + model: Any, + instances: pd.DataFrame, + candidate_actions: pd.DataFrame, + dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series], + numerical_features_names: List[str], + categorical_features_names: List[str], + num_actions: int = 1, +) -> Tuple[int, int, pd.Series]: + """ + Selects actions based on maximizing the effectiveness. + + This function evaluates a set of candidate actions by applying each action to the given + instances, predicting the outcomes, and calculating the number of predictions that are + flipped (changed from 0 to 1). It also computes the recourse cost associated with each action. + Depending on the number of actions specified, it returns either the best action or a list + of the top actions based on effectiveness. + + Parameters: + ---------- + model : Any + A machine learning model used for making predictions on the modified instances. + + instances : pd.DataFrame + The DataFrame of original instances to which actions will be applied. + + candidate_actions : pd.DataFrame + A DataFrame containing the candidate actions to evaluate. 
def _select_action_mean(
    model: Any,
    instances: pd.DataFrame,
    candidate_actions: pd.DataFrame,
    dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series],
    numerical_features_names: List[str],
    categorical_features_names: List[str],
) -> Tuple[int, int, pd.Series]:
    """
    Builds the mean of the candidate actions and evaluates it on the instances.

    Parameters:
    ----------
    model : Any
        A model with a ``predict`` method.
    instances : pd.DataFrame
        The instances to which the mean action is applied.
    candidate_actions : pd.DataFrame
        The candidate actions that are averaged with ``actions_mean_pandas``.
    dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series]
        Returns the per-row distance between two DataFrames.
    numerical_features_names : List[str]
        Names of the numerical columns.
    categorical_features_names : List[str]
        Names of the categorical columns.

    Returns:
    -------
    Tuple[int, int, pd.Series]
        - The sum of the predictions after applying the mean action.
        - The summed distance between flipped instances and their counterfactuals.
        - The mean action itself.
    """
    averaged_action = actions_mean_pandas(
        actions=candidate_actions,
        numerical_features=numerical_features_names,
        categorical_features=categorical_features_names,
        categorical_no_action_token="-",
    )
    counterfactuals = apply_action_pandas(
        X=instances,
        action=averaged_action,
        numerical_columns=numerical_features_names,
        categorical_columns=categorical_features_names,
        categorical_no_action_token="-",
    )

    outcome = model.predict(counterfactuals)
    flipped_count = outcome.sum()

    # Cost is measured only on the rows whose prediction became 1.
    originals_flipped = instances[outcome == 1]
    counterfactuals_flipped = counterfactuals[outcome == 1]
    total_cost = dist_func_dataframe(originals_flipped, counterfactuals_flipped).sum()

    return flipped_count, total_cost, averaged_action
def cluster_results(
    model: Any,
    instances: pd.DataFrame,
    clusters: Dict[int, pd.DataFrame],
    cluster_expl_actions: Dict[int, pd.DataFrame],
    dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series],
    numerical_features_names: List[str],
    categorical_features_names: List[str],
    cluster_action_choice_algo: Literal[
        "max-eff",
        "mean-act",
        "low-cost",
        "min-cost-eff-thres-combinations",
        "eff-thres-hybrid",
    ] = "max-eff",
    action_threshold: float = 2,
    num_low_cost: int = 20,
    effectiveness_threshold: float = 0.1,
    num_min_cost: Optional[int] = None,
    max_n_actions_full_combinations: int = 50,
) -> Tuple[Dict[int, Dict[str, Any]], float, float]:
    """
    Evaluates and selects actions for each cluster based on a specified action choice algorithm.

    Parameters:
    ----------
    model : Any
        A model with a ``predict`` method.
    instances : pd.DataFrame
        All instances being explained.
    clusters : Dict[int, pd.DataFrame]
        Maps cluster IDs to the instances of each cluster.
    cluster_expl_actions : Dict[int, pd.DataFrame]
        Maps cluster IDs to the candidate actions of each cluster.
    dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series]
        Returns the per-row distance between two DataFrames.
    numerical_features_names : List[str]
        Names of the numerical columns.
    categorical_features_names : List[str]
        Names of the categorical columns.
    cluster_action_choice_algo : str
        How the action of each cluster is chosen:
        - "max-eff": the candidate with maximum effectiveness.
        - "mean-act": the mean of the candidate actions.
        - "low-cost": the cheapest candidate above ``action_threshold``.
        - "min-cost-eff-thres-combinations": one joint choice over all
          clusters; per-cluster effectiveness and cost are reported as NaN.
        - "eff-thres-hybrid": not implemented.
    action_threshold : float
        Threshold forwarded to the "low-cost" selection.
    num_low_cost : int
        Number of low-cost actions to consider ("low-cost" only).
    effectiveness_threshold : float
        Minimum effectiveness ("min-cost-eff-thres-combinations" only).
    num_min_cost : Optional[int]
        Number of minimum cost actions ("min-cost-eff-thres-combinations" only).
    max_n_actions_full_combinations : int
        Not used by this function.

    Returns:
    -------
    Tuple[Dict[int, Dict[str, Any]], float, float]
        - Per cluster ID, a dictionary with "action", "effectiveness" and "cost".
        - Overall fraction of instances flipped (0.0 when there are no instances).
        - Overall mean recourse cost per flipped instance (NaN when nothing flipped).

    Raises:
    ------
    NotImplementedError
        If ``cluster_action_choice_algo`` is "eff-thres-hybrid".
    ValueError
        If ``cluster_action_choice_algo`` is not a known algorithm.
    """
    # Validate up front so a bad name is reported even when ``clusters`` is
    # empty. Previously "eff-thres-hybrid" always ended in ZeroDivisionError.
    if cluster_action_choice_algo == "eff-thres-hybrid":
        raise NotImplementedError(
            "The 'eff-thres-hybrid' action choice algorithm is not implemented"
        )
    if cluster_action_choice_algo not in (
        "max-eff",
        "mean-act",
        "low-cost",
        "min-cost-eff-thres-combinations",
    ):
        raise ValueError(
            "Unsupported algorithm for choice of final action for each cluster"
        )

    if cluster_action_choice_algo == "min-cost-eff-thres-combinations":
        # The actions of all clusters are chosen jointly.
        n_flipped_total, total_recourse_cost_sum, action_set = _select_action_min_cost_eff_thres_combinations(
            model=model,
            instances=instances,
            clusters=clusters,
            candidate_actions=cluster_expl_actions,
            dist_func_dataframe=dist_func_dataframe,
            numerical_features_names=numerical_features_names,
            categorical_features_names=categorical_features_names,
            effectiveness_threshold=effectiveness_threshold,
            num_min_cost=num_min_cost,
        )

        assert len(action_set) == len(clusters)
        # Actions are paired with the clusters in iteration order.
        ret_clusters = {
            i: {"action": action, "effectiveness": np.nan, "cost": np.nan}
            for i, action in zip(clusters, action_set)
        }
        n_individuals_total = instances.shape[0]
    else:
        n_flipped_total = 0
        total_recourse_cost_sum = 0
        ret_clusters = {}
        for i, cluster in clusters.items():
            if cluster_action_choice_algo == "max-eff":
                n_flipped, recourse_cost_sum, selected_action = _select_action_max_eff(
                    model=model,
                    instances=cluster,
                    candidate_actions=cluster_expl_actions[i],
                    dist_func_dataframe=dist_func_dataframe,
                    numerical_features_names=numerical_features_names,
                    categorical_features_names=categorical_features_names,
                )
            elif cluster_action_choice_algo == "mean-act":
                n_flipped, recourse_cost_sum, selected_action = _select_action_mean(
                    model=model,
                    instances=cluster,
                    candidate_actions=cluster_expl_actions[i],
                    dist_func_dataframe=dist_func_dataframe,
                    numerical_features_names=numerical_features_names,
                    categorical_features_names=categorical_features_names,
                )
            else:  # "low-cost"
                n_flipped, recourse_cost_sum, selected_action = _select_action_low_cost(
                    model=model,
                    instances=instances,
                    cluster_instances=cluster,
                    candidate_actions=cluster_expl_actions[i],
                    dist_func_dataframe=dist_func_dataframe,
                    numerical_features_names=numerical_features_names,
                    categorical_features_names=categorical_features_names,
                    action_threshold=action_threshold,
                    num_low_cost=num_low_cost,
                    inv_total_clusters=(1 / len(clusters)),
                )

            ret_clusters[i] = {
                "action": selected_action,
                "effectiveness": n_flipped / cluster.shape[0],
                # No flipped instance means there is no mean cost to report.
                "cost": recourse_cost_sum / n_flipped if n_flipped else np.nan,
            }
            n_flipped_total += n_flipped
            total_recourse_cost_sum += recourse_cost_sum

        n_individuals_total = sum(cluster.shape[0] for cluster in clusters.values())

    total_effectiveness_percentage = (
        n_flipped_total / n_individuals_total if n_individuals_total else 0.0
    )
    total_mean_recourse_cost = (
        total_recourse_cost_sum / n_flipped_total if n_flipped_total else np.nan
    )
    return ret_clusters, total_effectiveness_percentage, total_mean_recourse_cost
def print_results(
    clusters_stats: Dict[int, Dict[str, numbers.Number]],
    total_effectiveness: float,
    total_cost: float,
):
    """
    Prints size, action, effectiveness and cost for every cluster.

    Clusters are numbered from 1 in the iteration order of ``clusters_stats``;
    the dictionary keys themselves are not shown.

    Parameters:
    ----------
    clusters_stats : Dict[int, Dict[str, numbers.Number]]
        Maps cluster IDs to a dictionary with the keys "size", "action",
        "effectiveness" and "cost".

    total_effectiveness : float
        Accepted for interface compatibility; not printed by this function.

    total_cost : float
        Accepted for interface compatibility; not printed by this function.
    """
    for i, stats in enumerate(clusters_stats.values()):
        header = f"CLUSTER {i + 1} with size {stats['size']}:"
        print(header)

        # The action is shown as a one-row table.
        action_table = pd.DataFrame(stats["action"]).T
        display(action_table)

        summary = f"Effectiveness: {stats['effectiveness']:.2%}, Cost: {stats['cost']:.2f}"
        print(summary)
def format_glance_output(
    cluster_stats: Dict[int, Dict[str, numbers.Number]],
    categorical_columns: List[str],
):
    """
    Prints a colorized summary of the action selected for each cluster.

    For every cluster, the non-trivial components of its action are listed
    (categorical features as ``feature = value``, numerical features as a signed
    increment), followed by the effectiveness and cost of that action.

    Parameters:
    ----------
    cluster_stats : Dict[int, Dict[str, numbers.Number]]
        A dictionary where keys are cluster IDs and values are dictionaries that
        contain at least the keys:
        - "action": a pd.Series mapping feature names to the action component.
        - "effectiveness": the effectiveness of the action in the cluster.
        - "cost": the cost associated with the action.

    categorical_columns : List[str]
        Names of the categorical features. Every other feature of the action is
        treated as numerical.
    """
    cluster_res = pd.DataFrame(cluster_stats)
    for index, row in cluster_res.T.reset_index(drop=True).iterrows():
        output_string = f"{Style.BRIGHT}Action {index+1} \n{Style.RESET_ALL}"
        # Series.items() yields (feature name, scalar value) pairs.
        # DataFrame.iteritems(), used previously, was removed in pandas 2.0.
        for column_name, value in row["action"].items():
            # "-" is the no-action token: the feature is left untouched.
            if not (value != "-"):
                continue
            if column_name in categorical_columns:
                output_string += f"{Style.BRIGHT}{column_name}{Style.RESET_ALL} = {Fore.RED}{value}{Fore.RESET} \n"
            elif value > 0:
                # Positive increments get an explicit "+" sign.
                output_string += f"{Style.BRIGHT}{column_name}{Style.RESET_ALL} +{Fore.RED}{value}{Fore.RESET} \n"
            elif value < 0:
                output_string += f"{Style.BRIGHT}{column_name}{Style.RESET_ALL} {Fore.RED}{value}{Fore.RESET} \n"
            # A numerical increment of exactly 0 changes nothing and is not printed.
        print(output_string)
        print(f"{Style.BRIGHT}Effectiveness:{Style.RESET_ALL} {Fore.GREEN}{row['effectiveness']:.2%}{Fore.RESET}\t{Style.BRIGHT}Cost:{Style.RESET_ALL} {Fore.MAGENTA}{row['cost']:.2f}{Fore.RESET}")
        print("\n")
+ """ + clusters[cluster2] = pd.concat( + [clusters[cluster2], clusters[cluster1]], ignore_index=True + ) + del clusters[cluster1] + + cluster_explanations[cluster2] = pd.concat( + [cluster_explanations[cluster2], cluster_explanations[cluster1]], + ignore_index=True, + ) + del cluster_explanations[cluster1] + + explanations_centroid[cluster2] = centroid_pandas( + cluster_explanations[cluster2], + numerical_columns=numerical_features_names, + categorical_columns=categorical_features_names, + ) + del explanations_centroid[cluster1] + + cluster_expl_actions[cluster2] = pd.concat( + [cluster_expl_actions[cluster2], cluster_expl_actions[cluster1]], + ignore_index=True, + ) + del cluster_expl_actions[cluster1] + + cluster_centroids[cluster2] = centroid_pandas( + clusters[cluster2], + numerical_columns=numerical_features_names, + categorical_columns=categorical_features_names, + ) + del cluster_centroids[cluster1] + + +def _find_candidate_clusters( + clusters: Dict[int, pd.DataFrame], + cluster_centroids: Dict[int, pd.DataFrame], + explanations_centroid: Dict[int, pd.DataFrame], + heuristic_weights: Tuple[float, float], + dist_func_dataframe: Callable[[pd.DataFrame, pd.DataFrame], pd.Series], +) -> Tuple[int, int]: + """ + Identifies the best candidate clusters for merging based on distances of centroids + and explanation centroids, weighted by given heuristic values. + + The function selects the smallest cluster and calculates distances to all other clusters' centroids. + It uses these distances to determine a heuristic value for potential merges, returning the two + clusters with the best merge heuristic. + + Parameters: + ---------- + clusters : Dict[int, pd.DataFrame] + A dictionary mapping cluster IDs to their respective dataframes. + + cluster_centroids : Dict[int, pd.DataFrame] + A dictionary mapping cluster IDs to their centroid dataframes. + + explanations_centroid : Dict[int, pd.DataFrame] + A dictionary mapping cluster IDs to their explanation centroids. 
+ + heuristic_weights : Tuple[float, float] + A tuple containing two weights used to combine centroid distances and explanation centroid distances. + + dist_func_dataframe : Callable[[pd.DataFrame, pd.DataFrame], pd.Series] + A function that computes the distance between two dataframes, returning a series of distances. + + Returns: + ------- + Tuple[int, int] + A tuple containing the IDs of the two candidate clusters identified for merging. + """ + clusters_idx = clusters.keys() + + smallest_cluster = min(clusters_idx, key=lambda i: (clusters[i].shape[0], i)) + smallest_expl_centroid_repeat = pd.concat( + [explanations_centroid[smallest_cluster]] * (len(clusters) - 1), + ignore_index=True, + ) + expl_centroids_rest = pd.concat( + [explanations_centroid[i] for i in clusters_idx if i != smallest_cluster], + ignore_index=True, + ) + explanations_centroid_distances = dist_func_dataframe( + smallest_expl_centroid_repeat, + expl_centroids_rest, + ) + smallest_centroid_repeat = pd.concat( + [cluster_centroids[smallest_cluster]] * (len(clusters) - 1), ignore_index=True + ) + centroids_rest = pd.concat( + [cluster_centroids[i] for i in clusters_idx if i != smallest_cluster], + ignore_index=True, + ) + cluster_centroids_distances = dist_func_dataframe( + smallest_centroid_repeat, + centroids_rest, + ) + merge_heuristic_values = ( + heuristic_weights[0] * cluster_centroids_distances + + heuristic_weights[1] * explanations_centroid_distances + ) + candidates = [ + (smallest_cluster, cluster1) + for cluster1 in clusters_idx + if cluster1 != smallest_cluster + ] + candidates = [ + (c1, c2, merge_heuristic_values.iloc[i]) + for i, (c1, c2) in enumerate(candidates) + ] + + candidates.sort(key=lambda x: (x[2], x[1])) + + return candidates[0][0], candidates[0][1] + + +def _generate_clusters( + instances: pd.DataFrame, + num_clusters: int, + categorical_features_names: List[str], + clustering_method: ClusteringMethod, +) -> Dict[int, pd.DataFrame]: + """ + Generates clusters from the 
given instances using the specified clustering method. + + The function applies one-hot encoding to the categorical features in the input data, + fits the provided clustering method, and assigns instances to clusters. It returns + a dictionary mapping cluster IDs to their respective dataframes. + + Parameters: + ---------- + instances : pd.DataFrame + The input data containing instances to be clustered. + + num_clusters : int + The desired number of clusters to generate. Note that the actual number of + clusters may vary depending on the clustering method used. + + categorical_features_names : List[str] + A list of names of categorical features in the input data that need to be + one-hot encoded for clustering. + + clustering_method : ClusteringMethod + An instance of a clustering method (e.g., KMeans, DBSCAN) that implements + the fit and predict methods. + + Returns: + ------- + Dict[int, pd.DataFrame] + A dictionary where the keys are cluster IDs and the values are dataframes + containing the instances assigned to each cluster. + """ + ohe_instances = _one_hot_encode(instances, categorical_features_names) + clustering_method.fit(ohe_instances) + assigned_clusters = clustering_method.predict(ohe_instances) + + cluster_ids = np.unique(assigned_clusters) + cluster_ids.sort() + clusters = {i: instances.iloc[assigned_clusters == i] for i in cluster_ids} + + return clusters + + +def _one_hot_encode(X: pd.DataFrame, categorical_columns: List[str]) -> pd.DataFrame: + """ + Applies one-hot encoding to the specified categorical columns of a DataFrame. + + This function transforms categorical columns in the input DataFrame into + a one-hot encoded format, allowing them to be used in machine learning models. + The non-categorical columns are retained in their original form. + + Parameters: + ---------- + X : pd.DataFrame + The input DataFrame containing the data with both categorical and numerical features. 
def generate_cluster_centroid_explanations(
    cluster_centroids: Dict[int, pd.DataFrame],
    cf_generator: LocalCounterfactualMethod,
    num_local_counterfactuals: int,
    numerical_features_names: List[str],
    categorical_features_names: List[str],
) -> Tuple[Dict[int, pd.DataFrame], Dict[int, pd.DataFrame], Dict[int, pd.DataFrame]]:
    """
    Generates explanations for cluster centroids by creating counterfactual instances
    for each centroid and extracting corresponding actions and explanations.

    Clusters whose centroid yields no counterfactuals are dropped from all three
    returned dictionaries.

    Parameters:
    ----------
    cluster_centroids : Dict[int, pd.DataFrame]
        A dictionary where keys are cluster identifiers and values are DataFrames
        representing the centroids of each cluster.
    cf_generator : LocalCounterfactualMethod
        An instance of a LocalCounterfactualMethod used to generate counterfactuals.
    num_local_counterfactuals : int
        The number of counterfactuals to request for each cluster centroid. The
        generator may return fewer than requested.
    numerical_features_names : List[str]
        A list of names for numerical features in the dataset.
    categorical_features_names : List[str]
        A list of names for categorical features in the dataset.

    Returns:
    -------
    Tuple[Dict[int, pd.DataFrame], Dict[int, pd.DataFrame], Dict[int, pd.DataFrame]]
        A tuple containing three dictionaries:
        - cluster_explanations: A dictionary of counterfactuals for each cluster centroid.
        - cluster_expl_actions: A dictionary of extracted actions for the generated counterfactuals.
        - explanations_centroid: A dictionary of centroid explanations based on the generated counterfactuals.

    Raises:
    -------
    ValueError
        If no counterfactuals are found for any of the centroids.
    """
    cluster_explanations = {}
    for cluster_id, centroid in cluster_centroids.items():
        cfs = cf_generator.explain_instances(centroid, num_local_counterfactuals)
        if not cfs.empty:
            cluster_explanations[cluster_id] = cfs

    if not cluster_explanations:
        raise ValueError("No counterfactuals found for any of the centroids.")

    # The centroid is repeated once per counterfactual actually returned (which
    # may be fewer than requested) and aligned on the counterfactuals' index so
    # that actions are extracted row by row.
    cluster_expl_actions = {
        cluster_id: extract_actions_pandas(
            X=pd.concat([cluster_centroids[cluster_id]] * cfs.shape[0]).set_index(
                cfs.index
            ),
            cfs=cfs,
            categorical_features=categorical_features_names,
            numerical_features=numerical_features_names,
            categorical_no_action_token="-",
        )
        for cluster_id, cfs in cluster_explanations.items()
    }

    explanations_centroid = {
        cluster_id: centroid_pandas(
            X=cfs,
            numerical_columns=numerical_features_names,
            categorical_columns=categorical_features_names,
        )
        for cluster_id, cfs in cluster_explanations.items()
    }

    return cluster_explanations, cluster_expl_actions, explanations_centroid
+ + Attributes: + ---------- + cf_generator : None or dice_ml.Dice + Counterfactual generator instance, initially set to None. + """ + super().__init__() + self.cf_generator = None + + def fit( + self, + model, + data, + outcome_name, + continuous_features, + feat_to_vary, + random_seed=13, + ): + """ + Fits the DiceMethod to the provided dataset by creating a counterfactual generator. + + Parameters: + ---------- + model : object + A machine learning model used for predictions. + data : pd.DataFrame + The dataset containing features and the outcome variable. + outcome_name : str + The name of the outcome variable in the dataset. + continuous_features : List[str] + A list of names for continuous (numerical) features. + feat_to_vary : List[str] + A list of feature names that can be varied to generate counterfactuals. + random_seed : int, optional + Seed for random number generation to ensure reproducibility, by default 13. + """ + dice_dataset = dice_ml.Data( + dataframe=data, + continuous_features=continuous_features, + outcome_name=outcome_name, + ) + self.random_seed = random_seed + self.feat_to_vary = feat_to_vary + dice_model = dice_ml.Model(model=model, backend="sklearn", func=None) + self.cf_generator = dice_ml.Dice(dice_dataset, dice_model, method="random") + + def explain_instances( + self, instances: pd.DataFrame, num_counterfactuals: int + ) -> pd.DataFrame: + """ + Generates counterfactual instances for the specified input instances. + + Parameters: + ---------- + instances : pd.DataFrame + DataFrame containing the instances for which counterfactuals are generated. + num_counterfactuals : int + The number of counterfactuals to generate for each instance. + + Returns: + ------- + pd.DataFrame + A DataFrame containing the generated counterfactuals. + + Raises: + ------- + ValueError + If the counterfactual generator has not been initialized (fit method not called). 
class NearestNeighborMethod(LocalCounterfactualMethod):
    """
    NearestNeighborMethod is a local counterfactual method that explains instances
    with their nearest unaffected neighbors from the training dataset.

    Training rows for which the model predicts 1 are kept as the pool of
    "unaffected" instances. The counterfactuals of a query instance are its nearest
    neighbors in that pool, measured on a one-hot encoded representation.

    Methods:
    --------
    __init__():
        Initializes the NearestNeighborMethod instance.

    fit(model, data, outcome_name, continuous_features, feat_to_vary, random_seed=13):
        Collects the unaffected training instances and fits the feature encoder.

    explain_instances(instances, num_counterfactuals):
        Returns the nearest unaffected neighbors of each instance.
    """

    def __init__(self):
        """
        Initializes a new, unfitted instance of the NearestNeighborMethod class.
        """
        super().__init__()
        # Set by fit(); None marks the method as not fitted yet.
        self.encoder = None
        self.train_unaffected = None
        self.train_unaffected_one_hot = None

    def fit(
        self,
        model,
        data: pd.DataFrame,
        outcome_name: str,
        continuous_features: List[str],
        feat_to_vary: List[str],
        random_seed=13,
    ):
        """
        Fits the method by identifying unaffected instances in the training dataset
        and preparing the feature encoding used for the neighbor search.

        Parameters:
        ----------
        model : object
            A machine learning model with a `predict` method; rows predicted as 1
            are treated as unaffected.
        data : pd.DataFrame
            A dataset containing the features and the outcome column.
        outcome_name : str
            The name of the outcome column in the dataset.
        continuous_features : List[str]
            A list of continuous (numerical) feature column names. Every other
            feature column is treated as categorical.
        feat_to_vary : List[str]
            A list of features allowed to vary. Stored on the instance; not used
            by the neighbor search in this class.
        random_seed : int, optional
            Stored on the instance; not used by the neighbor search in this
            class. By default 13.
        """
        X = data.drop(columns=[outcome_name])
        self.numerical_features = continuous_features
        self.categorical_features = X.columns.difference(continuous_features).tolist()

        self.encoder = ColumnTransformer(
            [("ohe", OneHotEncoder(sparse_output=False), self.categorical_features)],
            remainder="passthrough",
        ).fit(X)

        train_preds = model.predict(X)
        self.train_unaffected = X[train_preds == 1]
        self.train_unaffected_one_hot = self.encoder.transform(self.train_unaffected)

        self.random_seed = random_seed
        self.feat_to_vary = feat_to_vary

    def explain_instances(
        self, instances: pd.DataFrame, num_counterfactuals: int
    ) -> pd.DataFrame:
        """
        Generates counterfactual explanations for the provided instances by finding
        their nearest unaffected neighbors in the training data.

        Parameters:
        ----------
        instances : pd.DataFrame
            DataFrame containing the instances for which counterfactual explanations are needed.
        num_counterfactuals : int
            The number of counterfactuals to generate for each instance.

        Returns:
        -------
        pd.DataFrame
            The nearest unaffected neighbors of each instance, concatenated in the
            order of `instances` and keeping their training-set index. Empty if the
            training data contained no unaffected instances.

        Raises:
        -------
        ValueError
            If the method has not been fitted (fit method not called).

        Notes:
        ------
        - If the requested number of counterfactuals exceeds the number of available
          unaffected instances, a warning is raised, and all unaffected instances are used.
        - Nearest neighbors are determined using a one-hot encoded feature representation.
        """
        if self.encoder is None or self.train_unaffected is None:
            raise ValueError("Fit the Local Counterfactual method first.")

        n_unaffected = self.train_unaffected.shape[0]
        if n_unaffected == 0:
            # A neighbor search with zero candidates is rejected by scikit-learn;
            # report "no counterfactuals" as an empty frame with the right columns.
            return self.train_unaffected.iloc[:0].copy()

        instances_one_hot = self.encoder.transform(instances)
        if num_counterfactuals > n_unaffected:
            warnings.warn(f"{num_counterfactuals} were requested, but only {n_unaffected} unaffected instances given. Taking all.")
            num_counterfactuals = n_unaffected

        nn = NearestNeighbors(n_neighbors=num_counterfactuals).fit(self.train_unaffected_one_hot)
        # Only the neighbor positions are needed, not the distances.
        indices = nn.kneighbors(instances_one_hot, return_distance=False)

        cfs = [self.train_unaffected.iloc[row] for row in indices]

        return pd.concat(cfs, ignore_index=False)
+ + explain_instances(instances, num_counterfactuals, n_samples=1000, random_state=None): + Generates counterfactuals for multiple instances by calling the explain method for each instance. + """ + def __init__(self, model, n_most_important, n_categorical_most_frequent, numerical_features, categorical_features, random_state=None): + """ + Initializes a new instance of the RandomSampling class. + + Parameters: + ---------- + model : object + A machine learning model used for predictions and feature importance evaluation. + n_most_important : int + The number of most important features to consider when generating counterfactuals. + n_categorical_most_frequent : int + The number of most frequent categories to consider for categorical features. + numerical_features : List[str] + A list of continuous (numerical) feature names. + categorical_features : List[str] + A list of categorical feature names. + random_state : int, optional + Seed for random number generation to ensure reproducibility, by default None. + """ + self.model = model + self.n_most_important = n_most_important + self.n_categorical_most_frequent = n_categorical_most_frequent + self.numerical_features = numerical_features + self.categorical_features = categorical_features + self.random_state = random_state + + def fit(self, X: pd.DataFrame, y: pd.Series): + """ + Fits the RandomSampling method to the provided training data by calculating feature importances and identifying unaffected instances. + + Parameters: + ---------- + X : pd.DataFrame + The training dataset containing feature columns. + y : pd.Series + The target variable corresponding to the training dataset. + + Returns: + ------- + self : RandomSampling + Returns the fitted instance of RandomSampling. 
+ """ + self.X_ = X + self.feature_names_ = X.columns.tolist() + # Permutation feature importance + result = permutation_importance(self.model, X, y, random_state=self.random_state) + self.feature_importances_ = result.importances_mean + top_k_indices = np.argsort(self.feature_importances_)[::-1][:self.n_most_important] + self.top_k_features_ = X.columns[top_k_indices] + + train_preds = self.model.predict(X) + unaffected = X[train_preds == 1] + + # Store min and max values for numerical features + self.numeric_min_ = unaffected[self.numerical_features].min() + self.numeric_max_ = unaffected[self.numerical_features].max() + for f in self.numerical_features: + if np.isnan(self.numeric_min_[f]): + self.numeric_min_[f] = X[f].min() + if np.isnan(self.numeric_max_[f]): + self.numeric_max_[f] = X[f].max() + + # Get the top m most frequent categories for categorical features + self.categorical_top_m_ = {} + for col in self.categorical_features: + top_categories = unaffected[col].value_counts().index[:self.n_categorical_most_frequent] + if top_categories.empty: + top_categories = X[col].value_counts().index[:self.n_categorical_most_frequent] + self.categorical_top_m_[col] = top_categories + + return self + + def _sample_instances(self, n_samples: int, fixed_feature_values, random_state=None): + """ + Samples instances based on the specified feature distributions, fixing certain feature values while sampling others. + + Parameters: + ---------- + n_samples : int + The number of instances to sample. + fixed_feature_values : dict + A dictionary of feature names and their fixed values during sampling. + random_state : int, optional + Seed for random number generation, by default None. + + Returns: + ------- + pd.DataFrame + A DataFrame containing the sampled instances with the same feature structure as the original data. 
+ """ + if random_state is not None: + np.random.seed(random_state) + samples_columns = [] + for col in self.X_.columns: + if col in fixed_feature_values: + column = [fixed_feature_values[col]] * n_samples + elif col in self.numerical_features: + column = np.random.uniform(self.numeric_min_[col], self.numeric_max_[col], n_samples) + else: + column = np.random.choice(self.categorical_top_m_[col], n_samples) + samples_columns.append(column) + return pd.DataFrame({col_name: column for col_name, column in zip(self.X_.columns, samples_columns)}) + + def explain(self, instance, num_counterfactuals, n_samples=1000, random_state=None): + """ + Generates counterfactual explanations for a given instance by sampling and modifying feature values. + + Parameters: + ---------- + instance : pd.DataFrame + A single row DataFrame representing the instance for which counterfactuals are generated. + num_counterfactuals : int + The number of counterfactuals to generate. + n_samples : int, optional + The number of samples to draw for generating counterfactuals, by default 1000. + random_state : int, optional + Seed for random number generation, by default None. + + Returns: + ------- + pd.DataFrame + A DataFrame containing the generated counterfactuals for the provided instance. + + Raises: + ------- + ValueError + If the input instance is not a single-row DataFrame or if its columns do not match the training dataset's columns. 
+ """ + # Check if instance is a single row DataFrame + if not isinstance(instance, pd.DataFrame) or instance.shape[0] != 1: + raise ValueError("Input must be a single row DataFrame.") + + # Check if the DataFrame columns match the features provided during initialization + if set(instance.columns) != set(self.X_.columns): + raise ValueError("Columns of the input instance do not match the columns used during fitting.") + + fixed_feature_values = {} + for col in self.feature_names_: + if col not in self.top_k_features_: + fixed_feature_values[col] = instance[col].item() + random_instances = self._sample_instances(n_samples, fixed_feature_values, random_state) + + # Generate copies of the query instance that will be changed one feature + # at a time to encourage sparsity. + cfs_df = None + candidate_cfs = instance.apply(lambda col: col.repeat(n_samples)).reset_index(drop=True) + # Loop to change one feature at a time, then two features, and so on. + for num_features_to_vary in range(1, len(self.top_k_features_)+1): + selected_features = np.random.choice(self.top_k_features_, (n_samples, 1), replace=True) + for k in range(n_samples): + candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]] + preds = self.model.predict(candidate_cfs) + if sum(preds) > 0: + rows_to_add = candidate_cfs[preds == 1] + + if cfs_df is None: + cfs_df = rows_to_add.copy() + else: + cfs_df = pd.concat([cfs_df, rows_to_add]) + cfs_df.drop_duplicates(inplace=True) + # Always change at least 2 features before stopping + if num_features_to_vary >= 2 and len(cfs_df) >= num_counterfactuals: + break + + if cfs_df is None: + return None + + assert isinstance(cfs_df, pd.DataFrame) + if len(cfs_df) > num_counterfactuals: + cfs_df = cfs_df.sample(num_counterfactuals) + cfs_df.reset_index(inplace=True, drop=True) + return cfs_df + + def explain_instances( + self, instances: pd.DataFrame, num_counterfactuals: int, n_samples=1000, random_state=None + ) -> pd.DataFrame: 
def apply_action_pandas(
    X: pd.DataFrame,
    action: pd.Series,
    numerical_columns: List[str],
    categorical_columns: List[str],
    categorical_no_action_token: Any,
    numerical_no_action_token: Optional[Any] = None,
) -> pd.DataFrame:
    """Return a copy of `X` in which a single `action` was applied to every row.

    Numerical columns are shifted by the matching component of `action`;
    categorical columns are overwritten with it. A component equal to the
    relevant no-action token leaves its column untouched.

    Args:
        X (pd.DataFrame): matrix of observations
        action (pd.Series): one entry per column of `X`, indexed by the same
            labels in the same order
        numerical_columns (List[str]): numerical column names
        categorical_columns (List[str]): categorical column names
        categorical_no_action_token (Any): value meaning "leave this
            categorical column unchanged"
        numerical_no_action_token (Optional[Any]): value meaning "leave this
            numerical column unchanged"; when None, the categorical token is
            used for numerical columns as well

    Returns:
        pd.DataFrame: new observations, cast back to the dtypes of `X`.
    """
    assert (X.columns == action.index).all()

    # Numerical columns fall back to the categorical sentinel when no
    # dedicated one is supplied.
    numeric_skip = (
        categorical_no_action_token
        if numerical_no_action_token is None
        else numerical_no_action_token
    )

    result = X.copy(deep=True)
    for name in numerical_columns:
        shift = action[name]
        if shift != numeric_skip:
            result[name] = X[name] + shift
    for name in categorical_columns:
        replacement = action[name]
        if replacement != categorical_no_action_token:
            result[name] = replacement

    # Assignments above may have changed column dtypes; restore the originals.
    return result.astype(X.dtypes)
def apply_action_numpy(
    X: npt.NDArray[np.number],
    action: npt.NDArray[np.number],
    numerical_columns: List[int],
    categorical_columns: List[int],
    categorical_no_action_token: np.number,
) -> npt.NDArray[np.number]:
    """Apply `action` to all rows of `X`.

    Numerical columns get the matching component of `action` added.
    Categorical columns are set to the matching component of `action`, unless
    that component equals `categorical_no_action_token`, in which case the
    column is left as it is.

    Note: input array should have a numeric dtype. Thus, categorical columns
    should be encoded by numbers (e.g. Ordinal Encoding).

    Args:
        X (npt.NDArray[np.number]): matrix of observations
        action (npt.NDArray[np.number]): for each column / feature, the action to be applied
        numerical_columns (List[int]): numerical column indices
        categorical_columns (List[int]): categorical column indices (may be empty)
        categorical_no_action_token (np.number): special value signifying no-action
            (i.e. equivalent to 0 for numerical columns)

    Returns:
        npt.NDArray[np.number]: new observations resulting from the action application.
    """
    assert len(X.shape) == 2
    assert len(action.shape) == 1
    assert (
        X.shape[1] == action.shape[0]
    ), "action should have length equal to the number of columns"

    ret = X.copy()
    ret[:, numerical_columns] += action[numerical_columns]

    # Force an integer index array: an empty Python list would otherwise turn
    # into a float array, which NumPy refuses to use as an index.
    cat_idx = np.asarray(categorical_columns, dtype=np.intp)
    active = cat_idx[action[cat_idx] != categorical_no_action_token]
    ret[:, active] = action[active]

    return ret


def extract_actions_pandas(
    X: pd.DataFrame,
    cfs: pd.DataFrame,
    categorical_features: List[str],
    numerical_features: List[str],
    categorical_no_action_token: Any,
) -> pd.DataFrame:
    """
    Extracts the actions needed to convert the original dataset `X` into the counterfactual dataset `cfs`.

    Rows of `X` and `cfs` are compared label by label, so both frames must
    carry the same index and the listed columns.

    Parameters:
    ----------
    X : pd.DataFrame
        The original dataset, one instance per row.
    cfs : pd.DataFrame
        The counterfactual dataset, with the same structure as `X`.
    categorical_features : List[str]
        Columns of `X` and `cfs` that are categorical.
    numerical_features : List[str]
        Columns of `X` and `cfs` that are numerical.
    categorical_no_action_token : Any
        Value written into a categorical cell whose value is the same in `X` and `cfs`.

    Returns:
    -------
    pd.DataFrame
        A copy of `X` in which:
        - each listed categorical column holds the value from `cfs` where it differs from `X`,
          otherwise `categorical_no_action_token`;
        - each listed numerical column holds `cfs - X`.
        Columns in neither list keep their values from `X`.
    """
    actions = X.copy(deep=True)

    for col in categorical_features:
        unchanged = X[col] == cfs[col]
        actions.loc[unchanged, col] = categorical_no_action_token
        actions.loc[~unchanged, col] = cfs.loc[~unchanged, col]
    for col in numerical_features:
        actions[col] = cfs[col] - X[col]
    return actions
def apply_actions_pandas_rows(
    X: pd.DataFrame,
    actions: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
    categorical_no_action_token: object,
) -> pd.DataFrame:
    """
    Applies one action per row: row i of `actions` is applied to row i of `X`.

    Parameters:
    ----------
    X : pd.DataFrame
        The original dataset, one instance per row.
    actions : pd.DataFrame
        Same shape as `X`.
        - Numerical columns hold the amount to add to `X`.
        - Categorical columns hold either the new value or `categorical_no_action_token`.
    numerical_columns : List[str]
        Columns of `X` and `actions` that are numerical.
    categorical_columns : List[str]
        Columns of `X` and `actions` that are categorical.
    categorical_no_action_token : object
        Value meaning "keep the original categorical value".

    Returns:
    -------
    pd.DataFrame
        A copy of `X` with the actions applied; `X` itself is not modified.
    """
    ret = X.copy(deep=True)
    for col in numerical_columns:
        ret[col] = X[col] + actions[col]
    for col in categorical_columns:
        no_action_indicator = actions[col] == categorical_no_action_token
        # `.values` drops the index so cells are assigned positionally.
        ret.loc[~no_action_indicator, col] = actions.loc[~no_action_indicator, col].values
        ret.loc[no_action_indicator, col] = X.loc[no_action_indicator, col].values

    return ret


def actions_mean_pandas(
    actions: pd.DataFrame,
    numerical_features: List[str],
    categorical_features: List[str],
    categorical_no_action_token: Any,
) -> pd.Series:
    """
    Summarises a set of actions into one representative action.

    Numerical features are averaged. For each categorical feature the most
    frequent value that is *not* `categorical_no_action_token` is chosen; the
    token is returned only when the column offers no other value.

    Parameters:
    ----------
    actions : pd.DataFrame
        One action per row, one feature per column.
    numerical_features : List[str]
        Columns of `actions` that are numerical.
    categorical_features : List[str]
        Columns of `actions` that are categorical.
    categorical_no_action_token : Any
        Value meaning "no action" in categorical columns.

    Returns:
    -------
    pd.Series
        Object-dtype Series indexed by `actions.columns`. Columns listed in
        neither feature list stay NaN.
    """
    ret = pd.Series(index=actions.columns, dtype="object")
    ret[numerical_features] = actions[numerical_features].mean()
    for col in categorical_features:
        # value_counts() lists distinct values from most to least frequent.
        ranked_values = actions[col].value_counts().index
        # Skip the no-action token itself; comparing against the feature list,
        # as before, never filtered it out.
        ret[col] = next(
            (value for value in ranked_values if value != categorical_no_action_token),
            categorical_no_action_token,
        )

    return ret
def centroid_pandas(
    X: pd.DataFrame,
    numerical_columns: List[str],
    categorical_columns: List[str],
) -> pd.DataFrame:
    """Calculates the centroid of the rows of a pandas DataFrame.

    For the `numerical_columns` the centroid holds the column mean; for the
    `categorical_columns` it holds the column mode (the first mode in order of
    appearance when several values tie).

    Args:
        X (pd.DataFrame): matrix of observations
        numerical_columns (List[str]): numerical column names
        categorical_columns (List[str]): categorical column names; may be
            empty and may be any sized sequence of labels

    Returns:
        pd.DataFrame: DataFrame whose single row (label 0) is the centroid
    """
    # Start from an empty frame with the columns and dtypes of X; the
    # assignments below create row 0.
    centroid = pd.DataFrame(columns=X.columns).astype(X.dtypes)

    centroid.loc[0, numerical_columns] = X[numerical_columns].mean(axis="index")
    # len() works for lists, tuples, arrays and Index objects alike, whereas
    # comparing against [] is only a valid emptiness test for lists.
    if len(categorical_columns) > 0:
        centroid.loc[0, categorical_columns] = X[categorical_columns].apply(
            lambda col: multimode(col)[0]
        )

    return centroid


def centroid_numpy(
    X: npt.NDArray[np.number],
    numerical_columns: List[int],
    categorical_columns: List[int],
) -> npt.NDArray[np.number]:
    """Calculates the centroid of the rows of a 2d numpy array.

    For the `numerical_columns` the centroid holds the column mean; for the
    `categorical_columns` it holds the most frequent value (the smallest one
    when several values tie, since candidates are examined in sorted order).

    Args:
        X (npt.NDArray[np.number]): matrix of observations
        numerical_columns (List[int]): numerical column indices
        categorical_columns (List[int]): categorical column indices

    Returns:
        npt.NDArray[np.number]: float array of shape (1, n_columns) whose
        single row is the centroid; columns in neither list stay 0.
    """
    assert len(X.shape) == 2
    centroid = np.zeros((1, X.shape[1]))

    centroid[:, numerical_columns] = X[:, numerical_columns].mean(axis=0)

    def most_frequent(x):
        unique_values, counts = np.unique(x, return_counts=True)
        return unique_values[np.argmax(counts)]

    centroid[:, categorical_columns] = [
        most_frequent(X[:, i]) for i in categorical_columns
    ]

    return centroid
def _decide_cluster_method(method, n_clusters, random_seed) -> ClusteringMethod:
    """
    Resolves `method` into a clustering method object.

    Parameters:
    ----------
    method : str or ClusteringMethod
        Either the name of a supported algorithm ("KMeans") or an
        already-built clustering object, which is handed back untouched.
    n_clusters : int
        Number of clusters given to a newly built `KMeansMethod`.
    random_seed : int
        Seed given to a newly built `KMeansMethod`.

    Returns:
    -------
    ClusteringMethod
        A new `KMeansMethod` for "KMeans", otherwise `method` itself.

    Raises:
    -------
    ValueError
        If `method` is a string other than "KMeans".
    """
    # Anything that is not a name is assumed to be a ready-made method object.
    if not isinstance(method, str):
        return method
    if method != "KMeans":
        raise ValueError(f"Unsupported clustering method: {method}")
    return KMeansMethod(num_clusters=n_clusters, random_seed=random_seed)


def _decide_local_cf_method(
    method, model, train_dataset, numeric_features_names,
    categorical_features_names, feat_to_vary,
    random_seed, n_most_important: int = 15,
    n_categorical_most_frequent: int = 15,
    n_scalars: int = 1000,
) -> LocalCounterfactualMethod:
    """
    Resolves `method` into a fitted local counterfactual method object.

    A string selects one of the built-in methods, which is constructed and
    fitted on `train_dataset`; any other object is handed back untouched.

    Parameters:
    ----------
    method : str or LocalCounterfactualMethod
        "Dice", "NearestNeighbors", "RandomSampling", or an already-built method object.
    model : object
        The model for which counterfactuals are generated.
    train_dataset : pd.DataFrame
        Training data; must contain the label in a column named "target".
    numeric_features_names : List[str]
        Names of the numeric features.
    categorical_features_names : List[str]
        Names of the categorical features (passed on only to `RandomSampling`).
    feat_to_vary : List[str]
        Features allowed to change (passed on to `DiceMethod` and `NearestNeighborMethod`).
    random_seed : int
        Seed passed on to the chosen method.
    n_most_important : int, optional
        Passed on to `RandomSampling`, by default 15.
    n_categorical_most_frequent : int, optional
        Passed on to `RandomSampling`, by default 15.
    n_scalars : int, optional
        Accepted for interface compatibility; not used by this function. By default 1000.

    Returns:
    -------
    LocalCounterfactualMethod
        The fitted built-in method, or `method` itself when it is not a string.

    Raises:
    -------
    ValueError
        If `method` is a string that names no supported method.
    """
    # Anything that is not a name is assumed to be a ready-made method object.
    if not isinstance(method, str):
        return method

    if method == "Dice":
        explainer = DiceMethod()
        explainer.fit(
            model,
            train_dataset,
            "target",
            numeric_features_names,
            feat_to_vary,
            random_seed,
        )
        return explainer

    if method == "NearestNeighbors":
        explainer = NearestNeighborMethod()
        explainer.fit(
            model,
            train_dataset,
            "target",
            numeric_features_names,
            feat_to_vary,
            random_seed,
        )
        return explainer

    if method == "RandomSampling":
        explainer = RandomSampling(
            model=model,
            n_most_important=n_most_important,
            n_categorical_most_frequent=n_categorical_most_frequent,
            numerical_features=numeric_features_names,
            categorical_features=categorical_features_names,
            random_state=random_seed,
        )
        # RandomSampling is fitted on features and labels separately.
        explainer.fit(train_dataset.drop(columns="target"), train_dataset["target"])
        return explainer

    raise ValueError(f"Unsupported local counterfactual method: {method}")
def default_preprocessing(data):
    """Clean the raw, whitespace-split Adult table into a typed DataFrame.

    `data` is expected to be the frame produced by reading adult.csv without
    a header and splitting on whitespace, so every field except the last
    still carries its trailing comma.

    Returns a DataFrame with the 13 named columns below, integer numerical
    columns, string categorical columns and a 0/1 "Status" target.
    """
    all_columns = ["Age", "Workclass", "Education", "Marital-Status",
                   "Occupation", "Relationship", "Race", "Sex", "Capital-Gain",
                   "Capital-Loss", "Hours-Per-Week", "Native-Country", "Status"]
    cate_columns = ['Workclass', 'Education', 'Marital-Status', 'Occupation',
                    'Relationship', 'Race', 'Sex', 'Native-Country']
    numerical_columns = [c for c in all_columns if c not in cate_columns + ["Status"]]

    # Drop the raw columns labelled 2 and 4, leaving the 13 columns named above
    # (presumably fnlwgt and the redundant education-num; confirm against the raw file).
    data = data.drop(2, axis=1)
    data = data.drop(4, axis=1)
    # Remove rows with missing values, which appear as the token '?,'
    data = data.replace('?,', np.nan)
    data = data.dropna()
    data.columns = all_columns
    for col in data.columns[:-1]:
        # x[:-1] strips the trailing comma left by the whitespace split
        if col not in cate_columns:
            data[col] = data[col].apply(lambda x: float(x[:-1]))
        else:
            data[col] = data[col].apply(lambda x: x[:-1])
    # Preprocess targets to <=50K = 0, >50K = 1
    data[data.columns[-1]] = data[data.columns[-1]].replace(['<=50K', '>50K'],
                                                            [0, 1])

    data = data.reset_index(drop=True)

    for col in numerical_columns:
        data[col] = data[col].astype(int)

    for col in data.columns:
        if col not in numerical_columns and col != data.columns[-1]:
            data[col] = data[col].astype(str)
    data = data[data['Native-Country'] != 'Holand-Netherlands']
    return data


class AdultDataset():
    """Adult Dataset.

    The Adult dataset, also known as the "Census Income" dataset, is a widely used collection of demographic information derived from the 1994 U.S. Census database
    and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data.
    The target variable is whether an individual earns more than $50,000 per year,
    making it a popular dataset for classification tasks in machine learning.
    """

    def __init__(self, custom_preprocessing=default_preprocessing, dirpath=None):
        """Load adult.csv and optionally preprocess it.

        Parameters:
        - custom_preprocessing: callable applied to a copy of the raw frame;
          when falsy, the raw frame is exposed unchanged.
        - dirpath: folder containing adult.csv; defaults to the packaged
          ../data/adult_data folder next to this module.

        Exits the interpreter with status 1 if the file cannot be read.
        """
        self._dirpath = dirpath
        if not self._dirpath:
            self._dirpath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                         '..', 'data', 'adult_data')

        self._filepath = os.path.join(self._dirpath, 'adult.csv')
        print("Using Adult dataset: ", self._filepath)

        try:
            # The file is split on whitespace, so fields keep their trailing commas.
            self._df = pd.read_csv(self._filepath, header=None, sep=r"\s+")
        except IOError as err:
            print("IOError: {}".format(err))
            print("To use this class, please place the adult.csv:")
            print("file, as-is, in the folder:")
            # Report the folder that was actually searched.
            print("\n\t{}\n".format(os.path.abspath(self._dirpath)))
            import sys
            sys.exit(1)

        if custom_preprocessing:
            self._data = custom_preprocessing(self._df.copy())
        else:
            # Keep dataframe() / data() usable when preprocessing is disabled.
            self._data = self._df.copy()

    def dataframe(self):
        """Return a copy of the (preprocessed) data; the target is the last column."""
        dfcopy = self._data.copy()
        return(dfcopy)

    def data(self):
        """Return the (preprocessed) data itself, not a copy."""
        return self._data
def default_preprocessing(data):
    """Filter the raw COMPAS table and reduce it to seven modelling columns.

    Adds a "time_served" feature (whole days between jail-in and jail-out,
    floored at 0), applies the ProPublica row filters, and returns the columns
    Sex, Age_Cat, Race, C_Charge_Degree, Priors_Count, Time_Served and Status,
    where Status is the last column of the input.
    """
    data = data.dropna(subset=["days_b_screening_arrest"])
    data = data.rename(columns={data.columns[-1]: "status"})
    # Work on a dict of numpy arrays so one boolean mask can filter every column.
    data = {key: np.array(values) for key, values in data.to_dict("list").items()}

    # Time in jail, in whole days.
    stamp_format = "%Y-%m-%d %H:%M:%S"
    time_served = np.array([
        (datetime.datetime.strptime(day_out, stamp_format)
         - datetime.datetime.strptime(day_in, stamp_format)).days
        for day_in, day_out in zip(data["c_jail_in"], data["c_jail_out"])
    ])
    time_served[time_served < 0] = 0
    data["time_served"] = time_served

    # Filtering the data.
    # These filters are as taken by propublica
    # (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendants Compas scored crime was not within 30 days
    # from when the person was arrested, we assume that because of data quality
    # reasons, that we do not have the right offense.
    idx = np.logical_and(
        data["days_b_screening_arrest"] <= 30, data["days_b_screening_arrest"] >= -30
    )

    # We coded the recidivist flag -- is_recid -- to be -1
    # if we could not find a compas case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of
    # 'O' -- will not result in Jail time are removed (only two of them).
    # F: felony, M: misconduct
    idx = np.logical_and(idx, data["c_charge_degree"] != "O")

    # Drop rows whose risk score text is the literal string "NA".
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # Select the examples that satisfy all criteria.
    data = pd.DataFrame({key: values[idx] for key, values in data.items()})
    cols = [
        "Sex",
        "Age_Cat",
        "Race",
        "C_Charge_Degree",
        "Priors_Count",
        "Time_Served",
        "Status",
    ]
    data = data[[col.lower() for col in cols]]
    data.columns = cols
    return data


class COMPASDataset():
    """COMPAS Dataset.

    The COMPAS dataset (Correctional Offender Management Profiling for Alternative Sanctions) Angwin et al. (2016)
    is available at https://github.com/propublica/compas-analysis/blob/master/compas-scores-two-years.csv.
    Detailed description and information on the dataset can be found at https://www.propublica.org/
    article/how-we-analyzed-the-compas-recidivism-algorithm. It categorizes recidivism risk
    based on several factors, including race.
    """

    def __init__(self, custom_preprocessing=default_preprocessing, dirpath=None):
        """Load compas.csv and optionally preprocess it.

        Parameters:
        - custom_preprocessing: callable applied to a copy of the raw frame;
          when falsy, the raw frame is exposed unchanged.
        - dirpath: folder containing compas.csv; defaults to the packaged
          ../data/compas_data folder next to this module.

        Exits the interpreter with status 1 if the file cannot be read.
        """
        self._dirpath = dirpath
        if not self._dirpath:
            self._dirpath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                         '..', 'data', 'compas_data')

        self._filepath = os.path.join(self._dirpath, 'compas.csv')
        print("Using Compas dataset: ", self._filepath)

        try:
            self._df = pd.read_csv(self._filepath)
        except IOError as err:
            print("IOError: {}".format(err))
            print("To use this class, please place the compas.csv:")
            print("file, as-is, in the folder:")
            # Report the folder that was actually searched.
            print("\n\t{}\n".format(os.path.abspath(self._dirpath)))
            import sys
            sys.exit(1)

        if custom_preprocessing:
            self._data = custom_preprocessing(self._df.copy())
        else:
            # Keep dataframe() / data() usable when preprocessing is disabled.
            self._data = self._df.copy()

    def dataframe(self):
        """Return a copy of the (preprocessed) data; the target is the last column."""
        dfcopy = self._data.copy()
        return(dfcopy)

    def data(self):
        """Return the (preprocessed) data itself, not a copy."""
        return self._data
\n", + "These explanations are expressed as actions that offer recourse to large population subgroups.
The framework aims to provide explanations and insights, ensuring that the actions benefit as many individuals as possible.\n", + "\n", + "GLANCE consists of two algorithms: \n", + "\n", + "- **C-GLANCE** that employs a clustering-based approach, ideal for explaining and debugging the model\n", + "- **T-GLANCE** that employs a tree-based approach, ideal for policy-making and auditing.\n", + "\n", + "In this notebook, we will explore how to use these algorithms effectively.\n", + "\n", + "We will use the Adult Dataset from the UCI Machine Learning Repository ([reference](https://archive.ics.uci.edu/dataset/2/adult)).
\n", + "Adult Dataset is a widely used collection of demographic information derived from the 1994 U.S. Census database. It includes features such as age, work class,
education level, marital status, occupation, relationship, race, sex, hours worked per week and the target variable is whether an individual earns more than $50,000 per year,
making it a popular dataset for classification tasks in machine learning.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminaries\n", + "\n", + "### Import Dependencies \n", + "As usual in python, the first step is to import all necessary packages.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from xgboost import XGBClassifier\n", + "import pandas as pd\n", + "from aix360.algorithms.glance.iterative_merges.iterative_merges import C_GLANCE\n", + "from aix360.algorithms.glance.counterfactual_tree.counterfactual_tree import T_GLANCE\n", + "from aix360.datasets.adult_dataset import AdultDataset\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using Adult dataset: c:\\users\\nikolastheol\\documents\\github\\aix360\\aix360\\datasets\\..\\data\\adult_data\\adult.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeWorkclassEducationMarital-StatusOccupationRelationshipRaceSexCapital-GainCapital-LossHours-Per-WeekNative-CountryStatus
039State-govBachelorsNever-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States0
150Self-emp-not-incBachelorsMarried-civ-spouseExec-managerialHusbandWhiteMale0013United-States0
238PrivateHS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States0
353Private11thMarried-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States0
428PrivateBachelorsMarried-civ-spouseProf-specialtyWifeBlackFemale0040Cuba0
\n", + "
" + ], + "text/plain": [ + " Age Workclass Education Marital-Status Occupation \\\n", + "0 39 State-gov Bachelors Never-married Adm-clerical \n", + "1 50 Self-emp-not-inc Bachelors Married-civ-spouse Exec-managerial \n", + "2 38 Private HS-grad Divorced Handlers-cleaners \n", + "3 53 Private 11th Married-civ-spouse Handlers-cleaners \n", + "4 28 Private Bachelors Married-civ-spouse Prof-specialty \n", + "\n", + " Relationship Race Sex Capital-Gain Capital-Loss Hours-Per-Week \\\n", + "0 Not-in-family White Male 2174 0 40 \n", + "1 Husband White Male 0 0 13 \n", + "2 Not-in-family White Male 0 0 40 \n", + "3 Husband Black Male 0 0 40 \n", + "4 Wife Black Female 0 0 40 \n", + "\n", + " Native-Country Status \n", + "0 United-States 0 \n", + "1 United-States 0 \n", + "2 United-States 0 \n", + "3 United-States 0 \n", + "4 Cuba 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = AdultDataset()\n", + "df = data.dataframe()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example Model to be used for explanations\n", + "We use the train set to train a simple XGBoostClassifier. 
This will serve as the demonstrative model, which we will then treat as a black box and apply our algorithm.\n", + "\n", + "Of course, any model can be used in its place.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.sample(frac=0.01, random_state=42)\n", + "X = df.drop(columns='Status')\n", + "y = df['Status']\n", + "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=13)\n", + "num_features = X_train._get_numeric_data().columns.to_list()\n", + "cate_features = X_train.columns.difference(num_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model = XGBClassifier()\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"cat\",\n", + " OneHotEncoder(sparse_output=False, handle_unknown=\"ignore\"),\n", + " cate_features,\n", + " )\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "_model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\"classifier\", model),\n", + " ]\n", + ")\n", + "_model.fit(X_train,y_train)\n", + "predictions = _model.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "GLANCE focuses on the population that has obtained the unfavorable outcome (affected population) (affected population) by a ML model.
It aims to find the *s* global actions that, when applied, lead this population to the favorable outcome.
\n", + "
\n", + "For this reason, our first step is to identify the affected population, as it serves as input to the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "affected = X_test[predictions == 0].reset_index(drop=True)\n", + "train_dataset = df.copy()\n", + "\n", + "\n", + "for col in num_features:\n", + " train_dataset[col] = train_dataset[col].astype(float)\n", + " \n", + "train_dataset[\"target\"] = df['Status']\n", + "train_dataset.drop(columns='Status',inplace=True)\n", + "\n", + "#feat_to_vary is a variable used by DiCE counterfactual explanation\n", + "feat_to_vary = list(affected.columns)\n", + "target_name = \"Status\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Practical Example of GLANCE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## C-GLANCE \n", + "C-GLANCE is a clustering-based algorithm designed to generate global counterfactual explanations.
It starts by forming initial clusters and gradually merges them until the number of clusters matches the user-defined final_clusters parameter.
From each of these final clusters, the best action is selected, and together, these actions form the global explanation.\n", + "\n", + "C-GLANCE framework is loaded with:\n", + " - the model to be explained\n", + " - number of initial clusters, \n", + " - number of final clusters, from each of which the best action is extracted\n", + " - number of local counterfactuals, that the Local Counterfactual Method generates for each centroid of the initial clusters\n", + "\n", + " C-GLANCE algorithm allows the users to specify the number of global actions generated and serves as a tool to explain and debug ML models." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 2.76it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.92it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.61it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.89it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.79it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.03it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.02it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.06it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.99it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.01it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.97it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.00it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.88it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.04it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.99it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.01it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.97it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.99it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.96it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.92it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.01it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.16it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.40it/s]\n", + "100%|██████████| 1/1 
[00:00<00:00, 3.17it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.96it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.02it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.00it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.08it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.14it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.98it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.04it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.00it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.29it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.92it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.83it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.94it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.00it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.02it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.17it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 3.02it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mAction 1 \n", + "\u001b[0m\u001b[1mEducation\u001b[0m = \u001b[31mBachelors\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m48937.7\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m5.79\u001b[39m\n", + "\n", + "\n", + "\u001b[1mAction 2 \n", + "\u001b[0m\u001b[1mAge\u001b[0m +\u001b[31m10.0\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m32792.9\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m4.65\u001b[39m\n", + "\n", + "\n", + "\u001b[1mAction 3 \n", + "\u001b[0m\u001b[1mMarital-Status\u001b[0m = \u001b[31mSeparated\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m71470.0\u001b[39m \n", + "\u001b[1mHours-Per-Week\u001b[0m +\u001b[31m32.3\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m11.47\u001b[39m\n", + "\n", + "\n", + "\u001b[1mTOTAL EFFECTIVENESS:\u001b[0m 
\u001b[32m100.00%\u001b[39m\n", + "\u001b[1mTOTAL COST:\u001b[0m \u001b[35m5.81\u001b[39m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\nikolastheol\\documents\\github\\aix360\\aix360\\algorithms\\glance\\iterative_merges\\iterative_merges.py:1032: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.\n", + " for column_name, value in row['action'].to_frame().T.reset_index(drop=True).iteritems():\n" + ] + } + ], + "source": [ + "global_method = C_GLANCE(\n", + " _model, # model to be explained\n", + " initial_clusters=40, # starting number of clusters\n", + " final_clusters=3, # number of final clusters after merging, from each of which the best action is extracted\n", + " num_local_counterfactuals=10, # number of counterfactuals the Local Counterfactual Method generates for each centroid of the initial clusters\n", + ")\n", + "global_method.fit(\n", + " df.drop(columns=[\"Status\"]),\n", + " df[\"Status\"],\n", + " train_dataset,\n", + " feat_to_vary, # DiCE parameter that chooses which features to change in order to generate counterfactuals\n", + " cf_generator=\"Dice\"\n", + ")\n", + "\n", + "clusters, clusters_res = global_method.explain_group(affected)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## C-GLANCE Output\n", + "C-GLANCE generates a set of final actions, with a focus on their overall impact when applied to the entire affected population. While each action is initially associated with a specific cluster, the key metrics we prioritize are the *Total Effectiveness* and *Total Cost* across the whole population.\n", + "\n", + "- *Total Effectiveness* is the percentage of individuals that achieve the favorable outcome, if each one of the final actions is applied to the whole affected population.
\n", + "- *Total Cost* is calculated as the mean recourse cost of the whole set of final actions over the entire population.\n", + "\n", + "Additionally, for each generated action the suggested changes are also reported, as well as the *effectiveness* and *cost* they achieve on the population of the cluster they were extracted from. More specifically:\n", + "\n", + "- *Effectiveness*, for each cluster-action pair ($C$, $a$), represents the percentage of individuals in $C$ who get the favorable outcome when the action $a$ is applied.\n", + "
\n", + "- *Cost*, for each cluster-action pair ($C$, $a$), is the mean recourse cost computed when the action $a$ is applied to the individuals of cluster $C$.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above output of C-GLANCE algorithm we can see the 3 actions computed.
\n", + "For example:\n", + "\n", + "\n", + "**Action 1**
\n", + "**Education =** Bachelors
\n", + "**Capital-Gain =** +48937.7
\n", + "**Effectiveness**: 100.00%\t **Cost**: 5.79
\n", + "Action 1 corresponds to the affected individuals of cluster 1 from the 3 final clusters, and suggests that if the affected individuals change their education to Bachelors Degree and increase their capital-gain by 48937.7, they will **all** get the favorable outcome, as the respective Effectiveness for Action 1 is **100.0%**.
\n", + "In the same manner we can explain the other 2 Actions and their respective metrics.\n", + "\n", + "**Action 2**
\n", + "**Age =** +10.0
\n", + "**Capital-Gain =** +32792.9
\n", + "**Effectiveness**: 100.00%\t **Cost**: 4.65
\n", + "Action 2 corresponds to the affected individuals of cluster 2 from the 3 final clusters, and suggests that if the affected individuals were 10 years older and increased their capital-gain by 32792.9, they would **all** get the favorable outcome, as the respective Effectiveness for Action 2 is **100.0%**.
\n", + "\n", + "**Action 3**
\n", + "**Marital-Status =** Separated
\n", + "**Capital-Gain =** +71470.0
\n", + "**Hours-Per-Week =** +32.3
\n", + "**Effectiveness**: 100.00%\t **Cost**: 11.47
\n", + "Action 3 corresponds to the affected individuals of cluster 3 from the 3 final clusters, and suggests that if the affected individuals had Separated as marital status, increased their capital-gain by 71470.0 and increased their working hours per week by 32.3 hours, they would **all** get the favorable outcome, as the respective Effectiveness for Action 3 is **100.0%**.
\n", + "\n", + "At the end of the output, the *Total Effectiveness* and *Total Cost* are showcased.

\n", + "**TOTAL EFFECTIVENESS:** 100.00%
\n", + "**TOTAL COST:** 5.81
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## C-GLANCE Modularity\n", + "Our framework is highly **modular**, allowing users to customize various aspects of it.
\n", + "\n", + "Specifically:\n", + "- **Choice of Local Counterfactual Methods**: Users can select from a variety of local counterfactual methods to generate candidate counterfactual explanations, such as:\n", + " - **NearestNeighbors**: When queried to provide *k* counterfactuals for an affected individual, it retrieves the k nearest neighbors from the set of unaffected instances based on their proximity to the affected individual.\n", + " - **Random Sampling**: To find counterfactuals for an affected instance, this method iteratively modifies its features one at a time. The process begins by randomly altering one feature at a time, generating multiple new candidate instances.\n", + "\n", + "- **Strategy for Selecting Actions**: Additionally, users can choose different strategies for selecting the best actions from the generated counterfactuals. This enables fine-tuning of the process, allowing for the optimal balance between effectiveness and recourse cost, based on user-defined preferences.\n", + " - **max-eff** : Selects actions based on maximizing the effectiveness.\n", + " - **low-cost** : Selects the action with the lowest cost that flips a sufficient number of instances.\n", + " - **mean-act** : Selects the mean action from a set of candidate actions.\n", + "\n", + "In order to use them, the user should provide the **fit** method with the **cf_generator** and **cluster_action_choice_algo** variables and choose the methods of their liking." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "global_method = C_GLANCE(\n", + " _model, initial_clusters=100, final_clusters=3, num_local_counterfactuals=10\n", + ")\n", + "global_method.fit(\n", + " df.drop(columns=[\"Status\"]),\n", + " df[\"Status\"],\n", + " train_dataset,\n", + " feat_to_vary,\n", + " cluster_action_choice_algo = 'max-eff',\n", + " cf_generator=\"RandomSampling\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\nikolastheol\\documents\\github\\aix360\\aix360\\algorithms\\glance\\iterative_merges\\iterative_merges.py:294: UserWarning: Requested number of initial clusters is larger than the number of instances to explain. Setting to number of instances.\n", + " warnings.warn(\n", + "c:\\users\\nikolastheol\\documents\\github\\aix360\\aix360\\algorithms\\glance\\iterative_merges\\iterative_merges.py:1032: FutureWarning: iteritems is deprecated and will be removed in a future version. 
Use .items instead.\n", + " for column_name, value in row['action'].to_frame().T.reset_index(drop=True).iteritems():\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mAction 1 \n", + "\u001b[0m\u001b[1mEducation\u001b[0m = \u001b[31mBachelors\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m13360.410752281923\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m2.17\u001b[39m\n", + "\n", + "\n", + "\u001b[1mAction 2 \n", + "\u001b[0m\u001b[1mOccupation\u001b[0m = \u001b[31mTransport-moving\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m16762.525423024712\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m2.68\u001b[39m\n", + "\n", + "\n", + "\u001b[1mAction 3 \n", + "\u001b[0m\u001b[1mOccupation\u001b[0m = \u001b[31mProf-specialty\u001b[39m \n", + "\u001b[1mCapital-Gain\u001b[0m +\u001b[31m17762.44715992576\u001b[39m \n", + "\n", + "\u001b[1mEffectiveness:\u001b[0m \u001b[32m100.00%\u001b[39m\t\u001b[1mCost:\u001b[0m \u001b[35m2.59\u001b[39m\n", + "\n", + "\n", + "\u001b[1mTOTAL EFFECTIVENESS:\u001b[0m \u001b[32m100.00%\u001b[39m\n", + "\u001b[1mTOTAL COST:\u001b[0m \u001b[35m2.25\u001b[39m\n" + ] + } + ], + "source": [ + "clusters, clusters_res = global_method.explain_group(affected)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## T-GLANCE\n", + "T-GLANCE is the second algorithm of the GLANCE framework that generates global counterfactual explanations using a decision tree-like structure.\n", + "\n", + "In contrast to C-GLANCE where users specify the number of the final actions to be generated, T-GLANCE allows the users to specify the split features when building the decision tree.
\n", + "\n", + "This characteristic of T-GLANCE makes it ideal for finding actions that can serve as subgroup policies, for the subgroups corresponding to the nodes of the tree.\n", + "\n", + "T-GLANCE framework is loaded with:\n", + " - the model to be explained,\n", + " - split_features, the specified features to be used in the splits; if **None**, we provide suggestions based on the permutation score of the features.\n", + "\n", + "\n", + "T-GLANCE is also modular, as the user can specify in which way the actions of each node (subgroup) are computed.
\n", + "This can be either a Local Counterfactual Explanation Method (**local_method** parameter) or a Global Counterfactual Explanation Method (**global_method** parameter)." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 1.94it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.28it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.99it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 2.42it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.54it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.55it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 1.28it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cf_tree = T_GLANCE(_model, \n", + " split_features=None,\n", + " local_method='Dice')\n", + "\n", + "cf_tree.fit(X, y, train_dataset)\n", + "node = cf_tree.partition_group(affected)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "node.display_igraph_jupyter(\n", + " numeric_features=affected.select_dtypes(\n", + " include=[\"number\"]).columns.tolist()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "TOTAL EFFECTIVENESS: 100.00%\n", + "\n", + "TOTAL COST: 4.27\n" + ] + } + ], + "source": [ + "eff, cost, length = cf_tree.cumulative_leaf_actions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## T-GLANCE Output\n", + "The output of the T-GLANCE algorithm consists of a visual representation of the decision tree in which the user can inspect the subgroups created by the chosen split features and the respective actions along with their *Effectiveness* and *Cost*.\n", + "\n", + "*Effectiveness* and *Cost* are computed for the population of each node of the tree, similar to the way they were computed for each final cluster in C-GLANCE.\n", + "\n", + "*Total Effectiveness* and *Total Cost* are computed in the same manner as in C-GLANCE, with the only difference being that the set of actions being evaluated are the ones that correspond to the leaf nodes of the tree." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More specifically, in the above output of T-GLANCE the end user can see that 3 subgroups were created using as split features the **Age** and **Marital-Status**.
\n", + "For example, the user can inspect that the individuals corresponding to the bottom right node belong to the subgroup with **Age <= 43** and **Marital-Status{Never-Married,Separated,Widowed}**.
\n", + "\n", + "By applying the action **{Age:25.789, Relationship: Not-In-Family, Capital-Gain: 50416.753, Capital-Loss: 0.0, Hours-Per-Week: 16.174}** that is showcased at the bottom of the node, all individuals can get the favorable outcome, as the *effectiveness* on this node is **100.00%** with a cost of 10.85.
Value 0.0 for feature Capital-Loss suggests that this action does not alter the specific feature.\n", + "\n", + "As a practical example of the auditing capabilities of T-GLANCE, one can observe the node described previously in contrast to the right child of the root node. More specifically, this node corresponds to individuals with **Age > 43**, with the optimal suggested action being **{Age:0.0, Workclass: Local-Gov, Capital-Gain: 18483,25, Capital-Loss: 0.0, Hours-Per-Week: 0.0}** with *effectiveness* of **100.00%** and cost 2.76.\n", + "\n", + "By comparing the suggested actions for the two subgroups, the user can gain the insight that individuals having **Age > 43** need only to change their workclass and raise their capital-gain while the individuals belonging to **Age <= 43** and **Marital-Status{Never-Married,Seperated,Widowed}** subgroup have to increase their capital-gain much more and also increase their hours-per-week and their age and also change their relationship-status." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aix360", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index eb1101a..9703530 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,16 @@ "lime", "shap==0.42.1", ], + "glance":[ + "numpy==1.23.5", + "pandas==1.5.3", + "scikit-learn==1.5.2", + "dice-ml==0.11", + "tqdm==4.66.1", + "igraph==0.11.4", + "colorama==0.4.6", + "xgboost==2.0.3" + ], } # minimal dependencies in 
install_requires diff --git a/tests/glance/test_KMeans.py b/tests/glance/test_KMeans.py new file mode 100644 index 0000000..0913d83 --- /dev/null +++ b/tests/glance/test_KMeans.py @@ -0,0 +1,64 @@ +import pytest +import numpy as np +from sklearn.datasets import make_blobs +from sklearn.cluster import KMeans +from aix360.algorithms.glance.clustering import KMeansMethod # Replace 'your_module' with the actual module name +from sklearn.exceptions import NotFittedError + +def test_kmeans_initialization(): + """ + Test that KMeansMethod initializes correctly with the given number of clusters and random seed. + """ + num_clusters = 3 + random_seed = 42 + kmeans_method = KMeansMethod(num_clusters=num_clusters, random_seed=random_seed) + + assert kmeans_method.num_clusters == num_clusters + assert kmeans_method.random_seed == random_seed + assert isinstance(kmeans_method.model, KMeans) # Ensure model is an instance of KMeans + + +def test_kmeans_fit(): + """ + Test that the KMeansMethod can fit data properly. + """ + # Create synthetic data + num_clusters = 3 + data, _ = make_blobs(n_samples=100, centers=num_clusters, random_state=42) + + kmeans_method = KMeansMethod(num_clusters=num_clusters, random_seed=42) + kmeans_method.fit(data) + + assert kmeans_method.model.n_clusters == num_clusters + assert hasattr(kmeans_method.model, 'cluster_centers_') # Ensure model has been fit + + +def test_kmeans_predict(): + """ + Test that KMeansMethod can predict cluster assignments for new instances. 
+ """ + # Create synthetic data + num_clusters = 3 + data, labels = make_blobs(n_samples=100, centers=num_clusters, random_state=42) + + kmeans_method = KMeansMethod(num_clusters=num_clusters, random_seed=42) + kmeans_method.fit(data) # Fit the model first + + # Test prediction on new instances + new_instances = np.array([[0, 0], [5, 5], [-5, -5]]) + predictions = kmeans_method.predict(new_instances) + print(predictions.dtype) + + assert len(predictions) == len(new_instances) # Ensure prediction length matches input length + assert all(isinstance(pred, np.integer) for pred in predictions) # Ensure predictions are valid cluster labels + + +def test_kmeans_predict_without_fit(): + """ + Test that predict raises an exception if the model hasn't been fit. + """ + kmeans_method = KMeansMethod(num_clusters=3, random_seed=42) + + with pytest.raises(NotFittedError): # Raises error when model is not fitted + kmeans_method.predict([[0, 0]]) + diff --git a/tests/glance/test_base.py b/tests/glance/test_base.py new file mode 100644 index 0000000..d9c5ed9 --- /dev/null +++ b/tests/glance/test_base.py @@ -0,0 +1,259 @@ +import pytest +import pandas as pd +import numpy as np +from aix360.algorithms.glance.base import ClusteringMethod, LocalCounterfactualMethod, GlobalCounterfactualMethod + + +# Concrete implementation for ClusteringMethod +class SimpleKMeans(ClusteringMethod): + def __init__(self, **kwargs): + super().__init__() # Call parent __init__ (optional but explicit) + self.kwargs = kwargs # Store any passed kwargs + + def fit(self, data: pd.DataFrame): + self.data = data + self.labels = np.random.randint(0, 2, size=len(data)) # Randomly assign clusters (2 clusters) + + def predict(self, instances: pd.DataFrame) -> np.ndarray: + return self.labels # Return the same random labels + + +# Concrete implementation for LocalCounterfactualMethod +class SimpleCounterfactual(LocalCounterfactualMethod): + def __init__(self, **kwargs): + super().__init__() + self.kwargs = kwargs # 
Store any passed kwargs + + def fit(self, **kwargs): + # Store additional fit kwargs + self.fit_kwargs = kwargs + + def explain_instances(self, instances: pd.DataFrame, num_counterfactuals: int) -> pd.DataFrame: + return instances.copy() # Dummy return for testing + + +# Concrete implementation for GlobalCounterfactualMethod +class SimpleGlobalCounterfactual(GlobalCounterfactualMethod): + def __init__(self, **kwargs): + super().__init__(**kwargs) # Call parent __init__ and pass kwargs + self.kwargs = kwargs # Store any passed kwargs + + def fit(self, X, y, **kwargs): + self.X = X + self.y = y + self.fit_kwargs = kwargs # Store any additional kwargs + + def explain_group(self, instances: pd.DataFrame = None) -> pd.DataFrame: + if instances is None: + return self.X # Return training data if no instances provided + return instances.copy() + + +# Sample data for testing +sample_data = pd.DataFrame({ + 'feature1': [1.0, 2.0, 3.0], + 'feature2': [4.0, 5.0, 6.0], +}) + +sample_target = pd.Series([0, 1, 0]) # Example target variable + + +# Test for SimpleKMeans +def test_simple_kmeans(): + kmeans = SimpleKMeans(param1="test") + kmeans.fit(sample_data) + + # Check if labels are assigned correctly + assert isinstance(kmeans, ClusteringMethod) + assert len(kmeans.labels) == len(sample_data) + assert set(kmeans.labels).issubset({0, 1}) # Expect labels to be either 0 or 1 + + predictions = kmeans.predict(sample_data) + assert len(predictions) == len(sample_data) # Predictions should match the number of samples + + # Test that kwargs are passed correctly + assert kmeans.kwargs['param1'] == "test" + + +# Test for SimpleCounterfactual +def test_simple_counterfactual(): + counterfactual = SimpleCounterfactual(param1="test") + counterfactual.fit(param2="fit_param") + + cf = counterfactual.explain_instances(sample_data, num_counterfactuals=2) + + # Check if the counterfactuals are returned correctly + assert isinstance(counterfactual, LocalCounterfactualMethod) + assert 
cf.equals(sample_data) # For this stub implementation, they should be the same + + # Test that kwargs are passed correctly + assert counterfactual.kwargs['param1'] == "test" + assert counterfactual.fit_kwargs['param2'] == "fit_param" + + +# Test for SimpleGlobalCounterfactual +def test_simple_global_counterfactual(): + global_cf = SimpleGlobalCounterfactual(param1="test") + global_cf.fit(sample_data, sample_target, param2="fit_param") + + # Test case when instances are passed + cf_group = global_cf.explain_group(sample_data) + assert cf_group.equals(sample_data) # For this stub implementation, they should be the same + assert isinstance(global_cf, GlobalCounterfactualMethod) + # Test case when no instances are passed + cf_group_default = global_cf.explain_group() + assert cf_group_default.equals(sample_data) # Should return the training data + + # Test that kwargs are passed correctly + assert global_cf.kwargs['param1'] == "test" + assert global_cf.fit_kwargs['param2'] == "fit_param" + + +# Test abstract class instantiation errors +def test_abstract_classes_instantiation(): + with pytest.raises(TypeError): + ClusteringMethod() # Should raise TypeError because it is abstract + + with pytest.raises(TypeError): + LocalCounterfactualMethod() # Should raise TypeError because it is abstract + + with pytest.raises(TypeError): + GlobalCounterfactualMethod() # Should raise TypeError because it is abstract + + +# # Test edge case for empty data +# def test_empty_data(): +# empty_data = pd.DataFrame() + +# kmeans = SimpleKMeans() +# kmeans.fit(empty_data) +# assert len(kmeans.predict(empty_data)) == 0 # Ensure it handles empty data correctly + +# counterfactual = SimpleCounterfactual() +# cf = counterfactual.explain_instances(empty_data, num_counterfactuals=2) +# assert cf.empty # Check that the result is an empty DataFrame + +# global_cf = SimpleGlobalCounterfactual() +# global_cf.fit(empty_data, pd.Series()) +# cf_group = global_cf.explain_group(empty_data) +# assert 
cf_group.empty # Check that the result is an empty DataFrame + +# def test_data_with_missing_values(): +# data_with_nan = pd.DataFrame({ +# 'feature1': [1.0, np.nan, 3.0], +# 'feature2': [4.0, 5.0, np.nan], +# }) + +# # Test for ClusteringMethod +# kmeans = SimpleKMeans() +# kmeans.fit(data_with_nan) +# predictions = kmeans.predict(data_with_nan) +# assert len(predictions) == len(data_with_nan) + +# # Test for LocalCounterfactualMethod +# counterfactual = SimpleCounterfactual() +# cf = counterfactual.explain_instances(data_with_nan, num_counterfactuals=2) +# assert cf.equals(data_with_nan) + +# # Test for GlobalCounterfactualMethod +# global_cf = SimpleGlobalCounterfactual() +# global_cf.fit(data_with_nan, pd.Series([0, 1, 0])) +# cf_group = global_cf.explain_group(data_with_nan) +# assert cf_group.equals(data_with_nan) + +# def test_data_with_mixed_types(): +# mixed_data = pd.DataFrame({ +# 'feature1': [1, 2, 3], # integers +# 'feature2': [1.5, 2.5, 3.5], # floats +# 'feature3': ['a', 'b', 'c'] # strings +# }) + +# # Test for ClusteringMethod +# kmeans = SimpleKMeans() +# kmeans.fit(mixed_data) +# predictions = kmeans.predict(mixed_data) +# assert len(predictions) == len(mixed_data) + +# # Test for LocalCounterfactualMethod +# counterfactual = SimpleCounterfactual() +# cf = counterfactual.explain_instances(mixed_data, num_counterfactuals=2) +# assert cf.equals(mixed_data) + +# # Test for GlobalCounterfactualMethod +# global_cf = SimpleGlobalCounterfactual() +# global_cf.fit(mixed_data, pd.Series([0, 1, 0])) +# cf_group = global_cf.explain_group(mixed_data) +# assert cf_group.equals(mixed_data) + +# def test_counterfactual_num_values(): +# # Test with num_counterfactuals=0 +# counterfactual = SimpleCounterfactual() +# cf_zero = counterfactual.explain_instances(sample_data, num_counterfactuals=0) +# assert cf_zero.equals(sample_data) # Expected to return the same data + +# # Test with negative num_counterfactuals +# cf_negative = 
counterfactual.explain_instances(sample_data, num_counterfactuals=-1) +# assert cf_negative.equals(sample_data) # Should handle gracefully (same data) + +# def test_single_instance(): +# single_instance = pd.DataFrame({'feature1': [1.0], 'feature2': [4.0]}) + +# # Test for ClusteringMethod +# kmeans = SimpleKMeans() +# kmeans.fit(single_instance) +# predictions = kmeans.predict(single_instance) +# assert len(predictions) == 1 + +# # Test for LocalCounterfactualMethod +# counterfactual = SimpleCounterfactual() +# cf = counterfactual.explain_instances(single_instance, num_counterfactuals=1) +# assert cf.equals(single_instance) + +# # Test for GlobalCounterfactualMethod +# global_cf = SimpleGlobalCounterfactual() +# global_cf.fit(single_instance, pd.Series([0])) +# cf_group = global_cf.explain_group(single_instance) +# assert cf_group.equals(single_instance) + +# def test_fit_with_kwargs(): +# # ClusteringMethod with extra kwargs +# kmeans = SimpleKMeans(param1="test") +# kmeans.fit(sample_data) +# assert kmeans.kwargs['param1'] == "test" + +# # LocalCounterfactualMethod with extra kwargs +# counterfactual = SimpleCounterfactual(param1="test") +# counterfactual.fit(param2="fit_param", extra_param="extra") +# assert counterfactual.fit_kwargs['extra_param'] == "extra" + +# # GlobalCounterfactualMethod with extra kwargs +# global_cf = SimpleGlobalCounterfactual(param1="test") +# global_cf.fit(sample_data, sample_target, param2="fit_param", another_param="another") +# assert global_cf.fit_kwargs['another_param'] == "another" + +# def test_global_counterfactual_with_none(): +# global_cf = SimpleGlobalCounterfactual() +# global_cf.fit(sample_data, sample_target) + +# # Test case with instances as None +# cf_group_default = global_cf.explain_group(None) +# assert cf_group_default.equals(sample_data) # Should return training data + +# def test_predict_with_empty_data(): +# empty_data = pd.DataFrame() + +# kmeans = SimpleKMeans() +# kmeans.fit(empty_data) +# predictions = 
kmeans.predict(empty_data) +# assert len(predictions) == 0 # Ensure predict returns no labels for empty data +# def test_clustering_method_abstract_instantiation(): +# with pytest.raises(TypeError): +# ClusteringMethod() + +# def test_local_counterfactual_method_abstract_instantiation(): +# with pytest.raises(TypeError): +# LocalCounterfactualMethod() + +# def test_global_counterfactual_method_abstract_instantiation(): +# with pytest.raises(TypeError): +# GlobalCounterfactualMethod() diff --git a/tests/glance/test_counterfactual_costs.py b/tests/glance/test_counterfactual_costs.py new file mode 100644 index 0000000..9471b1e --- /dev/null +++ b/tests/glance/test_counterfactual_costs.py @@ -0,0 +1,52 @@ +import pandas as pd +import numpy as np +import pytest +from aix360.algorithms.glance.counterfactual_costs import build_dist_func_dataframe +# Assume build_dist_func_dataframe is imported from the relevant module + +def test_build_dist_func_dataframe(): + # Setup input DataFrame with numerical and categorical columns + X = pd.DataFrame({ + 'age': [25, 30, 22, 45], + 'salary': [50000, 60000, 55000, 53000], + 'gender': ['Male', 'Female', 'Female', 'Male'] + }) + + # Specify numerical and categorical columns + numerical_columns = ['age', 'salary'] + categorical_columns = ['gender'] + + # Build the distance function + dist_func = build_dist_func_dataframe(X, numerical_columns, categorical_columns, n_bins=5) + + # Generate test DataFrames + X1 = pd.DataFrame({ + 'age': [26, 31, 23, 46], + 'salary': [51000, 61000, 54000, 52000], + 'gender': ['Male', 'Female', 'Male', 'Female'] + }) + + X2 = pd.DataFrame({ + 'age': [25, 30, 22, 45], + 'salary': [50000, 60000, 55000, 53000], + 'gender': ['Female', 'Female', 'Female', 'Male'] + }) + + # Calculate expected distances manually + feat_intervals = { + col: ((max(X[col]) - min(X[col])) / 5) for col in numerical_columns + } + + + expected_distances = np.array([ + abs(26 - 25) / feat_intervals['age'] + abs(51000 - 50000) / 
def create_sample_data():
    """
    Build the deterministic sample frame used by the counterfactual-tree tests.

    Returns:
    - data (pd.DataFrame): 100 rows with columns ``feature_0`` .. ``feature_4``
      (uniform draws) and a ``target`` column of random 0/1 integers.
    """
    n_rows, n_features = 100, 5
    # Global seed: the two draws below must stay in this order to reproduce the data.
    np.random.seed(0)
    feature_block = np.random.rand(n_rows, n_features)
    labels = np.random.randint(0, 2, size=n_rows)

    feature_names = [f'feature_{idx}' for idx in range(n_features)]
    frame = pd.DataFrame(feature_block, columns=feature_names)
    frame['target'] = labels
    return frame
def test_counterfactual_tree_fit(fitted_model):
    """Fit T_GLANCE with default and explicit settings and check the stored feature lists."""
    frame = create_sample_data()

    def fit_args():
        # One fresh ``drop`` per ``fit`` call, mirroring a caller that rebuilds X each time.
        return frame.drop(columns='target'), frame['target'], frame

    # split_features left unset: two split features are expected after fitting.
    default_tree = T_GLANCE(model=fitted_model)
    default_tree.fit(*fit_args())
    assert len(default_tree.split_features) == 2

    # Explicit number of split features.
    sized_tree = T_GLANCE(model=fitted_model, split_features=3)
    sized_tree.fit(*fit_args())
    assert len(sized_tree.split_features) == 3

    # Refit the same tree with an explicit numeric feature list.
    all_columns = ['feature_' + str(idx) for idx in range(5)]
    sized_tree.fit(*fit_args(), numeric_features_names=all_columns)
    assert sized_tree.numerical_features_names == all_columns

    # Refit once more with an explicit categorical feature list.
    first_column_only = ['feature_0']
    sized_tree.fit(*fit_args(), categorical_features_names=first_column_only)
    assert sized_tree.categorical_features_names == first_column_only
def test_one_hot_encode(sample_data):
    """Check type, column names, shape and first row of the one-hot encoded sample frame."""
    encoded = _one_hot_encode(sample_data, ['feature1'])
    assert isinstance(encoded, pd.DataFrame)

    # Every expected output column must be present.
    wanted = [
        'ohe__feature1_A',
        'ohe__feature1_B',
        'ohe__feature1_C',
        'remainder__feature2',
        'remainder__feature3',
    ]
    absent = [name for name in wanted if name not in encoded.columns]
    assert not absent

    # 5 rows and 5 columns.
    assert encoded.shape == (5, 5)

    # First sample row is ('A', 1, 5.0): category A set, the other two unset.
    first_row = encoded.iloc[0].values
    for actual, expected in zip(first_row, [1.0, 0.0, 0.0, 1, 5.0]):
        assert actual == expected
are correctly assigned + for cluster_id, cluster_df in clusters.items(): + assert isinstance(cluster_df, pd.DataFrame) + assert all(cluster_df.index.isin(sample_data.index)) + +@pytest.fixture +def clusters_data(): + return { + 0: pd.DataFrame({'feature1': [1, 2], 'feature2': [3, 4]}), + 1: pd.DataFrame({'feature1': [5, 6], 'feature2': [7, 8]}), + 2: pd.DataFrame({'feature1': [9], 'feature2': [10]}), # Smallest cluster + } + +@pytest.fixture +def centroids_data(): + return { + 0: pd.DataFrame({'feature1': [1.5], 'feature2': [3.5]}), + 1: pd.DataFrame({'feature1': [5.5], 'feature2': [7.5]}), + 2: pd.DataFrame({'feature1': [9], 'feature2': [10]}), + } + +@pytest.fixture +def explanations_data(): + return { + 0: pd.DataFrame({'explanation': [0.1, 0.2]}), + 1: pd.DataFrame({'explanation': [0.3, 0.4]}), + 2: pd.DataFrame({'explanation': [0.5]}), + } + +@pytest.fixture +def distance_function(): + # A simple mock distance function + def mock_distance(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.Series: + return pd.Series(np.random.rand(df1.shape[0])) # Random distances + return mock_distance + +def test_find_candidate_clusters(clusters_data, centroids_data, explanations_data, distance_function): + heuristic_weights = (0.5, 0.5) + + # Call the function with mock data + candidate_cluster = _find_candidate_clusters( + clusters=clusters_data, + cluster_centroids=centroids_data, + explanations_centroid=explanations_data, + heuristic_weights=heuristic_weights, + dist_func_dataframe=distance_function + ) + + # Validate the output + assert isinstance(candidate_cluster, tuple) + assert len(candidate_cluster) == 2 + + # The smallest cluster is expected to be cluster 2 (with only one instance) + assert candidate_cluster[0] == 2 # Expected smallest cluster + assert candidate_cluster[1] in [0, 1] # The candidate cluster should be either cluster 0 or cluster 1 + + # Additional checks can be added based on your specific expectations + # For instance, we could check that the candidate 
def strip_ansi_codes(text: str) -> str:
    """
    Remove ANSI escape sequences from a string.

    Matches ESC ``[`` followed by any run of characters in the ``0``-``?``
    range and terminated by ``m`` or ``K``; all other text is left unchanged.

    Parameters:
    - text (str): Text that may contain ANSI escape sequences.

    Returns:
    - cleaned (str): ``text`` with every matching sequence removed.
    """
    # The docstring must be the first statement; the import stays function-local.
    import re

    # The explicit ``9`` and ``;`` are already covered by the ``0-?`` range.
    ansi_escape = re.compile(r'\x1B\[[0-?9;]*[mK]')
    return ansi_escape.sub('', text)
class MockModel:
    """Stand-in model whose prediction depends only on each row's sum."""

    def predict(self, X):
        """
        Predict 1 for rows whose values sum to more than 1, otherwise 0.

        Parameters:
        - X: Two-dimensional input supporting ``sum(axis=1)``.

        Returns:
        - Integer 0/1 labels, one per row.
        """
        row_totals = X.sum(axis=1)
        above_threshold = row_totals > 1
        return above_threshold.astype(int)
'feature1': [0, 1, 2], + 'feature2': [1, 0, 1] + }) + + # Sample candidate actions DataFrame + candidate_actions = pd.DataFrame({ + 'feature1': [1, 0], + 'feature2': [0, 2] + }) + + # Define numerical and categorical features + numerical_features_names = ['feature1', 'feature2'] + categorical_features_names = [] # Assuming no categorical features for this test + + # Create a mock model instance + model = MockModel() + + # Call the _select_action_max_eff function + max_n_flipped, recourse_cost_sum, best_action = _select_action_max_eff( + model=model, + instances=instances, + candidate_actions=candidate_actions, + dist_func_dataframe=mock_distance_function, + numerical_features_names=numerical_features_names, + categorical_features_names=categorical_features_names, + num_actions=1, + ) + + # Expected values + expected_max_n_flipped = 3 # Based on the mock model predictions + expected_recourse_cost_sum = 3 + + expected_best_action = pd.Series([1, 0], index=['feature1', 'feature2'], name=0) + expected_best_action = expected_best_action.astype(np.int64) + best_action = best_action.astype(np.int64) + + assert max_n_flipped == expected_max_n_flipped + assert recourse_cost_sum == expected_recourse_cost_sum + pd.testing.assert_series_equal(best_action, expected_best_action) + +def test_select_action_low_cost(): + + model = MockModel() + instances = pd.DataFrame({ + 'feature1': [1, 2, 3], + 'feature2': [0, 5, 6] + }) + + # Example cluster instances DataFrame + cluster_instances = pd.DataFrame({ + 'feature1': [1, 2], + 'feature2': [4, 5] + }) + + candidate_actions = pd.DataFrame({ + 'feature1': [1, 2], + 'feature2': [3, 4],}) + + numerical_features_names = ['feature1', 'feature2'] + categorical_features_names = [] + action_threshold = 0.5 + num_low_cost = 1 + inv_total_clusters = 1 + + # Mock the dist_func_dataframe + dist_func_dataframe = mock_distance_function + + + # Call the function under test + n_flipped, min_recourse_cost_sum, best_action = _select_action_low_cost( + 
model=model, + instances=instances, + cluster_instances=cluster_instances, + candidate_actions=candidate_actions, + dist_func_dataframe=dist_func_dataframe, + numerical_features_names=numerical_features_names, + categorical_features_names=categorical_features_names, + action_threshold=action_threshold, + num_low_cost=num_low_cost, + inv_total_clusters=inv_total_clusters, + ) + + assert n_flipped == 2 + assert min_recourse_cost_sum == 8 + pd.testing.assert_series_equal(best_action, pd.Series([1, 3], index=['feature1', 'feature2'], name=0)) + +class TestActionFakeCost(unittest.TestCase): + + def test_basic_functionality(self): + # Sample action with numerical and categorical features + action = pd.Series({ + 'feature1': 10, + 'feature2': 20, + 'cat_feature1': "-", + 'cat_feature2': "value" + }) + numerical_features_names = ['feature1', 'feature2'] + categorical_features_names = ['cat_feature1', 'cat_feature2'] + + result = action_fake_cost(action, numerical_features_names, categorical_features_names) + expected = 10 + 20 + 1 # sum of numerical + count of non "-" in categorical + self.assertEqual(result, expected) + + def test_no_categorical_features(self): + # Action with only numerical features + action = pd.Series({ + 'feature1': 15, + 'feature2': 25 + }) + numerical_features_names = ['feature1', 'feature2'] + categorical_features_names = [] # No categorical features + + result = action_fake_cost(action, numerical_features_names, categorical_features_names) + expected = 15 + 25 # Just the sum of numerical features + self.assertEqual(result, expected) + + def test_all_categorical_features(self): + # Action with categorical features only + action = pd.Series({ + 'cat_feature1': "-", + 'cat_feature2': "-" + }) + numerical_features_names = [] # No numerical features + categorical_features_names = ['cat_feature1', 'cat_feature2'] + + result = action_fake_cost(action, numerical_features_names, categorical_features_names) + expected = 0 # No numerical features and both 
def mock_apply_action_pandas(instances, action, numeric_features_names, categorical_features_names, categorical_no_action_token):
    """
    Apply an action to a copy of the instances by shifting numeric features.

    Parameters:
    - instances (pd.DataFrame): Instances to modify; the input is not mutated.
    - action: Mapping-like object (e.g. pd.Series) from feature name to the amount to add.
    - numeric_features_names: Names of the numeric features to consider.
    - categorical_features_names: Unused by this mock.
    - categorical_no_action_token: Unused by this mock.

    Returns:
    - shifted (pd.DataFrame): Copy of ``instances`` with ``action[feature]`` added to
      every listed numeric feature that appears in ``action``.
    """
    shifted = instances.copy()
    # Only features that the action actually mentions are touched.
    applicable = [name for name in numeric_features_names if name in action]
    for name in applicable:
        shifted[name] += action[name]
    return shifted
def test_set_features_names():
    """
    Check how ``C_GLANCE._set_features_names`` splits columns into numerical
    and categorical names for three combinations of explicit lists.
    """
    # Sample data for testing
    sample_X = pd.DataFrame({
        'feature1': [1.0, 2.0, 3.0],
        'feature2': [4.0, 5.0, 6.0]
    })

    model = MockModel()
    im = C_GLANCE(model=model)

    # Neither list given: both columns are expected to be numerical.
    numerical_names, categorical_names = im._set_features_names(sample_X, None, None)
    assert numerical_names == ['feature1', 'feature2']
    assert categorical_names == []

    # Only the numerical list given: the remaining column is expected to be categorical.
    numerical_names, categorical_names = im._set_features_names(sample_X, ['feature1'], None)
    assert numerical_names == ['feature1']
    assert categorical_names == ['feature2']

    # Only the categorical list given: the remaining column is expected to be numerical.
    numerical_names, categorical_names = im._set_features_names(sample_X, None, ['feature2'])
    assert numerical_names == ['feature1']
    assert categorical_names == ['feature2']
def create_sample_data_random(num_samples=100):
    """
    Build a seeded random frame with five uniform features and a binary outcome.

    Parameters:
    - num_samples (int): Number of rows to generate.

    Returns:
    - data (pd.DataFrame): Columns ``feature_0`` .. ``feature_4`` followed by ``outcome``.
    """
    np.random.seed(42)  # global seed; draw order below fixes the generated values
    columns = {}
    for idx in range(5):
        columns[f'feature_{idx}'] = np.random.rand(num_samples)
    columns['outcome'] = np.random.choice([0, 1], num_samples)
    return pd.DataFrame(columns)
nn_method = RandomSampling(model, n_most_important=3, n_categorical_most_frequent=2, + numerical_features=[f'feature_{i}' for i in range(5)], + categorical_features=[]) + + # Fit the method + nn_method.fit(data.drop(columns='outcome'), data['outcome']) + + # Test explaining instances + instances = data.sample(5).drop(columns='outcome') # Randomly select 5 instances + counterfactuals = nn_method.explain_instances(instances, num_counterfactuals=3) + + # Validate output + assert counterfactuals.shape[0] == 15 # 5 instances * 3 counterfactuals + assert set(counterfactuals.columns) == set(data.columns[:-1]) # Check if columns match original features + +def test_invalid_explain_input_shape(setup_data): + model, data = setup_data + + nn_method = RandomSampling(model, n_most_important=3, n_categorical_most_frequent=2, + numerical_features=[f'feature_{i}' for i in range(5)], + categorical_features=[]) + nn_method.fit(data.drop(columns='outcome'), data['outcome']) + + # Check that ValueError is raised for empty DataFrame + with pytest.raises(ValueError, match="Input must be a single row DataFrame."): + nn_method.explain(pd.DataFrame(), num_counterfactuals=3) + +def test_explain_instances_with_all_one_class(setup_data): + model, data = setup_data + + # Create a subset with only one class + all_zeros = data[data['outcome'] == 0].sample(5).drop(columns='outcome') + nn_method = RandomSampling(model, n_most_important=3, n_categorical_most_frequent=2, + numerical_features=[f'feature_{i}' for i in range(5)], + categorical_features=[]) + nn_method.fit(data.drop(columns='outcome'), data['outcome']) + + # Explain instances where all instances belong to one class + counterfactuals = nn_method.explain_instances(all_zeros, num_counterfactuals=3) + + # Assert that the output is handled properly, could be empty or valid based on implementation + assert isinstance(counterfactuals, pd.DataFrame) + +def test_explain_instances_with_insufficient_valid_counterfactuals(setup_data): + model, data 
= setup_data + + # Testing behavior when the method should return fewer counterfactuals than requested + nn_method = RandomSampling(model, n_most_important=3, n_categorical_most_frequent=2, + numerical_features=[f'feature_{i}' for i in range(5)], + categorical_features=[]) + nn_method.fit(data.drop(columns='outcome'), data['outcome']) + + few_instances = data.sample(1).drop(columns='outcome') # Only one instance + counterfactuals = nn_method.explain_instances(few_instances, num_counterfactuals=10) + + # The output should not exceed the number requested + assert counterfactuals.shape[0] <= 10 +# def test_random_sampling_method(): +# data = create_sample_data() + +# # Initialize model and methods +# model = LogisticRegression() +# model.fit(data.drop(columns='outcome'), data['outcome']) # Fit model to data +# nn_method = RandomSampling(model, 15, 20, numerical_features=[f'feature_{i}' for i in range(5)], categorical_features=[]) +# nn_method.fit(data.drop(columns='outcome'), data['outcome']) + +# instances = data.sample(5).drop(columns='outcome') # Randomly select 5 instances +# counterfactuals = nn_method.explain_instances(instances, num_counterfactuals=3) +# # Validate output +# assert counterfactuals.shape[0] == 15 # 5 instances * 3 counterfactuals +# assert set(counterfactuals.columns) == set(data.columns[:-1]) # Check if columns match original features + +def test_dice_method(): + data = create_sample_data() + + # Initialize model and methods + model = LogisticRegression() + model.fit(data.drop(columns='outcome'), data['outcome']) # Fit model to data + dice_method = DiceMethod() + dice_method.fit(model,data,'outcome',[f'feature_{i}' for i in range(5)],[f'feature_{i}' for i in range(2)]) + instances = data.sample(5).drop(columns='outcome') + counterfactuals = dice_method.explain_instances(instances, num_counterfactuals=3) + # Validate output + assert counterfactuals.shape[0] == 15 # 5 instances * 3 counterfactuals + assert set(counterfactuals.columns) == 
class TestNodeClass(unittest.TestCase):
    """Tests for Node construction, child attachment and leaf-action collection."""

    def setUp(self):
        """Create a root node and two candidate child nodes for each test."""
        self.root = Node(
            split_feature="feature1",
            actions=[{"action": "A"}],
            effectiveness=5,
            cost=10,
            size=100,
        )
        self.child1 = Node(
            split_feature="feature2",
            actions=[{"action": "B"}],
            effectiveness=3,
            cost=5,
            size=50,
        )
        self.child2 = Node(
            split_feature="feature3",
            actions=[{"action": "C"}],
            effectiveness=2,
            cost=8,
            size=25,
        )

    def test_node_initialization(self):
        """A new node exposes its constructor arguments and an empty children mapping."""
        fresh = Node(
            split_feature="feature1",
            actions=[{"action": "A"}],
            effectiveness=5,
            cost=10,
            size=100,
        )
        expectations = [
            ("split_feature", "feature1"),
            ("actions", [{"action": "A"}]),
            ("effectiveness", 5),
            ("cost", 10),
            ("size", 100),
            ("children", {}),
        ]
        for attr_name, expected in expectations:
            self.assertEqual(getattr(fresh, attr_name), expected)

    def test_add_child(self):
        """Added children are retrievable under the tuple form of their subgroup."""
        attachments = [([1, 2], self.child1), ([3, 4], self.child2)]
        for subgroup, child in attachments:
            self.root.add_child(subgroup, child)

        # Presence first, then identity, in the same order as the attachments.
        for subgroup, _ in attachments:
            self.assertIn(tuple(subgroup), self.root.children)
        for subgroup, child in attachments:
            self.assertEqual(self.root.children[tuple(subgroup)], child)

    def test_return_leafs_actions(self):
        """With two childless children attached, both children's actions are returned."""
        for subgroup, child in (([1, 2], self.child1), ([3, 4], self.child2)):
            self.root.add_child(subgroup, child)

        collected = self.root.return_leafs_actions()
        self.assertEqual(len(collected), 2)
        for leaf_action in ({"action": "B"}, {"action": "C"}):
            self.assertIn(leaf_action, collected)

    def test_return_leafs_actions_with_nested_tree(self):
        """In a root -> child1 -> grandchild chain only the grandchild's action is returned."""
        grandchild = Node(
            split_feature="feature4",
            actions=[{"action": "D"}],
            effectiveness=1,
            cost=3,
            size=15,
        )
        self.child1.add_child([5, 6], grandchild)
        self.root.add_child([1, 2], self.child1)

        self.assertEqual(self.root.return_leafs_actions(), [{"action": "D"}])
+ self.root = Node( + split_feature="feature1", + actions=[pd.Series({"action": "A"})], # Use Pandas Series + effectiveness=5, + cost=10, + size=100 + ) + self.child1 = Node( + split_feature="feature2", + actions=[pd.Series({"action": "B"})], # Use Pandas Series + effectiveness=3, + cost=5, + size=50 + ) + self.child2 = Node( + split_feature="feature3", + actions=[pd.Series({"action": "C"})], # Use Pandas Series + effectiveness=2, + cost=8, + size=25 + ) + + def test_to_igraph_structure(self): + """Test if the iGraph object is correctly created from the node structure.""" + # Add children to the root node. + self.root.add_child([1, 2], self.child1) + self.root.add_child([3, 4], self.child2) + + # Convert the node tree to iGraph. + g = self.root.to_igraph() + + # Check if the correct number of vertices is created. + self.assertEqual(len(g.vs), 3) # 1 root node + 2 child nodes. + + # Check if the correct number of edges is created. + self.assertEqual(len(g.es), 2) # 2 edges from root to its children. + + # Check the vertex labels. + vertex_labels = g.vs["label"] + self.assertTrue(all("action" in label for label in vertex_labels)) + + # Check the edge labels. + edge_labels = g.es["label"] + self.assertIn("feature1", edge_labels[0]) # Root should have split on 'feature1'. + + # Check the labels of the edges. + edge_labels = g.es["label"] + self.assertEqual(edge_labels[0], "feature1 (1, 2)") + self.assertEqual(edge_labels[1], "feature1 (3, 4)") + + def test_pre_order_traversal_node_ids(self): + """Test if pre-order traversal correctly assigns node IDs.""" + # Add children to the root node. + self.root.add_child([1, 2], self.child1) + self.root.add_child([3, 4], self.child2) + + # Traverse the tree and assign IDs. + self.root.to_igraph() + + # Check if the nodes were assigned unique IDs in pre-order fashion. 
+ self.assertEqual(self.root.id, 0) + self.assertEqual(self.child1.id, 1) + self.assertEqual(self.child2.id, 2) + + + + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/glance/test_phase2.py b/tests/glance/test_phase2.py new file mode 100644 index 0000000..5566019 --- /dev/null +++ b/tests/glance/test_phase2.py @@ -0,0 +1,116 @@ +import pytest +import pandas as pd +import numpy as np +from aix360.algorithms.glance.iterative_merges.phase2 import generate_cluster_centroid_explanations +from aix360.algorithms.glance.utils.action import extract_actions_pandas +from aix360.algorithms.glance.utils.centroid import centroid_pandas +from aix360.algorithms.glance.base import LocalCounterfactualMethod +from unittest.mock import MagicMock + +@pytest.fixture +def setup_cluster_data(): + # Create synthetic cluster centroids + cluster_centroids = { + 0: pd.DataFrame(np.random.rand(1, 5), columns=[f'feature_{i}' for i in range(5)]), + 1: pd.DataFrame(np.random.rand(1, 5), columns=[f'feature_{i}' for i in range(5)]) + } + numerical_features_names = [f'feature_{i}' for i in range(3)] + categorical_features_names = [f'feature_{i}' for i in range(3, 5)] + return cluster_centroids, numerical_features_names, categorical_features_names + + +@pytest.fixture +def setup_mock_cf_generator(): + # Mock the LocalCounterfactualMethod and its explain_instances method + cf_generator = MagicMock(spec=LocalCounterfactualMethod) + + # Simulate counterfactual generation returning valid data + def mock_explain_instances(instances, num_counterfactuals): + return pd.DataFrame( + np.random.rand(num_counterfactuals, instances.shape[1]), + columns=instances.columns + ) + + cf_generator.explain_instances.side_effect = mock_explain_instances + return cf_generator + + +def test_generate_cluster_centroid_explanations_basic(setup_cluster_data, setup_mock_cf_generator): + cluster_centroids, numerical_features_names, categorical_features_names = setup_cluster_data + cf_generator = 
setup_mock_cf_generator + + num_local_counterfactuals = 3 + + cluster_explanations, cluster_expl_actions, explanations_centroid = generate_cluster_centroid_explanations( + cluster_centroids=cluster_centroids, + cf_generator=cf_generator, + num_local_counterfactuals=num_local_counterfactuals, + numerical_features_names=numerical_features_names, + categorical_features_names=categorical_features_names + ) + + # Test the shape and type of returned cluster_explanations + assert isinstance(cluster_explanations, dict) + assert len(cluster_explanations) == len(cluster_centroids) + for cluster_id, cf in cluster_explanations.items(): + assert isinstance(cf, pd.DataFrame) + assert cf.shape[0] == num_local_counterfactuals + + # Test the cluster_expl_actions are returned and not empty + assert isinstance(cluster_expl_actions, dict) + assert len(cluster_expl_actions) == len(cluster_centroids) + + # Test the centroid calculations + assert isinstance(explanations_centroid, dict) + assert len(explanations_centroid) == len(cluster_centroids) + for cluster_id, centroid in explanations_centroid.items(): + # Fix: Expecting DataFrame instead of Series + assert isinstance(centroid, pd.DataFrame) # Update to DataFrame + assert centroid.shape[1] == len(numerical_features_names) + len(categorical_features_names) + + +def test_generate_cluster_centroid_explanations_empty_counterfactuals(setup_cluster_data): + cluster_centroids, numerical_features_names, categorical_features_names = setup_cluster_data + + # Mock the LocalCounterfactualMethod to return empty DataFrames for counterfactuals + cf_generator = MagicMock(spec=LocalCounterfactualMethod) + cf_generator.explain_instances.return_value = pd.DataFrame() + + num_local_counterfactuals = 3 + + with pytest.raises(ValueError, match="No counterfactuals found for any of the centroids."): + generate_cluster_centroid_explanations( + cluster_centroids=cluster_centroids, + cf_generator=cf_generator, + 
num_local_counterfactuals=num_local_counterfactuals, + numerical_features_names=numerical_features_names, + categorical_features_names=categorical_features_names + ) + + +def test_generate_cluster_centroid_explanations_incorrect_num_counterfactuals(setup_cluster_data, setup_mock_cf_generator): + cluster_centroids, numerical_features_names, categorical_features_names = setup_cluster_data + cf_generator = setup_mock_cf_generator + + # Mock the explain_instances method to return an incorrect number of counterfactuals + def mock_explain_instances(instances, num_counterfactuals): + return pd.DataFrame(np.random.rand(2, instances.shape[1]), columns=instances.columns) # Only return 2 cfs + + cf_generator.explain_instances.side_effect = mock_explain_instances + + num_local_counterfactuals = 3 + + cluster_explanations, cluster_expl_actions, explanations_centroid = generate_cluster_centroid_explanations( + cluster_centroids=cluster_centroids, + cf_generator=cf_generator, + num_local_counterfactuals=num_local_counterfactuals, + numerical_features_names=numerical_features_names, + categorical_features_names=categorical_features_names + ) + + # Test the shape and type of returned cluster_explanations + assert isinstance(cluster_explanations, dict) + assert len(cluster_explanations) == len(cluster_centroids) + for cluster_id, cf in cluster_explanations.items(): + assert isinstance(cf, pd.DataFrame) + assert cf.shape[0] != num_local_counterfactuals \ No newline at end of file diff --git a/tests/glance/test_utils.py b/tests/glance/test_utils.py new file mode 100644 index 0000000..efac547 --- /dev/null +++ b/tests/glance/test_utils.py @@ -0,0 +1,420 @@ +import pytest +import numpy as np +import pandas as pd +from statistics import multimode +from aix360.algorithms.glance.clustering import KMeansMethod # Adjust the import based on your module structure +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.compose import ColumnTransformer +from 
sklearn.pipeline import Pipeline +from aix360.algorithms.glance.utils.centroid import centroid_pandas,centroid_numpy # Replace 'your_module' with the actual module name +from aix360.algorithms.glance.utils.action import apply_action_numpy,apply_action_pandas,actions_mean_pandas,apply_actions_pandas_rows,extract_actions_pandas +from aix360.algorithms.glance.utils.metadata_requests import _decide_cluster_method,_decide_local_cf_method +from aix360.algorithms.glance.local_cfs import DiceMethod, NearestNeighborMethod, RandomSampling +from xgboost import XGBClassifier + +def test_centroid_pandas(): + """ + Test the centroid_pandas function for numerical and categorical columns. + """ + data = { + "age": [25, 30, 22, 35, 40], + "salary": [50000, 60000, 55000, 45000, 70000], + "gender": ["Male", "Female", "Female", "Male", "Female"], + } + df = pd.DataFrame(data) + + numerical_columns = ["age", "salary"] + categorical_columns = ["gender"] + + centroid = centroid_pandas(df, numerical_columns, categorical_columns) + + # Expected centroid values + expected_centroid = pd.DataFrame({ + "age": [30.4], # Mean of [25, 30, 22, 35, 40] + "salary": [56000.0], # Mean of [50000, 60000, 55000, 45000, 70000] + "gender": ["Female"], # Mode of ['Male', 'Female', 'Female', 'Male', 'Female'] + }) + + pd.testing.assert_frame_equal(centroid, expected_centroid) + + +def test_centroid_pandas_no_categorical_columns(): + """ + Test centroid_pandas when there are no categorical columns. 
+ """ + data = { + "age": [25, 30, 22, 35, 40], + "salary": [50000, 60000, 55000, 45000, 70000], + } + df = pd.DataFrame(data) + + numerical_columns = ["age", "salary"] + categorical_columns = [] + + centroid = centroid_pandas(df, numerical_columns, categorical_columns) + + # Expected centroid values (just the mean of the numerical columns) + expected_centroid = pd.DataFrame({ + "age": [30.4], # Mean of [25, 30, 22, 35, 40] + "salary": [56000.0], # Mean of [50000, 60000, 55000, 45000, 70000] + }) + + pd.testing.assert_frame_equal(centroid, expected_centroid) + +def test_centroid_numpy(): + """ + Test the centroid_numpy function for numerical and categorical columns. + """ + data = np.array([ + [25, 50000, 0], # 0 = Male + [30, 60000, 1], # 1 = Female + [22, 55000, 1], + [35, 45000, 0], + [40, 70000, 1] + ]) + + numerical_columns = [0, 1] # age and salary + categorical_columns = [2] # gender + + centroid = centroid_numpy(data, numerical_columns, categorical_columns) + + # Expected centroid values + expected_centroid = np.array([[30.4, 56000.0, 1]]) # Mode of gender is '1' (Female) + + np.testing.assert_array_equal(centroid, expected_centroid) + + +def test_centroid_numpy_no_categorical_columns(): + """ + Test centroid_numpy when there are no categorical columns. + """ + data = np.array([ + [25, 50000], + [30, 60000], + [22, 55000], + [35, 45000], + [40, 70000] + ]) + + numerical_columns = [0, 1] # age and salary + categorical_columns = [] # No categorical columns + + centroid = centroid_numpy(data, numerical_columns, categorical_columns) + + # Expected centroid values (just the mean of the numerical columns) + expected_centroid = np.array([[30.4, 56000.0]]) + + np.testing.assert_array_equal(centroid, expected_centroid) + + +def test_apply_action_pandas(): + """ + Test the apply_action_pandas function for applying actions to numerical and categorical columns. 
+ """ + data = pd.DataFrame({ + "age": [25, 30, 22, 35], + "salary": [50000, 60000, 55000, 45000], + "gender": ["Male", "Female", "Female", "Male"] + }) + + action = pd.Series({"age": 5, "salary": 1000, "gender": "Female"}) + + numerical_columns = ["age", "salary"] + categorical_columns = ["gender"] + categorical_no_action_token = "NoChange" + + result = apply_action_pandas(data, action, numerical_columns, categorical_columns, categorical_no_action_token) + + expected_result = pd.DataFrame({ + "age": [30, 35, 27, 40], # Age incremented by 5 + "salary": [51000, 61000, 56000, 46000], # Salary incremented by 1000 + "gender": ["Female", "Female", "Female", "Female"] # Gender set to 'Female' + }) + + pd.testing.assert_frame_equal(result, expected_result) + +def test_apply_action_pandas_no_change_token(): + """ + Test apply_action_pandas with a categorical no-action token. + """ + data = pd.DataFrame({ + "age": [25, 30, 22, 35], + "salary": [50000, 60000, 55000, 45000], + "gender": ["Male", "Female", "Female", "Male"] + }) + + action = pd.Series({"age": 5, "salary": 1000, "gender": "NoChange"}) + + numerical_columns = ["age", "salary"] + categorical_columns = ["gender"] + categorical_no_action_token = "NoChange" + + result = apply_action_pandas(data, action, numerical_columns, categorical_columns, categorical_no_action_token) + + expected_result = pd.DataFrame({ + "age": [30, 35, 27, 40], # Age incremented by 5 + "salary": [51000, 61000, 56000, 46000], # Salary incremented by 1000 + "gender": ["Male", "Female", "Female", "Male"] # No change for gender + }) + + pd.testing.assert_frame_equal(result, expected_result) + +def test_apply_action_numpy(): + """ + Test the apply_action_numpy function for applying actions to numerical and categorical columns. 
+ """ + data = np.array([ + [25, 50000, 0], # 0 = Male + [30, 60000, 1], # 1 = Female + [22, 55000, 1], + [35, 45000, 0] + ]) + + action = np.array([5, 1000, 1]) # Increase age by 5, salary by 1000, gender to 'Female' (1) + + numerical_columns = [0, 1] + categorical_columns = [2] + categorical_no_action_token = 0 # '0' means no change for gender + + result = apply_action_numpy(data, action, numerical_columns, categorical_columns, categorical_no_action_token) + + expected_result = np.array([ + [30, 51000, 1], + [35, 61000, 1], + [27, 56000, 1], + [40, 46000, 1] + ]) + + np.testing.assert_array_equal(result, expected_result) + +def test_extract_actions_pandas(): + """ + Test the extract_actions_pandas function for extracting actions from differences between two dataframes. + """ + X = pd.DataFrame({ + "age": [25, 30, 22], + "salary": [50000, 60000, 55000], + "gender": ["Male", "Female", "Female"] + }) + + cfs = pd.DataFrame({ + "age": [30, 30, 25], + "salary": [51000, 61000, 55000], + "gender": ["Female", "Female", "Male"] + }) + + numerical_features = ["age", "salary"] + categorical_features = ["gender"] + categorical_no_action_token = "NoChange" + + result = extract_actions_pandas(X, cfs, categorical_features, numerical_features, categorical_no_action_token) + + expected_result = pd.DataFrame({ + "age": [5, 0, 3], # Difference in ages + "salary": [1000, 1000, 0], # Difference in salary + "gender": ["Female", "NoChange", "Male"] # Gender action, 'NoChange' for unchanged + }) + + pd.testing.assert_frame_equal(result, expected_result) + +def test_apply_actions_pandas_rows(): + """ + Test the apply_actions_pandas_rows function for applying row-wise actions to numerical and categorical columns. 
+ """ + X = pd.DataFrame({ + "age": [25, 30, 22], + "salary": [50000, 60000, 55000], + "gender": ["Male", "Female", "Female"] + }) + + actions = pd.DataFrame({ + "age": [5, 0, 3], + "salary": [1000, 1000, 0], + "gender": ["Female", "NoChange", "Male"] + }) + + numerical_columns = ["age", "salary"] + categorical_columns = ["gender"] + categorical_no_action_token = "NoChange" + + result = apply_actions_pandas_rows(X, actions, numerical_columns, categorical_columns, categorical_no_action_token) + + expected_result = pd.DataFrame({ + "age": [30, 30, 25], # Age updated by actions + "salary": [51000, 61000, 55000], # Salary updated by actions + "gender": ["Female", "Female", "Male"] # Gender updated where applicable + }) + + pd.testing.assert_frame_equal(result, expected_result) + +def test_actions_mean_pandas(): + """ + Test the actions_mean_pandas function for calculating the mean action for numerical and categorical columns. + """ + actions = pd.DataFrame({ + "age": [5, 0, 3], + "salary": [1000, 1000, 0], + "gender": ["Female", "NoChange", "Male"] + }) + + numerical_features = ["age", "salary"] + categorical_features = ["gender"] + categorical_no_action_token = "NoChange" + + result = actions_mean_pandas(actions, numerical_features, categorical_features, categorical_no_action_token) + + expected_result = pd.Series({ + "age": 8 / 3, # Mean of [5, 0, 3] + "salary": 2000 / 3, # Mean of [1000, 1000, 0] + "gender": "Female" # Most frequent value + }) + + pd.testing.assert_series_equal(result, expected_result) + +def test_decide_cluster_method_kmeans(): + """Test that 'KMeans' method returns KMeansMethod instance.""" + n_clusters = 3 + random_seed = 42 + result = _decide_cluster_method("KMeans", n_clusters, random_seed) + assert isinstance(result, KMeansMethod) + assert result.num_clusters == n_clusters + assert result.random_seed == random_seed + +def test_decide_cluster_method_invalid(): + """Test that invalid method raises ValueError.""" + with pytest.raises(ValueError, 
match="Unsupported clustering method: unsupported_method"): + _decide_cluster_method("unsupported_method", n_clusters=3, random_seed=42) + +def test_decide_cluster_method_instance(): + """Test that passing an instance returns the same instance.""" + kmeans_instance = KMeansMethod(num_clusters=3, random_seed=42) + result = _decide_cluster_method(kmeans_instance, n_clusters=None, random_seed=None) + assert result is kmeans_instance + +def test_decide_local_cf_method_dice(): + """Test that 'Dice' method returns an instance of DiceMethod.""" + model = XGBClassifier() # Create or mock a model as required + train_dataset = X = pd.DataFrame({ + "age": [25, 30, 22], + "salary": [50000, 60000, 55000], + "gender": ["Male", "Female", "Female"], + 'target': [1,0,1] + }) # Create or mock a DataFrame as required + numeric_features_names = ['age', 'salary'] + categorical_features_names = ['gender'] + feat_to_vary = ['age', 'salary'] + random_seed = 42 + + result = _decide_local_cf_method("Dice", model, train_dataset, numeric_features_names, categorical_features_names, feat_to_vary, random_seed) + assert isinstance(result, DiceMethod) + +def test_decide_local_cf_method_nearest_neighbors(): + """Test that 'NearestNeighbors' method returns an instance of NearestNeighborMethod.""" + model = XGBClassifier() # Create or mock a model as required + train_dataset = X = pd.DataFrame({ + "age": [25, 30, 22,45,60,20], + "salary": [50000, 60000, 55000, 53000,75000,30000], + "gender": ["Male", "Female", "Female","Male", "Female",'Female'], + 'target': [1,0,1,0,1,1] + }) + numeric_features_names = ['age', 'salary'] + categorical_features_names = ['gender'] + preprocessor = ColumnTransformer( + transformers=[ + ( + "cat", + OneHotEncoder(sparse_output=False, handle_unknown="ignore"), + categorical_features_names, + ) + ], + remainder="passthrough", + ) + model_ = Pipeline( + [ + ("preprocessor", preprocessor), + ("normalize", StandardScaler()), + ("classifier", model), + ] + ) + model_.fit( + 
train_dataset.drop(columns='target'), + train_dataset['target'], + ) + feat_to_vary = ['age', 'salary'] + random_seed = 42 + + result = _decide_local_cf_method("NearestNeighbors", model_, train_dataset, numeric_features_names, categorical_features_names, feat_to_vary, random_seed) + assert isinstance(result, NearestNeighborMethod) + +def test_decide_local_cf_method_random_sampling(): + """Test that 'RandomSampling' method returns an instance of RandomSampling.""" + model = XGBClassifier() # Create or mock a model as required + train_dataset = X = pd.DataFrame({ + "age": [25, 30, 22,45,60,20], + "salary": [50000, 60000, 55000, 53000,75000,30000], + "gender": ["Male", "Female", "Female","Male", "Female",'Female'], + 'target': [1,0,1,0,1,1] + }) + numeric_features_names = ['age', 'salary'] + categorical_features_names = ['gender'] + preprocessor = ColumnTransformer( + transformers=[ + ( + "cat", + OneHotEncoder(sparse_output=False, handle_unknown="ignore"), + categorical_features_names, + ) + ], + remainder="passthrough", + ) + model_ = Pipeline( + [ + ("preprocessor", preprocessor), + ("normalize", StandardScaler()), + ("classifier", model), + ] + ) + model_.fit( + train_dataset.drop(columns='target'), + train_dataset['target'], + ) + feat_to_vary = ['age', 'salary'] + random_seed = 42 + + result = _decide_local_cf_method("RandomSampling", model_, train_dataset, numeric_features_names, categorical_features_names, feat_to_vary, random_seed) + assert isinstance(result, RandomSampling) + +def test_decide_local_cf_method_invalid(): + """Test that invalid method raises ValueError.""" + model = XGBClassifier() # Create or mock a model as required + train_dataset = X = pd.DataFrame({ + "age": [25, 30, 22], + "salary": [50000, 60000, 55000], + "gender": ["Male", "Female", "Female"], + 'target': [1,0,1] + }) # Create or mock a DataFrame as required + numeric_features_names = ['age', 'salary'] + categorical_features_names = ['gender'] + feat_to_vary = ['age', 'salary'] + 
random_seed = 42 + + with pytest.raises(ValueError, match="Unsupported local counterfactual method: unsupported_method"): + _decide_local_cf_method("unsupported_method", model, train_dataset, numeric_features_names, categorical_features_names, feat_to_vary, random_seed) + +def test_decide_local_cf_method_instance(): + """Test that passing an instance returns the same instance.""" + dice_instance = DiceMethod() + model = XGBClassifier() # Create or mock a model as required + train_dataset = X = pd.DataFrame({ + "age": [25, 30, 22], + "salary": [50000, 60000, 55000], + "gender": ["Male", "Female", "Female"], + 'target': [1,0,1] + }) # Create or mock a DataFrame as required + numeric_features_names = ['age', 'salary'] + categorical_features_names = ['gender'] + feat_to_vary = ['age', 'salary'] + random_seed = 42 + + result = _decide_local_cf_method(dice_instance, model, train_dataset, numeric_features_names, categorical_features_names, feat_to_vary, random_seed) + assert result is dice_instance \ No newline at end of file