Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add network analysis #343

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 188 additions & 120 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@

# Choices offered as plain strings.
OPTION = ["Yes", "No"]
DATA_OPTION = ["Own Data", "Testing Data (Built-in)"]
# The next two five-entry lists are reassigned immediately below, so these values are superseded.
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection"]
# Effective values: the same entries with the network-analysis option appended as the sixth item.
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection", "Data For Network Analysis"]
MODE_OPTION = ["Regression", "Classification", "Clustering","Dimensional Reduction", "Abnormal Detection", "Network Analysis"]

# The model provided to use
REGRESSION_MODELS = [
Expand Down Expand Up @@ -68,6 +68,7 @@
# Model names grouped by task, stored as plain strings.
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
ABNORMALDETECTION_MODELS = ["Isolation Forest"]
# These two strings are the exact names NetworkAnalysisModelSelection compares model_name against.
NETWORKANALYSIS_MODELS = ["Bron Kerbosch Community Detection", "Louvain Method Community Detection"]

# The model can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Expand All @@ -89,7 +90,7 @@


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
# Models that are excluded from AutoML. The two network-analysis entries must be
# separate list items; a single "bron_kerbosch, louvain_method" string can never
# match either name in a membership test.
# NOTE(review): the other entries are display names ("Linear Regression"); confirm
# whether the network entries should instead be the names listed in
# NETWORKANALYSIS_MODELS.
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression", "bron_kerbosch", "louvain_method"]
RAY_FLAML = ["Multi-layer Perceptron"]

MISSING_VALUE_STRATEGY = ["Drop Rows with Missing Values ", "Impute Missing Values"]
Expand Down
Binary file not shown.
Empty file.
83 changes: 83 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from itertools import combinations
import numpy as np
import pandas as pd




def pair_dataframes(dataframes):
    """Return every unordered pair of items together with their positions.

    Each entry of the result is ``(item_i, item_j, i, j)`` with ``i < j``,
    listed in the order ``itertools.combinations`` produces the pairs.
    """
    indexed = enumerate(dataframes)
    return [
        (left, right, left_pos, right_pos)
        for (left_pos, left), (right_pos, right) in combinations(indexed, 2)
    ]

def convert_to_triplets(indices, distances, mineral_a_ids, mineral_b_ids):
    """Flatten neighbour tables into ``(id_a, id_b, distance)`` tuples.

    Row ``r`` of ``indices``/``distances`` describes the neighbours of the
    ``r``-th sample of group A: each index selects an id from
    ``mineral_b_ids`` and is paired with the distance at the same position.
    Triplets are emitted row by row, in neighbour order.
    """
    return [
        (mineral_a_ids[row_pos], mineral_b_ids[neighbor_pos], dist)
        for row_pos, (neighbor_row, dist_row) in enumerate(zip(indices, distances))
        for neighbor_pos, dist in zip(neighbor_row, dist_row)
    ]


def triplets_df_clean(triplets_df):
    """Normalise an edge list with ``Node1``, ``Node2`` and ``Distance`` columns.

    Steps, in order:
      1. order each edge so that ``Node1 <= Node2`` (edges are undirected);
      2. keep only the first occurrence of every ``(Node1, Node2)`` pair;
      3. replace missing or zero distances with ``0.001``;
      4. sort the rows by ``Node1``.

    The input frame is left untouched; a new DataFrame is returned.
    """
    # Work on a copy: the original implementation overwrote the caller's
    # Node1/Node2 columns in place.
    cleaned = triplets_df.copy()
    node_columns = ['Node1', 'Node2']
    cleaned[node_columns] = np.sort(cleaned[node_columns].to_numpy(), axis=1)

    # Explicit copy so the column assignment below never writes into a view
    # (avoids pandas' SettingWithCopyWarning).
    cleaned = cleaned.drop_duplicates(subset=node_columns).copy()

    distance = cleaned['Distance']
    # After this replacement no NaN can remain, so no dropna is needed.
    cleaned['Distance'] = distance.mask(distance.isna() | (distance == 0), 0.001)

    return cleaned.sort_values(by='Node1')


def construct_adj_matrix(graph_data):
    """Build a symmetric weighted adjacency matrix from an edge list.

    ``graph_data`` needs ``Node1``, ``Node2`` and ``Distance`` columns. The
    distinct node values (sorted by ``np.unique``) are numbered 0..n-1 and
    every edge's distance is written at both ``[a, b]`` and ``[b, a]``.

    Returns:
        ``(adj_matrix, mapping_df)`` where ``mapping_df`` lists each
        ``Original_Node`` next to its ``Mapped_Node`` matrix index.
    """
    unique_nodes = np.unique(graph_data[['Node1', 'Node2']].values)
    position_of = {node: position for position, node in enumerate(unique_nodes)}

    size = len(unique_nodes)
    adj_matrix = np.zeros((size, size))
    for _, edge in graph_data.iterrows():
        # Node values are cast to int before the lookup, as row access may
        # yield floats when the frame mixes integer and float columns.
        first = position_of[int(edge['Node1'])]
        second = position_of[int(edge['Node2'])]
        weight = edge['Distance']
        adj_matrix[first, second] = weight
        adj_matrix[second, first] = weight

    mapping_df = pd.DataFrame(list(position_of.items()), columns=['Original_Node', 'Mapped_Node'])
    return adj_matrix, mapping_df


def accurate_statistic_algo(communities, ids, group_ids):
    """Replace node ids by their group labels and count label repetitions per row.

    Args:
        communities: DataFrame whose first column names the community and whose
            remaining columns hold node ids. The first row is never translated
            (presumably the header-like row emitted by the community-detection
            helpers -- confirm against the caller).
        ids: node ids, either flat or as one sequence per group. Groups may
            have different lengths.
        group_ids: labels aligned position-by-position with the flattened
            ``ids``; given either flat or as one array-like/DataFrame per group.

    Returns:
        A copy of ``communities`` with translated cells plus two columns:
        ``Repeated_Counts`` (number of distinct values that occur more than
        once in the row, first column excluded, NaN ignored) and
        ``Unique_Counts`` (number of distinct values that occur exactly once).
    """

    def _flatten(groups):
        # np.array(list_of_unequal_groups).flatten() fails on ragged input, so
        # flatten each group separately and concatenate.
        if isinstance(groups, (pd.DataFrame, pd.Series, np.ndarray)):
            return np.asarray(groups).ravel()
        parts = [np.asarray(group).ravel() for group in groups]
        return np.concatenate(parts) if parts else np.array([])

    flat_ids = _flatten(ids)
    flat_group_ids = _flatten(group_ids)

    # First position of every id, built once instead of scanning a list for
    # each cell.
    first_position = {}
    for position, node_id in enumerate(flat_ids.tolist()):
        first_position.setdefault(node_id, position)

    result_df = communities.copy()
    for i, row in communities.iloc[1:].iterrows():
        for j, val in row.items():
            idx = first_position.get(val)
            # Ids without a matching label position are left unchanged.
            if idx is not None and idx < len(flat_group_ids):
                result_df.at[i, j] = flat_group_ids[idx]

    repeated_counts = []
    unique_counts = []
    for _, row in result_df.iterrows():
        seen_values = set()
        repeated_set = set()
        for value in row.iloc[1:].dropna():
            if value in seen_values:
                repeated_set.add(value)
            else:
                seen_values.add(value)
        repeated_counts.append(len(repeated_set))
        unique_counts.append(len(seen_values) - len(repeated_set))

    result_df['Repeated_Counts'] = repeated_counts
    result_df['Unique_Counts'] = unique_counts
    return result_df

60 changes: 60 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_community.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import communities.algorithms
import pandas as pd
def _communities_to_original_nodes(communities_list, mapping_df):
    """Tabulate communities, translating matrix indices back to original node ids.

    The returned frame has a ``Community`` column followed by ``Node1..NodeK``
    (K = size of the largest community). Its first row is a header-like row
    (``'Community'`` followed by NaN), and every following row holds one
    community; positions without a member are NaN.
    """
    mapping_dict = dict(zip(mapping_df['Mapped_Node'], mapping_df['Original_Node']))

    # default=0 keeps an empty community list from raising in max().
    widest = max((len(community) for community in communities_list), default=0)
    column_names = ['Community'] + [f'Node{i + 1}' for i in range(widest)]

    # The column names are repeated as the first data row, as before.
    community_data = [list(column_names)]
    community_data += [
        [f'Community {idx + 1}'] + list(community)
        for idx, community in enumerate(communities_list)
    ]
    raw_df = pd.DataFrame(community_data, columns=column_names)

    mapped_df = pd.DataFrame(columns=column_names)
    for index, row in raw_df.iterrows():
        mapped_nodes = []
        for column in raw_df.columns[1:]:
            cell = row[column]
            members = cell if isinstance(cell, list) else [cell]
            # Anything that is not a known matrix index (padding, header text)
            # becomes NaN.
            mapped_nodes.extend(mapping_dict.get(node, float('nan')) for node in members)
        mapped_df.loc[index] = [row['Community']] + mapped_nodes
    return mapped_df


def bron_kerbosch_algo(adj_matrix, mapping_df):
    """Run ``communities.algorithms.bron_kerbosch`` (with pivoting) on ``adj_matrix``.

    ``mapping_df`` must provide ``Mapped_Node`` (matrix index) and
    ``Original_Node`` columns; the result table uses the original node ids.
    """
    communities_list = communities.algorithms.bron_kerbosch(adj_matrix, pivot=True)
    return _communities_to_original_nodes(communities_list, mapping_df)


def louvain_method_algo(adj_matrix, mapping_df):
    """Run ``communities.algorithms.louvain_method`` on ``adj_matrix``.

    Only the first element of the library's return value (the community list)
    is used. ``mapping_df`` must provide ``Mapped_Node`` and ``Original_Node``
    columns; the result table uses the original node ids.
    """
    communities_list, _ = communities.algorithms.louvain_method(adj_matrix)
    return _communities_to_original_nodes(communities_list, mapping_df)
36 changes: 36 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import scipy
import numpy as np

def mahalanobis_distance_singal(x, y, inv_cov):
    """Return ``sqrt((x - y) @ inv_cov @ (x - y).T)`` for two sample vectors.

    ``inv_cov`` is the (pseudo-)inverse covariance matrix supplied by the caller.
    """
    delta = x - y
    projected = np.dot(delta, inv_cov)
    return np.sqrt(np.dot(projected, delta.T))

def mahalanobis_distance_calculator(mineral_a, mineral_b):
    """Return the pairwise Mahalanobis distances between two sample sets.

    The inverse covariance is the pseudo-inverse of the column covariance of
    ``mineral_a`` only. Entry ``[i, j]`` of the result is the distance between
    row ``i`` of ``mineral_a`` and row ``j`` of ``mineral_b``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.spatial`` is loaded on every SciPy version.
    from scipy.spatial.distance import cdist

    # atleast_2d: np.cov yields a 0-d array for single-feature data, which
    # pinv cannot handle.
    inv_cov = np.linalg.pinv(np.atleast_2d(np.cov(mineral_a, rowvar=False)))
    # cdist's built-in metric evaluates sqrt((u - v) VI (u - v)^T) natively
    # instead of calling a Python function for every pair.
    return cdist(mineral_a, mineral_b, metric='mahalanobis', VI=inv_cov)


def euclidean_distance_calcular(mineral_a, mineral_b):
    """Return the pairwise Euclidean distances between two sample sets.

    Entry ``[i, j]`` of the result is the distance between row ``i`` of
    ``mineral_a`` and row ``j`` of ``mineral_b``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.spatial`` is loaded on every SciPy version.
    from scipy.spatial.distance import cdist

    return cdist(mineral_a, mineral_b, 'euclidean')


def compute_distance_between_2(mineral_a, mineral_b, k=1, metric='euclidean'):
    """Dispatch to the distance routine selected by ``metric``.

    ``metric`` must be ``'mahalanobis'`` or ``'euclidean'``; any other value
    raises ``ValueError``. ``k`` is accepted but not used by this function.
    """
    is_mahalanobis = metric == 'mahalanobis'
    is_euclidean = metric == 'euclidean'
    if not (is_mahalanobis or is_euclidean):
        raise ValueError("Unsupported distance metric. Supported metrics are 'mahalanobis' and 'euclidean'.")
    if is_mahalanobis:
        return mahalanobis_distance_calculator(mineral_a, mineral_b)
    return euclidean_distance_calcular(mineral_a, mineral_b)









Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import numpy as np

def top_k_nearest_neighbors(distance, k=5):
    """Pick the ``k`` smallest entries of every row of a distance matrix.

    Returns:
        ``(indices, distances)``: the column indices of the ``k`` smallest
        values per row, in ascending distance order, and the matching
        distance values.
    """
    row_order = np.argsort(distance, axis=1)
    top_indices = row_order[:, :k]
    top_distances = np.take_along_axis(distance, top_indices, axis=1)
    return top_indices, top_distances
106 changes: 106 additions & 0 deletions geochemistrypi/data_mining/model/network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
import pandas as pd
from ._base import WorkflowBase
from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig, save_text
from .func.algo_network._distance import mahalanobis_distance_calculator
from .func.algo_network._distance import euclidean_distance_calcular
from .func.algo_network._common import pair_dataframes
from .func.algo_network._nearest import top_k_nearest_neighbors
from .func.algo_network._common import convert_to_triplets
from .func.algo_network._common import triplets_df_clean
from .func.algo_network._common import construct_adj_matrix
from .func.algo_network._community import bron_kerbosch_algo
from .func.algo_network._community import louvain_method_algo
from .func.algo_network._common import accurate_statistic_algo
class NetworkAnalysisWorkflowBase(WorkflowBase):
    """Shared steps for the network-analysis workflows.

    Subclasses set ``self.X``, ``self.y`` and ``self.naming`` and then call,
    in order, ``fit`` -> ``generate_ids`` -> ``compute_distance`` before
    running a community-detection algorithm and ``accuracy_statistic``.
    """

    def __init__(self) -> None:
        # NOTE(review): the WorkflowBase initializer is not invoked here (same
        # as before this change) -- confirm that this is intentional.
        self.distance_calculator = "EU"  # "EU" (Euclidean) or "MA" (Mahalanobis)
        self.community_detection_algo = "BK"
        self.minerals = []  # one feature DataFrame per group
        self.ids = []  # one list of node ids per group
        self.labels = []  # one single-column label DataFrame per group
        self.k = 1  # neighbours kept per sample
        # Instances rather than the pd.DataFrame class itself, so the
        # attributes are usable frames even before the workflow has run.
        self.distances = pd.DataFrame()  # graph edge list
        self.communities_sample = pd.DataFrame()  # detected communities
        self.communities_gruop = pd.DataFrame()  # communities after label statistics

    def fit(self) -> None:
        """Split ``X`` joined with ``y`` into per-group feature and label frames.

        The joined frame is grouped by its ``mineral_type`` column; for every
        group the last column is stored in ``self.labels`` and the remaining
        columns in ``self.minerals``.
        """
        merged_df = pd.concat([self.X, self.y], axis=1)
        groups = [group for _, group in merged_df.groupby('mineral_type')]
        self.labels = [group.iloc[:, -1:] for group in groups]
        self.minerals = [group.iloc[:, :-1] for group in groups]

    @classmethod
    def manual_hyper_parameters(cls) -> dict:
        """Manual hyper-parameters specification."""
        return dict()

    def generate_ids(self):
        """Assign consecutive integer node ids to the samples of every group."""
        # Start from scratch so a repeated call does not append a second set of ids.
        self.ids = []
        offset = 0
        for df in self.minerals:
            self.ids.append(list(range(offset, offset + len(df))))
            offset += len(df)

    def compute_distance(self):
        """Build the k-nearest-neighbour edge list between every pair of groups.

        The cleaned edge list is stored in ``self.distances`` and saved.

        Raises:
            ValueError: if ``self.distance_calculator`` is neither "EU" nor "MA".
        """
        if self.distance_calculator == 'EU':
            distance_func = euclidean_distance_calcular
        elif self.distance_calculator == 'MA':
            distance_func = mahalanobis_distance_calculator
        else:
            # Previously this fell through and failed later with an unbound local.
            raise ValueError(
                f"Unsupported distance calculator {self.distance_calculator!r}. Supported values are 'EU' and 'MA'."
            )

        all_triplets = []
        for mineral1, mineral2, index1, index2 in pair_dataframes(self.minerals):
            a_to_b_indices, a_to_b_distances = top_k_nearest_neighbors(distance_func(mineral1, mineral2), self.k)
            all_triplets.extend(
                convert_to_triplets(a_to_b_indices, a_to_b_distances, self.ids[index1], self.ids[index2])
            )
        self.distances = triplets_df_clean(pd.DataFrame(all_triplets, columns=['Node1', 'Node2', 'Distance']))

        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        # NOTE(review): the subclasses save their community table under this
        # same "<naming> Result" name -- confirm one does not overwrite the other.
        save_data(self.distances, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)

    def accuracy_statistic(self):
        """Translate community members to group labels, count repeats and save the table."""
        print("----------accuracy---------")
        self.communities_gruop = accurate_statistic_algo(self.communities_sample, self.ids, self.labels)
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_gruop, f"{self.naming} Result Accuracy Statistic", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)

class bron_kerbosch(NetworkAnalysisWorkflowBase):
    """Network-analysis workflow that detects communities with Bron-Kerbosch."""

    def __init__(self, X, y) -> None:
        super().__init__()
        self.X, self.y = X, y
        self.naming = "Bron_kerbosch"
        self.community_detection_algo = "BK"

    def community_detection(self):
        """Run the full pipeline: split, id, distances, communities, statistics."""
        self.fit()
        self.generate_ids()
        self.compute_distance()

        adjacency, node_mapping = construct_adj_matrix(self.distances)
        self.communities_sample = bron_kerbosch_algo(adjacency, node_mapping)

        output_dir = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", output_dir, MLFLOW_ARTIFACT_DATA_PATH)

        self.accuracy_statistic()


class louvain_method(NetworkAnalysisWorkflowBase):
    """Network-analysis workflow that detects communities with the Louvain method."""

    def __init__(self, X, y) -> None:
        super().__init__()
        self.X, self.y = X, y
        self.naming = "Louvain_method"
        self.community_detection_algo = "LU"

    def community_detection(self):
        """Run the full pipeline: split, id, distances, communities, statistics."""
        self.fit()
        self.generate_ids()
        self.compute_distance()

        adjacency, node_mapping = construct_adj_matrix(self.distances)
        self.communities_sample = louvain_method_algo(adjacency, node_mapping)

        output_dir = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", output_dir, MLFLOW_ARTIFACT_DATA_PATH)

        self.accuracy_statistic()


38 changes: 38 additions & 0 deletions geochemistrypi/data_mining/process/network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import os

import pandas as pd
from multipledispatch import dispatch
from rich import print

from ..constants import MLFLOW_ARTIFACT_DATA_PATH, SECTION
from ..data.data_readiness import num_input
from ..model.network import (
NetworkAnalysisWorkflowBase,
bron_kerbosch,
louvain_method
)
from ._base import ModelSelectionBase
class NetworkAnalysisModelSelection(ModelSelectionBase):
    """Choose the network-analysis workflow that matches a model name."""

    def __init__(self, model_name: str) -> None:
        self.model_name = model_name
        workflow_by_name = {
            "Bron Kerbosch Community Detection": BronKerboschWorkflow,
            "Louvain Method Community Detection": LouvainMethodWorkflow,
        }
        workflow_cls = workflow_by_name.get(model_name)
        # For an unrecognised name no workflow attribute is created.
        if workflow_cls is not None:
            self.net_workflow = workflow_cls()

    def activate(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        """Run community detection of the selected workflow on ``X`` and ``y``."""
        self.net_workflow.community_detection(X, y)

class BronKerboschWorkflow(NetworkAnalysisWorkflowBase):
    """Adapter that runs the ``bron_kerbosch`` workflow on the given data."""

    def community_detection(self, X, y):
        """Create a ``bron_kerbosch`` workflow for ``X``/``y`` and run it."""
        workflow = bron_kerbosch(X, y)
        workflow.community_detection()

class LouvainMethodWorkflow(NetworkAnalysisWorkflowBase):
    """Adapter that runs the ``louvain_method`` workflow on the given data."""

    def community_detection(self, X, y):
        """Create a ``louvain_method`` workflow for ``X``/``y`` and run it."""
        workflow = louvain_method(X, y)
        workflow.community_detection()

Loading