-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchi_square_test.py
91 lines (71 loc) · 3.71 KB
/
chi_square_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chisquare, fisher_exact, power_divergence, chi2_contingency
from loguru import logger
from scipy.stats import expon, chi2
import seaborn
# logger.info(chisquare([16, 18, 16, 14, 12, 12]))
##################################################################################
# Following code taken from: https://towardsdatascience.com/how-to-compare-two-or-more-distributions-9b06ee4d30bf
##################################################################################
# Init dataframe
df_bins = pd.DataFrame()
N = 512
n_control_group = 16 * 1024
q = 10
# np.random.seed(39)
comparison_ex = np.random.exponential(size=n_control_group)
mean = 1
stdev = 0.5
normal_distr_samples = np.random.normal(mean, stdev, N)
comparison_norm = np.random.normal(1, 0.52, n_control_group)
comparison = comparison_norm
# Generate bins from control group
_, bins = pd.qcut(comparison, q=q, retbins=True)
df_bins['bin'] = pd.cut(comparison, bins=bins).value_counts().index
# Apply bins to both groups
df_bins['comparison_observed'] = pd.cut(comparison, bins=bins).value_counts().values
df_bins['normal_distr_observed'] = pd.cut(normal_distr_samples, bins=bins).value_counts().values
# Compute expected frequency in the treatment group
df_bins['normal_distr_expected'] = df_bins['comparison_observed'] / np.sum(df_bins['comparison_observed']) * \
np.sum(df_bins['normal_distr_observed'])
# Assemble a long-form dataframe for seaborn: one row per sample, tagged with
# its group label.
_frames = []
for _samples, _label in ((comparison, 'comparison'), (normal_distr_samples, 'distr')):
    _frames.append(pd.DataFrame({'index': np.arange(len(_samples)),
                                 'values': _samples,
                                 'group': _label}))
df_plotting = pd.concat(_frames)

# Side-by-side boxplots of the two groups.
seaborn.boxplot(data=df_plotting, x='group', y='values')
plt.title("Boxplot")
plt.show()

# Overlaid raw-count histograms.
seaborn.histplot(data=df_plotting, x='values', hue='group', bins=50)
plt.title("Histogram")
plt.show()

# Density-normalized histograms (each group normalized on its own).
seaborn.histplot(data=df_plotting, x='values', hue='group', bins=50, stat='density', common_norm=False)
plt.title("Density Histogram")
plt.show()

# Kernel density estimate per group.
seaborn.kdeplot(x='values', data=df_plotting, hue='group', common_norm=False)
plt.title("Kernel Density Function")
plt.show()

# Empirical CDFs drawn as cumulative step histograms.
seaborn.histplot(x='values', data=df_plotting, hue='group', bins=len(df_plotting), stat="density",
                 element="step", fill=False, cumulative=True, common_norm=False)
plt.title("Cumulative distribution function")
plt.show()
logger.info(f"\n{df_bins.to_markdown()}")
# Test of homogeneity on the q x 2 contingency table of observed counts.
# lambda_="log-likelihood" makes this a G-test (log-likelihood ratio statistic)
# rather than the classic Pearson chi-squared statistic; the asymptotic
# distribution and interpretation are the same.
res = chi2_contingency(df_bins[['normal_distr_observed', 'comparison_observed']], lambda_="log-likelihood")
logger.info(f"Chi-squared Test: statistic={res.statistic:.4f}")
logger.info(f"Chi-squared Test: p-value={res.pvalue:.4f} - {100*res.pvalue:.2f}% ")
# Interpretation: this is a homogeneity test, not a correlation measure.
# Rejecting the null means the two samples are unlikely to come from the same
# distribution; failing to reject is NOT evidence that they are the same.
if res.pvalue < 0.05:
    logger.info("p-value < 0.05 --> null hypothesis rejected --> "
                "the samples are unlikely to come from the same distribution")
else:
    logger.info("p-value >= 0.05 --> null hypothesis cannot be rejected --> "
                "no evidence the distributions differ (this does not prove they are the same)")
##################################################################################
# code above taken from: https://towardsdatascience.com/how-to-compare-two-or-more-distributions-9b06ee4d30bf
##################################################################################