"""
SOURCE: https://github.com/aetilley/sigclust.git
Defs for sigclust, cluster_index_2, MAD, comp_sim_vars, comp_sim_tau,
and opt_tau.
"""
import numpy as np
from sklearn.cluster import k_means
from sklearn.preprocessing import scale as pp_scale

def sigclust(X, mc_iters=100, method=2, verbose=True, scale=True,
             p_op=True, ngrains=100):
"""
Return tuple (p, clust) where p is the p-value for k-means++ clustering of
data matrix X with k==2, and clust is a (binary) array of length
num_samples = X.shape[0] whose Nth value is the cluster assigned to
the Nth sample (row) of X at the k-means step.
Equivalently, clust == k_means(X,2)[1].
mc_iters is an integer giving the number of iterations in
the Monte Carlo step.
method = 0 uses the sample covariance matrix eigenvalues directly
for simulation.
method = 1 applies hard thresholding.
method = 2 applies soft thresholding.
scale = True applies mean centering and variance normalization
(sigma = 1) preprocessing to the input X.
    verbose = True prints some additional statistics of the input data.
    When method == 2 (soft thresholding), p_op indicates whether to perform
    additional optimization on the parameter tau. If p_op == False,
    the parameter tau is set to the value that best preserves the trace of
    the sample covariance matrix (this is just the output of comp_sim_tau):
    sum_{i} lambda_i == sum_{i} ((lambda_i - tau - sigma^2)_{+} + sigma^2).
    If p_op == True, tau is set to the value between 0 and the output
    of comp_sim_tau that maximizes the relative size of the largest
    simulation variance. ngrains is the number of values checked in
    this optimization. p_op and ngrains are ignored for method != 2.
"""
if scale:
print("Scaling and centering input matrix.")
X = pp_scale(X)
num_samples, num_features = X.shape
if verbose:
print("""Number of samples: %d\nNumber of features: %d""" %
(num_samples, num_features))
ci, labels = cluster_index_2(X)
print("Cluster index of input data: %f" % ci)
mad = MAD(X)
if verbose:
print("Median absolute deviation from the median of input data: %f"
% mad)
    # Robust estimate of the background noise variance (normalizer is the
    # Gaussian MAD consistency constant defined at module level below).
    bg_noise_var = (mad * normalizer) ** 2
    print("Estimated variance for background noise: %f"
          % bg_noise_var)
    sample_cov_mat = np.cov(X.T)
    # The sample covariance matrix is symmetric, so eigvalsh applies and
    # guarantees real-valued eigenvalues.
    eig_vals = np.linalg.eigvalsh(sample_cov_mat)
    sim_vars = comp_sim_vars(eig_vals, bg_noise_var, method, p_op, ngrains)
    if verbose:
        print("The %d variances for simulation have\nmean: %f\n"
              "standard deviation: %f."
              % (num_features, np.mean(sim_vars), np.std(sim_vars)))
sim_cov_mat = np.diag(sim_vars)
# MONTE CARLO STEP
# Counter for simulated cluster indices less than or equal to ci.
lte = 0
print("""Simulating %d cluster indices. Please wait...""" %
mc_iters)
CIs = np.zeros(mc_iters)
    for i in np.arange(mc_iters):
        # Each iteration draws one null dataset of the same shape as the
        # input from N(0, sim_cov_mat) and records its cluster index.
        sim_data = np.random.multivariate_normal(
            np.zeros(num_features), sim_cov_mat, num_samples)
        ci_sim = cluster_index_2(sim_data)[0]
        CIs[i] = ci_sim
        if ci_sim <= ci:
            lte += 1
print("Simulation complete.")
print("The simulated cluster indices had\n"
"mean: %f\nstandard deviation: %f." %
(np.mean(CIs), np.std(CIs)))
p = lte / mc_iters
print("In %d iterations there were\n"
"%d cluster indices <= the cluster index %f\n"
"of the input data." % (mc_iters, lte, ci))
print("p-value: %f" % p)
return p, labels

def cluster_index_2(X):
    """
    Return the tuple (ci, labels), where ci is the 2-means cluster index of
    X: the k-means (k == 2) within-cluster sum of squares divided by the
    total sum of squared distances from the global mean. labels is the
    k-means cluster assignment of each row of X.
    """
    global_mean = np.mean(X, axis=0)
    # Total sum of squared distances of the samples from the global mean.
    sum_squared_distances = (((X - global_mean) ** 2).sum(axis=1)).sum()
    centroids, labels, inertia = k_means(X, 2, n_init='auto')
    ci = inertia / sum_squared_distances
    return ci, labels
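
# Illustrative usage of cluster_index_2 (a sketch with hypothetical toy
# data, not part of the original module). For two well-separated clusters
# the within-cluster sum of squares is small relative to the total, so ci
# is close to 0:
#
#     X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 10.0])
#     ci, labels = cluster_index_2(X)   # ci near 0; labels in {0, 1}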

# Equal to 1/(Phi^{-1}(3/4)), where Phi is the CDF of the standard normal
# distribution N(0, 1); scaling the MAD by this constant gives a consistent
# estimate of the standard deviation for Gaussian data.
normalizer = 1.4826022185056018
def MAD(X):
"""
Returns the median absolute deviation
from the median for (a flattened version of)
the input data array X.
"""
return np.median(np.abs(X - np.median(X)))
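
# Example (hypothetical data): for Gaussian data, MAD(X) * normalizer is a
# consistent estimate of the standard deviation, which is how sigclust
# derives its background noise variance:
#
#     X = 2.0 * np.random.randn(100000)
#     print(MAD(X) * normalizer)   # close to 2.0
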
def comp_sim_vars(eig_vals, bg_noise_var, method, p_op, ngrains):
"""
Compute variances for simlulation given sample variances and
background noise variance.
method in {0, 1, 2} determines raw, hard, or soft thresholding methods.
When method is 2 (solf-thresholding), p_op indicates to perform some
addiitonal optimization on the parameter tau.
If p_op is False, the parameter tau is set to that which best preserves
the trace of the sample covariance matrix.
This is just the output of comp_sim_tau.
sum_{i}lambda_i == sum_{i}{lambda_i - tau - sigma^2}_{+} + sigma^2).
If p_op is True, tau is set to some value between 0 and the output
of comp_sim_tau which maximizes the relative size of the largest
simulation variance.
"""
    assert method in {0, 1, 2}, "method parameter must be one of 0, 1, 2"
    # Sample eigenvalues in descending order.
    rev_sorted_vals = np.sort(eig_vals)[::-1]
if method == 0:
print("Ignoring background noise and using\n"
"raw sample covariance estimates...")
return rev_sorted_vals
    elif method == 1:
        print("Applying hard thresholding...")
        # Floor each eigenvalue at the background noise variance.
        return np.maximum(rev_sorted_vals, bg_noise_var)
    else:
        print("Applying soft thresholding...")
        tau = comp_sim_tau(rev_sorted_vals, bg_noise_var, p_op, ngrains)
        # Shift the eigenvalues down by tau, then floor at the noise variance.
        return np.maximum(rev_sorted_vals - tau, bg_noise_var)
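
# Quick comparison of the three methods (illustrative numbers, not from the
# original source): with eig_vals == np.array([5.0, 3.0, 0.5]) and
# bg_noise_var == 1.0,
#     method 0 returns [5.0, 3.0, 0.5]    (raw eigenvalues),
#     method 1 returns [5.0, 3.0, 1.0]    (hard floor at the noise level),
#     method 2 returns [4.75, 2.75, 1.0]  (shift by tau == 0.25, then floor,
#                                          assuming p_op == False).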

def comp_sim_tau(rsrtd_vals, bg_noise_var, p_op, ngrains):
    """
    Find the tau that preserves the trace of the sample covariance matrix
    in the simulation covariance matrix:
    sum_{i} lambda_i == sum_{i} ((lambda_i - tau - bg_noise_var)_{+} + bg_noise_var).
    """
    diffs = rsrtd_vals - bg_noise_var
    # diffs is in descending order, so this is the number of eigenvalues
    # above the background noise level (equivalently, the index of the
    # first non-positive difference).
    first_nonpos_ind = len(diffs) - len(diffs[diffs <= 0])
    # Amount added to the trace by flooring the small eigenvalues at
    # bg_noise_var; tau must remove the same amount from the large ones.
    expended = -diffs[first_nonpos_ind:].sum()
    # For a candidate tau == diffs[k]: possible_returns[k] is the reduction
    # from flooring eigenvalues k, ..., first_nonpos_ind - 1, and
    # tau_bonuses[k] is the reduction k * tau from the k eigenvalues above.
    possible_returns = np.sort(diffs[:first_nonpos_ind]).cumsum()[::-1]
    tau_bonuses = np.arange(first_nonpos_ind) * diffs[:first_nonpos_ind]
    # deficits[k] > 0 means tau == diffs[k] removes too little in total.
    deficits = expended - possible_returns - tau_bonuses
    pos_defic = deficits > 0
    if pos_defic.sum() == 0:
        # Even the smallest positive difference over-shoots, so tau lies
        # below all of them and spreads evenly over the large eigenvalues.
        tau = expended / first_nonpos_ind
    else:
        ind = np.where(pos_defic)[0][0]
        if ind == 0:
            # Even the largest candidate removes too little; cap tau there.
            tau = diffs[0]
        else:
            # Interpolate between diffs[ind] and diffs[ind - 1].
            tau = diffs[ind] + deficits[ind] / ind
if p_op:
tau = opt_tau(0, tau, rsrtd_vals, bg_noise_var, ngrains)
return tau
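
# Worked example for comp_sim_tau (illustrative values): with
# rsrtd_vals == np.array([5.0, 3.0, 0.5]), bg_noise_var == 1.0 and
# p_op == False, flooring the last eigenvalue at 1.0 adds 0.5 to the trace.
# No deficit is positive here, so the 0.5 is spread evenly over the two
# eigenvalues above the noise level: tau == 0.5 / 2 == 0.25. The resulting
# simulation variances np.maximum(rsrtd_vals - 0.25, 1.0) ==
# [4.75, 2.75, 1.0] then sum to 8.5 == rsrtd_vals.sum(), preserving the trace.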

def opt_tau(L, U, vals, sig2, ngrains=100):
    """
    Find the tau between L and U that maximizes the relative size of the
    largest simulation variance, scanning the ngrains grid points
    L, L + d_tau, ..., U - d_tau.
    """
    d_tau = (U - L) / ngrains
    rel_sizes = np.zeros(ngrains)
    for i in np.arange(ngrains):
        tau_cand = L + i * d_tau
        # Soft-threshold the variances at this candidate tau.
        vals0 = vals - tau_cand
        vals0[vals0 < sig2] = sig2
        # Relative size of the largest simulation variance.
        rel_sizes[i] = vals0[0] / vals0.sum()
    argmax_ind = np.argmax(rel_sizes)
    return L + argmax_ind * d_tau
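
# Minimal end-to-end sketch (an illustration, not part of the original
# module; the synthetic data, seed, and parameter choices are arbitrary).
# A small p-value suggests that the 2-means split of X is statistically
# significant rather than an artifact of a single Gaussian cluster.
if __name__ == "__main__":
    np.random.seed(0)
    # Two well-separated Gaussian clusters: 50 samples each, 10 features.
    demo_X = np.vstack([np.random.randn(50, 10),
                        np.random.randn(50, 10) + 3.0])
    demo_p, demo_labels = sigclust(demo_X, mc_iters=100, method=2)
    print("Demo p-value: %f" % demo_p)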