#!/usr/bin/env python2.7
"""
The goal of this program is to run several clustering methods on a gene by sample log2(FPKM/TPM)+1
tsv file, and to return information that lets the researcher decide on the 'best' one. Unfortunately
there are few 'blind' measures for this decision, at least without a ground truth. The silhouette
score is one; values upwards of 0.6 are considered good (the maximum is 1). Another is the
cophenetic index, but it can only be calculated from a distance matrix, so it cannot be used
for biclustering or kmeans/kmedoids.
The program depends on three homemade libraries: utils, kmedoids and hierarchical. These should be
in the same GitHub repo as this program. hierarchical is a slightly altered copy of
sklearn.cluster.AgglomerativeClustering; see the code for details.
The python-sklearn module can be installed on Ubuntu using apt-get; pandas, scipy and numpy can be
pip installed.
"""
import sys
import argparse
import textwrap
import copy
import numpy as np
import pandas as pd
import utils
import kmedoids
import time
from scipy.cluster.hierarchy import linkage, cut_tree, cophenet
from scipy.spatial.distance import squareform as sf, pdist
from scipy.stats import spearmanr
from sklearn.cluster import KMeans #, AgglomerativeClustering
from hierarchical import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster.bicluster import SpectralBiclustering
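# NOTE: sklearn.cluster.bicluster is the import path in the older scikit-learn releases this
# Python 2.7 script targets; in scikit-learn >= 0.23 SpectralBiclustering is imported directly
# from sklearn.cluster.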
def passedTime(start, end):
    """Format elapsed seconds as HH:MM:SS.ss, e.g. passedTime(0, 83.45) -> '00:01:23.45'."""
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    ptime = "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)
    return ptime
##### get arguments ###################################################
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent('''\
        Runs kmeans, kmedoids, biclustering and different kinds of hierarchical clustering
        on an input gene by sample tsv format file. Data is filtered for low expression and only
        the top 1000 most variable genes are used.
        The program calculates the cophenetic index (for hierarchical trees) and the silhouette score.
        Note that the cophenetic index will not change for different cluster numbers: it is calculated
        for the whole tree.
        Outputs:
          <args.base>.clusters.tsv  a sample by clustermethod table which can be used as a 'color' upload to Tumormap
          <args.base>.scores.txt    silhouette score and cophenetic index for each method
          <args.base>.groups.txt    (optional) groupings of samples that nearly always cluster together no matter which method is used
        NOTE: Several methods used here are probabilistic and will NOT give the exact same output twice.
        However, you will usually find that the same method scores best on different runs.
        '''))
group = parser.add_argument_group('required arguments')
group.add_argument('inputfile', type=str, help='input tsv file')
# optional flag
parser.add_argument('--clusters', type=int, default=5, help="Number of expected clusters (default 5)")
parser.add_argument('--genes', action='store_true', help="Cluster genes instead of samples")
parser.add_argument('--maxfract', type=float, default=0.8, help="Maximum fraction of input samples allowed to end up in the same cluster (default 0.8)")
parser.add_argument('--subsample', type=int, default=None,
                    help="Use this number of samples instead of the whole set. Good for running quick tests on larger datasets")
parser.add_argument('--base', type=str, default='clusters', help="Basename for output (default clusters)")
parser.add_argument('--print_reduced', action='store_true',
                    help="Print the 1000 genes that were used for clustering as a gene by sample table, useful for any subsequent runs or for uploading to Tumormap")
parser.add_argument('--print_groups', action='store_true',
                    help="Save a list of sample groups that always occur together, no matter which clustering method was used. This is useful for defining core groups of samples. Setting this flag also adds a 'shared' column to the clusters file")
parser.add_argument('--min_groupsize', type=int, default=3, help="Minimum number of samples in a shared group (default 3)")
parser.add_argument('--noK', action='store_true', help="Skip Kmeans and Kmedoids (useful if you mainly care about the grouped samples, since these probabilistic methods vary between runs)")
args = parser.parse_args()
if args.maxfract > 1 or args.maxfract < 0.05:
    print >>sys.stderr, "please select a maxfract between 0.05 and 1"
    sys.exit(1)
##### Read and format input ###########################################
start = time.time()
# gene by sample table
# Pandas reads the header into df.columns (also available as list(df)) and the rownames into df.index
print >>sys.stderr, passedTime(start, time.time()), "reading", args.inputfile
df = pd.read_table(args.inputfile, sep='\t', header=0, index_col=0)
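# NOTE: pd.read_csv(args.inputfile, sep='\t', header=0, index_col=0) would be the
# equivalent call in current pandas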
if args.subsample:
    # randomly select samples
    if len(list(df)) > args.subsample:
        print >>sys.stderr, passedTime(start, time.time()), "randomly selecting {} samples of {}".format(args.subsample, len(list(df)))
        ids = list(df)
        np.random.shuffle(ids)
        df = df.loc[:, ids[:args.subsample]]
# remove genes with average expression of 1 or less
print >>sys.stderr, passedTime(start, time.time()), "removing low expression genes (mean <= 1)"
df = df[df.mean(1) > 1]
sbg = df
if not args.genes:
    # Rank genes by stdev
    print >>sys.stderr, passedTime(start, time.time()), "selecting 1000 genes with highest stdev"
    df['stdev'] = df.std(axis=1)
    df.sort_values('stdev', inplace=True, ascending=False)
    df.drop('stdev', axis=1, inplace=True)
    # keep the 1000 most variable genes over all samples
    df = df.head(1000)
    if args.print_reduced:
        df.to_csv(args.base+'.1kgenes.tsv', sep='\t')
    sbg = df.transpose()
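# From here on, the rows of sbg hold the items being clustered:
# samples by default, or genes when --genes is set.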
##### Cluster ######################################################
methods = [] # holds all info on each clustering method
n_clusters = args.clusters
print >>sys.stderr, passedTime(start, time.time()), "Bicluster (probabilistic, YMMV)"
# This clusters samples and genes at the same time. It can work extremely well but is sensitive to its parameters
# In testing, n_components and n_best did well when set to args.clusters, at least in a largeish dataset (1000 samples)
biclust = SpectralBiclustering(n_clusters=n_clusters, method='log', n_components=n_clusters, n_best=n_clusters, svd_method='randomized')
biclust.fit(sbg)
labels = biclust.row_labels_
silscore = silhouette_score(sbg, labels)
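# biclustering builds no tree/distance matrix, so the cophenetic index cannot be
# computed here; 0.0 is stored as a placeholder (reported as NA in the scores file)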
biclustObject = utils.cmethod('biclust', labels, silscore, 0.0, args.maxfract)
methods.append(biclustObject)
print >>sys.stderr, passedTime(start, time.time()), "Hierarchical clustering with a distance matrix"
# using a precomputed distance matrix (correlation and yule work equally well in testing)
# average and complete linkage both work well
s_distance = pairwise_distances(sbg, metric='correlation')
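# metric='correlation' yields 1 - Pearson correlation between sample expression profiles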
for l in ['average', 'complete']:
    ac = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage=l)
    ac.fit_predict(s_distance)
    labels = ac.labels_
    # the cophenetic index can only be calculated on a scipy.cluster.hierarchy linkage matrix, so create one
    ac = utils.distanceMatrix(ac)
    # cophenet returns the cophenetic correlation coefficient c: how faithfully the tree
    # preserves the original pairwise distances (closer to 1 is better)
    # NOTE: the cophenetic score does not change if you alter the number of clusters
    c, coph_dists = cophenet(ac.distancematrix, sf(s_distance))
    silscore = silhouette_score(s_distance, labels, metric="precomputed")
    cname = 'hier_' + l
    hierObject = utils.cmethod(cname, labels, silscore, c, args.maxfract)
    methods.append(hierObject)
print >>sys.stderr, passedTime(start, time.time()), "Hierarchical clustering using WARD linkage"
# Ward calculates its own euclidean distance matrix
ac = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
ac.fit_predict(sbg)
labels = ac.labels_
ac = utils.distanceMatrix(ac)
# create euclidean distance matrix for cophenetic score
c, coph_dists = cophenet(ac.distancematrix, pdist(sbg))
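# (pdist defaults to the euclidean metric, matching the distances Ward minimizes)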
silscore = silhouette_score(sbg, labels)
hierObject = utils.cmethod('hier_ward', labels, silscore, c, args.maxfract)
methods.append(hierObject)
print >>sys.stderr, passedTime(start, time.time()), "Hierarchical clustering using spearman rank"
# unfortunately this is not an option in sklearn so use scipy
spearmat, pval = spearmanr(sbg, axis=1)
spearmat = np.round(1-spearmat, 6) # without rounding there are non-zero values on the diagonals
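# 1 - rho converts Spearman correlation (range [-1,1]) into a dissimilarity (range [0,2]);
# sf (squareform) condenses the symmetric matrix into the vector form linkage expects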
for l in ['average', 'complete']:
    spearman_samples = linkage(sf(spearmat), l)
    sample_groups = cut_tree(spearman_samples, n_clusters=args.clusters)
    labels = sample_groups.flatten()
    c, coph_dists = cophenet(spearman_samples, sf(spearmat))
    silscore = silhouette_score(spearmat, labels, metric="precomputed")
    cname = 'hier_spearman_' + l
    hiersaObject = utils.cmethod(cname, labels, silscore, c, args.maxfract)
    methods.append(hiersaObject)
if not args.noK:
    print >>sys.stderr, passedTime(start, time.time()), "KMEANS (probabilistic, YMMV)"
    # I thought max_iter was supposed to result in convergence but it doesn't seem to help much,
    # so run 5 times just to show the range of outcomes
    for i in xrange(1, 6):
        # n_init random initializations are run per call; sklearn keeps the lowest-inertia result
        kmeans = KMeans(init='k-means++', n_clusters=args.clusters, n_init=args.clusters, max_iter=100)
        labels = kmeans.fit_predict(sbg)
        silscore = silhouette_score(sbg, labels)
        cname = 'kmeans_' + str(i)
        kmeansObject = utils.cmethod(cname, labels, silscore, 0.0, args.maxfract)
        methods.append(kmeansObject)
    print >>sys.stderr, passedTime(start, time.time()), "KMEDOIDS (probabilistic, YMMV)"
    # Same issue as Kmeans. Same approach.
    for i in xrange(1, 6):
        medoids, clusterinfo, labels = kmedoids.kMedoids(s_distance, args.clusters)
        # s_distance is a distance matrix, so score it as precomputed (as for hierarchical above)
        silscore = silhouette_score(s_distance, labels, metric="precomputed")
        cname = 'kmedoids_' + str(i)
        kmedObject = utils.cmethod(cname, labels, silscore, 0.0, args.maxfract)
        methods.append(kmedObject)
##### Create consistent sample groups ######################################################
# This outputs a list of samples that always occur together in a cluster, no matter which method is used
# It also adds a 'shared' column to the clusters output file. An attempt is made to give similar clusters
# similar labels, so that cluster B1 largely contains the same samples in each clustering method.
if args.print_groups:
    print >>sys.stderr, passedTime(start, time.time()), "Finding consistent groups in all methods used"
    setlist = [i.dups for i in methods if i.ok]
    grouplist1, tally = utils.persistent_groups(copy.copy(setlist), list(sbg.index), args.min_groupsize)
    # print the groups to file; samples that belong to no group keep the label -1
    samples = list(sbg.index)
    labels = [-1] * len(sbg.index)
    with open(args.base + '.groups.txt', 'w') as o:
        o.write("Group\tMembercount\tMembers\n")
        for i in xrange(len(grouplist1)):
            clabel = i+1
            for x in grouplist1[i]:
                idx = samples.index(x)
                labels[idx] = clabel
            o.write("Group {}\t{}\t{}\n".format(clabel, len(grouplist1[i]), "\t".join(str(x) for x in grouplist1[i])))
    # the shared pseudo-method has no scores, so mark it not ok to keep it out of the scores file
    shareObject = utils.cmethod('shared', labels, 0.0, 0.0, args.maxfract)
    shareObject.ok = False
    methods.append(shareObject)
    utils.rename_labels(methods, tally)
    # blank out ungrouped samples (label '0') in the shared column
    shareObject.orderedlabels = ['' if x == '0' else x for x in shareObject.orderedlabels]
with open(args.base + '.scores.txt', 'w') as s:
    s.write("silhouette score\tcophenetic score\tmethod\n")
    for i in methods:
        if i.ok and i.name != 'shared':
            if i.cscore == 0.0:
                # methods without a tree (bicluster, kmeans, kmedoids) have no cophenetic score
                s.write("{0:.2f}\tNA\t{1}\n".format(i.silscore, i.name))
            else:
                s.write("{0:.2f}\t{1:0.2f}\t{2}\n".format(i.silscore, i.cscore, i.name))
##### Output cluster file ###################################################
# samples on rows, clusters in columns - a letter is prepended to the labels so TumorMap will show them properly
# keep the 'shared' pseudo-method in the clusters file even though it is excluded from the scores
keep = [m for m in methods if m.ok or m.name == 'shared']
if args.print_groups:
    # utils.rename_labels has set orderedlabels on all methods
    outdf = pd.DataFrame([m.orderedlabels for m in keep])
else:
    outdf = pd.DataFrame([m.labels for m in keep])
# sbg.index holds whatever was clustered (samples by default, genes with --genes)
outdf.columns = sbg.index
outdf = outdf.transpose()
outdf.columns = [m.name for m in keep]
outdf.to_csv(args.base+'.clusters.tsv', sep='\t')
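# Illustrative layout of <base>.clusters.tsv (values depend on utils.rename_labels;
# the letter+number labels and sample names shown here are hypothetical):
#           biclust  hier_average  hier_ward  kmeans_1  shared
# sample_1  B1       B1            B1         B2        B1
# sample_2  B2       B1            B2         B2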