-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw4mkmeans.xml
317 lines (255 loc) · 14.8 KB
/
w4mkmeans.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
<tool id="w4mkmeans" name="Kmeans for W4m" version="0.98.5">
<description>Calculate K-means for W4m dataMatrix features or samples</description>
<requirements>
  <!-- Packages, with pinned versions, that must be available when the tool runs. -->
  <requirement version="3.4.1" type="package">r-base</requirement>
  <requirement version="1.1_4" type="package">r-batch</requirement>
  <!-- NOTE(review): libssh2 and krb5 are presumably pinned only to keep the R environment resolvable; confirm they are still needed. -->
  <requirement version="1.8.0" type="package">libssh2</requirement>
  <requirement version="1.13.2" type="package">krb5</requirement>
</requirements>
<stdio>
  <!-- Any exit code of 1 or greater marks the job as failed (fatal). -->
  <exit_code level="fatal" range="1:" />
</stdio>
<!--
  Runs the R wrapper script located in the tool directory, passing every
  setting as a "name value" argument pair.
  * The tool directory is single-quoted, like every other argument, so that an
    installation path containing spaces or shell metacharacters is still passed
    to Rscript as a single argument.
  * GALAXY_SLOTS is written with a leading backslash so that it is left for the
    shell to expand at run time; the shell falls back to 1 when it is unset.
-->
<command detect_errors="aggressive"><![CDATA[
Rscript '$__tool_directory__/w4mkmeans_wrapper.R'
tool_directory '$__tool_directory__'
algorithm '$algorithm'
categorical_prefix '$categoricalPrefix'
data_matrix_path '$dataMatrix_in'
iter_max '$iter_max'
kfeatures '$kfeatures'
ksamples '$ksamples'
nstart '$nstart'
sampleMetadata_out '$sampleMetadata_out'
sample_metadata_path '$sampleMetadata_in'
scores_out '$scores_out'
slots "\${GALAXY_SLOTS:-1}"
variableMetadata_out '$variableMetadata_out'
variable_metadata_path '$variableMetadata_in'
]]></command>
<!--
  User-facing inputs: the three W4m tables followed by the clustering settings.
  The K lists, iter_max and nstart are declared as free-text parameters.
  Markup inside an attribute value must be entity-escaped (see the algorithm
  help text), because a raw less-than sign is not allowed there.
-->
<inputs>
  <!-- W4m input tables -->
  <param name="dataMatrix_in"
         type="data"
         format="tabular"
         label="Data matrix file"
         help="Feature (variable) x sample; decimal point: '.'; missing: NA; mode: numerical; separator: tab" />
  <param name="sampleMetadata_in"
         type="data"
         format="tabular"
         label="Sample metadata file"
         help="Sample x metadata columns; separator: tab" />
  <param name="variableMetadata_in"
         type="data"
         format="tabular"
         label="Variable (feature) metadata file"
         help="Feature (variable) x metadata columns; separator: tab" />

  <!-- Naming of the cluster labels written to the output columns -->
  <param name="categoricalPrefix"
         type="text"
         value="c"
         label="Prefix for cluster names "
         help="String prepended to cluster numbers in output; default 'c'; leave blank for no prefix." />

  <!-- Number(s) of clusters; 0 disables clustering along that axis -->
  <param name="ksamples"
         type="text"
         value="0"
         label="K value(s) for samples"
         help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
  <param name="kfeatures"
         type="text"
         value="0"
         label="K value(s) for features"
         help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />

  <!-- Tuning parameters for the k-means calculation -->
  <param name="iter_max"
         type="text"
         value="20"
         label="Maximum number of iterations"
         help="[iter_max] The maximum number of iterations allowed; default 20." />
  <param name="nstart"
         type="text"
         value="20"
         label="Number of random sets of clusters"
         help="[nstart] How many random sets of clusters should be chosen initially; default 20." />
  <param name="algorithm"
         type="select"
         value="Hartigan-Wong"
         label="Algorithm for clustering"
         help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; &lt;br /&gt;alternatives 'Lloyd', 'MacQueen'; 'Forgy' (synonym for 'Lloyd'); see references.">
    <option value="Forgy">Forgy</option>
    <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
    <option value="Lloyd">Lloyd</option>
    <option value="MacQueen">MacQueen</option>
  </param>
</inputs>
<outputs>
  <!-- Each output is tabular and is labelled with the name of the matching input dataset plus a ".kmeans-*" suffix. -->
  <data name="sampleMetadata_out"
        format="tabular"
        label="${sampleMetadata_in.name}.kmeans-smpl" />
  <data name="variableMetadata_out"
        format="tabular"
        label="${variableMetadata_in.name}.kmeans-vrbl" />
  <data name="scores_out"
        format="tabular"
        label="${dataMatrix_in.name}.kmeans-score" />
</outputs>
<tests>
  <!--
    Single functional test: sample clustering with K = 3 and 4, feature
    clustering with K = 5, 6 and 7, using the Hartigan-Wong algorithm.
    Only the scores output is checked, by looking for fixed substrings.
  -->
  <test>
    <!-- input tables -->
    <param name="dataMatrix_in" value="input_dataMatrix.tsv" />
    <param name="sampleMetadata_in" value="input_sampleMetadata.tsv" />
    <param name="variableMetadata_in" value="input_variableMetadata.tsv" />
    <!-- clustering settings -->
    <param name="ksamples" value="3,4" />
    <param name="kfeatures" value="5,6,7" />
    <param name="iter_max" value="20" />
    <param name="nstart" value="20" />
    <param name="algorithm" value="Hartigan-Wong" />
    <output name="scores_out">
      <assert_contents>
        <!-- column header -->
        <has_text text="proportion" />
        <!-- presumably the five expected proportion values, one per requested K; verify against test-data -->
        <has_text text="0.87482" />
        <has_text text="0.91765" />
        <has_text text="0.95362" />
        <has_text text="0.95719" />
        <has_text text="0.97966" />
      </assert_contents>
    </output>
  </test>
</tests>
<help>
<![CDATA[
===========================
K-means for W4m data matrix
===========================
**Author** - Arthur Eschenlauer (University of Minnesota, [email protected])
**Source Code** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
**R code used** - The R code invoked by this wrapper is the ``kmeans`` function from the R ``stats`` package, documented at https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html
**Tool updates** - See the **NEWS** section at the bottom of this page
-----------
Description
-----------
This tool calculates K-means clusters for samples, features, or both using W4m dataMatrix (i.e., XCMS-preprocessed data files) as input and writes the results to new columns in sampleMetadata, variableMetadata, or both.
- If several, comma-separated K's are supplied, then one column is added for each K.
- For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
- For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
- Clustering is mutually exclusive, **not** hierarchical.
- Hierarchical clustering is available through the W4m Heat Map tool, https://github.com/workflow4metabolomics/heatmap
-----------------
Workflow Position
-----------------
- Tool category: Statistical Analysis
- Upstream tool category: Preprocessing
- Downstream tool categories: Statistical Analysis
-----------
Input files
-----------
+--------------------------------------------+------------+
| File | Format |
+============================================+============+
| Data matrix | tabular |
+--------------------------------------------+------------+
| Sample metadata | tabular |
+--------------------------------------------+------------+
| Variable (i.e., feature) metadata | tabular |
+--------------------------------------------+------------+
----------
Parameters
----------
**Data matrix** - input-file dataset
- W4m variable (i.e. feature) x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below)
**Sample metadata** - input-file dataset
- W4m sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
**Feature metadata** - input-file dataset
- W4m variable (i.e. feature) x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
**Prefix for cluster names** - character(s) to add as prefix to category number (default = 'c')
- Some tools treat only non-numeric data as categorical; this prefix ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools requiring categorical data accept integers).
**K-values for samples** - K or K's for samples (default = 0)
- Integer or comma-separated positive integers; zero (or less) will result in no calculation.
**K-values for features** - K or K's for features (default = 0)
- Integer or comma-separated positive integers; zero (or less) will result in no calculation.
**Maximum number of iterations** - (default = 20)
- Maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
**Number of random sets** - how many random sets should be chosen to start (default = 20)
- Number of random sets of clusters to be chosen to start calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
**Algorithm** - Algorithm for clustering (default = Hartigan-Wong)
- K-means clustering algorithm: 'Hartigan-Wong', 'Lloyd', or 'MacQueen'; 'Forgy' is a synonym for 'Lloyd' (see references for further info).
------------
Output files
------------
**Sample Metadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
- **k#** - cluster number for clustering samples with K = #
**Variable Metadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
- **k#** - cluster number for clustering features with K = #
**scores** - (tabular separated values) file with one line for each K.
- **clusterOn** - what was clustered - either 'sample' or 'feature'
- **k** - the chosen K for clustering
- **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
- **betweenSS** - *between-treatments* sum of squares
- **proportion** - betweenSS / totalSS
---------------
Working example
---------------
.. class:: infomark
**Input files**
+-------------------------------------------------------------------------------------------------------------------+
| URL |
+===================================================================================================================+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
+-------------------------------------------------------------------------------------------------------------------+
.. class:: infomark
**Other input parameters**
+-----------------+---------------+
| Input Parameter | Value |
+=================+===============+
| prefix | c |
+-----------------+---------------+
| ksamples | 3,4 |
+-----------------+---------------+
| kfeatures | 5,6,7 |
+-----------------+---------------+
| iter_max | 20 |
+-----------------+---------------+
| nstart | 20 |
+-----------------+---------------+
| algorithm | Hartigan-Wong |
+-----------------+---------------+
.. class:: infomark
**Expected output files**
+-------------------------------------------------------------------------------------------------------------------+
| URL |
+===================================================================================================================+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-score.tsv |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-vrbl.tsv |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-smpl.tsv |
+-------------------------------------------------------------------------------------------------------------------+
----
NEWS
----
- February 2018, Version 0.98.4 - Renamed output datasets to append '``.kmeans-smpl``', '``.kmeans-vrbl``', or '``.kmeans-score``'; refactored multi-threading.
- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
- August 2017, Version 0.98.1 - First release
---------
Citations
---------
]]>
</help>
<!-- References for the R function used and for each selectable clustering algorithm, plus the W4m platform papers. -->
<citations>
<!-- R stats::kmeans documentation -->
<citation type="bibtex"><![CDATA[
@incollection{RCoreTeam2017,
title = {stats::kmeans - K-Means Clustering},
booktitle = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
publisher = {R Foundation for Statistical Computing},
address = {Vienna, Austria},
year = {2017},
url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html},
}
]]></citation>
<!-- Forgy algorithm -->
<citation type="bibtex"><![CDATA[
@article{forgy65,
added-at = {2006-03-23T12:22:43.000+0100},
author = {Forgy, E.},
biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho},
interhash = {c86383cba8cfe00d5e6ef200016aca3f},
intrahash = {1e31409932ce91df646c4731350e1207},
journal = {Biometrics},
keywords = {clustering kmeans},
number = 3,
pages = {768-769},
timestamp = {2006-03-23T12:22:43.000+0100},
title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
volume = 21,
year = 1965
}
]]></citation>
<!-- W4m 3.0 - Guitton et al. 2017 -->
<citation type="doi">10.1016/j.biocel.2017.07.002</citation>
<!-- W4m 2.5 - Giacomini et al. 2014 -->
<citation type="doi">10.1093/bioinformatics/btu813</citation>
<!-- Hartigan and Wong algorithm -->
<citation type="doi">10.2307/2346830</citation>
<!-- Lloyd algorithm -->
<citation type="doi">10.1109/TIT.1982.1056489</citation>
<!-- MacQueen algorithm -->
<citation type="bibtex"><![CDATA[
@inproceedings{MacQueen1967,
added-at = {2011-01-11T13:35:01.000+0100},
author = {MacQueen, J. B.},
biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc},
booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability},
editor = {Cam, L. M. Le and Neyman, J.},
interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28},
intrahash = {5dcdb8cd9fba78e0e791af619d61d66d},
keywords = {kmeans clustering},
pages = {281-297},
publisher = {University of California Press},
timestamp = {2011-01-11T13:35:01.000+0100},
title = {Some Methods for Classification and Analysis of MultiVariate Observations},
volume = 1,
year = 1967
}
]]></citation>
</citations>
<!--
vim:et:sw=2:ts=2:
--> </tool>