w4mkmeans.xml

﻿<tool id="w4mkmeans" name="Kmeans for W4m" version="0.98.5">
  <description>Calculate K-means for W4m dataMatrix features or samples</description>

  <!-- Packages Galaxy must resolve before the command below can run. -->
  <requirements>
    <!-- R interpreter; the command invokes the wrapper script through Rscript -->
    <requirement type="package" version="3.4.1">r-base</requirement>
    <!-- NOTE(review): presumably used by the R wrapper to parse its name/value arguments; confirm against w4mkmeans_wrapper.R -->
    <requirement type="package" version="1.1_4">r-batch</requirement>
    <!-- NOTE(review): libssh2 and krb5 are not referenced anywhere else in this file; presumably pinned to satisfy native library needs of the R environment; confirm before changing -->
    <requirement type="package" version="1.8.0">libssh2</requirement>
    <requirement type="package" version="1.13.2">krb5</requirement>
  </requirements>

  <!-- Job failure rule: any exit code of 1 or greater marks the job as failed (fatal). -->
  <stdio>
    <exit_code range="1:" level="fatal" />
  </stdio>


  <command detect_errors="aggressive"><![CDATA[
    ## Run the R wrapper script that ships in the tool directory.
    ## Settings are handed over as alternating "name value" arguments.
    ## Every interpolated value, including the tool directory itself, is
    ## single-quoted so that a path containing spaces or shell metacharacters
    ## reaches the script as exactly one argument.
    ## The slots value is left for the shell to expand at run time and falls
    ## back to 1 when GALAXY_SLOTS is not set.
    Rscript '$__tool_directory__/w4mkmeans_wrapper.R'
      tool_directory '$__tool_directory__'
      algorithm '$algorithm'
      categorical_prefix '$categoricalPrefix'
      data_matrix_path '$dataMatrix_in'
      iter_max '$iter_max'
      kfeatures '$kfeatures'
      ksamples '$ksamples'
      nstart '$nstart'
      sampleMetadata_out '$sampleMetadata_out'
      sample_metadata_path '$sampleMetadata_in'
      scores_out '$scores_out'
      slots "\${GALAXY_SLOTS:-1}"
      variableMetadata_out '$variableMetadata_out'
      variable_metadata_path '$variableMetadata_in'
  ]]></command>

  <inputs>
    <!-- The three W4m input tables (tabular datasets). -->
    <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular"
           help="Feature (variable) x sample; decimal point: '.'; missing: NA; mode: numerical; separator: tab" />
    <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular"
           help="Sample x metadata columns; separator: tab" />
    <param name="variableMetadata_in" label="Variable (feature) metadata file" type="data" format="tabular"
           help="Feature (variable) x metadata columns; separator: tab" />

    <!-- Prefix prepended to cluster numbers in the output; may be left blank. -->
    <param name="categoricalPrefix" label="Prefix for cluster names " type="text" value="c"
           help="String prepended to cluster numbers in output; default 'c'; leave blank for no prefix." />

    <!-- K lists stay free text because they accept a single K or a comma-separated list. -->
    <param name="ksamples" label="K value(s) for samples" type="text" value="0"
           help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
    <param name="kfeatures" label="K value(s) for features" type="text" value="0"
           help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />

    <!-- Counts are typed as integers (minimum 1) so that Galaxy rejects
         non-numeric or non-positive entries in the form instead of letting
         the job fail later inside R. Names and defaults are unchanged. -->
    <param name="iter_max" label="Maximum number of iterations" type="integer" min="1" value="20"
           help="[iter_max] The maximum number of iterations allowed; default 20." />
    <param name="nstart" label="Number of random sets of clusters" type="integer" min="1" value="20"
           help="[nstart] How many random sets of clusters should be chosen initially; default 20." />

    <!-- Clustering algorithm; Hartigan-Wong is preselected. -->
    <param name="algorithm" label="Algorithm for clustering" type="select" value="Hartigan-Wong"
           help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; &lt;br /&gt;alternatives 'Lloyd', 'MacQueen'; 'Forgy' (synonym for 'Lloyd'); see references.">
      <option value="Forgy">Forgy</option>
      <option value="Hartigan-Wong" selected="true">Hartigan-Wong</option>
      <option value="Lloyd">Lloyd</option>
      <option value="MacQueen">MacQueen</option>
    </param>
  </inputs>

  <outputs>
    <!-- Three tabular result datasets; each label is the name of the
         corresponding input dataset plus a ".kmeans-*" suffix. -->
    <data name="sampleMetadata_out"
          format="tabular"
          label="${sampleMetadata_in.name}.kmeans-smpl" />
    <data name="variableMetadata_out"
          format="tabular"
          label="${variableMetadata_in.name}.kmeans-vrbl" />
    <data name="scores_out"
          format="tabular"
          label="${dataMatrix_in.name}.kmeans-score" />
  </outputs>

  <tests>
    <!-- Single functional test: cluster samples with K = 3,4 and features
         with K = 5,6,7, then inspect the scores table only. -->
    <test>
      <!-- input datasets (looked up in the tool's test data) -->
      <param name="dataMatrix_in"       value="input_dataMatrix.tsv" />
      <param name="sampleMetadata_in"   value="input_sampleMetadata.tsv" />
      <param name="variableMetadata_in" value="input_variableMetadata.tsv" />
      <!-- clustering settings -->
      <param name="ksamples"  value="3,4" />
      <param name="kfeatures" value="5,6,7" />
      <param name="iter_max"  value="20" />
      <param name="nstart"    value="20" />
      <param name="algorithm" value="Hartigan-Wong" />
      <!-- The scores output must contain the "proportion" text and five
           numeric strings (presumably one proportion per requested K; confirm
           against the expected scores file). -->
      <output name="scores_out">
        <assert_contents>
          <has_text text="proportion" />
          <has_text text="0.87482" />
          <has_text text="0.91765" />
          <has_text text="0.95362" />
          <has_text text="0.95719" />
          <has_text text="0.97966" />
        </assert_contents>
      </output>
    </test>
  </tests>

  <help>
    <![CDATA[

===========================
K-means for W4m data matrix
===========================

**Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)

**Source Code** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper

**R code used** - The R code invoked by this wrapper is the ``kmeans`` function from the R ``stats`` package, documented at https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html

**Tool updates** - See the **NEWS** section at the bottom of this page

-----------
Description
-----------

This tool calculates K-means clusters for samples, features, or both using W4m dataMatrix (i.e., XCMS-preprocessed data files) as input and writes the results to new columns in sampleMetadata, variableMetadata, or both.

- If several, comma-separated K's are supplied, then one column is added for each K.
- For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
- For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
- Clustering is mutually exclusive, **not** hierarchical.

    - Hierarchical clustering is available through the W4m Heat Map tool, https://github.com/workflow4metabolomics/heatmap

-----------------
Workflow Position
-----------------

- Tool category: Statistical Analysis
- Upstream tool category: Preprocessing
- Downstream tool categories: Statistical Analysis

-----------
Input files
-----------

+--------------------------------------------+------------+
| File                                       |   Format   |
+============================================+============+
|     Data matrix                            |   tabular  |
+--------------------------------------------+------------+
|     Sample metadata                        |   tabular  |
+--------------------------------------------+------------+
|     Variable (i.e., feature) metadata      |   tabular  |
+--------------------------------------------+------------+


----------
Parameters
----------

**Data matrix** - input-file dataset

- W4m variable (i.e. feature) x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row names (features) and column names (samples) must be identical to the row names of the feature metadata and sample metadata, respectively (see below)

**Sample metadata** - input-file dataset

- W4m sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values

**Feature metadata** - input-file dataset

- W4m variable (i.e. feature) x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values

**Prefix for cluster names** - character(s) to add as prefix to category number (default = 'c')

- Some tools treat only non-numeric data as categorical; this prefix ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (e.g., when succeeding tools that require categorical data accept integers).

**K-values for samples** - K or K's for samples (default = 0)

- Integer or comma-separated positive integers; zero (or less) will result in no calculation.

**K-values for features** - K or K's for features (default = 0)

- Integer or comma-separated positive integers; zero (or less) will result in no calculation.

**Maximum number of iterations** - (default = 20)

- Maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).

**Number of random sets** - how many random sets should be chosen to start (default = 20)

- Number of random sets of clusters to be chosen to start calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).

**Algorithm** - Algorithm for clustering (default = Hartigan-Wong)

- K-means clustering algorithm: 'Hartigan-Wong', 'Lloyd', or 'MacQueen'; 'Forgy' is a synonym for 'Lloyd' (see references for further info).

------------
Output files
------------

**Sample Metadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K

- **k#** - cluster number for clustering samples with K = #

**Variable Metadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K

- **k#** - cluster number for clustering features with K = #

**scores** - (tabular separated values) file with one line for each K.

- **clusterOn** - what was clustered - either 'sample' or 'feature'
- **k** - the chosen K for clustering
- **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
- **betweenSS** - *between-treatments* sum of squares
- **proportion** - betweenSS / totalSS

---------------
Working example
---------------

.. class:: infomark

**Input files**

+-------------------------------------------------------------------------------------------------------------------+
| URL                                                                                                               |
+===================================================================================================================+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv       |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv   |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
+-------------------------------------------------------------------------------------------------------------------+

.. class:: infomark

**Other input parameters**

+-----------------+---------------+
| Input Parameter | Value         |
+=================+===============+
| prefix          | c             |
+-----------------+---------------+
| ksamples        | 3,4           |
+-----------------+---------------+
| kfeatures       | 5,6,7         |
+-----------------+---------------+
| iter_max        | 20            |
+-----------------+---------------+
| nstart          | 20            |
+-----------------+---------------+
| algorithm       | Hartigan-Wong |
+-----------------+---------------+

.. class:: infomark

**Expected output files**

+-------------------------------------------------------------------------------------------------------------------+
| URL                                                                                                               |
+===================================================================================================================+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-score.tsv    |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-vrbl.tsv     |
+-------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-smpl.tsv     |
+-------------------------------------------------------------------------------------------------------------------+

----
NEWS
----

- February 2018, Version 0.98.4 - Renamed output datasets to append '``.kmeans-smpl``', '``.kmeans-vrbl``', or '``.kmeans-score``'; refactored multi-threading.
- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
- August 2017, Version 0.98.1 - First release

---------
Citations
---------

    ]]>
  </help>
  <!-- References for the clustering function, the selectable algorithms, and the W4m platform. -->
  <citations>
    <!-- R stats::kmeans documentation -->
    <citation type="bibtex"><![CDATA[
@incollection{RCoreTeam2017,
  title = {stats::kmeans - K-Means Clustering},
  booktitle = {R: A Language and Environment for Statistical Computing},
  author = {{R Core Team}},
  publisher = {R Foundation for Statistical Computing},
  address = {Vienna, Austria},
  year = {2017},
  url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html},
}
    ]]></citation>
    <!-- Forgy algorithm -->
    <citation type="bibtex"><![CDATA[
@article{forgy65,
  added-at = {2006-03-23T12:22:43.000+0100},
  author = {Forgy, E.},
  biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho},
  interhash = {c86383cba8cfe00d5e6ef200016aca3f},
  intrahash = {1e31409932ce91df646c4731350e1207},
  journal = {Biometrics},
  keywords = {clustering kmeans},
  number = 3,
  pages = {768-769},
  timestamp = {2006-03-23T12:22:43.000+0100},
  title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
  volume = 21,
  year = 1965
}
    ]]></citation>
    <!-- W4m 3.0 - Guitton et al. 2017 -->
    <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
    <!-- W4m 2.5 - Giacomini et al. 2014 -->
    <citation type="doi">10.1093/bioinformatics/btu813</citation>
    <!-- Hartigan and Wong algorithm -->
    <citation type="doi">10.2307/2346830</citation>
    <!-- Lloyd algorithm -->
    <citation type="doi">10.1109/TIT.1982.1056489</citation>
    <!-- MacQueen algorithm -->
    <citation type="bibtex"><![CDATA[
@inproceedings{MacQueen1967,
  added-at = {2011-01-11T13:35:01.000+0100},
  author = {MacQueen, J. B.},
  biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc},
  booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability},
  editor = {Cam, L. M. Le and Neyman, J.},
  interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28},
  intrahash = {5dcdb8cd9fba78e0e791af619d61d66d},
  keywords = {kmeans clustering},
  pages = {281-297},
  publisher = {University of California Press},
  timestamp = {2011-01-11T13:35:01.000+0100},
  title = {Some Methods for Classification and Analysis of MultiVariate Observations},
  volume = 1,
  year = 1967
}
    ]]></citation>
  </citations>
  <!--
     vim:et:sw=2:ts=2:
--> </tool>