Update to the last nf-core v2 release, incorporating GTDB #370

Open · wants to merge 5 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -28,7 +28,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install nf-core==2.2
pip install nf-core==2.14.1

- name: Run nf-core lint
env:
14 changes: 0 additions & 14 deletions .markdownlint.yml

This file was deleted.

8 changes: 4 additions & 4 deletions .nf-core.yml
@@ -14,7 +14,6 @@ lint:
- docs/images/nf-core-autometa_logo_dark.png
- .github/ISSUE_TEMPLATE/bug_report.md
- .github/ISSUE_TEMPLATE/feature_request.md

files_unchanged:
- manifest
- .github/CONTRIBUTING.md
@@ -30,10 +29,11 @@ lint:
- LICENSE
- .github/PULL_REQUEST_TEMPLATE.md
- lib/NfcoreTemplate.groovy

actions_ci:
- .github/workflows/ci.yml

schema_lint: true
template_strings: false
nextflow_config: false
nextflow_config:
- manifest.name
- manifest.homePage
repository_type: pipeline
9 changes: 9 additions & 0 deletions .prettierignore
@@ -0,0 +1,9 @@
email_template.html
.nextflow*
work/
data/
results/
.DS_Store
testing/
testing*
*.pyc
1 change: 1 addition & 0 deletions .prettierrc.yml
@@ -0,0 +1 @@
printWidth: 120
11 changes: 10 additions & 1 deletion Dockerfile
@@ -20,14 +20,16 @@ LABEL maintainer="[email protected]"
# along with Autometa. If not, see <http://www.gnu.org/licenses/>.

RUN apt-get update --allow-releaseinfo-change \
&& apt-get install -y procps make \
&& apt-get install -y procps make curl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY autometa-env.yml ./
RUN mamba env update -n base --file=autometa-env.yml \
&& mamba clean --all -y


COPY . /Autometa
WORKDIR /Autometa
@@ -42,6 +44,11 @@ RUN hmmpress -f autometa/databases/markers/bacteria.single_copy.hmm \
&& autometa-config --section databases --option base --value ${DB_DIR} \
&& echo "databases base directory set in ${DB_DIR}/"


# make /scratch/dbs readable by any user (755: owner rwx, everyone else r-x)
RUN chmod -R 755 /scratch/dbs


RUN echo "Testing autometa import" \
&& python -c "import autometa"

@@ -67,3 +74,5 @@ RUN echo "Checking autometa entrypoints" \
&& autometa-binning-ldm-loginfo -h > /dev/null \
&& autometa-benchmark -h > /dev/null \
&& autometa-download-dataset -h > /dev/null

ENV NUMBA_CACHE_DIR=/tmp
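
The NUMBA_CACHE_DIR line is most likely needed because numba writes its JIT cache beside the installed package by default, which fails when the container runs as a non-root user against a read-only site-packages; /tmp is always writable. A quick sanity check inside the image (a sketch, not part of the build):

import os

# Expect "/tmp" inside the container, so numba's JIT cache lands on a
# writable path rather than the read-only install location.
print(os.environ.get("NUMBA_CACHE_DIR"))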
5 changes: 5 additions & 0 deletions autometa-env.yml
@@ -4,18 +4,23 @@ channels:
- bioconda
- defaults
dependencies:
- aria2
- attrs # test-data requirement
- bedtools
- biopython>=1.82
- bowtie2
- curl
- diamond>=2.0
- gzip
- gdown
- hmmer
- joblib>=1.1.0 # See https://stackoverflow.com/a/73830525/12671809
- numba>=0.47
- numpy>=1.13
- pandas>=1.5
- parallel
- pip
- procps-ng # required by nextflow
- prodigal # NOTE: 2.5 and 2.6 output format is different for sequence headers
- python-annoy>=1.11 # required for trimap installation.
- requests
11 changes: 9 additions & 2 deletions autometa/common/kmers.py
@@ -586,9 +586,12 @@ def embed(
f"{method} not in embedding methods. Choices: {', '.join(choices)}"
)
# PCA
n_samples, n_components = df.shape

# Drop any rows in which all cols contain NaN. This may occur if the contig length is below the k-mer size
X = df.dropna(axis="index", how="all").fillna(0).to_numpy()
n_samples, n_components = X.shape

logger.warning(f"n_samples: {n_samples} n_components: {n_components}")
# Set random state using provided seed
random_state = np.random.RandomState(seed)
if isinstance(pca_dimensions, str):
@@ -599,11 +602,15 @@
f"pca_dimensions must be an integer! given: {pca_dimensions}"
)
if n_components > pca_dimensions and pca_dimensions != 0:
if n_samples < pca_dimensions:
logger.error(
f"n_samples ({n_samples}) is less than pca_dimensions ({pca_dimensions}), lowering pca_dimensions to {min(n_samples, pca_dimensions)}."
)
pca_dimensions = min(n_samples, pca_dimensions)
logger.debug(
f"Performing decomposition with PCA (seed {seed}): {n_components} to {pca_dimensions} dims"
)
X = PCA(n_components=pca_dimensions, random_state=random_state).fit_transform(X)
# X = PCA(n_components='mle').fit_transform(X)
n_samples, n_components = X.shape

logger.debug(f"{method}: {n_samples} data points and {n_components} dimensions")
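
The new clamp matters because scikit-learn's PCA requires n_components <= min(n_samples, n_features); with only a handful of contigs and a larger pca_dimensions target, the fit would otherwise raise a ValueError. A toy illustration of the guard (the shapes and the 50-dim target are illustrative, not Autometa defaults):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(30, 512)  # e.g. 30 contigs x 512 k-mer frequencies
pca_dimensions = 50          # target dims exceed the number of samples

n_samples, n_features = X.shape
if n_samples < pca_dimensions:
    # mirror of the fix above: PCA(n_components=50) would raise ValueError here
    pca_dimensions = min(n_samples, pca_dimensions)

X_reduced = PCA(n_components=pca_dimensions, random_state=42).fit_transform(X)
assert X_reduced.shape == (30, 30)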
1 change: 0 additions & 1 deletion autometa/config/databases.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
38 changes: 38 additions & 0 deletions autometa/taxonomy/download_gtdb_files.py
@@ -9,6 +9,8 @@

from tqdm import tqdm

from autometa.config.utilities import DEFAULT_FPATH


# Set up logger
logger = logging.getLogger(__name__)
@@ -312,3 +314,39 @@ def download_and_format(gtdb_host, gtdb_version, single_dir, force=False):
"aa_reps_path": aa_reps_path,
"combined_gtdb_fasta": combined_gtdb_fasta,
}



def main():
    import argparse

    logging.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logging.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Download GTDB files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--version",
        help="GTDB version to download, 'latest' to get the latest version, otherwise specify a version number.",
        default="220",
    )
    parser.add_argument(
        "--host",
        help="GTDB host to download files from.",
        default="data.gtdb.ecogenomic.org",
    )
    parser.add_argument(
        "--outdir",
        help="Directory to save the downloaded files.",
        required=True,
    )
    args = parser.parse_args()
    download_and_format(gtdb_host=args.host, gtdb_version=args.version, single_dir=args.outdir)


if __name__ == "__main__":
    main()
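
For reference, a minimal sketch of driving the same download from Python rather than the new CLI; the output directory is illustrative:

from autometa.taxonomy.download_gtdb_files import download_and_format

# Fetch and format GTDB release 220 into one directory (path illustrative).
paths = download_and_format(
    gtdb_host="data.gtdb.ecogenomic.org",
    gtdb_version="220",
    single_dir="/scratch/dbs/gtdb",
)
print(paths["combined_gtdb_fasta"])  # one of the keys returned above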
2 changes: 1 addition & 1 deletion autometa/taxonomy/gtdb.py
@@ -63,7 +63,7 @@ def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG):
self.names_fpath = os.path.join(dbdir, "names.dmp")
self.merged_fpath = os.path.join(dbdir, "merged.dmp")
self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp")
self.verify_databases()
# self.verify_databases()
self.names = self.parse_names()
self.nodes = self.parse_nodes()
self.merged = self.parse_merged()
5 changes: 4 additions & 1 deletion autometa/validation/datasets.py
@@ -63,7 +63,10 @@ def download(
file_id = df.loc[(community_size, file_name), "file_id"]
file_id_filepath = os.path.join(community_size_outdir, file_name)
url = f"https://drive.google.com/uc?id={file_id}"

# if the file already exists, skip downloading
if os.path.exists(file_id_filepath):
logger.info(f"File {file_name} already exists in {community_size_outdir}. Skipping download.")
continue
gdown.download(url, file_id_filepath)


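
The existence check makes re-runs idempotent at the file level; note it only tests existence, so a truncated earlier download would also be kept. A small illustrative helper (name hypothetical) capturing the same pattern:

import os

import gdown  # same downloader the module uses

def download_if_missing(url: str, dest: str) -> str:
    """Hypothetical helper: download url to dest unless dest already exists."""
    if os.path.exists(dest):
        # Existence-only check: a partially downloaded file is also skipped.
        return dest
    gdown.download(url, dest)
    return dest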
28 changes: 28 additions & 0 deletions bin/mock_data_report.R
@@ -0,0 +1,28 @@
#!/usr/bin/env Rscript

# Render the mock-data report twice from the same Rmd template:
# once summarized by assembly and once summarized by genus.
# Usage: mock_data_report.R <report.Rmd> <bins_path> <assembly_report_path>
args <- commandArgs(trailingOnly = TRUE)

rmarkdown::render(
  input = args[[1]],
  params = list(
    bins_path = args[[2]],
    assembly_to_locus_path = args[[2]],
    assembly_report_path = args[[3]],
    genus = FALSE
  ),
  knit_root_dir = getwd(),
  output_dir = getwd(),
  output_file = "mock_data_report_by_assembly.html"
)
rmarkdown::render(
  input = args[[1]],
  params = list(
    bins_path = args[[2]],
    assembly_to_locus_path = args[[2]],
    assembly_report_path = args[[3]],
    genus = TRUE
  ),
  knit_root_dir = getwd(),
  output_dir = getwd(),
  output_file = "mock_data_report_by_genus.html"
)
19 changes: 11 additions & 8 deletions conf/base.config
@@ -26,22 +26,22 @@ process {
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_low {
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 24.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
time = { check_max( 48.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
@@ -53,4 +53,7 @@
errorStrategy = 'retry'
maxRetries = 2
}
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
}
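
As a worked example of the retry arithmetic: with errorStrategy 'retry' and maxRetries 2 as configured above, a process_medium task requests 36 GB * task.attempt, i.e. 36 GB, then 72 GB, and check_max caps each request at the pipeline-wide limit. A rough Python model of that interaction (the 64 GB cap is illustrative; the real check_max lives in the nf-core template and also handles cpus and time):

def check_max_model(requested_gb: float, max_memory_gb: float) -> float:
    # Simplified stand-in for nf-core's check_max(): never request more
    # than the pipeline-wide maximum.
    return min(requested_gb, max_memory_gb)

for attempt in (1, 2, 3):  # initial try plus up to maxRetries = 2 retries
    requested = 36 * attempt  # process_medium: 36.GB * task.attempt
    print(f"attempt {attempt}: {check_max_model(requested, 64)} GB")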