Update to the last nf-core v2 release, incorporating GTDB #370

Open · wants to merge 5 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -28,7 +28,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install nf-core==2.2
pip install nf-core==2.14.1

- name: Run nf-core lint
env:
14 changes: 0 additions & 14 deletions .markdownlint.yml

This file was deleted.

8 changes: 4 additions & 4 deletions .nf-core.yml
@@ -14,7 +14,6 @@ lint:
- docs/images/nf-core-autometa_logo_dark.png
- .github/ISSUE_TEMPLATE/bug_report.md
- .github/ISSUE_TEMPLATE/feature_request.md

files_unchanged:
- manifest
- .github/CONTRIBUTING.md
@@ -30,10 +29,11 @@ lint:
- LICENSE
- .github/PULL_REQUEST_TEMPLATE.md
- lib/NfcoreTemplate.groovy

actions_ci:
- .github/workflows/ci.yml

schema_lint: true
template_strings: false
nextflow_config: false
nextflow_config:
- manifest.name
- manifest.homePage
repository_type: pipeline
9 changes: 9 additions & 0 deletions .prettierignore
@@ -0,0 +1,9 @@
email_template.html
.nextflow*
work/
data/
results/
.DS_Store
testing/
testing*
*.pyc
1 change: 1 addition & 0 deletions .prettierrc.yml
@@ -0,0 +1 @@
printWidth: 120
11 changes: 10 additions & 1 deletion Dockerfile
@@ -20,14 +20,16 @@ LABEL maintainer="[email protected]"
# along with Autometa. If not, see <http://www.gnu.org/licenses/>.

RUN apt-get update --allow-releaseinfo-change \
&& apt-get install -y procps make \
&& apt-get install -y procps make curl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY autometa-env.yml ./
RUN mamba env update -n base --file=autometa-env.yml \
&& mamba clean --all -y


COPY . /Autometa
WORKDIR /Autometa
@@ -42,6 +44,11 @@ RUN hmmpress -f autometa/databases/markers/bacteria.single_copy.hmm \
&& autometa-config --section databases --option base --value ${DB_DIR} \
&& echo "databases base directory set in ${DB_DIR}/"


# make /scratch/dbs readable by any user (755: owner rwx, everyone else r-x)
RUN chmod -R 755 /scratch/dbs


RUN echo "Testing autometa import" \
&& python -c "import autometa"

@@ -67,3 +74,5 @@ RUN echo "Checking autometa entrypoints" \
&& autometa-binning-ldm-loginfo -h > /dev/null \
&& autometa-benchmark -h > /dev/null \
&& autometa-download-dataset -h > /dev/null

ENV NUMBA_CACHE_DIR=/tmp
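
The NUMBA_CACHE_DIR line is most likely needed because numba writes its JIT cache beside the installed package by default, which fails when the container runs as a non-root user against a read-only site-packages; /tmp is always writable. A quick sanity check inside the image (a sketch, not part of the build):

import os

# Expect "/tmp" inside the container, so numba's JIT cache lands on a
# writable path rather than the read-only install location.
print(os.environ.get("NUMBA_CACHE_DIR"))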
5 changes: 5 additions & 0 deletions autometa-env.yml
@@ -4,18 +4,23 @@ channels:
- bioconda
- defaults
dependencies:
- aria2
- attrs # test-data requirement
- bedtools
- biopython>=1.82
- bowtie2
- curl
- diamond>=2.0
- gzip
- gdown
- hmmer
- joblib>=1.1.0 # See https://stackoverflow.com/a/73830525/12671809
- numba>=0.47
- numpy>=1.13
- pandas>=1.5
- parallel
- pip
- procps-ng # required by nextflow
- prodigal # NOTE: 2.5 and 2.6 output format is different for sequence headers
- python-annoy>=1.11 # required for trimap installation.
- requests
11 changes: 9 additions & 2 deletions autometa/common/kmers.py
@@ -586,9 +586,12 @@ def embed(
f"{method} not in embedding methods. Choices: {', '.join(choices)}"
)
# PCA
n_samples, n_components = df.shape

# Drop any rows in which all cols contain NaN. This may occur if the contig length is below the k-mer size
X = df.dropna(axis="index", how="all").fillna(0).to_numpy()
n_samples, n_components = X.shape

logger.warning(f"n_samples: {n_samples} n_components: {n_components}")
# Set random state using provided seed
random_state = np.random.RandomState(seed)
if isinstance(pca_dimensions, str):
@@ -599,11 +602,15 @@
f"pca_dimensions must be an integer! given: {pca_dimensions}"
)
if n_components > pca_dimensions and pca_dimensions != 0:
if n_samples < pca_dimensions:
logger.error(
f"n_samples ({n_samples}) is less than pca_dimensions ({pca_dimensions}), lowering pca_dimensions to {min(n_samples, pca_dimensions)}."
)
pca_dimensions = min(n_samples, pca_dimensions)
logger.debug(
f"Performing decomposition with PCA (seed {seed}): {n_components} to {pca_dimensions} dims"
)
X = PCA(n_components=pca_dimensions, random_state=random_state).fit_transform(X)
# X = PCA(n_components='mle').fit_transform(X)
n_samples, n_components = X.shape

logger.debug(f"{method}: {n_samples} data points and {n_components} dimensions")
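
The new clamp matters because scikit-learn's PCA requires n_components <= min(n_samples, n_features); with only a handful of contigs and a larger pca_dimensions target, the fit would otherwise raise a ValueError. A toy illustration of the guard (the shapes and the 50-dim target are illustrative, not Autometa defaults):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(30, 512)  # e.g. 30 contigs x 512 k-mer frequencies
pca_dimensions = 50          # target dims exceed the number of samples

n_samples, n_features = X.shape
if n_samples < pca_dimensions:
    # mirror of the fix above: PCA(n_components=50) would raise ValueError here
    pca_dimensions = min(n_samples, pca_dimensions)

X_reduced = PCA(n_components=pca_dimensions, random_state=42).fit_transform(X)
assert X_reduced.shape == (30, 30)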
1 change: 0 additions & 1 deletion autometa/config/databases.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
38 changes: 38 additions & 0 deletions autometa/taxonomy/download_gtdb_files.py
@@ -9,6 +9,8 @@

from tqdm import tqdm

from autometa.config.utilities import DEFAULT_FPATH


# Set up logger
logger = logging.getLogger(__name__)
@@ -312,3 +314,39 @@ def download_and_format(gtdb_host, gtdb_version, single_dir, force=False):
"aa_reps_path": aa_reps_path,
"combined_gtdb_fasta": combined_gtdb_fasta,
}



def main():
    import argparse

    logging.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logging.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Download GTDB files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--version",
        help="GTDB version to download, 'latest' to get the latest version, otherwise specify a version number.",
        default="220",
    )
    parser.add_argument(
        "--host",
        help="GTDB host to download files from.",
        default="data.gtdb.ecogenomic.org",
    )
    parser.add_argument(
        "--outdir",
        help="Directory to save the downloaded files.",
        required=True,
    )
    args = parser.parse_args()
    download_and_format(gtdb_host=args.host, gtdb_version=args.version, single_dir=args.outdir)


if __name__ == "__main__":
    main()
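
For reference, a minimal sketch of driving the same download from Python rather than the new CLI; the output directory is illustrative:

from autometa.taxonomy.download_gtdb_files import download_and_format

# Fetch and format GTDB release 220 into one directory (path illustrative).
paths = download_and_format(
    gtdb_host="data.gtdb.ecogenomic.org",
    gtdb_version="220",
    single_dir="/scratch/dbs/gtdb",
)
print(paths["combined_gtdb_fasta"])  # one of the keys returned above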
2 changes: 1 addition & 1 deletion autometa/taxonomy/gtdb.py
@@ -63,7 +63,7 @@ def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG):
self.names_fpath = os.path.join(dbdir, "names.dmp")
self.merged_fpath = os.path.join(dbdir, "merged.dmp")
self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp")
self.verify_databases()
# self.verify_databases()
self.names = self.parse_names()
self.nodes = self.parse_nodes()
self.merged = self.parse_merged()
5 changes: 4 additions & 1 deletion autometa/validation/datasets.py
@@ -63,7 +63,10 @@ def download(
file_id = df.loc[(community_size, file_name), "file_id"]
file_id_filepath = os.path.join(community_size_outdir, file_name)
url = f"https://drive.google.com/uc?id={file_id}"

# if the file already exists, skip downloading
if os.path.exists(file_id_filepath):
logger.info(f"File {file_name} already exists in {community_size_outdir}. Skipping download.")
continue
gdown.download(url, file_id_filepath)


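
The existence check makes re-runs idempotent at the file level; note it only tests existence, so a truncated earlier download would also be kept. A small illustrative helper (name hypothetical) capturing the same pattern:

import os

import gdown  # same downloader the module uses

def download_if_missing(url: str, dest: str) -> str:
    """Hypothetical helper: download url to dest unless dest already exists."""
    if os.path.exists(dest):
        # Existence-only check: a partially downloaded file is also skipped.
        return dest
    gdown.download(url, dest)
    return dest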
28 changes: 28 additions & 0 deletions bin/mock_data_report.R
@@ -0,0 +1,28 @@
#!/usr/bin/env Rscript

# Render the mock-data report twice from the same Rmd template:
# once summarized by assembly and once summarized by genus.
# Usage: mock_data_report.R <report.Rmd> <bins_path> <assembly_report_path>
args <- commandArgs(trailingOnly = TRUE)

rmarkdown::render(
  input = args[[1]],
  params = list(
    bins_path = args[[2]],
    assembly_to_locus_path = args[[2]],
    assembly_report_path = args[[3]],
    genus = FALSE
  ),
  knit_root_dir = getwd(),
  output_dir = getwd(),
  output_file = "mock_data_report_by_assembly.html"
)
rmarkdown::render(
  input = args[[1]],
  params = list(
    bins_path = args[[2]],
    assembly_to_locus_path = args[[2]],
    assembly_report_path = args[[3]],
    genus = TRUE
  ),
  knit_root_dir = getwd(),
  output_dir = getwd(),
  output_file = "mock_data_report_by_genus.html"
)
19 changes: 11 additions & 8 deletions conf/base.config
@@ -26,22 +26,22 @@ process {
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_low {
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 24.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
time = { check_max( 48.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
@@ -53,4 +53,7 @@
errorStrategy = 'retry'
maxRetries = 2
}
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
}
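
As a worked example of the retry arithmetic: with errorStrategy 'retry' and maxRetries 2 as configured above, a process_medium task requests 36 GB * task.attempt, i.e. 36 GB, then 72 GB, and check_max caps each request at the pipeline-wide limit. A rough Python model of that interaction (the 64 GB cap is illustrative; the real check_max lives in the nf-core template and also handles cpus and time):

def check_max_model(requested_gb: float, max_memory_gb: float) -> float:
    # Simplified stand-in for nf-core's check_max(): never request more
    # than the pipeline-wide maximum.
    return min(requested_gb, max_memory_gb)

for attempt in (1, 2, 3):  # initial try plus up to maxRetries = 2 retries
    requested = 36 * attempt  # process_medium: 36.GB * task.attempt
    print(f"attempt {attempt}: {check_max_model(requested, 64)} GB")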