From 5a7e4d7b82a26d7c3ac236ce2d9ab959d89f5871 Mon Sep 17 00:00:00 2001
From: Thomas Sibley
Date: Wed, 25 May 2022 13:50:27 -0700
Subject: [PATCH 1/2] Revise mem_mb definitions to always result in an integer > 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously they would result in mem_mb=0 when the input size in MB was
less than 1, as the int() would truncate that to 0, which was then
propagated to the final result by multiplication with the constant
scaling factor.

This only impacted smaller builds, such as our CI, but good to fix
anyway.

Use ceil() instead of int() to ensure that we always result in at least
mem_mb=1 (although almost every process we run is going to have a
larger memory footprint than that in reality, so there's still a bit of
inaccuracy…).
---
 .../snakemake_rules/export_for_nextstrain.smk |  3 ++-
 workflow/snakemake_rules/main_workflow.smk    | 16 +++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk
index 8655a84bc..ae1eab1c4 100644
--- a/workflow/snakemake_rules/export_for_nextstrain.smk
+++ b/workflow/snakemake_rules/export_for_nextstrain.smk
@@ -22,6 +22,7 @@
 import re
 import requests
 import json
+from math import ceil
 from workflow.lib.persistent_dict import PersistentDict, NoSuchEntryError

 ruleorder: dated_json > finalize
@@ -80,7 +81,7 @@ rule export_all_regions:
         # Memory use scales primarily with the size of the metadata file.
         # Compared to other rules, this rule loads metadata as a pandas
         # DataFrame instead of a dictionary, so it uses much less memory.
-        mem_mb=lambda wildcards, input: 5 * int(input.metadata.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(5 * (input.metadata.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index d93d1e7c9..fbcbabc76 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -1,3 +1,5 @@
+from math import ceil
+
 rule sanitize_metadata:
     input:
         metadata=lambda wildcards: _get_path_for_input("metadata", wildcards.origin)
@@ -803,7 +805,7 @@ rule tree:
         # Multiple sequence alignments can use up to 40 times their disk size in
         # memory, especially for larger alignments.
         # Note that Snakemake >5.10.0 supports input.size_mb to avoid converting from bytes to MB.
-        mem_mb=lambda wildcards, input: 40 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(40 * (input.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
@@ -839,7 +841,7 @@ rule refine:
         # Multiple sequence alignments can use up to 15 times their disk size in
         # memory.
         # Note that Snakemake >5.10.0 supports input.size_mb to avoid converting from bytes to MB.
-        mem_mb=lambda wildcards, input: 15 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(15 * (input.size / 1024 / 1024))
     params:
         root = config["refine"]["root"],
         clock_rate = config["refine"]["clock_rate"],
@@ -893,7 +895,7 @@ rule ancestral:
         # Multiple sequence alignments can use up to 15 times their disk size in
         # memory.
         # Note that Snakemake >5.10.0 supports input.size_mb to avoid converting from bytes to MB.
-        mem_mb=lambda wildcards, input: 15 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(15 * (input.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
@@ -924,7 +926,7 @@ rule translate:
         # Multiple sequence alignments can use up to 15 times their disk size in
         # memory.
         # Note that Snakemake >5.10.0 supports input.size_mb to avoid converting from bytes to MB.
-        mem_mb=lambda wildcards, input: 15 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(15 * (input.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
@@ -1055,7 +1057,7 @@ rule clades:
         "benchmarks/clades_{build_name}.txt"
     resources:
         # Memory use scales primarily with size of the node data.
-        mem_mb=lambda wildcards, input: 3 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(3 * (input.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
@@ -1081,7 +1083,7 @@ rule emerging_lineages:
         "benchmarks/emerging_lineages_{build_name}.txt"
     resources:
         # Memory use scales primarily with size of the node data.
-        mem_mb=lambda wildcards, input: 3 * int(input.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(3 * (input.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """
@@ -1125,7 +1127,7 @@ rule colors:
         # Memory use scales primarily with the size of the metadata file.
         # Compared to other rules, this rule loads metadata as a pandas
         # DataFrame instead of a dictionary, so it uses much less memory.
-        mem_mb=lambda wildcards, input: 5 * int(input.metadata.size / 1024 / 1024)
+        mem_mb=lambda wildcards, input: ceil(5 * (input.metadata.size / 1024 / 1024))
     conda: config["conda_environment"]
     shell:
         """

From acb198eefe99ec21d164d31ec96e7b89731f74b0 Mon Sep 17 00:00:00 2001
From: Thomas Sibley
Date: Wed, 25 May 2022 14:00:17 -0700
Subject: [PATCH 2/2] Pass the tree rule's mem_mb through to IQ-TREE

While most of our mem_mb definitions are only heuristics for
Snakemake's scheduler and the commands themselves aren't limited by or
aware of the mem_mb defined, IQ-TREE *does* support memory limits.

Resolves .
---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index fbcbabc76..13ef8e3f7 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -811,7 +811,7 @@ rule tree:
         """
         augur tree \
             --alignment {input.alignment} \
-            --tree-builder-args {params.args} \
+            --tree-builder-args {params.args}' --mem {resources.mem_mb}M' \
             {params.exclude_sites} \
             --output {output.tree} \
             --nthreads {threads} 2>&1 | tee {log}
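
Note on the arithmetic behind the first patch: the sketch below (plain
Python, with a made-up ~200 KB input size rather than anything from a real
build) shows why the old int()-based expressions bottomed out at mem_mb=0
for inputs under 1 MB, while the ceil()-based form always yields at least 1.

    from math import ceil

    size_bytes = 200 * 1024                              # hypothetical small input (~0.2 MB)

    old_mem_mb = 15 * int(size_bytes / 1024 / 1024)      # int(0.195...) == 0, so 15 * 0 == 0
    new_mem_mb = ceil(15 * (size_bytes / 1024 / 1024))   # ceil(2.929...) == 3

    print(old_mem_mb, new_mem_mb)                        # prints: 0 3

The revised form also scales the fractional MB value before rounding up, so
it stays slightly closer to the intended estimate for larger inputs as well.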
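
Note on the second patch: the quoting in the new --tree-builder-args line can
look odd at first glance. A rough illustration of how the fragment expands,
using hypothetical values for params.args and resources.mem_mb (the real
values come from the build config and from Snakemake at run time):

    # Hypothetical values; in the workflow these come from config and Snakemake.
    params_args = "'-ninit 10 -n 4'"   # tree-builder-args as a pre-quoted string
    mem_mb = 1500                      # resources.mem_mb computed for this run

    fragment = "--tree-builder-args {args}' --mem {mem}M'".format(args=params_args, mem=mem_mb)
    print(fragment)
    # --tree-builder-args '-ninit 10 -n 4'' --mem 1500M'
    #
    # The shell joins the two adjacent single-quoted strings into one word, so
    # augur tree receives "-ninit 10 -n 4 --mem 1500M" as its tree-builder
    # arguments and the --mem limit is passed on to IQ-TREE.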