From ff49561d5e26860d830a1db72b3ee93d46a9fae1 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 21 Apr 2021 15:53:30 -0700 Subject: [PATCH 1/7] Use nextalign for alignment instead of mafft Makes nextalign the default and only tool for alignment, removing mafft rules and adding configuration details (genes) needed by nextalign to the default parameters. --- defaults/parameters.yaml | 3 + workflow/snakemake_rules/main_workflow.smk | 328 +++++++++------------ 2 files changed, 135 insertions(+), 196 deletions(-) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 7e382e8cf..49d57dd1e 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -36,6 +36,9 @@ files: clades: "defaults/clades.tsv" emerging_lineages: "defaults/emerging_lineages.tsv" +# Define genes to translate during alignment by nextalign. +genes: ["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"] + # Filter settings filter: # Require nearly full-length genomes. diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index a8ea9aecd..eeb11ffe3 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -42,85 +42,50 @@ rule combine_input_metadata: python3 scripts/combine_metadata.py --metadata {input.metadata} --origins {params.origins} --output {output.metadata} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule align: - message: - """ - Aligning sequences to {input.reference} - - gaps relative to reference are considered real - """ - input: - sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), - genemap = config["files"]["annotation"], - reference = config["files"]["alignment_reference"] - output: - alignment = "results/aligned{origin}.fasta", - insertions = "results/insertions{origin}.tsv", - translations = expand("results/translations/seqs{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) - params: - outdir = "results/translations", - genes = ','.join(config.get('genes', ['S'])), - basename = "seqs{origin}", - strain_prefixes=config["strip_strain_prefixes"], - log: - "logs/align{origin}.txt" - benchmark: - "benchmarks/align{origin}.txt" - conda: config["conda_environment"] - threads: 8 - resources: - mem_mb=3000 - shell: - """ - python3 scripts/sanitize_sequences.py \ - --sequences {input.sequences} \ - --strip-prefixes {params.strain_prefixes:q} \ - --output /dev/stdout \ - | nextalign \ - --jobs={threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --genes {params.genes} \ - --sequences /dev/stdin \ - --output-dir {params.outdir} \ - --output-basename {params.basename} \ - --output-fasta {output.alignment} \ - --output-insertions {output.insertions} > {log} 2>&1 - """ -else: - rule align: - message: - """ - Aligning sequences from {input.sequences} to {input.reference} +rule align: + message: + """ + Aligning sequences to {input.reference} - gaps relative to reference are considered real - """ - input: - sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), - reference = config["files"]["alignment_reference"] - output: - alignment = "results/aligned{origin}.fasta" - log: - "logs/align{origin}.txt" - benchmark: - "benchmarks/align{origin}.txt" - threads: 16 - conda: config["conda_environment"] - params: - strain_prefixes=config["strip_strain_prefixes"], - shell: - """ - python3 scripts/sanitize_sequences.py \ - --sequences {input.sequences} \ - --strip-prefixes 
{params.strain_prefixes:q} \ - --output /dev/stdout \ - | mafft \ - --auto \ - --thread {threads} \ - --keeplength \ - --addfragments \ - /dev/stdin \ - {input.reference} > {output} 2> {log} - """ + """ + input: + sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), + genemap = config["files"]["annotation"], + reference = config["files"]["alignment_reference"] + output: + alignment = "results/aligned{origin}.fasta", + insertions = "results/insertions{origin}.tsv", + translations = expand("results/translations/seqs{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) + params: + outdir = "results/translations", + genes = ','.join(config.get('genes', ['S'])), + basename = "seqs{origin}", + strain_prefixes=config["strip_strain_prefixes"], + log: + "logs/align{origin}.txt" + benchmark: + "benchmarks/align{origin}.txt" + conda: config["conda_environment"] + threads: 8 + resources: + mem_mb=3000 + shell: + """ + python3 scripts/sanitize_sequences.py \ + --sequences {input.sequences} \ + --strip-prefixes {params.strain_prefixes:q} \ + --output /dev/stdout \ + | nextalign \ + --jobs={threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --genes {params.genes} \ + --sequences /dev/stdin \ + --output-dir {params.outdir} \ + --output-basename {params.basename} \ + --output-fasta {output.alignment} \ + --output-insertions {output.insertions} > {log} 2>&1 + """ rule diagnostic: message: "Scanning aligned sequences {input.alignment} for problematic sequences" @@ -551,74 +516,45 @@ rule combine_samples: --output-metadata {output.metadata} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule build_align: - message: - """ - Aligning sequences to {input.reference} - - gaps relative to reference are considered real - """ - input: - sequences = rules.combine_samples.output.sequences, - genemap = config["files"]["annotation"], - reference = config["files"]["alignment_reference"] - output: - alignment = "results/{build_name}/aligned.fasta", - insertions = "results/{build_name}/insertions.tsv", - translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) - params: - outdir = "results/{build_name}/translations", - genes = ','.join(config.get('genes', ['S'])), - basename = "aligned" - log: - "logs/align_{build_name}.txt" - benchmark: - "benchmarks/align_{build_name}.txt" - conda: config["conda_environment"] - threads: 8 - resources: - mem_mb=3000 - shell: - """ - nextalign \ - --jobs={threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --genes {params.genes} \ - --sequences {input.sequences} \ - --output-dir {params.outdir} \ - --output-basename {params.basename} \ - --output-fasta {output.alignment} \ - --output-insertions {output.insertions} > {log} 2>&1 - """ -else: - rule build_align: - message: - """ - Aligning sequences from {input.sequences} to {input.reference} +rule build_align: + message: + """ + Aligning sequences to {input.reference} - gaps relative to reference are considered real - """ - input: - sequences = rules.combine_samples.output.sequences, - reference = config["files"]["alignment_reference"] - output: - alignment = "results/{build_name}/aligned.fasta" - log: - "logs/align_{build_name}.txt" - benchmark: - "benchmarks/align_{build_name}.txt" - threads: 16 - conda: config["conda_environment"] - shell: - """ - mafft \ - --auto \ - --thread {threads} \ - --keeplength \ - --addfragments \ - {input.sequences} \ - 
{input.reference} > {output} 2> {log} - """ + """ + input: + sequences = rules.combine_samples.output.sequences, + genemap = config["files"]["annotation"], + reference = config["files"]["alignment_reference"] + output: + alignment = "results/{build_name}/aligned.fasta", + insertions = "results/{build_name}/insertions.tsv", + translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) + params: + outdir = "results/{build_name}/translations", + genes = ','.join(config.get('genes', ['S'])), + basename = "aligned" + log: + "logs/align_{build_name}.txt" + benchmark: + "benchmarks/align_{build_name}.txt" + conda: config["conda_environment"] + threads: 8 + resources: + mem_mb=3000 + shell: + """ + nextalign \ + --jobs={threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --genes {params.genes} \ + --sequences {input.sequences} \ + --output-dir {params.outdir} \ + --output-basename {params.basename} \ + --output-fasta {output.alignment} \ + --output-insertions {output.insertions} > {log} 2>&1 + """ if "run_pangolin" in config and config["run_pangolin"]: rule run_pangolin: @@ -886,57 +822,57 @@ rule aa_muts_explicit: --genes {params.genes} \ --output {output.node_data} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule build_mutation_summary: - message: "Summarizing {input.alignment}" - input: - alignment = rules.build_align.output.alignment, - insertions = rules.build_align.output.insertions, - translations = rules.build_align.output.translations, - reference = config["files"]["alignment_reference"], - genemap = config["files"]["annotation"] - output: - mutation_summary = "results/{build_name}/mutation_summary.tsv" - log: - "logs/mutation_summary_{build_name}.txt" - params: - outdir = "results/{build_name}/translations", - basename = "aligned" - conda: config["conda_environment"] - shell: - """ - python3 scripts/mutation_summary.py \ - --alignment {input.alignment} \ - --insertions {input.insertions} \ - --directory {params.outdir} \ - --basename {params.basename} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --output {output.mutation_summary} 2>&1 | tee {log} - """ - rule distances: - input: - tree = rules.refine.output.tree, - alignments = "results/{build_name}/translations/aligned.gene.S_withInternalNodes.fasta", - distance_maps = ["defaults/distance_maps/S1.json"] - params: - genes = 'S', - comparisons = ['root'], - attribute_names = ['S1_mutations'] - output: - node_data = "results/{build_name}/distances.json" - shell: - """ - python scripts/mutation_counts.py \ - --tree {input.tree} \ - --alignment {input.alignments} \ - --gene-names {params.genes} \ - --compare-to {params.comparisons} \ - --attribute-name {params.attribute_names} \ - --map {input.distance_maps} \ - --output {output} - """ +rule build_mutation_summary: + message: "Summarizing {input.alignment}" + input: + alignment = rules.build_align.output.alignment, + insertions = rules.build_align.output.insertions, + translations = rules.build_align.output.translations, + reference = config["files"]["alignment_reference"], + genemap = config["files"]["annotation"] + output: + mutation_summary = "results/{build_name}/mutation_summary.tsv" + log: + "logs/mutation_summary_{build_name}.txt" + params: + outdir = "results/{build_name}/translations", + basename = "aligned" + conda: config["conda_environment"] + shell: + """ + python3 scripts/mutation_summary.py \ + --alignment {input.alignment} \ + 
--insertions {input.insertions} \ + --directory {params.outdir} \ + --basename {params.basename} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --output {output.mutation_summary} 2>&1 | tee {log} + """ + +rule distances: + input: + tree = rules.refine.output.tree, + alignments = "results/{build_name}/translations/aligned.gene.S_withInternalNodes.fasta", + distance_maps = ["defaults/distance_maps/S1.json"] + params: + genes = 'S', + comparisons = ['root'], + attribute_names = ['S1_mutations'] + output: + node_data = "results/{build_name}/distances.json" + shell: + """ + python scripts/mutation_counts.py \ + --tree {input.tree} \ + --alignment {input.alignments} \ + --gene-names {params.genes} \ + --compare-to {params.comparisons} \ + --attribute-name {params.attribute_names} \ + --map {input.distance_maps} \ + --output {output} + """ rule traits: message: @@ -1314,7 +1250,7 @@ rule add_branch_labels: python3 ./scripts/add_branch_labels.py \ --input {input.auspice_json} \ --emerging-clades {input.emerging_clades} \ - --output {output.auspice_json} + --output {output.auspice_json} """ rule incorporate_travel_history: From 383e1f6d644cdd206c1edda6994efb94ad7531a6 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 21 Apr 2021 15:55:05 -0700 Subject: [PATCH 2/7] Drop unused haplotype status rule and script Removes the haplotype status rule and script that is no longer required. --- scripts/annotate-haplotype-status.py | 36 ---------------------- workflow/snakemake_rules/main_workflow.smk | 21 ------------- 2 files changed, 57 deletions(-) delete mode 100644 scripts/annotate-haplotype-status.py diff --git a/scripts/annotate-haplotype-status.py b/scripts/annotate-haplotype-status.py deleted file mode 100644 index cc5e54dac..000000000 --- a/scripts/annotate-haplotype-status.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Annotate haplotype status for all sequences that match a given reference node name. -""" -import argparse -from augur.utils import write_json -import json -import sys - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--ancestral-sequences", required=True, help="node data JSON of nucleotide mutations including observed and inferred by TreeTime") - parser.add_argument("--reference-node-name", required=True, help="name of the node whose sequence flags the desired haplotype") - parser.add_argument("--attribute-name", default="haplotype_status", help="name of attribute for haplotype status") - parser.add_argument("--output", required=True, help="node data JSON with annotated haplotype status based on the given reference node's sequence") - - args = parser.parse_args() - - with open(args.ancestral_sequences, "r") as fh: - sequences = json.load(fh) - - if args.reference_node_name not in sequences["nodes"]: - print("ERROR: Could not find the requested reference node named '%s' in the given ancestral sequences." 
% args.reference_node_name, file=sys.stderr)
-        sys.exit(1)
-
-    haplotype_sequence = sequences["nodes"][args.reference_node_name]["sequence"]
-    haplotype_status = {"nodes": {}}
-
-    for node in sequences["nodes"]:
-        if sequences["nodes"][node]["sequence"] == haplotype_sequence:
-            status = "haplotype matches %s" % args.reference_node_name
-        else:
-            status = "haplotype does not match %s" % args.reference_node_name
-
-        haplotype_status["nodes"][node] = {args.attribute_name: status}
-
-    write_json(haplotype_status, args.output)
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index eeb11ffe3..a9d4b3eb1 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -748,27 +748,6 @@ rule ancestral:
         --infer-ambiguous 2>&1 | tee {log}
         """
 
-rule haplotype_status:
-    message: "Annotating haplotype status relative to {params.reference_node_name}"
-    input:
-        nt_muts = rules.ancestral.output.node_data
-    output:
-        node_data = "results/{build_name}/haplotype_status.json"
-    log:
-        "logs/haplotype_status_{build_name}.txt"
-    benchmark:
-        "benchmarks/haplotype_status_{build_name}.txt"
-    params:
-        reference_node_name = config["reference_node_name"]
-    conda: config["conda_environment"]
-    shell:
-        """
-        python3 scripts/annotate-haplotype-status.py \
-            --ancestral-sequences {input.nt_muts} \
-            --reference-node-name {params.reference_node_name:q} \
-            --output {output.node_data} 2>&1 | tee {log}
-        """
-
 rule translate:
     message: "Translating amino acid sequences"
     input:

From 74c27f96bcccc08cdc1cdc425195dc31111c4844 Mon Sep 17 00:00:00 2001
From: John Huddleston
Date: Wed, 21 Apr 2021 16:00:34 -0700
Subject: [PATCH 3/7] Remove unused nucleotide mutation frequencies rule

Removes an experimental and unused rule to calculate mutation frequencies
from multiple sequence alignments.
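
For anyone who still needs these estimates, the same calculation can be run
by hand with the flags the removed rule passed to augur. This is a minimal
sketch only: the file paths and numeric values below are illustrative
placeholders for the entries the rule read from config["frequencies"]:

    # Sketch: substitute your own build outputs and frequency settings.
    augur frequencies \
        --method diffusion \
        --alignments results/my-build/aligned.fasta \
        --gene-names nuc \
        --metadata results/combined_metadata.tsv \
        --min-date 2020-01-01 \
        --minimal-frequency 0.01 \
        --pivot-interval 1 \
        --stiffness 20 \
        --inertia 0.2 \
        --output results/my-build/nucleotide_mutation_frequencies.json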
--- .../snakemake_rules/export_for_nextstrain.smk | 3 -- workflow/snakemake_rules/main_workflow.smk | 36 ------------------- 2 files changed, 39 deletions(-) diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index 782f58051..68a3ed42f 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -85,9 +85,6 @@ rule export_all_regions: """ -rule all_mutation_frequencies: - input: expand("results/{build_name}/nucleotide_mutation_frequencies.json", build_name=BUILD_NAMES) - rule mutation_summary: message: "Summarizing {input.alignment}" input: diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index a9d4b3eb1..540e12298 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -1094,42 +1094,6 @@ rule logistic_growth: --output {output.node_data} 2>&1 | tee {log} """ -rule nucleotide_mutation_frequencies: - message: "Estimate nucleotide mutation frequencies" - input: - alignment = rules.build_align.output.alignment, - metadata = _get_metadata_by_wildcards - output: - frequencies = "results/{build_name}/nucleotide_mutation_frequencies.json" - log: - "logs/nucleotide_mutation_frequencies_{build_name}.txt" - benchmark: - "benchmarks/nucleotide_mutation_frequencies_{build_name}.txt" - params: - min_date = config["frequencies"]["min_date"], - minimal_frequency = config["frequencies"]["minimal_frequency"], - pivot_interval = config["frequencies"]["pivot_interval"], - stiffness = config["frequencies"]["stiffness"], - inertia = config["frequencies"]["inertia"] - resources: - # Memory use scales primarily with the size of the metadata file. - mem_mb=lambda wildcards, input: 15 * int(input.metadata.size / 1024 / 1024) - conda: config["conda_environment"] - shell: - """ - augur frequencies \ - --method diffusion \ - --alignments {input.alignment} \ - --gene-names nuc \ - --metadata {input.metadata} \ - --min-date {params.min_date} \ - --minimal-frequency {params.minimal_frequency} \ - --pivot-interval {params.pivot_interval} \ - --stiffness {params.stiffness} \ - --inertia {params.inertia} \ - --output {output.frequencies} 2>&1 | tee {log} - """ - def export_title(wildcards): # TODO: maybe we could replace this with a config entry for full/human-readable build name? location_name = wildcards.build_name From aba6b1b0eb18aa0af3cabe10bcfb006e81118888 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 21 Apr 2021 16:08:03 -0700 Subject: [PATCH 4/7] Use augur distance for mutation counts Replaces custom script with augur distance command that does the same thing. 
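
For reference, the replacement command can be run standalone as in the
following minimal sketch. The flag values come from the rule's existing
params; "my-build" is a placeholder for the {build_name} wildcard and the
tree path is assumed from the refine rule's output:

    # Sketch: "my-build" and the tree filename are illustrative only.
    augur distance \
        --tree results/my-build/tree.nwk \
        --alignment results/my-build/translations/aligned.gene.S_withInternalNodes.fasta \
        --gene-names S \
        --compare-to root \
        --attribute-name S1_mutations \
        --map defaults/distance_maps/S1.json \
        --output results/my-build/distances.json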
--- scripts/mutation_counts.py | 55 ---------------------- workflow/snakemake_rules/main_workflow.smk | 2 +- 2 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 scripts/mutation_counts.py diff --git a/scripts/mutation_counts.py b/scripts/mutation_counts.py deleted file mode 100644 index e1ed496f3..000000000 --- a/scripts/mutation_counts.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse, json -from Bio import Phylo, SeqIO -import pandas as pd -import numpy as np - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="count distances based on distance maps with counting gaps as one event", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--tree', type=str, required=True, help="tree file") - parser.add_argument('--alignment', type=str) - parser.add_argument('--gene-names', nargs='+') - parser.add_argument('--compare-to', nargs='+') - parser.add_argument('--map', nargs='+') - parser.add_argument('--attribute-name', nargs='+') - parser.add_argument('--output', type=str, required=True, help="output json file") - args = parser.parse_args() - - if type(args.map)==str: - args.map = [args.map] - - if type(args.attribute_name)==str: - args.attribute_name = [args.attribute_name] - - node_data = {} - - T = Phylo.read(args.tree, 'newick') - aln = {s.name:str(s.seq) for s in SeqIO.parse(args.alignment, 'fasta')} - root = aln[T.root.name] - - for map_file, attribute in zip(args.map, args.attribute_name): - with open(map_file) as fh: - map = json.load(fh) - - weights = {int(k)-1:v for k,v in map["map"][args.gene_names[0]].items()} - default = map['default'] - for n in T.find_clades(): - if n.name not in node_data: node_data[n.name] = {} - distance = 0 - gap_extend=False - if n.name not in aln: - print(f"{n.name} not found in alignment") - continue - - for p, (a,d) in enumerate(zip(root, aln[n.name])): - if a!=d and (not gap_extend): - distance+=weights.get(p, default) - - gap_extend= d=='-' - node_data[n.name][f"{attribute}"] = distance - - with open(args.output, 'w') as fh: - json.dump({"nodes":node_data}, fh) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 540e12298..de65fc0f6 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -843,7 +843,7 @@ rule distances: node_data = "results/{build_name}/distances.json" shell: """ - python scripts/mutation_counts.py \ + augur distance \ --tree {input.tree} \ --alignment {input.alignments} \ --gene-names {params.genes} \ From 074816e3bbca5a27e08fbfbc2af74da4b1a9d5a3 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 21 Apr 2021 15:49:26 -0700 Subject: [PATCH 5/7] Drop support for old sequence/metadata inputs Removes deprecated sequence and metadata inputs from the configuration file and removes Snakemake logic required to support these files. Also, removes references to this deprecated input format from the example profiles and the "multiple inputs" tutorial. Since we no longer support this old input format, we also no longer need to support empty origin wildcards. We drop support for empty origin wildcard and remove all references to trimming of origin wildcards that start with an underscore and update all rules to reference the origin wildcard with the underscore in the filename. We also now print helpful errors when inputs aren't defined properly through checks for configurations with old-style input definitions or without any inputs defined. 
These error messages provide recommendations about how to update the workflow configuration to fix the issues. Fixes #616 --- Snakefile | 23 ++++++- defaults/parameters.yaml | 5 -- docs/multiple_inputs.md | 20 +++--- my_profiles/example/builds.yaml | 6 ++ my_profiles/example/config.yaml | 4 -- my_profiles/getting_started/builds.yaml | 6 ++ my_profiles/getting_started/config.yaml | 4 -- workflow/snakemake_rules/common.smk | 62 +++++++------------ workflow/snakemake_rules/download.smk | 50 +++++++-------- .../snakemake_rules/export_for_nextstrain.smk | 8 +-- workflow/snakemake_rules/main_workflow.smk | 50 +++++++-------- 11 files changed, 118 insertions(+), 120 deletions(-) diff --git a/Snakefile b/Snakefile index acaa62f35..cc829401d 100644 --- a/Snakefile +++ b/Snakefile @@ -8,6 +8,7 @@ from getpass import getuser from snakemake.logging import logger from snakemake.utils import validate from collections import OrderedDict +import textwrap import time # Store the user's configuration prior to loading defaults, so we can check for @@ -74,6 +75,26 @@ if "builds" not in config: include: "workflow/snakemake_rules/reference_build_definitions.smk" +# Check for old-style input file references and alert users to the new format. +if "sequences" in config or "metadata" in config: + logger.error("ERROR: Your configuration file includes references to an unsupported specification of input files (e.g., `config['sequences']` or `config['metadata']`).") + logger.error("Update your configuration file (e.g., 'builds.yaml') to define your inputs as follows and try running the workflow again:") + logger.error(textwrap.indent( + f"\ninputs:\n name: local-data\n metadata: {config['metadata']}\n sequences: {config['sequences']}\n", + " " + )) + sys.exit(1) + +# Check for missing inputs. +if "inputs" not in config: + logger.error("ERROR: Your workflow does not define any input files to start with.") + logger.error("Update your configuration file (e.g., 'builds.yaml') to define at least one input dataset as follows and try running the workflow again:") + logger.error(textwrap.indent( + f"\ninputs:\n name: local-data\n metadata: data/example_metadata.tsv\n sequences: data/example_sequences.fasta.gz\n", + " " + )) + sys.exit(1) + # Allow users to specify a list of active builds from the command line. if config.get("active_builds"): BUILD_NAMES = config["active_builds"].split(",") @@ -93,7 +114,7 @@ wildcard_constraints: # but not special strings used for Nextstrain builds. build_name = r'(?:[_a-zA-Z-](?!(tip-frequencies)))+', date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]", - origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string + origin = r"[a-zA-Z0-9-_]+" localrules: download_metadata, download_sequences, clean diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 49d57dd1e..f0974186b 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -6,11 +6,6 @@ # This must be a relative path to the top-level Snakefile directory (e.g., `ncov/`). conda_environment: "workflow/envs/nextstrain.yaml" -# These are the two main starting files for the run. 
-# If they do not exist, we will attempt to fetch them from a S3 bucket (see below) -sequences: "data/sequences.fasta" -metadata: "data/metadata.tsv" - strip_strain_prefixes: - hCoV-19/ - SARS-CoV-2/ diff --git a/docs/multiple_inputs.md b/docs/multiple_inputs.md index a40646b2c..7ba784d43 100644 --- a/docs/multiple_inputs.md +++ b/docs/multiple_inputs.md @@ -51,15 +51,17 @@ my_profiles/example_multiple_inputs/my_auspice_config.json ## Setting up the config -Typically, inside the `builds.yaml` one would specify input files such as +You can define a single input dataset in `builds.yaml` as follows. ```yaml -# traditional syntax for specifying starting files -sequences: "data/sequences.fasta" -metadata: "data/metadata.tsv" +inputs: + - name: my-data + metadata: "data/metadata.tsv" + sequences: "data/sequences.fasta" ``` -For multiple inputs, we shall use the new `inputs` section of the config to specify that we have two different inputs, and we will give them the names "aus" and "worldwide": +For multiple inputs, you can add another entry to the `inputs` config list. +Here, we will give them the names "aus" and "worldwide": ```yaml # my_profiles/example_multiple_inputs/builds.yaml @@ -72,15 +74,11 @@ inputs: sequences: "data/example_sequences_worldwide.fasta" ``` -> Note that if you also specify `sequences` or `metadata` as top level entries in the config, they will be ignored. - ### Snakemake terminology Inside the Snakemake rules, we use a wildcard `origin` to define different starting points. -For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="_worldwide"` and we expect that the config has defined -a sequences input via `config["sequences"]["worldwide"]=` (note the leading `_` has been stripped from the `origin` in the config). -If we use the older syntax (specifying `sequences` or `metadata` as top level entries in the config) then `wildcards.origin=""`. - +For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="worldwide"` and we expect that the config has defined +a sequences input as shown above. ## How is metadata combined? diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml index eea133e11..4ff505fd9 100644 --- a/my_profiles/example/builds.yaml +++ b/my_profiles/example/builds.yaml @@ -11,6 +11,12 @@ # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme. +# Define input files. +inputs: + - name: example-data + metadata: data/example_metadata.tsv + sequences: data/example_sequences.fasta + builds: # Focus on King County (location) in Washington State (division) in the USA (country) # with a build name that will produce the following URL fragment on Nextstrain/auspice: diff --git a/my_profiles/example/config.yaml b/my_profiles/example/config.yaml index 560e90c71..42aa8503d 100644 --- a/my_profiles/example/config.yaml +++ b/my_profiles/example/config.yaml @@ -10,10 +10,6 @@ configfile: - defaults/parameters.yaml # Pull in the default values - my_profiles/example/builds.yaml # Pull in our list of desired builds -config: - - sequences=data/example_sequences.fasta - - metadata=data/example_metadata.tsv - # Set the maximum number of cores you want Snakemake to use for this pipeline. 
cores: 2 diff --git a/my_profiles/getting_started/builds.yaml b/my_profiles/getting_started/builds.yaml index f72a562c7..0dd5df7b0 100644 --- a/my_profiles/getting_started/builds.yaml +++ b/my_profiles/getting_started/builds.yaml @@ -8,6 +8,12 @@ # These subsample primarily from the area of interest ("focus"), and add in background ("contextual") sequences from the rest of the world. # Contextual sequences that are genetically similar to (hamming distance) and geographically near the focal sequences are heavily prioritized. +# Define input files. +inputs: + - name: example-data + metadata: data/example_metadata.tsv + sequences: data/example_sequences.fasta.gz + # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme. builds: # This build samples evenly from the globe diff --git a/my_profiles/getting_started/config.yaml b/my_profiles/getting_started/config.yaml index 8b5fa831d..c0e97f068 100644 --- a/my_profiles/getting_started/config.yaml +++ b/my_profiles/getting_started/config.yaml @@ -10,10 +10,6 @@ configfile: - defaults/parameters.yaml # Pull in the default values - my_profiles/getting_started/builds.yaml # Pull in our list of desired builds -config: - - sequences=data/example_sequences.fasta - - metadata=data/example_metadata.tsv - # Set the maximum number of cores you want Snakemake to use for this pipeline. cores: 1 diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 76c4d37b2..aa31fd28e 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -25,12 +25,6 @@ def numeric_date(dt=None): return res -def _trim_origin(origin): - """the origin wildcard includes a leading `_`. This function returns the value without this `_`""" - if origin=="": - return "" - return origin[1:] - def _get_subsampling_scheme_by_build_name(build_name): return config["builds"][build_name].get("subsampling_scheme", build_name) @@ -38,7 +32,7 @@ def _get_filter_value(wildcards, key): default = config["filter"].get(key, "") if wildcards["origin"] == "": return default - return config["filter"].get(_trim_origin(wildcards["origin"]), {}).get(key, default) + return config["filter"].get(wildcards["origin"], {}).get(key, default) def _get_path_for_input(stage, origin_wildcard): """ @@ -46,45 +40,36 @@ def _get_path_for_input(stage, origin_wildcard): This function always returns a local filepath, the format of which decides whether rules should create this by downloading from a remote resource, or create it by a local compute rule. """ - if not origin_wildcard: - # No origin wildcards => deprecated single inputs (e.g. 
`config["sequences"]`) which cannot - # be downloaded from remote resources - if config.get("inputs"): - raise Exception("ERROR: empty origin wildcard but config defines 'inputs`") - path_or_url = config[stage] if stage in ["metadata", "sequences"] else "" - remote = False - else: - trimmed_origin = _trim_origin(origin_wildcard) - path_or_url = config.get("inputs", {}).get(trimmed_origin, {}).get(stage, "") - scheme = urlsplit(path_or_url).scheme - remote = bool(scheme) + path_or_url = config.get("inputs", {}).get(origin_wildcard, {}).get(stage, "") + scheme = urlsplit(path_or_url).scheme + remote = bool(scheme) - # Following checking should be the remit of the rule which downloads the remote resource - if scheme and scheme!="s3": - raise Exception(f"Input defined scheme {scheme} which is not yet supported.") + # Following checking should be the remit of the rule which downloads the remote resource + if scheme and scheme!="s3": + raise Exception(f"Input defined scheme {scheme} which is not yet supported.") - ## Basic checking which could be taken care of by the config schema - ## If asking for metadata/sequences, the config _must_ supply a `path_or_url` - if path_or_url=="" and stage in ["metadata", "sequences"]: - raise Exception(f"ERROR: config->input->{trimmed_origin}->{stage} is not defined.") + ## Basic checking which could be taken care of by the config schema + ## If asking for metadata/sequences, the config _must_ supply a `path_or_url` + if path_or_url=="" and stage in ["metadata", "sequences"]: + raise Exception(f"ERROR: config->input->{origin_wildcard}->{stage} is not defined.") if stage=="metadata": - return f"data/downloaded{origin_wildcard}.tsv" if remote else path_or_url + return f"data/downloaded_{origin_wildcard}.tsv" if remote else path_or_url if stage=="sequences": - return f"data/downloaded{origin_wildcard}.fasta" if remote else path_or_url + return f"data/downloaded_{origin_wildcard}.fasta" if remote else path_or_url if stage=="aligned": - return f"results/precomputed-aligned{origin_wildcard}.fasta" if remote else f"results/aligned{origin_wildcard}.fasta" + return f"results/precomputed-aligned_{origin_wildcard}.fasta" if remote else f"results/aligned_{origin_wildcard}.fasta" if stage=="to-exclude": - return f"results/precomputed-to-exclude{origin_wildcard}.txt" if remote else f"results/to-exclude{origin_wildcard}.txt" + return f"results/precomputed-to-exclude_{origin_wildcard}.txt" if remote else f"results/to-exclude_{origin_wildcard}.txt" if stage=="masked": - return f"results/precomputed-masked{origin_wildcard}.fasta" if remote else f"results/masked{origin_wildcard}.fasta" + return f"results/precomputed-masked_{origin_wildcard}.fasta" if remote else f"results/masked_{origin_wildcard}.fasta" if stage=="filtered": if remote: - return f"results/precomputed-filtered{origin_wildcard}.fasta" + return f"results/precomputed-filtered_{origin_wildcard}.fasta" elif path_or_url: return path_or_url else: - return f"results/filtered{origin_wildcard}.fasta" + return f"results/filtered_{origin_wildcard}.fasta" raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"") @@ -92,22 +77,17 @@ def _get_path_for_input(stage, origin_wildcard): def _get_unified_metadata(wildcards): """ Returns a single metadata file representing the input metadata file(s). - If there was only one supplied metadata file (e.g. 
the deprecated - `config["metadata"]` syntax, or one entry in the `config["inputs"] dict`) + If there was only one supplied metadata file in the `config["inputs"] dict`, then that file is returned. Else "results/combined_metadata.tsv" is returned which will run the `combine_input_metadata` rule to make it. """ - if not config.get("inputs"): - return config["metadata"] if len(list(config["inputs"].keys()))==1: - return "results/sanitized_metadata{origin}.tsv".format(origin="_"+list(config["inputs"].keys())[0]) + return "results/sanitized_metadata_{origin}.tsv".format(origin=list(config["inputs"].keys())[0]) return "results/combined_metadata.tsv" def _get_unified_alignment(wildcards): - if not config.get("inputs"): - return "results/filtered.fasta" if len(list(config["inputs"].keys()))==1: - return _get_path_for_input("filtered", "_"+list(config["inputs"].keys())[0]) + return _get_path_for_input("filtered", list(config["inputs"].keys())[0]) return "results/combined_sequences_for_subsampling.fasta", def _get_metadata_by_build_name(build_name): diff --git a/workflow/snakemake_rules/download.smk b/workflow/snakemake_rules/download.smk index f7a0b09db..9dd9fc798 100644 --- a/workflow/snakemake_rules/download.smk +++ b/workflow/snakemake_rules/download.smk @@ -8,8 +8,8 @@ # rule: align # input: _get_path_for_input # ... -# will result in an input file looking like "results/aligned{origin}.fasta" or -# "results/download-aligned{origin}.fasta" (which one is chosen depends on the +# will result in an input file looking like "results/aligned_{origin}.fasta" or +# "results/download-aligned_{origin}.fasta" (which one is chosen depends on the # supplied `config`). In the latter case, `rule download_aligned` will be used. # See https://github.com/nextstrain/ncov/compare/remote-files for an example of # how we could leverage snakemake to do this without needing a separate rule! 
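# A hedged sketch of an `inputs` entry that would route through these download
# rules (the input name, bucket, and keys are hypothetical examples):
#
#   inputs:
#     - name: remote-data
#       metadata: s3://my-bucket/metadata.tsv.gz
#       sequences: s3://my-bucket/sequences.fasta.gz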
@@ -33,11 +33,11 @@ def _infer_decompression(input): rule download_sequences: message: "Downloading sequences from {params.address} -> {output.sequences}" output: - sequences = "data/downloaded{origin}.fasta" + sequences = "data/downloaded_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["sequences"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["sequences"]) + address = lambda w: config["inputs"][w.origin]["sequences"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["sequences"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} @@ -46,11 +46,11 @@ rule download_sequences: rule download_metadata: message: "Downloading metadata from {params.address} -> {output.metadata}" output: - metadata = "data/downloaded{origin}.tsv" + metadata = "data/downloaded_{origin}.tsv" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["metadata"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["metadata"]), + address = lambda w: config["inputs"][w.origin]["metadata"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["metadata"]), shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.metadata:q} @@ -59,11 +59,11 @@ rule download_metadata: rule download_aligned: message: "Downloading aligned fasta files from {params.address} -> {output.sequences}" output: - sequences = "results/precomputed-aligned{origin}.fasta" + sequences = "results/precomputed-aligned_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["aligned"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["aligned"]) + address = lambda w: config["inputs"][w.origin]["aligned"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["aligned"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} @@ -77,17 +77,17 @@ rule download_diagnostic: {params.to_exclude_address} -> {output.to_exclude} """ output: - diagnostics = "results/precomputed-sequence-diagnostics{origin}.tsv", - flagged = "results/precomputed-flagged-sequences{origin}.tsv", - to_exclude = "results/precomputed-to-exclude{origin}.txt" + diagnostics = "results/precomputed-sequence-diagnostics_{origin}.tsv", + flagged = "results/precomputed-flagged-sequences_{origin}.tsv", + to_exclude = "results/precomputed-to-exclude_{origin}.txt" conda: config["conda_environment"] params: # Only `to-exclude` is defined via the config, so we make some assumptions about the format of the other filenames - to_exclude_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"], - flagged_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'flagged-sequences{w.origin}.tsv'), - diagnostics_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'sequence-diagnostics{w.origin}.tsv'), + to_exclude_address = lambda w: config["inputs"][w.origin]["to-exclude"], + flagged_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'flagged-sequences{w.origin}.tsv'), + diagnostics_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'sequence-diagnostics{w.origin}.tsv'), 
# assume the compression is the same across all 3 addresses - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["to-exclude"]) + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["to-exclude"]) shell: """ aws s3 cp {params.to_exclude_address} - | {params.deflate} > {output.to_exclude:q} @@ -99,11 +99,11 @@ rule download_diagnostic: rule download_masked: message: "Downloading aligned & masked FASTA from {params.address} -> {output.sequences}" output: - sequences = "results/precomputed-masked{origin}.fasta" + sequences = "results/precomputed-masked_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["masked"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["masked"]) + address = lambda w: config["inputs"][w.origin]["masked"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["masked"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} @@ -113,12 +113,12 @@ rule download_masked: rule download_filtered: message: "Downloading pre-computed filtered alignment from {params.address} -> {output.sequences}" output: - sequences = "results/precomputed-filtered{origin}.fasta" + sequences = "results/precomputed-filtered_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["filtered"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["filtered"]) + address = lambda w: config["inputs"][w.origin]["filtered"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["filtered"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} - """ \ No newline at end of file + """ diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index 68a3ed42f..b1dd901dd 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -94,14 +94,14 @@ rule mutation_summary: reference = config["files"]["alignment_reference"], genemap = config["files"]["annotation"] output: - mutation_summary = "results/mutation_summary{origin}.tsv" + mutation_summary = "results/mutation_summary_{origin}.tsv" log: - "logs/mutation_summary{origin}.txt" + "logs/mutation_summary_{origin}.txt" benchmark: - "benchmarks/mutation_summary{origin}.txt" + "benchmarks/mutation_summary_{origin}.txt" params: outdir = "results/translations", - basename = "seqs{origin}" + basename = "seqs_{origin}" conda: config["conda_environment"] shell: """ diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index de65fc0f6..af221f3f2 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -2,13 +2,13 @@ rule sanitize_metadata: input: metadata=lambda wildcards: _get_path_for_input("metadata", wildcards.origin) output: - metadata="results/sanitized_metadata{origin}.tsv" + metadata="results/sanitized_metadata_{origin}.tsv" benchmark: - "benchmarks/sanitize_metadata{origin}.txt" + "benchmarks/sanitize_metadata_{origin}.txt" conda: config["conda_environment"] log: - "logs/sanitize_metadata{origin}.txt" + "logs/sanitize_metadata_{origin}.txt" params: strain_prefixes=config["strip_strain_prefixes"], shell: @@ -27,7 +27,7 @@ rule combine_input_metadata: Combining metadata files {input.metadata} -> {output.metadata} 
and adding columns to represent origin """ input: - metadata=expand("results/sanitized_metadata{origin}.tsv", origin=[f"_{origin}" if origin else "" for origin in config.get("inputs", "")]), + metadata=expand("results/sanitized_metadata_{origin}.tsv", origin=config.get("inputs")), output: metadata = "results/combined_metadata.tsv" params: @@ -53,18 +53,18 @@ rule align: genemap = config["files"]["annotation"], reference = config["files"]["alignment_reference"] output: - alignment = "results/aligned{origin}.fasta", - insertions = "results/insertions{origin}.tsv", - translations = expand("results/translations/seqs{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) + alignment = "results/aligned_{origin}.fasta", + insertions = "results/insertions_{origin}.tsv", + translations = expand("results/translations/seqs_{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) params: outdir = "results/translations", genes = ','.join(config.get('genes', ['S'])), - basename = "seqs{origin}", + basename = "seqs_{origin}", strain_prefixes=config["strip_strain_prefixes"], log: - "logs/align{origin}.txt" + "logs/align_{origin}.txt" benchmark: - "benchmarks/align{origin}.txt" + "benchmarks/align_{origin}.txt" conda: config["conda_environment"] threads: 8 resources: @@ -91,19 +91,19 @@ rule diagnostic: message: "Scanning aligned sequences {input.alignment} for problematic sequences" input: alignment = lambda wildcards: _get_path_for_input("aligned", wildcards.origin), - metadata = "results/sanitized_metadata{origin}.tsv", + metadata = "results/sanitized_metadata_{origin}.tsv", reference = config["files"]["reference"] output: - diagnostics = "results/sequence-diagnostics{origin}.tsv", - flagged = "results/flagged-sequences{origin}.tsv", - to_exclude = "results/to-exclude{origin}.txt" + diagnostics = "results/sequence-diagnostics_{origin}.tsv", + flagged = "results/flagged-sequences_{origin}.tsv", + to_exclude = "results/to-exclude_{origin}.txt" log: - "logs/diagnostics{origin}.txt" + "logs/diagnostics_{origin}.txt" params: mask_from_beginning = config["mask"]["mask_from_beginning"], mask_from_end = config["mask"]["mask_from_end"] benchmark: - "benchmarks/diagnostics{origin}.txt" + "benchmarks/diagnostics_{origin}.txt" resources: # Memory use scales primarily with the size of the metadata file. mem_mb=lambda wildcards, input: 15 * int(input.metadata.size / 1024 / 1024) @@ -126,7 +126,7 @@ def _collect_exclusion_files(wildcards): # As well as the sequences flagged by the diagnostic step. # Note that we can skip the diagnostic step on a per-input (per-origin) basis. 
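    # A hedged sketch of the builds.yaml stanza this checks for (the input
    # name "my-data" is a hypothetical example):
    #
    #   filter:
    #     my-data:
    #       skip_diagnostics: true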
exclude_files = [ config["files"]["exclude"] ] - if not config["filter"].get(_trim_origin(wildcards["origin"]), {}).get("skip_diagnostics", False): + if not config["filter"].get(wildcards["origin"], {}).get("skip_diagnostics", False): exclude_files.append(_get_path_for_input("to-exclude", wildcards.origin)) return exclude_files @@ -141,11 +141,11 @@ rule mask: input: alignment = lambda w: _get_path_for_input("aligned", w.origin) output: - alignment = "results/masked{origin}.fasta" + alignment = "results/masked_{origin}.fasta" log: - "logs/mask{origin}.txt" + "logs/mask_{origin}.txt" benchmark: - "benchmarks/mask{origin}.txt" + "benchmarks/mask_{origin}.txt" params: mask_from_beginning = config["mask"]["mask_from_beginning"], mask_from_end = config["mask"]["mask_from_end"], @@ -172,16 +172,16 @@ rule filter: """ input: sequences = lambda wildcards: _get_path_for_input("masked", wildcards.origin), - metadata = "results/sanitized_metadata{origin}.tsv", + metadata = "results/sanitized_metadata_{origin}.tsv", # TODO - currently the include / exclude files are not input (origin) specific, but this is possible if we want include = config["files"]["include"], exclude = _collect_exclusion_files, output: - sequences = "results/filtered{origin}.fasta" + sequences = "results/filtered_{origin}.fasta" log: - "logs/filtered{origin}.txt" + "logs/filtered_{origin}.txt" benchmark: - "benchmarks/filter{origin}.txt" + "benchmarks/filter_{origin}.txt" params: min_length = lambda wildcards: _get_filter_value(wildcards, "min_length"), exclude_where = lambda wildcards: _get_filter_value(wildcards, "exclude_where"), @@ -315,7 +315,7 @@ rule combine_sequences_for_subsampling: Combine and deduplicate aligned & filtered FASTAs from multiple origins in preparation for subsampling. """ input: - lambda w: [_get_path_for_input("filtered", f"_{origin}") for origin in config.get("inputs", {})] + lambda w: [_get_path_for_input("filtered", origin) for origin in config.get("inputs", {})] output: "results/combined_sequences_for_subsampling.fasta" benchmark: From f01d1e28f17e3b622ab5beb0980005bab545eaac Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 7 May 2021 09:56:46 -0700 Subject: [PATCH 6/7] Add missing Conda environment to distances rule Without a Conda environment, the distances rule tries to use whatever version of Augur is installed globally for the user which either leads to an error (when Augur is not installed) or a potential conflict in node data JSON versions during augur export v2 (when an older version of Augur than 12.0.0 is installed globally). 
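
The fix applies the same pattern used by the other rules in this file: any
rule that shells out to augur should declare the workflow's Conda
environment. A minimal sketch of the pattern, with a hypothetical rule name
and output path (`snakemake --use-conda` activates the declared environment):

    rule check_augur_version:
        output:
            "results/augur_version.txt"
        conda:
            config["conda_environment"]  # workflow/envs/nextstrain.yaml by default
        shell:
            "augur --version > {output}"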
--- workflow/snakemake_rules/main_workflow.smk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index af221f3f2..f9cacf084 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -841,6 +841,8 @@ rule distances: attribute_names = ['S1_mutations'] output: node_data = "results/{build_name}/distances.json" + conda: + config["conda_environment"] shell: """ augur distance \ From 98d51a63cd47d9bb0818fbb4e855075ee5b6e32a Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 7 May 2021 10:14:54 -0700 Subject: [PATCH 7/7] Update change log to reflect new release --- docs/change_log.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/change_log.md b/docs/change_log.md index a07f5083a..775db6ddc 100644 --- a/docs/change_log.md +++ b/docs/change_log.md @@ -3,6 +3,21 @@ As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation. We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added. +## v5 (7 May 2021) + +[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/615) for more details about this release. + +### Major changes + +- Drop support for old sequence/metadata inputs +- Use nextalign for alignment instead of mafft + +### Minor changes + +- Drop unused haplotype status rule and script +- Remove unused nucleotide mutation frequencies rule +- Use augur distance for mutation counts + ## v4 (5 May 2021) [See the corresponding pull request](https://github.com/nextstrain/ncov/pull/605) for more details about changes in this release.
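
For configurations affected by the v5 changes above, the old top-level
`sequences` and `metadata` keys move into an `inputs` list. A minimal sketch
mirroring the example profiles updated in this series (the input name is
arbitrary):

    inputs:
      - name: example-data
        metadata: data/example_metadata.tsv
        sequences: data/example_sequences.fasta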