From 99c8fbf9600f39d9588bbfe18999627fbc5c1a36 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 1 Nov 2021 22:43:00 +0100 Subject: [PATCH 01/19] feat: join pango designations onto metadata --- defaults/parameters.yaml | 24 ++++++-- profiles/basel-combined/builds.yaml | 7 --- profiles/clades/auspice_config.json | 67 ----------------------- profiles/clades/builds.yaml | 48 ---------------- profiles/clades/config.yaml | 23 -------- profiles/pango/builds.yaml | 10 ++++ profiles/pango/config.yaml | 15 +++++ profiles/{clades => pango}/description.md | 3 - workflow/snakemake_rules/preprocess.smk | 23 ++++++++ 9 files changed, 68 insertions(+), 152 deletions(-) delete mode 100644 profiles/clades/auspice_config.json delete mode 100644 profiles/clades/builds.yaml delete mode 100644 profiles/clades/config.yaml create mode 100644 profiles/pango/builds.yaml create mode 100644 profiles/pango/config.yaml rename profiles/{clades => pango}/description.md (69%) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 9dd1470..5c39436 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -11,12 +11,28 @@ files: annotation: "defaults/annotation.gff" include: "defaults/include.txt" color_schemes: "defaults/color_schemes.tsv" - clades: "defaults/clades.tsv" - ordering: "defaults/color_ordering.tsv" - lat_longs: "defaults/lat_longs.tsv" auspice_config: "defaults/auspice_config.json" description: "defaults/description.md" - mut_fit: "defaults/mutational_fitness_distance_map.json" + clades: "builds/clades.tsv" + ordering: "builds/color_ordering.tsv" + lat_longs: "builds/lat_longs.tsv" + mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" + +data_source: + clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" + lat_longs: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/lat_longs.tsv" + color_ordering: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/color_ordering.tsv" + mut_fit: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/mutational_fitness_distance_map.json" + pango_designations: "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv" + +origins: + gisaid: + metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" + sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" + exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" + filters: "--min-length 27000 --min-date 2019-12-01" tree: tree-builder-args: "'-ninit 10 -n 4 -czb'" diff --git a/profiles/basel-combined/builds.yaml b/profiles/basel-combined/builds.yaml index bdc7b6b..863820e 100644 --- a/profiles/basel-combined/builds.yaml +++ b/profiles/basel-combined/builds.yaml @@ -15,13 +15,6 @@ data_source: color_ordering: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/color_ordering.tsv" mut_fit: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/mutational_fitness_distance_map.json" -origins: - gisaid: - metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" - exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" - filters: "--min-length 27000 --min-date 2019-12-01" - build_dir: "builds-combined" auspice_dir: "auspice-combined" diff --git a/profiles/clades/auspice_config.json b/profiles/clades/auspice_config.json deleted file mode 100644 index fa6a530..0000000 --- a/profiles/clades/auspice_config.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "title": "Your samples placed on a SARS-CoV-2 phylogeny", - "build_url": "https://github.com/nextstrain/nextclade", - "maintainers": [ - {"name": "Ivan Aksamentov", "url": "https://neherlab.org"}, - {"name": "Richard Neher", "url": "https://neherlab.org"} - ], - "colorings": [ - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "Pango Lineage", - "type": "categorical" - }, - { - "key": "GISAID_clade", - "title": "GISAID Clade", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": [ - "region" - ], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "div", - "geo_resolution": "region", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "region", - "country", - "division", - "new_node", - "pango_lineage", - "clade_membership" - ], - "panels": [ - "tree" - ] - } diff --git a/profiles/clades/builds.yaml b/profiles/clades/builds.yaml deleted file mode 100644 index 8952a0c..0000000 --- a/profiles/clades/builds.yaml +++ /dev/null @@ -1,48 +0,0 @@ -title: 'Your samples placed on a SARS-CoV-2 phylogeny' - -files: - reference: "defaults/reference_seq.gb" - alignment_reference: "defaults/reference_seq.fasta" - annotation: "defaults/annotation.gff" - include: "defaults/include.txt" - color_schemes: "defaults/color_schemes.tsv" - clades: "defaults/clades.tsv" - ordering: "defaults/color_ordering.tsv" - lat_longs: "defaults/lat_longs.tsv" - auspice_config: "profiles/clades/auspice_config.json" - description: "profiles/clades/description.md" - -data_source: - clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" - -origins: - gisaid: - metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.gz" - exclude: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/exclude.txt" - filters: "--min-length 27000" - -refine: - root: 'Wuhan/Hu-1/2019' - divergence_unit: 'mutations' - clock_filter_iqd: 4 - no_timetree: True - clock_rate: 0.0007 - clock_std_dev: 0.0003 - coalescent: "skyline" - date_inference: "marginal" - clock_filter_iqd: 4 - -builds: - nextclade: - subsamples: - early: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 1000 --max-date 2021-03-01 --exclude-where QC_rare_mutations!=good QC_snp_clusters!=good" - late: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 3000 --min-date 2021-03-01 --exclude-where 'rare_mutations>15' QC_snp_clusters!=good" - nextclade-2k: - subsamples: - early: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 500 --max-date 2021-03-01 --exclude-where QC_rare_mutations!=good QC_snp_clusters!=good" - late: - filters: "--min-length 28500 --group-by year month division pango_lineage --subsample-max-sequences 1500 --min-date 2021-03-01 --exclude-where 'rare_mutations>15' QC_snp_clusters!=good" diff --git a/profiles/clades/config.yaml b/profiles/clades/config.yaml deleted file mode 100644 index 04e4077..0000000 --- a/profiles/clades/config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -configfile: - - defaults/parameters.yaml # Pull in the default values - - profiles/clades/builds.yaml # Specific builds for this profile - -# Always print the commands that will be run to the screen for debugging. -printshellcmds: True -keep-going: True -reason: True -stats: stats.json - -# Print log files of failed jobs -show-failed-logs: True - -cluster-config: profiles/cluster/cluster.json - -cluster: "sbatch --time={cluster.time} --mem={cluster.mem} --cpus-per-task={cluster.n} --qos={cluster.qos}" - -jobs: 128 - -jobscript: profiles/cluster/submit.sh - -# Set the name for the job as display in the cluster queue. -jobname: "{rulename}.{jobid}.sh" diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml new file mode 100644 index 0000000..6247b7e --- /dev/null +++ b/profiles/pango/builds.yaml @@ -0,0 +1,10 @@ +build_dir: "builds-pango" +auspice_dir: "auspice-pango" + +builds: + pango: + subsamples: + early: + filters: "--group-by division year month --subsample-max-sequences 1000 --max-date 2021-02-01" + late: + filters: "--group-by division year month --subsample-max-sequences 3000 --min-date 2021-02-01" \ No newline at end of file diff --git a/profiles/pango/config.yaml b/profiles/pango/config.yaml new file mode 100644 index 0000000..2db00c8 --- /dev/null +++ b/profiles/pango/config.yaml @@ -0,0 +1,15 @@ +configfile: + - defaults/parameters.yaml # Pull in the default values + - profiles/pango/builds.yaml # Specific builds for this profile + - profiles/pango/secrets.yaml # Secrets not committed to git + +# Always print the commands that will be run to the screen for debugging. +printshellcmds: True +keep-going: True +reason: True +stats: stats.json + +# Print log files of failed jobs +show-failed-logs: True + +cores: 16 diff --git a/profiles/clades/description.md b/profiles/pango/description.md similarity index 69% rename from profiles/clades/description.md rename to profiles/pango/description.md index cde0eed..a541b5a 100644 --- a/profiles/clades/description.md +++ b/profiles/pango/description.md @@ -1,4 +1 @@ -This phylogeny shows evolutionary relationships of SARS-CoV-2 viruses from the ongoing novel coronavirus COVID-19 pandemic. - -All data we use were deposited in GISAID by scientists around the world. We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequence and metadata made available through [GISAID](https://gisaid.org) on which this research is based. A full listing of all originating and submitting laboratories is available below. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Strain Metadata" in the resulting dialog box. diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index 19dcc1f..f6be040 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -97,6 +97,29 @@ rule download_mutational_fitness_map: source = config["data_source"]["mut_fit"] shell: "curl {params.source} -o {output}" +rule download_pango_designations: + output: config["files"]["pango_designations"] + params: + source = config["data_source"]["pango_designations"] + shell: "curl {params.source} -o {output}" + +# TODO: Fix matching of strain names with whitespace +rule join_designations_and_metadata: + input: + designations = config["files"]["pango_designations"], + metadata = "pre-processed/metadata.tsv", + output: + metadata = config["files"]["metadata_designated"], + designations = "builds/pango_designations.tsv" + + shell: + """ + csv2tsv < {input.designations} > {output.designations} && \ + tsv-join -H --filter-file {output.designations} \ + -k taxon -d strain -a lineage {input.metadata} \ + > {output.metadata} + """ + rule prealign: message: """ From cfb73d55045f77179a3f5578d5cc423313894fa0 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 01:40:37 +0100 Subject: [PATCH 02/19] feat: join pango designation lineage onto metadata --- workflow/snakemake_rules/preprocess.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index f6be040..21d14bf 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -109,14 +109,14 @@ rule join_designations_and_metadata: designations = config["files"]["pango_designations"], metadata = "pre-processed/metadata.tsv", output: - metadata = config["files"]["metadata_designated"], + metadata = "pre-processed/metadata_designations.tsv", designations = "builds/pango_designations.tsv" - shell: """ csv2tsv < {input.designations} > {output.designations} && \ tsv-join -H --filter-file {output.designations} \ - -k taxon -d strain -a lineage {input.metadata} \ + --key-fields taxon --data-fields strain --append-fields lineage {input.metadata} \ + --write-all undesignated \ > {output.metadata} """ From ec4bab632abfa42eaf5bf52b33bd84a80862bedf Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 01:52:35 +0100 Subject: [PATCH 03/19] feat: make pango designation joins part of preprocess --- workflow/snakemake_rules/preprocess.smk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index 21d14bf..7879fdd 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -58,7 +58,7 @@ rule download_metadata: deflate = lambda w: _infer_decompression(config['origins'][w.origin]['metadata']), address = lambda w: config['origins'][w.origin]['metadata'] output: - metadata = "data/{origin}/metadata.tsv" + metadata = "data/{origin}/metadata_raw.tsv" shell: "aws s3 cp {params.address} - | {params.deflate} {input} > {output:q}" rule download_exclude: @@ -107,9 +107,9 @@ rule download_pango_designations: rule join_designations_and_metadata: input: designations = config["files"]["pango_designations"], - metadata = "pre-processed/metadata.tsv", + metadata = "pre-processed/metadata_raw.tsv", output: - metadata = "pre-processed/metadata_designations.tsv", + metadata = "pre-processed/metadata.tsv", designations = "builds/pango_designations.tsv" shell: """ @@ -165,7 +165,7 @@ rule prealign: rule diagnostic: message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}." input: - metadata = "data/{origin}/metadata.tsv" + metadata = "data/{origin}/metadata_raw.tsv" output: to_exclude = "pre-processed/{origin}/problematic_exclude.txt" params: @@ -200,7 +200,7 @@ rule filter: """ input: sequences = "pre-processed/{origin}/alignment.fasta.xz", - metadata = "data/{origin}/metadata.tsv", + metadata = "data/{origin}/metadata_raw.tsv", include = "defaults/include.txt", exclude = "data/{origin}/exclude.txt", problematic = "pre-processed/{origin}/problematic_exclude.txt" @@ -244,7 +244,7 @@ rule combine_bulk_metadata: input: [f"data/{origin}/metadata.tsv" for origin in config["origins"]] output: - rules.preprocess.input.metadata + "pre-processed/metadata_raw.tsv" run: if len(input)==1: shell(f"cp {input} {output}") From 956fbf396771952eb864f610bb189c143fcf2e0d Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 01:53:35 +0100 Subject: [PATCH 04/19] feat: create filter rules for pango build --- profiles/pango/builds.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml index 6247b7e..1e41628 100644 --- a/profiles/pango/builds.yaml +++ b/profiles/pango/builds.yaml @@ -5,6 +5,6 @@ builds: pango: subsamples: early: - filters: "--group-by division year month --subsample-max-sequences 1000 --max-date 2021-02-01" + filters: "--exclude-where lineage='undesignated' --group-by lineage year month --subsample-max-sequences 1000 --max-date 2021-02-01 " late: - filters: "--group-by division year month --subsample-max-sequences 3000 --min-date 2021-02-01" \ No newline at end of file + filters: "--exclude-where lineage='undesignated' --group-by lineage year month --subsample-max-sequences 3000 --min-date 2021-02-01" \ No newline at end of file From fa6da7d4fbe9e8a1b1b87b98305f1860c469c462 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 02:07:27 +0100 Subject: [PATCH 05/19] fix: add distance map configs to default/parameters.yaml --- defaults/parameters.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 5c39436..d4700f7 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -99,3 +99,13 @@ traits: sampling_bias_correction: 2.5 columns: ["country"] +distances: + comparisons: ['root', 'root', 'root', 'root', 'root', 'root'] + attributes: ['S1_mutations', 'DMS_convalescent_serum', 'DMS_Class_1', 'DMS_Class_2', 'DMS_Class_3', 'ACE2_binding_site_mutations'] + maps: + - "defaults/distance_maps/S1.json" + - "defaults/distance_maps/convalescent_serum_mean_dms.json" + - "defaults/distance_maps/class_1_mean_dms.json" + - "defaults/distance_maps/class_2_mean_dms.json" + - "defaults/distance_maps/class_3_mean_dms.json" + - "defaults/distance_maps/ace2.json" \ No newline at end of file From e8ba420c1ffe793ba53e9cf86b8f5d4cfbd39b22 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 02:10:06 +0100 Subject: [PATCH 06/19] fix: distance_map -> distance_maps typo --- workflow/snakemake_rules/core.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index f74a121..7ecbc65 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -381,7 +381,7 @@ rule mutational_fitness: --gene-names {params.genes} \ --compare-to {params.compare_to} \ --attribute-name {params.attribute_name} \ - --map {input.distance_map} \ + --map {input.distance_maps} \ --output {output} 2>&1 | tee {log} """ From 804c49caaf7c6c61f7cbb4e641b90f81cfacfeb8 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 02:13:18 +0100 Subject: [PATCH 07/19] feat: add pango-cluster to profiles --- profiles/pango-cluster/config.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 profiles/pango-cluster/config.yaml diff --git a/profiles/pango-cluster/config.yaml b/profiles/pango-cluster/config.yaml new file mode 100644 index 0000000..b3c8725 --- /dev/null +++ b/profiles/pango-cluster/config.yaml @@ -0,0 +1,27 @@ +configfile: + - defaults/parameters.yaml # Pull in the default values + - profiles/pango/builds.yaml # Specific builds for this profile + - profiles/basel-combined/secrets.yaml # Secrets not committed to git + +# Always print the commands that will be run to the screen for debugging. +printshellcmds: True +keep-going: True +reason: True +stats: stats.json + +# Print log files of failed jobs +show-failed-logs: True + +cluster-config: profiles/cluster/cluster.json + +cluster: "python3 profiles/cluster/submit.py" + +jobs: 512 + +# jobscript: profiles/cluster/submit.sh + +# Set the name for the job as display in the cluster queue. +jobname: "{rulename}.{jobid}.sh" + +# For local rules +cores: 4 From eab23740b6c0e02a94f09a961780d758bac0ced9 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 12:26:28 +0100 Subject: [PATCH 08/19] fix: use proper auspice_config for pango build --- profiles/pango/auspice_config.json | 135 +++++++++++++++++++++++++++++ profiles/pango/builds.yaml | 7 +- 2 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 profiles/pango/auspice_config.json diff --git a/profiles/pango/auspice_config.json b/profiles/pango/auspice_config.json new file mode 100644 index 0000000..c82c100 --- /dev/null +++ b/profiles/pango/auspice_config.json @@ -0,0 +1,135 @@ +{ + "title": "Diversity build using only pango designated sequences", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage by GISAID", + "type": "categorical" + }, + { + "key": "pango_default", + "title": "PANGO Lineage by Pangolin", + "type": "categorical" + }, + { + "key": "pango_usher", + "title": "PANGO Lineage by Usher", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "GISAID_clade", + "title": "GISAID Clade", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "S1_mutations", + "pango_lineage", + "pango_default", + "pango_usher", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml index 1e41628..412cff5 100644 --- a/profiles/pango/builds.yaml +++ b/profiles/pango/builds.yaml @@ -1,5 +1,8 @@ -build_dir: "builds-pango" -auspice_dir: "auspice-pango" +files: + auspice-config: profiles/pango/builds.yaml + +build_dir: builds-pango +auspice_dir: auspice-pango builds: pango: From 6a00fe371827a34c203abf601fbce76912649961 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 12:26:57 +0100 Subject: [PATCH 09/19] chore: dos2unix since our profiles were sadly dosed --- profiles/basel-countries/auspice_config.json | 270 +++++++++---------- profiles/basel-swiss/auspice_config.json | 248 ++++++++--------- profiles/early_pandemic/auspice_config.json | 198 +++++++------- profiles/genbank/auspice_config.json | 232 ++++++++-------- 4 files changed, 474 insertions(+), 474 deletions(-) diff --git a/profiles/basel-countries/auspice_config.json b/profiles/basel-countries/auspice_config.json index ceeb579..8959dc0 100644 --- a/profiles/basel-countries/auspice_config.json +++ b/profiles/basel-countries/auspice_config.json @@ -1,135 +1,135 @@ -{ - "title": "Genomic epidemiology of SARS-CoV-2 in Europe", - "build_url": "https://github.com/neherlab/ncov-simple", - "maintainers": [ - { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage by GISAID", - "type": "categorical" - }, - { - "key": "pango_default", - "title": "PANGO Lineage by Pangolin", - "type": "categorical" - }, - { - "key": "pango_usher", - "title": "PANGO Lineage by Usher", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "GISAID_clade", - "title": "GISAID Clade", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "country", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "recency", - "region", - "country", - "division", - "location", - "host", - "S1_mutations", - "pango_lineage", - "pango_default", - "pango_usher", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of SARS-CoV-2 in Europe", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage by GISAID", + "type": "categorical" + }, + { + "key": "pango_default", + "title": "PANGO Lineage by Pangolin", + "type": "categorical" + }, + { + "key": "pango_usher", + "title": "PANGO Lineage by Usher", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "GISAID_clade", + "title": "GISAID Clade", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "S1_mutations", + "pango_lineage", + "pango_default", + "pango_usher", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/basel-swiss/auspice_config.json b/profiles/basel-swiss/auspice_config.json index 6da6838..a65e9d2 100644 --- a/profiles/basel-swiss/auspice_config.json +++ b/profiles/basel-swiss/auspice_config.json @@ -1,124 +1,124 @@ -{ - "title": "Genomic epidemiology of novel coronavirus in Switzerland", - "build_url": "https://github.com/neherlab/ncov-simple", - "maintainers": [ - { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" }, - { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, - { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "location", - "title": "Location", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "division", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "country", - "region", - "recency", - "division", - "location", - "host", - "pango_lineage", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of novel coronavirus in Switzerland", + "build_url": "https://github.com/neherlab/ncov-simple", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" }, + { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, + { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "location", + "title": "Location", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "division", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "country", + "region", + "recency", + "division", + "location", + "host", + "pango_lineage", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/early_pandemic/auspice_config.json b/profiles/early_pandemic/auspice_config.json index 16ebe66..ce7294e 100644 --- a/profiles/early_pandemic/auspice_config.json +++ b/profiles/early_pandemic/auspice_config.json @@ -1,99 +1,99 @@ -{ - "title": "Genomic epidemiology of novel coronavirus in Switzerland", - "build_url": "https://github.com/nextstrain/ncov", - "maintainers": [ - { "name": "Emma Hodcroft", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" }, - { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, - { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } - ], - "data_provenance": [ - { - "name": "GISAID" - } - ], - "colorings": [ - { - "key": "location", - "title": "Location", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "country_exposure", - "distance_measure": "num_date", - "geo_resolution": "division", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "country", - "region", - "recency", - "division", - "location", - "host", - "pango_lineage", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of novel coronavirus in Switzerland", + "build_url": "https://github.com/nextstrain/ncov", + "maintainers": [ + { "name": "Emma Hodcroft", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" }, + { "name": "Sarah Nadeau", "url": "https://bsse.ethz.ch/cevo" }, + { "name": "Tanja Stadler", "url": "https://bsse.ethz.ch/cevo" } + ], + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "location", + "title": "Location", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "country_exposure", + "distance_measure": "num_date", + "geo_resolution": "division", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "country", + "region", + "recency", + "division", + "location", + "host", + "pango_lineage", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} diff --git a/profiles/genbank/auspice_config.json b/profiles/genbank/auspice_config.json index 8fd3d27..7d3e64e 100644 --- a/profiles/genbank/auspice_config.json +++ b/profiles/genbank/auspice_config.json @@ -1,116 +1,116 @@ -{ - "title": "Genomic epidemiology of SARS-CoV-2", - "build_url": "https://github.com/nextstrain/ncov-simple", - "maintainers": [ - { "name": "nextstrain team", "url": "https://nextstrain.org" } - ], - "data_provenance": [ - { - "name": "INSDC" - } - ], - "colorings": [ - { - "key": "country", - "title": "Country", - "type": "categorical" - }, - { - "key": "division", - "title": "Admin Division", - "type": "categorical" - }, - { - "key": "pango_lineage", - "title": "PANGO Lineage", - "type": "categorical" - }, - { - "key": "S1_mutations", - "title": "S1 mutations", - "type": "continuous" - }, - { - "key": "subclade_membership", - "title": "Emerging clade", - "type": "categorical" - }, - { - "key": "region", - "title": "Region", - "type": "categorical" - }, - { - "key": "host", - "title": "Host", - "type": "categorical" - }, - { - "key": "age", - "title": "Age", - "type": "continuous" - }, - { - "key": "sex", - "title": "Sex", - "type": "categorical" - }, - { - "key": "author", - "title": "Authors", - "type": "categorical" - }, - { - "key": "originating_lab", - "title": "Originating Lab", - "type": "categorical" - }, - { - "key": "submitting_lab", - "title": "Submitting Lab", - "type": "categorical" - }, - { - "key": "recency", - "title": "Submission Date", - "type": "categorical" - }, - { - "key": "country_exposure", - "title": "Country of exposure", - "type": "categorical" - }, - { - "key": "division_exposure", - "title": "Division of exposure", - "type": "categorical" - }, - { - "key": "region_exposure", - "title": "Region of exposure", - "type": "categorical" - } - ], - "geo_resolutions": ["location", "division", "country", "region"], - "display_defaults": { - "color_by": "clade_membership", - "distance_measure": "num_date", - "geo_resolution": "country", - "map_triplicate": true, - "branch_label": "clade", - "transmission_lines": false - }, - "filters": [ - "recency", - "region", - "country", - "division", - "location", - "host", - "pango_lineage", - "clade_membership", - "emerging_lineage", - "author" - ], - "panels": ["tree", "map", "entropy", "frequencies"] -} +{ + "title": "Genomic epidemiology of SARS-CoV-2", + "build_url": "https://github.com/nextstrain/ncov-simple", + "maintainers": [ + { "name": "nextstrain team", "url": "https://nextstrain.org" } + ], + "data_provenance": [ + { + "name": "INSDC" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "division", + "title": "Admin Division", + "type": "categorical" + }, + { + "key": "pango_lineage", + "title": "PANGO Lineage", + "type": "categorical" + }, + { + "key": "S1_mutations", + "title": "S1 mutations", + "type": "continuous" + }, + { + "key": "subclade_membership", + "title": "Emerging clade", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + }, + { + "key": "age", + "title": "Age", + "type": "continuous" + }, + { + "key": "sex", + "title": "Sex", + "type": "categorical" + }, + { + "key": "author", + "title": "Authors", + "type": "categorical" + }, + { + "key": "originating_lab", + "title": "Originating Lab", + "type": "categorical" + }, + { + "key": "submitting_lab", + "title": "Submitting Lab", + "type": "categorical" + }, + { + "key": "recency", + "title": "Submission Date", + "type": "categorical" + }, + { + "key": "country_exposure", + "title": "Country of exposure", + "type": "categorical" + }, + { + "key": "division_exposure", + "title": "Division of exposure", + "type": "categorical" + }, + { + "key": "region_exposure", + "title": "Region of exposure", + "type": "categorical" + } + ], + "geo_resolutions": ["location", "division", "country", "region"], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "num_date", + "geo_resolution": "country", + "map_triplicate": true, + "branch_label": "clade", + "transmission_lines": false + }, + "filters": [ + "recency", + "region", + "country", + "division", + "location", + "host", + "pango_lineage", + "clade_membership", + "emerging_lineage", + "author" + ], + "panels": ["tree", "map", "entropy", "frequencies"] +} From 0c1b3b8560386643d512dd68003105af744b600d Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 15:48:01 +0100 Subject: [PATCH 10/19] fix: config overwrites keys not appends --- profiles/pango/builds.yaml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml index 412cff5..39aadf1 100644 --- a/profiles/pango/builds.yaml +++ b/profiles/pango/builds.yaml @@ -1,5 +1,17 @@ files: - auspice-config: profiles/pango/builds.yaml + reference: "defaults/reference_seq.gb" + alignment_reference: "defaults/reference_seq.fasta" + annotation: "defaults/annotation.gff" + include: "defaults/include.txt" + color_schemes: "defaults/color_schemes.tsv" + description: "defaults/description.md" + clades: "builds/clades.tsv" + ordering: "builds/color_ordering.tsv" + lat_longs: "builds/lat_longs.tsv" + mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" + auspice_config: "profiles/pango/auspice_config.json" build_dir: builds-pango auspice_dir: auspice-pango From ef786baca856129da3a91d3a7018fad7aa332391 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 4 Nov 2021 16:02:23 +0100 Subject: [PATCH 11/19] feat: add designation coloring --- profiles/pango/auspice_config.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/profiles/pango/auspice_config.json b/profiles/pango/auspice_config.json index c82c100..dac91a3 100644 --- a/profiles/pango/auspice_config.json +++ b/profiles/pango/auspice_config.json @@ -21,6 +21,11 @@ "title": "Admin Division", "type": "categorical" }, + { + "key": "lineage", + "title": "PANGO Lineage by designation", + "type": "categorical" + }, { "key": "pango_lineage", "title": "PANGO Lineage by GISAID", From 618930befa8958e0c685657f575112c8d882767f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 8 Nov 2021 19:20:19 +0100 Subject: [PATCH 12/19] feat: sophisticated pango sampling --- .gitignore | 2 ++ profiles/pango/builds.yaml | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index f58380d..bbc9a19 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ stats.json log/* !log/placeholder_for_sbatch_output deployed/* +freezed + .vscode/* .DS_Store diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml index 39aadf1..4a0b105 100644 --- a/profiles/pango/builds.yaml +++ b/profiles/pango/builds.yaml @@ -19,7 +19,17 @@ auspice_dir: auspice-pango builds: pango: subsamples: - early: - filters: "--exclude-where lineage='undesignated' --group-by lineage year month --subsample-max-sequences 1000 --max-date 2021-02-01 " - late: - filters: "--exclude-where lineage='undesignated' --group-by lineage year month --subsample-max-sequences 3000 --min-date 2021-02-01" \ No newline at end of file + lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 2" + medium_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 1 --min-date 2021-01-01" + young_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 1 --min-date 2021-06-01" + very_young_lineages: + filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 1 --min-date 2021-09-01" + clades: + filters: "--exclude-where lineage='undesignated' --group-by Nextstrain_clade --sequences-per-group 10" + delta: + filters: "--exclude-where lineage!='B.1.617.2' --group-by month --subsample-max-sequences 500 --min-date 2021-02-01" + recent: + filters: "--exclude-where lineage='undesignated' --subsample-max-sequences 500 --min-date 2021-08-01" \ No newline at end of file From 0f70a73504c95b72069c86a6f61930dfa52e5bd1 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 8 Nov 2021 22:30:38 +0100 Subject: [PATCH 13/19] fix: add download_pango_designations to localrules --- workflow/snakemake_rules/preprocess.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index 03329b1..e6452f4 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -12,7 +12,7 @@ and produces ''' import os -localrules: download_sequences, download_metadata, download_exclude, download_clades, preprocess, download_lat_longs, download_color_ordering, download_mutational_fitness_map +localrules: download_sequences, download_metadata, download_exclude, download_clades, preprocess, download_lat_longs, download_color_ordering, download_mutational_fitness_map, download_pango_designations rule preprocess: input: From 3eee3927b3cdafe892d50f268838e40f5e28ea7f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 9 Nov 2021 15:04:40 +0100 Subject: [PATCH 14/19] fix: add new designation files to builds.yaml for profile/basel-combined --- profiles/basel-combined/builds.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/profiles/basel-combined/builds.yaml b/profiles/basel-combined/builds.yaml index 863820e..f756d02 100644 --- a/profiles/basel-combined/builds.yaml +++ b/profiles/basel-combined/builds.yaml @@ -8,6 +8,8 @@ files: ordering: "builds/color_ordering.tsv" lat_longs: "builds/lat_longs.tsv" mut_fit: "builds/mutational_fitness_distance_map.json" + pango_designations: "builds/pango_designations.csv" + metadata_designated: "builds/metadata_designated.tsv" data_source: clades: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/clades.tsv" From 42e4928d9ef8262353d2b60dd5f80b56dad67d3f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 9 Nov 2021 15:08:38 +0100 Subject: [PATCH 15/19] fix: add designation origin to basel-combined builds.yaml --- profiles/basel-combined/builds.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/profiles/basel-combined/builds.yaml b/profiles/basel-combined/builds.yaml index f756d02..399d454 100644 --- a/profiles/basel-combined/builds.yaml +++ b/profiles/basel-combined/builds.yaml @@ -16,6 +16,7 @@ data_source: lat_longs: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/lat_longs.tsv" color_ordering: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/color_ordering.tsv" mut_fit: "https://raw.githubusercontent.com/nextstrain/ncov/master/defaults/mutational_fitness_distance_map.json" + pango_designations: "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv" build_dir: "builds-combined" auspice_dir: "auspice-combined" From 42ec77ab1a4bb06f543a2c3d1870db7ac8309551 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 9 Nov 2021 15:15:14 +0100 Subject: [PATCH 16/19] fix: bug in metadata download rule output --- workflow/snakemake_rules/preprocess.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index e6452f4..fcbb325 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -58,7 +58,7 @@ rule download_metadata: deflate = lambda w: _infer_decompression(config['origins'][w.origin]['metadata']), address = lambda w: config['origins'][w.origin]['metadata'] output: - metadata = "data/{origin}/metadata_raw.tsv" + "data/{origin}/metadata_raw.tsv" shell: "aws s3 cp {params.address} - | {params.deflate} {input} > {output:q}" rule download_exclude: From e8190aef2bb90648d6703144f7525fa03ddc7f7f Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 9 Nov 2021 15:17:05 +0100 Subject: [PATCH 17/19] fix: bug in metadata download rule output number 2 --- workflow/snakemake_rules/preprocess.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index fcbb325..5ad1c54 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -58,7 +58,7 @@ rule download_metadata: deflate = lambda w: _infer_decompression(config['origins'][w.origin]['metadata']), address = lambda w: config['origins'][w.origin]['metadata'] output: - "data/{origin}/metadata_raw.tsv" + "data/{origin}/metadata.tsv" shell: "aws s3 cp {params.address} - | {params.deflate} {input} > {output:q}" rule download_exclude: From 1c49e22c4e988ad08eef6af66eaf1b45234a68b0 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 9 Nov 2021 15:19:09 +0100 Subject: [PATCH 18/19] fix: bug in metadata download rule output number 3 --- workflow/snakemake_rules/preprocess.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/snakemake_rules/preprocess.smk b/workflow/snakemake_rules/preprocess.smk index 5ad1c54..4c70dad 100644 --- a/workflow/snakemake_rules/preprocess.smk +++ b/workflow/snakemake_rules/preprocess.smk @@ -58,7 +58,7 @@ rule download_metadata: deflate = lambda w: _infer_decompression(config['origins'][w.origin]['metadata']), address = lambda w: config['origins'][w.origin]['metadata'] output: - "data/{origin}/metadata.tsv" + "data/{origin}/metadata_raw.tsv" shell: "aws s3 cp {params.address} - | {params.deflate} {input} > {output:q}" rule download_exclude: @@ -243,7 +243,7 @@ rule combine_bulk_sequences: rule combine_bulk_metadata: input: - [f"data/{origin}/metadata.tsv" for origin in config["origins"]] + [f"data/{origin}/metadata_raw.tsv" for origin in config["origins"]] output: "pre-processed/metadata_raw.tsv" run: From 620e79c0eb662797b358fa109eb0509dbb379f61 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Wed, 10 Nov 2021 14:17:06 +0100 Subject: [PATCH 19/19] fix: add deploy url for pango builds --- profiles/pango/builds.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/profiles/pango/builds.yaml b/profiles/pango/builds.yaml index 4a0b105..7791537 100644 --- a/profiles/pango/builds.yaml +++ b/profiles/pango/builds.yaml @@ -18,6 +18,8 @@ auspice_dir: auspice-pango builds: pango: + deploy_urls: + - "s3://nextstrain-neherlab" subsamples: lineages: filters: "--exclude-where lineage='undesignated' --group-by lineage --sequences-per-group 2" @@ -32,4 +34,4 @@ builds: delta: filters: "--exclude-where lineage!='B.1.617.2' --group-by month --subsample-max-sequences 500 --min-date 2021-02-01" recent: - filters: "--exclude-where lineage='undesignated' --subsample-max-sequences 500 --min-date 2021-08-01" \ No newline at end of file + filters: "--exclude-where lineage='undesignated' --subsample-max-sequences 500 --min-date 2021-08-01"