Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use weighted sampling for Asia builds #1106

Merged
merged 3 commits into from
Aug 22, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use population-based weighted sampling for Asia builds
This replaces the Asia/China/India split with population-based weighted
sampling (available as of Augur version 25.3.0).

This requires changing the geographical grouping resolution from
division to country, but I assume it was only grouped by division in an
attempt to have varying group sizes per country, and that
population-based weighting is an acceptable replacement.
victorlin committed Aug 22, 2024
commit bc3f69eec161aa5bddde48661b951a5ad4501436
144 changes: 32 additions & 112 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
@@ -273,30 +273,18 @@ subsampling:

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_1m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
@@ -305,22 +293,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
@@ -330,30 +307,18 @@ subsampling:

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_2m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
@@ -362,22 +327,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
@@ -387,30 +341,18 @@ subsampling:

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_6m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
@@ -419,22 +361,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division year month"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country year month"
@@ -443,27 +374,16 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over all-time
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_all_time:
# Focal samples for Asia
asia:
group_by: "division year month"
max_sequences: 1500
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Focal samples for China
china:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=China'"
# Focal samples for India
india:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=India'"
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 3500
exclude: "--exclude-where 'region!=Asia'"
# Contextual samples from the rest of the world
context:
group_by: "country year month"
Loading