-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use groovy to parse samplesheets (#243)
* implement groovy functions for samplesheet parsing * set up utility class * add build to map * add json support * rename json parser * delete samplesheet module * remove mount docs (deprecated) * remove out.json from expected test output * fix parsing vcf samplesheet
- Loading branch information
Showing
12 changed files
with
244 additions
and
168 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,11 @@ | ||
[{"sampleset":"cineca_synthetic_subset","vcf_path":null,"chrom":22,"bed":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim","bim":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed","fam":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam"}] | ||
[ | ||
{ | ||
"sampleset": "hgdp", | ||
"chrom": null, | ||
"vcf_import_dosage": false, | ||
"geno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed", | ||
"pheno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam", | ||
"variants": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim", | ||
"format": "bfile" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
import java.nio.file.NoSuchFileException | ||
import java.nio.file.Path | ||
import nextflow.Nextflow | ||
|
||
class SamplesheetParser { | ||
Path path | ||
Integer n_chrom | ||
String target_build | ||
|
||
SamplesheetParser(path, n_chro, target_build) { | ||
this.path = path | ||
this.n_chrom = n_chrom | ||
this.target_build = target_build | ||
} | ||
|
||
def parseCSVRow(row) { | ||
def parsed_row = [:] | ||
parsed_row.id = row.sampleset | ||
parsed_row.n_chrom = this.n_chrom | ||
parsed_row.chrom = truncateChrom(row) | ||
parsed_row.format = row.format | ||
parsed_row.build = this.target_build | ||
parsed_row.vcf_import_dosage = importDosage(row) | ||
parsed_row = parsed_row + getFlagMap(row) | ||
|
||
return [parsed_row, getFilePaths(row)] | ||
} | ||
|
||
def parseJSON(json) { | ||
// note: we don't check for file existence here | ||
// relative paths won't work, because the JSON object doesn't use path_prefix | ||
def parsed = json.subMap("chrom", "vcf_import_dosage", "n_chrom", "format") | ||
parsed.id = json.sampleset | ||
parsed = parsed + getFlagMap(json) | ||
parsed.build = this.target_build | ||
parsed.chrom = truncateChrom(json) | ||
|
||
return [parsed, [json.geno, json.variants, json.pheno]] | ||
} | ||
|
||
def verifySamplesheet(rows) { | ||
checkChroms(rows) | ||
checkOneSampleset(rows) | ||
checkDuplicateChromosomes(rows) | ||
checkReservedName(rows) | ||
return rows | ||
} | ||
|
||
private def getFlagMap(row) { | ||
// make a map with some helpful bool flags. useful for both JSON and CSV | ||
def flags = [:] | ||
flags.is_vcf = false | ||
flags.is_bfile = false | ||
flags.is_pfile = false | ||
|
||
switch (row.format) { | ||
case "pfile": | ||
flags.is_pfile = true | ||
break | ||
case "bfile": | ||
flags.is_bfile = true | ||
break | ||
case "vcf": | ||
flags.is_vcf = true | ||
break | ||
default: | ||
Nextflow.error("Invalid format: ${row.format}") | ||
} | ||
return flags | ||
} | ||
|
||
private static def truncateChrom(row) { | ||
// when plink recodes chromosomes, it drops chr prefix. make sure the samplesheet matches this | ||
return row.chrom ? row.chrom.toString().replaceFirst("chr", "") : "ALL" | ||
} | ||
|
||
private def getFilePaths(row) { | ||
// return a list in order of geno, variants, pheno | ||
def resolved_path = resolvePath(row.path_prefix) | ||
def suffix = [:] | ||
|
||
switch (row.format) { | ||
case "pfile": | ||
suffix = [variants: ".pvar", geno: ".pgen", pheno: ".psam"] | ||
break | ||
case "bfile": | ||
suffix = [variants: ".bim", geno: ".bed", pheno: ".fam"] | ||
break | ||
case "vcf": | ||
// gzip compression gets picked up later | ||
suffix = [variants: ".vcf", geno: ".vcf", pheno: ".vcf"] | ||
break | ||
default: | ||
Nextflow.error("Invalid format: ${row.format}") | ||
} | ||
|
||
// automatically prefer compressed variant information data (and vcfs) | ||
def variant_path = suffix.subMap("variants").collect { k, v -> | ||
def f | ||
try { | ||
// always prefer zstd compressed data (nobody does this to VCFs... hopefully) | ||
f = Nextflow.file(resolved_path + v + ".zst", checkIfExists: true) | ||
} | ||
catch (NoSuchFileException zst_e) { | ||
try { | ||
// but gzipped is OK too | ||
f = Nextflow.file(resolved_path + v + ".gz", checkIfExists: true) | ||
} catch (NoSuchFileException gzip_e) { | ||
// try uncompressed data as last resort | ||
f = Nextflow.file(resolved_path + v, checkIfExists: true) | ||
} | ||
} | ||
return f | ||
} | ||
|
||
def path_list | ||
if (row.format != "vcf") { | ||
def other_paths = suffix.subMap(["geno", "pheno"]).collect { k, v -> | ||
Nextflow.file(resolved_path + v, checkIfExists: true) | ||
} | ||
path_list = other_paths.plus(1, variant_path) | ||
} else { | ||
// vcfs aren't split into geno / variant / pheno | ||
path_list = variant_path | ||
} | ||
|
||
return path_list | ||
} | ||
|
||
private def resolvePath(path) { | ||
// paths in a CSV samplesheet might be relative, and should be resolved from the samplesheet path | ||
def is_absolute = path.startsWith('/') // isAbsolute() was causing weird issues | ||
|
||
def resolved_path | ||
if (is_absolute) { | ||
resolved_path = Nextflow.file(path).resolve() | ||
} else { | ||
resolved_path = Nextflow.file(this.path).getParent().resolve(path) | ||
} | ||
|
||
return resolved_path | ||
} | ||
|
||
private static def importDosage(row) { | ||
// vcf_genotype_field is an optional field in the samplesheet | ||
def vcf_import_dosage = false | ||
if (row.containsKey("vcf_genotype_field")) { | ||
if (row["vcf_genotype_field"] == "DS") { | ||
vcf_import_dosage = true | ||
} | ||
} | ||
|
||
return vcf_import_dosage | ||
} | ||
|
||
// samplesheet verification methods from here | ||
|
||
private def checkChroms(rows) { | ||
// one missing chromosome (i.e. a combined file) is OK. more than this isn't | ||
def chroms = rows.collect { row -> row.chrom } | ||
def n_empty_chrom = chroms.count { it == "" } | ||
if (n_empty_chrom > 1) { | ||
Nextflow.error("${n_empty_chrom} missing chromosomes detected! Maximum is 1. Check your samplesheet.") | ||
} | ||
} | ||
|
||
private def checkOneSampleset(rows) { | ||
def samplesets = rows.collect { row -> row.sampleset } | ||
def n_samplesets = samplesets.toSet().size() | ||
if (n_samplesets > 1) { | ||
Nextflow.error("${n_samplesets} missing chromosomes detected! Maximum is 1. Check your samplesheet.") | ||
} | ||
} | ||
|
||
private def checkReservedName(samplesheet) { | ||
def samplesets = samplesheet.collect { row -> row.sampleset } | ||
def n_bad_name = samplesets.count { it == "reference" } | ||
|
||
if (n_bad_name != 0) { | ||
Nextflow.error("Reserved sampleset name detected. Please don't call your sampleset 'reference'") | ||
} | ||
} | ||
|
||
private def checkDuplicateChromosomes(samplesheet) { | ||
def chroms = samplesheet.collect { row -> row.chrom } | ||
def n_unique_chroms = chroms.toSet().size() | ||
def n_chroms = chroms.size() | ||
|
||
if (n_unique_chroms != n_chroms) { | ||
Nextflow.error("Duplicated chromosome entries detected in samplesheet. Check your samplesheet.") | ||
} | ||
} | ||
|
||
} | ||
|
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.