Use groovy to parse samplesheets #243

Merged · 9 commits · Feb 15, 2024
12 changes: 11 additions & 1 deletion assets/examples/samplesheet.json
@@ -1 +1,11 @@
[{"sampleset":"cineca_synthetic_subset","vcf_path":null,"chrom":22,"bed":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim","bim":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed","fam":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam"}]
[
{
"sampleset": "hgdp",
"chrom": null,
"vcf_import_dosage": false,
"geno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed",
"pheno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam",
"variants": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim",
"format": "bfile"
}
]
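
For orientation, here is a hedged sketch of how the new SamplesheetParser.parseJSON (added below in lib/SamplesheetParser.groovy) would consume the first entry of this example file. The samplesheet path, n_chrom, and target_build arguments passed to the constructor are illustrative assumptions, not values taken from the pipeline configuration.

// Illustrative sketch only: constructor arguments are assumptions, not pipeline defaults
import groovy.json.JsonSlurper
import nextflow.Nextflow

def entries = new JsonSlurper().parse(new File("assets/examples/samplesheet.json"))
def parser  = new SamplesheetParser(
    Nextflow.file("assets/examples/samplesheet.json"), // path to the samplesheet itself
    22,                                                // n_chrom (assumed)
    "GRCh38"                                           // target_build (assumed)
)
def (meta, paths) = parser.parseJSON(entries[0])
// meta  -> roughly [id: "hgdp", chrom: "ALL", format: "bfile", build: "GRCh38",
//                   vcf_import_dosage: false, is_bfile: true, is_pfile: false, is_vcf: false]
// paths -> the geno, variants, and pheno URLs copied verbatim from the JSON entry
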
1 change: 0 additions & 1 deletion docs/how-to/index.rst
@@ -42,7 +42,6 @@ Working in different environments
bigjob
arm
offline
mount

Working with complex datasets
-----------------------------
33 changes: 0 additions & 33 deletions docs/how-to/mount.rst

This file was deleted.

195 changes: 195 additions & 0 deletions lib/SamplesheetParser.groovy
@@ -0,0 +1,195 @@
import java.nio.file.NoSuchFileException
import java.nio.file.Path
import nextflow.Nextflow

class SamplesheetParser {
Path path
Integer n_chrom
String target_build

    SamplesheetParser(path, n_chrom, target_build) {
this.path = path
this.n_chrom = n_chrom
this.target_build = target_build
}

def parseCSVRow(row) {
def parsed_row = [:]
parsed_row.id = row.sampleset
parsed_row.n_chrom = this.n_chrom
parsed_row.chrom = truncateChrom(row)
parsed_row.format = row.format
parsed_row.build = this.target_build
parsed_row.vcf_import_dosage = importDosage(row)
parsed_row = parsed_row + getFlagMap(row)

return [parsed_row, getFilePaths(row)]
}

def parseJSON(json) {
// note: we don't check for file existence here
// relative paths won't work, because the JSON object doesn't use path_prefix
def parsed = json.subMap("chrom", "vcf_import_dosage", "n_chrom", "format")
parsed.id = json.sampleset
parsed = parsed + getFlagMap(json)
parsed.build = this.target_build
parsed.chrom = truncateChrom(json)

return [parsed, [json.geno, json.variants, json.pheno]]
}

def verifySamplesheet(rows) {
checkChroms(rows)
checkOneSampleset(rows)
checkDuplicateChromosomes(rows)
checkReservedName(rows)
return rows
}

private def getFlagMap(row) {
// make a map with some helpful bool flags. useful for both JSON and CSV
def flags = [:]
flags.is_vcf = false
flags.is_bfile = false
flags.is_pfile = false

switch (row.format) {
case "pfile":
flags.is_pfile = true
break
case "bfile":
flags.is_bfile = true
break
case "vcf":
flags.is_vcf = true
break
default:
Nextflow.error("Invalid format: ${row.format}")
}
return flags
}

private static def truncateChrom(row) {
// when plink recodes chromosomes, it drops chr prefix. make sure the samplesheet matches this
return row.chrom ? row.chrom.toString().replaceFirst("chr", "") : "ALL"
}

private def getFilePaths(row) {
// return a list in order of geno, variants, pheno
def resolved_path = resolvePath(row.path_prefix)
def suffix = [:]

switch (row.format) {
case "pfile":
suffix = [variants: ".pvar", geno: ".pgen", pheno: ".psam"]
break
case "bfile":
suffix = [variants: ".bim", geno: ".bed", pheno: ".fam"]
break
case "vcf":
// gzip compression gets picked up later
suffix = [variants: ".vcf", geno: ".vcf", pheno: ".vcf"]
break
default:
Nextflow.error("Invalid format: ${row.format}")
}

// automatically prefer compressed variant information data (and vcfs)
def variant_path = suffix.subMap("variants").collect { k, v ->
def f
try {
// always prefer zstd compressed data (nobody does this to VCFs... hopefully)
f = Nextflow.file(resolved_path + v + ".zst", checkIfExists: true)
}
catch (NoSuchFileException zst_e) {
try {
// but gzipped is OK too
f = Nextflow.file(resolved_path + v + ".gz", checkIfExists: true)
} catch (NoSuchFileException gzip_e) {
// try uncompressed data as last resort
f = Nextflow.file(resolved_path + v, checkIfExists: true)
}
}
return f
}

def path_list
if (row.format != "vcf") {
def other_paths = suffix.subMap(["geno", "pheno"]).collect { k, v ->
Nextflow.file(resolved_path + v, checkIfExists: true)
}
path_list = other_paths.plus(1, variant_path)
} else {
// vcfs aren't split into geno / variant / pheno
path_list = variant_path
}

return path_list
}

private def resolvePath(path) {
// paths in a CSV samplesheet might be relative, and should be resolved from the samplesheet path
def is_absolute = path.startsWith('/') // isAbsolute() was causing weird issues

def resolved_path
if (is_absolute) {
resolved_path = Nextflow.file(path).resolve()
} else {
resolved_path = Nextflow.file(this.path).getParent().resolve(path)
}

return resolved_path
}

private static def importDosage(row) {
// vcf_genotype_field is an optional field in the samplesheet
def vcf_import_dosage = false
if (row.containsKey("vcf_genotype_field")) {
if (row["vcf_genotype_field"] == "DS") {
vcf_import_dosage = true
}
}

return vcf_import_dosage
}

// samplesheet verification methods from here

private def checkChroms(rows) {
// one missing chromosome (i.e. a combined file) is OK. more than this isn't
def chroms = rows.collect { row -> row.chrom }
def n_empty_chrom = chroms.count { it == "" }
if (n_empty_chrom > 1) {
Nextflow.error("${n_empty_chrom} missing chromosomes detected! Maximum is 1. Check your samplesheet.")
}
}

private def checkOneSampleset(rows) {
def samplesets = rows.collect { row -> row.sampleset }
def n_samplesets = samplesets.toSet().size()
if (n_samplesets > 1) {
Nextflow.error("${n_samplesets} missing chromosomes detected! Maximum is 1. Check your samplesheet.")
}
}

private def checkReservedName(samplesheet) {
def samplesets = samplesheet.collect { row -> row.sampleset }
def n_bad_name = samplesets.count { it == "reference" }

if (n_bad_name != 0) {
Nextflow.error("Reserved sampleset name detected. Please don't call your sampleset 'reference'")
}
}

private def checkDuplicateChromosomes(samplesheet) {
def chroms = samplesheet.collect { row -> row.chrom }
def n_unique_chroms = chroms.toSet().size()
def n_chroms = chroms.size()

if (n_unique_chroms != n_chroms) {
Nextflow.error("Duplicated chromosome entries detected in samplesheet. Check your samplesheet.")
}
}

}
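
As a usage note, here is a minimal sketch of how this class might be wired into a Nextflow workflow for a CSV samplesheet. The parameter names (params.input, params.n_chrom, params.target_build), the channel name, and the CSV columns (sampleset, path_prefix, format, chrom, vcf_genotype_field) are inferred from parseCSVRow for illustration only; they are not taken from the pipeline's actual subworkflow.

// Illustrative sketch only: parameter and channel names are assumptions
def parser = new SamplesheetParser(file(params.input), params.n_chrom, params.target_build)

Channel
    .fromPath(params.input)
    .splitCsv(header: true)                  // one map per CSV row
    .map { row -> parser.parseCSVRow(row) }  // -> [meta map, [geno, variants, pheno] paths]
    .set { ch_input }
// rows could also be collected first and passed through parser.verifySamplesheet(rows)
// to run the chromosome / sampleset / reserved-name checks before parsing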

31 changes: 0 additions & 31 deletions modules/local/samplesheet_json.nf

This file was deleted.
