Use groovy to parse samplesheets (#243)

* implement groovy functions for samplesheet parsing * set up utility class * add build to map * add json support * rename json parser * delete samplesheet module * remove mount docs (deprecated) * remove out.json from expected test output * fix parsing vcf samplesheet
PGScatalog · Feb 15, 2024 · cc163b7 · cc163b7
1 parent 3161c07
commit cc163b7
Show file tree

Hide file tree

Showing 12 changed files with 244 additions and 168 deletions.
diff --git a/assets/examples/samplesheet.json b/assets/examples/samplesheet.json
@@ -1 +1,11 @@
-[{"sampleset":"cineca_synthetic_subset","vcf_path":null,"chrom":22,"bed":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim","bim":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed","fam":"https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam"}]
+[
+    {
+        "sampleset": "hgdp",
+        "chrom": null,
+        "vcf_import_dosage": false,
+        "geno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bed",
+        "pheno": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.fam",
+        "variants": "https:\/\/gitlab.ebi.ac.uk\/nebfield\/test-datasets\/-\/raw\/master\/pgsc_calc\/cineca_synthetic_subset.bim",
+        "format": "bfile"
+    }
+]
diff --git a/docs/how-to/index.rst b/docs/how-to/index.rst
@@ -42,7 +42,6 @@ Working in different environments
    bigjob
    arm
    offline
-   mount
 
 Working with complex datasets
 -----------------------------

diff --git a/docs/how-to/mount.rst b/docs/how-to/mount.rst
diff --git a/lib/SamplesheetParser.groovy b/lib/SamplesheetParser.groovy
@@ -0,0 +1,195 @@
+import java.nio.file.NoSuchFileException
+import java.nio.file.Path
+import nextflow.Nextflow
+
+/**
+ * Parses samplesheet entries (CSV rows or JSON objects) into a pair of
+ * [metadata map, list of genotype files], and provides sanity checks that
+ * can be run over all raw samplesheet rows.
+ */
+class SamplesheetParser {
+    // location of the samplesheet; relative path_prefix values are resolved against its parent
+    Path path
+    // copied into the metadata of every parsed CSV row
+    Integer n_chrom
+    // genome build recorded in the metadata of every parsed entry
+    String target_build
+
+    /**
+     * @param path          path of the samplesheet file
+     * @param n_chrom       value stored as n_chrom in every parsed CSV row
+     * @param target_build  value stored as build in every parsed entry
+     */
+    SamplesheetParser(path, n_chrom, target_build) {
+        this.path = path
+        // the parameter used to be misspelled (n_chro), which made this a
+        // self-assignment and left the field null
+        this.n_chrom = n_chrom
+        this.target_build = target_build
+    }
+
+    /**
+     * Parse one row of a CSV samplesheet.
+     *
+     * @param row  map-like row with sampleset, chrom, format, path_prefix and
+     *             (optionally) vcf_genotype_field keys
+     * @return a two element list: [metadata map, list of files]
+     */
+    def parseCSVRow(row) {
+        def parsed_row = [:]
+        parsed_row.id = row.sampleset
+        parsed_row.n_chrom = this.n_chrom
+        parsed_row.chrom = truncateChrom(row)
+        parsed_row.format = row.format
+        parsed_row.build = this.target_build
+        parsed_row.vcf_import_dosage = importDosage(row)
+        parsed_row = parsed_row + getFlagMap(row)
+
+        return [parsed_row, getFilePaths(row)]
+    }
+
+    /**
+     * Parse one entry of a JSON samplesheet.
+     *
+     * @param json  map with sampleset, chrom, format, geno, variants and pheno
+     *              keys (vcf_import_dosage and n_chrom are copied if present)
+     * @return a two element list: [metadata map, [geno, variants, pheno]]
+     */
+    def parseJSON(json) {
+        // note: we don't check for file existence here
+        // relative paths won't work, because the JSON object doesn't use path_prefix
+        def parsed = json.subMap("chrom", "vcf_import_dosage", "n_chrom", "format")
+        parsed.id = json.sampleset
+        parsed = parsed + getFlagMap(json)
+        parsed.build = this.target_build
+        parsed.chrom = truncateChrom(json)
+
+        return [parsed, [json.geno, json.variants, json.pheno]]
+    }
+
+    /**
+     * Run every samplesheet-wide check over the raw (unparsed) rows.
+     *
+     * @param rows  collection of raw rows (each with sampleset and chrom keys)
+     * @return the rows, unchanged, so the call can be chained
+     */
+    def verifySamplesheet(rows) {
+        checkChroms(rows)
+        checkOneSampleset(rows)
+        checkDuplicateChromosomes(rows)
+        checkReservedName(rows)
+        return rows
+    }
+
+    /**
+     * Build the is_vcf / is_bfile / is_pfile boolean flags from row.format.
+     * Used for both JSON and CSV entries; any other format is reported
+     * through Nextflow.error.
+     */
+    private def getFlagMap(row) {
+        def flags = [is_vcf: false, is_bfile: false, is_pfile: false]
+
+        switch (row.format) {
+            case "pfile":
+                flags.is_pfile = true
+                break
+            case "bfile":
+                flags.is_bfile = true
+                break
+            case "vcf":
+                flags.is_vcf = true
+                break
+            default:
+                Nextflow.error("Invalid format: ${row.format}")
+        }
+        return flags
+    }
+
+    /**
+     * Normalise the chromosome label of a row.
+     *
+     * @return the chromosome as a string without a leading "chr", or "ALL"
+     *         when the row has no chromosome
+     */
+    private static def truncateChrom(row) {
+        // when plink recodes chromosomes, it drops chr prefix. make sure the samplesheet matches this
+        // replaceFirst takes a regex: anchor it so only a leading "chr" is removed
+        return row.chrom ? row.chrom.toString().replaceFirst("^chr", "") : "ALL"
+    }
+
+    /**
+     * Resolve the files that belong to a CSV row from its path_prefix and format.
+     *
+     * @return for pfile / bfile a list ordered geno, variants, pheno; for vcf a
+     *         single element list containing the vcf
+     */
+    private def getFilePaths(row) {
+        def resolved_path = resolvePath(row.path_prefix)
+        def suffix = [:]
+
+        switch (row.format) {
+            case "pfile":
+                suffix = [variants: ".pvar", geno: ".pgen", pheno: ".psam"]
+                break
+            case "bfile":
+                suffix = [variants: ".bim", geno: ".bed", pheno: ".fam"]
+                break
+            case "vcf":
+                // gzip compression gets picked up later
+                suffix = [variants: ".vcf", geno: ".vcf", pheno: ".vcf"]
+                break
+            default:
+                Nextflow.error("Invalid format: ${row.format}")
+        }
+
+        // automatically prefer compressed variant information data (and vcfs)
+        def variant_path = suffix.subMap("variants").collect { k, v ->
+            def f
+            try {
+                // always prefer zstd compressed data (nobody does this to VCFs... hopefully)
+                f = Nextflow.file(resolved_path + v + ".zst", checkIfExists: true)
+            }
+            catch (NoSuchFileException zst_e) {
+                try {
+                    // but gzipped is OK too
+                    f = Nextflow.file(resolved_path + v + ".gz", checkIfExists: true)
+                } catch (NoSuchFileException gzip_e) {
+                    // try uncompressed data as last resort
+                    // (not caught: a missing file propagates to the caller)
+                    f = Nextflow.file(resolved_path + v, checkIfExists: true)
+                }
+            }
+            return f
+        }
+
+        def path_list
+        if (row.format != "vcf") {
+            def other_paths = suffix.subMap(["geno", "pheno"]).collect { k, v ->
+                Nextflow.file(resolved_path + v, checkIfExists: true)
+            }
+            // insert the variant file between geno and pheno
+            path_list = other_paths.plus(1, variant_path)
+        } else {
+            // vcfs aren't split into geno / variant / pheno
+            path_list = variant_path
+        }
+
+        return path_list
+    }
+
+    /**
+     * Resolve a path_prefix from a CSV samplesheet. Prefixes starting with '/'
+     * are used as they are; anything else is resolved against the directory
+     * that contains the samplesheet.
+     */
+    private def resolvePath(path) {
+        if (!path) {
+            // fail with a readable message instead of a NullPointerException below
+            Nextflow.error("Missing path_prefix in samplesheet. Check your samplesheet.")
+        }
+
+        // paths in a CSV samplesheet might be relative, and should be resolved from the samplesheet path
+        def is_absolute = path.startsWith('/') // isAbsolute() was causing weird issues
+
+        def resolved_path
+        if (is_absolute) {
+            resolved_path = Nextflow.file(path).resolve()
+        } else {
+            resolved_path = Nextflow.file(this.path).getParent().resolve(path)
+        }
+
+        return resolved_path
+    }
+
+    /**
+     * @return true only when the optional vcf_genotype_field column is present
+     *         and equal to "DS"
+     */
+    private static def importDosage(row) {
+        return row.containsKey("vcf_genotype_field") && row["vcf_genotype_field"] == "DS"
+    }
+
+    // samplesheet verification methods from here
+
+    /**
+     * One missing chromosome (i.e. a combined file) is OK. More than this isn't.
+     */
+    private def checkChroms(rows) {
+        // empty CSV fields and JSON nulls both count as "missing"
+        def n_empty_chrom = rows.count { row -> row.chrom == null || row.chrom == "" }
+        if (n_empty_chrom > 1) {
+            Nextflow.error("${n_empty_chrom} missing chromosomes detected! Maximum is 1. Check your samplesheet.")
+        }
+    }
+
+    /**
+     * All rows of a samplesheet must share a single sampleset name.
+     */
+    private def checkOneSampleset(rows) {
+        def n_samplesets = rows.collect { row -> row.sampleset }.toSet().size()
+        if (n_samplesets > 1) {
+            Nextflow.error("${n_samplesets} samplesets detected! Maximum is 1. Check your samplesheet.")
+        }
+    }
+
+    /**
+     * The sampleset name 'reference' is reserved and must not be used.
+     */
+    private def checkReservedName(samplesheet) {
+        def n_bad_name = samplesheet.count { row -> row.sampleset == "reference" }
+
+        if (n_bad_name != 0) {
+            Nextflow.error("Reserved sampleset name detected. Please don't call your sampleset 'reference'")
+        }
+    }
+
+    /**
+     * Each chromosome may appear in at most one row of the samplesheet.
+     */
+    private def checkDuplicateChromosomes(samplesheet) {
+        def chroms = samplesheet.collect { row -> row.chrom }
+        def n_unique_chroms = chroms.toSet().size()
+
+        if (n_unique_chroms != chroms.size()) {
+            Nextflow.error("Duplicated chromosome entries detected in samplesheet. Check your samplesheet.")
+        }
+    }
+
+}
+
diff --git a/modules/local/samplesheet_json.nf b/modules/local/samplesheet_json.nf