
Merging all changes from the IMBG-refactoring branch to the main branch #4

Open
wants to merge 109 commits into base: main
109 commits
b9afcf9
add test data config and TODOs for steps 1.1-2.3
d-goryslavets Jun 28, 2024
1d2aabb
add test suite skeleton
d-goryslavets Jun 28, 2024
9ed44a1
add TODOs for steps 2.1-2.5 and update paths to point to test data
d-goryslavets Jul 2, 2024
117b2d1
minimal refactoring. add tmp_dir config. add mk43_ prefix to shared i…
mkrooted256 Jul 4, 2024
df04bd0
draft of a get_path util
mkrooted256 Jul 4, 2024
268affd
successful 2.1. replace + with os.path.join. annotate hardcoded paths…
mkrooted256 Jul 4, 2024
8354844
add test suite skeleton and test data
d-goryslavets Jul 15, 2024
0023053
Merge branch 'mk43/first-run' into imbg-refactoring. Refactor steps 1…
d-goryslavets Jul 15, 2024
5890acd
add tmp_dir to test data configs
d-goryslavets Jul 15, 2024
f77d107
cleanup 1.1
mkrooted256 Jul 15, 2024
7bec087
first attempts
mkrooted256 Jul 16, 2024
2cdf407
steps 1.1-2.1: refactor config and move hardcoded values
mkrooted256 Jul 18, 2024
68c2413
add todo
mkrooted256 Jul 23, 2024
1412199
upload documentation
lbombini Jul 25, 2024
73a6373
change smoke test design and add unit test template
d-goryslavets Jul 18, 2024
efab22d
change string concat to os.path.join in steps 1-2, make some function…
d-goryslavets Jul 25, 2024
e2cf204
add integration tests that run main() function of each step
d-goryslavets Jul 25, 2024
261ac9b
add unit tests for steps 1.1-2.2
d-goryslavets Jul 25, 2024
052ee3d
add a snippet for new config system. research possible ways to refact…
mkrooted256 Jul 25, 2024
b5329a8
add config utils and config ipynb playground
mkrooted256 Jul 25, 2024
1990d99
refactor path config for 1.1
mkrooted256 Jul 25, 2024
f02b394
Finish config refactoring for 2.1.
mkrooted256 Jul 26, 2024
e6c5f64
update the docs
lbombini Jul 31, 2024
10a28fa
Merge branch 'imbg-refactoring' of https://github.com/wtsi-hgi/wes-qc…
lbombini Jul 31, 2024
835f77a
fix the code blocks
lbombini Aug 1, 2024
6ee2fcb
refactor path concatenation in steps 2.3-2.5
d-goryslavets Aug 1, 2024
0de9822
add unit and integration tests for steps 2.3-2.5
d-goryslavets Aug 1, 2024
ffe66fd
Merge branch 'imbg-refactoring' into mk43/refactor-hardcoded-config/2.1
mkrooted256 Aug 4, 2024
3b8fb10
add config tests. move config utils to a separate file.
mkrooted256 Aug 4, 2024
adfc9c1
fix a probable config bug
mkrooted256 Aug 4, 2024
c44116a
refactor unittests for 1.1-2.1
mkrooted256 Aug 7, 2024
7790826
minor changes to run 1.1 and 2.1
lbombini Aug 8, 2024
1beef0f
revert stratified_sample_qc to be general
y-popov Aug 15, 2024
9e05cee
fix circular import and name mangling in utils
d-goryslavets Aug 20, 2024
f24d3e2
fix step that required its own output files to run
d-goryslavets Aug 20, 2024
0a4d776
upd unit tests to match refactored steps 1.1-2.1
d-goryslavets Aug 20, 2024
55bd50c
Merge pull request #1 from wtsi-hgi/fix-2.4
d-goryslavets Aug 21, 2024
1404f95
upd unit test after stratified sample qc change
d-goryslavets Aug 22, 2024
36e6588
Refactor 2.2. Minor change in config naming. Change config fields an…
mkrooted256 Aug 22, 2024
32ad76a
Remove sanity checks in direct kwargs config groups. Fix ld_prune con…
mkrooted256 Aug 22, 2024
3001b1c
integrate test data download
lbombini Aug 22, 2024
4d3635f
Merge branch 'mk43/refactor-hardcoded-config/2.2' into dh24/refactor-…
d-goryslavets Aug 27, 2024
5e956b7
Extract hardcoded values from step 2.4; add stub 2.3 config
mkrooted256 Aug 29, 2024
c11b704
Add path type adapters in step 2.4
mkrooted256 Aug 29, 2024
090b5b7
Complete function annotations with indirect config fields in 2.4
mkrooted256 Aug 29, 2024
80f0c01
refactor step 2.3
d-goryslavets Aug 29, 2024
58074c4
upd integration test config structure
d-goryslavets Aug 29, 2024
42a0972
Extract hardcoded values from 2.5; add path type adapters
mkrooted256 Aug 29, 2024
ac10add
update unit tests to reflect refactored steps 2.2
lbombini Aug 29, 2024
721f5f6
download data for integration tests from s3 bucket
d-goryslavets Aug 30, 2024
469b6b4
Merge branch 'vk11/refactor-unit-test/2.2' into vk11/refactor-unit-te…
lbombini Aug 30, 2024
f1a6b36
download only required data from the bucket and fix back stratified s…
d-goryslavets Aug 30, 2024
8d81160
update the unit test to reflect changes in step 2.3
lbombini Aug 30, 2024
99151ea
fix config and add ref_mtdir
lbombini Aug 30, 2024
0f7b5b1
Merge branch 'mk43/refactor-hardcoded-config/2.5' of https://github.c…
lbombini Sep 2, 2024
b26eb21
rename path >> file and 2.2 functions in config
lbombini Sep 2, 2024
0874a4b
Merge branch 'dh24/refactor-hardcoded-config/2.3' into imbg-refactori…
d-goryslavets Sep 3, 2024
635a6ed
Merge branch 'vk11/refactor-unit-test/2.3' into imbg-refactoring, upd…
d-goryslavets Sep 3, 2024
240995d
minor fixes to run 2.4 test
lbombini Sep 3, 2024
9ca3f28
Merge branch 'mk43/refactor-hardcoded-config/2.5' into imbg-refactori…
d-goryslavets Sep 3, 2024
15a995e
Merge branch 'vk11/refactor-unit-test/2.4-2.5' into imbg-refactoring
d-goryslavets Sep 3, 2024
d606ccc
Add stub config argument for consistency
mkrooted256 Sep 4, 2024
de8c82e
more config tests. first version of new config parser. config docs
mkrooted256 Sep 5, 2024
c2100da
add test data downloading from s3 to integration tests, upd configs, …
d-goryslavets Sep 6, 2024
cf2d327
refactor test data downloading into utils func, use it in unit tests
d-goryslavets Sep 6, 2024
6cdf3ab
new config parser. add tests. add docs. enhance code structure.
mkrooted256 Sep 6, 2024
b1f45c3
extract hardcoded params and add path type adapters for trios and non…
d-goryslavets Sep 9, 2024
4dd26f6
step 3.2: extract hardcoded params and add path type adapters for tri…
d-goryslavets Sep 9, 2024
b2fed93
step 3.3: extract hardcoded params and add path type adapters
d-goryslavets Sep 17, 2024
9c770a5
rename config tests dir back to local_tests for a merge
mkrooted256 Sep 17, 2024
c84c2de
Merge branch 'mk43/new-config-parser' into mk43/parser/v2.0-pre
mkrooted256 Sep 17, 2024
d6419d2
start fixing config tests. prepare a new config system
mkrooted256 Sep 17, 2024
445219d
rename local_tests to config_tests
mkrooted256 Sep 17, 2024
e6f968b
finish parser v2.0-pre. Ready for beta
mkrooted256 Sep 17, 2024
1b37f73
steps 3.4-3.9: extract hardcoded params, add path type adapters, fix …
d-goryslavets Sep 19, 2024
3122a1e
download RF training data from the s3 bucket in tests
d-goryslavets Sep 19, 2024
2f71069
steps 3.4-3.9: add missing path adapters, fix incorrect paths, refact…
d-goryslavets Sep 19, 2024
04892e1
3.4-3.9 add missing adapters, fix bokeh imports and other bugs
d-goryslavets Sep 20, 2024
1c67082
3.9 add missing path adapter
d-goryslavets Sep 20, 2024
d1ae745
fix and improve integration tests config
d-goryslavets Sep 20, 2024
09fee1e
refactor step 4: move hardcoded params to config, add path type adapt…
d-goryslavets Sep 20, 2024
6d291b5
add hail logs to gitignore
d-goryslavets Sep 20, 2024
3bf274c
add step 3 integration tests
lbombini Sep 21, 2024
6df11c8
allow to manually set run id in 3-train_rf
d-goryslavets Sep 23, 2024
b8f12f0
add var_qc_rf_dir to config variables
d-goryslavets Sep 23, 2024
ace32eb
switch to http-based test data downloading in unit tests; fix incorre…
d-goryslavets Sep 23, 2024
afe7460
add updated integration tests template for step 3-variant_qc
d-goryslavets Sep 23, 2024
f2a98fe
Merge branch 'dh24/refactor-hardcoded-config/4' into imbg-refactoring
d-goryslavets Sep 24, 2024
504d3a9
add missing path type adapters in 3.4
d-goryslavets Sep 24, 2024
831f558
upd example config and add script running all steps of QC
d-goryslavets Sep 24, 2024
a47837a
improve helper script to run all steps
d-goryslavets Sep 24, 2024
15d9742
Merge branch 'mk43/parser/v2.0-pre' into imbg-refactoring
d-goryslavets Sep 24, 2024
3afeaa1
Make parse_config_file return nested config and not flat. Add corresp…
mkrooted256 Sep 24, 2024
8b0e7b5
Merge branch 'mk43/parser/v2.0-pre' into imbg-refactoring; fix bugs
d-goryslavets Sep 24, 2024
0f12c2f
change dots to underscores in qc plot settings names
d-goryslavets Sep 24, 2024
03cea82
update config parsing function names in scripts
d-goryslavets Sep 24, 2024
88f98ef
upd test suite to use new config system; add integration tests for 4-…
d-goryslavets Sep 27, 2024
c270e9b
fix plot settings names in integration tests config template
d-goryslavets Sep 27, 2024
0613997
adjust variant qc scripts
y-popov Oct 1, 2024
2112dad
refactor evaluation script
y-popov Oct 3, 2024
e720546
fix indentation and partition calcs
y-popov Oct 3, 2024
0a1defa
make giab sample optional for compare combinations
y-popov Oct 4, 2024
d802b70
add readme for the test suite
d-goryslavets Oct 7, 2024
c23ce27
more robust path join
y-popov Oct 7, 2024
3793952
Merge pull request #2 from wtsi-hgi/ip13-1kg-trio-test
d-goryslavets Oct 11, 2024
0bd3b8d
add skeleton for the remaining regression tests
d-goryslavets Oct 16, 2024
bd5bbe1
add draft for downloading test data using list of files instead of an…
d-goryslavets Oct 16, 2024
6a7e641
create plots dir if doesn't exist; refactor path joining for RF outdir
d-goryslavets Oct 18, 2024
7fca5f9
switch to downloading unarchived test data; upd and refactor tests ac…
d-goryslavets Oct 18, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Hail run logs
hail-*.log

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
67 changes: 44 additions & 23 deletions 1-import_data/1-import_gatk_vcfs_to_hail.py
@@ -1,42 +1,63 @@
# Load GATK VCFs into hail and save as matrixtable
import hail as hl
import pyspark
import yaml
import os
import sys
from wes_qc.utils.utils import parse_config
import re
import hail as hl
from utils.utils import parse_config, path_local, path_spark

# DEBUG: for some reason, paths prefix is `file:`, not a `file://`
VCF_PATTERN = re.compile("file:.*vcf.b?gz")

def load_vcfs_to_mt(indir, outdir, tmp_dir, header):
def load_vcfs_to_mt(config):
'''
load VCFs and save as hail mt
load VCFs and save as hail mt.
Save mt as outdir/gatk_unprocessed.mt

### Config fields
```
step1.gatk_vcf_header_infile
step1.gatk_vcf_indir
step1.gatk_mt_outfile
```
'''
objects = hl.utils.hadoop_ls(indir)
vcfs = [vcf["path"] for vcf in objects if (vcf["path"].startswith("file") and vcf["path"].endswith("vcf.gz"))]
print("Loading VCFs")
indir, header, outfile = (
config['step1']['gatk_vcf_indir'],
config['step1'].get('gatk_vcf_header_infile'), # optional
config['step1']['gatk_mt_outfile']
)

objects = hl.utils.hadoop_ls(path_spark(indir))

# get paths of all vcf files
vcfs = [vcf["path"] for vcf in objects if VCF_PATTERN.match(vcf["path"])]
print(f"info: Found {len(vcfs)} VCFs in {indir}")
#create and save MT
mt = hl.import_vcf(vcfs, array_elements_required=False, force_bgz=True, header_file = header)
print("Saving as hail mt")
mt_out_file = outdir + "gatk_unprocessed.mt"
if header:
print("info: Loading VCFs with header")
mt = hl.import_vcf(vcfs, array_elements_required=False, force_bgz=True, header_file=header)
else:
print("info: Loading VCFs WITHOUT header")
mt = hl.import_vcf(vcfs, array_elements_required=False, force_bgz=True)

mt_out_file = path_spark(outfile)
print(f"Saving as hail mt to {mt_out_file}")
mt.write(mt_out_file, overwrite=True)


def main():
#set up input variables
inputs = parse_config()
vcf_header = inputs['gatk_vcf_header']
import_vcf_dir = inputs['gatk_import_lustre_dir']
mtdir = inputs['matrixtables_lustre_dir']

config = parse_config()

#initialise hail
tmp_dir = "hdfs://spark-master:9820/"
sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
tmp_dir = config['general']['tmp_dir']
# sc = pyspark.SparkContext()
sc = pyspark.SparkContext.getOrCreate()
hadoop_config = sc._jsc.hadoopConfiguration() # unused
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38", idempotent=True)

#load VCFs
load_vcfs_to_mt(import_vcf_dir, mtdir, tmp_dir, vcf_header)

load_vcfs_to_mt(config)

if __name__ == '__main__':
main()
main()
114 changes: 77 additions & 37 deletions 2-sample_qc/1-hard_filters_sex_annotation.py
@@ -1,68 +1,104 @@
#apply gnomad's hard filters and impute sex
#input gatk_unprocessed.mt from step 1.1
import os
import hail as hl
import hailtop.fs as hfs
import pyspark
from utils.utils import parse_config
from utils.utils import parse_config, path_local, path_spark
import os

def apply_hard_filters(mt: hl.MatrixTable, mtdir: str) -> hl.MatrixTable:
def apply_hard_filters(mt: hl.MatrixTable, config: dict) -> hl.MatrixTable:
'''
Applies hard filters and annotates samples in the filtered set with call rate
:param MatrixTable mt: MT containing samples to be ascertained for sex
:param str mtdir: directory output matrix tables are written to
:param dict config:
:return: MatrixTable with hard filtering annotation
:rtype: MatrixTable

### Config fields
step2.sex_annotation_hard_filters.filtered_mt_outfile : path
step2.sex_annotation_hard_filters.n_alt_alleles_threshold : float
step2.sex_annotation_hard_filters.defined_gt_frac_threshold : float
'''
conf = config['step2']['sex_annotation_hard_filters']

print("Applying hard filters")
filtered_mt_file = mtdir + "mt_hard_filters_annotated.mt"
filtered_mt_file = path_spark(conf['filtered_mt_outfile']) # output

mt = mt.filter_rows((hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) &
(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001) &
(hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99))
(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > conf['n_alt_alleles_threshold']) &
(hl.agg.fraction(hl.is_defined(mt.GT)) > conf['defined_gt_frac_threshold']))
mt = mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
mt.write(filtered_mt_file, overwrite=True)

return mt


def impute_sex(mt: hl.MatrixTable, mtdir: str, annotdir: str, male_threshold: float = 0.8, female_threshold: float = 0.5) -> hl.MatrixTable:
def impute_sex(mt: hl.MatrixTable, config: dict) -> hl.MatrixTable:
'''
Imputes sex, exports data, and annotates mt with this data
:param MatrixTable mt: MT containing samples to be ascertained for sex
:param str mtdir: directory output matrix tables are written to
:param str annotdir: directory annotation files are written to
:param dict config:
:return: MatrixTable with imputed sex annotations stashed in column annotation 'sex_check'
:rtype: MatrixTable

### Config fields
step2.impute_sex.sex_ht_outfile : path
step2.impute_sex.sex_mt_outfile : path
step2.impute_sex.female_threshold : float
step2.impute_sex.male_threshold : float
step2.impute_sex.aaf_threshold : float
'''
print("Imputing sex with male_threshold = " + str(male_threshold) + " and female threshold = " + str(female_threshold))

conf = config['step2']['impute_sex']
print("Imputing sex with male_threshold = " + str(conf['male_threshold']) + " and female threshold = " + str(conf['female_threshold']))

#filter to X and select unphased diploid genotypes - no need to filter to X as impute_sex takes care of this
#mt1 = hl.filter_intervals(mt, [hl.parse_locus_interval('chrX')])
mt1 = hl.split_multi_hts(mt)
mtx_unphased = mt1.select_entries(GT=hl.unphased_diploid_gt_index_call(mt1.GT.n_alt_alleles()))
#imput sex on the unphased diploid GTs
sex_ht = hl.impute_sex(mtx_unphased.GT, aaf_threshold=0.05, female_threshold=female_threshold, male_threshold=male_threshold)
sex_ht = hl.impute_sex(mtx_unphased.GT, aaf_threshold=conf['aaf_threshold'], female_threshold=conf['female_threshold'], male_threshold=conf['male_threshold'])
#export
sex_ht.export(annotdir + '/sex_annotated.sex_check.txt.bgz')
sex_ht.export(path_spark(conf['sex_ht_outfile'])) # output
#annotate input (all chroms) mt with imputed sex and write to file
sex_colnames = ['f_stat', 'is_female']
sex_ht = sex_ht.select(*sex_colnames)
mt = mt.annotate_cols(**sex_ht[mt.col_key])
sex_mt_file = mtdir + "mt_sex_annotated.mt"
sex_mt_file = path_spark(conf['sex_mt_outfile']) # output
print("Writing to " + sex_mt_file)
mt.write(sex_mt_file, overwrite=True)

return mt


def identify_inconsistencies(mt: hl.MatrixTable, mtdir: str, annotdir: str, resourcedir: str):
def identify_inconsistencies(mt: hl.MatrixTable, config: dict):
'''
Find samples where annotated sex conflicts with the sex in our metadata
Find samples where sex is not annotated
Find samples where f_stat is between 0.2 and 0.8
Find samples where f_stat is between fstat_low and fstat_high
:param MatrixTable mt: MT containing imputed sex in column 'sex_check'
:param str mtdir: directory output matrix tables are written to
:param str annotdir: directory annotation files are written to
:param str resourcedir: directory annotation files are written to
:param dict config:

### Config fields
step2.sex_inconsistencies.sex_metadata_file : input path : TODO explain metadata structure and constants
step2.sex_inconsistencies.conflicting_sex_report_file : output path : TODO
step2.sex_inconsistencies.fstat_outliers_report_file : output path : TODO
step2.sex_inconsistencies.fstat_low : float
step2.sex_inconsistencies.fstat_high : float
'''
conf = config['step2']['sex_inconsistencies']

# TODO: do we need such a detailed logging, or a single if (... and ... and ...) will suffice?
error = False
if not hfs.exists(conf['sex_metadata_file']):
print("error: identify_inconsistencies: missing input: sex_metadata_file")
error = True
if error:
print("skip identify_inconsistencies because of previous errors")
return

print("Annotating samples with inconsistencies:")
qc_ht = mt.cols()
#convert is_female boolean to sex
@@ -73,8 +109,8 @@ def identify_inconsistencies(mt: hl.MatrixTable, mtdir: str, annotdir: str, reso
qc_ht = qc_ht.annotate(sex=sex_expr).key_by('s')

#annotate with manifest sex - keyed on ega to match identifiers in matrixtable
metadata_file = resourcedir + '/mlwh_sample_and_sex.txt'
metadata_ht = hl.import_table(metadata_file, delimiter="\t").key_by('accession_number')

metadata_ht = hl.import_table(path_spark(conf['sex_metadata_file']), delimiter="\t").key_by('accession_number')
#we only want those from the metadata file where sex is known
metadata_ht = metadata_ht.filter((metadata_ht.gender == 'Male') | (metadata_ht.gender == 'Female'))

@@ -84,41 +120,45 @@ def identify_inconsistencies(mt: hl.MatrixTable, mtdir: str, annotdir: str, reso
#identify samples where imputed sex and manifest sex conflict
conflicting_sex_ht = ht_joined.filter(((ht_joined.sex == 'male') & (ht_joined.manifest_sex == 'Female')) | (
(ht_joined.sex == 'female') & (ht_joined.manifest_sex == 'Male')))
conflicting_sex_ht.export(annotdir + '/conflicting_sex.txt.bgz')

#identify samples where f stat is between 0.2 and 0.8
f_stat_ht = qc_ht.filter( (qc_ht.f_stat > 0.2) & (qc_ht.f_stat < 0.8) )
f_stat_ht.export(annotdir + '/sex_annotation_f_stat_outliers.txt.bgz')
# TODO: do we need this redundancy? the paths already have the "file://" prefix
conflicting_sex_ht.export(path_spark(conf['conflicting_sex_report_file'])) # output

#identify samples where f stat is between fstat_low and fstat_high
f_stat_ht = qc_ht.filter( (qc_ht.f_stat > conf['fstat_low']) & (qc_ht.f_stat < conf['fstat_high']) )
f_stat_ht.export(path_spark(conf['fstat_outliers_report_file'])) # output


def main():
#set up
inputs = parse_config()
config = parse_config()
#importmtdir = inputs['load_matrixtables_lustre_dir']
mtdir = inputs['matrixtables_lustre_dir']
annotdir = inputs['annotation_lustre_dir']
resourcedir = inputs['resource_dir']

#initialise hail
tmp_dir = "hdfs://spark-master:9820/"
sc = pyspark.SparkContext()
#initialise hailS
tmp_dir = config['general']['tmp_dir']
# sc = pyspark.SparkContext()
sc = pyspark.SparkContext.getOrCreate()
hadoop_config = sc._jsc.hadoopConfiguration()
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38", idempotent=True)

mt_in_file = mtdir + "/gatk_unprocessed.mt"
mt_infile = config['step1']['gatk_mt_outfile'] # input from 1.1
print("Reading input matrix")
mt_unfiiltered = hl.read_matrix_table(mt_in_file)
mt_unfiltered = hl.read_matrix_table(path_spark(mt_infile))

#apply hard fitlers
mt_filtered = apply_hard_filters(mt_unfiiltered, mtdir)
mt_filtered = apply_hard_filters(mt_unfiltered, config)

#impute sex
mt_sex = impute_sex(mt_filtered, mtdir, annotdir, male_threshold=0.6)
mt_sex = impute_sex(mt_filtered, config)

# TODO: where is this function?
# annotate_ambiguous_sex(mt_sex, mtdir)
identify_inconsistencies(mt_sex, mtdir, annotdir, resourcedir)


# TODO: make this optional and check how it affects the downstream steps
# there is no metadata for our contrived test datasets
#identify_inconsistencies
identify_inconsistencies(mt_sex, config)

if __name__ == '__main__':
main()
