separated routines for merging HLA alleles to allow incorporating custom alleles more easily
ylab-hi · Jan 16, 2024 · 97a9eeb · 97a9eeb
1 parent 32e81c8
commit 97a9eeb
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 55 deletions.
diff --git a/.tests/integration/config_basic/config.yaml b/.tests/integration/config_basic/config.yaml
@@ -5,14 +5,14 @@ basequal: 20  # overall required base quality
 
 ### data
 data:
-  name:  patient2_test
+  name:  basic_sample
   dnaseq: 
     dna_normal: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz
     dna_tumor: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz
   rnaseq:
     rna_tumor: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz
   normal: dna_normal
-
+  
   custom:
     variants:
     hlatyping:
@@ -84,16 +84,15 @@ quantification:
   mode: BOTH # DNA, RNA or BOTH
 
 hlatyping:
-  class: BOTH # I, II or BOTH
-  mode: BOTH  # DNA, RNA or BOTH
+  class: I # I, II or BOTH
   # specific path for class II hlatyping (only required when class: II, or BOTH)
-  MHC-I_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
+  MHC-I_mode: DNA, RNA # comma-separated list of sources: DNA, RNA and/or custom (custom alleles are specified under data: custom: hlatyping)
   MHC-II_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
   freqdata: ./hlahd_files/freq_data/ 
   split: ./hlahd_files/HLA_gene.split.txt
   dict: ./hlahd_files/dictionary/
 
-priorization:
+prioritization:
   class: I # I, II or BOTH
   lengths:
     MHC-I: 8,9,10,11

diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ cd ScanNeo2
 
 ### Running the Workflow
 
-    To run the workflow, use the following command:
+To run the workflow, use the following command:
 
 ```bash
 cd /path/to/your/working/directory/

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -199,11 +199,11 @@ def aggregate_mhcI_PE(wildcards):
     no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no)
 
 
-def get_mhcI_alleles(wildcards):
+def get_predicted_mhcI_alleles(wildcards):
   values = []
 
   # routines to genotype from DNA
-  if config['hlatyping']['MHC-I_mode'] in ['DNA', 'BOTH']:
+  if "DNA" in config['hlatyping']['MHC-I_mode']:
     if config['data']['dnaseq'] is not None:
       for key in config['data']['dnaseq'].keys():
         if key not in config['data']['normal']:
@@ -216,7 +216,7 @@ def get_mhcI_alleles(wildcards):
       print('dnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is DNA or BOTH -- will be ignored')
 
   # routines to genotype from RNA
-  if config['hlatyping']['MHC-I_mode'] in ['RNA', 'BOTH']:
+  if "RNA" in config['hlatyping']['MHC-I_mode']:
     if config['data']['rnaseq'] is not None:
       for key in config['data']['rnaseq'].keys():
         if key not in config['data']['normal']:
@@ -230,8 +230,11 @@ def get_mhcI_alleles(wildcards):
 
 
   # if alleles have been specified in the config file, add them to the list
-  if config['data']['custom']['hlatyping']['MHC-I'] is not None:
-    values.append(config['custom']['hlatyping']['MHC-I'])
+  #if "custom" in config['hlatyping']['MHC-I_mode']:
+    #if config['data']['custom']['hlatyping']['MHC-I'] is not None:
+      #values.append(config['custom']['hlatyping']['MHC-I'])
+  #if config['data']['custom']['hlatyping']['MHC-I'] is not None:
+    #values.append(config['custom']['hlatyping']['MHC-I'])
 
 
   if len(values) == 0:
@@ -240,14 +243,28 @@ def get_mhcI_alleles(wildcards):
 
   return values
 
+def get_all_mhcI_alleles(wildcards):
+  """Collect all MHC-I allele sources for the sample in `wildcards`.
+
+  Returns a list holding the merged table of predicted alleles (when
+  "DNA" or "RNA" appears in config['hlatyping']['MHC-I_mode']) followed by
+  the custom entries from config['data']['custom']['hlatyping']['MHC-I']
+  (when "custom" appears in the mode). Prints a message and exits with
+  status 1 when neither source yields anything.
+  """
+  values = []
 
-##### MHC CLASS I
+  # NOTE(review): this path differs from the output declared by rule
+  # merge_predicted_mhcI_allels ("results/{sample}/hla/genotyping/mhc-I.tsv")
+  # -- confirm which of the two locations is intended.
+  if ("DNA" in config['hlatyping']['MHC-I_mode'] or
+      "RNA" in config['hlatyping']['MHC-I_mode']):
+    values += expand("results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv",
+                    sample = wildcards.sample)
 
+  # NOTE(review): `+=` assumes the custom entry is a list; a plain string
+  # would be added character by character -- confirm the config schema.
+  if "custom" in config["hlatyping"]["MHC-I_mode"]:
+    values += config["data"]["custom"]["hlatyping"]["MHC-I"]
 
+  if len(values) == 0:
+    print('No hla data found. Check config file for correct specification of data and hla genotyping mode')
+    sys.exit(1)
 
+  return values
 
 
 
+##### MHC CLASS I
+
+
 # returns list of hla typing results for the given sample and group
 
 ###### MHC Class II #########
@@ -555,6 +572,7 @@ def get_mhcI(wildcards):
   if config['prioritization']['class'] in ['I', 'BOTH']:
     alleles += expand("results/{sample}/hla/mhc-I.tsv",
                       sample=config['data']['name'])
+
   return alleles
 
 def get_mhcII(wildcards):

diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk
@@ -288,23 +288,41 @@ rule combine_mhcI_PE:
           '{input}' {output}
     """
 
-rule merge_mhcI_allels:
+rule merge_predicted_mhcI_allels:
   input:
-    get_mhcI_alleles
+    get_predicted_mhcI_alleles
   output:
-    "results/{sample}/hla/mhc-I.tsv",
+    "results/{sample}/hla/genotyping/mhc-I.tsv",
   message:
     "Merging HLA alleles from different sources"
   log:
-    "logs/{sample}/optitype/merge_classI_alleles.log"
+    "logs/{sample}/optitype/merge_predicted_mhc-I.log"
   conda:
     "../envs/basic.yml"
   threads: 1
   shell:
     """
-      python workflow/scripts/merge_mhcI_alleles.py \
+      python workflow/scripts/genotyping/merge_predicted_mhcI.py \
           '{input}' {output}
     """
+
+# Combines the allele sources returned by get_all_mhcI_alleles into
+# results/{sample}/hla/mhc-I.tsv; stdout and stderr of the script go to the log.
+rule combine_all_mhcI_alleles:
+  input:
+    get_all_mhcI_alleles
+  output:
+    "results/{sample}/hla/mhc-I.tsv"
+  message:
+    "Combining HLA alleles from different sources"
+  log:
+    "logs/{sample}/genotyping/combine_all_mhc-I.log"
+  conda:
+    "../envs/basic.yml"
+  threads: 1
+  shell:
+    """
+      python workflow/scripts/genotyping/combine_all_alleles.py \
+          '{input}' {output} > {log} 2>&1
+    """
 
 ######### MHC-II HLA GENOTYPING ###########
 rule filter_reads_mhcII_PE:

diff --git a/workflow/scripts/merge_mhcI_alleles.py b/workflow/scripts/merge_mhcI_alleles.py