Merge branch 'development'
wishabc committed Nov 11, 2019
2 parents 84f2f0b + 97178bd commit 3ff07f8
Showing 9 changed files with 74 additions and 50 deletions.
17 changes: 10 additions & 7 deletions scripts/ASBcalling/Aggregation.py
@@ -62,7 +62,7 @@ def get_another_agr(path, what_for):
tables = cell_lines_dict[key_name]
if what_for == "TF":
tables = tf_dict[key_name]
- print('Reading datasets for {} '.format(what_for) + key_name)
+ print('Reading datasets for {} {}'.format(what_for, key_name))
common_snps = dict()
for table in tables:
if os.path.isfile(table):
@@ -89,15 +89,15 @@ def get_another_agr(path, what_for):
dip_qual, lq, rq, seg_c, p_ref, p_alt,
table_name, another_agr)]

- print('Writing ', key_name)
+ print('Writing {}'.format(key_name))

with open(results_path + what_for + "_P-values/" + key_name + '_common_table.tsv', 'w') as out:
out.write(pack(['#chr', 'pos', 'ID', 'ref', 'alt', 'repeat_type', 'total_callers', 'unique_callers', 'm_ploidy',
'm_q', 'm_dipq', 'm_segc', 'm_datasets', 'maxdepth_ref/alt', 'maxdepth_ploidy', 'maxdepth_m1',
'maxdepth_m2', 'mostsig_ref/alt', 'mostsig_ploidy', 'mostsig_m1', 'mostsig_m2',
'min_cover', 'max_cover', 'med_cover', 'mean_cover', 'total_cover', 'm1_ref', 'm1_alt',
'm2_ref', 'm2_alt',
- 'm_hpref', 'm_hpalt', 'm_fpref', 'm_fpalt', 'm_stpref', 'm_stpalt']))
+ 'm_hpref', 'm_hpalt', 'm_fpref', 'm_fpalt', 'm_logpref', 'm_logpalt', 'm_stpref', 'm_stpalt']))

filtered_snps = dict()
for key in common_snps:
@@ -107,7 +107,7 @@ def get_another_agr(path, what_for):
break

counter = 0
- print(len(filtered_snps), 'snps')
+ print('{} snps'.format(len(filtered_snps)))

if len(filtered_snps) == 0:
sys.exit(0)
@@ -120,7 +120,7 @@ def get_another_agr(path, what_for):
value = filtered_snps[key]
counter += 1
if counter % 10000 == 0:
- print(counter, 'done')
+ print('done {}'.format(counter))
c_uniq_callers = dict(zip(callers_names, [False]*len(callers_names)))
m_total_callers = 0
c_ploidy = []
@@ -184,6 +184,8 @@ def get_another_agr(path, what_for):
m_hpalt = stats.hmean(c_palt)
m_fpref = stats.combine_pvalues(c_pref, method='fisher')[1]
m_fpalt = stats.combine_pvalues(c_palt, method='fisher')[1]
+ m_logpref = stats.combine_pvalues(c_pref, method='mudholkar_george')[1]
+ m_logpalt = stats.combine_pvalues(c_palt, method='mudholkar_george')[1]
m_stpref = stats.combine_pvalues(c_pref, method='stouffer')[1]
m_stpalt = stats.combine_pvalues(c_palt, method='stouffer')[1]

@@ -248,6 +250,7 @@ def get_another_agr(path, what_for):
m1_ref, m1_alt, m2_ref, m2_alt,
m_hpref, m_hpalt,
m_fpref, m_fpalt,
+ m_logpref, m_logpalt,
m_stpref, m_stpalt]))
origin_of_snp_dict["\t".join(map(str, key))] = {'aligns': c_table_names,
expected_args[what_for]: c_another_agr,
@@ -257,9 +260,9 @@ def get_another_agr(path, what_for):
print("Counting FDR")
with open(results_path + what_for + "_P-values/" + key_name + '_common_table.tsv', 'r') as f:
table = pd.read_table(f)
- bool_ar_ref, p_val_ref, _, _ = statsmodels.stats.multitest.multipletests(table["m_fpref"],
+ bool_ar_ref, p_val_ref, _, _ = statsmodels.stats.multitest.multipletests(table["m_logpref"],
alpha=0.05, method='fdr_bh')
- bool_ar_alt, p_val_alt, _, _ = statsmodels.stats.multitest.multipletests(table["m_fpalt"],
+ bool_ar_alt, p_val_alt, _, _ = statsmodels.stats.multitest.multipletests(table["m_logpalt"],
alpha=0.05, method='fdr_bh')
table["m_fdr_ref"] = pd.Series(p_val_ref)
table["m_fdr_alt"] = pd.Series(p_val_alt)
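
Note on the Aggregation.py change above: per-dataset p-values are now additionally combined with the Mudholkar-George (logit) method, and the FDR step switches from the Fisher columns to the new m_logpref/m_logpalt columns. A minimal, self-contained sketch of that pipeline with made-up p-values (not the repository's code; assumes SciPy >= 1.4, which provides method='mudholkar_george'):

from scipy import stats
from statsmodels.stats.multitest import multipletests

# Hypothetical per-SNP lists of per-dataset p-values
pvals_per_snp = [[0.01, 0.20, 0.03],
                 [0.50, 0.70, 0.90]]

# combine_pvalues returns (statistic, p-value); keep the p-value, as the diff does
m_logp = [stats.combine_pvalues(p, method='mudholkar_george')[1]
          for p in pvals_per_snp]

# Benjamini-Hochberg correction over the combined column, as in the FDR block above
reject, p_fdr, _, _ = multipletests(m_logp, alpha=0.05, method='fdr_bh')
print(p_fdr)
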
13 changes: 8 additions & 5 deletions scripts/CORRELATIONanalysis/Annotate_SNPs_with_ploidy.py
@@ -8,7 +8,10 @@


def unpack_ploidy_segments(line):
+ if line[0] == '#':
+ return [''] * 6
line = line.strip().split('\t')

return [line[0], int(line[1]), int(line[2]), float(line[3]), int(line[4]), int(line[7])]


@@ -33,14 +36,14 @@ def unpack_snps(line):
lab = file_name.split('!')[1][:-4]

try:
- aligns = list(set(aligns_by_cell_type[file_name[:-4]])) # .tsv
- datasetsn = len(aligns)
- al_list = [align[29:-7] for align in aligns]
+ aligns = aligns_by_cell_type[file_name[:-4]] # .tsv
+ al_list = [align[29:-7] for align in aligns if os.path.isfile(align)]
+ datasetsn = len(al_list)
except KeyError:
datasetsn = 'nan'
al_list = []
print(file_name)

names, _ = read_synonims()
if name in names:
table_path = ploidy_path + file_name
@@ -50,7 +53,7 @@ def unpack_snps(line):
ploidy_file_path = ploidy_path + mode + '/' + name + '!' + lab + '_ploidy.tsv'
out_path = correlation_path + mode + '_tables/' + name + '_' + lab.replace('_', '-') + '.tsv'
print(out_path)

with open(table_path, 'r') as table, open(ploidy_file_path, 'r') as ploidy, open(out_path, 'w') as out:
out.write('#' + str(datasetsn) + '!' + lab + '!' + '>'.join(al_list) + '\n')
for chr, pos, ref, alt, in_intersection, segment_ploidy, qual, segn \
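
Note on the Annotate_SNPs_with_ploidy.py change above: the dataset count is now derived from alignments whose files actually exist on disk rather than from the raw list. A toy sketch of the new filter with hypothetical paths (the align[29:-7] slice is the repository's own name trimming and is left out here):

import os

aligns = ['/hypothetical/align_1.table', '/hypothetical/align_2.table']
al_list = [a for a in aligns if os.path.isfile(a)]  # keep only files present on disk
datasetsn = len(al_list)                            # count datasets from the filtered list
print(datasetsn)
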
80 changes: 49 additions & 31 deletions scripts/CORRELATIONanalysis/CorStats.py
@@ -3,7 +3,7 @@
from scipy.stats import kendalltau

sys.path.insert(1, '/home/abramov/ASB-Project')
- from scripts.HELPERS.helpers import CorrelationReader, Intersection, pack, read_synonims
+ from scripts.HELPERS.helpers import CorrelationReader, Intersection, pack, read_synonims, ChromPos
from scripts.HELPERS.paths import parameters_path, correlation_path, heatmap_data_path

CGH_path = parameters_path + 'CHIP_hg38.sorted.bed'
@@ -23,7 +23,7 @@ def count_cosmic_segments():

def unpack_cosmic_segments(line, mode='normal'):
if line[0] == '#':
- return []
+ return [''] * len(line.strip().split('\t'))
line = line.strip().split('\t')

if line[0] != cosmic_names[cell_line_name]:
@@ -41,32 +41,51 @@ def unpack_cosmic_segments(line, mode='normal'):
return [line[1], int(line[2]), int(line[3]), value]


- def correlation_with_cosmic(SNPs_iterator, mode, heatmap_file_path=None):
- heat_map = None
- if heatmap_file_path is not None:
- heat_map = open(heatmap_file_path, 'w')
+ def correlation_with_cosmic(SNP_objects, mode, heatmap_data_file=None):
+ if heatmap_data_file is not None:
+ heatmap = open(heatmap_data_file, 'w')
+ cosmic_segments = []
with open(cosmic_path, 'r') as cosmic_file:
- snp_bad_list = []
- cosmic_bad_list = []
- for chr, pos, ploidy, qual, segn, in_intersect, cosmic_bad \
- in Intersection(SNPs_iterator, cosmic_file, write_intersect=True,
- unpack_segments_function=lambda x: unpack_cosmic_segments(x, mode=mode),
- write_segment_args=True):
- if not in_intersect:
- continue
- snp_bad_list.append(ploidy)
- cosmic_bad_list.append(cosmic_bad)
-
- if heat_map is not None:
- heat_map.write(pack([chr, pos, ploidy, cosmic_bad]))
- if heat_map is not None:
- heat_map.close()
-
- if len(snp_bad_list) != 0:
- return kendalltau(snp_bad_list, cosmic_bad_list)[0]
+ for line in cosmic_file:
+ # TODO: change v name
+ v = unpack_cosmic_segments(line, mode=mode)
+ if v:
+ cosmic_segments.append(v)
+ snp_ploidy = []
+ cosm_ploidy = []
+ for chr, pos, ploidy, qual, segn, in_intersect, cosmic_ploidy \
+ in Intersection(SNP_objects, cosmic_segments, write_intersect=True,
+ write_segment_args=True):
+ if not in_intersect:
+ continue
+ snp_ploidy.append(ploidy)
+ cosm_ploidy.append(cosmic_ploidy)
+
+ if heatmap_data_file is not None:
+ heatmap.write(pack([chr, pos, ploidy, cosmic_ploidy]))
+ if heatmap_data_file is not None:
+ heatmap.close()
+
+ if len(snp_ploidy) != 0:
+ return kendalltau(snp_ploidy, cosm_ploidy)[0]
return 'NaN'


+ def find_nearest_probe_to_SNP(SNP_objects, CGH_objects):
+ nearest_probes = []
+ if not CGH_objects:
+ return []
+ i = 0
+ for SNP in SNP_objects:
+ SNP = [ChromPos(SNP[0], SNP[1])] + SNP[2:]
+ current_distance = SNP[0].distance(ChromPos(CGH_objects[i][0], CGH_objects[i][1]))
+ while SNP[0].distance(ChromPos(CGH_objects[i + 1][0], CGH_objects[i + 1][1])) <= current_distance:
+ current_distance = SNP[0].distance(ChromPos(CGH_objects[i + 1][0], CGH_objects[i + 1][1]))
+ i += 1
+ nearest_probes.append(CGH_objects[i])
+ return nearest_probes


if __name__ == '__main__':
file_name = sys.argv[1]
print(file_name)
@@ -112,22 +131,19 @@ def correlation_with_cosmic(SNPs_iterator, mode, heatmap_file_path=None):
segment_numbers[model] = segments_number
corr_to_objects[model] = correlation_with_cosmic(SNP_objects,
mode='normal',
- heatmap_file_path=heatmap_data_file)
+ heatmap_data_file=heatmap_data_file)

for naive_mode in naive_modes:
number_of_datasets, lab, SNP_objects, aligns, segments_number = reader.read_SNPs(method=naive_mode)

corr_to_objects[naive_mode] = correlation_with_cosmic(SNP_objects, mode='normal')

- # TODO: add 3-5 neighbours naive
-
- # TODO: add closest chip COR
-
- # print('reading CGH')
CGH_objects = reader.read_CGH(cgh_names[cell_line_name])
+ nearest_cgh_objects = find_nearest_probe_to_SNP(SNP_objects, CGH_objects)

corr_to_objects_chip = correlation_with_cosmic(CGH_objects, mode='total')

+ corr_to_objects_chip_nearest = correlation_with_cosmic(nearest_cgh_objects, mode='total')
out_line = '\t'.join(map(lambda x: '\t'.join(map(str, x)),

[[cell_line_name, lab, aligns, len(SNP_objects), number_of_datasets,
@@ -139,7 +155,9 @@ def correlation_with_cosmic(SNPs_iterator, mode, heatmap_file_path=None):
[[corr_to_objects[naive_mode]]
for naive_mode in naive_modes] +

- [[corr_to_objects_chip]]
+ [[corr_to_objects_chip]] +
+
+ [[corr_to_objects_chip_nearest]]

)) + '\n'
out.write(out_line)
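
Note on the new find_nearest_probe_to_SNP above: since SNPs and CGH probes are both sorted by genomic position, one forward-moving index finds each SNP's closest probe in a single pass, and the resulting probe track is then correlated with COSMIC ploidy via Kendall's tau like the other tracks. A simplified, self-contained sketch of the idea using plain integer positions on one chromosome (the real code compares ChromPos objects; the bounds check on i + 1 is added here for the toy lists):

from scipy.stats import kendalltau

def nearest_probes(snp_pos, probe_pos):
    # both inputs assumed sorted; the probe index only ever moves forward
    nearest = []
    i = 0
    for s in snp_pos:
        while i + 1 < len(probe_pos) and abs(probe_pos[i + 1] - s) <= abs(probe_pos[i] - s):
            i += 1
        nearest.append(probe_pos[i])
    return nearest

snps = [100, 250, 900]         # hypothetical sorted SNP positions
probes = [90, 300, 850, 1000]  # hypothetical sorted probe positions
print(nearest_probes(snps, probes))  # -> [90, 300, 850]

# Kendall's tau, as used in correlation_with_cosmic
tau, _ = kendalltau([1, 2, 2, 3], [1.5, 2.0, 2.5, 3.5])
print(tau)
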
2 changes: 1 addition & 1 deletion scripts/CORRELATIONanalysis/JoinThreads.py
@@ -21,7 +21,7 @@ def get_name_by_dir(dir_name):
'cor_by_snp' + get_name_by_dir(snp_dir)]
for snp_dir in snp_dirs] +
[['cor_by_snp_naive',
- 'cor_by_probe_CGH']]
+ 'cor_by_probe_CGH', 'cor_by_snp_probe_CGH']]
)) + '\n')

for file_name in os.listdir(Correlation_path):
3 changes: 2 additions & 1 deletion scripts/HELPERS/helpers.py
@@ -103,7 +103,6 @@ def get_next_snp(self):
def get_next_segment(self):
try:
seg_chr, start_pos, end_pos, *self.seg_args = self.unpack_segments_function(next(self.segments))
-
self.segment_start = ChromPos(seg_chr, start_pos)
self.segment_end = ChromPos(seg_chr, end_pos)
except StopIteration:
@@ -312,6 +311,8 @@ def read_CGH(self, cgh_name):
'LC:NCI-H460', 'LC:NCI-H522', 'OV:IGROV1', 'OV:OVCAR-3', 'OV:OVCAR-4', 'OV:OVCAR-5', 'OV:OVCAR-8',
'OV:SK-OV-3', 'OV:NCI/ADR-RES', 'PR:PC-3', 'PR:DU-145', 'RE:786-0', 'RE:A498', 'RE:ACHN',
'RE:CAKI-1', 'RE:RXF 393', 'RE:SN12C', 'RE:TK-10', 'RE:UO-31']
+ if cgh_name not in cgnames:
+ return []
idx = cgnames.index(cgh_name) + 3
with open(self.CGH_path, 'r') as file:
result = []
2 changes: 1 addition & 1 deletion scripts/PARAMETERS/cor.sh
@@ -1,6 +1,6 @@
#!/bin/bash

- ParametersListsFolder="/home/abramov/ParallelParameters/stats/"
+ ParametersListsFolder="/home/abramov/ParallelParameters/"
ScriptsFolder="/home/abramov/ASB-Project/scripts/"

njobs=$1
1 change: 0 additions & 1 deletion scripts/PLOIDYcalling/PloidyEstimation.py
@@ -22,7 +22,6 @@ def __init__(self):
self.border_numbers = None
self.positions = []

- # TODO: make abstract
self.LINES = None
self.last_snp_number = None
self.candidate_numbers = None
4 changes: 2 additions & 2 deletions scripts/SNPcalling/ProcessLine.sh
@@ -42,8 +42,8 @@ if [ "$TF" != "None" ]; then
OutPath=${AlignmentsPath}"EXP/$TF/$ExpName/"
AlignmentFullPath=${AlignmentsPath}"EXP/$TF/$ExpName/$AlignName.bam"
else
- if ! [ -d ${AlignmentsPath}"CTRL/$EXP" ]; then
- if ! mkdir ${AlignmentsPath}"CTRL/$EXP"
+ if ! [ -d ${AlignmentsPath}"CTRL/$ExpName" ]; then
+ if ! mkdir ${AlignmentsPath}"CTRL/$ExpName"
then
echo "Failed to make dir $ExpName"
exit 1
2 changes: 1 addition & 1 deletion scripts/SNPcalling/SNPcalling.sh
@@ -75,7 +75,7 @@ if ! $Java $JavaParameters -jar "$PICARD" \
REMOVE_DUPLICATES=true \
M="$OutPath/${BamName}_metrics.txt"
then
echo "Failed to picard.MarkDplicates"
echo "Failed to picard.MarkDuplicates"
exit 1
fi

