From 8427e7f0321298cfd9cdfbedf02679e26e504289 Mon Sep 17 00:00:00 2001 From: benpullman Date: Thu, 30 Jun 2022 19:25:07 -0700 Subject: [PATCH 01/22] update mappings to separate contaminants --- peptide_statistics_hpp/binding.xml | 6 +++++ peptide_statistics_hpp/flow.xml | 15 ++++++++---- peptide_statistics_hpp/input.xml | 14 +++++++++++ peptide_statistics_hpp/tool.xml | 10 ++++++-- .../cosine_to_synthetics.py | 3 ++- .../download_latest_kb.py | 5 ++-- tools/peptide_statistics_hpp/map_peptides.py | 3 ++- .../peptide_protein_cosine.py | 23 ++++++++++++------- 8 files changed, 61 insertions(+), 18 deletions(-) diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml index 18189bb..827e7de 100644 --- a/peptide_statistics_hpp/binding.xml +++ b/peptide_statistics_hpp/binding.xml @@ -29,6 +29,9 @@ + + + @@ -54,6 +57,7 @@ + @@ -61,6 +65,7 @@ + @@ -88,6 +93,7 @@ + diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml index efff10a..374bc19 100644 --- a/peptide_statistics_hpp/flow.xml +++ b/peptide_statistics_hpp/flow.xml @@ -13,6 +13,7 @@ + @@ -41,6 +42,8 @@ + + @@ -60,7 +63,9 @@ - + + + @@ -68,8 +73,9 @@ - - + + + @@ -112,7 +118,8 @@ - + + diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml index f30a264..279e804 100644 --- a/peptide_statistics_hpp/input.xml +++ b/peptide_statistics_hpp/input.xml @@ -13,6 +13,10 @@ + + + + @@ -227,6 +231,16 @@ Proteome DB + + + + + + + Proteome DB - Contaminants + diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml index bbe1f8e..afd2332 100644 --- a/peptide_statistics_hpp/tool.xml +++ b/peptide_statistics_hpp/tool.xml @@ -68,7 +68,8 @@ - + + @@ -76,6 +77,7 @@ + @@ -85,11 +87,13 @@ + + @@ -137,6 +141,7 @@ + @@ -157,7 +162,8 @@ - + + diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py index aa4496c..68da376 100644 --- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py +++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py @@ -95,7 +95,8 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s precursor_filter_window=1.5, low_mass_filter=low_mass_filter, isobaric_tag_type=None, - min_snr=min_snr + min_snr=min_snr, + num_top_unannotated_envelopes_to_remove=2 ) ion_vector = spectrum._replace(peaks = ion_vector) ion_vector = processing.normalize_spectrum(ion_vector) diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py index db56a11..1d63925 100644 --- a/tools/peptide_statistics_hpp/download_latest_kb.py +++ b/tools/peptide_statistics_hpp/download_latest_kb.py @@ -10,7 +10,8 @@ def arguments(): parser = argparse.ArgumentParser(description='mzTab to list of peptides') parser.add_argument('-p','--params', type = str, help='Input Parameters') parser.add_argument('-c','--comparisons', type = Path, help='Comparison Jobs') - parser.add_argument('-f','--proteome_fasta', type = Path, help='FASTA File') + parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database') + parser.add_argument('-m','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database') parser.add_argument('-b','--backup_kb_pep', type = Path, help='Backup KB Peptides') parser.add_argument('-t','--use_job_level_thresholds', type = bool, help='Use job level thresholds',default=True) parser.add_argument('-k','--kb_pep', type = Path, help='Output KB Peptides') @@ -65,7 +66,7 @@ def main(): try: - proteome = mapping.add_decoys(mapping.read_uniprot(args.proteome_fasta)) + proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]) with open(args.kb_pep, 'w') as w: r = csv.DictWriter(w, delimiter = '\t', fieldnames = header) diff --git a/tools/peptide_statistics_hpp/map_peptides.py b/tools/peptide_statistics_hpp/map_peptides.py index 86e3308..40d298e 100755 --- a/tools/peptide_statistics_hpp/map_peptides.py +++ b/tools/peptide_statistics_hpp/map_peptides.py @@ -9,6 +9,7 @@ def arguments(): parser = argparse.ArgumentParser(description='Map Peptides') parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database') + parser.add_argument('-c','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database') parser.add_argument('-e','--exon_fasta', type = Path, help='Input FASTA Exon Mapping Database') parser.add_argument('-p','--peptide_list', type = Path, help='Peptide List') parser.add_argument('-o','--output_folder', type = Path, help='Output Folder') @@ -28,7 +29,7 @@ def load_peptide_list(input_peptide_filename): def main(): args = arguments() - proteome = mapping.read_uniprot(args.proteome_fasta) + proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]) proteome_with_decoys = mapping.add_decoys(proteome) peptide_list = load_peptide_list(args.peptide_list) diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 50cb0e0..5bb6d28 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -16,7 +16,8 @@ def arguments(): parser = argparse.ArgumentParser(description='mzTab to list of peptides') parser.add_argument('--comparison_pep', type = Path, help='Peptides to Compare') - parser.add_argument('--fasta', type = Path, help='Input FASTA') + parser.add_argument('--proteome_fasta', type = Path, help='Input Proteome FASTA') + parser.add_argument('--contaminants_fasta', type = Path, help='Input Contaminants FASTA') parser.add_argument('--input_psms', type = Path, help='Input PSMs') parser.add_argument('--input_psms_external', type = Path, help='Input PSMs (External)') parser.add_argument('--input_peptides', type = Path, help='Input PSMs (External)') @@ -92,7 +93,7 @@ def add_brackets(pep): pep = pep[:breakpoint] + end_bracket + pep[breakpoint:] return pep -protein_type = lambda protein, proteome: 'TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform') +protein_type = lambda protein, proteome: 'Contaminant' if proteome.proteins[protein].db == 'con' else ('TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform')) def msv_to_pxd(msv, msv_mapping): output_mapping = msv_mapping.get(msv,{}).get('px_accession') @@ -164,8 +165,9 @@ def main(): args = arguments() representative_per_precursor = {} + variant_to_precursors = defaultdict(list) - proteome = mapping.add_decoys(mapping.read_uniprot(args.fasta)) + proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])) all_datasets = set() datasets_per_sequence= defaultdict(set) @@ -237,6 +239,10 @@ def update_precursor_representative(l,from_psm = True): datasets = set([d for d in l.get('datasets','').split(';') if d != '']) tasks = set([t for t in l.get('tasks','').split(';') if t != '']) sequence, charge = l['sequence'],l['charge'] + precursor_theoretical_mz = theoretical_mz(sequence, charge) + aa_seq = ''.join([a for a in sequence if a.isalpha()]) + variant = (aa_seq, charge, int(precursor_theoretical_mz)) + if not (sequence, charge) in representative_per_precursor: representative_per_precursor[(sequence, charge)] = l.copy() precursor_representative = representative_per_precursor[(sequence, charge)] @@ -287,7 +293,7 @@ def update_precursor_representative(l,from_psm = True): precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi'] precursor_representative['score'] = float(l['score']) #consider best EI And matched ions over all representatives - if float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): + if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): precursor_representative['explained_intensity'] = l['explained_intensity'] precursor_representative['matched_ions'] = l['matched_ions'] @@ -447,7 +453,8 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, proteins = l['protein'].split(' ###')[0].split(';') # PSM-level FDR was inefficient at this scale - need to rethink #if row_pass_filters(l): - all_psms_with_score.append(fdr.ScoredElement(l['usi'],'XXX_' in proteins[0],l['score'])) + all_targets = [p for p in proteins if 'XXX_' not in p] + all_psms_with_score.append(fdr.ScoredElement(l['usi'],len(all_targets)==0,l['score'])) l.pop('mapped_proteins') l.pop('hpp') l.pop('len') @@ -488,7 +495,7 @@ def output_protein_level_results(best_psm): proteins = [p for p in best_psm['protein'].split(' ###')[0].split(';') if p != ''] if row_pass_filters(best_psm): - if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and 'Canonical' in best_psm.get('protein_type',''): + if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')): pos = (int(best_psm['aa_start']),int(best_psm['aa_end'])) if best_psm.get('hpp_match','') == 'Yes': precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge'])))) @@ -497,7 +504,7 @@ def output_protein_level_results(best_psm): precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge'])))) proteins = peptide_to_protein.get(sequence_il,[]) - cannonical_proteins = [protein for protein in proteins if proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso] + cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes': @@ -635,7 +642,7 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5): r.writeheader() for precursor in all_precursors: proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != ''] - if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and 'Canonical' in precursor.get('protein_type',''): + if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')): precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1) precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1) else: From e56b10a77ad559a87d74d4c2d7ea602b495ea4f2 Mon Sep 17 00:00:00 2001 From: benpullman Date: Wed, 6 Jul 2022 10:08:18 -0700 Subject: [PATCH 02/22] use variants --- .../download_latest_kb.py | 2 +- .../peptide_protein_cosine.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py index 1d63925..d3457a8 100644 --- a/tools/peptide_statistics_hpp/download_latest_kb.py +++ b/tools/peptide_statistics_hpp/download_latest_kb.py @@ -66,7 +66,7 @@ def main(): try: - proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]) + proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])) with open(args.kb_pep, 'w') as w: r = csv.DictWriter(w, delimiter = '\t', fieldnames = header) diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 5bb6d28..badfaec 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -165,7 +165,7 @@ def main(): args = arguments() representative_per_precursor = {} - variant_to_precursors = defaultdict(list) + variant_to_best_precursor = {} proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])) @@ -241,13 +241,17 @@ def update_precursor_representative(l,from_psm = True): sequence, charge = l['sequence'],l['charge'] precursor_theoretical_mz = theoretical_mz(sequence, charge) aa_seq = ''.join([a for a in sequence if a.isalpha()]) + variant = (aa_seq, charge, int(precursor_theoretical_mz)) - - if not (sequence, charge) in representative_per_precursor: + update_peptidoform = False + + if not variant in variant_to_best_precursor: + variant_to_best_precursor[variant] = (sequence, charge) representative_per_precursor[(sequence, charge)] = l.copy() precursor_representative = representative_per_precursor[(sequence, charge)] precursor_representative['datasets'] = datasets precursor_representative['tasks'] = tasks + precursor_representative['parent_mass'] = precursor_theoretical_mz if from_psm: precursor_representative['database_filename'] = l['filename'] precursor_representative['database_scan'] = l['scan'] @@ -259,7 +263,7 @@ def update_precursor_representative(l,from_psm = True): precursor_representative.pop('scan') precursor_representative.pop('usi') - precursor_representative = representative_per_precursor[(sequence, charge)] + precursor_representative = representative_per_precursor[variant_to_best_precursor[variant]] precursor_representative['datasets'] |= datasets precursor_representative['tasks'] |= tasks @@ -292,6 +296,7 @@ def update_precursor_representative(l,from_psm = True): precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan'] precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi'] precursor_representative['score'] = float(l['score']) + update_peptidoform = True #consider best EI And matched ions over all representatives if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): precursor_representative['explained_intensity'] = l['explained_intensity'] @@ -311,6 +316,11 @@ def update_precursor_representative(l,from_psm = True): else: precursor_representative['cosine_score_match'] = 'No' + if update_peptidoform: + representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant]) + variant_to_best_precursor[variant] = (sequence, charge) + + def update_mappings(protein_coverage_file,update_precursor_representatives): if(protein_coverage_file.is_file()): print("{}: Loading {} ({} cumulative peptides @ {} peptides/second)".format(datetime.now().strftime("%H:%M:%S"),protein_coverage_file,len(pep_mapping_info),len(pep_mapping_info)/(1+(datetime.now()-start_time).seconds))) From 77c17b94b63c8712e866f8eca6361f0a938ea956 Mon Sep 17 00:00:00 2001 From: benpullman Date: Wed, 6 Jul 2022 16:13:05 -0700 Subject: [PATCH 03/22] add non-numeric filter to PE column --- peptide_statistics_hpp/result.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml index 81de135..9099946 100644 --- a/peptide_statistics_hpp/result.xml +++ b/peptide_statistics_hpp/result.xml @@ -147,7 +147,7 @@ - + - + + + + + + diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 34b15b2..d4a2d2b 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -71,6 +71,16 @@ def unit_delta(aa1, aa2): no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()]) +def seq_theoretical_mass(sequence): + aa = ''.join([a for a in sequence if a.isalpha()]) + mods = ''.join([m for m in sequence if not m.isalpha()]) + if len(mods) > 0: + mods = eval(mods) + else: + mods = 0 + return (theoretical_mass(aa) + mods + 1.007276035) + + def theoretical_mz(sequence,charge): aa = ''.join([a for a in sequence if a.isalpha()]) mods = ''.join([m for m in sequence if not m.isalpha()]) @@ -346,8 +356,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps precursor_representative['best_overall_psm_score'] = float(l['score']) if (this_pass_ei or this_pass_cos) and this_pass_by: - precursor_representative['filtered_psms'] += l.get('filtered_psms',1) - precursor_representative['total_psms'] += l.get('total_psms',1) + precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1)) + precursor_representative['total_psms'] += int(l.get('total_psms',1)) precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant]) @@ -545,10 +555,10 @@ def output_protein_level_results(best_psm): if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')): pos = (int(best_psm['aa_start']),int(best_psm['aa_end'])) if best_psm.get('hpp_match','') == 'Yes': - precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge'])) - precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge'])) + precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) + precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) for protein in proteins: - precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge'])) + precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) proteins = peptide_to_protein.get(sequence_il,[]) cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] @@ -683,8 +693,9 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg if args.output_peptides: with open(args.output_peptides,'w') as w: - header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] + header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff'] + header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score'] r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore') r.writeheader() for precursor in all_precursors: From baf1bcc8c64ff871de2fe982f71729eb4ddd3bb5 Mon Sep 17 00:00:00 2001 From: benpullman Date: Sun, 10 Jul 2022 14:57:38 -0700 Subject: [PATCH 18/22] update outputs to be better named --- peptide_statistics_hpp/input.xml | 21 +++++++++---------- peptide_statistics_hpp/result.xml | 6 +++--- peptide_statistics_hpp/tool.xml | 2 +- .../peptide_protein_cosine.py | 6 +++--- tools/peptide_statistics_hpp/read_mappings.py | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml index 0fb3d02..64dae31 100644 --- a/peptide_statistics_hpp/input.xml +++ b/peptide_statistics_hpp/input.xml @@ -45,8 +45,8 @@ - - + + @@ -84,22 +84,21 @@ - - + @@ -136,7 +135,7 @@ - + @@ -396,14 +395,14 @@ - + diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml index 152981b..de2ceab 100644 --- a/peptide_statistics_hpp/result.xml +++ b/peptide_statistics_hpp/result.xml @@ -216,7 +216,7 @@ - + @@ -413,8 +413,8 @@ - - + + diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml index a13331a..64ebe25 100644 --- a/peptide_statistics_hpp/tool.xml +++ b/peptide_statistics_hpp/tool.xml @@ -187,7 +187,7 @@ - + diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index d4a2d2b..db9ff2a 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -636,7 +636,7 @@ def output_protein_level_results(best_psm): seen_picked = set() - def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggregation_func = lambda xs: sum(xs)): + def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregation_func = lambda xs: sum(xs)): used_precursors = [] for precursor in sorted(precursor_list, key = lambda x: x[0], reverse = True): found = False @@ -701,7 +701,7 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg for precursor in all_precursors: proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != ''] if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')): - precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1) + precursor['picked_protein_fdr'] = min(leftover_fdr_dict.get(proteins[0],1),1) precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1) else: precursor['picked_protein_fdr'] = 1 @@ -835,7 +835,7 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg } pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison - pass_picked_fdr, pass_hpp_fdr = protein_dict['picked_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr + pass_picked_fdr, pass_hpp_fdr = protein_dict['leftover_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr if args.main_fdr == 'traditional': pass_picked_fdr, pass_hpp_fdr = protein_dict['common_fdr'] <= args.picked_protein_fdr, protein_dict['common_fdr'] <= args.hpp_protein_fdr diff --git a/tools/peptide_statistics_hpp/read_mappings.py b/tools/peptide_statistics_hpp/read_mappings.py index 8fb0a9f..a64d7fb 100644 --- a/tools/peptide_statistics_hpp/read_mappings.py +++ b/tools/peptide_statistics_hpp/read_mappings.py @@ -36,7 +36,7 @@ def read_protein_coverage(protein_coverage_file,seen_sequences,proteome, filter mapped_protein_str = l['mapped_proteins'] if 'mapped_proteins' in l else l['all_proteins_w_coords'] mapped_exon_str = l['mapped_exons'] if 'mapped_exons' in l else l['all_mapped_exons'] - all_protein_fdr = float(l.get('picked_protein_fdr',-1)) + all_protein_fdr = float(l.get('leftover_protein_fdr',-1)) hpp_protein_fdr = float(l.get('hpp_protein_fdr',-1)) cosine = float(l.get('cosine',-1)) From 4b8aab5299f25ca3dba030668ed4445836b98cd1 Mon Sep 17 00:00:00 2001 From: benpullman Date: Mon, 11 Jul 2022 18:09:20 -0700 Subject: [PATCH 19/22] update to fix bug in EI representative --- peptide_statistics_hpp/result.xml | 19 ++++++++++--------- .../peptide_protein_cosine.py | 14 ++++++++++---- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml index de2ceab..01c5967 100644 --- a/peptide_statistics_hpp/result.xml +++ b/peptide_statistics_hpp/result.xml @@ -35,14 +35,15 @@ + + - - - - - - + + + + + @@ -73,7 +74,7 @@ - + @@ -83,7 +84,7 @@ - + @@ -94,7 +95,7 @@ - + diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index db9ff2a..53e746c 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -333,10 +333,16 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi'] precursor_representative['score'] = float(l['score']) update_peptidoform = True - #consider best EI And matched ions over all representatives - if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): - precursor_representative['explained_intensity'] = l['explained_intensity'] - precursor_representative['matched_ions'] = l['matched_ions'] + + #best EI does not pass #breaks threshold but current one does + if not best_pass_by and this_pass_by: + precursor_representative['explained_intensity'] = l['explained_intensity'] + precursor_representative['matched_ions'] = l['matched_ions'] + #two situations, either both pass #threshold or both fail #threshold + elif best_pass_by is this_pass_by and float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): + precursor_representative['explained_intensity'] = l['explained_intensity'] + precursor_representative['matched_ions'] = l['matched_ions'] + #otherwise best already passed #threshold, so we skip it if best_cosine and float(l['cosine']) >= 0 and l['synthetic_usi'] != 'N/A': precursor_representative['cosine_filename'] = l['filename'] if from_psm else l['cosine_filename'] From f9c160d366dcf765de3831e6fe46e95841298776 Mon Sep 17 00:00:00 2001 From: benpullman Date: Fri, 15 Jul 2022 11:31:24 -0700 Subject: [PATCH 20/22] update peptide protein cosine --- tools/peptide_statistics_hpp/peptide_protein_cosine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 53e746c..06cd849 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -331,7 +331,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps precursor_representative['database_filename'] = l['filename'] if from_psm else l['database_filename'] precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan'] precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi'] - precursor_representative['score'] = float(l['score']) + precursor_representative['score'] = score update_peptidoform = True #best EI does not pass #breaks threshold but current one does From 87388477434b96c728acaab5cc3199fe7ba2f156 Mon Sep 17 00:00:00 2001 From: benpullman Date: Fri, 5 Aug 2022 10:05:09 -0700 Subject: [PATCH 21/22] changes for window filters --- peptide_statistics_hpp/result.xml | 4 ++- .../cosine_to_synthetics.py | 4 ++- .../peptide_protein_cosine.py | 25 +++++++++++++------ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml index 01c5967..fbcf4ac 100644 --- a/peptide_statistics_hpp/result.xml +++ b/peptide_statistics_hpp/result.xml @@ -55,7 +55,7 @@ - + @@ -161,6 +161,7 @@ + @@ -246,6 +247,7 @@ + diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py index edfb8c3..13d6e35 100644 --- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py +++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py @@ -94,7 +94,9 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s fragment_tolerance, precursor_filter_window=1.5, low_mass_filter=low_mass_filter, - min_snr=min_snr, + min_snr=0, + window_filter_size = 50, + window_filter_top_peaks = 8, num_top_unannotated_envelopes_to_remove=2, isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs ) diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 06cd849..022c059 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -570,8 +570,17 @@ def output_protein_level_results(best_psm): cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) - if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes': - is_hpp = pep_mapping_info[sequence]['hpp'] + if len(cannonical_proteins) <= 1: + if row_pass_filters(best_psm): + for dataset in best_psm['datasets']: + all_datasets.add(dataset) + datasets_per_sequence[sequence_il].add(dataset) + for task in best_psm['tasks']: + all_tasks.add(task) + tasks_per_sequence[sequence_il].add(task) + + if len(cannonical_proteins) <= 1 : + is_hpp = pep_mapping_info[sequence]['hpp'] and best_psm.get('hpp_match','') == 'Yes' match = False has_synthetic = False has_synthetic_cosine = False @@ -583,12 +592,12 @@ def output_protein_level_results(best_psm): if float(best_psm['cosine']) >= args.cosine_cutoff: has_synthetic_cosine = True match = True - for dataset in best_psm['datasets']: - all_datasets.add(dataset) - datasets_per_sequence[sequence_il].add(dataset) - for task in best_psm['tasks']: - all_tasks.add(task) - tasks_per_sequence[sequence_il].add(task) + # for dataset in best_psm['datasets']: + # all_datasets.add(dataset) + # datasets_per_sequence[sequence_il].add(dataset) + # for task in best_psm['tasks']: + # all_tasks.add(task) + # tasks_per_sequence[sequence_il].add(task) if len(proteins) == 1: if row_pass_filters(best_psm): is_isoform_unique = True From 9c5e60f5cf38aad965c37997c1ff32db2a24e597 Mon Sep 17 00:00:00 2001 From: benpullman Date: Fri, 12 Aug 2022 13:11:59 -0700 Subject: [PATCH 22/22] changes to add FDR to comparison, should be discarded --- peptide_statistics_hpp/binding.xml | 1 + peptide_statistics_hpp/flow.xml | 1 + peptide_statistics_hpp/tool.xml | 2 + .../cosine_to_synthetics.py | 1 + .../peptide_protein_cosine.py | 185 ++++++++++-------- 5 files changed, 109 insertions(+), 81 deletions(-) diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml index 827e7de..12722e8 100644 --- a/peptide_statistics_hpp/binding.xml +++ b/peptide_statistics_hpp/binding.xml @@ -92,6 +92,7 @@ + diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml index 374bc19..63e744e 100644 --- a/peptide_statistics_hpp/flow.xml +++ b/peptide_statistics_hpp/flow.xml @@ -117,6 +117,7 @@ + diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml index 64ebe25..7d30b04 100644 --- a/peptide_statistics_hpp/tool.xml +++ b/peptide_statistics_hpp/tool.xml @@ -146,6 +146,7 @@ + @@ -167,6 +168,7 @@ + diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py index 13d6e35..9f970fc 100644 --- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py +++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py @@ -339,6 +339,7 @@ def main(): psm['synthetic_filename'] = synthetic_filename psm['synthetic_scan'] = synthetic_scan psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, synthetic_peptide, psm['charge']) + psm['synthetic_sequence'] = synthetic_peptide psm['cosine'] = cosine ei, num_matched_peaks = explained_intensity_per_spectrum.get((psm['filename'],psm['scan']),(0,0)) psm['explained_intensity'] = ei diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 022c059..762af4b 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -33,6 +33,7 @@ def arguments(): parser.add_argument('--output_task_proteins_all', type = Path, help='Output Task Proteins (All)') parser.add_argument('--protein_coverage', type = Path, help='Added Protein Coverage') parser.add_argument('--protein_coverage_external', type = Path, help='Added Protein Coverage (External)') + parser.add_argument('--protein_coverage_external_compare', type = Path, help='Added Protein Coverage (External) - Comparison') parser.add_argument('--cosine_cutoff', type = float, help='Cosine Cutoff') parser.add_argument('--explained_intensity_cutoff', type = float, help='Explained Intensity Cutoff') parser.add_argument('--annotated_ions_cutoff', type = float, help='Annotated Ion Cutoff') @@ -206,6 +207,13 @@ def main(): comparison_picked_fdr = {} comparison_hpp_fdr = {} + precursors_per_protein_all = defaultdict(lambda: defaultdict(list)) + precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list)) + precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list)) + + all_sequences = {} + unique_sequences = {} + frequency = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) with open(args.msv_to_pxd_mapping) as json_file: @@ -216,32 +224,6 @@ def main(): if pxd: pxd_mapping[pxd] = msv - - with open(args.comparison_pep) as f: - r = csv.DictReader(f, delimiter='\t') - for l in r: - il_peptide = l['demodified'].replace('I','L') - has_synthetic = False - has_synthetic_cosine = False - if l['protein'] in proteome.proteins: - - if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso: - #peptides that are not uniquely matching will have both FDRs as 1 - comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1)) - comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1)) - - protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None)) - comparison = sequences_found[il_peptide].comparison - if comparison != SeqOccurances(True,True,True): - if float(l.get('synthetic_cosine',-1)) >= 0: - has_synthetic = True - if float(l['synthetic_cosine']) > args.cosine_cutoff: - has_synthetic_cosine = True - comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine) - sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison) - if l['is_hpp'] == 'True': - sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True) - peptide_to_protein = defaultdict(list) peptide_to_exon_map = defaultdict(list) @@ -480,7 +462,23 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, outdict['hpp_match'] = 'No - failed quality thresholds' return outdict, output_proteins - row_pass_filters = lambda best_psm: (float(best_psm['explained_intensity']) >= args.explained_intensity_cutoff or float(best_psm['cosine']) >= args.cosine_cutoff) and int(best_psm['matched_ions']) >= args.annotated_ions_cutoff + row_pass_filters_input = lambda best_psm, explained_intensity_cutoff, cosine_cutoff, annotated_ions_cutoff: ( + float(best_psm['explained_intensity']) >= explained_intensity_cutoff or + float(best_psm['cosine']) >= cosine_cutoff + ) and int(best_psm['matched_ions']) >= annotated_ions_cutoff + + row_pass_filters = lambda best_psm: row_pass_filters_input( + best_psm, + args.explained_intensity_cutoff, + args.cosine_cutoff, + args.annotated_ions_cutoff + ) + row_pass_filters_compare = lambda best_psm: row_pass_filters_input( + best_psm, + float(best_psm['explained_intensity_cutoff']), + float(best_psm['cosine_cutoff']), + int(float(best_psm['annotated_ions_cutoff'])) + ) if args.output_psms: all_psms_with_score = [] @@ -546,12 +544,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, if len(all_hpp_precursors) > 0: precursor_fdr = fdr.calculate_fdr(all_hpp_precursors) - precursors_per_protein_all = defaultdict(lambda: defaultdict(list)) - precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list)) - precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list)) - - - def output_protein_level_results(best_psm): + def output_protein_level_results(best_psm,row_pass_filters,update_mappings = True): sequence = best_psm['sequence_unmodified'] sequence_il = best_psm['sequence_unmodified_il'] @@ -565,52 +558,44 @@ def output_protein_level_results(best_psm): precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) for protein in proteins: precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) + + if update_mappings: + proteins = peptide_to_protein.get(sequence_il,[]) + cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] + output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) - proteins = peptide_to_protein.get(sequence_il,[]) - cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] - output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) - - if len(cannonical_proteins) <= 1: - if row_pass_filters(best_psm): - for dataset in best_psm['datasets']: - all_datasets.add(dataset) - datasets_per_sequence[sequence_il].add(dataset) - for task in best_psm['tasks']: - all_tasks.add(task) - tasks_per_sequence[sequence_il].add(task) - - if len(cannonical_proteins) <= 1 : - is_hpp = pep_mapping_info[sequence]['hpp'] and best_psm.get('hpp_match','') == 'Yes' - match = False - has_synthetic = False - has_synthetic_cosine = False - is_isoform_unique = False - if len(cannonical_proteins) == 1: - if row_pass_filters(best_psm): - if float(best_psm['cosine']) >= 0: - has_synthetic = True - if float(best_psm['cosine']) >= args.cosine_cutoff: - has_synthetic_cosine = True - match = True - # for dataset in best_psm['datasets']: - # all_datasets.add(dataset) - # datasets_per_sequence[sequence_il].add(dataset) - # for task in best_psm['tasks']: - # all_tasks.add(task) - # tasks_per_sequence[sequence_il].add(task) - if len(proteins) == 1: - if row_pass_filters(best_psm): - is_isoform_unique = True - added = sequences_found[sequence_il].added - sequences_found[sequence_il] = sequences_found[sequence_il]._replace( - hpp=is_hpp, - isoform_unique=is_isoform_unique, - added = SeqOccurances( - added.match or match, - added.synthetic_match or has_synthetic, - added.synthetic_match_cosine or has_synthetic_cosine + if len(cannonical_proteins) <= 1 : + is_hpp = pep_mapping_info[sequence]['hpp'] + match = False + has_synthetic = False + has_synthetic_cosine = False + is_isoform_unique = False + if len(cannonical_proteins) == 1: + if row_pass_filters(best_psm): + if float(best_psm['cosine']) >= 0: + has_synthetic = True + if float(best_psm['cosine']) >= args.cosine_cutoff: + has_synthetic_cosine = True + match = True + for dataset in best_psm['datasets']: + all_datasets.add(dataset) + datasets_per_sequence[sequence_il].add(dataset) + for task in best_psm['tasks']: + all_tasks.add(task) + tasks_per_sequence[sequence_il].add(task) + if len(proteins) == 1: + if row_pass_filters(best_psm): + is_isoform_unique = True + added = sequences_found[sequence_il].added + sequences_found[sequence_il] = sequences_found[sequence_il]._replace( + hpp=is_hpp, + isoform_unique=is_isoform_unique, + added = SeqOccurances( + added.match or match, + added.synthetic_match or has_synthetic, + added.synthetic_match_cosine or has_synthetic_cosine + ) ) - ) print("About to output sequences") all_precursors = [] @@ -626,7 +611,7 @@ def output_protein_level_results(best_psm): best_psm['precursor_fdr'] = min(precursor_fdr.get((sequence, charge),1),1) best_psm['psm_fdr'] = -1 best_psm['synthetic_sequence'] = sequence.replace('+229.163','').replace('+229.162932','') - output_protein_level_results(best_psm) + output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True) best_psm['datasets'] = ';'.join(best_psm['datasets']) best_psm['tasks'] = ';'.join(best_psm['tasks']) all_precursors.append(best_psm) @@ -635,7 +620,45 @@ def output_protein_level_results(best_psm): with open(args.input_peptides) as f: r = csv.DictReader(f, delimiter='\t') for best_psm in r: - output_protein_level_results(best_psm) + output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True) + + for protein, seqs in precursors_per_protein_all.items(): + unique_sequences[protein] = len(seqs) + + for protein, seqs in precursors_per_protein_non_unique.items(): + all_sequences[protein] = len(seqs) + + for comparison_peptide_file in args.protein_coverage_external_compare.glob('*'): + with open(comparison_peptide_file) as f: + r = csv.DictReader(f, delimiter='\t') + for l in r: + output_protein_level_results(l, row_pass_filters=row_pass_filters_compare, update_mappings=False) + + with open(args.comparison_pep) as f: + r = csv.DictReader(f, delimiter='\t') + for l in r: + il_peptide = l['demodified'].replace('I','L') + has_synthetic = False + has_synthetic_cosine = False + if l['protein'] in proteome.proteins: + + if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso: + #peptides that are not uniquely matching will have both FDRs as 1 + comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1)) + comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1)) + + protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None)) + comparison = sequences_found[il_peptide].comparison + if comparison != SeqOccurances(True,True,True): + if float(l.get('synthetic_cosine',-1)) >= 0: + has_synthetic = True + if float(l['synthetic_cosine']) > args.cosine_cutoff: + has_synthetic_cosine = True + comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine) + sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison) + if l['is_hpp'] == 'True': + sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True) + hpp_protein_w_scores = [] hpp_score_dict = {} @@ -845,8 +868,8 @@ def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregat 'common_fdr':min(1,common_fdr_dict.get(protein,1)), 'leftover_score':leftover_score_dict.get(protein,0), 'leftover_fdr':min(1,leftover_fdr_dict.get(protein,1)), - 'num_sequences':len(precursors_per_protein_all.get(protein,[])), - 'num_sequences_incl_shared':len(precursors_per_protein_non_unique.get(protein,[])), + 'num_sequences':unique_sequences.get(protein,0), + 'num_sequences_incl_shared':all_sequences.get(protein,0), } pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison