update PSM counting

CCMS-UCSD · benpullman · Jul 1, 2022 · Jul 6, 2022 · Jul 6, 2022 · Jul 6, 2022
commit bdb0d1f65dabeec00f4c5aa0fe59bc65e9b2e931
diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
@@ -245,9 +245,14 @@
                 <column type="float" precision="3"  field="explained_intensity" label="Explained Intensity" width="6"/>
                 <column type="integer" field="matched_ions" label="Matched Ions" width="6"/>
                 <!-- <column type="text" field="pass" label="Above Library Threshold" width="6"/> -->
-                <column type="float" precision="3"  field="cosine" label="Best Annotated Cosine to Synthetic Spectrum Match" width="6" tooltip="Spectrum cosine using only annotated peaks from the input spectrum and closest matching synthetic."/>
+                <column type="float" precision="4"  field="best_overall_psm_score" label="Best PSM Score" width="4" tooltip=""/>
+                <column type="integer" field="filtered_psms" label="#PSMs - Passing Quality Filters" width="4" tooltip=""/>
+                <column type="integer" field="total_psms" label="#PSMs - All" width="4" tooltip=""/>
+                <column type="integer" field="num_peptidoforms" label="#Peptidoforms" width="4" tooltip=""/>
+                <column type="integer" field="variant_number" label="Variant Number" width="4" tooltip=""/>
                 <column type="integer" field="canonical_matches" label="#SAAP protein matches" width="1" tooltip = "Number of proteins the peptide matches to with a single amino acid variant."/>
                 <column type="text" field="gene_unique" label="Gene Unique" width="2" tooltip = "Peptide matches uniquely to a single gene with a single amino acid variant (True/False)."/>
+                <column type="text" field="decoy" label="Decoy" width="2" tooltip = ""/>
                 <column type="integer" field="exon_junctions_covered" label="Exon Junctions Covered" width="1"/>
                 <column type="integer" field="exons_covered_no_junction" label="Complete Exons Covered" width="1"/>
                 <column type="integer" field="total_unique_exons_covered" label="Total Exons Covered" width="1"/>

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -71,6 +71,16 @@ def unit_delta(aa1, aa2):
 
 no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()])
 
+def seq_theoretical_mass(sequence):
+    aa = ''.join([a for a in sequence if a.isalpha()])
+    mods = ''.join([m for m in sequence if not m.isalpha()])
+    if len(mods) > 0:
+        mods = eval(mods)
+    else:
+        mods = 0
+    return (theoretical_mass(aa) + mods + 1.007276035)
+
+
 def theoretical_mz(sequence,charge):
     aa = ''.join([a for a in sequence if a.isalpha()])
     mods = ''.join([m for m in sequence if not m.isalpha()])
@@ -346,8 +356,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['best_overall_psm_score'] = float(l['score'])
 
         if (this_pass_ei or this_pass_cos) and this_pass_by:
-            precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
-        precursor_representative['total_psms'] += l.get('total_psms',1)
+            precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1))
+        precursor_representative['total_psms'] += int(l.get('total_psms',1))
 
         precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant])
 
@@ -545,10 +555,10 @@ def output_protein_level_results(best_psm):
             if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
                 pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
                 if best_psm.get('hpp_match','') == 'Yes':
-                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
-                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
             for protein in proteins:
-                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
 
         proteins = peptide_to_protein.get(sequence_il,[])
         cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
@@ -683,8 +693,9 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg
 
     if args.output_peptides:
         with open(args.output_peptides,'w') as w:
-            header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
+            header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
             header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff']
+            header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score']
             r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore')
             r.writeheader()
             for precursor in all_precursors: