From 8427e7f0321298cfd9cdfbedf02679e26e504289 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Thu, 30 Jun 2022 19:25:07 -0700
Subject: [PATCH 01/22] update mappings to separate contaminants

---
 peptide_statistics_hpp/binding.xml            |  6 +++++
 peptide_statistics_hpp/flow.xml               | 15 ++++++++----
 peptide_statistics_hpp/input.xml              | 14 +++++++++++
 peptide_statistics_hpp/tool.xml               | 10 ++++++--
 .../cosine_to_synthetics.py                   |  3 ++-
 .../download_latest_kb.py                     |  5 ++--
 tools/peptide_statistics_hpp/map_peptides.py  |  3 ++-
 .../peptide_protein_cosine.py                 | 23 ++++++++++++-------
 8 files changed, 61 insertions(+), 18 deletions(-)
diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml
index 18189bb..827e7de 100644
--- a/peptide_statistics_hpp/binding.xml
+++ b/peptide_statistics_hpp/binding.xml
@@ -29,6 +29,9 @@
         <download port="fastadb" type="folder">
             <query name="resource" value="fastadb"/>
         </download>
+        <download port="con_fastadb" type="folder">
+            <query name="resource" value="con_fastadb"/>
+        </download>
         <download port="dna" type="folder">
                 <query name="resource" value="dna"/>
             </download>
@@ -54,6 +57,7 @@
     <bind action="download_latest_kb" tool="download_latest_kb">
         <inputAsRequirement port="params"    requirement="params"/>
         <inputAsRequirement port="fastadb"                  requirement="fastadb"/>
+        <inputAsRequirement port="con_fastadb"                  requirement="con_fastadb"/>
         <inputAsRequirement port="peptide_coverage_comparisons" requirement="peptide_coverage_comparisons"/>
         <productionToOutput port="kb_pep"    production="kb_pep"/>
     </bind>
@@ -61,6 +65,7 @@
     <bind action="map_peptides" tool="map_peptides">
         <inputAsRequirement port="peptide_list"             requirement="peptide_list"/>
         <inputAsRequirement port="fastadb"                  requirement="fastadb"/>
+        <inputAsRequirement port="con_fastadb"                  requirement="con_fastadb"/>
         <inputAsRequirement port="exon_fasta"               requirement="exon_fasta"/>
 		    <productionToOutput port="peptide_coverage"         production="peptide_coverage"/>
     </bind>
@@ -88,6 +93,7 @@
     <bind action="peptide_protein_cosine" tool="peptide_protein_cosine">
         <inputAsRequirement port="kb_pep"                         requirement="kb_pep"/>
         <inputAsRequirement port="fastadb"                  requirement="fastadb"/>
+        <inputAsRequirement port="con_fastadb"                  requirement="con_fastadb"/>
         <inputAsRequirement port="novel_psms_w_cosine"       requirement="novel_psms_w_cosine"/>
         <inputAsRequirement port="peptide_coverage"        requirement="peptide_coverage"/>
         <inputAsRequirement port="novel_psms_w_cosine_external"       requirement="novel_psms_w_cosine_external"/>
diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml
index efff10a..374bc19 100644
--- a/peptide_statistics_hpp/flow.xml
+++ b/peptide_statistics_hpp/flow.xml
@@ -13,6 +13,7 @@
 		<object name="kb_pep"/>
 
 	<collection name="fastadb"/>
+	<collection name="con_fastadb"/>
 
 	<collection name="parallel_params"/>
 	<collection name="intermediate_protein_coverage"/>
@@ -41,6 +42,8 @@
         <output port="mztab"       			 collection="mztab"/>
 				<output port="spec_on_server"       collection="spec_on_server"/>
         <output port="fastadb" 		  collection="fastadb"/>
+		<output port="con_fastadb" 		  collection="con_fastadb"/>
+
 				<output port="synthetics" 		  collection="synthetics"/>
         <output port="params" object="workflowParameters"/>
 				<output port="peptide_coverage_merged_external"       			 collection="peptide_coverage_merged_external"/>
@@ -60,7 +63,9 @@
 
 		<action name="download_latest_kb">
 			<input   port="params"       			    object="workflowParameters"/>
-			<input  port="fastadb"             			object="fastadb"/>
+			<input  port="fastadb"             			collection="fastadb"/>
+			<input  port="con_fastadb"             	    collection="con_fastadb"/>
+
 			<input port="peptide_coverage_comparisons" collection="peptide_coverage_comparisons"/>
 			<output  port="kb_pep"             		object="kb_pep"/>
 		</action>
@@ -68,8 +73,9 @@
 	<collection name="intermediate_peptide_coverage"/>
 	<action name="map_peptides" multiplicity="multiple" type="parallel">
         <input  port="peptide_list"             collection="peptide_list"  transformation="unfolding"/>
-        <input  port="fastadb"             			object="fastadb"/>
-				<input  port="exon_fasta"             	object="exon_fasta"/>
+			<input  port="fastadb"             			collection="fastadb"/>
+			<input  port="con_fastadb"             	    collection="con_fastadb"/>
+							<input  port="exon_fasta"             	object="exon_fasta"/>
         <output port="peptide_coverage"       	collection="peptide_coverage"/>
     </action>
 
@@ -112,7 +118,8 @@
 		<action name="peptide_protein_cosine">
 			<input   port="kb_pep"       			    object="kb_pep"/>
 			<input   port="novel_psms_w_cosine"          collection="novel_psms_w_cosine"/>
-			<input  port="fastadb"             			object="fastadb"/>
+			<input  port="fastadb"             			collection="fastadb"/>
+			<input  port="con_fastadb"             	    collection="con_fastadb"/>
 			<input   port="peptide_coverage"   collection="peptide_coverage"/>
 			<input   port="novel_psms_w_cosine_external"       collection="novel_psms_w_cosine_external"/>
 			<input   port="peptide_coverage_merged_external"   collection="peptide_coverage_merged_external"/>
diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml
index f30a264..279e804 100644
--- a/peptide_statistics_hpp/input.xml
+++ b/peptide_statistics_hpp/input.xml
@@ -13,6 +13,10 @@
 		<parameter name="fastadb" label="Proteome DB">
 			<fileGenerator type="upload" purpose="fasta" target="fastadb"/>
 		</parameter>
+        <!-- Search file parameters -->
+		<parameter name="con_fastadb" label="Proteome DB - Contaminants">
+			<fileGenerator type="upload" purpose="con_fasta" target="con_fastadb"/>
+		</parameter>
 
 		<parameter name="mztab" label="Search mzTab Files">
 			<fileGenerator type="upload" purpose="mztab" target="mztab"/>
@@ -227,6 +231,16 @@
                 <module id="fastadb" type="fileSelector">
                     <property name="label">Proteome DB</property>
                 </module>
+            </cell>
+			<cell>
+                <label>
+                    <content parameter="con_fastadb"/>
+                </label>
+            </cell>
+            <cell colspan="3">
+                <module id="con_fastadb" type="fileSelector">
+                    <property name="label">Proteome DB - Contaminants</property>
+                </module>
             </cell>
         </row>
 
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index bbe1f8e..afd2332 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -68,7 +68,8 @@
 
     <tool name="download_latest_kb">
       <require name="params"    type="file"/>
-            <require name="fastadb"            type="folder"/>
+      <require name="fastadb"            type="folder"/>
+      <require name="con_fastadb"            type="folder"/>
       <require name="peptide_coverage_comparisons"    type="folder"/>
       <produce name="kb_pep" type="file" naming="explicit"  extension="out"/>
 
@@ -76,6 +77,7 @@
             <arg pathRef="download_latest_kb.script"/>
             <arg option="-params"  valueRef="params"/>
             <arg option="-proteome_fasta"          valueRef="fastadb"/>
+            <arg option="-contaminants_fasta"          valueRef="con_fastadb"/>
             <arg option="-comparisons"  valueRef="peptide_coverage_comparisons"/>
             <arg option="-backup_kb_pep"  pathRef="predownloaded_pep"/>
             <arg option="-kb_pep"  valueRef="kb_pep"/>
@@ -85,11 +87,13 @@
   <tool name="map_peptides">
       <require name="peptide_list"       type="file"/>
       <require name="fastadb"            type="folder"/>
+      <require name="con_fastadb"            type="folder"/>
       <require name="exon_fasta"            type="file"/>
       <produce name="peptide_coverage"        type="folder"/>
       <execution env="binary" argConvention="adhoc">
           <arg pathRef="map_peptides.script"/>
           <arg option="-proteome_fasta"          valueRef="fastadb"/>
+          <arg option="-contaminants_fasta"          valueRef="con_fastadb"/>
           <arg option="-exon_fasta"          valueRef="exon_fasta"/>
           <arg option="-peptide_list"   valueRef="peptide_list"/>
           <arg option="-output_folder"  valueRef="peptide_coverage"/>
@@ -137,6 +141,7 @@
   <tool name="peptide_protein_cosine">
       <require name="kb_pep"    type="file"/>
       <require name="fastadb"            type="folder"/>
+      <require name="con_fastadb"            type="folder"/>
       <require name="novel_psms_w_cosine"             type="folder"/>
       <require name="peptide_coverage"                type="folder"/>
       <require name="novel_psms_w_cosine_external"     type="folder"/>
@@ -157,7 +162,8 @@
           <arg pathRef="peptide_protein_cosine.script"/>
           <arg option="-comparison_pep"               valueRef="kb_pep"/>
           <arg option="-input_psms"           valueRef="novel_psms_w_cosine"/>
-          <arg option="-fasta"                      valueRef="fastadb"/>
+          <arg option="-proteome_fasta"          valueRef="fastadb"/>
+          <arg option="-contaminants_fasta"          valueRef="con_fastadb"/>
           <arg option="-input_psms_external"           valueRef="novel_psms_w_cosine_external"/>
           <arg option="-protein_coverage"     valueRef="peptide_coverage"/>
           <arg option="-protein_coverage_external"     valueRef="peptide_coverage_merged_external"/>
diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index aa4496c..68da376 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -95,7 +95,8 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s
             precursor_filter_window=1.5,
             low_mass_filter=low_mass_filter,
             isobaric_tag_type=None,
-            min_snr=min_snr
+            min_snr=min_snr,
+            num_top_unannotated_envelopes_to_remove=2
     )
     ion_vector = spectrum._replace(peaks = ion_vector)
     ion_vector = processing.normalize_spectrum(ion_vector)
diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py
index db56a11..1d63925 100644
--- a/tools/peptide_statistics_hpp/download_latest_kb.py
+++ b/tools/peptide_statistics_hpp/download_latest_kb.py
@@ -10,7 +10,8 @@ def arguments():
     parser = argparse.ArgumentParser(description='mzTab to list of peptides')
     parser.add_argument('-p','--params', type = str, help='Input Parameters')
     parser.add_argument('-c','--comparisons', type = Path, help='Comparison Jobs')
-    parser.add_argument('-f','--proteome_fasta', type = Path, help='FASTA File')
+    parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database')
+    parser.add_argument('-m','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database')
     parser.add_argument('-b','--backup_kb_pep', type = Path, help='Backup KB Peptides')
     parser.add_argument('-t','--use_job_level_thresholds', type = bool, help='Use job level thresholds',default=True)
     parser.add_argument('-k','--kb_pep', type = Path, help='Output KB Peptides')
@@ -65,7 +66,7 @@ def main():
 
         try:
 
-            proteome = mapping.add_decoys(mapping.read_uniprot(args.proteome_fasta))
+            proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])
 
             with open(args.kb_pep, 'w') as w:
                 r = csv.DictWriter(w, delimiter = '\t', fieldnames = header)
diff --git a/tools/peptide_statistics_hpp/map_peptides.py b/tools/peptide_statistics_hpp/map_peptides.py
index 86e3308..40d298e 100755
--- a/tools/peptide_statistics_hpp/map_peptides.py
+++ b/tools/peptide_statistics_hpp/map_peptides.py
@@ -9,6 +9,7 @@
 def arguments():
     parser = argparse.ArgumentParser(description='Map Peptides')
     parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database')
+    parser.add_argument('-c','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database')
     parser.add_argument('-e','--exon_fasta', type = Path, help='Input FASTA Exon Mapping Database')
     parser.add_argument('-p','--peptide_list', type = Path, help='Peptide List')
     parser.add_argument('-o','--output_folder', type = Path, help='Output Folder')
@@ -28,7 +29,7 @@ def load_peptide_list(input_peptide_filename):
 def main():
     args = arguments()
 
-    proteome = mapping.read_uniprot(args.proteome_fasta)
+    proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])
     proteome_with_decoys = mapping.add_decoys(proteome)
     peptide_list = load_peptide_list(args.peptide_list)
 
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 50cb0e0..5bb6d28 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -16,7 +16,8 @@
 def arguments():
     parser = argparse.ArgumentParser(description='mzTab to list of peptides')
     parser.add_argument('--comparison_pep', type = Path, help='Peptides to Compare')
-    parser.add_argument('--fasta', type = Path, help='Input FASTA')
+    parser.add_argument('--proteome_fasta', type = Path, help='Input Proteome FASTA')
+    parser.add_argument('--contaminants_fasta', type = Path, help='Input Contaminants FASTA')
     parser.add_argument('--input_psms', type = Path, help='Input PSMs')
     parser.add_argument('--input_psms_external', type = Path, help='Input PSMs (External)')
     parser.add_argument('--input_peptides', type = Path, help='Input PSMs (External)')
@@ -92,7 +93,7 @@ def add_brackets(pep):
         pep = pep[:breakpoint] + end_bracket + pep[breakpoint:]
     return pep
 
-protein_type = lambda protein, proteome: 'TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform')
+protein_type = lambda protein, proteome: 'Contaminant' if proteome.proteins[protein].db == 'con' else ('TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform'))
 
 def msv_to_pxd(msv, msv_mapping):
     output_mapping = msv_mapping.get(msv,{}).get('px_accession')
@@ -164,8 +165,9 @@ def main():
     args = arguments()
 
     representative_per_precursor = {}
+    variant_to_precursors = defaultdict(list)
 
-    proteome = mapping.add_decoys(mapping.read_uniprot(args.fasta))
+    proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
 
     all_datasets = set()
     datasets_per_sequence= defaultdict(set)
@@ -237,6 +239,10 @@ def update_precursor_representative(l,from_psm = True):
         datasets = set([d for d in l.get('datasets','').split(';') if d != ''])
         tasks = set([t for t in l.get('tasks','').split(';') if t != ''])
         sequence, charge = l['sequence'],l['charge']
+        precursor_theoretical_mz = theoretical_mz(sequence, charge)
+        aa_seq = ''.join([a for a in sequence if a.isalpha()])
+        variant = (aa_seq, charge, int(precursor_theoretical_mz))
+        
         if not (sequence, charge) in representative_per_precursor:
             representative_per_precursor[(sequence, charge)] = l.copy()
             precursor_representative = representative_per_precursor[(sequence, charge)]
@@ -287,7 +293,7 @@ def update_precursor_representative(l,from_psm = True):
             precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi']
             precursor_representative['score'] = float(l['score'])
             #consider best EI And matched ions over all representatives
-            if float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
+            if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
                 precursor_representative['explained_intensity'] = l['explained_intensity']
                 precursor_representative['matched_ions'] = l['matched_ions']
 
@@ -447,7 +453,8 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
                         proteins = l['protein'].split(' ###')[0].split(';')
                         # PSM-level FDR was inefficient at this scale - need to rethink
                         #if row_pass_filters(l):
-                        all_psms_with_score.append(fdr.ScoredElement(l['usi'],'XXX_' in proteins[0],l['score']))
+                        all_targets = [p for p in proteins if 'XXX_' not in p]
+                        all_psms_with_score.append(fdr.ScoredElement(l['usi'],len(all_targets)==0,l['score']))
                         l.pop('mapped_proteins')
                         l.pop('hpp')
                         l.pop('len')
@@ -488,7 +495,7 @@ def output_protein_level_results(best_psm):
 
         proteins = [p for p in best_psm['protein'].split(' ###')[0].split(';') if p != '']
         if row_pass_filters(best_psm):
-            if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and 'Canonical' in best_psm.get('protein_type',''):
+            if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
                 pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
                 if best_psm.get('hpp_match','') == 'Yes':
                     precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
@@ -497,7 +504,7 @@ def output_protein_level_results(best_psm):
                 precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
 
         proteins = peptide_to_protein.get(sequence_il,[])
-        cannonical_proteins = [protein for protein in proteins if proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso]
+        cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
         output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
 
         if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes':
@@ -635,7 +642,7 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5):
             r.writeheader()
             for precursor in all_precursors:
                 proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != '']
-                if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and 'Canonical' in precursor.get('protein_type',''):
+                if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')):
                     precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1)
                     precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1)
                 else:

From e56b10a77ad559a87d74d4c2d7ea602b495ea4f2 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 10:08:18 -0700
Subject: [PATCH 02/22] track best precursor representative per variant

---
 .../download_latest_kb.py                      |  2 +-
 .../peptide_protein_cosine.py                  | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py
index 1d63925..d3457a8 100644
--- a/tools/peptide_statistics_hpp/download_latest_kb.py
+++ b/tools/peptide_statistics_hpp/download_latest_kb.py
@@ -66,7 +66,7 @@ def main():
 
         try:
 
-            proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])
+            proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
 
             with open(args.kb_pep, 'w') as w:
                 r = csv.DictWriter(w, delimiter = '\t', fieldnames = header)
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 5bb6d28..badfaec 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -165,7 +165,7 @@ def main():
     args = arguments()
 
     representative_per_precursor = {}
-    variant_to_precursors = defaultdict(list)
+    variant_to_best_precursor = {}
 
     proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
 
@@ -241,13 +241,17 @@ def update_precursor_representative(l,from_psm = True):
         sequence, charge = l['sequence'],l['charge']
         precursor_theoretical_mz = theoretical_mz(sequence, charge)
         aa_seq = ''.join([a for a in sequence if a.isalpha()])
+
         variant = (aa_seq, charge, int(precursor_theoretical_mz))
-        
-        if not (sequence, charge) in representative_per_precursor:
+        update_peptidoform = False
+
+        if not variant in variant_to_best_precursor:
+            variant_to_best_precursor[variant] = (sequence, charge)
             representative_per_precursor[(sequence, charge)] = l.copy()
             precursor_representative = representative_per_precursor[(sequence, charge)]
             precursor_representative['datasets'] = datasets
             precursor_representative['tasks'] = tasks
+            precursor_representative['parent_mass'] = precursor_theoretical_mz
             if from_psm:
                 precursor_representative['database_filename'] = l['filename']
                 precursor_representative['database_scan'] = l['scan']
@@ -259,7 +263,7 @@ def update_precursor_representative(l,from_psm = True):
                 precursor_representative.pop('scan')
                 precursor_representative.pop('usi')
 
-        precursor_representative = representative_per_precursor[(sequence, charge)]
+        precursor_representative = representative_per_precursor[variant_to_best_precursor[variant]]
 
         precursor_representative['datasets'] |= datasets
         precursor_representative['tasks'] |= tasks
@@ -292,6 +296,7 @@ def update_precursor_representative(l,from_psm = True):
             precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan']
             precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi']
             precursor_representative['score'] = float(l['score'])
+            update_peptidoform = True
             #consider best EI And matched ions over all representatives
             if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
                 precursor_representative['explained_intensity'] = l['explained_intensity']
@@ -311,6 +316,11 @@ def update_precursor_representative(l,from_psm = True):
         else:
             precursor_representative['cosine_score_match'] = 'No'
 
+        if update_peptidoform:
+            representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant])
+            variant_to_best_precursor[variant] = (sequence, charge)
+
+
     def update_mappings(protein_coverage_file,update_precursor_representatives):
         if(protein_coverage_file.is_file()):
             print("{}: Loading {} ({} cumulative peptides @ {} peptides/second)".format(datetime.now().strftime("%H:%M:%S"),protein_coverage_file,len(pep_mapping_info),len(pep_mapping_info)/(1+(datetime.now()-start_time).seconds)))

From 77c17b94b63c8712e866f8eca6361f0a938ea956 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 16:13:05 -0700
Subject: [PATCH 03/22] add non-numeric filter to PE column

---
 peptide_statistics_hpp/result.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index 81de135..9099946 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -147,7 +147,7 @@
             <column type="text" field="protein_type" label="Protein Type" width="4"/>
             <column type="integer" field="aa_start" label="AA Start" width="1"/>
             <column type="integer" field="aa_end" label="AA End" width="1"/>
-            <column type="integer" field="pe" label="PE" width="5"/>
+            <column type="integer" field="pe" label="PE" width="5" nonNumericCheckbox="true"/>
             <column type="text" field="hpp_match" label="HPP Compliant" width="2"/>
             <column type="text" field="usi" label="USI" width="20"/>
             <!-- <column type="text" field="filename" label="Filename" width="20"/>
@@ -222,7 +222,7 @@
                 <column type="text" field="protein_type" label="Protein Type" width="4"/>
                 <column type="integer" field="aa_start" label="AA Start" width="1" tooltip="Start location of peptide.  If the peptide maps multiple times, to either multiple proteins or the same protein, this will be N/A."/>
                 <column type="integer" field="aa_end" label="AA End" width="1" tooltip="End location of peptide.  If the peptide maps multiple times, to either multiple proteins or the same protein, this will be N/A."/>
-                <column type="integer" field="pe" label="Protein Existence (PE) Level" width="5" tooltip="Protein Existence (PE) Level from neXtProt 2020 release."/>
+                <column type="integer" field="pe" label="Protein Existence (PE) Level" width="5" tooltip="Protein Existence (PE) Level from latest neXtProt release." nonNumericCheckbox="true" />
                 <column type="text" field="sequence" label="Sequence" width="20"/>
                 <column type="text" field="sequence_unmodified" label="Demodified Sequence" width="20"/>
                 <column type="text" field="sequence_unmodified_il" label="IL Demodified Sequence" width="20"/>

From 3bfa14bd2c9af76a8190cb77f57c7baa7e411b06 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 16:13:29 -0700
Subject: [PATCH 04/22] update variant definition to be sum of integer mod
 masses

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index badfaec..9999dac 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -77,6 +77,10 @@ def theoretical_mz(sequence,charge):
         mods = 0
     return (theoretical_mass(aa) + mods + (int(charge)*1.007276035))/int(charge)
 
+def integer_mod_mass(sequence):
+    mods = ''.join([m for m in sequence if not m.isalpha()])
+    mods = [int(round(float(m))) if i == 0 else -int(round(float(m))) for ms in mods.split('+') for i,m in enumerate(ms.split('-')) if m != '' ]
+    return sum(mods)
 
 def add_brackets(pep):
     aa_breakpoints = []
@@ -235,14 +239,15 @@ def main():
     latest_nextprot_release = sorted(list(nextprot_releases_pe.keys()))[-1]
     nextprot_pe = nextprot_releases_pe[latest_nextprot_release]
 
-    def update_precursor_representative(l,from_psm = True):
+    def update_precursor_representative(l,from_psm = True, variant_level = False):
         datasets = set([d for d in l.get('datasets','').split(';') if d != ''])
         tasks = set([t for t in l.get('tasks','').split(';') if t != ''])
         sequence, charge = l['sequence'],l['charge']
         precursor_theoretical_mz = theoretical_mz(sequence, charge)
+        integer_mods = integer_mod_mass(sequence)
         aa_seq = ''.join([a for a in sequence if a.isalpha()])
 
-        variant = (aa_seq, charge, int(precursor_theoretical_mz))
+        variant = (aa_seq, charge, integer_mods)
         update_peptidoform = False
 
         if not variant in variant_to_best_precursor:

From f8120fe30e56d1253f832d0cb7a33649c244443e Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 17:02:45 -0700
Subject: [PATCH 05/22] find best cosine over all synthetics matching the unmodified sequence

---
 .../cosine_to_synthetics.py                   | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index 68da376..b1192ca 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -108,12 +108,12 @@ def find_ei_and_intensity(spectrum, psm, synthetic_scans, tol, low_mass_filter,
     charge = psm['charge']
     tolerance = psm['tolerance'] if psm['tolerance'] else tol
     spectrum = spectrum._replace(precursor_z = int(charge), annotation = processing.Annotation(sequence, None))
-    matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[])
+    matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[])
     spectrum_ei, spectrum_ion_vector = extract_annotated_peaks(spectrum, tolerance, low_mass_filter, min_snr)
-    for synthetic_filescan, synthetic_ion_vector in matching_synthetics:
+    for synthetic_filescan, synthetic_ion_vector, synthetic_peptide in matching_synthetics:
         cosine = processing.match_peaks(spectrum_ion_vector, synthetic_ion_vector, tolerance)
         if not best_cosine or cosine > best_cosine[0]:
-            best_cosine = (cosine,synthetic_filescan)
+            best_cosine = (cosine,synthetic_filescan,synthetic_peptide)
     return spectrum_ei, best_cosine
 
 def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, peaks_obj, spectrum_select_func):
@@ -173,7 +173,7 @@ def main():
         r = csv.DictReader(f, delimiter='\t')
         psms_header = r.fieldnames
         for l in r:
-            synthetic_keys.add((l['sequence'].replace('+229.163','').replace('+229.162932',''),l['charge']))
+            synthetic_keys.add((''.join([a for a in l['sequence'] if a.isalpha()]),l['charge']))
             all_psms.append(l)
             try:
                 tolerance = float(l['frag_tol'])
@@ -196,9 +196,10 @@ def main():
                 with mgf.read(synthetics_file) as reader:
                     for i,s in enumerate(reader):
                         peptide = s['params'].get('seq')
+                        sequence = ''.join([a for a in peptide if a.isalpha()])
                         charge = int(s['params'].get('charge')[0])
                         # print(peptide, str(charge))
-                        if (peptide, str(charge)) in synthetic_keys:
+                        if (sequence, str(charge)) in synthetic_keys:
                             synthetics_loaded += 1
                             filename = s['params'].get('originalfile_filename',s['params'].get('provenance_filename'))
                             scan = s['params'].get('originalfile_scan',s['params'].get('provenance_scan'))
@@ -215,7 +216,7 @@ def main():
                             )
                             synthetic_low_mass, synthetic_min_snr = (args.low_mass_filter, args.min_snr) if args.filter_synthetics == 'Yes' else (0,0)
                             _, synthetic_ion_vector = extract_annotated_peaks(spectrum, 0.05, synthetic_low_mass, synthetic_min_snr)
-                            synthetic_scans[(peptide, str(charge))].append(((filename,scan),synthetic_ion_vector))
+                            synthetic_scans[(sequence, str(charge))].append(((filename,scan),synthetic_ion_vector,peptide))
         print("{}: Loaded {} synthetics".format(datetime.now().strftime("%H:%M:%S"),synthetics_loaded))
     else:
         print("Not loading synthetics")
@@ -314,9 +315,9 @@ def main():
             for scan in psms_to_consider[filename].keys():
                 sequence = psms_to_consider[filename][scan]['sequence']
                 charge = psms_to_consider[filename][scan]['charge']
-                matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[])
-                for synthetic_filescan, _ in matching_synthetics:
-                    cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan)
+                matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[])
+                for synthetic_filescan, _ , synthetic_peptide in matching_synthetics:
+                    cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan,synthetic_peptide)
 
     print("{}: About to write out PSMs".format(datetime.now().strftime("%H:%M:%S")))
 
@@ -329,13 +330,15 @@ def main():
             if best_synthetic[0] != 'N/A':
                 synthetic_filename = 'f.' + best_synthetic[0].replace('/data/massive/','')
                 synthetic_scan = best_synthetic[1]
+                synthetic_peptide = best_synthetic[2]
             else:
                 synthetic_filename = best_synthetic[0]
                 synthetic_scan = best_synthetic[1]
+                synthetic_peptide = best_synthetic[2]
             psm['usi'] = make_usi(psm['filename'], psm['scan'], psm['sequence'], psm['charge'])
             psm['synthetic_filename'] = synthetic_filename
             psm['synthetic_scan'] = synthetic_scan
-            psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, psm['sequence'].replace('+229.163','').replace('+229.162932',''), psm['charge'])
+            psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, synthetic_peptide, psm['charge'])
             psm['cosine'] = cosine
             ei, num_matched_peaks = explained_intensity_per_spectrum.get((psm['filename'],psm['scan']),(0,0))
             psm['explained_intensity'] = ei

From bc5b98f14b4a9146e01baedf4c721ea496557df0 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:25:34 -0700
Subject: [PATCH 06/22] update to only calculate precursor FDR for spectra that
 pass PSM FDR

---
 .../peptide_protein_cosine.py                 | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 9999dac..c771314 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -239,7 +239,7 @@ def main():
     latest_nextprot_release = sorted(list(nextprot_releases_pe.keys()))[-1]
     nextprot_pe = nextprot_releases_pe[latest_nextprot_release]
 
-    def update_precursor_representative(l,from_psm = True, variant_level = False):
+    def update_precursor_representative(l,from_psm = True, variant_level = False, psm_fdr = 0):
         datasets = set([d for d in l.get('datasets','').split(';') if d != ''])
         tasks = set([t for t in l.get('tasks','').split(';') if t != ''])
         sequence, charge = l['sequence'],l['charge']
@@ -250,6 +250,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False):
         variant = (aa_seq, charge, integer_mods)
         update_peptidoform = False
 
+        score = float(l['score']) if psm_fdr <= 0.01 else 0
+
         if not variant in variant_to_best_precursor:
             variant_to_best_precursor[variant] = (sequence, charge)
             representative_per_precursor[(sequence, charge)] = l.copy()
@@ -263,7 +265,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False):
                 precursor_representative['database_usi'] = l['usi']
                 precursor_representative['explained_intensity'] = float(l['explained_intensity'])
                 precursor_representative['cosine'] = float(l['cosine'])
-                precursor_representative['score'] = float(l['score'])
+                precursor_representative['score'] = score
                 precursor_representative.pop('filename')
                 precursor_representative.pop('scan')
                 precursor_representative.pop('usi')
@@ -274,7 +276,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False):
         precursor_representative['tasks'] |= tasks
 
         best_cosine = float(l['cosine']) >= float(precursor_representative['cosine'])
-        best_score = float(l['score']) >= float(precursor_representative['score'])
+        best_score = score >= float(precursor_representative['score'])
 
         this_pass_ei = float(l['explained_intensity']) >= args.explained_intensity_cutoff
         this_pass_cos = float(l['cosine']) >= args.cosine_cutoff
@@ -467,9 +469,11 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
                         l.update(protein_info_dict)
                         proteins = l['protein'].split(' ###')[0].split(';')
                         # PSM-level FDR was inefficient at this scale - need to rethink
-                        #if row_pass_filters(l):
                         all_targets = [p for p in proteins if 'XXX_' not in p]
-                        all_psms_with_score.append(fdr.ScoredElement(l['usi'],len(all_targets)==0,l['score']))
+                        is_decoy = len(all_targets)==0
+                        l['decoy'] = is_decoy
+                        # if row_pass_filters(l):
+                        all_psms_with_score.append(fdr.ScoredElement(l['usi'],is_decoy,l['score']))
                         l.pop('mapped_proteins')
                         l.pop('hpp')
                         l.pop('len')
@@ -481,8 +485,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
                 l['psm_fdr'] = psm_fdr.get(l['usi'],1)
                 if args.output_psms_flag == "1" or (args.output_psms_flag == "0.5" and row_pass_filters(l)):
                     o.writerow(l)
-                if l['psm_fdr'] <= 0.01:
-                    update_precursor_representative(l)
+                update_precursor_representative(l, psm_fdr = l['psm_fdr'])
 
 
     print("About to calculate precursor FDR")
@@ -491,7 +494,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
 
     for (sequence, charge), best_psm in representative_per_precursor.items():
         all_targets = [p for ps in best_psm['protein'].split(' ###') for p in ps.split(';') if 'XXX_' not in p]
-        if best_psm['protein'] != '':
+        if best_psm['protein'] != '' and best_psm['score'] > 0:
             all_hpp_precursors.append(fdr.ScoredElement((sequence, charge),len(all_targets)==0,best_psm['score']))
     
     precursor_fdr = {}

From 43412e12dbd8c2e61479733cb963c65e2ee318cc Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:25:57 -0700
Subject: [PATCH 07/22] properly account for PSM quality filters

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index c771314..079a1ed 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -472,8 +472,8 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
                         all_targets = [p for p in proteins if 'XXX_' not in p]
                         is_decoy = len(all_targets)==0
                         l['decoy'] = is_decoy
-                        # if row_pass_filters(l):
-                        all_psms_with_score.append(fdr.ScoredElement(l['usi'],is_decoy,l['score']))
+                        if row_pass_filters(l):
+                            all_psms_with_score.append(fdr.ScoredElement(l['usi'],is_decoy,l['score']))
                         l.pop('mapped_proteins')
                         l.pop('hpp')
                         l.pop('len')

From 33f22c882ee0ade44dfd64ebd9527942448a62c7 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:30:22 -0700
Subject: [PATCH 08/22] use EI as tie breaker when PSM scores are the same

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 079a1ed..5f03c95 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -276,7 +276,6 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         precursor_representative['tasks'] |= tasks
 
         best_cosine = float(l['cosine']) >= float(precursor_representative['cosine'])
-        best_score = score >= float(precursor_representative['score'])
 
         this_pass_ei = float(l['explained_intensity']) >= args.explained_intensity_cutoff
         this_pass_cos = float(l['cosine']) >= args.cosine_cutoff
@@ -285,6 +284,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         best_pass_cos = float(precursor_representative['cosine']) >= args.cosine_cutoff
         best_pass_by = int(precursor_representative['matched_ions']) >= args.annotated_ions_cutoff
 
+        best_score =  float(l['explained_intensity']) >= float(precursor_representative['explained_intensity']) if score == float(precursor_representative['score']) else score > float(precursor_representative['score'])
+
         # we want to find the spectrum with the best score,
         # but sometimes the highest scoring spectrum does not pass the filters
 

From 7b99f7f4737a453e99a8ce3374a1b52757336eb3 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:34:51 -0700
Subject: [PATCH 09/22] keep track of num PSMs

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 5f03c95..1cba2bd 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -259,6 +259,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['datasets'] = datasets
             precursor_representative['tasks'] = tasks
             precursor_representative['parent_mass'] = precursor_theoretical_mz
+            precursor_representative['filtered_psms'] = 0
+            precursor_representative['total_psms'] = 0
             if from_psm:
                 precursor_representative['database_filename'] = l['filename']
                 precursor_representative['database_scan'] = l['scan']
@@ -284,7 +286,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         best_pass_cos = float(precursor_representative['cosine']) >= args.cosine_cutoff
         best_pass_by = int(precursor_representative['matched_ions']) >= args.annotated_ions_cutoff
 
-        best_score =  float(l['explained_intensity']) >= float(precursor_representative['explained_intensity']) if score == float(precursor_representative['score']) else score > float(precursor_representative['score'])
+        best_score = float(l['explained_intensity']) >= float(precursor_representative['explained_intensity']) if score == float(precursor_representative['score']) else score > float(precursor_representative['score'])
 
         # we want to find the spectrum with the best score,
         # but sometimes the highest scoring spectrum does not pass the filters
@@ -324,6 +326,12 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         else:
             precursor_representative['cosine_score_match'] = 'No'
 
+        if (this_pass_ei or this_pass_cos) and this_pass_by:
+            precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
+        precursor_representative['total_psms'] += l.get('total_psms',1)
+
+
+
         if update_peptidoform:
             representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant])
             variant_to_best_precursor[variant] = (sequence, charge)

From b3b16dae521398b8794d18eb9da958738225c7f8 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:36:10 -0700
Subject: [PATCH 10/22] keep track of overall best PSM score

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 1cba2bd..592a8a0 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -268,6 +268,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
                 precursor_representative['explained_intensity'] = float(l['explained_intensity'])
                 precursor_representative['cosine'] = float(l['cosine'])
                 precursor_representative['score'] = score
+                precursor_representative['best_overall_psm_score'] = score
                 precursor_representative.pop('filename')
                 precursor_representative.pop('scan')
                 precursor_representative.pop('usi')
@@ -326,6 +327,9 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         else:
             precursor_representative['cosine_score_match'] = 'No'
 
+        if best_score:
+            precursor_representative['best_overall_psm_score'] = float(l['score'])
+
         if (this_pass_ei or this_pass_cos) and this_pass_by:
             precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
         precursor_representative['total_psms'] += l.get('total_psms',1)

From 51805401c16e43471d23684a703d4043fc5ad658 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 18:44:36 -0700
Subject: [PATCH 11/22] keep track of num_precursors per variant

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 592a8a0..f2402dc 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -170,6 +170,8 @@ def main():
 
     representative_per_precursor = {}
     variant_to_best_precursor = {}
+    variant_to_all_precursors = defaultdict(set)
+    current_variant_number = 0
 
     proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
 
@@ -252,6 +254,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
 
         score = float(l['score']) if psm_fdr <= 0.01 else 0
 
+        variant_to_all_precursors[variant].add((sequence, charge))
+
         if not variant in variant_to_best_precursor:
             variant_to_best_precursor[variant] = (sequence, charge)
             representative_per_precursor[(sequence, charge)] = l.copy()
@@ -334,7 +338,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
         precursor_representative['total_psms'] += l.get('total_psms',1)
 
-
+        precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant])
 
         if update_peptidoform:
             representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant])

From 4698e080c0fb01b6a033550e0e2133d7907a643c Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 19:09:54 -0700
Subject: [PATCH 12/22] add variant number, option to run with peptidoforms and
 not variants

---
 peptide_statistics_hpp/input.xml              | 19 ++++++++++++++++++
 peptide_statistics_hpp/tool.xml               |  2 +-
 .../peptide_protein_cosine.py                 | 20 ++++++++++++++-----
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml
index 279e804..87f218f 100644
--- a/peptide_statistics_hpp/input.xml
+++ b/peptide_statistics_hpp/input.xml
@@ -101,6 +101,15 @@
 			<validator type="set" />
 		</parameter>
 
+		<parameter name="use_variants" label="Variant-level analysis">
+			<options>
+				<option label="Peptidoforms" value="0" />
+				<option label="Variants" value="1" />
+			</options>
+			<default value="0" />
+			<validator type="set" />
+		</parameter>
+
 		<parameter name="parallel_cosine.parallel_cosine" label="Parallelism (For Cosine Calculations)">
 			<default value="10"/>
 			<validator type="integer"/>
@@ -396,6 +405,16 @@
 						<input type="select" parameter="synthetic_filters"/>
 					</cell>
 				</row>
+				<row>
+					<cell>
+						<label>
+							<content parameter="use_variants"/>
+						</label>
+					</cell>
+					<cell colspan="2">
+						<input type="select" parameter="use_variants"/>
+					</cell>
+				</row>
 
 	</block>
 	<block label="Comparison">
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index afd2332..2d02dd8 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -195,7 +195,7 @@
           <arg option="-library_name"         valueRef="@library_name"/>
           <arg option="-export_explorers"     valueRef="@export_explorers"/>
           <arg option="-explorers_output"     valueRef="explorer_export"/>
-
+          <arg option="-variant_output"     valueRef="@use_variants"/>
       </execution>
   </tool>
   
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index f2402dc..c1d23e6 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -51,6 +51,8 @@ def arguments():
     parser.add_argument('--library_name', type = int, help='Library Name')
     parser.add_argument('--export_explorers', type = int, help='Export Explorer Tables (0/1)')
     parser.add_argument('--explorers_output', type = Path, help='Tables for Explorers')
+    parser.add_argument('--variant_output', type = int, help='Variant level outputs',default=0)
+
     if len(sys.argv) < 4:
         parser.print_help()
         sys.exit(1)
@@ -171,7 +173,7 @@ def main():
     representative_per_precursor = {}
     variant_to_best_precursor = {}
     variant_to_all_precursors = defaultdict(set)
-    current_variant_number = 0
+    variant_number = {}
 
     proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
 
@@ -249,7 +251,10 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         integer_mods = integer_mod_mass(sequence)
         aa_seq = ''.join([a for a in sequence if a.isalpha()])
 
-        variant = (aa_seq, charge, integer_mods)
+        if variant_level:
+            variant = (aa_seq, charge, integer_mods)
+        else:
+            variant = (sequence, charge)
         update_peptidoform = False
 
         score = float(l['score']) if psm_fdr <= 0.01 else 0
@@ -257,6 +262,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
         variant_to_all_precursors[variant].add((sequence, charge))
 
         if not variant in variant_to_best_precursor:
+            current_variant_number = len(variant_number)
+            variant_number[variant] = current_variant_number
             variant_to_best_precursor[variant] = (sequence, charge)
             representative_per_precursor[(sequence, charge)] = l.copy()
             precursor_representative = representative_per_precursor[(sequence, charge)]
@@ -265,6 +272,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['parent_mass'] = precursor_theoretical_mz
             precursor_representative['filtered_psms'] = 0
             precursor_representative['total_psms'] = 0
+            precursor_representative['variant_number'] = current_variant_number
             if from_psm:
                 precursor_representative['database_filename'] = l['filename']
                 precursor_representative['database_scan'] = l['scan']
@@ -344,6 +352,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant])
             variant_to_best_precursor[variant] = (sequence, charge)
 
+        return variant_number[variant]
 
     def update_mappings(protein_coverage_file,update_precursor_representatives):
         if(protein_coverage_file.is_file()):
@@ -355,7 +364,7 @@ def update_mappings(protein_coverage_file,update_precursor_representatives):
                 protein_mapping[protein].update(peptide_mapping)
             if update_precursor_representatives:
                 for l in output_peptides:
-                    update_precursor_representative(l,False)
+                    update_precursor_representative(l,False,variant_level=args.variant_output==1)
 
 
     start_time = datetime.now()
@@ -458,7 +467,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
         all_psms_with_score = []
         all_psm_rows = []
         with open(args.output_psms,'w') as w:
-            header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
+            header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','variant_number','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
             o = csv.DictWriter(w, delimiter='\t',fieldnames = header, restval='N/A', extrasaction='ignore')
             o.writeheader()
             for input_psm in args.input_psms.glob('*'):
@@ -500,9 +509,10 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
 
             for l in all_psm_rows:
                 l['psm_fdr'] = psm_fdr.get(l['usi'],1)
+                variant_number = update_precursor_representative(l, psm_fdr = l['psm_fdr'], variant_level=args.variant_output==1)
+                l['variant_number'] = variant_number
                 if args.output_psms_flag == "1" or (args.output_psms_flag == "0.5" and row_pass_filters(l)):
                     o.writerow(l)
-                update_precursor_representative(l, psm_fdr = l['psm_fdr'])
 
 
     print("About to calculate precursor FDR")

From 5dab5cd1600514ca8707284ca9e2bb179781a036 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Wed, 6 Jul 2022 19:34:48 -0700
Subject: [PATCH 13/22] assume TMT 16-plex isobaric tag for all jobs in
 extract_annotated_peaks

---
 tools/peptide_statistics_hpp/cosine_to_synthetics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index b1192ca..65b303a 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -96,7 +96,8 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s
             low_mass_filter=low_mass_filter,
             isobaric_tag_type=None,
             min_snr=min_snr,
-            num_top_unannotated_envelopes_to_remove=2
+            num_top_unannotated_envelopes_to_remove=2,
+            isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs
     )
     ion_vector = spectrum._replace(peaks = ion_vector)
     ion_vector = processing.normalize_spectrum(ion_vector)

From 0a24e54e94d6adc1c028e70c181b34f5cdf7cae2 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Thu, 7 Jul 2022 11:35:04 -0700
Subject: [PATCH 14/22] fix variant aggregation

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index c1d23e6..a93569d 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -509,8 +509,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
 
             for l in all_psm_rows:
                 l['psm_fdr'] = psm_fdr.get(l['usi'],1)
-                variant_number = update_precursor_representative(l, psm_fdr = l['psm_fdr'], variant_level=args.variant_output==1)
-                l['variant_number'] = variant_number
+                l['variant_number'] = update_precursor_representative(l, psm_fdr = l['psm_fdr'], variant_level=args.variant_output==1)
                 if args.output_psms_flag == "1" or (args.output_psms_flag == "0.5" and row_pass_filters(l)):
                     o.writerow(l)
 
@@ -543,10 +542,10 @@ def output_protein_level_results(best_psm):
             if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
                 pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
                 if best_psm.get('hpp_match','') == 'Yes':
-                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
-                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
+                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
+                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
             for protein in proteins:
-                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
+                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
 
         proteins = peptide_to_protein.get(sequence_il,[])
         cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']

From 256b7ff8bab3436504eff0ab167cae012cb7cf83 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Thu, 7 Jul 2022 11:38:07 -0700
Subject: [PATCH 15/22] cosine updates: drop duplicate isobaric_tag_type
 argument, carry synthetic peptide in cosine_to_synthetic tuples

---
 tools/peptide_statistics_hpp/cosine_to_synthetics.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index 65b303a..edfb8c3 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -94,7 +94,6 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s
             fragment_tolerance,
             precursor_filter_window=1.5,
             low_mass_filter=low_mass_filter,
-            isobaric_tag_type=None,
             min_snr=min_snr,
             num_top_unannotated_envelopes_to_remove=2,
             isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs
@@ -118,7 +117,7 @@ def find_ei_and_intensity(spectrum, psm, synthetic_scans, tol, low_mass_filter,
     return spectrum_ei, best_cosine
 
 def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, peaks_obj, spectrum_select_func):
-    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
     explained_intensity_per_spectrum = {}
     for scan in psms_to_consider[filename].keys():
         spectrum = spectrum_select_func(peaks_obj,scan)
@@ -130,7 +129,7 @@ def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_
 
 def process_spectrum_read_file(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, reader, spectrum_select_func,read_scan):
     start_time = datetime.now()
-    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
     explained_intensity_per_spectrum = {}
     all_spectra = []
     for i,s in enumerate(reader):
@@ -222,7 +221,7 @@ def main():
     else:
         print("Not loading synthetics")
 
-    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+    cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
     explained_intensity_per_spectrum = {}
 
     min_spectra_to_load_file = 20
@@ -327,15 +326,13 @@ def main():
         w_psm = csv.DictWriter(fw_psm, delimiter = '\t', fieldnames = header)
         w_psm.writeheader()
         for psm in all_psms:
-            cosine, best_synthetic = cosine_to_synthetic[(psm['filename'],psm['scan'])]
+            cosine, best_synthetic, synthetic_peptide = cosine_to_synthetic[(psm['filename'],psm['scan'])]
             if best_synthetic[0] != 'N/A':
                 synthetic_filename = 'f.' + best_synthetic[0].replace('/data/massive/','')
                 synthetic_scan = best_synthetic[1]
-                synthetic_peptide = best_synthetic[2]
             else:
                 synthetic_filename = best_synthetic[0]
                 synthetic_scan = best_synthetic[1]
-                synthetic_peptide = best_synthetic[2]
             psm['usi'] = make_usi(psm['filename'], psm['scan'], psm['sequence'], psm['charge'])
             psm['synthetic_filename'] = synthetic_filename
             psm['synthetic_scan'] = synthetic_scan

From e0e0e38d1c000de57014ddb8063dab2c48685c44 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Thu, 7 Jul 2022 12:04:37 -0700
Subject: [PATCH 16/22] update variant grouping

---
 peptide_statistics_hpp/input.xml               | 18 ++++++++++++++++++
 peptide_statistics_hpp/tool.xml                |  1 +
 .../peptide_protein_cosine.py                  | 17 ++++++++++-------
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml
index 87f218f..0fb3d02 100644
--- a/peptide_statistics_hpp/input.xml
+++ b/peptide_statistics_hpp/input.xml
@@ -110,6 +110,16 @@
 			<validator type="set" />
 		</parameter>
 
+		<parameter name="hpp_aggregation_function" label="Sequence-level HPP Aggregation">
+			<options>
+				<option label="Sum of variants" value="sum" />
+				<option label="Max of variants" value="max" />
+			</options>
+			<default value="sum" />
+			<validator type="set" />
+		</parameter>
+		
+
 		<parameter name="parallel_cosine.parallel_cosine" label="Parallelism (For Cosine Calculations)">
 			<default value="10"/>
 			<validator type="integer"/>
@@ -414,6 +424,14 @@
 					<cell colspan="2">
 						<input type="select" parameter="use_variants"/>
 					</cell>
+					<cell>
+						<label>
+							<content parameter="hpp_aggregation_function"/>
+						</label>
+					</cell>
+					<cell colspan="2">
+						<input type="select" parameter="hpp_aggregation_function"/>
+					</cell>
 				</row>
 
 	</block>
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index 2d02dd8..a13331a 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -196,6 +196,7 @@
           <arg option="-export_explorers"     valueRef="@export_explorers"/>
           <arg option="-explorers_output"     valueRef="explorer_export"/>
           <arg option="-variant_output"     valueRef="@use_variants"/>
+          <arg option="-hpp_protein_score_aggregation" valueRef="@hpp_aggregation_function"/>
       </execution>
   </tool>
   
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index a93569d..34b15b2 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -52,6 +52,7 @@ def arguments():
     parser.add_argument('--export_explorers', type = int, help='Export Explorer Tables (0/1)')
     parser.add_argument('--explorers_output', type = Path, help='Tables for Explorers')
     parser.add_argument('--variant_output', type = int, help='Variant level outputs',default=0)
+    parser.add_argument('--hpp_protein_score_aggregation', type = str, help='HPP Protein Aggregation (max or sum)',default='sum')
 
     if len(sys.argv) < 4:
         parser.print_help()
@@ -170,6 +171,8 @@ def find_overlap(existing_peptides, new_peptides, protein_length, protein_pe, na
 def main():
     args = arguments()
 
+    hpp_score_aggregation = sum if args.hpp_protein_score_aggregation == 'sum' else max
+
     representative_per_precursor = {}
     variant_to_best_precursor = {}
     variant_to_all_precursors = defaultdict(set)
@@ -542,10 +545,10 @@ def output_protein_level_results(best_psm):
             if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
                 pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
                 if best_psm.get('hpp_match','') == 'Yes':
-                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
-                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
+                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
             for protein in proteins:
-                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence'])))
+                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
 
         proteins = peptide_to_protein.get(sequence_il,[])
         cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
@@ -623,19 +626,19 @@ def output_protein_level_results(best_psm):
 
     seen_picked = set()
 
-    def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5):
+    def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggregation_func = lambda xs: sum(xs)):
         used_precursors = []
         for precursor in sorted(precursor_list, key = lambda x: x[0], reverse = True):
             found = False
             for seen_precursor in used_precursors:
-                if abs(precursor[1]-seen_precursor[1]) <= mz_distance:
+                if precursor[2]==seen_precursor[2] and abs(precursor[1]-seen_precursor[1]) <= distance:
                     found = True
             if not found:
                 used_precursors.append(precursor)
-        return sum([p[0] for p in used_precursors])
+        return score_aggregation_func([p[0] for p in used_precursors])
 
     for protein, precursors in precursors_per_protein_hpp.items():
-        score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v)) for k,v in precursors.items()])
+        score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v,score_aggregation_func=hpp_score_aggregation)) for k,v in precursors.items()])
         if score != 0:
             hpp_protein_w_scores.append(fdr.ScoredElement(protein,'XXX_' in protein, score))
             hpp_score_dict[protein] = score

From bdb0d1f65dabeec00f4c5aa0fe59bc65e9b2e931 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Fri, 8 Jul 2022 13:57:42 -0700
Subject: [PATCH 17/22] update PSM counting

---
 peptide_statistics_hpp/result.xml             |  7 +++++-
 .../peptide_protein_cosine.py                 | 23 ++++++++++++++-----
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index 9099946..152981b 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -245,9 +245,14 @@
                 <column type="float" precision="3"  field="explained_intensity" label="Explained Intensity" width="6"/>
                 <column type="integer" field="matched_ions" label="Matched Ions" width="6"/>
                 <!-- <column type="text" field="pass" label="Above Library Threshold" width="6"/> -->
-                <column type="float" precision="3"  field="cosine" label="Best Annotated Cosine to Synthetic Spectrum Match" width="6" tooltip="Spectrum cosine using only annotated peaks from the input spectrum and closest matching synthetic."/>
+                <column type="float" precision="4"  field="best_overall_psm_score" label="Best PSM Score" width="4" tooltip=""/>
+                <column type="integer" field="filtered_psms" label="#PSMs - Passing Quality Filters" width="4" tooltip=""/>
+                <column type="integer" field="total_psms" label="#PSMs - All" width="4" tooltip=""/>
+                <column type="integer" field="num_peptidoforms" label="#Peptidoforms" width="4" tooltip=""/>
+                <column type="integer" field="variant_number" label="Variant Number" width="4" tooltip=""/>
                 <column type="integer" field="canonical_matches" label="#SAAP protein matches" width="1" tooltip = "Number of proteins the peptide matches to with a single amino acid variant."/>
                 <column type="text" field="gene_unique" label="Gene Unique" width="2" tooltip = "Peptide matches uniquely to a single gene with a single amino acid variant (True/False)."/>
+                <column type="text" field="decoy" label="Decoy" width="2" tooltip = ""/>
                 <column type="integer" field="exon_junctions_covered" label="Exon Junctions Covered" width="1"/>
                 <column type="integer" field="exons_covered_no_junction" label="Complete Exons Covered" width="1"/>
                 <column type="integer" field="total_unique_exons_covered" label="Total Exons Covered" width="1"/>
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 34b15b2..d4a2d2b 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -71,6 +71,16 @@ def unit_delta(aa1, aa2):
 
 no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()])
 
+def seq_theoretical_mass(sequence):
+    """Return theoretical_mass() of the letters + the inline numeric mods + 1.007276035."""
+    aa = ''.join([a for a in sequence if a.isalpha()])
+    mods = ''.join([m for m in sequence if not m.isalpha()])
+    # SECURITY(review): eval() executes every non-letter character of the PSM sequence as
+    # Python; presumably that text comes from input files - parse signed floats instead.
+    mods = eval(mods) if len(mods) > 0 else 0
+    return (theoretical_mass(aa) + mods + 1.007276035)
+
+
 def theoretical_mz(sequence,charge):
     aa = ''.join([a for a in sequence if a.isalpha()])
     mods = ''.join([m for m in sequence if not m.isalpha()])
@@ -346,8 +356,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['best_overall_psm_score'] = float(l['score'])
 
         if (this_pass_ei or this_pass_cos) and this_pass_by:
-            precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
-        precursor_representative['total_psms'] += l.get('total_psms',1)
+            precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1))
+        precursor_representative['total_psms'] += int(l.get('total_psms',1))
 
         precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant])
 
@@ -545,10 +555,10 @@ def output_protein_level_results(best_psm):
             if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
                 pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
                 if best_psm.get('hpp_match','') == 'Yes':
-                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
-                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                    precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
             for protein in proteins:
-                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
+                precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
 
         proteins = peptide_to_protein.get(sequence_il,[])
         cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
@@ -683,8 +693,9 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg
 
     if args.output_peptides:
         with open(args.output_peptides,'w') as w:
-            header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
+            header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
             header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff']
+            header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score']
             r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore')
             r.writeheader()
             for precursor in all_precursors:

From baf1bcc8c64ff871de2fe982f71729eb4ddd3bb5 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Sun, 10 Jul 2022 14:57:38 -0700
Subject: [PATCH 18/22] update outputs to be better named

---
 peptide_statistics_hpp/input.xml              | 21 +++++++++----------
 peptide_statistics_hpp/result.xml             |  6 +++---
 peptide_statistics_hpp/tool.xml               |  2 +-
 .../peptide_protein_cosine.py                 |  6 +++---
 tools/peptide_statistics_hpp/read_mappings.py |  2 +-
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml
index 0fb3d02..64dae31 100644
--- a/peptide_statistics_hpp/input.xml
+++ b/peptide_statistics_hpp/input.xml
@@ -45,8 +45,8 @@
 			<validator type="float" minimum="0"/>
 		</parameter>
 
-		<parameter name="annotated_ions_threshold" label="Annotated Ions Threshold">
-			<default value="6"/>
+		<parameter name="annotated_ions_threshold" label="Peptide Breaks Threshold">
+			<default value="5"/>
 			<validator type="integer" minimum="0"/>
 		</parameter>
 
@@ -84,22 +84,21 @@
 
 		<parameter name="main_fdr" label="FDR for HPP proteins">
 			<options>
-				<option label="Common HPP" value="common_hpp" />
-				<option label="Picked HPP" value="picked_hpp" />
-				<option label="Traditional" value="traditional" />
+				<option label="Traditional" value="common_hpp" />
+				<option label="Picked" value="picked_hpp" />
 			</options>
 			<default value="common_hpp" />
 			<validator type="set" />
 		</parameter>
 
-		<parameter name="leftover_fdr" label="FDR for leftover proteins (if using non-traditional FDR for HPP stage)">
+		<!--<parameter name="leftover_fdr" label="FDR for leftover proteins">
 			<options>
-				<option label="Common" value="common_leftover" />
+				<option label="Traditional" value="common_leftover" />
 				<option label="Picked" value="picked_leftover" />
 			</options>
 			<default value="picked_leftover" />
 			<validator type="set" />
-		</parameter>
+		</parameter>-->
 
 		<parameter name="use_variants" label="Variant-level analysis">
 			<options>
@@ -136,7 +135,7 @@
 		</parameter>
 
 		<parameter name="min_mz" label="Minimum m/z">
-			<default value="232"/>
+			<default value="0"/>
 			<validator type="float" minimum="0"/>
 		</parameter>
 
@@ -396,14 +395,14 @@
 					<cell colspan="3">
 						<input type="select" parameter="main_fdr"/>
 					</cell>
-					<cell>
+					<!-- <cell>
 						<label>
 							<content parameter="leftover_fdr"/>
 						</label>
 					</cell>
 					<cell colspan="3">
 						<input type="select" parameter="leftover_fdr"/>
-					</cell>
+					</cell> -->
 				</row>
 				<row>
 					<cell>
diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index 152981b..de2ceab 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -216,7 +216,7 @@
                            <column type="slideExpandableHidden" characterhidelimit="18" field="protein" label="Protein Accession" width="4"/>
 
                 <column type="float" precision="4" field="precursor_fdr" label="Precursor FDR" width="4" tooltip=""/>
-                <column type="float" precision="4" field="picked_protein_fdr" label="Protein FDR (Canonical)" width="4" tooltip=""/>
+                <column type="float" precision="4" field="picked_protein_fdr" label="Protein FDR (Leftover)" width="4" tooltip=""/>
                 <column type="float" precision="4"  field="hpp_protein_fdr" label="Protein FDR (HPP 2+)" width="4" tooltip=""/>
                 <column type="expandable" field="gene" label="Gene" width="4" tooltip="Gene for the protein"/>
                 <column type="text" field="protein_type" label="Protein Type" width="4"/>
@@ -413,8 +413,8 @@
           
           <!-- <column type="float" precision="4"  field="all_score" label="Score (Canonical old)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="all_fdr" label="FDR (Canonical old)" width="4" tooltip=""/> -->
-          <column type="float" precision="4"  field="picked_score" label="Score (Picked)" width="4" tooltip=""/>
-          <column type="float" precision="4"  field="picked_fdr" label="FDR (Picked)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="picked_score" label="Score (Canonical Picked)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="picked_fdr" label="FDR (Canonical Picked)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="common_score" label="Score (Canonical)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="common_fdr" label="FDR (Canonical)" width="4" tooltip=""/>
                    <column type="float" precision="4"  field="leftover_score" label="Score (Leftover)" width="4" tooltip=""/>
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index a13331a..64ebe25 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -187,7 +187,7 @@
           <arg option="-picked_protein_fdr_comparison"        valueRef="@picked_protein_fdr_comparison"/>
           <arg option="-filter_rows_fdr"        valueRef="@filter_rows_fdr"/>
           <arg option="-main_fdr"        valueRef="@main_fdr"/>
-          <arg option="-leftover_fdr"        valueRef="@leftover_fdr"/>
+          <arg option="-leftover_fdr"        valueRef="picked_leftover"/>
           <arg option="-nextprot_releases" pathRef="nextprot_releases"/>
           <arg option="-msv_to_pxd_mapping" pathRef="msv_to_pxd_mapping"/>
           <arg option="-external_provenance"  valueRef="external_provenance"/>
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index d4a2d2b..db9ff2a 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -636,7 +636,7 @@ def output_protein_level_results(best_psm):
 
     seen_picked = set()
 
-    def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggregation_func = lambda xs: sum(xs)):
+    def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregation_func = lambda xs: sum(xs)):
         used_precursors = []
         for precursor in sorted(precursor_list, key = lambda x: x[0], reverse = True):
             found = False
@@ -701,7 +701,7 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg
             for precursor in all_precursors:
                 proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != '']
                 if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')):
-                    precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1)
+                    precursor['picked_protein_fdr'] = min(leftover_fdr_dict.get(proteins[0],1),1)
                     precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1)
                 else:
                     precursor['picked_protein_fdr'] = 1
@@ -835,7 +835,7 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg
             }
 
             pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison 
-            pass_picked_fdr, pass_hpp_fdr = protein_dict['picked_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr 
+            pass_picked_fdr, pass_hpp_fdr = protein_dict['leftover_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr 
             if args.main_fdr == 'traditional':
                 pass_picked_fdr, pass_hpp_fdr = protein_dict['common_fdr'] <= args.picked_protein_fdr, protein_dict['common_fdr'] <= args.hpp_protein_fdr 
 
diff --git a/tools/peptide_statistics_hpp/read_mappings.py b/tools/peptide_statistics_hpp/read_mappings.py
index 8fb0a9f..a64d7fb 100644
--- a/tools/peptide_statistics_hpp/read_mappings.py
+++ b/tools/peptide_statistics_hpp/read_mappings.py
@@ -36,7 +36,7 @@ def read_protein_coverage(protein_coverage_file,seen_sequences,proteome, filter
                 mapped_protein_str = l['mapped_proteins'] if 'mapped_proteins' in l else l['all_proteins_w_coords']
                 mapped_exon_str = l['mapped_exons'] if 'mapped_exons' in l else l['all_mapped_exons']
 
-                all_protein_fdr = float(l.get('picked_protein_fdr',-1))
+                all_protein_fdr = float(l.get('leftover_protein_fdr',l.get('picked_protein_fdr',-1)))
                 hpp_protein_fdr = float(l.get('hpp_protein_fdr',-1))
                 cosine = float(l.get('cosine',-1))
 

From 4b8aab5299f25ca3dba030668ed4445836b98cd1 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Mon, 11 Jul 2022 18:09:20 -0700
Subject: [PATCH 19/22] update to fix bug in EI representative

---
 peptide_statistics_hpp/result.xml             | 19 ++++++++++---------
 .../peptide_protein_cosine.py                 | 14 ++++++++++----
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index de2ceab..01c5967 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -35,14 +35,15 @@
 
           <column type="float" precision="4"  field="hpp_score" label="Score (HPP 2+)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="hpp_fdr" label="FDR (HPP 2+)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="leftover_score" label="Score (Leftover)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="leftover_fdr" label="FDR (Leftover)" width="4" tooltip=""/>
           <!-- <column type="float" precision="4"  field="all_score" label="Score (Canonical old)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="all_fdr" label="FDR (Canonical old)" width="4" tooltip=""/> -->
-          <column type="float" precision="4"  field="picked_score" label="Score (Picked)" width="4" tooltip=""/>
-          <column type="float" precision="4"  field="picked_fdr" label="FDR (Picked)" width="4" tooltip=""/>
-          <column type="float" precision="4"  field="common_score" label="Score (Canonical)" width="4" tooltip=""/>
-          <column type="float" precision="4"  field="common_fdr" label="FDR (Canonical)" width="4" tooltip=""/>
-         <column type="float" precision="4"  field="leftover_score" label="Score (Leftover)" width="4" tooltip=""/>
-          <column type="float" precision="4"  field="leftover_fdr" label="FDR (Leftover)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="picked_score" label="Score (Canonical Picked)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="picked_fdr" label="FDR (Canonical Picked)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="common_score" label="Score (Canonical Traditional)" width="4" tooltip=""/>
+          <column type="float" precision="4"  field="common_fdr" label="FDR (Canonical Traditional)" width="4" tooltip=""/>
+
           <!-- <column type="float" precision="4"  field="all_score" label="Score (Canonical)" width="4" tooltip=""/>
           <column type="float" precision="4"  field="all_fdr" label="FDR (Canonical)" width="4" tooltip=""/> -->
         <column type="float" precision="3"  field="coverage_incl_shared" label="Total Coverage" width="4" tooltip=""/>
@@ -73,7 +74,7 @@
             <column type="expandable" field="novel_peptides" label="Novel Peptides" width="20"/>
             <column type="integer" field="combined_hpp" label="#HPP Peptides (Combined)" width="5"/>
             <column type="integer" field="combined_fdr_hpp" label="#HPP Peptides (Combined, Pass FDR)" width="5"/>
-            <column type="integer" field="comparison_hpp_allfdr" label="#HPP Peptides (Reference Common FDR)" width="2"/>
+            <column type="integer" field="comparison_hpp_allfdr" label="#HPP Peptides (Reference Leftover FDR)" width="2"/>
             <column type="integer" field="comparison_hpp_hppfdr" label="#HPP Peptides (Reference HPP FDR)" width="2"/>
 
             <column type="integer" field="new_hpp" label="#HPP Peptides (Added Only)" width="2"/>
@@ -83,7 +84,7 @@
             <column type="expandable" field="supporting_peptides_w_synthetic" label="Previously-known Peptides (Synthetic)" width="20"/>
             <column type="expandable" field="novel_peptides_w_synthetic" label="Novel Peptides (Synthetic)" width="20"/>
             <column type="integer" field="combined_hpp_w_synthetic" label="#HPP Peptides (Combined) (Synthetic)" width="5"/>
-            <column type="integer" field="comparison_hpp_allfdr_w_synthetic" label="#HPP Peptides (Reference Common FDR) (Synthetic)" width="2"/>
+            <column type="integer" field="comparison_hpp_allfdr_w_synthetic" label="#HPP Peptides (Reference Leftover FDR) (Synthetic)" width="2"/>
             <column type="integer" field="comparison_hpp_hppfdr_w_synthetic" label="#HPP Peptides (Reference HPP FDR)(Synthetic)" width="2"/>
 
             <column type="integer" field="new_hpp_w_synthetic" label="#HPP Peptides (Added Only) (Synthetic)" width="2"/>
@@ -94,7 +95,7 @@
             <column type="expandable" field="supporting_peptides_w_synthetic_cosine" label="Previously-known Peptides (Synthetic + Pass Cosine Threshold)" width="20"/>
             <column type="expandable" field="novel_peptides_w_synthetic_cosine" label="Novel Peptides (Synthetic + Pass Cosine Threshold)" width="20"/>
                         <column type="integer" field="combined_hpp_w_synthetic_cosine" label="#HPP Peptides (Combined) (Synthetic + Pass Cosine Threshold)" width="5"/>
-            <column type="integer" field="comparison_hpp_allfdr_w_synthetic_cosine" label="#HPP Peptides (Reference Common FDR) (Synthetic + Pass Cosine Threshold)" width="2"/>
+            <column type="integer" field="comparison_hpp_allfdr_w_synthetic_cosine" label="#HPP Peptides (Reference Leftover FDR) (Synthetic + Pass Cosine Threshold)" width="2"/>
             <column type="integer" field="comparison_hpp_hppfdr_w_synthetic_cosine" label="#HPP Peptides (Reference HPP FDR) (Synthetic + Pass Cosine Threshold)" width="2"/>
             <column type="integer" field="new_hpp_w_synthetic_cosine" label="#HPP Peptides (Added Only) (Synthetic + Pass Cosine Threshold)" width="2"/>
             <column type="float" field="total_coverage_w_synthetic_cosine" label="Coverage (Total) (Synthetic + Pass Cosine Threshold)" width="2"/>
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index db9ff2a..53e746c 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -333,10 +333,16 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi']
             precursor_representative['score'] = float(l['score'])
             update_peptidoform = True
-            #consider best EI And matched ions over all representatives
-            if potential_psm_gain or float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
-                precursor_representative['explained_intensity'] = l['explained_intensity']
-                precursor_representative['matched_ions'] = l['matched_ions']
+        
+        #best EI does not pass #breaks threshold but current one does
+        if not best_pass_by and this_pass_by:
+            precursor_representative['explained_intensity'] = l['explained_intensity']
+            precursor_representative['matched_ions'] = l['matched_ions']
+        #two situations, either both pass #threshold or both fail #threshold
+        elif best_pass_by is this_pass_by and float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
+            precursor_representative['explained_intensity'] = l['explained_intensity']
+            precursor_representative['matched_ions'] = l['matched_ions']
+        #otherwise best already passed #threshold, so we skip it
 
         if best_cosine and float(l['cosine']) >= 0 and l['synthetic_usi'] != 'N/A':
             precursor_representative['cosine_filename'] = l['filename'] if from_psm else l['cosine_filename']

From f9c160d366dcf765de3831e6fe46e95841298776 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Fri, 15 Jul 2022 11:31:24 -0700
Subject: [PATCH 20/22] update peptide protein cosine

---
 tools/peptide_statistics_hpp/peptide_protein_cosine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 53e746c..06cd849 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -331,7 +331,7 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
             precursor_representative['database_filename'] = l['filename'] if from_psm else l['database_filename']
             precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan']
             precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi']
-            precursor_representative['score'] = float(l['score'])
+            precursor_representative['score'] = score
             update_peptidoform = True
         
         #best EI does not pass #breaks threshold but current one does

From 87388477434b96c728acaab5cc3199fe7ba2f156 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Fri, 5 Aug 2022 10:05:09 -0700
Subject: [PATCH 21/22] changes for window filters

---
 peptide_statistics_hpp/result.xml             |  4 ++-
 .../cosine_to_synthetics.py                   |  4 ++-
 .../peptide_protein_cosine.py                 | 25 +++++++++++++------
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index 01c5967..fbcf4ac 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -55,7 +55,7 @@
                 <parameter name="USETASK" value="True"/>
                 <parameter name="REQUESTPARAMETER=view" value="view_added_peptides"/>
                 <parameter name="HASHPARAMTER=protein_input" value="[protein]"/>
-                <parameter name="HASHPARAMTER=hpp_match_input" value="True"/>
+                <parameter name="HASHPARAMTER=hpp_match_input" value="Yes"/>
                 <parameter name="HASHPARAMTER=explained_intensity_lowerinput" value="[explained_intensity_cutoff]"/>
                 <parameter name="LABEL" value="View HPP Compliant Peptides"/>
                 <parameter name="PASS_URL" value="True"/>
@@ -161,6 +161,7 @@
             <column autoHideSentinel="true" type="text" field="synthetic_usi" label="Synthetic Match" width="20"/>
             <!-- <column type="text" field="synthetic_filename" label="Synthetic Filename" width="20"/>
             <column type="integer" field="synthetic_scan" label="Synthetic Scan" width="4"/> -->
+            <column type="float" precision="4"  field="cosine" label="Cosine to Synthetic" width="4" tooltip=""/>
             <column type="integer" field="canonical_matches" label="#SAAP protein matches" width="1"/>
             <column type="text" field="gene_unique" label="Gene Unique" width="2"/>
             <column type="integer" field="exon_junctions_covered" label="Exon Junctions Covered" width="1"/>
@@ -246,6 +247,7 @@
                 <column type="float" precision="3"  field="explained_intensity" label="Explained Intensity" width="6"/>
                 <column type="integer" field="matched_ions" label="Matched Ions" width="6"/>
                 <!-- <column type="text" field="pass" label="Above Library Threshold" width="6"/> -->
+                <column type="float" precision="4"  field="cosine" label="Cosine to Synthetic" width="4" tooltip=""/>
                 <column type="float" precision="4"  field="best_overall_psm_score" label="Best PSM Score" width="4" tooltip=""/>
                 <column type="integer" field="filtered_psms" label="#PSMs - Passing Quality Filters" width="4" tooltip=""/>
                 <column type="integer" field="total_psms" label="#PSMs - All" width="4" tooltip=""/>
diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index edfb8c3..13d6e35 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -94,7 +94,9 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s
             fragment_tolerance,
             precursor_filter_window=1.5,
             low_mass_filter=low_mass_filter,
-            min_snr=min_snr,
+            min_snr=0,
+            window_filter_size = 50,
+            window_filter_top_peaks = 8,
             num_top_unannotated_envelopes_to_remove=2,
             isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs
     )
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 06cd849..022c059 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -570,8 +570,17 @@ def output_protein_level_results(best_psm):
         cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
         output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
 
-        if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes':
-            is_hpp = pep_mapping_info[sequence]['hpp']
+        if len(cannonical_proteins) <= 1:
+            if row_pass_filters(best_psm):
+                for dataset in best_psm['datasets']:
+                    all_datasets.add(dataset)
+                    datasets_per_sequence[sequence_il].add(dataset)
+                for task in best_psm['tasks']:
+                    all_tasks.add(task)
+                    tasks_per_sequence[sequence_il].add(task)
+
+        if len(cannonical_proteins) <= 1 :
+            is_hpp = pep_mapping_info[sequence]['hpp'] and best_psm.get('hpp_match','') == 'Yes'
             match = False
             has_synthetic = False
             has_synthetic_cosine = False
@@ -583,12 +592,12 @@ def output_protein_level_results(best_psm):
                         if float(best_psm['cosine']) >= args.cosine_cutoff:
                             has_synthetic_cosine = True
                     match = True
-                    for dataset in best_psm['datasets']:
-                        all_datasets.add(dataset)
-                        datasets_per_sequence[sequence_il].add(dataset)
-                    for task in best_psm['tasks']:
-                        all_tasks.add(task)
-                        tasks_per_sequence[sequence_il].add(task)
+                    # for dataset in best_psm['datasets']:
+                    #     all_datasets.add(dataset)
+                    #     datasets_per_sequence[sequence_il].add(dataset)
+                    # for task in best_psm['tasks']:
+                    #     all_tasks.add(task)
+                    #     tasks_per_sequence[sequence_il].add(task)
             if len(proteins) == 1:
                 if row_pass_filters(best_psm):
                     is_isoform_unique = True

From 9c5e60f5cf38aad965c37997c1ff32db2a24e597 Mon Sep 17 00:00:00 2001
From: benpullman <bp2@icloud.com>
Date: Fri, 12 Aug 2022 13:11:59 -0700
Subject: [PATCH 22/22] changes to add FDR to comparison, should be discarded

---
 peptide_statistics_hpp/binding.xml            |   1 +
 peptide_statistics_hpp/flow.xml               |   1 +
 peptide_statistics_hpp/tool.xml               |   2 +
 .../cosine_to_synthetics.py                   |   1 +
 .../peptide_protein_cosine.py                 | 185 ++++++++++--------
 5 files changed, 109 insertions(+), 81 deletions(-)

diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml
index 827e7de..12722e8 100644
--- a/peptide_statistics_hpp/binding.xml
+++ b/peptide_statistics_hpp/binding.xml
@@ -92,6 +92,7 @@
 
     <bind action="peptide_protein_cosine" tool="peptide_protein_cosine">
         <inputAsRequirement port="kb_pep"                         requirement="kb_pep"/>
+        <inputAsRequirement port="peptide_coverage_merged_external_compare"                         requirement="peptide_coverage_merged_external_compare"/>
         <inputAsRequirement port="fastadb"                  requirement="fastadb"/>
         <inputAsRequirement port="con_fastadb"                  requirement="con_fastadb"/>
         <inputAsRequirement port="novel_psms_w_cosine"       requirement="novel_psms_w_cosine"/>
diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml
index 374bc19..63e744e 100644
--- a/peptide_statistics_hpp/flow.xml
+++ b/peptide_statistics_hpp/flow.xml
@@ -117,6 +117,7 @@
 
 		<action name="peptide_protein_cosine">
 			<input   port="kb_pep"       			    object="kb_pep"/>
+			<input   port="peptide_coverage_merged_external_compare"  collection="peptide_coverage_comparisons"/>
 			<input   port="novel_psms_w_cosine"          collection="novel_psms_w_cosine"/>
 			<input  port="fastadb"             			collection="fastadb"/>
 			<input  port="con_fastadb"             	    collection="con_fastadb"/>
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index 64ebe25..7d30b04 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -146,6 +146,7 @@
       <require name="peptide_coverage"                type="folder"/>
       <require name="novel_psms_w_cosine_external"     type="folder"/>
       <require name="peptide_coverage_merged_external" type="folder"/>
+      <require name="peptide_coverage_merged_external_compare" type="folder"/>
       <require name="external_provenance"             type="file"/>
       <produce name="merged_novel_psms_w_cosine"      type="file" naming="explicit"  extension="tsv"/>
       <produce name="novel_peptides_w_cosine"         type="file" naming="explicit"  extension="tsv"/>
@@ -167,6 +168,7 @@
           <arg option="-input_psms_external"           valueRef="novel_psms_w_cosine_external"/>
           <arg option="-protein_coverage"     valueRef="peptide_coverage"/>
           <arg option="-protein_coverage_external"     valueRef="peptide_coverage_merged_external"/>
+          <arg option="-protein_coverage_external_compare"     valueRef="peptide_coverage_merged_external_compare"/>
           <arg option="-output_psms_flag"          valueRef="@export_psms"/>
           <arg option="-output_psms"          valueRef="merged_novel_psms_w_cosine"/>
           <arg option="-output_peptides"      valueRef="novel_peptides_w_cosine"/>
diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index 13d6e35..9f970fc 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -339,6 +339,7 @@ def main():
             psm['synthetic_filename'] = synthetic_filename
             psm['synthetic_scan'] = synthetic_scan
             psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, synthetic_peptide, psm['charge'])
+            psm['synthetic_sequence'] = synthetic_peptide
             psm['cosine'] = cosine
             ei, num_matched_peaks = explained_intensity_per_spectrum.get((psm['filename'],psm['scan']),(0,0))
             psm['explained_intensity'] = ei
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 022c059..762af4b 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -33,6 +33,7 @@ def arguments():
     parser.add_argument('--output_task_proteins_all', type = Path, help='Output Task Proteins (All)')
     parser.add_argument('--protein_coverage', type = Path, help='Added Protein Coverage')
     parser.add_argument('--protein_coverage_external', type = Path, help='Added Protein Coverage (External)')
+    parser.add_argument('--protein_coverage_external_compare', type = Path, help='Added Protein Coverage (External) - Comparison')
     parser.add_argument('--cosine_cutoff', type = float, help='Cosine Cutoff')
     parser.add_argument('--explained_intensity_cutoff', type = float, help='Explained Intensity Cutoff')
     parser.add_argument('--annotated_ions_cutoff', type = float, help='Annotated Ion Cutoff')
@@ -206,6 +207,13 @@ def main():
     comparison_picked_fdr = {}
     comparison_hpp_fdr = {}
 
+    precursors_per_protein_all = defaultdict(lambda: defaultdict(list))
+    precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list))
+    precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list))
+    
+    all_sequences = {}
+    unique_sequences = {}
+
     frequency = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
 
     with open(args.msv_to_pxd_mapping) as json_file:
@@ -216,32 +224,6 @@ def main():
             if pxd:
                 pxd_mapping[pxd] = msv
 
-
-    with open(args.comparison_pep) as f:
-        r = csv.DictReader(f, delimiter='\t')
-        for l in r:
-            il_peptide = l['demodified'].replace('I','L')
-            has_synthetic = False
-            has_synthetic_cosine = False
-            if l['protein'] in proteome.proteins:
-                
-                if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso:
-                    #peptides that are not uniquely matching will have both FDRs as 1
-                    comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1))
-                    comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1))
-
-                protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None))
-                comparison = sequences_found[il_peptide].comparison
-                if comparison != SeqOccurances(True,True,True):
-                    if float(l.get('synthetic_cosine',-1)) >= 0:
-                        has_synthetic = True
-                        if float(l['synthetic_cosine']) > args.cosine_cutoff:
-                            has_synthetic_cosine = True
-                    comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine)
-                    sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison)
-                if l['is_hpp'] == 'True':
-                    sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True)
-
     peptide_to_protein = defaultdict(list)
     peptide_to_exon_map = defaultdict(list)
 
@@ -480,7 +462,23 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
                 outdict['hpp_match'] = 'No - failed quality thresholds'
         return outdict, output_proteins
 
-    row_pass_filters = lambda best_psm: (float(best_psm['explained_intensity']) >= args.explained_intensity_cutoff or float(best_psm['cosine']) >= args.cosine_cutoff) and int(best_psm['matched_ions']) >= args.annotated_ions_cutoff
+    row_pass_filters_input = lambda best_psm, explained_intensity_cutoff, cosine_cutoff, annotated_ions_cutoff: (
+            float(best_psm['explained_intensity']) >= explained_intensity_cutoff or 
+            float(best_psm['cosine']) >= cosine_cutoff
+        ) and int(best_psm['matched_ions']) >= annotated_ions_cutoff
+
+    row_pass_filters = lambda best_psm: row_pass_filters_input(
+        best_psm,
+        args.explained_intensity_cutoff,
+        args.cosine_cutoff,
+        args.annotated_ions_cutoff
+    )
+    row_pass_filters_compare = lambda best_psm: row_pass_filters_input(
+        best_psm,
+        float(best_psm['explained_intensity_cutoff']),
+        float(best_psm['cosine_cutoff']),
+        int(float(best_psm['annotated_ions_cutoff']))
+    )
 
     if args.output_psms:
         all_psms_with_score = []
@@ -546,12 +544,7 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
     if len(all_hpp_precursors) > 0:
         precursor_fdr = fdr.calculate_fdr(all_hpp_precursors)
 
-    precursors_per_protein_all = defaultdict(lambda: defaultdict(list))
-    precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list))
-    precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list))
-
-
-    def output_protein_level_results(best_psm):
+    def output_protein_level_results(best_psm,row_pass_filters,update_mappings = True):
 
         sequence = best_psm['sequence_unmodified']
         sequence_il = best_psm['sequence_unmodified_il']
@@ -565,52 +558,44 @@ def output_protein_level_results(best_psm):
                 precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
             for protein in proteins:
                 precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
+        
+        if update_mappings:
+            proteins = peptide_to_protein.get(sequence_il,[])
+            cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
+            output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
 
-        proteins = peptide_to_protein.get(sequence_il,[])
-        cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
-        output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
-
-        if len(cannonical_proteins) <= 1:
-            if row_pass_filters(best_psm):
-                for dataset in best_psm['datasets']:
-                    all_datasets.add(dataset)
-                    datasets_per_sequence[sequence_il].add(dataset)
-                for task in best_psm['tasks']:
-                    all_tasks.add(task)
-                    tasks_per_sequence[sequence_il].add(task)
-
-        if len(cannonical_proteins) <= 1 :
-            is_hpp = pep_mapping_info[sequence]['hpp'] and best_psm.get('hpp_match','') == 'Yes'
-            match = False
-            has_synthetic = False
-            has_synthetic_cosine = False
-            is_isoform_unique = False
-            if len(cannonical_proteins) == 1:
-                if row_pass_filters(best_psm):
-                    if float(best_psm['cosine']) >= 0:
-                        has_synthetic = True
-                        if float(best_psm['cosine']) >= args.cosine_cutoff:
-                            has_synthetic_cosine = True
-                    match = True
-                    # for dataset in best_psm['datasets']:
-                    #     all_datasets.add(dataset)
-                    #     datasets_per_sequence[sequence_il].add(dataset)
-                    # for task in best_psm['tasks']:
-                    #     all_tasks.add(task)
-                    #     tasks_per_sequence[sequence_il].add(task)
-            if len(proteins) == 1:
-                if row_pass_filters(best_psm):
-                    is_isoform_unique = True
-            added = sequences_found[sequence_il].added
-            sequences_found[sequence_il] = sequences_found[sequence_il]._replace(
-                hpp=is_hpp,
-                isoform_unique=is_isoform_unique,
-                added = SeqOccurances(
-                    added.match or match,
-                    added.synthetic_match or has_synthetic,
-                    added.synthetic_match_cosine or has_synthetic_cosine
+            if len(cannonical_proteins) <= 1 :
+                is_hpp = pep_mapping_info[sequence]['hpp']
+                match = False
+                has_synthetic = False
+                has_synthetic_cosine = False
+                is_isoform_unique = False
+                if len(cannonical_proteins) == 1:
+                    if row_pass_filters(best_psm):
+                        if float(best_psm['cosine']) >= 0:
+                            has_synthetic = True
+                            if float(best_psm['cosine']) >= args.cosine_cutoff:
+                                has_synthetic_cosine = True
+                        match = True
+                    for dataset in best_psm['datasets']:
+                        all_datasets.add(dataset)
+                        datasets_per_sequence[sequence_il].add(dataset)
+                    for task in best_psm['tasks']:
+                        all_tasks.add(task)
+                        tasks_per_sequence[sequence_il].add(task)
+                if len(proteins) == 1:
+                    if row_pass_filters(best_psm):
+                        is_isoform_unique = True
+                added = sequences_found[sequence_il].added
+                sequences_found[sequence_il] = sequences_found[sequence_il]._replace(
+                    hpp=is_hpp,
+                    isoform_unique=is_isoform_unique,
+                    added = SeqOccurances(
+                        added.match or match,
+                        added.synthetic_match or has_synthetic,
+                        added.synthetic_match_cosine or has_synthetic_cosine
+                        )
                     )
-                )
     print("About to output sequences")
 
     all_precursors = []
@@ -626,7 +611,7 @@ def output_protein_level_results(best_psm):
             best_psm['precursor_fdr'] = min(precursor_fdr.get((sequence, charge),1),1)
             best_psm['psm_fdr'] = -1
             best_psm['synthetic_sequence'] = sequence.replace('+229.163','').replace('+229.162932','')
-            output_protein_level_results(best_psm)
+            output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True)
             best_psm['datasets'] = ';'.join(best_psm['datasets'])
             best_psm['tasks'] = ';'.join(best_psm['tasks'])
             all_precursors.append(best_psm)
@@ -635,7 +620,45 @@ def output_protein_level_results(best_psm):
         with open(args.input_peptides) as f:
             r = csv.DictReader(f, delimiter='\t')
             for best_psm in r:
-                output_protein_level_results(best_psm)
+                output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True)
+
+    for protein, seqs in precursors_per_protein_all.items():
+        unique_sequences[protein] = len(seqs)
+
+    for protein, seqs in precursors_per_protein_non_unique.items():
+        all_sequences[protein] = len(seqs)
+
+    for comparison_peptide_file in args.protein_coverage_external_compare.glob('*'):
+        with open(comparison_peptide_file) as f:
+            r = csv.DictReader(f, delimiter='\t')
+            for l in r:
+                output_protein_level_results(l, row_pass_filters=row_pass_filters_compare, update_mappings=False)
+
+    with open(args.comparison_pep) as f:
+        r = csv.DictReader(f, delimiter='\t')
+        for l in r:
+            il_peptide = l['demodified'].replace('I','L')
+            has_synthetic = False
+            has_synthetic_cosine = False
+            if l['protein'] in proteome.proteins:
+                
+                if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso:
+                    #peptides that are not uniquely matching will have both FDRs as 1
+                    comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1))
+                    comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1))
+
+                protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None))
+                comparison = sequences_found[il_peptide].comparison
+                if comparison != SeqOccurances(True,True,True):
+                    if float(l.get('synthetic_cosine',-1)) >= 0:
+                        has_synthetic = True
+                        if float(l['synthetic_cosine']) > args.cosine_cutoff:
+                            has_synthetic_cosine = True
+                    comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine)
+                    sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison)
+                if l['is_hpp'] == 'True':
+                    sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True)
+
 
     hpp_protein_w_scores = []
     hpp_score_dict = {}
@@ -845,8 +868,8 @@ def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregat
                 'common_fdr':min(1,common_fdr_dict.get(protein,1)),
                 'leftover_score':leftover_score_dict.get(protein,0),
                 'leftover_fdr':min(1,leftover_fdr_dict.get(protein,1)),
-                'num_sequences':len(precursors_per_protein_all.get(protein,[])),
-                'num_sequences_incl_shared':len(precursors_per_protein_non_unique.get(protein,[])),
+                'num_sequences':unique_sequences.get(protein,0),
+                'num_sequences_incl_shared':all_sequences.get(protein,0),
             }
 
             pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison