Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Comparison fdr #30

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update PSM counting
  • Loading branch information
benpullman committed Jul 8, 2022
commit bdb0d1f65dabeec00f4c5aa0fe59bc65e9b2e931
7 changes: 6 additions & 1 deletion peptide_statistics_hpp/result.xml
Original file line number Diff line number Diff line change
@@ -245,9 +245,14 @@
<column type="float" precision="3" field="explained_intensity" label="Explained Intensity" width="6"/>
<column type="integer" field="matched_ions" label="Matched Ions" width="6"/>
<!-- <column type="text" field="pass" label="Above Library Threshold" width="6"/> -->
<column type="float" precision="3" field="cosine" label="Best Annotated Cosine to Synthetic Spectrum Match" width="6" tooltip="Spectrum cosine using only annotated peaks from the input spectrum and closest matching synthetic."/>
<column type="float" precision="4" field="best_overall_psm_score" label="Best PSM Score" width="4" tooltip=""/>
<column type="integer" field="filtered_psms" label="#PSMs - Passing Quality Filters" width="4" tooltip=""/>
<column type="integer" field="total_psms" label="#PSMs - All" width="4" tooltip=""/>
<column type="integer" field="num_peptidoforms" label="#Peptidoforms" width="4" tooltip=""/>
<column type="integer" field="variant_number" label="Variant Number" width="4" tooltip=""/>
<column type="integer" field="canonical_matches" label="#SAAP protein matches" width="1" tooltip = "Number of proteins the peptide matches to with a single amino acid variant."/>
<column type="text" field="gene_unique" label="Gene Unique" width="2" tooltip = "Peptide matches uniquely to a single gene with a single amino acid variant (True/False)."/>
<column type="text" field="decoy" label="Decoy" width="2" tooltip = ""/>
<column type="integer" field="exon_junctions_covered" label="Exon Junctions Covered" width="1"/>
<column type="integer" field="exons_covered_no_junction" label="Complete Exons Covered" width="1"/>
<column type="integer" field="total_unique_exons_covered" label="Total Exons Covered" width="1"/>
23 changes: 17 additions & 6 deletions tools/peptide_statistics_hpp/peptide_protein_cosine.py
Original file line number Diff line number Diff line change
@@ -71,6 +71,16 @@ def unit_delta(aa1, aa2):

no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()])

def seq_theoretical_mass(sequence):
aa = ''.join([a for a in sequence if a.isalpha()])
mods = ''.join([m for m in sequence if not m.isalpha()])
if len(mods) > 0:
mods = eval(mods)
else:
mods = 0
return (theoretical_mass(aa) + mods + 1.007276035)


def theoretical_mz(sequence,charge):
aa = ''.join([a for a in sequence if a.isalpha()])
mods = ''.join([m for m in sequence if not m.isalpha()])
@@ -346,8 +356,8 @@ def update_precursor_representative(l,from_psm = True, variant_level = False, ps
precursor_representative['best_overall_psm_score'] = float(l['score'])

if (this_pass_ei or this_pass_cos) and this_pass_by:
precursor_representative['filtered_psms'] += l.get('filtered_psms',1)
precursor_representative['total_psms'] += l.get('total_psms',1)
precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1))
precursor_representative['total_psms'] += int(l.get('total_psms',1))

precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant])

@@ -545,10 +555,10 @@ def output_protein_level_results(best_psm):
if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
if best_psm.get('hpp_match','') == 'Yes':
precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
for protein in proteins:
precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mass(best_psm['sequence']),best_psm['charge']))
precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))

proteins = peptide_to_protein.get(sequence_il,[])
cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
@@ -683,8 +693,9 @@ def greedy_sequence_precursor_score(precursor_list, distance = 2.5, score_aggreg

if args.output_peptides:
with open(args.output_peptides,'w') as w:
header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff']
header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score']
r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore')
r.writeheader()
for precursor in all_precursors: