diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml
index 18189bb..12722e8 100644
--- a/peptide_statistics_hpp/binding.xml
+++ b/peptide_statistics_hpp/binding.xml
@@ -29,6 +29,9 @@
+
+
+
@@ -54,6 +57,7 @@
+
@@ -61,6 +65,7 @@
+
@@ -87,7 +92,9 @@
+
+
diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml
index efff10a..63e744e 100644
--- a/peptide_statistics_hpp/flow.xml
+++ b/peptide_statistics_hpp/flow.xml
@@ -13,6 +13,7 @@
+
@@ -41,6 +42,8 @@
+
+
@@ -60,7 +63,9 @@
-
+
+
+
@@ -68,8 +73,9 @@
-
-
+
+
+
@@ -111,8 +117,10 @@
+
-
+
+
diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml
index f30a264..64dae31 100644
--- a/peptide_statistics_hpp/input.xml
+++ b/peptide_statistics_hpp/input.xml
@@ -13,6 +13,10 @@
+
+
+
+
@@ -41,8 +45,8 @@
-
-
+
+
@@ -80,22 +84,40 @@
-
-
-
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -113,7 +135,7 @@
-
+
@@ -227,6 +249,16 @@
Proteome DB
+
+
+
+ |
+
+
+ Proteome DB - Contaminants
+
|
@@ -363,14 +395,14 @@
|
-
+
@@ -382,6 +414,24 @@
|
+
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+ |
+
diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml
index 81de135..fbcf4ac 100644
--- a/peptide_statistics_hpp/result.xml
+++ b/peptide_statistics_hpp/result.xml
@@ -35,14 +35,15 @@
+
+
-
-
-
-
-
-
+
+
+
+
+
@@ -54,7 +55,7 @@
-
+
@@ -73,7 +74,7 @@
-
+
@@ -83,7 +84,7 @@
-
+
@@ -94,7 +95,7 @@
-
+
@@ -147,7 +148,7 @@
-
+
+
@@ -216,13 +218,13 @@
-
+
-
+
@@ -245,9 +247,15 @@
-
+
+
+
+
+
+
+
@@ -408,8 +416,8 @@
-
-
+
+
diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml
index bbe1f8e..7d30b04 100644
--- a/peptide_statistics_hpp/tool.xml
+++ b/peptide_statistics_hpp/tool.xml
@@ -68,7 +68,8 @@
-
+
+
@@ -76,6 +77,7 @@
+
@@ -85,11 +87,13 @@
+
+
@@ -137,10 +141,12 @@
+
+
@@ -157,10 +163,12 @@
-
+
+
+
@@ -181,7 +189,7 @@
-
+
@@ -189,7 +197,8 @@
-
+
+
diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
index aa4496c..9f970fc 100644
--- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py
+++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py
@@ -94,8 +94,11 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s
fragment_tolerance,
precursor_filter_window=1.5,
low_mass_filter=low_mass_filter,
- isobaric_tag_type=None,
- min_snr=min_snr
+ min_snr=0,
+ window_filter_size = 50,
+ window_filter_top_peaks = 8,
+ num_top_unannotated_envelopes_to_remove=2,
+ isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs
)
ion_vector = spectrum._replace(peaks = ion_vector)
ion_vector = processing.normalize_spectrum(ion_vector)
@@ -107,16 +110,16 @@ def find_ei_and_intensity(spectrum, psm, synthetic_scans, tol, low_mass_filter,
charge = psm['charge']
tolerance = psm['tolerance'] if psm['tolerance'] else tol
spectrum = spectrum._replace(precursor_z = int(charge), annotation = processing.Annotation(sequence, None))
- matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[])
+ matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[])
spectrum_ei, spectrum_ion_vector = extract_annotated_peaks(spectrum, tolerance, low_mass_filter, min_snr)
- for synthetic_filescan, synthetic_ion_vector in matching_synthetics:
+ for synthetic_filescan, synthetic_ion_vector, synthetic_peptide in matching_synthetics:
cosine = processing.match_peaks(spectrum_ion_vector, synthetic_ion_vector, tolerance)
if not best_cosine or cosine > best_cosine[0]:
- best_cosine = (cosine,synthetic_filescan)
+ best_cosine = (cosine,synthetic_filescan,synthetic_peptide)
return spectrum_ei, best_cosine
def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, peaks_obj, spectrum_select_func):
- cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+ cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
explained_intensity_per_spectrum = {}
for scan in psms_to_consider[filename].keys():
spectrum = spectrum_select_func(peaks_obj,scan)
@@ -128,7 +131,7 @@ def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_
def process_spectrum_read_file(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, reader, spectrum_select_func,read_scan):
start_time = datetime.now()
- cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+ cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
explained_intensity_per_spectrum = {}
all_spectra = []
for i,s in enumerate(reader):
@@ -172,7 +175,7 @@ def main():
r = csv.DictReader(f, delimiter='\t')
psms_header = r.fieldnames
for l in r:
- synthetic_keys.add((l['sequence'].replace('+229.163','').replace('+229.162932',''),l['charge']))
+ synthetic_keys.add((''.join([a for a in l['sequence'] if a.isalpha()]),l['charge']))
all_psms.append(l)
try:
tolerance = float(l['frag_tol'])
@@ -195,9 +198,10 @@ def main():
with mgf.read(synthetics_file) as reader:
for i,s in enumerate(reader):
peptide = s['params'].get('seq')
+ sequence = ''.join([a for a in peptide if a.isalpha()])
charge = int(s['params'].get('charge')[0])
# print(peptide, str(charge))
- if (peptide, str(charge)) in synthetic_keys:
+ if (sequence, str(charge)) in synthetic_keys:
synthetics_loaded += 1
filename = s['params'].get('originalfile_filename',s['params'].get('provenance_filename'))
scan = s['params'].get('originalfile_scan',s['params'].get('provenance_scan'))
@@ -214,12 +218,12 @@ def main():
)
synthetic_low_mass, synthetic_min_snr = (args.low_mass_filter, args.min_snr) if args.filter_synthetics == 'Yes' else (0,0)
_, synthetic_ion_vector = extract_annotated_peaks(spectrum, 0.05, synthetic_low_mass, synthetic_min_snr)
- synthetic_scans[(peptide, str(charge))].append(((filename,scan),synthetic_ion_vector))
+ synthetic_scans[(sequence, str(charge))].append(((filename,scan),synthetic_ion_vector,peptide))
print("{}: Loaded {} synthetics".format(datetime.now().strftime("%H:%M:%S"),synthetics_loaded))
else:
print("Not loading synthetics")
- cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A')))
+ cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A'))
explained_intensity_per_spectrum = {}
min_spectra_to_load_file = 20
@@ -313,9 +317,9 @@ def main():
for scan in psms_to_consider[filename].keys():
sequence = psms_to_consider[filename][scan]['sequence']
charge = psms_to_consider[filename][scan]['charge']
- matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[])
- for synthetic_filescan, _ in matching_synthetics:
- cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan)
+ matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[])
+ for synthetic_filescan, _ , synthetic_peptide in matching_synthetics:
+ cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan,synthetic_peptide)
print("{}: About to write out PSMs".format(datetime.now().strftime("%H:%M:%S")))
@@ -324,7 +328,7 @@ def main():
w_psm = csv.DictWriter(fw_psm, delimiter = '\t', fieldnames = header)
w_psm.writeheader()
for psm in all_psms:
- cosine, best_synthetic = cosine_to_synthetic[(psm['filename'],psm['scan'])]
+ cosine, best_synthetic, synthetic_peptide = cosine_to_synthetic[(psm['filename'],psm['scan'])]
if best_synthetic[0] != 'N/A':
synthetic_filename = 'f.' + best_synthetic[0].replace('/data/massive/','')
synthetic_scan = best_synthetic[1]
@@ -334,7 +338,8 @@ def main():
psm['usi'] = make_usi(psm['filename'], psm['scan'], psm['sequence'], psm['charge'])
psm['synthetic_filename'] = synthetic_filename
psm['synthetic_scan'] = synthetic_scan
- psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, psm['sequence'].replace('+229.163','').replace('+229.162932',''), psm['charge'])
+ psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, synthetic_peptide, psm['charge'])
+ psm['synthetic_sequence'] = synthetic_peptide
psm['cosine'] = cosine
ei, num_matched_peaks = explained_intensity_per_spectrum.get((psm['filename'],psm['scan']),(0,0))
psm['explained_intensity'] = ei
diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py
index db56a11..d3457a8 100644
--- a/tools/peptide_statistics_hpp/download_latest_kb.py
+++ b/tools/peptide_statistics_hpp/download_latest_kb.py
@@ -10,7 +10,8 @@ def arguments():
parser = argparse.ArgumentParser(description='mzTab to list of peptides')
parser.add_argument('-p','--params', type = str, help='Input Parameters')
parser.add_argument('-c','--comparisons', type = Path, help='Comparison Jobs')
- parser.add_argument('-f','--proteome_fasta', type = Path, help='FASTA File')
+ parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database')
+ parser.add_argument('-m','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database')
parser.add_argument('-b','--backup_kb_pep', type = Path, help='Backup KB Peptides')
parser.add_argument('-t','--use_job_level_thresholds', type = bool, help='Use job level thresholds',default=True)
parser.add_argument('-k','--kb_pep', type = Path, help='Output KB Peptides')
@@ -65,7 +66,7 @@ def main():
try:
- proteome = mapping.add_decoys(mapping.read_uniprot(args.proteome_fasta))
+ proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
with open(args.kb_pep, 'w') as w:
r = csv.DictWriter(w, delimiter = '\t', fieldnames = header)
diff --git a/tools/peptide_statistics_hpp/map_peptides.py b/tools/peptide_statistics_hpp/map_peptides.py
index 86e3308..40d298e 100755
--- a/tools/peptide_statistics_hpp/map_peptides.py
+++ b/tools/peptide_statistics_hpp/map_peptides.py
@@ -9,6 +9,7 @@
def arguments():
parser = argparse.ArgumentParser(description='Map Peptides')
parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database')
+ parser.add_argument('-c','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database')
parser.add_argument('-e','--exon_fasta', type = Path, help='Input FASTA Exon Mapping Database')
parser.add_argument('-p','--peptide_list', type = Path, help='Peptide List')
parser.add_argument('-o','--output_folder', type = Path, help='Output Folder')
@@ -28,7 +29,7 @@ def load_peptide_list(input_peptide_filename):
def main():
args = arguments()
- proteome = mapping.read_uniprot(args.proteome_fasta)
+ proteome = mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])
proteome_with_decoys = mapping.add_decoys(proteome)
peptide_list = load_peptide_list(args.peptide_list)
diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
index 50cb0e0..762af4b 100755
--- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py
+++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py
@@ -16,7 +16,8 @@
def arguments():
parser = argparse.ArgumentParser(description='mzTab to list of peptides')
parser.add_argument('--comparison_pep', type = Path, help='Peptides to Compare')
- parser.add_argument('--fasta', type = Path, help='Input FASTA')
+ parser.add_argument('--proteome_fasta', type = Path, help='Input Proteome FASTA')
+ parser.add_argument('--contaminants_fasta', type = Path, help='Input Contaminants FASTA')
parser.add_argument('--input_psms', type = Path, help='Input PSMs')
parser.add_argument('--input_psms_external', type = Path, help='Input PSMs (External)')
parser.add_argument('--input_peptides', type = Path, help='Input PSMs (External)')
@@ -32,6 +33,7 @@ def arguments():
parser.add_argument('--output_task_proteins_all', type = Path, help='Output Task Proteins (All)')
parser.add_argument('--protein_coverage', type = Path, help='Added Protein Coverage')
parser.add_argument('--protein_coverage_external', type = Path, help='Added Protein Coverage (External)')
+ parser.add_argument('--protein_coverage_external_compare', type = Path, help='Added Protein Coverage (External) - Comparison')
parser.add_argument('--cosine_cutoff', type = float, help='Cosine Cutoff')
parser.add_argument('--explained_intensity_cutoff', type = float, help='Explained Intensity Cutoff')
parser.add_argument('--annotated_ions_cutoff', type = float, help='Annotated Ion Cutoff')
@@ -50,6 +52,9 @@ def arguments():
parser.add_argument('--library_name', type = int, help='Library Name')
parser.add_argument('--export_explorers', type = int, help='Export Explorer Tables (0/1)')
parser.add_argument('--explorers_output', type = Path, help='Tables for Explorers')
+ parser.add_argument('--variant_output', type = int, help='Variant level outputs',default=0)
+ parser.add_argument('--hpp_protein_score_aggregation', type = str, help='HPP Protein Aggregation (max or sum')
+
if len(sys.argv) < 4:
parser.print_help()
sys.exit(1)
@@ -67,6 +72,16 @@ def unit_delta(aa1, aa2):
no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()])
+def seq_theoretical_mass(sequence):
+ aa = ''.join([a for a in sequence if a.isalpha()])
+ mods = ''.join([m for m in sequence if not m.isalpha()])
+ if len(mods) > 0:
+ mods = eval(mods)
+ else:
+ mods = 0
+ return (theoretical_mass(aa) + mods + 1.007276035)
+
+
def theoretical_mz(sequence,charge):
aa = ''.join([a for a in sequence if a.isalpha()])
mods = ''.join([m for m in sequence if not m.isalpha()])
@@ -76,6 +91,10 @@ def theoretical_mz(sequence,charge):
mods = 0
return (theoretical_mass(aa) + mods + (int(charge)*1.007276035))/int(charge)
+def integer_mod_mass(sequence):
+ mods = ''.join([m for m in sequence if not m.isalpha()])
+ mods = [int(round(float(m))) if i == 0 else -int(round(float(m))) for ms in mods.split('+') for i,m in enumerate(ms.split('-')) if m != '' ]
+ return sum(mods)
def add_brackets(pep):
aa_breakpoints = []
@@ -92,7 +111,7 @@ def add_brackets(pep):
pep = pep[:breakpoint] + end_bracket + pep[breakpoint:]
return pep
-protein_type = lambda protein, proteome: 'TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform')
+protein_type = lambda protein, proteome: 'Contaminant' if proteome.proteins[protein].db == 'con' else ('TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform'))
def msv_to_pxd(msv, msv_mapping):
output_mapping = msv_mapping.get(msv,{}).get('px_accession')
@@ -163,9 +182,14 @@ def find_overlap(existing_peptides, new_peptides, protein_length, protein_pe, na
def main():
args = arguments()
+ hpp_score_aggregation = lambda xs: sum(xs) if args.hpp_protein_score_aggregation == 'sum' else max(xs)
+
representative_per_precursor = {}
+ variant_to_best_precursor = {}
+ variant_to_all_precursors = defaultdict(set)
+ variant_number = {}
- proteome = mapping.add_decoys(mapping.read_uniprot(args.fasta))
+ proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]))
all_datasets = set()
datasets_per_sequence= defaultdict(set)
@@ -183,6 +207,13 @@ def main():
comparison_picked_fdr = {}
comparison_hpp_fdr = {}
+ precursors_per_protein_all = defaultdict(lambda: defaultdict(list))
+ precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list))
+ precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list))
+
+ all_sequences = {}
+ unique_sequences = {}
+
frequency = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
with open(args.msv_to_pxd_mapping) as json_file:
@@ -193,32 +224,6 @@ def main():
if pxd:
pxd_mapping[pxd] = msv
-
- with open(args.comparison_pep) as f:
- r = csv.DictReader(f, delimiter='\t')
- for l in r:
- il_peptide = l['demodified'].replace('I','L')
- has_synthetic = False
- has_synthetic_cosine = False
- if l['protein'] in proteome.proteins:
-
- if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso:
- #peptides that are not uniquely matching will have both FDRs as 1
- comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1))
- comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1))
-
- protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None))
- comparison = sequences_found[il_peptide].comparison
- if comparison != SeqOccurances(True,True,True):
- if float(l.get('synthetic_cosine',-1)) >= 0:
- has_synthetic = True
- if float(l['synthetic_cosine']) > args.cosine_cutoff:
- has_synthetic_cosine = True
- comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine)
- sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison)
- if l['is_hpp'] == 'True':
- sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True)
-
peptide_to_protein = defaultdict(list)
peptide_to_exon_map = defaultdict(list)
@@ -233,33 +238,54 @@ def main():
latest_nextprot_release = sorted(list(nextprot_releases_pe.keys()))[-1]
nextprot_pe = nextprot_releases_pe[latest_nextprot_release]
- def update_precursor_representative(l,from_psm = True):
+ def update_precursor_representative(l,from_psm = True, variant_level = False, psm_fdr = 0):
datasets = set([d for d in l.get('datasets','').split(';') if d != ''])
tasks = set([t for t in l.get('tasks','').split(';') if t != ''])
sequence, charge = l['sequence'],l['charge']
- if not (sequence, charge) in representative_per_precursor:
+ precursor_theoretical_mz = theoretical_mz(sequence, charge)
+ integer_mods = integer_mod_mass(sequence)
+ aa_seq = ''.join([a for a in sequence if a.isalpha()])
+
+ if variant_level:
+ variant = (aa_seq, charge, integer_mods)
+ else:
+ variant = (sequence, charge)
+ update_peptidoform = False
+
+ score = float(l['score']) if psm_fdr <= 0.01 else 0
+
+ variant_to_all_precursors[variant].add((sequence, charge))
+
+ if not variant in variant_to_best_precursor:
+ current_variant_number = len(variant_number)
+ variant_number[variant] = current_variant_number
+ variant_to_best_precursor[variant] = (sequence, charge)
representative_per_precursor[(sequence, charge)] = l.copy()
precursor_representative = representative_per_precursor[(sequence, charge)]
precursor_representative['datasets'] = datasets
precursor_representative['tasks'] = tasks
+ precursor_representative['parent_mass'] = precursor_theoretical_mz
+ precursor_representative['filtered_psms'] = 0
+ precursor_representative['total_psms'] = 0
+ precursor_representative['variant_number'] = current_variant_number
if from_psm:
precursor_representative['database_filename'] = l['filename']
precursor_representative['database_scan'] = l['scan']
precursor_representative['database_usi'] = l['usi']
precursor_representative['explained_intensity'] = float(l['explained_intensity'])
precursor_representative['cosine'] = float(l['cosine'])
- precursor_representative['score'] = float(l['score'])
+ precursor_representative['score'] = score
+ precursor_representative['best_overall_psm_score'] = score
precursor_representative.pop('filename')
precursor_representative.pop('scan')
precursor_representative.pop('usi')
- precursor_representative = representative_per_precursor[(sequence, charge)]
+ precursor_representative = representative_per_precursor[variant_to_best_precursor[variant]]
precursor_representative['datasets'] |= datasets
precursor_representative['tasks'] |= tasks
best_cosine = float(l['cosine']) >= float(precursor_representative['cosine'])
- best_score = float(l['score']) >= float(precursor_representative['score'])
this_pass_ei = float(l['explained_intensity']) >= args.explained_intensity_cutoff
this_pass_cos = float(l['cosine']) >= args.cosine_cutoff
@@ -268,6 +294,8 @@ def update_precursor_representative(l,from_psm = True):
best_pass_cos = float(precursor_representative['cosine']) >= args.cosine_cutoff
best_pass_by = int(precursor_representative['matched_ions']) >= args.annotated_ions_cutoff
+ best_score = float(l['explained_intensity']) >= float(precursor_representative['explained_intensity']) if score == float(precursor_representative['score']) else score > float(precursor_representative['score'])
+
# we want to find the spectrum with the best score,
# but sometimes the highest scoring spectrum does not pass the filters
@@ -285,11 +313,18 @@ def update_precursor_representative(l,from_psm = True):
precursor_representative['database_filename'] = l['filename'] if from_psm else l['database_filename']
precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan']
precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi']
- precursor_representative['score'] = float(l['score'])
- #consider best EI And matched ions over all representatives
- if float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
- precursor_representative['explained_intensity'] = l['explained_intensity']
- precursor_representative['matched_ions'] = l['matched_ions']
+ precursor_representative['score'] = score
+ update_peptidoform = True
+
+ #best EI does not pass #breaks threshold but current one does
+ if not best_pass_by and this_pass_by:
+ precursor_representative['explained_intensity'] = l['explained_intensity']
+ precursor_representative['matched_ions'] = l['matched_ions']
+ #two situations, either both pass #threshold or both fail #threshold
+ elif best_pass_by is this_pass_by and float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)):
+ precursor_representative['explained_intensity'] = l['explained_intensity']
+ precursor_representative['matched_ions'] = l['matched_ions']
+ #otherwise best already passed #threshold, so we skip it
if best_cosine and float(l['cosine']) >= 0 and l['synthetic_usi'] != 'N/A':
precursor_representative['cosine_filename'] = l['filename'] if from_psm else l['cosine_filename']
@@ -305,6 +340,21 @@ def update_precursor_representative(l,from_psm = True):
else:
precursor_representative['cosine_score_match'] = 'No'
+ if best_score:
+ precursor_representative['best_overall_psm_score'] = float(l['score'])
+
+ if (this_pass_ei or this_pass_cos) and this_pass_by:
+ precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1))
+ precursor_representative['total_psms'] += int(l.get('total_psms',1))
+
+ precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant])
+
+ if update_peptidoform:
+ representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant])
+ variant_to_best_precursor[variant] = (sequence, charge)
+
+ return variant_number[variant]
+
def update_mappings(protein_coverage_file,update_precursor_representatives):
if(protein_coverage_file.is_file()):
print("{}: Loading {} ({} cumulative peptides @ {} peptides/second)".format(datetime.now().strftime("%H:%M:%S"),protein_coverage_file,len(pep_mapping_info),len(pep_mapping_info)/(1+(datetime.now()-start_time).seconds)))
@@ -315,7 +365,7 @@ def update_mappings(protein_coverage_file,update_precursor_representatives):
protein_mapping[protein].update(peptide_mapping)
if update_precursor_representatives:
for l in output_peptides:
- update_precursor_representative(l,False)
+ update_precursor_representative(l,False,variant_level=args.variant_output==1)
start_time = datetime.now()
@@ -412,13 +462,29 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
outdict['hpp_match'] = 'No - failed quality thresholds'
return outdict, output_proteins
- row_pass_filters = lambda best_psm: (float(best_psm['explained_intensity']) >= args.explained_intensity_cutoff or float(best_psm['cosine']) >= args.cosine_cutoff) and int(best_psm['matched_ions']) >= args.annotated_ions_cutoff
+ row_pass_filters_input = lambda best_psm, explained_intensity_cutoff, cosine_cutoff, annotated_ions_cutoff: (
+ float(best_psm['explained_intensity']) >= explained_intensity_cutoff or
+ float(best_psm['cosine']) >= cosine_cutoff
+ ) and int(best_psm['matched_ions']) >= annotated_ions_cutoff
+
+ row_pass_filters = lambda best_psm: row_pass_filters_input(
+ best_psm,
+ args.explained_intensity_cutoff,
+ args.cosine_cutoff,
+ args.annotated_ions_cutoff
+ )
+ row_pass_filters_compare = lambda best_psm: row_pass_filters_input(
+ best_psm,
+ float(best_psm['explained_intensity_cutoff']),
+ float(best_psm['cosine_cutoff']),
+ int(float(best_psm['annotated_ions_cutoff']))
+ )
if args.output_psms:
all_psms_with_score = []
all_psm_rows = []
with open(args.output_psms,'w') as w:
- header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
+ header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','variant_number','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
o = csv.DictWriter(w, delimiter='\t',fieldnames = header, restval='N/A', extrasaction='ignore')
o.writeheader()
for input_psm in args.input_psms.glob('*'):
@@ -446,8 +512,11 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
l.update(protein_info_dict)
proteins = l['protein'].split(' ###')[0].split(';')
# PSM-level FDR was inefficient at this scale - need to rethink
- #if row_pass_filters(l):
- all_psms_with_score.append(fdr.ScoredElement(l['usi'],'XXX_' in proteins[0],l['score']))
+ all_targets = [p for p in proteins if 'XXX_' not in p]
+ is_decoy = len(all_targets)==0
+ l['decoy'] = is_decoy
+ if row_pass_filters(l):
+ all_psms_with_score.append(fdr.ScoredElement(l['usi'],is_decoy,l['score']))
l.pop('mapped_proteins')
l.pop('hpp')
l.pop('len')
@@ -457,10 +526,9 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
for l in all_psm_rows:
l['psm_fdr'] = psm_fdr.get(l['usi'],1)
+ l['variant_number'] = update_precursor_representative(l, psm_fdr = l['psm_fdr'], variant_level=args.variant_output==1)
if args.output_psms_flag == "1" or (args.output_psms_flag == "0.5" and row_pass_filters(l)):
o.writerow(l)
- if l['psm_fdr'] <= 0.01:
- update_precursor_representative(l)
print("About to calculate precursor FDR")
@@ -469,69 +537,65 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found,
for (sequence, charge), best_psm in representative_per_precursor.items():
all_targets = [p for ps in best_psm['protein'].split(' ###') for p in ps.split(';') if 'XXX_' not in p]
- if best_psm['protein'] != '':
+ if best_psm['protein'] != '' and best_psm['score'] > 0:
all_hpp_precursors.append(fdr.ScoredElement((sequence, charge),len(all_targets)==0,best_psm['score']))
precursor_fdr = {}
if len(all_hpp_precursors) > 0:
precursor_fdr = fdr.calculate_fdr(all_hpp_precursors)
- precursors_per_protein_all = defaultdict(lambda: defaultdict(list))
- precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list))
- precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list))
-
-
- def output_protein_level_results(best_psm):
+ def output_protein_level_results(best_psm,row_pass_filters,update_mappings = True):
sequence = best_psm['sequence_unmodified']
sequence_il = best_psm['sequence_unmodified_il']
proteins = [p for p in best_psm['protein'].split(' ###')[0].split(';') if p != '']
if row_pass_filters(best_psm):
- if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and 'Canonical' in best_psm.get('protein_type',''):
+ if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')):
pos = (int(best_psm['aa_start']),int(best_psm['aa_end']))
if best_psm.get('hpp_match','') == 'Yes':
- precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
- precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
+ precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
+ precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
for protein in proteins:
- precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge']))))
-
- proteins = peptide_to_protein.get(sequence_il,[])
- cannonical_proteins = [protein for protein in proteins if proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso]
- output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
+ precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge']))
+
+ if update_mappings:
+ proteins = peptide_to_protein.get(sequence_il,[])
+ cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con']
+ output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins])
- if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes':
- is_hpp = pep_mapping_info[sequence]['hpp']
- match = False
- has_synthetic = False
- has_synthetic_cosine = False
- is_isoform_unique = False
- if len(cannonical_proteins) == 1:
- if row_pass_filters(best_psm):
- if float(best_psm['cosine']) >= 0:
- has_synthetic = True
- if float(best_psm['cosine']) >= args.cosine_cutoff:
- has_synthetic_cosine = True
- match = True
+ if len(cannonical_proteins) <= 1 :
+ is_hpp = pep_mapping_info[sequence]['hpp']
+ match = False
+ has_synthetic = False
+ has_synthetic_cosine = False
+ is_isoform_unique = False
+ if len(cannonical_proteins) == 1:
+ if row_pass_filters(best_psm):
+ if float(best_psm['cosine']) >= 0:
+ has_synthetic = True
+ if float(best_psm['cosine']) >= args.cosine_cutoff:
+ has_synthetic_cosine = True
+ match = True
for dataset in best_psm['datasets']:
all_datasets.add(dataset)
datasets_per_sequence[sequence_il].add(dataset)
for task in best_psm['tasks']:
all_tasks.add(task)
tasks_per_sequence[sequence_il].add(task)
- if len(proteins) == 1:
- if row_pass_filters(best_psm):
- is_isoform_unique = True
- added = sequences_found[sequence_il].added
- sequences_found[sequence_il] = sequences_found[sequence_il]._replace(
- hpp=is_hpp,
- isoform_unique=is_isoform_unique,
- added = SeqOccurances(
- added.match or match,
- added.synthetic_match or has_synthetic,
- added.synthetic_match_cosine or has_synthetic_cosine
+ if len(proteins) == 1:
+ if row_pass_filters(best_psm):
+ is_isoform_unique = True
+ added = sequences_found[sequence_il].added
+ sequences_found[sequence_il] = sequences_found[sequence_il]._replace(
+ hpp=is_hpp,
+ isoform_unique=is_isoform_unique,
+ added = SeqOccurances(
+ added.match or match,
+ added.synthetic_match or has_synthetic,
+ added.synthetic_match_cosine or has_synthetic_cosine
+ )
)
- )
print("About to output sequences")
all_precursors = []
@@ -547,7 +611,7 @@ def output_protein_level_results(best_psm):
best_psm['precursor_fdr'] = min(precursor_fdr.get((sequence, charge),1),1)
best_psm['psm_fdr'] = -1
best_psm['synthetic_sequence'] = sequence.replace('+229.163','').replace('+229.162932','')
- output_protein_level_results(best_psm)
+ output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True)
best_psm['datasets'] = ';'.join(best_psm['datasets'])
best_psm['tasks'] = ';'.join(best_psm['tasks'])
all_precursors.append(best_psm)
@@ -556,7 +620,45 @@ def output_protein_level_results(best_psm):
with open(args.input_peptides) as f:
r = csv.DictReader(f, delimiter='\t')
for best_psm in r:
- output_protein_level_results(best_psm)
+ output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True)
+
+ for protein, seqs in precursors_per_protein_all.items():
+ unique_sequences[protein] = len(seqs)
+
+ for protein, seqs in precursors_per_protein_non_unique.items():
+ all_sequences[protein] = len(seqs)
+
+ for comparison_peptide_file in args.protein_coverage_external_compare.glob('*'):
+ with open(comparison_peptide_file) as f:
+ r = csv.DictReader(f, delimiter='\t')
+ for l in r:
+ output_protein_level_results(l, row_pass_filters=row_pass_filters_compare, update_mappings=False)
+
+ with open(args.comparison_pep) as f:
+ r = csv.DictReader(f, delimiter='\t')
+ for l in r:
+ il_peptide = l['demodified'].replace('I','L')
+ has_synthetic = False
+ has_synthetic_cosine = False
+ if l['protein'] in proteome.proteins:
+
+ if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso:
+ #peptides that are not uniquely matching will have both FDRs as 1
+ comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1))
+ comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1))
+
+ protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None))
+ comparison = sequences_found[il_peptide].comparison
+ if comparison != SeqOccurances(True,True,True):
+ if float(l.get('synthetic_cosine',-1)) >= 0:
+ has_synthetic = True
+ if float(l['synthetic_cosine']) > args.cosine_cutoff:
+ has_synthetic_cosine = True
+ comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine)
+ sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison)
+ if l['is_hpp'] == 'True':
+ sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True)
+
hpp_protein_w_scores = []
hpp_score_dict = {}
@@ -572,19 +674,19 @@ def output_protein_level_results(best_psm):
seen_picked = set()
- def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5):
+ def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregation_func = lambda xs: sum(xs)):
used_precursors = []
for precursor in sorted(precursor_list, key = lambda x: x[0], reverse = True):
found = False
for seen_precursor in used_precursors:
- if abs(precursor[1]-seen_precursor[1]) <= mz_distance:
+ if precursor[2]==seen_precursor[2] and abs(precursor[1]-seen_precursor[1]) <= distance:
found = True
if not found:
used_precursors.append(precursor)
- return sum([p[0] for p in used_precursors])
+ return score_aggregation_func([p[0] for p in used_precursors])
for protein, precursors in precursors_per_protein_hpp.items():
- score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v)) for k,v in precursors.items()])
+ score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v,score_aggregation_func=hpp_score_aggregation)) for k,v in precursors.items()])
if score != 0:
hpp_protein_w_scores.append(fdr.ScoredElement(protein,'XXX_' in protein, score))
hpp_score_dict[protein] = score
@@ -629,14 +731,15 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5):
if args.output_peptides:
with open(args.output_peptides,'w') as w:
- header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
+ header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks']
header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff']
+ header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score']
r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore')
r.writeheader()
for precursor in all_precursors:
proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != '']
- if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and 'Canonical' in precursor.get('protein_type',''):
- precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1)
+ if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')):
+ precursor['picked_protein_fdr'] = min(leftover_fdr_dict.get(proteins[0],1),1)
precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1)
else:
precursor['picked_protein_fdr'] = 1
@@ -765,12 +868,12 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5):
'common_fdr':min(1,common_fdr_dict.get(protein,1)),
'leftover_score':leftover_score_dict.get(protein,0),
'leftover_fdr':min(1,leftover_fdr_dict.get(protein,1)),
- 'num_sequences':len(precursors_per_protein_all.get(protein,[])),
- 'num_sequences_incl_shared':len(precursors_per_protein_non_unique.get(protein,[])),
+ 'num_sequences':unique_sequences.get(protein,0),
+ 'num_sequences_incl_shared':all_sequences.get(protein,0),
}
pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison
- pass_picked_fdr, pass_hpp_fdr = protein_dict['picked_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr
+ pass_picked_fdr, pass_hpp_fdr = protein_dict['leftover_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr
if args.main_fdr == 'traditional':
pass_picked_fdr, pass_hpp_fdr = protein_dict['common_fdr'] <= args.picked_protein_fdr, protein_dict['common_fdr'] <= args.hpp_protein_fdr
diff --git a/tools/peptide_statistics_hpp/read_mappings.py b/tools/peptide_statistics_hpp/read_mappings.py
index 8fb0a9f..a64d7fb 100644
--- a/tools/peptide_statistics_hpp/read_mappings.py
+++ b/tools/peptide_statistics_hpp/read_mappings.py
@@ -36,7 +36,7 @@ def read_protein_coverage(protein_coverage_file,seen_sequences,proteome, filter
mapped_protein_str = l['mapped_proteins'] if 'mapped_proteins' in l else l['all_proteins_w_coords']
mapped_exon_str = l['mapped_exons'] if 'mapped_exons' in l else l['all_mapped_exons']
- all_protein_fdr = float(l.get('picked_protein_fdr',-1))
+ all_protein_fdr = float(l.get('leftover_protein_fdr',-1))
hpp_protein_fdr = float(l.get('hpp_protein_fdr',-1))
cosine = float(l.get('cosine',-1))
|