diff --git a/peptide_statistics_hpp/binding.xml b/peptide_statistics_hpp/binding.xml index 18189bb..12722e8 100644 --- a/peptide_statistics_hpp/binding.xml +++ b/peptide_statistics_hpp/binding.xml @@ -29,6 +29,9 @@ + + + @@ -54,6 +57,7 @@ + @@ -61,6 +65,7 @@ + @@ -87,7 +92,9 @@ + + diff --git a/peptide_statistics_hpp/flow.xml b/peptide_statistics_hpp/flow.xml index efff10a..63e744e 100644 --- a/peptide_statistics_hpp/flow.xml +++ b/peptide_statistics_hpp/flow.xml @@ -13,6 +13,7 @@ + @@ -41,6 +42,8 @@ + + @@ -60,7 +63,9 @@ - + + + @@ -68,8 +73,9 @@ - - + + + @@ -111,8 +117,10 @@ + - + + diff --git a/peptide_statistics_hpp/input.xml b/peptide_statistics_hpp/input.xml index f30a264..64dae31 100644 --- a/peptide_statistics_hpp/input.xml +++ b/peptide_statistics_hpp/input.xml @@ -13,6 +13,10 @@ + + + + @@ -41,8 +45,8 @@ - - + + @@ -80,22 +84,40 @@ - - + + + + + + + + + + + + + + + @@ -113,7 +135,7 @@ - + @@ -227,6 +249,16 @@ Proteome DB + + + + + + + Proteome DB - Contaminants + @@ -363,14 +395,14 @@ - + @@ -382,6 +414,24 @@ + + + + + + + + + + + + + + diff --git a/peptide_statistics_hpp/result.xml b/peptide_statistics_hpp/result.xml index 81de135..fbcf4ac 100644 --- a/peptide_statistics_hpp/result.xml +++ b/peptide_statistics_hpp/result.xml @@ -35,14 +35,15 @@ + + - - - - - - + + + + + @@ -54,7 +55,7 @@ - + @@ -73,7 +74,7 @@ - + @@ -83,7 +84,7 @@ - + @@ -94,7 +95,7 @@ - + @@ -147,7 +148,7 @@ - + + @@ -216,13 +218,13 @@ - + - + @@ -245,9 +247,15 @@ - + + + + + + + @@ -408,8 +416,8 @@ - - + + diff --git a/peptide_statistics_hpp/tool.xml b/peptide_statistics_hpp/tool.xml index bbe1f8e..7d30b04 100644 --- a/peptide_statistics_hpp/tool.xml +++ b/peptide_statistics_hpp/tool.xml @@ -68,7 +68,8 @@ - + + @@ -76,6 +77,7 @@ + @@ -85,11 +87,13 @@ + + @@ -137,10 +141,12 @@ + + @@ -157,10 +163,12 @@ - + + + @@ -181,7 +189,7 @@ - + @@ -189,7 +197,8 @@ - + + diff --git a/tools/peptide_statistics_hpp/cosine_to_synthetics.py b/tools/peptide_statistics_hpp/cosine_to_synthetics.py index aa4496c..9f970fc 100644 --- a/tools/peptide_statistics_hpp/cosine_to_synthetics.py +++ b/tools/peptide_statistics_hpp/cosine_to_synthetics.py @@ -94,8 +94,11 @@ def extract_annotated_peaks(spectrum, fragment_tolerance, low_mass_filter, min_s fragment_tolerance, precursor_filter_window=1.5, low_mass_filter=low_mass_filter, - isobaric_tag_type=None, - min_snr=min_snr + min_snr=0, + window_filter_size = 50, + window_filter_top_peaks = 8, + num_top_unannotated_envelopes_to_remove=2, + isobaric_tag_type='TMT 16-plex' #remove these peaks for all jobs ) ion_vector = spectrum._replace(peaks = ion_vector) ion_vector = processing.normalize_spectrum(ion_vector) @@ -107,16 +110,16 @@ def find_ei_and_intensity(spectrum, psm, synthetic_scans, tol, low_mass_filter, charge = psm['charge'] tolerance = psm['tolerance'] if psm['tolerance'] else tol spectrum = spectrum._replace(precursor_z = int(charge), annotation = processing.Annotation(sequence, None)) - matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[]) + matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[]) spectrum_ei, spectrum_ion_vector = extract_annotated_peaks(spectrum, tolerance, low_mass_filter, min_snr) - for synthetic_filescan, synthetic_ion_vector in matching_synthetics: + for synthetic_filescan, synthetic_ion_vector, synthetic_peptide in matching_synthetics: cosine = processing.match_peaks(spectrum_ion_vector, synthetic_ion_vector, tolerance) if not best_cosine or cosine > 
best_cosine[0]: - best_cosine = (cosine,synthetic_filescan) + best_cosine = (cosine,synthetic_filescan,synthetic_peptide) return spectrum_ei, best_cosine def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, peaks_obj, spectrum_select_func): - cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'))) + cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A')) explained_intensity_per_spectrum = {} for scan in psms_to_consider[filename].keys(): spectrum = spectrum_select_func(peaks_obj,scan) @@ -128,7 +131,7 @@ def process_spectrum(psms_to_consider, filename, synthetic_scans, tol, low_mass_ def process_spectrum_read_file(psms_to_consider, filename, synthetic_scans, tol, low_mass_filter, min_snr, threshold, reader, spectrum_select_func,read_scan): start_time = datetime.now() - cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'))) + cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A')) explained_intensity_per_spectrum = {} all_spectra = [] for i,s in enumerate(reader): @@ -172,7 +175,7 @@ def main(): r = csv.DictReader(f, delimiter='\t') psms_header = r.fieldnames for l in r: - synthetic_keys.add((l['sequence'].replace('+229.163','').replace('+229.162932',''),l['charge'])) + synthetic_keys.add((''.join([a for a in l['sequence'] if a.isalpha()]),l['charge'])) all_psms.append(l) try: tolerance = float(l['frag_tol']) @@ -195,9 +198,10 @@ def main(): with mgf.read(synthetics_file) as reader: for i,s in enumerate(reader): peptide = s['params'].get('seq') + sequence = ''.join([a for a in peptide if a.isalpha()]) charge = int(s['params'].get('charge')[0]) # print(peptide, str(charge)) - if (peptide, str(charge)) in synthetic_keys: + if (sequence, str(charge)) in synthetic_keys: synthetics_loaded += 1 filename = s['params'].get('originalfile_filename',s['params'].get('provenance_filename')) scan = s['params'].get('originalfile_scan',s['params'].get('provenance_scan')) @@ -214,12 +218,12 @@ def main(): ) synthetic_low_mass, synthetic_min_snr = (args.low_mass_filter, args.min_snr) if args.filter_synthetics == 'Yes' else (0,0) _, synthetic_ion_vector = extract_annotated_peaks(spectrum, 0.05, synthetic_low_mass, synthetic_min_snr) - synthetic_scans[(peptide, str(charge))].append(((filename,scan),synthetic_ion_vector)) + synthetic_scans[(sequence, str(charge))].append(((filename,scan),synthetic_ion_vector,peptide)) print("{}: Loaded {} synthetics".format(datetime.now().strftime("%H:%M:%S"),synthetics_loaded)) else: print("Not loading synthetics") - cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'))) + cosine_to_synthetic = defaultdict(lambda: (-1,('N/A','N/A'),'N/A')) explained_intensity_per_spectrum = {} min_spectra_to_load_file = 20 @@ -313,9 +317,9 @@ def main(): for scan in psms_to_consider[filename].keys(): sequence = psms_to_consider[filename][scan]['sequence'] charge = psms_to_consider[filename][scan]['charge'] - matching_synthetics = synthetic_scans.get((sequence.replace('+229.163','').replace('+229.162932',''),charge),[]) - for synthetic_filescan, _ in matching_synthetics: - cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan) + matching_synthetics = synthetic_scans.get((''.join([s for s in sequence if s.isalpha()]),charge),[]) + for synthetic_filescan, _ , synthetic_peptide in matching_synthetics: + cosine_to_synthetic[(filename,scan)] = (0,synthetic_filescan,synthetic_peptide) print("{}: About to write out PSMs".format(datetime.now().strftime("%H:%M:%S"))) @@ -324,7 +328,7 @@ def main(): 
w_psm = csv.DictWriter(fw_psm, delimiter = '\t', fieldnames = header) w_psm.writeheader() for psm in all_psms: - cosine, best_synthetic = cosine_to_synthetic[(psm['filename'],psm['scan'])] + cosine, best_synthetic, synthetic_peptide = cosine_to_synthetic[(psm['filename'],psm['scan'])] if best_synthetic[0] != 'N/A': synthetic_filename = 'f.' + best_synthetic[0].replace('/data/massive/','') synthetic_scan = best_synthetic[1] @@ -334,7 +338,8 @@ def main(): psm['usi'] = make_usi(psm['filename'], psm['scan'], psm['sequence'], psm['charge']) psm['synthetic_filename'] = synthetic_filename psm['synthetic_scan'] = synthetic_scan - psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, psm['sequence'].replace('+229.163','').replace('+229.162932',''), psm['charge']) + psm['synthetic_usi'] = make_usi(synthetic_filename, synthetic_scan, synthetic_peptide, psm['charge']) + psm['synthetic_sequence'] = synthetic_peptide psm['cosine'] = cosine ei, num_matched_peaks = explained_intensity_per_spectrum.get((psm['filename'],psm['scan']),(0,0)) psm['explained_intensity'] = ei diff --git a/tools/peptide_statistics_hpp/download_latest_kb.py b/tools/peptide_statistics_hpp/download_latest_kb.py index db56a11..d3457a8 100644 --- a/tools/peptide_statistics_hpp/download_latest_kb.py +++ b/tools/peptide_statistics_hpp/download_latest_kb.py @@ -10,7 +10,8 @@ def arguments(): parser = argparse.ArgumentParser(description='mzTab to list of peptides') parser.add_argument('-p','--params', type = str, help='Input Parameters') parser.add_argument('-c','--comparisons', type = Path, help='Comparison Jobs') - parser.add_argument('-f','--proteome_fasta', type = Path, help='FASTA File') + parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database') + parser.add_argument('-m','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database') parser.add_argument('-b','--backup_kb_pep', type = Path, help='Backup KB Peptides') parser.add_argument('-t','--use_job_level_thresholds', type = bool, help='Use job level thresholds',default=True) parser.add_argument('-k','--kb_pep', type = Path, help='Output KB Peptides') @@ -65,7 +66,7 @@ def main(): try: - proteome = mapping.add_decoys(mapping.read_uniprot(args.proteome_fasta)) + proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])) with open(args.kb_pep, 'w') as w: r = csv.DictWriter(w, delimiter = '\t', fieldnames = header) diff --git a/tools/peptide_statistics_hpp/map_peptides.py b/tools/peptide_statistics_hpp/map_peptides.py index 86e3308..40d298e 100755 --- a/tools/peptide_statistics_hpp/map_peptides.py +++ b/tools/peptide_statistics_hpp/map_peptides.py @@ -9,6 +9,7 @@ def arguments(): parser = argparse.ArgumentParser(description='Map Peptides') parser.add_argument('-f','--proteome_fasta', type = Path, help='Input FASTA Protein Database') + parser.add_argument('-c','--contaminants_fasta', type = Path, help='Input FASTA Protein Contaminants Database') parser.add_argument('-e','--exon_fasta', type = Path, help='Input FASTA Exon Mapping Database') parser.add_argument('-p','--peptide_list', type = Path, help='Peptide List') parser.add_argument('-o','--output_folder', type = Path, help='Output Folder') @@ -28,7 +29,7 @@ def load_peptide_list(input_peptide_filename): def main(): args = arguments() - proteome = mapping.read_uniprot(args.proteome_fasta) + proteome = 
mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)]) proteome_with_decoys = mapping.add_decoys(proteome) peptide_list = load_peptide_list(args.peptide_list) diff --git a/tools/peptide_statistics_hpp/peptide_protein_cosine.py b/tools/peptide_statistics_hpp/peptide_protein_cosine.py index 50cb0e0..762af4b 100755 --- a/tools/peptide_statistics_hpp/peptide_protein_cosine.py +++ b/tools/peptide_statistics_hpp/peptide_protein_cosine.py @@ -16,7 +16,8 @@ def arguments(): parser = argparse.ArgumentParser(description='mzTab to list of peptides') parser.add_argument('--comparison_pep', type = Path, help='Peptides to Compare') - parser.add_argument('--fasta', type = Path, help='Input FASTA') + parser.add_argument('--proteome_fasta', type = Path, help='Input Proteome FASTA') + parser.add_argument('--contaminants_fasta', type = Path, help='Input Contaminants FASTA') parser.add_argument('--input_psms', type = Path, help='Input PSMs') parser.add_argument('--input_psms_external', type = Path, help='Input PSMs (External)') parser.add_argument('--input_peptides', type = Path, help='Input PSMs (External)') @@ -32,6 +33,7 @@ def arguments(): parser.add_argument('--output_task_proteins_all', type = Path, help='Output Task Proteins (All)') parser.add_argument('--protein_coverage', type = Path, help='Added Protein Coverage') parser.add_argument('--protein_coverage_external', type = Path, help='Added Protein Coverage (External)') + parser.add_argument('--protein_coverage_external_compare', type = Path, help='Added Protein Coverage (External) - Comparison') parser.add_argument('--cosine_cutoff', type = float, help='Cosine Cutoff') parser.add_argument('--explained_intensity_cutoff', type = float, help='Explained Intensity Cutoff') parser.add_argument('--annotated_ions_cutoff', type = float, help='Annotated Ion Cutoff') @@ -50,6 +52,9 @@ def arguments(): parser.add_argument('--library_name', type = int, help='Library Name') parser.add_argument('--export_explorers', type = int, help='Export Explorer Tables (0/1)') parser.add_argument('--explorers_output', type = Path, help='Tables for Explorers') + parser.add_argument('--variant_output', type = int, help='Variant level outputs',default=0) + parser.add_argument('--hpp_protein_score_aggregation', type = str, help='HPP Protein Aggregation (max or sum') + if len(sys.argv) < 4: parser.print_help() sys.exit(1) @@ -67,6 +72,16 @@ def unit_delta(aa1, aa2): no_mod_il = lambda pep: ''.join([p.replace('I','L') for p in pep if p.isalpha()]) +def seq_theoretical_mass(sequence): + aa = ''.join([a for a in sequence if a.isalpha()]) + mods = ''.join([m for m in sequence if not m.isalpha()]) + if len(mods) > 0: + mods = eval(mods) + else: + mods = 0 + return (theoretical_mass(aa) + mods + 1.007276035) + + def theoretical_mz(sequence,charge): aa = ''.join([a for a in sequence if a.isalpha()]) mods = ''.join([m for m in sequence if not m.isalpha()]) @@ -76,6 +91,10 @@ def theoretical_mz(sequence,charge): mods = 0 return (theoretical_mass(aa) + mods + (int(charge)*1.007276035))/int(charge) +def integer_mod_mass(sequence): + mods = ''.join([m for m in sequence if not m.isalpha()]) + mods = [int(round(float(m))) if i == 0 else -int(round(float(m))) for ms in mods.split('+') for i,m in enumerate(ms.split('-')) if m != '' ] + return sum(mods) def add_brackets(pep): aa_breakpoints = [] @@ -92,7 +111,7 @@ def add_brackets(pep): pep = pep[:breakpoint] + end_bracket + pep[breakpoint:] return pep -protein_type = lambda protein, 
proteome: 'TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform') +protein_type = lambda protein, proteome: 'Contaminant' if proteome.proteins[protein].db == 'con' else ('TrEMBL' if proteome.proteins[protein].db == 'tr' else ('Canonical' if proteome.proteins[protein].iso == None else 'Isoform')) def msv_to_pxd(msv, msv_mapping): output_mapping = msv_mapping.get(msv,{}).get('px_accession') @@ -163,9 +182,14 @@ def find_overlap(existing_peptides, new_peptides, protein_length, protein_pe, na def main(): args = arguments() + hpp_score_aggregation = lambda xs: sum(xs) if args.hpp_protein_score_aggregation == 'sum' else max(xs) + representative_per_precursor = {} + variant_to_best_precursor = {} + variant_to_all_precursors = defaultdict(set) + variant_number = {} - proteome = mapping.add_decoys(mapping.read_uniprot(args.fasta)) + proteome = mapping.add_decoys(mapping.merge_proteomes([mapping.read_uniprot(args.proteome_fasta),mapping.read_fasta(args.contaminants_fasta)])) all_datasets = set() datasets_per_sequence= defaultdict(set) @@ -183,6 +207,13 @@ def main(): comparison_picked_fdr = {} comparison_hpp_fdr = {} + precursors_per_protein_all = defaultdict(lambda: defaultdict(list)) + precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list)) + precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list)) + + all_sequences = {} + unique_sequences = {} + frequency = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) with open(args.msv_to_pxd_mapping) as json_file: @@ -193,32 +224,6 @@ def main(): if pxd: pxd_mapping[pxd] = msv - - with open(args.comparison_pep) as f: - r = csv.DictReader(f, delimiter='\t') - for l in r: - il_peptide = l['demodified'].replace('I','L') - has_synthetic = False - has_synthetic_cosine = False - if l['protein'] in proteome.proteins: - - if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso: - #peptides that are not uniquely matching will have both FDRs as 1 - comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1)) - comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1)) - - protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None)) - comparison = sequences_found[il_peptide].comparison - if comparison != SeqOccurances(True,True,True): - if float(l.get('synthetic_cosine',-1)) >= 0: - has_synthetic = True - if float(l['synthetic_cosine']) > args.cosine_cutoff: - has_synthetic_cosine = True - comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine) - sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison) - if l['is_hpp'] == 'True': - sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True) - peptide_to_protein = defaultdict(list) peptide_to_exon_map = defaultdict(list) @@ -233,33 +238,54 @@ def main(): latest_nextprot_release = sorted(list(nextprot_releases_pe.keys()))[-1] nextprot_pe = nextprot_releases_pe[latest_nextprot_release] - def update_precursor_representative(l,from_psm = True): + def update_precursor_representative(l,from_psm = True, variant_level = False, psm_fdr = 0): datasets = set([d for d in l.get('datasets','').split(';') if d != '']) tasks = set([t for t in l.get('tasks','').split(';') if t != '']) sequence, charge = 
l['sequence'],l['charge'] - if not (sequence, charge) in representative_per_precursor: + precursor_theoretical_mz = theoretical_mz(sequence, charge) + integer_mods = integer_mod_mass(sequence) + aa_seq = ''.join([a for a in sequence if a.isalpha()]) + + if variant_level: + variant = (aa_seq, charge, integer_mods) + else: + variant = (sequence, charge) + update_peptidoform = False + + score = float(l['score']) if psm_fdr <= 0.01 else 0 + + variant_to_all_precursors[variant].add((sequence, charge)) + + if not variant in variant_to_best_precursor: + current_variant_number = len(variant_number) + variant_number[variant] = current_variant_number + variant_to_best_precursor[variant] = (sequence, charge) representative_per_precursor[(sequence, charge)] = l.copy() precursor_representative = representative_per_precursor[(sequence, charge)] precursor_representative['datasets'] = datasets precursor_representative['tasks'] = tasks + precursor_representative['parent_mass'] = precursor_theoretical_mz + precursor_representative['filtered_psms'] = 0 + precursor_representative['total_psms'] = 0 + precursor_representative['variant_number'] = current_variant_number if from_psm: precursor_representative['database_filename'] = l['filename'] precursor_representative['database_scan'] = l['scan'] precursor_representative['database_usi'] = l['usi'] precursor_representative['explained_intensity'] = float(l['explained_intensity']) precursor_representative['cosine'] = float(l['cosine']) - precursor_representative['score'] = float(l['score']) + precursor_representative['score'] = score + precursor_representative['best_overall_psm_score'] = score precursor_representative.pop('filename') precursor_representative.pop('scan') precursor_representative.pop('usi') - precursor_representative = representative_per_precursor[(sequence, charge)] + precursor_representative = representative_per_precursor[variant_to_best_precursor[variant]] precursor_representative['datasets'] |= datasets precursor_representative['tasks'] |= tasks best_cosine = float(l['cosine']) >= float(precursor_representative['cosine']) - best_score = float(l['score']) >= float(precursor_representative['score']) this_pass_ei = float(l['explained_intensity']) >= args.explained_intensity_cutoff this_pass_cos = float(l['cosine']) >= args.cosine_cutoff @@ -268,6 +294,8 @@ def update_precursor_representative(l,from_psm = True): best_pass_cos = float(precursor_representative['cosine']) >= args.cosine_cutoff best_pass_by = int(precursor_representative['matched_ions']) >= args.annotated_ions_cutoff + best_score = float(l['explained_intensity']) >= float(precursor_representative['explained_intensity']) if score == float(precursor_representative['score']) else score > float(precursor_representative['score']) + # we want to find the spectrum with the best score, # but sometimes the highest scoring spectrum does not pass the filters @@ -285,11 +313,18 @@ def update_precursor_representative(l,from_psm = True): precursor_representative['database_filename'] = l['filename'] if from_psm else l['database_filename'] precursor_representative['database_scan'] = l['scan'] if from_psm else l['database_scan'] precursor_representative['database_usi'] = l['usi'] if from_psm else l['database_usi'] - precursor_representative['score'] = float(l['score']) - #consider best EI And matched ions over all representatives - if float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): - precursor_representative['explained_intensity'] = 
l['explained_intensity'] - precursor_representative['matched_ions'] = l['matched_ions'] + precursor_representative['score'] = score + update_peptidoform = True + + #best EI does not pass #breaks threshold but current one does + if not best_pass_by and this_pass_by: + precursor_representative['explained_intensity'] = l['explained_intensity'] + precursor_representative['matched_ions'] = l['matched_ions'] + #two situations, either both pass #threshold or both fail #threshold + elif best_pass_by is this_pass_by and float(l['explained_intensity']) > float(precursor_representative.get('explained_intensity',0.0)): + precursor_representative['explained_intensity'] = l['explained_intensity'] + precursor_representative['matched_ions'] = l['matched_ions'] + #otherwise best already passed #threshold, so we skip it if best_cosine and float(l['cosine']) >= 0 and l['synthetic_usi'] != 'N/A': precursor_representative['cosine_filename'] = l['filename'] if from_psm else l['cosine_filename'] @@ -305,6 +340,21 @@ def update_precursor_representative(l,from_psm = True): else: precursor_representative['cosine_score_match'] = 'No' + if best_score: + precursor_representative['best_overall_psm_score'] = float(l['score']) + + if (this_pass_ei or this_pass_cos) and this_pass_by: + precursor_representative['filtered_psms'] += int(l.get('filtered_psms',1)) + precursor_representative['total_psms'] += int(l.get('total_psms',1)) + + precursor_representative['num_peptidoforms'] = len(variant_to_all_precursors[variant]) + + if update_peptidoform: + representative_per_precursor[(sequence, charge)] = representative_per_precursor.pop(variant_to_best_precursor[variant]) + variant_to_best_precursor[variant] = (sequence, charge) + + return variant_number[variant] + def update_mappings(protein_coverage_file,update_precursor_representatives): if(protein_coverage_file.is_file()): print("{}: Loading {} ({} cumulative peptides @ {} peptides/second)".format(datetime.now().strftime("%H:%M:%S"),protein_coverage_file,len(pep_mapping_info),len(pep_mapping_info)/(1+(datetime.now()-start_time).seconds))) @@ -315,7 +365,7 @@ def update_mappings(protein_coverage_file,update_precursor_representatives): protein_mapping[protein].update(peptide_mapping) if update_precursor_representatives: for l in output_peptides: - update_precursor_representative(l,False) + update_precursor_representative(l,False,variant_level=args.variant_output==1) start_time = datetime.now() @@ -412,13 +462,29 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, outdict['hpp_match'] = 'No - failed quality thresholds' return outdict, output_proteins - row_pass_filters = lambda best_psm: (float(best_psm['explained_intensity']) >= args.explained_intensity_cutoff or float(best_psm['cosine']) >= args.cosine_cutoff) and int(best_psm['matched_ions']) >= args.annotated_ions_cutoff + row_pass_filters_input = lambda best_psm, explained_intensity_cutoff, cosine_cutoff, annotated_ions_cutoff: ( + float(best_psm['explained_intensity']) >= explained_intensity_cutoff or + float(best_psm['cosine']) >= cosine_cutoff + ) and int(best_psm['matched_ions']) >= annotated_ions_cutoff + + row_pass_filters = lambda best_psm: row_pass_filters_input( + best_psm, + args.explained_intensity_cutoff, + args.cosine_cutoff, + args.annotated_ions_cutoff + ) + row_pass_filters_compare = lambda best_psm: row_pass_filters_input( + best_psm, + float(best_psm['explained_intensity_cutoff']), + float(best_psm['cosine_cutoff']), + int(float(best_psm['annotated_ions_cutoff'])) + ) if 
args.output_psms: all_psms_with_score = [] all_psm_rows = [] with open(args.output_psms,'w') as w: - header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] + header = ['protein','protein_full','psm_fdr','protein_type','gene','all_proteins','decoy','pe','ms_evidence','filename','scan','variant_number','sequence','sequence_unmodified','sequence_unmodified_il','charge','usi','score','modifications','pass','type','parent_mass','frag_tol','synthetic_filename','synthetic_scan','synthetic_usi','cosine','synthetic_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] o = csv.DictWriter(w, delimiter='\t',fieldnames = header, restval='N/A', extrasaction='ignore') o.writeheader() for input_psm in args.input_psms.glob('*'): @@ -446,8 +512,11 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, l.update(protein_info_dict) proteins = l['protein'].split(' ###')[0].split(';') # PSM-level FDR was inefficient at this scale - need to rethink - #if row_pass_filters(l): - all_psms_with_score.append(fdr.ScoredElement(l['usi'],'XXX_' in proteins[0],l['score'])) + all_targets = [p for p in proteins if 'XXX_' not in p] + is_decoy = len(all_targets)==0 + l['decoy'] = is_decoy + if row_pass_filters(l): + all_psms_with_score.append(fdr.ScoredElement(l['usi'],is_decoy,l['score'])) l.pop('mapped_proteins') l.pop('hpp') l.pop('len') @@ -457,10 +526,9 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, for l in all_psm_rows: l['psm_fdr'] = psm_fdr.get(l['usi'],1) + l['variant_number'] = update_precursor_representative(l, psm_fdr = l['psm_fdr'], variant_level=args.variant_output==1) if args.output_psms_flag == "1" or (args.output_psms_flag == "0.5" and row_pass_filters(l)): o.writerow(l) - if l['psm_fdr'] <= 0.01: - update_precursor_representative(l) print("About to calculate precursor FDR") @@ -469,69 +537,65 @@ def protein_info(peptide, peptide_to_protein, protein_mappings, sequences_found, for (sequence, charge), best_psm in representative_per_precursor.items(): all_targets = [p for ps in best_psm['protein'].split(' ###') for p in ps.split(';') if 'XXX_' not in p] - if best_psm['protein'] != '': + if best_psm['protein'] != '' and best_psm['score'] > 0: all_hpp_precursors.append(fdr.ScoredElement((sequence, charge),len(all_targets)==0,best_psm['score'])) precursor_fdr = {} if len(all_hpp_precursors) > 0: precursor_fdr = fdr.calculate_fdr(all_hpp_precursors) - precursors_per_protein_all = defaultdict(lambda: defaultdict(list)) - precursors_per_protein_hpp = defaultdict(lambda: defaultdict(list)) - precursors_per_protein_non_unique = defaultdict(lambda: defaultdict(list)) - - - def output_protein_level_results(best_psm): + def output_protein_level_results(best_psm,row_pass_filters,update_mappings = True): sequence = 
best_psm['sequence_unmodified'] sequence_il = best_psm['sequence_unmodified_il'] proteins = [p for p in best_psm['protein'].split(' ###')[0].split(';') if p != ''] if row_pass_filters(best_psm): - if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and 'Canonical' in best_psm.get('protein_type',''): + if float(best_psm['precursor_fdr']) <= args.precursor_fdr and len(proteins) == 1 and ('Canonical' in best_psm.get('protein_type','') or 'Contaminant' in best_psm.get('protein_type','')): pos = (int(best_psm['aa_start']),int(best_psm['aa_end'])) if best_psm.get('hpp_match','') == 'Yes': - precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge'])))) - precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge'])))) + precursors_per_protein_hpp[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) + precursors_per_protein_all[proteins[0]][pos].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) for protein in proteins: - precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),theoretical_mz(best_psm['sequence'],int(best_psm['charge'])))) - - proteins = peptide_to_protein.get(sequence_il,[]) - cannonical_proteins = [protein for protein in proteins if proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso] - output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) + precursors_per_protein_non_unique[protein][sequence_il].append((float(best_psm['score']),seq_theoretical_mass(best_psm['sequence']),best_psm['charge'])) + + if update_mappings: + proteins = peptide_to_protein.get(sequence_il,[]) + cannonical_proteins = [protein for protein in proteins if (proteome.proteins[protein].db == 'sp' and not proteome.proteins[protein].iso) or proteome.proteins[protein].db == 'con'] + output_genes = set([proteome.proteins[protein].gene if proteome.proteins[protein].gene else 'N/A' for protein in proteins]) - if len(cannonical_proteins) <= 1 and best_psm.get('hpp_match','') == 'Yes': - is_hpp = pep_mapping_info[sequence]['hpp'] - match = False - has_synthetic = False - has_synthetic_cosine = False - is_isoform_unique = False - if len(cannonical_proteins) == 1: - if row_pass_filters(best_psm): - if float(best_psm['cosine']) >= 0: - has_synthetic = True - if float(best_psm['cosine']) >= args.cosine_cutoff: - has_synthetic_cosine = True - match = True + if len(cannonical_proteins) <= 1 : + is_hpp = pep_mapping_info[sequence]['hpp'] + match = False + has_synthetic = False + has_synthetic_cosine = False + is_isoform_unique = False + if len(cannonical_proteins) == 1: + if row_pass_filters(best_psm): + if float(best_psm['cosine']) >= 0: + has_synthetic = True + if float(best_psm['cosine']) >= args.cosine_cutoff: + has_synthetic_cosine = True + match = True for dataset in best_psm['datasets']: all_datasets.add(dataset) datasets_per_sequence[sequence_il].add(dataset) for task in best_psm['tasks']: all_tasks.add(task) tasks_per_sequence[sequence_il].add(task) - if len(proteins) == 1: - if row_pass_filters(best_psm): - is_isoform_unique = True - added = sequences_found[sequence_il].added - sequences_found[sequence_il] = sequences_found[sequence_il]._replace( - hpp=is_hpp, - isoform_unique=is_isoform_unique, - added = 
SeqOccurances( - added.match or match, - added.synthetic_match or has_synthetic, - added.synthetic_match_cosine or has_synthetic_cosine + if len(proteins) == 1: + if row_pass_filters(best_psm): + is_isoform_unique = True + added = sequences_found[sequence_il].added + sequences_found[sequence_il] = sequences_found[sequence_il]._replace( + hpp=is_hpp, + isoform_unique=is_isoform_unique, + added = SeqOccurances( + added.match or match, + added.synthetic_match or has_synthetic, + added.synthetic_match_cosine or has_synthetic_cosine + ) ) - ) print("About to output sequences") all_precursors = [] @@ -547,7 +611,7 @@ def output_protein_level_results(best_psm): best_psm['precursor_fdr'] = min(precursor_fdr.get((sequence, charge),1),1) best_psm['psm_fdr'] = -1 best_psm['synthetic_sequence'] = sequence.replace('+229.163','').replace('+229.162932','') - output_protein_level_results(best_psm) + output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True) best_psm['datasets'] = ';'.join(best_psm['datasets']) best_psm['tasks'] = ';'.join(best_psm['tasks']) all_precursors.append(best_psm) @@ -556,7 +620,45 @@ def output_protein_level_results(best_psm): with open(args.input_peptides) as f: r = csv.DictReader(f, delimiter='\t') for best_psm in r: - output_protein_level_results(best_psm) + output_protein_level_results(best_psm,row_pass_filters=row_pass_filters,update_mappings=True) + + for protein, seqs in precursors_per_protein_all.items(): + unique_sequences[protein] = len(seqs) + + for protein, seqs in precursors_per_protein_non_unique.items(): + all_sequences[protein] = len(seqs) + + for comparison_peptide_file in args.protein_coverage_external_compare.glob('*'): + with open(comparison_peptide_file) as f: + r = csv.DictReader(f, delimiter='\t') + for l in r: + output_protein_level_results(l, row_pass_filters=row_pass_filters_compare, update_mappings=False) + + with open(args.comparison_pep) as f: + r = csv.DictReader(f, delimiter='\t') + for l in r: + il_peptide = l['demodified'].replace('I','L') + has_synthetic = False + has_synthetic_cosine = False + if l['protein'] in proteome.proteins: + + if proteome.proteins[l['protein']].db == 'sp' and not proteome.proteins[l['protein']].iso: + #peptides that are not uniquely matching will have both FDRs as 1 + comparison_picked_fdr[l['protein']] = min(float(l['all_protein_fdr']),comparison_picked_fdr.get(l['protein'],1)) + comparison_hpp_fdr[l['protein']] = min(float(l['hpp_protein_fdr']),comparison_hpp_fdr.get(l['protein'],1)) + + protein_mapping[l['protein']][il_peptide].add((int(l['aa_start']),int(l['aa_end']),None,None,None,None)) + comparison = sequences_found[il_peptide].comparison + if comparison != SeqOccurances(True,True,True): + if float(l.get('synthetic_cosine',-1)) >= 0: + has_synthetic = True + if float(l['synthetic_cosine']) > args.cosine_cutoff: + has_synthetic_cosine = True + comparison = SeqOccurances(True, comparison.synthetic_match or has_synthetic, comparison.synthetic_match_cosine or has_synthetic_cosine) + sequences_found[il_peptide] = sequences_found[il_peptide]._replace(comparison = comparison) + if l['is_hpp'] == 'True': + sequences_found[il_peptide] = sequences_found[il_peptide]._replace(hpp = True) + hpp_protein_w_scores = [] hpp_score_dict = {} @@ -572,19 +674,19 @@ def output_protein_level_results(best_psm): seen_picked = set() - def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5): + def greedy_sequence_precursor_score(precursor_list, distance = 3, score_aggregation_func = lambda 
xs: sum(xs)): used_precursors = [] for precursor in sorted(precursor_list, key = lambda x: x[0], reverse = True): found = False for seen_precursor in used_precursors: - if abs(precursor[1]-seen_precursor[1]) <= mz_distance: + if precursor[2]==seen_precursor[2] and abs(precursor[1]-seen_precursor[1]) <= distance: found = True if not found: used_precursors.append(precursor) - return sum([p[0] for p in used_precursors]) + return score_aggregation_func([p[0] for p in used_precursors]) for protein, precursors in precursors_per_protein_hpp.items(): - score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v)) for k,v in precursors.items()]) + score, count = mapping.non_nested_score([(*k,greedy_sequence_precursor_score(v,score_aggregation_func=hpp_score_aggregation)) for k,v in precursors.items()]) if score != 0: hpp_protein_w_scores.append(fdr.ScoredElement(protein,'XXX_' in protein, score)) hpp_score_dict[protein] = score @@ -629,14 +731,15 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5): if args.output_peptides: with open(args.output_peptides,'w') as w: - header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction', 'exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] + header = ['picked_protein_fdr','hpp_protein_fdr','precursor_fdr','psm_fdr','protein_full','protein','protein_type','gene','decoy','all_proteins','pe','ms_evidence','aa_total','database_filename','database_scan','database_usi','sequence','sequence_unmodified','sequence_unmodified_il','charge','score','modifications','pass','type','parent_mass','cosine_filename','cosine_scan','cosine_usi','synthetic_filename','synthetic_scan','synthetic_usi','synthetic_sequence','cosine','synthetic_match','cosine_score_match','explained_intensity','matched_ions','hpp_match','gene_unique','canonical_matches','all_proteins_w_coords','aa_start','aa_end','frag_tol', 'total_unique_exons_covered', 'exons_covered_no_junction','exon_junctions_covered', 'all_mapped_exons','datasets','tasks'] header += ['precursor_fdr_cutoff', 'picked_protein_fdr_cutoff', 'hpp_protein_fdr_cutoff', 'cosine_cutoff', 'explained_intensity_cutoff', 'annotated_ions_cutoff'] + header += ['num_peptidoforms','filtered_psms','total_psms','variant_number','best_overall_psm_score'] r = csv.DictWriter(w, delimiter = '\t', fieldnames = header, restval='N/A', extrasaction='ignore') r.writeheader() for precursor in all_precursors: proteins = [p for p in precursor['protein'].split(' ###')[0].split(';') if p != ''] - if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and 'Canonical' in precursor.get('protein_type',''): - precursor['picked_protein_fdr'] = min(picked_fdr_dict.get(proteins[0],1),1) + if float(precursor['precursor_fdr']) < 1 and len(proteins) == 1 and ('Canonical' in precursor.get('protein_type','') or 'Contaminant' in precursor.get('protein_type','')): + 
precursor['picked_protein_fdr'] = min(leftover_fdr_dict.get(proteins[0],1),1) precursor['hpp_protein_fdr'] = min(hpp_fdr_dict.get(proteins[0],1),1) else: precursor['picked_protein_fdr'] = 1 @@ -765,12 +868,12 @@ def greedy_sequence_precursor_score(precursor_list, mz_distance = 2.5): 'common_fdr':min(1,common_fdr_dict.get(protein,1)), 'leftover_score':leftover_score_dict.get(protein,0), 'leftover_fdr':min(1,leftover_fdr_dict.get(protein,1)), - 'num_sequences':len(precursors_per_protein_all.get(protein,[])), - 'num_sequences_incl_shared':len(precursors_per_protein_non_unique.get(protein,[])), + 'num_sequences':unique_sequences.get(protein,0), + 'num_sequences_incl_shared':all_sequences.get(protein,0), } pass_comparison_picked_fdr,pass_comparison_hpp_fdr = comparison_picked_fdr.get(protein,1) <= args.picked_protein_fdr_comparison, comparison_hpp_fdr.get(protein,1) <= args.hpp_protein_fdr_comparison - pass_picked_fdr, pass_hpp_fdr = protein_dict['picked_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr + pass_picked_fdr, pass_hpp_fdr = protein_dict['leftover_fdr'] <= args.picked_protein_fdr, protein_dict['hpp_fdr'] <= args.hpp_protein_fdr if args.main_fdr == 'traditional': pass_picked_fdr, pass_hpp_fdr = protein_dict['common_fdr'] <= args.picked_protein_fdr, protein_dict['common_fdr'] <= args.hpp_protein_fdr diff --git a/tools/peptide_statistics_hpp/read_mappings.py b/tools/peptide_statistics_hpp/read_mappings.py index 8fb0a9f..a64d7fb 100644 --- a/tools/peptide_statistics_hpp/read_mappings.py +++ b/tools/peptide_statistics_hpp/read_mappings.py @@ -36,7 +36,7 @@ def read_protein_coverage(protein_coverage_file,seen_sequences,proteome, filter mapped_protein_str = l['mapped_proteins'] if 'mapped_proteins' in l else l['all_proteins_w_coords'] mapped_exon_str = l['mapped_exons'] if 'mapped_exons' in l else l['all_mapped_exons'] - all_protein_fdr = float(l.get('picked_protein_fdr',-1)) + all_protein_fdr = float(l.get('leftover_protein_fdr',-1)) hpp_protein_fdr = float(l.get('hpp_protein_fdr',-1)) cosine = float(l.get('cosine',-1))
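
Note on the synthetic lookups in cosine_to_synthetics.py: spectra are now keyed by the fully unmodified sequence (every non-alphabetic character stripped) instead of deleting the two hard-coded TMT tag masses, and the matched synthetic's own peptidoform is carried in the result tuple so the synthetic USI and synthetic_sequence column can be built from it. A minimal sketch of that bookkeeping, assuming hypothetical file names, scan numbers, peak lists, and a stand-in cosine in place of processing.match_peaks:

from collections import defaultdict

def bare_sequence(sequence):
    # strip modification deltas such as '+229.163' so only residue letters remain
    return ''.join(c for c in sequence if c.isalpha())

# best synthetic match per (filename, scan):
# (cosine, (synthetic filename, synthetic scan), synthetic peptidoform)
cosine_to_synthetic = defaultdict(lambda: (-1, ('N/A', 'N/A'), 'N/A'))

# synthetic spectra indexed by (unmodified sequence, charge); each entry keeps the
# synthetic's own peptidoform so the reported synthetic USI can be built from it
synthetic_scans = defaultdict(list)
synthetic_scans[(bare_sequence('DLPK+229.163PLSTK'), '2')].append(
    (('synthetic_run.mzML', '1234'), [(100.0, 1.0)], 'DLPK+229.163PLSTK'))  # hypothetical entry

query_filescan = ('query_run.mzML', '77')                    # hypothetical identification
query_sequence, query_charge = 'DLPK+229.162932PLSTK', '2'   # different tag notation, same key
for synthetic_filescan, synthetic_ion_vector, synthetic_peptide in \
        synthetic_scans.get((bare_sequence(query_sequence), query_charge), []):
    cosine = 0.95  # stand-in for processing.match_peaks(spectrum_ion_vector, synthetic_ion_vector, tol)
    if cosine > cosine_to_synthetic[query_filescan][0]:
        cosine_to_synthetic[query_filescan] = (cosine, synthetic_filescan, synthetic_peptide)

print(cosine_to_synthetic[query_filescan])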
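
Note on the contaminants database: download_latest_kb.py, map_peptides.py, and peptide_protein_cosine.py now accept a contaminants FASTA and build the proteome with mapping.merge_proteomes([mapping.read_uniprot(...), mapping.read_fasta(...)]) before adding decoys, and protein_type gained a 'Contaminant' branch for db == 'con'. The mapping module itself is not part of this diff, so the record shape and merge behaviour below are assumptions inferred from how proteome.proteins[...] is used; protein_type is simply the patched one-line lambda written out, and the accessions are illustrative only:

from collections import namedtuple

# Assumed record shape, based on how proteome.proteins[accession].db / .iso / .gene
# are used in peptide_protein_cosine.py; the real mapping module may differ.
Protein = namedtuple('Protein', ['db', 'iso', 'gene'])
Proteome = namedtuple('Proteome', ['proteins'])

def merge_proteomes_sketch(proteomes):
    # assumed behaviour of mapping.merge_proteomes: fold every input into one
    # accession -> record lookup, without letting later files overwrite earlier ones
    merged = {}
    for proteome in proteomes:
        for accession, record in proteome.proteins.items():
            merged.setdefault(accession, record)
    return Proteome(proteins=merged)

def protein_type(protein, proteome):
    # the patched one-line lambda written out
    entry = proteome.proteins[protein]
    if entry.db == 'con':
        return 'Contaminant'
    if entry.db == 'tr':
        return 'TrEMBL'
    return 'Canonical' if entry.iso is None else 'Isoform'

uniprot = Proteome(proteins={'P69905': Protein('sp', None, 'HBA1'),
                             'P69905-2': Protein('sp', '2', 'HBA1')})          # hypothetical isoform entry
contaminants = Proteome(proteins={'CON_KERATIN1': Protein('con', None, None)})  # hypothetical accession
proteome = merge_proteomes_sketch([uniprot, contaminants])
print([protein_type(p, proteome) for p in proteome.proteins])
# ['Canonical', 'Isoform', 'Contaminant']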
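
Note on the new mass helpers in peptide_protein_cosine.py: seq_theoretical_mass and integer_mod_mass both split a peptidoform string into residue letters and signed mass deltas. seq_theoretical_mass returns theoretical_mass(aa) + mods + 1.007276035 (a singly protonated mass), theoretical_mz returns (theoretical_mass(aa) + mods + charge * 1.007276035) / charge, and integer_mod_mass rounds each delta to the nearest integer so near-identical peptidoforms share one nominal modification mass. The patch parses the deltas with isalpha()/split and eval; the regex below is only an equivalent sketch of that parsing, and split_peptidoform is a helper name introduced here:

import re

def split_peptidoform(sequence):
    # split 'DLPK+229.163PLS+79.966TK' into residue letters and signed mass deltas
    residues = ''.join(c for c in sequence if c.isalpha())
    deltas = [float(m) for m in re.findall(r'[+-]\d+(?:\.\d+)?', sequence)]
    return residues, deltas

def integer_mod_mass(sequence):
    # nominal (integer-rounded) sum of the modification deltas, as in the patch
    return sum(int(round(d)) for d in split_peptidoform(sequence)[1])

print(split_peptidoform('DLPK+229.163PLS+79.966TK'))  # ('DLPKPLSTK', [229.163, 79.966])
print(integer_mod_mass('DLPK+229.163PLS+79.966TK'))   # 309
print(integer_mod_mass('PEPT-17.027IDE'))             # -17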
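
Note on variant-level grouping: with --variant_output 1, update_precursor_representative groups precursors into variants keyed by (unmodified sequence, charge, nominal modification mass), assigns each variant a stable variant_number in order of first appearance, records every peptidoform seen for the variant, and zeroes the score of PSMs above 1% PSM FDR. The real update also weighs explained intensity, cosine, and matched-ion thresholds when choosing the representative; the sketch below keeps only the best-score bookkeeping, with register_psm and variant_key as names introduced here and hypothetical peptidoforms:

import re
from collections import defaultdict

def variant_key(sequence, charge):
    # (unmodified sequence, charge, nominal modification mass) - the variant-level key
    residues = ''.join(c for c in sequence if c.isalpha())
    nominal_mods = sum(int(round(float(d))) for d in re.findall(r'[+-]\d+(?:\.\d+)?', sequence))
    return (residues, charge, nominal_mods)

variant_number = {}                      # stable id, assigned in order of first appearance
variant_to_best_precursor = {}           # variant -> best-scoring representative precursor
variant_to_all_precursors = defaultdict(set)

def register_psm(sequence, charge, score, psm_fdr, variant_level=True):
    # PSMs above 1% FDR still count as observations but contribute a score of 0,
    # mirroring `score = float(l['score']) if psm_fdr <= 0.01 else 0` in the patch
    score = score if psm_fdr <= 0.01 else 0
    variant = variant_key(sequence, charge) if variant_level else (sequence, charge)
    variant_to_all_precursors[variant].add((sequence, charge))
    if variant not in variant_to_best_precursor:
        variant_number[variant] = len(variant_number)
        variant_to_best_precursor[variant] = (sequence, charge, score)
    elif score > variant_to_best_precursor[variant][2]:
        variant_to_best_precursor[variant] = (sequence, charge, score)
    return variant_number[variant]

print(register_psm('PEPT+15.995IDEK', '2', 12.0, 0.001))       # 0
print(register_psm('PEPT+16.0IDEK', '2', 15.0, 0.001))         # 0 (same variant)
print(len(variant_to_all_precursors[('PEPTIDEK', '2', 16)]))   # 2 peptidoforms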
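
Note on the explained-intensity bookkeeping: the representative's explained_intensity and matched_ions are no longer overwritten by any higher-EI PSM. A candidate replaces the stored values only if it newly clears the matched-ion cutoff, or if both PSMs sit on the same side of the cutoff and the candidate explains more intensity. A small predicate capturing that rule (the function name is introduced here):

def keep_candidate_ion_evidence(current, candidate, annotated_ions_cutoff):
    # mirrors the patched branch structure in update_precursor_representative
    best_pass = int(current['matched_ions']) >= annotated_ions_cutoff
    this_pass = int(candidate['matched_ions']) >= annotated_ions_cutoff
    if not best_pass and this_pass:
        return True
    return best_pass is this_pass and \
        float(candidate['explained_intensity']) > float(current['explained_intensity'])

print(keep_candidate_ion_evidence({'matched_ions': 3, 'explained_intensity': 0.9},
                                  {'matched_ions': 6, 'explained_intensity': 0.4}, 5))  # True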
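
Note on the quality filters: row_pass_filters is now built on a parameterized row_pass_filters_input, and the new row_pass_filters_compare variant reads the cutoffs from the comparison row itself, so coverage files supplied via --protein_coverage_external_compare are judged against the thresholds they were generated with. The lambdas written out as functions, with a hypothetical row:

def row_pass_filters_input(best_psm, explained_intensity_cutoff, cosine_cutoff, annotated_ions_cutoff):
    # a row passes if it clears the explained-intensity OR cosine cutoff,
    # and in every case the annotated-ion count cutoff
    return ((float(best_psm['explained_intensity']) >= explained_intensity_cutoff or
             float(best_psm['cosine']) >= cosine_cutoff) and
            int(best_psm['matched_ions']) >= annotated_ions_cutoff)

def row_pass_filters_compare(best_psm):
    # external comparison rows carry the thresholds they were originally filtered with
    return row_pass_filters_input(best_psm,
                                  float(best_psm['explained_intensity_cutoff']),
                                  float(best_psm['cosine_cutoff']),
                                  int(float(best_psm['annotated_ions_cutoff'])))

row = {'explained_intensity': '0.62', 'cosine': '-1', 'matched_ions': '7',
       'explained_intensity_cutoff': '0.6', 'cosine_cutoff': '0.7', 'annotated_ions_cutoff': '6.0'}
print(row_pass_filters_compare(row))  # True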
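
Note on decoy flagging and FDR: a PSM row is now marked as a decoy only when none of its proteins is a target (every accession carries the XXX_ prefix), only rows that pass the quality filters feed fdr.ScoredElement for PSM-level FDR, and precursors whose score was zeroed are excluded from precursor FDR. The repo's fdr module is not shown in this diff, so the q-value style calculation below is only an assumed sketch of what calculate_fdr might do, with hypothetical PSM ids and accessions:

from collections import namedtuple

ScoredElement = namedtuple('ScoredElement', ['id', 'is_decoy', 'score'])

def calculate_fdr_sketch(elements):
    # assumed behaviour: walk elements from best to worst score, estimate FDR as
    # decoys/targets at each score, then enforce monotonicity from the bottom up
    running, decoys, targets = [], 0, 0
    for e in sorted(elements, key=lambda e: float(e.score), reverse=True):
        decoys, targets = decoys + e.is_decoy, targets + (not e.is_decoy)
        running.append((e.id, decoys / max(targets, 1)))
    fdrs, q = {}, 1.0
    for element_id, ratio in reversed(running):
        q = min(q, ratio)
        fdrs[element_id] = q
    return fdrs

# decoy only when *none* of the mapped proteins is a target, as in the patch:
# all_targets = [p for p in proteins if 'XXX_' not in p]; is_decoy = len(all_targets) == 0
proteins = ['XXX_Q00001']                                    # hypothetical decoy-only match
print(len([p for p in proteins if 'XXX_' not in p]) == 0)    # True
print(calculate_fdr_sketch([ScoredElement('psm1', False, 10),
                            ScoredElement('psm2', True, 8),
                            ScoredElement('psm3', False, 6)]))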
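
Note on HPP protein scoring: greedy_sequence_precursor_score now deduplicates on singly protonated theoretical mass and charge (same-charge precursors within a 3 Da window collapse to their best-scoring member, replacing the old 2.5 m/z window), and the surviving scores are combined by a pluggable function chosen via --hpp_protein_score_aggregation (sum or max). A standalone version of the patched function, with hypothetical precursor tuples:

def greedy_sequence_precursor_score(precursor_list, distance=3, score_aggregation_func=sum):
    # precursor_list holds (score, singly protonated theoretical mass, charge) tuples
    # for one peptide position; keep only the best-scoring member of any cluster of
    # same-charge precursors within `distance` Da, then aggregate the survivors
    used_precursors = []
    for precursor in sorted(precursor_list, key=lambda x: x[0], reverse=True):
        clustered = any(precursor[2] == seen[2] and abs(precursor[1] - seen[1]) <= distance
                        for seen in used_precursors)
        if not clustered:
            used_precursors.append(precursor)
    return score_aggregation_func([p[0] for p in used_precursors])

hpp_score_aggregation = max   # or sum, per --hpp_protein_score_aggregation
print(greedy_sequence_precursor_score(
    [(10.0, 800.41, '2'), (8.0, 801.15, '2'), (7.0, 800.41, '3')],  # hypothetical precursors
    score_aggregation_func=hpp_score_aggregation))                  # 10.0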