Skip to content

Commit

Permalink
Fixed #35
Browse files Browse the repository at this point in the history
  • Loading branch information
weka511 committed Oct 24, 2020
1 parent 727e2d2 commit 6f17712
Show file tree
Hide file tree
Showing 3 changed files with 391 additions and 311 deletions.
113 changes: 66 additions & 47 deletions MPRT.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,72 +16,91 @@
# MPRT Finding a Protein Motif
#
# N{P}[ST]{P}.

import os
import re
from urllib.request import urlopen
from urllib.parse import urljoin
import reference_tables as rt


# read_uniprot_as_fasta
#
# Read data from uniprot site as fasta file

def read_uniprot_as_fasta(url = 'http://www.uniprot.org/uniprot/',ID='B5ZC00'):
    """Download one entry in FASTA format and return it as text.

    Parameters:
        url   Base URL of the service; the entry is requested from
              urljoin(url, ID + '.FASTA')
        ID    Access ID of the entry to download

    Returns:
        The response body decoded as UTF-8. Bytes that cannot be decoded
        are silently dropped.
    """
    # The context manager closes the response even if read() raises,
    # so the connection is never left open.
    with urlopen(urljoin(url,ID+'.FASTA')) as resource:
        return resource.read().decode("utf-8", "ignore")

# get_protein_sequence
#
# Extract the portion of the fasta string that represents protein sequence

def get_protein_sequence(fasta):
    """Extract the protein sequence from the text of a FASTA record.

    Header lines (those starting with '>') are skipped; every other line
    has its whitespace removed and the pieces are concatenated.

    Parsing by line rather than searching for the first long run of
    amino-acid letters means that upper-case words in the header can never
    leak into the result, and sequences of any length are accepted. Every
    character of the sequence lines is kept, so positions computed on the
    result match the sequence as it appears in the record.

    Parameters:
        fasta   Text of a FASTA record (header line followed by sequence lines)

    Returns:
        The sequence as one string with no whitespace; '' if there are no
        sequence lines.
    """
    residues = []
    for line in fasta.splitlines():
        line = line.strip()
        if not line or line.startswith('>'):
            continue                      # blank line or FASTA header
        residues.append(''.join(line.split()))
    return ''.join(residues)

# mprt
#
# Input: At most 15 UniProt Protein Database access IDs.
#
# Return: For each protein possessing the N-glycosylation motif, output its given access ID followed
# by a list of locations in the protein string where the motif can be found.

def mprt(url = 'http://www.uniprot.org/uniprot/',ID='B5ZC00',motif_pattern=r'N[\s]*[^P][\s]*[ST][\s]*[^P]'):
    """Locate every occurrence of a motif in one protein.

    The protein is downloaded with read_uniprot_as_fasta and reduced to its
    sequence with get_protein_sequence before searching.

    Parameters:
        url             Base URL passed on to read_uniprot_as_fasta
        ID              Access ID of the protein
        motif_pattern   Regular expression describing the motif

    Returns:
        A dict with one entry per occurrence. The key is the pair
        (matched text, location) and the value is the 1-based location of
        the match in the protein string. Keying on the pair rather than on
        the matched text alone keeps every location when the same text
        occurs more than once; len() and .values() behave as callers expect.
    """
    protein_sequence = get_protein_sequence(read_uniprot_as_fasta(url=url,ID=ID))
    re_motif         = re.compile(motif_pattern)
    motifs           = {}
    matched          = re_motif.search(protein_sequence)
    while matched:
        # The 1-based location of this match is also the 0-based index of
        # the next character, so resuming the search there finds
        # occurrences that overlap the current one.
        location = matched.start() + 1
        motifs[(matched.group(), location)] = location
        matched = re_motif.search(protein_sequence,pos=location)

    return motifs

def read_data(file,path=r'C:\Users\Simon\bioinformatics\data'):
    """Read a list of IDs, one per line, from a text file.

    Parameters:
        file   Name of the file
        path   Directory containing the file

    Returns:
        The lines of the file with surrounding whitespace removed. Blank
        lines are skipped, so a trailing newline or empty line does not
        produce an empty ID.
    """
    with open (os.path.join(path,file)) as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]

if __name__=='__main__':
    # Command line driver: report motif locations for a list of protein IDs.
    #
    #   file       optional name of a file containing one ID per line
    #   --path     directory in which to look for file
    #   --output   write the report to this file instead of standard output
    import argparse
    import sys

    parser = argparse.ArgumentParser('MPRT')
    parser.add_argument('file', nargs='?')
    parser.add_argument('--path', default = r'C:\Users\Simon\bioinformatics\data')
    parser.add_argument('--output')
    args = parser.parse_args()

    # Without a file argument, fall back to the built-in sample IDs
    IDs = read_data(args.file,path=args.path) if args.file is not None else [
        'A2Z669',
        'B5ZC00',
        'P07204_TRBM_HUMAN',
        'P20840_SAG1_YEAST'
    ]

    # One loop serves both destinations; only a file we opened is closed.
    out = open (args.output,'w') if args.output else sys.stdout
    try:
        for ID in IDs:
            motifs = mprt(ID=ID)
            if motifs:              # proteins without the motif are not reported
                print (ID, file=out)
                print (' '.join(str(p) for p in sorted(motifs.values())), file=out)
    finally:
        if out is not sys.stdout:
            out.close()
3 changes: 3 additions & 0 deletions reference_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ def createSimpleDNASubst(match=+1,subst=1,bases='ATGC'):
weights[(bases[i],bases[j])] = +match if i==j else -subst
return weights

# get_re_protein
# Produce a regular expression to recognize a string of amino acids

def get_re_protein(min_length=1):
    """Compile a regular expression matching a run of amino acid letters.

    Parameters:
        min_length   Minimum number of consecutive letters in a match

    Returns:
        A compiled pattern matching min_length or more consecutive
        characters from A, C-I, K-W and Y.
    """
    # Inside [...] a comma is an ordinary character, not a separator, so it
    # must not appear here or the pattern would also match literal commas.
    # NOTE(review): the range K-W also admits O and U - confirm this is intended.
    return re.compile('[AC-IK-WY]{'+str(min_length)+',}')

Expand Down
Loading

0 comments on commit 6f17712

Please sign in to comment.