Fixed #35

weka511 · Oct 24, 2020 · 6f17712 · 6f17712
1 parent 727e2d2
commit 6f17712
Show file tree

Hide file tree

Showing 3 changed files with 391 additions and 311 deletions.
diff --git a/MPRT.py b/MPRT.py
@@ -16,72 +16,91 @@
 #   MPRT Finding a Protein Motif
 # 
 #  N{P}[ST]{P}.
+
+import os
 import re
 from urllib.request import urlopen
 from urllib.parse import urljoin
 import reference_tables as rt
 
-
-def read_unipro_as_fasta(url = 'http://www.uniprot.org/uniprot/',ID='B5ZC00'):
+# read_uniprot_as_fasta
+#
+# Read data from uniprot site as fasta file     
+
+def read_uniprot_as_fasta(url = 'http://www.uniprot.org/uniprot/',ID='B5ZC00'):
       resource = urlopen(urljoin(url,ID+'.FASTA'))
       return resource.read().decode("utf-8", "ignore")
 
+# get_protein_sequence
+#
+# Extract the portion of the fasta string that represents the protein sequence
+
 def get_protein_sequence(fasta):
-      pos=0
-      re_protein = rt.get_re_protein(min_length=8)
-      matched = re_protein.search(fasta,pos=pos)
-      amino_acids=[]
+      pos         = 0
+      re_protein  = rt.get_re_protein(min_length=8)
+      matched     = re_protein.search(fasta,pos=pos)
+      amino_acids = []
       while matched:
             amino_acids.append(matched.string[matched.start(0):matched.end(0)])
-            #print (pos, matched.string[matched.start(0):matched.end(0)])
-            pos = matched.end(0)+1
+            pos        = matched.end(0)+1
             re_protein = rt.get_re_protein(min_length=1)
-            matched = re_protein.search(fasta,pos=pos)
-      return ''.join(amino_acids)            
+            matched    = re_protein.search(fasta,pos=pos)
+      return ''.join(amino_acids) 
+
+# mprt
+#
+# Input: At most 15 UniProt Protein Database access IDs.
+#
+# Return: For each protein possessing the N-glycosylation motif, output its given access ID followed
+#         by a list of locations in the protein string where the motif can be found.
+
 def mprt(url = 'http://www.uniprot.org/uniprot/',ID='B5ZC00',motif_pattern='N[\s]*[^P][\s]*[ST][\s]*[^P]'):
-      fasta = read_unipro_as_fasta(url=url,ID=ID)
+      fasta            = read_uniprot_as_fasta(url=url,ID=ID)
       protein_sequence = get_protein_sequence(fasta)
-      start_protein = fasta.find('\n')
-      re_motif = re.compile(motif_pattern)
-      motifs   = {}
-      pos = 0
-      while True:
-            matched = re_motif.search(protein_sequence,pos=pos)
-            if matched:
-                  motif = matched.string[matched.start(0):matched.end(0)]
-                  pos = matched.start(0) + 1
-                  matched = re_motif.search(protein_sequence,pos=pos)
-                  motifs[motif] = pos
-            else:
-                  break
-      return motifs
+      start_protein    = fasta.find('\n')
+      re_motif         = re.compile(motif_pattern)
+      motifs           = {}
+      matched          = re_motif.search(protein_sequence,pos=0)
+      while matched:
+            motif         = matched.string[matched.start(0):matched.end(0)]
+            pos           = matched.start(0) + 1
+            motifs[motif] = pos
+            matched       = re_motif.search(protein_sequence,pos=pos)
 
+      return motifs
 
+def read_data(file,path=r'C:\Users\Simon\bioinformatics\data'):
+      with open (os.path.join(path,file)) as f:
+            return [line.strip() for line in f]
 
 if __name__=='__main__':
-      for ID in [
+      import argparse
+      parser = argparse.ArgumentParser('MPRT')
+      parser.add_argument('file', nargs='?')
+      parser.add_argument('--path', default = r'C:\Users\Simon\bioinformatics\data')
+      parser.add_argument('--output')
+      args=parser.parse_args()
+      IDs = read_data(args.file,path=args.path) if args.file != None else [
             'A2Z669',
             'B5ZC00',
             'P07204_TRBM_HUMAN',
             'P20840_SAG1_YEAST'
-            ]:
-            #'P04141_CSF2_HUMAN',
-            #'P01190_COLI_BOVIN',
-            #'P07204_TRBM_HUMAN',
-            #'Q8WW18',
-            #'Q3B391',
-            #'P0AF66',
-            #'Q5FTZ8',
-            #'P80069_A45K_MYCBO',
-            #'P37803',
-            #'Q14ID0',
-            #'P80195_MPP3_BOVIN',
-            #'P11279_LMP1_HUMAN',
-            #'A5GIU0',
-            #'A7Z201'               
-            motifs = mprt(ID=ID)
-            if len(motifs)>0:
-                  print (ID)
-                  locations = [seq for seq in motifs.values()]
-                  locations.sort()
-                  print (' '.join([str(p) for p in locations]))
+            ]
+      if args.output:
+            with open (args.output,'w') as out:
+                  for ID in IDs:    
+                        motifs = mprt(ID=ID)
+                        if len(motifs)>0:
+                              out.write (f'{ID}\n')
+                              locations = [seq for seq in motifs.values()]
+                              locations.sort()
+                              line = ' '.join([str(p) for p in locations])
+                              out.write (f'{line}\n')                  
+      else:            
+            for ID in IDs:    
+                  motifs = mprt(ID=ID)
+                  if len(motifs)>0:
+                        print (ID)
+                        locations = [seq for seq in motifs.values()]
+                        locations.sort()
+                        print (' '.join([str(p) for p in locations]))
diff --git a/reference_tables.py b/reference_tables.py
@@ -150,6 +150,9 @@ def createSimpleDNASubst(match=+1,subst=1,bases='ATGC'):
             weights[(bases[i],bases[j])] = +match if i==j else -subst          
     return weights
 
+# get_re_protein
+# Produce a regular expression to recognize a string of amino acids
+
 def get_re_protein(min_length=1):
     return re.compile('[A,C-IK-WY]{'+str(min_length)+',}')