-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_bio_xml.py
159 lines (109 loc) · 5.92 KB
/
parse_bio_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import re
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import Entrez as ncbi
import xml.etree.ElementTree as ET
import pandas as pd
import itertools
def get_temperature(text):
# options are:
# 1) 35-40
# 2) 98
# 3) 100C
# 4) 34~45
# 5) 35-40C
to_str = lambda txt: str(txt) if type(txt)!=unicode else txt
text = to_str(text)
patterns = ['(\d+)-(\d+)','(\d+)-(\d+)C','(\d+)~(\d+)','(\d+)~(\d+)C','(\d+)','(\d+.\d+)','(\d+)C',u'(\d+)[\xb0]C','<(\d+)-(\d+)']
for pattern in patterns:
res = re.match(pattern,text)
if res:
return pd.np.mean([float(temp) for temp in res.groups()])
return pd.np.nan
def get_env_features(xml_root):
to_return = []
xml_entries = list(xml_root.getchildren())
for entry_root in xml_entries:
uid = entry_root.attrib['uid']
xml_org = list(entry_root.iter('OrganismName'))
# organism name isn't a cumpulsory option for these xml files ...
# so we have avoid this problem somehow...
if not xml_org:
return to_return
else:
xml_org = xml_org[0]
org_name = xml_org.text
# let's find Environmental description ...
xml_env = list(entry_root.iter('Environment'))
if not xml_env:
print >> sys.stdout, "No 'Environment'-entry for %s in the BioProject %s"%(org_name, uid)
else:
print >> sys.stdout, "There is %d 'Environment' entry in %s for %s, as expected!"%(len(xml_env), uid, org_name)
dict_to_append = [('BioProject',uid),]
dict_to_append += [('Organism',org_name),]
dict_to_append += [(node.tag, node.text) for node in xml_env[0].getchildren()]
to_return.append(dict(dict_to_append))
return to_return
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
buff = []
for line in data:
if separator(line):
if buff:
yield ''.join(buff)
buff[:] = []
buff.append(line)
yield ''.join(buff)
# path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER/ftp.ncbi.nih.gov/refseq/release/bacteria/genbank')
path = os.path.join(os.path.expanduser('~'),'GENOMES_BACTER_RELEASE69/genbank')
# descriptions_fname = "genbank.inventory.description"
results_xml = "results.xml"
# now iterate over all xml files and whatever is inside those files ...
xml_roots = []
with open(os.path.join(path,results_xml),'r') as fp:
for batch in xmlSplitter(fp):
xml_roots.append(ET.fromstring(batch))
# now collecting environmental features about these bacteria ...
results = []
for xml_root in xml_roots:
results += get_env_features(xml_root)
env_df = pd.DataFrame(results)
env_df = env_df.drop_duplicates(subset=['BioProject','Organism','OptimumTemperature'])
notnnull_temp_entries = env_df.OptimumTemperature.notnull().nonzero()[0].size
entries_parsed = env_df.OptimumTemperature.apply(get_temperature).notnull().nonzero()[0].size
print >> sys.stdout, "There are %d non-empty OptTemp entries in the DataFrame, and %d of them are parseable."%(notnnull_temp_entries,entries_parsed)
env_df.OptimumTemperature = env_df.OptimumTemperature.apply(get_temperature)
# we now for sure that there are 2 entries for '[Clostridium] stercorarium subsp. stercorarium DSM 8532'
# they both have OptTemps that differ slightly 60 and 65 ...
# let's just take one of these:
env_df = env_df[env_df.OptimumTemperature.notnull()]
env_df = env_df.drop_duplicates(subset=['BioProject',],take_last=True)
print >> sys.stdout, "There are %d unique entries with OptTemp after all the adjustments ..."%(env_df.shape[0])
temp_vals_notnull = env_df.OptimumTemperature.notnull().nonzero()[0].size
under50 = env_df.OptimumTemperature.where(env_df.OptimumTemperature<=50.0).notnull().nonzero()[0].size
above50 = env_df.OptimumTemperature.where(env_df.OptimumTemperature>50.0).notnull().nonzero()[0].size
print >> sys.stdout, ''
print >> sys.stdout, "There are %d notnull values of temperature retrieved. %d are above 50C; %d are under 50C(incl.)"%(temp_vals_notnull,above50,under50)
print >> sys.stdout, ''
print >> sys.stdout, 'The temp values are unique in a sense, that each organism is displayed <= once.'
#print which columns do we have:
print >> sys.stdout, "Columns of the env_DataFrame are: ",env_df.columns
env_df[['BioProject', 'Organism', 'OptimumTemperature', 'TemperatureRange', 'OxygenReq', 'Habitat', 'Salinity']].to_csv(os.path.join( path, 'env_catalog_bacter.dat' ), index=False)
# ###########################################################
# # env data extracted using links to BioProject entries ...
# ###########################################################
# print >> sys.stdout, ''
# print >> sys.stdout, 'Environmental data extracted using links to the BioProject uids ...'
# subset = ['BP_Organism','Habitat','OptimumTemperature','OxygenReq','Salinity','TemperatureRange']
# link_dat = pd.read_csv(os.path.join(org_path,'environmental_catalog.dat'))
# link_dat_uniq = link_dat.drop_duplicates(subset=subset)
# link_dat_uniq.OptimumTemperature = link_dat_uniq.OptimumTemperature.apply(get_temperature)
# l_temp_vals_notnull = link_dat_uniq.OptimumTemperature.notnull().nonzero()[0].size
# l_under50 = link_dat_uniq.OptimumTemperature.where(link_dat_uniq.OptimumTemperature<=50.0).notnull().nonzero()[0].size
# l_above50 = link_dat_uniq.OptimumTemperature.where(link_dat_uniq.OptimumTemperature>50.0).notnull().nonzero()[0].size
# print >> sys.stdout, ''
# print >> sys.stdout, "There are %d notnull values of temperature retrieved. %d are above 50C; %d are under 50C(incl.)"%(l_temp_vals_notnull,l_above50,l_under50)
# print >> sys.stdout, ''
# print >> sys.stdout, 'The temp values are unique in a sense, that each organism is displayed <= once.'
# link_dat_uniq[['BP_Organism', 'OptimumTemperature', 'TemperatureRange', 'OxygenReq', 'Habitat', 'Salinity']].rename(columns={'BP_Organism':'Organism'}).to_csv(os.path.join( org_path, 'env_catalog_LINK.dat' ), index=False)