-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathread_input.py
236 lines (207 loc) · 8.34 KB
/
read_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
__author__ = 'pavel'
import os
import sys
import gzip
import pickle
import random
import string
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.PropertyMol import PropertyMol
from io import BytesIO
def read_pdbqt(fname, smi, sanitize=True, removeHs=False):
"""
Read all MODEL entries in input PDBQT file as separate identical molecules. If no MODEL sections then whole file is
recognized as a single structure (list with a single molecule will be returned)
:param fname: pdbqt file
:param smi: SMILES of the molecule in pdbqt file to assign bond orders
:param sanitize:
:param removeHs:
:return: list of molecules
"""
def read_pdbqt_block(pdbqt_block):
return Chem.MolFromPDBBlock('\n'.join([i[:66] for i in pdbqt_block.split('\n')]),
sanitize=sanitize,
removeHs=removeHs)
mols = []
refmol = Chem.MolFromSmiles(smi)
with open(fname) as f:
s = f.read()
if 'MODEL' in s:
pdbqt_blocks = s.split('MODEL ')
for j, block in enumerate(pdbqt_blocks[1:]):
m = read_pdbqt_block(block)
if m is None:
sys.stderr.write(f'The pose #{j+1} cannot be read from {fname}\n')
else:
m = AllChem.AssignBondOrdersFromTemplate(refmol, m)
mols.append(m)
else:
m = read_pdbqt_block(s)
if m is None:
sys.stderr.write(f'Structure from {fname} cannot be read\n')
else:
m = AllChem.AssignBondOrdersFromTemplate(refmol, m)
mols.append(m)
return mols
def assign_mol_props_to_conf(mol, conf):
for k, v in mol.GetPropsAsDict().items():
if isinstance(v, int):
conf.SetIntProp(k, v)
elif isinstance(v, float):
conf.SetDoubleProp(k, v)
elif isinstance(v, bool):
conf.SetBoolProp(k, v)
else:
conf.SetProp(k, v)
return conf
def assign_conf_props_to_mol(conf, mol):
for k, v in conf.GetPropsAsDict().items():
if isinstance(v, int):
mol.SetIntProp(k, v)
elif isinstance(v, float):
mol.SetDoubleProp(k, v)
elif isinstance(v, bool):
mol.SetBoolProp(k, v)
else:
mol.SetProp(k, v)
return mol
def __get_smi_as_molname(mol):
try:
name = Chem.MolToSmiles(mol, isomericSmiles=True)
except Exception as e:
name = ''.join(random.sample(string.ascii_uppercase, 10))
sys.stderr.write(f'Some molecule cannot be converted to SMILES - {name} was inserted as the molecule title\n')
return name
def __read_pkl(fname):
with open(fname, 'rb') as f:
while True:
try:
yield pickle.load(f)
except EOFError:
break
def __read_sdf(fname, input_format, id_field_name=None, sanitize=True):
if input_format == 'sdf':
suppl = Chem.SDMolSupplier(fname, sanitize=sanitize, removeHs=False)
elif input_format == 'sdf.gz':
suppl = Chem.ForwardSDMolSupplier(gzip.open(fname), sanitize=sanitize, removeHs=False)
else:
return
for mol in suppl:
if mol is not None:
if id_field_name is not None:
mol_title = mol.GetProp(id_field_name)
else:
if mol.GetProp("_Name"):
mol_title = mol.GetProp("_Name")
else:
mol_title = __get_smi_as_molname(mol)
yield PropertyMol(mol), mol_title
def __read_sdf_confs(fname, input_format, id_field_name=None, sanitize=True, sdf_confs=False):
title = None
for mol, mol_title in __read_sdf(fname, input_format, id_field_name, sanitize):
if sdf_confs:
if title is None:
m = mol
conf = assign_mol_props_to_conf(m, m.GetConformer(0))
title = mol_title
elif title == mol_title:
conf = assign_mol_props_to_conf(mol, mol.GetConformer(0))
m.AddConformer(conf, assignId=True)
else:
yield m, title
m = mol
conf = assign_mol_props_to_conf(m, m.GetConformer(0))
title = mol_title
else:
yield mol, mol_title
if sdf_confs:
yield m, title
def __read_smiles(fname, sanitize=True, sep='\t'):
with open(fname) as f:
for line in f:
tmp = line.strip().split(sep)
mol = Chem.MolFromSmiles(tmp[0], sanitize=sanitize)
if mol is not None:
if len(tmp) > 1:
mol_title = tmp[1]
else:
mol_title = __get_smi_as_molname(mol)
mol.SetProp('_Name', mol_title)
yield mol, mol_title
def __read_stdin_smiles(sanitize=True, sep='\t'):
line = sys.stdin.readline()
while line:
tmp = line.strip().split(sep)
if tmp:
mol = Chem.MolFromSmiles(tmp[0], sanitize=sanitize)
if mol is not None:
if len(tmp) > 1:
mol_title = tmp[1]
else:
mol_title = __get_smi_as_molname(mol)
yield mol, mol_title
line = sys.stdin.readline()
def __read_stdin_sdf(sanitize=True):
molblock = ''
line = sys.stdin.readline()
while line:
molblock += line
if line == '$$$$\n':
mol = [x for x in Chem.ForwardSDMolSupplier(BytesIO(molblock.encode('utf-8')), sanitize=sanitize, removeHs=False)][0]
mol_title = molblock.split('\n', 1)[0]
if not mol_title:
mol_title = __get_smi_as_molname(mol)
yield mol, mol_title
molblock = ''
line = sys.stdin.readline()
# def read_input(fname, id_field_name=None, stdin_format=None, sanitize=True):
# if fname is None:
# if stdin_format == 'smi':
# suppl = read_stdin_smiles()
# elif stdin_format == 'sdf':
# suppl = read_stdin_sdf(sanitize=sanitize)
# else:
# raise Exception("Cannot read STDIN. STDIN format should be specified explicitly: smi or sdf.")
# elif fname.lower().endswith('.sdf') or fname.lower().endswith('.sdf.gz'):
# suppl = read_sdf(os.path.abspath(fname), id_field_name=id_field_name, sanitize=sanitize)
# elif fname.lower().endswith('.smi') or fname.lower().endswith('.smiles'):
# suppl = read_smiles(os.path.abspath(fname))
# elif fname.lower().endswith('.pkl'):
# suppl = read_pkl(os.path.abspath(fname))
# else:
# raise Exception("File extension can be only SDF, SMI or SMILES")
# for mol, mol_name in suppl:
# yield mol, mol_name
def read_input(fname, input_format=None, id_field_name=None, sanitize=True, sdf_confs=False, sep='\t'):
"""
fname - is a file name, None if STDIN
input_format - is a format of input data, cannot be None for STDIN
id_field_name - name of the field containing molecule name, if None molecule title will be taken
sdf_confs - return consecutive molecules with the same name as a single Mol object with multiple conformers
sep - separator in SMILES format
"""
if input_format is None:
tmp = os.path.basename(fname).split('.')
if tmp == 'gz':
input_format = '.'.join(tmp[-2:])
else:
input_format = tmp[-1]
input_format = input_format.lower()
if fname is None: # handle STDIN
if input_format == 'sdf':
suppl = __read_stdin_sdf(sanitize=sanitize)
elif input_format == 'smi':
suppl = __read_stdin_smiles(sanitize=sanitize, sep=sep)
else:
raise Exception("Input STDIN format '%s' is not supported. It can be only sdf, smi." % input_format)
elif input_format in ("sdf", "sdf.gz"):
suppl = __read_sdf_confs(os.path.abspath(fname), input_format, id_field_name, sanitize, sdf_confs)
elif input_format in ('smi'):
suppl = __read_smiles(os.path.abspath(fname), sanitize, sep=sep)
elif input_format == 'pkl':
suppl = __read_pkl(os.path.abspath(fname))
else:
raise Exception("Input file format '%s' is not supported. It can be only sdf, sdf.gz, smi, pkl." % input_format)
for mol, mol_name in suppl:
yield mol, mol_name