-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathrmsd_rdkit.py
executable file
·181 lines (154 loc) · 8.12 KB
/
rmsd_rdkit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
__author__ = 'Pavel Polishchuk'
import argparse
import os
import re
import sys
from rdkit import Chem
from rdkit.Chem import rdFMCS
from read_input import read_pdbqt, read_input
def get_coord(mol, indices=None):
if indices is None:
indices = tuple(range(mol.GetNumAtoms()))
output = []
for atom_id in indices:
pos = mol.GetConformer().GetAtomPosition(atom_id)
output.append((pos.x, pos.y, pos.z))
return tuple(output)
def rmsd(mol, ref, chirality):
def rmsd_calc(r_coord, m_coord):
s = 0
for r, m in zip(r_coord, m_coord):
s += (r[0] - m[0]) ** 2 + (r[1] - m[1]) ** 2 + (r[2] - m[2]) ** 2
s = (s / len(r_coord)) ** 0.5
return s
match_indices = mol.GetSubstructMatches(ref, uniquify=False, useChirality=chirality, maxMatches=10000)
min_rmsd = float('inf')
if not match_indices:
mcs = rdFMCS.FindMCS([mol, ref], threshold=1.0,
ringMatchesRingOnly=False, completeRingsOnly=False,
matchChiralTag=chirality)
if not mcs:
return None
patt = Chem.MolFromSmarts(mcs.smartsString)
refMatch, molMatch = ref.GetSubstructMatches(patt, uniquify=False), \
mol.GetSubstructMatches(patt, uniquify=False)
for ids_ref in refMatch:
for ids_mol in molMatch:
ref_coord = get_coord(ref, ids_ref)
mol_coord = get_coord(mol, ids_mol)
s = rmsd_calc(ref_coord, mol_coord)
if s < min_rmsd:
min_rmsd = s
else:
ref_coord = get_coord(ref)
for ids in match_indices:
mol_coord = get_coord(mol, ids)
s = rmsd_calc(ref_coord, mol_coord)
if s < min_rmsd:
min_rmsd = s
return round(min_rmsd, 3)
def main_params(input_fnames, input_smi, output_fname, ref_name, refsmi, chirality, regex):
if ref_name.lower().endswith('.mol2'):
ref = Chem.MolFromMol2File(ref_name, removeHs=True)
elif ref_name.lower().endswith('.pdbqt'):
ref = read_pdbqt(ref_name, refsmi, removeHs=True)[0]
elif ref_name.lower().endswith('.mol'):
ref = Chem.MolFromMolFile(ref_name, removeHs=True)
elif ref_name.lower().endswith('.sdf'):
ref = {m.GetProp('_Name'): m for m in Chem.SDMolSupplier(ref_name) if m}
else:
sys.stderr.write('Wrong format of the reference file. Only MOL2, PDBQT and SDF files are allowed.\n')
raise ValueError
if output_fname is not None:
sys.stdout = open(output_fname, 'wt')
# read input smiles in dict: {name: smi}
smis = dict()
if input_smi is not None:
with open(input_smi) as f:
for line in f:
values = line.strip().split()
if len(values) >= 2:
smis[values[1]] = values[0]
else:
sys.stderr.write(
f'Line "{line}" in input smiles does not have two fields - SMILES and mol name. Skipped.\n')
for in_fname in input_fnames:
if in_fname.lower().endswith('.mol2'):
mols = [Chem.MolFromMol2File(in_fname)]
elif in_fname.lower().endswith('.pdbqt') or in_fname.endswith('.pdbqt_out'):
if regex is not None:
mols = read_pdbqt(in_fname, smis[re.search(regex, os.path.basename(in_fname)).group()], removeHs=True)
else:
mols = read_pdbqt(in_fname, smis[os.path.splitext(os.path.basename(in_fname))[0]], removeHs=True)
elif in_fname.lower().endswith('.sdf'):
mols = [mol for mol, mol_name in read_input(in_fname)]
else:
sys.stderr.write(f'Wrong format of the input file - {in_fname}. '
f'Only MOL2, PDBQT and SDF files are allowed.\n')
raise ValueError
for i, mol in enumerate(mols, 1):
if mol is None:
print(f'{in_fname}\t{i}\tCannot read structure')
else:
# assign printed mol name from mol object or file name
mol_name = mol.GetProp('_Name')
if not mol_name:
if regex is not None:
mol_name = re.search(regex, os.path.basename(in_fname)).group()
else:
mol_name = os.path.basename(in_fname)
# assign ref mol object
if isinstance(ref, dict):
try:
refmol = ref[mol.GetProp('_Name')]
except KeyError:
sys.stderr.write(f'Molecule with name {mol.GetProp("_Name")} is not available '
f'in the reference SDF file\n')
print(f'{mol_name}\t{i}\tNo matches')
refmol = None
else:
refmol = ref
if refmol is not None:
mol_rmsd = rmsd(mol, refmol, chirality)
if mol_rmsd is not None:
print(f'{mol_name}\t{i}\t{mol_rmsd}')
else:
print(f'{mol_name}\t{i}\tNo matches')
def main():
parser = argparse.ArgumentParser(description='''Calc RMSD between a reference molecule and docked poses.
If reference molecule is not substructure of the docked molecule
maximum common substructure is used.''')
parser.add_argument('-i', '--input', metavar='FILENAME', required=True, nargs='*',
help='input MOL2/PDBQT/SDF file(s) to compare with a reference molecule or molecules.')
parser.add_argument('--input_smi', metavar='FILENAME', required=False, default=None,
help='SMILES of input molecules if they are in PDBQT format. No header. Space- or '
'tab-separated. Molecule names should correspond to PDBQT file names. '
'DEVELOPERS, STOP USING PDB FORMAT FOR DOCKING PROGRAMS!')
parser.add_argument('-r', '--reference', metavar='FILENAME', required=True,
help='reference molecule (from X-ray complex structure) in MOL/MOL2/PDBQT/SDF format. '
'Multiple reference molecules can be supplied in SDF file, in this case molecules '
'will be matched by their names.')
parser.add_argument('-s', '--refsmi', metavar='SMILES or FILENAME', required=False, default=None,
help='SMILES of the reference molecule. This can be a string (do not forget to escape '
'special characters) or a file. This is required only for PDBQT input to '
'assign bond orders. DEVELOPERS, STOP USING PDB FORMAT FOR DOCKING PROGRAMS!')
parser.add_argument('--regex', metavar='REGEX', required=False, default=None,
help='Use it if there are complex names of pdbqt files. '
'Use regex search to establish a relationship between reference smiles name and pdbqt '
'filename. If None filename of pdbqt file will be taken to find reference smiles '
'Examples: MOLID[0-9]* or .*(?=_out.pdbqt)')
parser.add_argument('-o', '--output', metavar='FILENAME',
help='output text file. If omitted output will be in stdout.')
parser.add_argument('-x', '--nochirality', action='store_true', default=False,
help='choose this option if you want to omit matching chirality in substructure search. '
'By default chirality is considered.')
args = parser.parse_args()
if (args.refsmi is not None) and (args.refsmi.lower().endswith('.smi') or args.refsmi.lower().endswith('.smiles')):
with open(args.refsmi) as inp:
refsmi = inp.read().strip()
else:
refsmi = args.refsmi
main_params(args.input, args.input_smi, args.output, args.reference, refsmi, not args.nochirality, args.regex)
if __name__ == '__main__':
main()