Skip to content

Commit

Permalink
0.2 release (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt authored Apr 2, 2017
1 parent 9aa160d commit e644da9
Show file tree
Hide file tree
Showing 47 changed files with 14,748 additions and 353 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
If you are a computational biologist, chances are that you cursed one too many times about protein structure files. Yes, I am talking about ye Goode Olde Protein Data Bank format, aka "PDB files." Nothing against PDB, it's a neatly structured format (if deployed correctly); yet, it is a bit cumbersome to work with PDB files in "modern" programming languages -- I am pretty sure we all agree on this.

As a machine learning and "data science" person, I fell in love with [pandas](http://pandas.pydata.org) DataFrames for handling just about everything that can be loaded into memory.
So, why don't we take pandas to the structural biology world? Working with molecular structures of biological macromolecules in pandas DataFrames is what BioPandas is all about!
So, why don't we take pandas to the structural biology world? Working with molecular structures of biological macromolecules (from PDB and MOL2 files) in pandas DataFrames is what BioPandas is all about!

<br>

Expand All @@ -35,10 +35,10 @@ So, why don't we take pandas to the structural biology world? Working with molec
![3eiy](./docs/sources/img/index/3eiy.png)

```python
# Initialize a new PandasPDB object
# Initialize a new PandasPdb object
# and fetch the PDB file from rcsb.org
>>> from biopandas.pdb import PandasPDB
>>> ppdb = PandasPDB().fetch_pdb('3eiy')
>>> from biopandas.pdb import PandasPdb
>>> ppdb = PandasPdb().fetch_pdb('3eiy')
>>> ppdb.df['ATOM'].head()
```

Expand All @@ -53,10 +53,10 @@ So, why don't we take pandas to the structural biology world? Working with molec
```python
# Load structures from your drive and compute the
# Root Mean Square Deviation
>>> from biopandas.pdb import PandasPDB
>>> pl1 = PandasPDB().read_pdb('./docking_pose_1.pdb')
>>> pl2 = PandasPDB().read_pdb('./docking_pose_2.pdb')
>>> r = PandasPDB.rmsd(pl1.df['HETATM'], pl2.df['HETATM'],
>>> from biopandas.pdb import PandasPdb
>>> pl1 = PandasPdb().read_pdb('./docking_pose_1.pdb')
>>> pl2 = PandasPdb().read_pdb('./docking_pose_2.pdb')
>>> r = PandasPdb.rmsd(pl1.df['HETATM'], pl2.df['HETATM'],
s='hydrogen', invert=True)
>>> print('RMSD: %.4f Angstrom' % r)

Expand Down
2 changes: 1 addition & 1 deletion biopandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

__version__ = '0.1.5'
__version__ = '0.2.0.dev0'
__author__ = "Sebastian Raschka <[email protected]>"
15 changes: 15 additions & 0 deletions biopandas/mol2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# BioPandas
# Author: Sebastian Raschka <[email protected]>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

"""
BioPandas module for working with TRIPOS MOL2
files in pandas DataFrames.
"""

from .pandas_mol2 import PandasMol2
from .mol2_io import split_multimol2

__all__ = ["PandasMol2", "split_multimol2"]
54 changes: 54 additions & 0 deletions biopandas/mol2/mol2_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# BioPandas
# Author: Sebastian Raschka <[email protected]>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import gzip


def split_multimol2(mol2_path):
    r"""
    Splits a multi-mol2 file into individual Mol2 file contents.

    Parameters
    -----------
    mol2_path : str
      Path to the multi-mol2 file. Parses gzip files if the filepath
      ends on .gz.

    Returns
    -----------
    A generator object for lists for every extracted mol2-file. Lists contain
    the molecule ID and the mol2 file contents.
    e.g., ['ID1234', ['@<TRIPOS>MOLECULE\n', '...']]. Note that bytestrings
    are returned (for reasons of efficiency) if the Mol2 content is read
    from a gzip (.gz) file.

    Notes
    -----------
    A record starts at every '@<TRIPOS>MOLECULE' line; the line that follows
    it is taken as the molecule ID (stripped of trailing whitespace). Lines
    in front of the first record header are dropped once a header is found.
    If the file contains no header at all, its lines are yielded as a single
    record with an empty ID; an empty file yields nothing.

    """
    if mol2_path.endswith('.gz'):
        # gzip files are read as bytes, so the tag must be bytes as well
        open_file, read_mode = gzip.open, 'rb'
        record_tag = b'@<TRIPOS>MOLECULE'
    else:
        open_file, read_mode = open, 'r'
        record_tag = '@<TRIPOS>MOLECULE'

    with open_file(mol2_path, read_mode) as f:
        lines = iter(f)
        record = ['', []]
        # Track "a record was opened" explicitly instead of testing the ID,
        # so that molecules with a blank name line are not lost.
        started = False

        for line in lines:
            if not line.startswith(record_tag):
                record[1].append(line)
                continue

            if started:
                yield record
            record = ['', [line]]
            started = True

            # The molecule name is the line right after the record tag;
            # it may be missing if the file is truncated.
            id_line = next(lines, None)
            if id_line is not None:
                record[0] = id_line.rstrip()
                record[1].append(id_line)

        # Do not emit a phantom empty record for an empty file.
        if started or record[1]:
            yield record
235 changes: 235 additions & 0 deletions biopandas/mol2/pandas_mol2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
""" Class for working with Tripos MOL2 files"""
# BioPandas
# Author: Sebastian Raschka <[email protected]>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

import pandas as pd
import numpy as np
from .mol2_io import split_multimol2


# Default layout of the 9-column ATOM section: one (name, dtype) pair per
# column, listed in file order. Unzipping the pairs yields two parallel
# tuples, COLUMN_NAMES and COLUMN_TYPES.
COLUMN_NAMES, COLUMN_TYPES = zip(
    ('atom_id', int),
    ('atom_name', str),
    ('x', float),
    ('y', float),
    ('z', float),
    ('atom_type', str),
    ('subst_id', int),
    ('subst_name', str),
    ('charge', float),
)


class PandasMol2(object):
    """
    Object for working with Tripos Mol2 structure files.

    Attributes
    ----------
    df : pandas.DataFrame
        DataFrame of a Mol2's ATOM section
    mol2_text : str
        Mol2 file contents in string format
    code : str
        ID, code, or name of the molecule stored
    header : str
        Initialized to an empty string; none of the methods of this
        class assign it.

    """
    def __init__(self):
        self._df = None
        self.mol2_text = ''
        self.header = ''
        self.code = ''

    @property
    def df(self):
        """Accesses the pandas DataFrame (None until a Mol2 is loaded)"""
        return self._df

    def _load_mol2(self, mol2_lines, mol2_code, columns):
        """Load mol2 contents into the current PandasMol2 instance"""
        if columns is None:
            col_names = COLUMN_NAMES
            col_types = COLUMN_TYPES
        else:
            # `columns` maps the column position (0, 1, ...) to a
            # (name, type) pair; collect the pairs in positional order.
            specs = [columns[i] for i in range(len(columns))]
            col_names = [spec[0] for spec in specs]
            col_types = [spec[1] for spec in specs]

        try:
            mol2_text = ''.join(mol2_lines)
        except TypeError:
            # Lines read from gzip files are bytestrings and cannot be
            # joined with a str separator; decode them first.
            mol2_lines = [m.decode() for m in mol2_lines]
            mol2_text = ''.join(mol2_lines)
            if isinstance(mol2_code, bytes):
                mol2_code = mol2_code.decode()

        # Parse before touching the instance so that a failed load does
        # not leave text/code and the DataFrame out of sync.
        df = self._construct_df(mol2_lines, col_names, col_types)
        self.mol2_text = mol2_text
        self.code = mol2_code
        self._df = df

    def read_mol2(self, path, columns=None):
        """Reads Mol2 files (unzipped or gzipped) from local drive

        Note that if your mol2 file contains more than one molecule,
        only the first molecule is loaded into the DataFrame

        Parameters
        ----------
        path : str
            Path to the Mol2 file in .mol2 format or gzipped format (.mol2.gz)

        columns : dict or None (default: None)
            If None, this methods expects a 9-column ATOM section that contains
            the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
             2:('x', float), 3:('y', float), 4:('z', float),
             5:('atom_type', str), 6:('subst_id', int),
             7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide your
            own column_mapping dictionary in a format similar to the one above.
            However, note that not all PandasMol2 methods
            may be supported then.

        Returns
        ---------
        self

        Raises
        ---------
        ValueError
            If no Mol2 record can be read from `path` or the record has no
            ATOM section.

        """
        records = split_multimol2(path)
        try:
            mol2_code, mol2_lines = next(records)
        except StopIteration:
            raise ValueError('No Mol2 record found in %s' % path)
        finally:
            # Only the first record is needed; closing the generator
            # releases the file handle it keeps open.
            records.close()

        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def read_mol2_from_list(self, mol2_lines, mol2_code, columns=None):
        r"""Reads Mol2 file from a list into DataFrames

        Parameters
        ----------
        mol2_lines : list
            A list of lines containing the mol2 file contents. For example,
            ['@<TRIPOS>MOLECULE\n',
             'ZINC38611810\n',
             '   65    68     0     0     0\n',
             'SMALL\n',
             'NO_CHARGES\n',
             '\n',
             '@<TRIPOS>ATOM\n',
             '  1 C1  -1.1786  2.7011  -4.0323 C.3  1 <0>  -0.1537\n',
             '  2 C2  -1.2950  1.2442  -3.5798 C.3  1 <0>  -0.1156\n',
             ...]

        mol2_code : str or None
            Name or ID of the molecule.

        columns : dict or None (default: None)
            If None, this methods expects a 9-column ATOM section that contains
            the following columns:
            {0:('atom_id', int), 1:('atom_name', str),
             2:('x', float), 3:('y', float), 4:('z', float),
             5:('atom_type', str), 6:('subst_id', int),
             7:('subst_name', str), 8:('charge', float)}
            If your Mol2 files are formatted differently, you can provide your
            own column_mapping dictionary in a format similar to the one above.
            However, note that not all PandasMol2 methods may be
            supported then.

        Returns
        ---------
        self

        Raises
        ---------
        ValueError
            If `mol2_lines` contains no '@<TRIPOS>ATOM' section.

        """
        self._load_mol2(mol2_lines, mol2_code, columns)
        return self

    def _construct_df(self, mol2_lines, col_names, col_types):
        """Construct DataFrames from list of Mol2 lines."""
        return self._atomsection_to_pandas(self._get_atomsection(mol2_lines),
                                           col_names=col_names,
                                           col_types=col_types)

    @staticmethod
    def _get_atomsection(mol2_lst):
        """Returns atom section from mol2 provided as list of strings

        The section runs from the line after '@<TRIPOS>ATOM' up to the next
        '@<TRIPOS>' tag, or to the end of the list if no further tag
        follows. Raises a ValueError if there is no ATOM section.

        """
        first_idx = None
        # Default to the end of the list so that an ATOM section that is
        # the last section of the file is still returned.
        last_idx_plus1 = len(mol2_lst)
        for idx, s in enumerate(mol2_lst):
            if s.startswith('@<TRIPOS>ATOM'):
                first_idx = idx + 1
            elif first_idx is not None and s.startswith('@<TRIPOS>'):
                last_idx_plus1 = idx
                break

        if first_idx is None:
            raise ValueError('Mol2 contents do not contain an '
                             '@<TRIPOS>ATOM section')
        return mol2_lst[first_idx:last_idx_plus1]

    @staticmethod
    def _atomsection_to_pandas(mol2_atom_lst, col_names, col_types):
        """Converts the lines of an atom section into a typed DataFrame

        Lines are split on whitespace; blank lines are skipped. Each column
        is cast to the type at the same position in `col_types`.

        """
        rows = [fields for fields in
                (line.split() for line in mol2_atom_lst) if fields]
        df = pd.DataFrame(rows, columns=col_names)

        for name, col_type in zip(col_names, col_types):
            df[name] = df[name].astype(col_type)

        return df

    @staticmethod
    def rmsd(df1, df2, heavy_only=True):
        """Compute the Root Mean Square Deviation between molecules

        Atoms are paired by row position, not by DataFrame index.

        Parameters
        ----------
        df1 : pandas.DataFrame
            DataFrame of a Mol2 ATOM section with 'x', 'y', and 'z' columns
            (and an 'atom_type' column if `heavy_only=True`)
        df2 : pandas.DataFrame
            Second DataFrame for RMSD computation against df1. Must have the
            same number of entries as df1
        heavy_only : bool (default: True)
            Which atoms to compare to compute the RMSD. If `True` (default),
            computes the RMSD between non-hydrogen atoms only, i.e., rows
            whose 'atom_type' is 'H' are excluded.

        Returns
        ---------
        rmsd : float
            Root Mean Square Deviation between df1 and df2, rounded to 4
            decimals and averaged over the atoms that were compared

        Raises
        ---------
        AttributeError
            If the DataFrames (or their heavy-atom subsets) differ in length

        """
        if df1.shape[0] != df2.shape[0]:
            raise AttributeError('DataFrames have unequal lengths')

        if heavy_only:
            d1 = df1[df1['atom_type'] != 'H']
            d2 = df2[df2['atom_type'] != 'H']
            if d1.shape[0] != d2.shape[0]:
                raise AttributeError('DataFrames have unequal numbers '
                                     'of heavy atoms')
        else:
            d1, d2 = df1, df2

        # Use the raw arrays: Series arithmetic would align on the index
        # and produce NaN for rows whose labels differ.
        total = ((d1['x'].values - d2['x'].values)**2 +
                 (d1['y'].values - d2['y'].values)**2 +
                 (d1['z'].values - d2['z'].values)**2)
        # Average over the atoms that entered the sum, not over all rows.
        rmsd = round((total.sum() / d1.shape[0])**0.5, 4)
        return rmsd

    def distance(self, xyz=(0.00, 0.00, 0.00)):
        """Computes Euclidean distance between atoms and a 3D point.

        Parameters
        ----------
        xyz : tuple (0.00, 0.00, 0.00)
            X, Y, and Z coordinate of the reference center for the distance
            computation

        Returns
        ---------
        pandas.Series : Pandas Series object containing the Euclidean
            distance between the atoms in the atom section and `xyz`.

        """
        df = self.df
        # Column-wise arithmetic instead of a Python-level call per row.
        return np.sqrt((df['x'] - xyz[0])**2 +
                       (df['y'] - xyz[1])**2 +
                       (df['z'] - xyz[2])**2)
Loading

0 comments on commit e644da9

Please sign in to comment.