Skip to content

Commit

Permalink
Merge branch 'master' of github.com:maartenbreddels/vaex
Browse files Browse the repository at this point in the history
  • Loading branch information
maartenbreddels committed Mar 27, 2018
2 parents 9a21c86 + 524e89e commit 072ab9b
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
*.so
*.mkv
*.mk4
/*.png
#/*.png
*.eps
*.hdf5
*.pgm
Expand Down
6 changes: 3 additions & 3 deletions .releash.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#core.release_targets.append(gitpush)
core.release_targets.append(ReleaseTargetCondaForge(core, '../feedstocks/vaex-core-feedstock'))

packages = ['vaex-core', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed']
packages = ['vaex-core', 'vaex-meta', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed']
names = [k[5:] for k in packages[1:]]

for name in names:
Expand All @@ -30,6 +30,6 @@
package.release_targets.append(ReleaseTargetSourceDist(package))
# also ok to add twice, it will only execute for the last package
package.release_targets.append(gitpush)
if name in ['hdf5', 'viz']:
package.release_targets.append(ReleaseTargetCondaForge(core, '../feedstocks/vaex-' + name + '-feedstock'))
#if name in ['hdf5', 'viz']:
package.release_targets.append(ReleaseTargetCondaForge(package, '../feedstocks/vaex-' + name + '-feedstock'))

Binary file modified data/helmi-dezeeuw-2000-10p.hdf5
Binary file not shown.
7 changes: 4 additions & 3 deletions docs/source/index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"source": [
"## What is Vaex?\n",
"\n",
"Vaex is a python library for **Out-of-Core DataFrames** (similar to Pandas), to visualize and explore big tabular datasets. It can calculate *statistics* such as mean, sum, count, standard deviation etc, on an *N-dimensional grid* up to **a billion** ($10^9$) objects/rows **per second**. Visualization is done using **histograms**, **density plots** and **3d volume rendering**, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted)."
"Vaex is a python library for lazy **Out-of-Core DataFrames** (similar to Pandas), to visualize and explore big tabular datasets. It can calculate *statistics* such as mean, sum, count, standard deviation etc, on an *N-dimensional grid* up to **a billion** ($10^9$) objects/rows **per second**. Visualization is done using **histograms**, **density plots** and **3d volume rendering**, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted)."
]
},
{
Expand All @@ -60,7 +60,7 @@
"# Why vaex\n",
" \n",
" * **Performance:** Works with huge tabular data, process $\\gt 10^9$ rows/second\n",
" * **Virtual columns:** compute on the fly, without wasting ram\n",
" * **Lazy / Virtual columns:** compute on the fly, without wasting RAM\n",
" * **Memory efficient:** no memory copies when doing filtering/selections/subsets.\n",
" * **Visualization:** directly supported, a one-liner is often enough.\n",
" * **User friendly API:** You will only need to deal with a Dataset object, and tab completion + docstring will help you out: `ds.mean<tab>`, feels very similar to Pandas.\n",
Expand All @@ -72,8 +72,9 @@
" * `vaex-astro`: Astronomy related transformations and FITS file support.\n",
" * `vaex-server`: Provides a server to access a dataset remotely.\n",
" * `vaex-distributed`: (Proof of concept) combines multiple servers / a cluster into a single dataset for distributed computations.\n",
" * `vaex`: meta package that installs all of the above.\n",
" * `vaex-qt`: Program written using Qt GUI.\n",
" * `vaex`: meta package that installs all of the above.\n",
" * `vaex-ml`: [Machine learning](ml.ipynb)\n",
"\n",
" * **Jupyter integration**: vaex-jupyter will give you interactive visualization and selection in the Jupyter notebook and Jupyter lab."
]
Expand Down
83 changes: 67 additions & 16 deletions packages/vaex-core/vaex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,30 @@ def app(*args, **kwargs):
return vaex.ui.main.VaexApp()


def open(path, *args, **kwargs):
def _convert_name(filenames, shuffle=False):
'''Convert a filename (or list of) to a filename with .hdf5 and optionally a -shuffle suffix'''
if not isinstance(filenames, (list, tuple)):
filenames = [filenames]
base = filenames[0]
if shuffle:
base += '-shuffle'
if len(filenames) > 1:
return base + "_and_{}_more.hdf5".format(len(filenames)-1)
else:
return base + ".hdf5"


def open(path, convert=False, shuffle=False, *args, **kwargs):
"""Open a dataset from file given by path
:param str path: local or absolute path to file
Example:
>>> ds = vaex.open('sometable.hdf5')
>>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')
:param str path: local or absolute path to file, or glob string
:param convert: convert files to an hdf5 file for optimization, can also be a path
:param bool shuffle: shuffle converted dataset or not
:param args: extra arguments for file readers that need it
:param kwargs: extra keyword arguments
:return: return dataset if file is supported, otherwise None
Expand Down Expand Up @@ -136,7 +156,51 @@ def open(path, *args, **kwargs):
return vaex.distributed.open(path, *args, **kwargs)
else:
import vaex.file
return vaex.file.open(path, *args, **kwargs)
import glob
filenames = glob.glob(path)
ds = None
if len(filenames) == 0:
raise IOError('Could not open file: {}, it does not exists'.format(path))
filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
filename_hdf5_noshuffle = _convert_name(filenames, shuffle=False)
if len(filenames) == 1:
path = filenames[0]
ext = os.path.splitext(path)[1]
if os.path.exists(filename_hdf5) and convert: # also check mtime?
ds = vaex.file.open(filename_hdf5, *args, **kwargs)
else:
if ext == '.csv': # special support for csv.. should probably approach it a different way
ds = from_csv(path, **kwargs)
else:
ds = vaex.file.open(path, *args, **kwargs)
if convert:
ds.export_hdf5(filename_hdf5, shuffle=shuffle)
ds = vaex.file.open(filename_hdf5, *args, **kwargs)
if ds is None:
if os.path.exists(path):
raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
if os.path.exists(path):
raise IOError('Could not open file: {}, it does not exist?'.format(path))
elif len(filenames) > 1:
if convert not in [True, False]:
filename_hdf5 = convert
else:
filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
if os.path.exists(filename_hdf5) and convert: # also check mtime
ds = open(filename_hdf5)
else:
# with ProcessPoolExecutor() as executor:
# executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
for filename in filenames:
open(filename, convert=convert is not False, shuffle=shuffle, **kwargs)
ds = open_many([_convert_name(k, shuffle=shuffle) for k in filenames])
if convert:
ds.export_hdf5(filename_hdf5, shuffle=shuffle)
ds = vaex.file.open(filename_hdf5, *args, **kwargs)

if ds is None:
raise IOError('Unknown error opening: {}'.format(path))
return ds
except:
logging.getLogger("vaex").error("error opening %r" % path)
raise
Expand Down Expand Up @@ -295,19 +359,6 @@ def read_csv(filepath_or_buffer, **kwargs):
return from_csv(filenames, **kwargs)


def _convert_name(filenames, shuffle=False):
'''Convert a filename (or list of) to a filename with .hdf5 and optionally a -shuffle suffix'''
if not isinstance(filenames, (list, tuple)):
filenames = [filenames]
base = filenames[0]
if shuffle:
base += '-shuffle'
if len(filenames) > 1:
return base + "_and_{}_more.hdf5".format(len(filenames))
else:
return base + ".hdf5"


def read_csv_and_convert(path, shuffle=False, copy_index=True, **kwargs):
'''Convert a path (or glob pattern) to a single hdf5 file, will open the hdf5 file if exists
Expand Down
4 changes: 2 additions & 2 deletions packages/vaex-core/vaex/core/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version_tuple__ = (0, 1, 8)
__version__ = '0.1.8'
__version_tuple__ = (0, 1, 9)
__version__ = '0.1.9'
22 changes: 13 additions & 9 deletions packages/vaex-core/vaex/file/other.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,16 +400,20 @@ def __init__(self, filename):
self.addColumn("vz", veloffset+8, length, dtype=np.float32, stride=3)
@classmethod
def can_open(cls, path, *args, **kwargs):
    """Probe whether the file at path starts like a gadget snapshot.

    The first four unsigned ints of the file are inspected; for the
    SnapFormat=1 layout one more unsigned int at byte offset 260 is checked.

    :param str path: path of the file to probe
    :param args: ignored
    :param kwargs: ignored
    :return: True when one of the two layouts is recognised, False otherwise,
        including when the file cannot be read or is too short
    """
    try:
        with open(path, 'rb') as f:
            first_words = struct.unpack('4I', f.read(4*4))
            if first_words[0] == 8 and first_words[2] == 8 and first_words[3] == 256:
                logger.debug('gadget file SnapFormat=2 detected')
                return True
            elif first_words[0] == 256:
                # presumably 256 is the header block size and the same marker
                # follows the block -- TODO confirm against the gadget format spec
                f.seek(256+4)
                if struct.unpack('I', f.read(4))[0] == 256:
                    logger.debug('gadget file SnapFormat=1 detected')
                    return True
    except (IOError, OSError, struct.error):
        # unreadable file, or fewer bytes available than the format needs:
        # this is a probe, so report "cannot open" instead of raising
        pass
    return False


Expand Down

0 comments on commit 072ab9b

Please sign in to comment.