Merge branch 'master' of github.com:maartenbreddels/vaex

vaexio · Mar 27, 2018 · 072ab9b · 072ab9b
2 parents 9a21c86 + 524e89e
commit 072ab9b
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 34 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,7 @@
 *.so
 *.mkv
 *.mk4
-/*.png
+#/*.png
 *.eps
 *.hdf5
 *.pgm

diff --git a/.releash.py b/.releash.py
@@ -15,7 +15,7 @@
 #core.release_targets.append(gitpush)
 core.release_targets.append(ReleaseTargetCondaForge(core, '../feedstocks/vaex-core-feedstock'))
 
-packages = ['vaex-core', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed']
+packages = ['vaex-core', 'vaex-meta', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed']
 names = [k[5:] for k in packages[1:]]
 
 for name in names:
@@ -30,6 +30,6 @@
     package.release_targets.append(ReleaseTargetSourceDist(package))
     # also ok to add twice, it will only execute for the last package
     package.release_targets.append(gitpush)
-    if name in ['hdf5', 'viz']:
-        package.release_targets.append(ReleaseTargetCondaForge(core, '../feedstocks/vaex-' + name + '-feedstock'))
+    #if name in ['hdf5', 'viz']:
+    package.release_targets.append(ReleaseTargetCondaForge(package, '../feedstocks/vaex-' + name + '-feedstock'))
 
diff --git a/data/helmi-dezeeuw-2000-10p.hdf5 b/data/helmi-dezeeuw-2000-10p.hdf5
diff --git a/docs/source/index.ipynb b/docs/source/index.ipynb
@@ -50,7 +50,7 @@
    "source": [
     "## What is Vaex?\n",
     "\n",
-    "Vaex is a python library for **Out-of-Core DataFrames** (similar to Pandas), to visualize and explore big tabular datasets. It can calculate *statistics* such as mean, sum, count, standard deviation etc, on an *N-dimensional grid* up to **a billion** ($10^9$) objects/rows **per second**. Visualization is done using **histograms**, **density plots** and **3d volume rendering**, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted)."
+    "Vaex is a Python library for lazy **Out-of-Core DataFrames** (similar to Pandas), to visualize and explore big tabular datasets. It can calculate *statistics* such as mean, sum, count, standard deviation etc., on an *N-dimensional grid* up to **a billion** ($10^9$) objects/rows **per second**. Visualization is done using **histograms**, **density plots** and **3D volume rendering**, allowing interactive exploration of big data. Vaex uses memory mapping, a zero memory copy policy and lazy computations for best performance (no memory wasted)."
    ]
   },
   {
@@ -60,7 +60,7 @@
     "# Why vaex\n",
     " \n",
     " * **Performance:** Works with huge tabular data, process $\\gt 10^9$ rows/second\n",
-    " * **Virtual columns:** compute on the fly, without wasting ram\n",
+    " * **Lazy / Virtual columns:** compute on the fly, without wasting RAM\n",
     " * **Memory efficient** no memory copies when doing filtering/selections/subsets.\n",
     " * **Visualization:** directly supported, a one-liner is often enough.\n",
     " * **User friendly API:** You will only need to deal with a Dataset object, and tab completion + docstring will help you out: `ds.mean<tab>`, feels very similar to Pandas.\n",
@@ -72,8 +72,9 @@
     "    * `vaex-astro`: Astronomy related transformations and FITS file support.\n",
     "    * `vaex-server`: Provides a server to access a dataset remotely.\n",
     "    * `vaex-distributed`: (Proof of concept) combined multiple servers / cluster into a single dataset for distributed computations.\n",
-    "    * `vaex`: meta package that installs all of the above.\n",
     "    * `vaex-qt`: Program written using Qt GUI.\n",
+    "    * `vaex`: meta package that installs all of the above.\n",
+    "    * `vaex-ml`: [Machine learning](ml.ipynb)\n",
     "\n",
     " * **Jupyter integration**: vaex-jupyter will give you interactive visualization and selection in the Jupyter notebook and Jupyter lab."
    ]

diff --git a/packages/vaex-core/vaex/__init__.py b/packages/vaex-core/vaex/__init__.py
@@ -103,10 +103,30 @@ def app(*args, **kwargs):
     return vaex.ui.main.VaexApp()
 
 
-def open(path, *args, **kwargs):
+def _convert_name(filenames, shuffle=False):
+    '''Convert a filename (or list of) to a filename with .hdf5 and optionally a -shuffle suffix'''
+    if not isinstance(filenames, (list, tuple)):
+        filenames = [filenames]
+    base = filenames[0]
+    if shuffle:
+        base += '-shuffle'
+    if len(filenames) > 1:
+        return base + "_and_{}_more.hdf5".format(len(filenames)-1)
+    else:
+        return base + ".hdf5"
+
+
+def open(path, convert=False, shuffle=False, *args, **kwargs):
     """Open a dataset from file given by path
 
-    :param str path: local or absolute path to file
+    Example:
+
+    >>> ds = vaex.open('sometable.hdf5')
+    >>> ds = vaex.open('somedata*.csv', convert='bigdata.hdf5')
+
+    :param str path: local or absolute path to file, or glob string
+    :param convert: convert files to an hdf5 file for optimization, can also be a path
+    :param bool shuffle: shuffle converted dataset or not
     :param args: extra arguments for file readers that need it
     :param kwargs: extra keyword arguments
     :return: return dataset if file is supported, otherwise None
@@ -136,7 +156,51 @@ def open(path, *args, **kwargs):
             return vaex.distributed.open(path, *args, **kwargs)
         else:
             import vaex.file
-            return vaex.file.open(path, *args, **kwargs)
+            import glob
+            filenames = glob.glob(path)
+            ds = None
+            if len(filenames) == 0:
+                raise IOError('Could not open file: {}, it does not exists'.format(path))
+            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
+            filename_hdf5_noshuffle = _convert_name(filenames, shuffle=False)
+            if len(filenames) == 1:
+                path = filenames[0]
+                ext = os.path.splitext(path)[1]
+                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
+                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)
+                else:
+                    if ext == '.csv':  # special support for csv.. should probably approach it a different way
+                        ds = from_csv(path, **kwargs)
+                    else:
+                        ds = vaex.file.open(path, *args, **kwargs)
+                    if convert:
+                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
+                        ds = vaex.file.open(filename_hdf5, *args, **kwargs)
+                if ds is None:
+                    if os.path.exists(path):
+                        raise IOError('Could not open file: {}, did you install vaex-hdf5?'.format(path))
+                    if os.path.exists(path):
+                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
+            elif len(filenames) > 1:
+                if convert not in [True, False]:
+                    filename_hdf5 = convert
+                else:
+                    filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
+                if os.path.exists(filename_hdf5) and convert:  # also check mtime
+                    ds = open(filename_hdf5)
+                else:
+                    # with ProcessPoolExecutor() as executor:
+                    # executor.submit(read_csv_and_convert, filenames, shuffle=shuffle, **kwargs)
+                    for filename in filenames:
+                        open(filename, convert=convert is not False, shuffle=shuffle, **kwargs)
+                    ds = open_many([_convert_name(k, shuffle=shuffle) for k in filenames])
+                if convert:
+                    ds.export_hdf5(filename_hdf5, shuffle=shuffle)
+                    ds = vaex.file.open(filename_hdf5, *args, **kwargs)
+
+        if ds is None:
+            raise IOError('Unknown error opening: {}'.format(path))
+        return ds
     except:
         logging.getLogger("vaex").error("error opening %r" % path)
         raise
@@ -295,19 +359,6 @@ def read_csv(filepath_or_buffer, **kwargs):
     return from_csv(filenames, **kwargs)
 
 
-def _convert_name(filenames, shuffle=False):
-    '''Convert a filename (or list of) to a filename with .hdf5 and optionally a -shuffle suffix'''
-    if not isinstance(filenames, (list, tuple)):
-        filenames = [filenames]
-    base = filenames[0]
-    if shuffle:
-        base += '-shuffle'
-    if len(filenames) > 1:
-        return base + "_and_{}_more.hdf5".format(len(filenames))
-    else:
-        return base + ".hdf5"
-
-
 def read_csv_and_convert(path, shuffle=False, copy_index=True, **kwargs):
     '''Convert a path (or glob pattern) to a single hdf5 file, will open the hdf5 file if exists
 

diff --git a/packages/vaex-core/vaex/core/_version.py b/packages/vaex-core/vaex/core/_version.py
@@ -1,2 +1,2 @@
-__version_tuple__ = (0, 1, 8)
-__version__ = '0.1.8'
+__version_tuple__ = (0, 1, 9)
+__version__ = '0.1.9'
diff --git a/packages/vaex-core/vaex/file/other.py b/packages/vaex-core/vaex/file/other.py
@@ -400,16 +400,20 @@ def __init__(self, filename):
 		self.addColumn("vz", veloffset+8, length, dtype=np.float32, stride=3)
 	@classmethod
 	def can_open(cls, path, *args, **kwargs):
-		with open(path, 'rb') as f:
-			first_words = struct.unpack('4I',f.read(4*4))
-			if first_words[0] == 8 and first_words[2] == 8 and first_words[3] == 256:
-				logg.debug('gadget file SnapFormat=2 detected')
-				return True
-			elif first_words[0] == 256:
-				f.seek(256+4)
-				if struct.unpack('I',f.read(4))[0] == 256:
-					logger.debug('gadget file SnapFormat=1 detected')
+		try:
+			with open(path, 'rb') as f:
+				first_words = struct.unpack('4I',f.read(4*4))
+				if first_words[0] == 8 and first_words[2] == 8 and first_words[3] == 256:
+					logg.debug('gadget file SnapFormat=2 detected')
 					return True
+				elif first_words[0] == 256:
+					f.seek(256+4)
+					if struct.unpack('I',f.read(4))[0] == 256:
+						logger.debug('gadget file SnapFormat=1 detected')
+						return True
+		except:
+			pass
+
 		return False
-Original file line number
+Diff line change
@@ -2,7 +2,7 @@
     *.so
     *.mkv
     *.mk4
-    /*.png
+    #/*.png
     *.eps
     *.hdf5
     *.pgm
@@ Expand Down @@