Adding an Avro backend #386

Status: Open. Wants to merge 39 commits into base: master.

Changes shown from 10 commits (of 39 total).

39 commits:
932bdde
Preliminary commit for Avro backend.
Nov 19, 2015
b2091f4
Adding test for avro to iterator
Nov 19, 2015
4cbe99f
Enabling read and write, adding append methods.
Nov 25, 2015
e07cfa7
Adding dependencies to build files
Nov 30, 2015
6cba475
Adding a convert edge from iterator back to avro
Dec 3, 2015
9ded2e1
Ensuring avro backend is registered on import
Dec 3, 2015
d7496ba
Fixing version problem with toolz
Dec 3, 2015
2adb2df
Adding support for array types
Dec 18, 2015
31ed159
Merge branch 'master' into avro
Dec 18, 2015
491c4d1
Attempting to fix travis build
Dec 18, 2015
2eeba0a
Reformatting test data
Dec 18, 2015
471becc
Adding conda selector for avro, since it is not Python 3 compatible
Dec 18, 2015
e05d1e1
Small fixes from code review.
Dec 19, 2015
31821d2
Converting tests to pytest idiom.
Dec 19, 2015
3773348
Changing import order to ensure test skip if avro not installed
Dec 19, 2015
2c0246a
Final small changes requested from code review
Dec 19, 2015
0d35449
Merge branch 'master' into avro
Dec 19, 2015
01b67a4
Use multipledispatch for schema discovery
Dec 22, 2015
a583e55
Bug fix for boolean types
Dec 22, 2015
eb004c7
Cleanup
Dec 29, 2015
58d4e6e
Adding dshape to avro schema generation, with doctest
Dec 30, 2015
64753e2
Bumping datashape version to fix build
Jan 4, 2016
d575ead
Merge branch 'master' into avro
Jan 7, 2016
5c2d082
Attempt to fix build for python3
Jan 29, 2016
78a591f
Whitespace error.
Jan 29, 2016
ee5e6f3
Python 3 support
Jan 29, 2016
ff32e9e
One more python3 bug I missed.
Jan 29, 2016
cbec204
Changing doctest not to be format sensitive.
Jan 29, 2016
4ebbf9b
Merge branch 'master' into avro
Feb 2, 2016
060c2b4
Updating avro type mappings
Feb 2, 2016
32900b9
Changing AVRO.uri attribute to AVRO.path, and allowing codec keyword …
Feb 2, 2016
95724cf
Dropping requirement for schema to be defined for new AVRO resource o…
Feb 2, 2016
76f2a72
Merge branch 'master' into avro
Jun 27, 2016
264fca2
Adding fastavro to meta.yaml
Jun 27, 2016
d9ca7ab
Merge branch 'master' into avro
Oct 8, 2016
1e56c1e
Correcting test failures due to incompatibility with pytest 3.0.3 as …
Oct 8, 2016
fd48794
Accidentally removed avro installation from travis.yml. Fixing
Oct 8, 2016
aba333a
Merge branch 'master' into avro
Jan 23, 2017
b66e7e3
Updating to new requirements tracking structure.
Jan 23, 2017
2 changes: 1 addition & 1 deletion .travis.yml
@@ -59,7 -59,7 @@ install:

# Install various deps
- conda uninstall toolz
- pip install -U toolz sas7bdat psycopg2 dill 'pymongo<3'
- pip install -U toolz sas7bdat psycopg2 dill 'pymongo<3' avro
- pip install --upgrade git+git://github.com/blaze/dask.git#egg=dask-dev[complete]
- if [ -n "$PANDAS_VERSION" ]; then pip install $PANDAS_VERSION; fi

1 change: 1 addition & 0 deletions conda.recipe/meta.yaml
@@ -31,6 +31,7 @@ requirements:

test:
requires:
- avro
Member:

Since this package isn't python 3 compatible, specify it with a conda selector like this:

- avro  # [py27]

- pytest
- h5py
- pytables >=3.0.0
3 changes: 3 additions & 0 deletions odo/__init__.py
@@ -63,10 +63,13 @@
from .backends.sparksql import SparkDataFrame
with ignoring(ImportError):
from .backends.url import URL
with ignoring(ImportError):
from .backends.avro import AVRO
with ignoring(ImportError):
from .backends.dask import dask



restart_ordering() # Restart multipledispatch ordering and do ordering


252 changes: 252 additions & 0 deletions odo/backends/avro.py
@@ -0,0 +1,252 @@
from __future__ import absolute_import, division, print_function

import errno
import os
import uuid

from avro import schema, datafile, io
from avro.schema import AvroException
import pandas as pd
from datashape import discover, var, Record, Map, Var, \
Option, null, string, int32, int64, float64, float32, boolean
from collections import Iterator
from ..append import append
from ..convert import convert
from ..resource import resource
from ..temp import Temp

AVRO_TYPE_MAP = {
'string': string,
'int': int32,
'long': int64,
'null': null,
'double': float64,
'float': float32,
'bool': boolean,
'map': Map,
'record': Record,
'array': Var,
}

class AVRO(object):
"""Wrapper object for reading and writing an Avro container file

Parameters
----------

uri : str
uri of avro data

schema : avro.schema.Schema
User specified Avro schema object. Used to decode file or serialize new records to file.
schema is required to create a new Avro file.
If reading or appending to an existing Avro file, the writers_schema embedded in that file
will be used.

codec : str
compression codec. Valid values: 'null', 'deflate', 'snappy'

"""
def __init__(self, uri, schema=None, codec='null', **kwargs):
self._uri = uri
self._schema = schema
self._codec = codec
self._kwargs = kwargs #CURRENTLY UNUSED
Member:

I would remove this if it isn't being used, we can always add it back later


if not schema:
sch = self._get_writers_schema()
if sch is None:
raise AvroException("Couldn't extract writers schema from '{0}'. User must provide a valid schema".format(uri))
self._schema = sch

def __iter__(self):
Member:

My preference would be to make a separate AvroIter class that's returned from iter(avro_obj) to allow multiple iterators at once and isolate the statefulness of iteration in separate objects.

I don't see a straightforward way to do that separation and I'm fine marking that as out-of-scope for this PR to be done in future work.

Author:

Yes, I see your point. Managing the state of the iterator was definitely one of the more awkward aspects of implementing this class.

return self.reader
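The reviewer's `AvroIter` suggestion can be sketched with plain stdlib objects. `AvroIter` and `Container` here are hypothetical stand-ins; a real implementation would open a fresh `DataFileReader` per iterator rather than wrapping a list.

```python
# Hypothetical sketch: iteration state lives in a separate AvroIter
# object, so each iter() call yields an independent iterator.
class AvroIter(object):
    def __init__(self, records):
        self._it = iter(records)  # stands in for a fresh DataFileReader

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._it)

    next = __next__  # Python 2 spelling, matching the era of this PR


class Container(object):
    """Stand-in for AVRO: holds no iteration state itself."""
    def __init__(self, records):
        self._records = records

    def __iter__(self):
        return AvroIter(self._records)


c = Container([1, 2, 3])
a, b = iter(c), iter(c)
assert next(a) == 1 and next(b) == 1  # two independent iterators
```

With this split, exhausting one iterator has no effect on another, which is the statefulness problem the reviewer flags.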

def next(self):
return self.reader.next()

def __enter__(self):
return self

def __exit__(self, type, value, traceback):
# Perform a close if there's no exception
Member:

What about this case:

with AVRO('path/to/file.avro', sch) as afh:
    for x in afh:
        print(x) 
    raise RuntimeError()

Won't the if type is None check leave our avro file unclosed, and wouldn't the user expect it to be closed even with an exception?

Author:

Well, I have no idea why I did it this way, so I'll remove the exception check.

if type is None:
self.reader.close()
self.writer.close()
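The agreed fix can be sketched minimally: `__exit__` closes unconditionally, and returning a falsy value lets any exception propagate. `Managed` is a hypothetical stand-in for the AVRO wrapper.

```python
# Minimal sketch of the fix discussed above: close resources in
# __exit__ regardless of whether an exception occurred.
class Managed(object):
    def __init__(self):
        self.closed = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.closed = True  # close readers/writers even on error
        # returning None (falsy) re-raises any in-flight exception


m = Managed()
try:
    with m:
        raise RuntimeError("boom")
except RuntimeError:
    pass
assert m.closed  # closed despite the exception, as the user expects
```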

def _get_writers_schema(self):
"""
Extract writers schema embedded in an existing Avro file.
"""
reader = self.reader
return schema.parse(self.reader.meta['avro.schema']) if reader else None

uri = property(lambda self: self._uri)
Member:

any reason not to just make self._uri be self.uri and be a public attribute?

Member:

same with self._codec and self._schema

Author:

uri, schema, and codec feel like properties that should be immutable. If a user changed them, you would need to refresh any open readers and writers. It feels more natural to expect the user to create a new AVRO object rather than reusing an existing one.

Member:

ok, no biggie

codec = property(lambda self: self._codec)
Member:

To make these proper data attributes, could you please define a set callable that simply raises AttributeError? See here:

https://docs.python.org/3/howto/descriptor.html#descriptor-protocol

Author (@ahasha, Oct 8, 2016):

I'm pretty sure the property function here achieves what you're describing. Would you like me to add a test verifying it, or refactor it into a more standard method format with the @property decorator? I'll just add the test for now.

schema = property(lambda self: self._schema)
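The author's point can be verified directly: `property()` without a setter already raises `AttributeError` on assignment, so no explicit set callable is needed. `A` is a hypothetical stand-in for the AVRO class.

```python
# Check that a property defined without a setter is read-only.
class A(object):
    def __init__(self, uri):
        self._uri = uri

    uri = property(lambda self: self._uri)


a = A('data.avro')
assert a.uri == 'data.avro'
try:
    a.uri = 'other.avro'
    raised = False
except AttributeError:
    raised = True
assert raised  # assignment rejected, as the author expects
```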

@property
def reader(self):
if hasattr(self, '_reader'):
Member:

Will this branch ever be taken? I can't find a place where self._reader is set.

Author:

Oops, forgot to set self._reader

if hasattr(self, '_writer'):
self.flush()
return self._reader
else:
try:
rec_reader = io.DatumReader(readers_schema=self.schema)

df_reader = datafile.DataFileReader(
open(self.uri, 'rb'),
Member:

this could potentially open a bunch of file handles if self.reader is called a bunch of times ... is that a valid concern?

rec_reader
)

return df_reader
except IOError as exc:
#If file doesn't exist, don't set _reader now.
#Allow for reevaluation later after file has been created.
#Otherwise, reraise exception
if exc.errno != errno.ENOENT:
raise exc
return None
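One way to address the file-handle concern, sketched with hypothetical stand-ins: cache the reader on first access so repeated reads of the property reuse a single handle. The real code would cache a `DataFileReader` opened from `self.uri`.

```python
# Sketch: cache the expensive resource on first property access.
class Cached(object):
    def __init__(self, uri):
        self.uri = uri
        self.opens = 0

    @property
    def reader(self):
        if not hasattr(self, '_reader'):
            self.opens += 1           # stands in for open(self.uri, 'rb')
            self._reader = object()   # stands in for a DataFileReader
        return self._reader


c = Cached('data.avro')
assert c.reader is c.reader  # second access reuses the cached handle
assert c.opens == 1
```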

@staticmethod
Member:

Is there a specific reason for the staticmethod here and below, rather than private top-level helper functions? IMO we should convert these to module-scoped functions instead.

Author:

No, seems like a good idea!

def _get_append_writer(uri, writers_schema=None):
"""
Returns an isntance of avro.datafile.DataFileWriter for appending
Member:

typo

to an existing avro file at `uri`. Does not take a writers schema,
because avro library requires that writers_schema embedded in existing
file be used for appending.

Parameters
----------

uri : str
uri of an existing, non-empty avro file

writers_schema : avro.schema.Schema object
If not None, checks that writers_schema in existing file is the same as supplied schema.
Member:

Here and elsewhere -- could you linewrap the docstring lines where they're over 80ish columns?

Avro does not allow writing records to a container file with multiple writers_schema.

Member:

Docstring for codec

Returns
-------
avro.datafile.DataFileWriter
"""
rec_writer = io.DatumWriter()
df_writer = datafile.DataFileWriter(
open(uri, 'ab+'),
Member:

Is ab+ the same as a+b?

Author:

I think so -- I lifted this mode from the avro documentation. I'll change to 'a+b' to be more Pythonic since it doesn't break any tests.

rec_writer
)
#Check for embedded schema to ensure existing file is an avro file.
embedded_schema = schema.parse(df_writer.get_meta('avro.schema'))

#If writers_schema supplied, check for equality with embedded schema.
if writers_schema:
assert embedded_schema == writers_schema, \
Member:

is this something that could bubble up to a user? if so, can we turn this into something other than an AssertionError, maybe a ValueError?

"writers_schema embedded in {uri} differs from user supplied schema for appending."

Member:

Linewrap.

return df_writer
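The reviewer's `ValueError` suggestion can be sketched as below. Raising explicitly is safer than `assert`, which is stripped when Python runs with `-O`; the plain dicts here stand in for `avro.schema.Schema` objects.

```python
# Sketch: explicit ValueError instead of an assert for schema mismatch.
def check_writers_schema(embedded_schema, writers_schema, uri):
    if writers_schema is not None and embedded_schema != writers_schema:
        raise ValueError(
            "writers_schema embedded in {0} differs from user-supplied "
            "schema for appending.".format(uri))


check_writers_schema({'type': 'record'}, {'type': 'record'}, 'f.avro')  # ok
raised = False
try:
    check_writers_schema({'type': 'record'}, {'type': 'enum'}, 'f.avro')
except ValueError:
    raised = True
assert raised  # mismatch surfaces as ValueError, not AssertionError
```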

@staticmethod
def _get_new_writer(uri, sch):
"""
Returns an instance of avro.datafile.DataFileWriter for writing
to a new avro file at `uri`.

Parameters
----------

uri : str
uri of the new avro file to be created

sch : avro.schema.Schema object

Returns
-------
avro.datafile.DataFileWriter
"""
rec_writer = io.DatumWriter()
df_writer = datafile.DataFileWriter(
open(uri, 'wb'),
rec_writer,
writers_schema = sch
)
return df_writer

@property
def writer(self):
if hasattr(self, '_writer'):
return self._writer
else:
if os.path.exists(self.uri) and os.path.getsize(self.uri) > 0:
df_writer = self._get_append_writer(self.uri, self.schema)
else:
df_writer = self._get_new_writer(self.uri, self.schema)
self._writer = df_writer
return df_writer

def flush(self):
if hasattr(self, '_writer'):
self._writer.close()
del(self._writer)


@resource.register('.+\.(avro)')
def resource_avro(uri, schema=None, **kwargs):
return AVRO(uri, schema=schema, **kwargs)

def discover_schema(sch):
if isinstance(sch, schema.RecordSchema):
Member:

You could potentially use multiple dispatch here to avoid the large number of isinstance checks.

return var * Record([(f.name, discover_schema(f.type)) for f in sch.fields])
elif isinstance(sch, schema.UnionSchema):
try:
types = [s.type for s in sch.schemas]
assert "null" in types
types.remove("null")
assert len(types) == 1
return Option(AVRO_TYPE_MAP[types[0]])
except AssertionError:
raise TypeError("odo supports avro UnionSchema only for nullable fields. "
"Received {0}".format(str([s.type for s in sch.schemas])))
elif isinstance(sch, schema.PrimitiveSchema):
return AVRO_TYPE_MAP[sch.type]
elif isinstance(sch, schema.MapSchema):
return Map(string, discover_schema(sch.values))
Member:

Is the type of all keys in Map types in avro string?

Author:

Member:

Great thanks for clarifying.

Author:

I'll add a comment with the link.

Member:

It looks like there's already a link there.

elif isinstance(sch, schema.ArraySchema):
return var * discover_schema(sch.items)
else:
raise Exception(str(type(sch)))
Member:

This should be a TypeError

Member:

And should probably say something like 'Unable to discover avro type %r' % type(sch).__name__
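The multiple-dispatch suggestion raised earlier can be sketched with `functools.singledispatch` as a stdlib stand-in for `multipledispatch`. The schema classes and `TYPE_MAP` below are simplified stand-ins for `avro.schema.*` and the backend's `AVRO_TYPE_MAP`; the base case doubles as the reviewers' suggested `TypeError`.

```python
# Sketch: dispatch on schema type instead of chained isinstance checks.
from functools import singledispatch


class PrimitiveSchema(object):
    def __init__(self, type_):
        self.type = type_


class ArraySchema(object):
    def __init__(self, items):
        self.items = items


TYPE_MAP = {'string': 'string', 'int': 'int32', 'long': 'int64'}


@singledispatch
def discover_schema(sch):
    # Fallback for unknown schema classes, per the review suggestion.
    raise TypeError('Unable to discover avro type %r' % type(sch).__name__)


@discover_schema.register(PrimitiveSchema)
def _(sch):
    return TYPE_MAP[sch.type]


@discover_schema.register(ArraySchema)
def _(sch):
    return 'var * ' + discover_schema(sch.items)


assert discover_schema(PrimitiveSchema('int')) == 'int32'
assert discover_schema(ArraySchema(PrimitiveSchema('string'))) == 'var * string'
```

odo itself uses `multipledispatch` throughout, so the real backend would register on that instead; the dispatch structure is the same.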


@discover.register(AVRO)
def discover_avro(f, **kwargs):
return discover_schema(f.schema)

@convert.register(pd.DataFrame, AVRO, cost=4.0)
def avro_to_DataFrame(avro, dshape=None, **kwargs):
#XXX:AEH:todo - correct for pandas automated type conversions. e.g. strings containing numbers get cast to numeric.
#XXX:AEH:todo - column with nulls just becomes an "object" column.
Member:

Regarding these comments: there is odo.numpy_dtype.dshape_to_numpy and odo.numpy_dtype.dshape_to_pandas, which could be used to help deal with the type conversions here.

df = pd.DataFrame([r for r in avro])
names = [f.name.decode('utf-8') for f in avro.schema.fields]
Member:

Is the decoding necessary here?

df = df[names]
return df

@convert.register(Temp(AVRO), Iterator, cost=1.0)
def convert_iterator_to_temporary_avro(data, schema=None, **kwargs):
fn = '.%s.avro' % uuid.uuid1()
avro = Temp(AVRO)(fn, schema, **kwargs)
return append(avro, data, **kwargs)


@convert.register(Iterator, AVRO, cost=1.0)
def avro_to_iterator(s, **kwargs):
return s
Member:

Should this be iter(s)? Right now an AVRO instance is an iterable, but not an iterator.


@append.register(AVRO, Iterator)
def append_iterator_to_avro(tgt_avro, src_itr, **kwargs):
for datum in src_itr:
tgt_avro.writer.append(datum)
tgt_avro.flush()

@append.register(AVRO, object) # anything else
def append_anything_to_iterator(tgt, src, **kwargs):
source_as_iter = convert(Iterator, src, **kwargs)
return append(tgt, source_as_iter, **kwargs)
Member:

Can you add a trailing newline? No newline at the end makes git unhappy...

Author:

Done

92 changes: 92 additions & 0 deletions odo/backends/tests/test_avro.py
@@ -0,0 +1,92 @@
from __future__ import absolute_import, division, print_function

from avro import datafile, io, schema
from collections import Iterator
import pandas as pd
from pandas.util.testing import assert_frame_equal
from odo.backends.avro import discover, avro_to_DataFrame, avro_to_iterator, resource, AVRO

import unittest
import tempfile

from odo.utils import tmpfile, into_path
from odo import append, convert, resource, dshape

test_schema_str = """
{
"type" : "record",
"namespace" : "dataset",
"name" : "test_dataset",
"fields": [
{"type": "int" , "name": "field_1"},
{"type": "string", "name": "field_2"},
{"default": null, "name": "field_3", "type": ["null", "long"]},
Member:

any reason why the first null is unquoted and the second is quoted?

Author:

I need to double check this, but I believe null (unquoted) is a value, whereas "null" (quoted) is a type value. If I did "default": "null", I would get a default value of the string "null".

{ "name": "features", "type": { "type": "map", "values": "double"}},
{ "name": "words", "type": {"type": "array", "items": "string"}}
]
}
"""

test_data = [
Member:

can you clean up the formatting here? thanks

Member:

you could probably just copypaste the result of this in:

from pprint import pprint
pprint(test_data)

{"field_1":2072373602,"field_2":"mxllbfxk","field_3":-3887990995227229804,"features":{"bhettcdl":0.8581552641969377,"vdqvnqgqbrjtkug":0.4938648291874551,"sgmlbagyfb":0.5796466618955293,"ka":0.9873135485253831},"words":["ciplc","htvixoujptehr","rbeiimkevsn"]},
{"field_1":517434305,"field_2":"frgcnqrocddimu","field_3":None,"features":{"atqqsuttysdrursxlynwcrmfrwcrdxaegfnidvwjxamoj":0.2697279678696263,"kjb":0.8279248178446112,"wqlecjb":0.8241169129373344,"inihhrtnawyopu":0.08511455977126114,"dpjw":0.760489536392584},"words":["ignsrafxpgu","ckg"]},
{"field_1":1925434607,"field_2":"aurlydvgfygmu","field_3":None,"features":{"crslipya":0.1596449079423896,"":0.4304848508533662,"imbfgwnaphh":0.19323554138270294},"words":["rqdpanbbcemg","auurshsxxkp","rdngxdthekt"]},
{"field_1":636669589,"field_2":"","field_3":-1858103537322807465,"features":{"dv":0.9635053430456509,"lhljgywersxjp":0.5289026834129389,"nmtns":0.7645922724023969},"words":["vviuffehxh","jpquemsx","xnoj",""]},
{"field_1":-1311284713,"field_2":"infejerere","field_3":5673921375069484569,"features":{"iaen":0.7412670573684966,"ekqfnn":0.6685382939302145,"innfcqqbdrpcdn":0.39528359165136695,"fd":0.8572519278668735,"fbryid":0.7244784428105817},"words":["ciqu","emfruneloqh"]},
{"field_1":1716247766,"field_2":"gmmfghijngo","field_3":None,"features":{"ourul":0.1849234265503661,"vhvwhech":0.41140968300430625,"m":0.9576395352199625,"fgh":0.9547116485401502,"gqpdtvncno":0.027038814818686197},"words":["ugwcfecipffmkwi","kttgclwjlk","siejdtrpjkqennx","ixwrpmywtbgiygaoxpwnvuckdygttsssqfrplbyyv","mfsrhne"]},
{"field_1":101453273,"field_2":"frjaqnrbfspsuw","field_3":None,"features":{"ffps":0.02989888991738765,"fxkhyomw":0.2963204572188527},"words":["jwi","rfxlxngyethg"]},
{"field_1":-1792425886,"field_2":"pqkawoyw","field_3":None,"features":{"vsovnbsdhbkydf":0.09777409545072746,"eovoiix":0.10890846076556715},"words":["xntmmvpbrq","uof"]},
{"field_1":-1828393530,"field_2":"nkflrmkxiry","field_3":None,"features":{"qewmpdviapfyjma":0.8727493942139006},"words":["lgtrtjhpf"]},
{"field_1":1048099453,"field_2":"jsle","field_3":None,"features":{"qbndce":0.5459572647413652},"words":["d"]},
]

ds = dshape("""var * {
field_1: int32,
field_2: string,
field_3: ?int64,
features: map[string, float64],
words: var * string
}""")

test_path = into_path('backends', 'tests', 'test_file.avro')

class TestAvro(unittest.TestCase):
Member:

would it be a lot of trouble to ask you to write this in the style of pytest? ie, tests are functions and instead of state on classes use fixtures ... here's an example:

import pytest

@pytest.fixture
def avrofile():
    return avrofile


def test_discover(avrofile):
    assert discover(avrofile) == ds


def test_convert_avro_to_iterator(avrofile):
    itr = convert(Iterator, avrofile)
    assert isinstance(itr, Iterator)
    assert list(itr) == test_data

Member:

We also already have a temporary file context manager that attempts to handle deletions on Windows. Additionally, pytest has a type of fixture called "yield fixture" that is their version of a fixture that requires a tearDown step.

So, your temp_output instance variable would be written like this in pytest:

import pytest
from odo.utils import tmpfile


@pytest.yield_fixture
def temp_output():
    with tmpfile('.avro') as fn:
        with open(fn, 'w+b') as f:
            yield f

Member:

Feel free to leave this as is and at some point in the future it'll get converted to pytest style

Author:

Sure, I'll give it a shot.


def setUp(self):
self.avrofile = resource(test_path)
self.temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".avro")

def tearDown(self):
self.temp_output.unlink(self.temp_output.name)

def test_resource_datafile(self):
self.assertIsInstance(resource(test_path), AVRO)

def test_discover(self):
self.assertEquals(discover(self.avrofile), ds)

def test_convert_avro_to_dataframe(self):
df = convert(pd.DataFrame, self.avrofile)
self.assertIsInstance(df, pd.DataFrame)

names = ["field_1", "field_2", "field_3", "features", "words"]
expected_output = pd.DataFrame(test_data, columns=names)
assert_frame_equal(df, expected_output)

def test_convert_avro_to_iterator(self):
itr = convert(Iterator, self.avrofile)
self.assertIsInstance(itr, Iterator)
self.assertEqual(list(itr), test_data)

def test_require_schema_for_new_file(self):
self.assertRaises(schema.AvroException, AVRO, "doesntexist.avro")

def test_append_and_convert_round_trip(self):
x = AVRO(self.temp_output.name, schema=schema.parse(test_schema_str))
append(x, test_data)
append(x, test_data)
assert convert(list, x) == test_data * 2


if __name__=="__main__":
unittest.main()
Binary file added odo/backends/tests/test_file.avro
Binary file not shown.
1 change: 1 addition & 0 deletions recommended-requirements.txt
@@ -12,3 +12,4 @@ sas7bdat
paramiko
pywebhdfs
boto
avro
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
datashape >= 0.4.6
numpy >= 1.7
pandas >= 0.15.0
toolz >= 0.7.2
toolz == 0.7.4
Member:

Any reason for the stricter version here?

Author:

I got an error with toolz==0.7.2 that was fixed by upgrading to 0.7.4. Unfortunately, I didn't document the error message.

Member:

could we do >= 0.7.4 then?

Author:

Is toolz >= 0.7.4 better?

multipledispatch >= 0.4.7
networkx