Merge pull request #3 from jonadem/master

Add minimal support for fods
pyexcel · Jul 25, 2017 · 2e868df · 2e868df
2 parents a263e86 + c427ba9
commit 2e868df
Show file tree

Hide file tree

Showing 5 changed files with 658 additions and 1 deletion.
diff --git a/pyexcel_odsr/__init__.py b/pyexcel_odsr/__init__.py
@@ -18,6 +18,12 @@
     stream_type='binary'
 )
 
+__FILE_TYPE_FODS__ = 'fods'  # flat XML OpenDocument spreadsheet
+IOPluginInfoChain(__name__).add_a_reader(
+    relative_plugin_class_path='odsr.FODSBook',
+    file_types=[__FILE_TYPE_FODS__],
+    stream_type='text'
+)
 
 from pyexcel_io.io import get_data as read_data, isstream
 

diff --git a/pyexcel_odsr/messyods.py b/pyexcel_odsr/messyods.py
@@ -34,6 +34,13 @@
 ODS_TABLE_NAME = re.compile(b".*?table:name=\"(.*?)\".*?")
 ODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?<\/.*?:table-row>).*?",
                            re.MULTILINE)
+# "/" needs no escaping; "\/" in a non-raw literal is an invalid escape.
+FODS_NAMESPACES_TAG_MATCH = re.compile(b"(<office:document[^>]*>)", re.DOTALL)
+FODS_TABLE_MATCH = re.compile(b".*?(<table:table.*?</.*?:table>).*?",
+                              re.DOTALL)
+FODS_TABLE_NAME = re.compile(b".*?table:name=\"(.*?)\".*?")
+FODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?</.*?:table-row>).*?",
+                            re.DOTALL)
 NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
 NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
 NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
@@ -172,6 +179,132 @@ def raw(self, sample=False):
         del rows
 
 
+class FODSTableSet(object):
+    """
+    A wrapper around flat ODS (fods) files. A fods file is one uncompressed
+    XML document rather than a zip archive, so the whole content is read into
+    memory as bytes and the sheets are then located in it with the FODS_*
+    regular expressions.
+    """
+
+    def __init__(self, fileobj, window=None, **kw):
+        '''Initialize the object.
+
+        :param fileobj: may be a file path or a file-like object (anything
+        with a ``read`` method). A file-like object is read once and does
+        not have to be seekable.
+        :param window: handed to every FODSRowSet built by make_tables.
+        :param kw: accepted but not used.
+
+        Text returned by a text-mode stream is encoded as UTF-8 because the
+        FODS_* patterns only work on bytes.
+        '''
+        self.window = window
+
+        if hasattr(fileobj, 'read'):
+            # A stream is read directly: open() expects a path, so passing
+            # the stream (or a BytesIO copy of it) there raises TypeError.
+            content = fileobj.read()
+        else:
+            # Close the handle deterministically instead of leaking it.
+            with open(fileobj, 'rb') as handle:
+                content = handle.read()
+        if not isinstance(content, bytes):
+            content = content.encode('utf-8')
+        self.content = content
+
+    def make_tables(self):
+        """
+            Return the sheets in the workbook as a list of FODSRowSet.
+
+            A regex is used to pick each <table:table> element out of the
+            content loaded by __init__.
+        """
+        namespace_tags = self._get_namespace_tags()
+        return [FODSRowSet(m.group(1), self.window, namespace_tags)
+                for m in FODS_TABLE_MATCH.finditer(self.content)]
+
+    def _get_namespace_tags(self):
+        """Return the (opening, closing) office:document tag pair."""
+        match = FODS_NAMESPACES_TAG_MATCH.search(self.content)
+        if match is None:
+            # assert is stripped under -O, so fail explicitly instead
+            raise ValueError("no <office:document> tag in fods content")
+        tag_open = match.group(1)
+        return tag_open, b'</office:document>'
+
+
+class FODSRowSet(object):
+    """ FODS support for a single sheet in the flat ODS workbook. Unlike
+    the CSV row set this is not a streaming operation. """
+
+    def __init__(self, sheet, window=None, namespace_tags=None):
+        """Store one sheet and the tags used to wrap its rows for parsing.
+
+        :param sheet: bytes of one <table:table> element.
+        :param window: kept as self.window; falls back to 1000 when falsy.
+        :param namespace_tags: optional (opening, closing) bytes pair put
+        around each row in raw(); a default wrapper is built when omitted.
+        """
+        self.sheet = sheet
+        self.window = window or 1000
+
+        self.name = "Unknown"
+        m = FODS_TABLE_NAME.match(self.sheet)
+        if m:
+            self.name = m.group(1)
+            if not PY2 and isinstance(self.name, bytes):
+                self.name = self.name.decode('utf-8')
+
+        # We must wrap the XML fragments in a valid header otherwise iterparse
+        # will explode with certain (undefined) versions of libxml2. The
+        # namespaces change with the libreoffice version saving the file, so
+        # the tags taken from the file itself are preferred when given.
+        if namespace_tags:
+            self.namespace_tags = namespace_tags
+        else:
+            namespaces = {
+                "dc": u"http://purl.org/dc/elements/1.1/",
+                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
+                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
+                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
+                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
+                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
+                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
+                "calcext": NS_CAL_PTTN % u"calcext:1.0",
+            }
+            # dict.items() exists on Python 2 and 3; iteritems() is 2-only.
+            ods_header = u"<wrapper {0}>"\
+                .format(" ".join('xmlns:{0}="{1}"'.format(k, v)
+                        for k, v in namespaces.items())).encode('utf-8')
+            ods_footer = u"</wrapper>".encode('utf-8')
+            self.namespace_tags = (ods_header, ods_footer)
+
+    def raw(self, sample=False):
+        """ Iterate over all rows in this sheet, yielding one list of cells
+        per row. ``sample`` is accepted but not used. """
+        rows = FODS_ROW_MATCH.findall(self.sheet)
+
+        for row in rows:
+            row_data = []
+
+            block = self.namespace_tags[0] + row + self.namespace_tags[1]
+            partial = io.BytesIO(block)
+
+            for action, element in etree.iterparse(partial, ('end',)):
+                if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
+                    continue
+
+                cell = _read_cell(element)
+                repeat = element.attrib.get(
+                    _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))
+                # a repeated cell stands for that many identical cells
+                row_data += [cell] * int(repeat) if repeat else [cell]
+            del partial
+            yield row_data
+        del rows
+
+
 def _read_cell(element):
     cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
     value_token = VALUE_TOKEN.get(cell_type, 'value')

diff --git a/pyexcel_odsr/odsr.py b/pyexcel_odsr/odsr.py
@@ -12,7 +12,7 @@
 from pyexcel_io.sheet import SheetReader
 from pyexcel_io._compact import OrderedDict
 
-from pyexcel_odsr.messyods import ODSTableSet
+from pyexcel_odsr.messyods import ODSTableSet, FODSTableSet
 import pyexcel_odsr.converter as converter
 
 
@@ -104,6 +104,31 @@ def _load_from_file(self):
         self._native_book = ODSTableSet(self._file_name)
 
 
+class FODSBook(BookReader):
+    """Book reader for flat ods (fods) files, backed by FODSTableSet"""
+    def open(self, file_name, **keywords):
+        """run BookReader.open, then load the fods file"""
+        BookReader.open(self, file_name, **keywords)
+        self._load_from_file()
+
+    def read_all(self):
+        """read every sheet into an ordered dict of name -> array"""
+        book_data = OrderedDict()
+        for table in self._native_book.make_tables():
+            wrapped = ODSSheet(table, **self._keywords)
+            book_data[wrapped.name] = wrapped.to_array()
+        return book_data
+
+    def read_sheet(self, native_sheet):
+        """read one native sheet; returns a one-entry dict name -> array"""
+        wrapped = ODSSheet(native_sheet, **self._keywords)
+        return {wrapped.name: wrapped.to_array()}
+
+    def _load_from_file(self):
+        """wrap self._file_name in a FODSTableSet as the native book"""
+        self._native_book = FODSTableSet(self._file_name)
+
+
 def is_integer_ok_for_xl_float(value):
     """check if a float had zero value in digits"""
     return value == math.floor(value)