Skip to content

Commit

Permalink
Merge pull request #3 from jonadem/master
Browse files Browse the repository at this point in the history
Add minimal support for fods
  • Loading branch information
chfw authored Jul 25, 2017
2 parents a263e86 + c427ba9 commit 2e868df
Show file tree
Hide file tree
Showing 5 changed files with 658 additions and 1 deletion.
6 changes: 6 additions & 0 deletions pyexcel_odsr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
stream_type='binary'
)

# Register the reader for the 'fods' file type. The plugin class is looked up
# lazily through its path relative to this package, and is declared here as
# consuming a text stream.
__FILE_TYPE_FODS__ = 'fods'
IOPluginInfoChain(__name__).add_a_reader(
    file_types=[__FILE_TYPE_FODS__],
    stream_type='text',
    relative_plugin_class_path='odsr.FODSBook'
)

from pyexcel_io.io import get_data as read_data, isstream

Expand Down
133 changes: 133 additions & 0 deletions pyexcel_odsr/messyods.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
# Regular expressions used to slice sheets and rows out of the raw XML bytes.
# They are written as raw bytes literals so that regex escapes such as ``\/``
# reach the ``re`` module untouched instead of being treated as (invalid)
# Python string escapes, which emits a warning on recent Python versions.
# The compiled patterns are byte-for-byte the same as before.
ODS_TABLE_NAME = re.compile(br'.*?table:name="(.*?)".*?')
ODS_ROW_MATCH = re.compile(br".*?(<table:table-row.*?<\/.*?:table-row>).*?",
                           re.MULTILINE)

# Flat ODS (fods) variants. DOTALL lets ``.`` span line breaks, so a tag that
# is spread over several lines is still captured as a single match.
# Captures the opening ``<office:document ...>`` tag with all its attributes.
FODS_NAMESPACES_TAG_MATCH = re.compile(br"(<office:document[^>]*>)",
                                       re.DOTALL)
# Captures one ``<table:table ...> ... </...:table>`` element per match.
FODS_TABLE_MATCH = re.compile(br".*?(<table:table.*?<\/.*?:table>).*?",
                              re.DOTALL)
# Captures the value of the first ``table:name`` attribute.
FODS_TABLE_NAME = re.compile(br'.*?table:name="(.*?)".*?')
# Captures one ``<table:table-row ...> ... </...:table-row>`` element.
FODS_ROW_MATCH = re.compile(br".*?(<table:table-row.*?<\/.*?:table-row>).*?",
                            re.DOTALL)

# Namespace URI templates and the table namespace derived from them.
NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
Expand Down Expand Up @@ -172,6 +179,132 @@ def raw(self, sample=False):
del rows


class FODSTableSet(object):
    """
    A wrapper around a flat ODS (``fods``) document.

    The complete document is read into memory as bytes and used directly as
    XML (no unzipping is performed here). Sheets are then cut out of that
    content with regular expressions.
    """

    def __init__(self, fileobj, window=None, **kw):
        '''Initialize the object.

        :param fileobj: a file path, or a file-like object exposing
            ``read()``. A file-like object is consumed completely and is not
            required to be seekable. It may deliver either bytes or text;
            text is encoded as UTF-8 because the regular expressions applied
            to the content are bytes patterns.
        :param window: stored and handed to every ``FODSRowSet`` built by
            :meth:`make_tables`.
        :param kw: accepted for signature compatibility; not used here.
        '''
        self.window = window

        if hasattr(fileobj, 'read'):
            # Already an open stream: take its content as-is instead of
            # passing the stream object to open(), which only accepts paths.
            content = fileobj.read()
        else:
            # A path: make sure the handle is closed once the data is read.
            with open(fileobj, 'rb') as handle:
                content = handle.read()

        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        self.content = content

    def make_tables(self):
        """
        Return the sheets in the workbook.

        Every match of ``FODS_TABLE_MATCH`` in the content becomes one
        ``FODSRowSet``. A regex is used so that the document does not have
        to be parsed as a whole just to locate the sheets.

        :returns: a list of ``FODSRowSet`` objects, in document order.
        :raises ValueError: if the root tag cannot be located (see
            :meth:`_get_namespace_tags`).
        """
        namespace_tags = self._get_namespace_tags()
        return [FODSRowSet(match.group(1), self.window, namespace_tags)
                for match in FODS_TABLE_MATCH.finditer(self.content)]

    def _get_namespace_tags(self):
        """
        Return the ``(opening, closing)`` root tags of the document as bytes.

        The opening tag is copied verbatim from the content, attributes
        included, so that row fragments can later be wrapped in it.

        :raises ValueError: if no ``<office:document ...>`` tag is present.
        """
        match = FODS_NAMESPACES_TAG_MATCH.search(self.content)
        if match is None:
            # An explicit error instead of ``assert``: asserts are removed
            # when Python runs with -O.
            raise ValueError(
                "no <office:document> root tag found in FODS content")
        tag_open = match.group(1)
        tag_close = b'</office:document>'
        return tag_open, tag_close


class FODSRowSet(object):
    """ FODS support for a single sheet of a flat ODS workbook. The sheet is
    held in memory as one bytes fragment; rows are parsed from it one at a
    time while iterating over :meth:`raw`. """

    def __init__(self, sheet, window=None, namespace_tags=None):
        """Initialize the row set.

        :param sheet: the bytes of one ``<table:table>`` element.
        :param window: stored on the instance; defaults to 1000 when falsy.
        :param namespace_tags: optional ``(opening, closing)`` pair of bytes
            that is wrapped around every row before it is parsed. When
            omitted, a ``<wrapper>`` element declaring a default set of
            namespaces is used instead.
        """
        self.sheet = sheet

        self.name = "Unknown"
        m = FODS_TABLE_NAME.match(self.sheet)
        if m:
            self.name = m.group(1)
            if not PY2 and isinstance(self.name, bytes):
                self.name = self.name.decode('utf-8')

        self.window = window or 1000

        # We must wrap the XML fragments in a valid header otherwise iterparse
        # will explode with certain (undefined) versions of libxml2. The
        # namespaces are in the ODS file, and change with the libreoffice
        # version saving it, so get them from the ODS file if possible. The
        # default namespaces are an option to preserve backwards compatibility
        # of ODSRowSet.
        if namespace_tags:
            self.namespace_tags = namespace_tags
        else:
            namespaces = {
                "dc": u"http://purl.org/dc/elements/1.1/",
                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
                "calcext": NS_CAL_PTTN % u"calcext:1.0",
            }

            # items() exists on both Python 2 and 3; iteritems() is
            # Python 2 only and would raise AttributeError on Python 3.
            declarations = " ".join(
                'xmlns:{0}="{1}"'.format(prefix, uri)
                for prefix, uri in namespaces.items())
            ods_header = u"<wrapper {0}>".format(declarations).encode('utf-8')
            ods_footer = u"</wrapper>".encode('utf-8')
            self.namespace_tags = (ods_header, ods_footer)

    def raw(self, sample=False):
        """ Iterate over all rows in this sheet.

        Yields one list per ``<table:table-row>`` found in the sheet. Each
        list holds the value returned by ``_read_cell`` for every table cell
        of that row; a cell carrying a column-repeat attribute is added that
        many times.

        :param sample: accepted for signature compatibility; not used here.
        """
        header, footer = self.namespace_tags

        # finditer walks the sheet lazily, so only the row currently being
        # parsed is copied out of the sheet.
        for match in FODS_ROW_MATCH.finditer(self.sheet):
            row_data = []

            # Wrap the bare row in the namespace-declaring tags so that it
            # parses as a standalone document.
            partial = io.BytesIO(header + match.group(1) + footer)

            for action, element in etree.iterparse(partial, ('end',)):
                if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
                    continue

                cell = _read_cell(element)
                repeat = element.attrib.get(
                    _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))

                if repeat:
                    row_data += [cell] * int(repeat)
                else:
                    row_data.append(cell)

            yield row_data


def _read_cell(element):
cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
value_token = VALUE_TOKEN.get(cell_type, 'value')
Expand Down
27 changes: 26 additions & 1 deletion pyexcel_odsr/odsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pyexcel_io.sheet import SheetReader
from pyexcel_io._compact import OrderedDict

from pyexcel_odsr.messyods import ODSTableSet
from pyexcel_odsr.messyods import ODSTableSet, FODSTableSet
import pyexcel_odsr.converter as converter


Expand Down Expand Up @@ -104,6 +104,31 @@ def _load_from_file(self):
self._native_book = ODSTableSet(self._file_name)


class FODSBook(BookReader):
    """Book reader for fods files, backed by a FODSTableSet."""

    def open(self, file_name, **keywords):
        """Delegate to BookReader.open, then build the native table set."""
        BookReader.open(self, file_name, **keywords)
        self._load_from_file()

    def read_all(self):
        """Return an ordered mapping of sheet name to ``to_array()`` result
        for every table of the native book."""
        book_content = OrderedDict()
        native_sheets = self._native_book.make_tables()
        for native_sheet in native_sheets:
            reader = ODSSheet(native_sheet, **self._keywords)
            book_content[reader.name] = reader.to_array()
        return book_content

    def read_sheet(self, native_sheet):
        """Return a one-entry dict mapping the sheet's name to its
        ``to_array()`` result."""
        reader = ODSSheet(native_sheet, **self._keywords)
        return {reader.name: reader.to_array()}

    def _load_from_file(self):
        # The table set is created from the file name held on this reader.
        self._native_book = FODSTableSet(self._file_name)


def is_integer_ok_for_xl_float(value):
    """Tell whether ``value`` equals its own floor, i.e. whether it has no
    fractional part."""
    floored = math.floor(value)
    return value == floored
Loading

0 comments on commit 2e868df

Please sign in to comment.