Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add minimal support for fods #3

Merged
merged 4 commits into from
Jul 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pyexcel_odsr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
stream_type='binary'
)

# Register a reader for the 'fods' file type. The reader class is named by
# its relative path 'odsr.FODSBook', and its streams are declared as text
# (stream_type='text').
__FILE_TYPE_FODS__ = 'fods'
IOPluginInfoChain(__name__).add_a_reader(
relative_plugin_class_path='odsr.FODSBook',
file_types=[__FILE_TYPE_FODS__],
stream_type='text'
)

from pyexcel_io.io import get_data as read_data, isstream

Expand Down
133 changes: 133 additions & 0 deletions pyexcel_odsr/messyods.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
# All patterns below operate on bytes. They are written as raw bytes
# literals so that the regex escape ``\/`` is passed through verbatim
# instead of relying on Python leaving an invalid string escape untouched
# (which emits a DeprecationWarning/SyntaxWarning on Python 3).

# Captures the value of the first ``table:name="..."`` attribute.
ODS_TABLE_NAME = re.compile(br'.*?table:name="(.*?)".*?')
# Captures one ``<table:table-row ...>...</...:table-row>`` element.
ODS_ROW_MATCH = re.compile(
    br".*?(<table:table-row.*?<\/.*?:table-row>).*?",
    re.MULTILINE)

# Captures the opening ``<office:document ...>`` tag, which may span several
# lines (hence DOTALL).
FODS_NAMESPACES_TAG_MATCH = re.compile(
    br"(<office:document[^>]*>)",
    re.DOTALL)
# Captures one ``<table:table ...>...</...:table>`` element, newlines
# included.
FODS_TABLE_MATCH = re.compile(
    br".*?(<table:table.*?<\/.*?:table>).*?",
    re.DOTALL)
# Captures the value of the first ``table:name="..."`` attribute. There is no
# DOTALL here, so when used with ``match`` the attribute has to be on the
# first line of the fragment.
FODS_TABLE_NAME = re.compile(br'.*?table:name="(.*?)".*?')
# Captures one ``<table:table-row ...>...</...:table-row>`` element, newlines
# included.
FODS_ROW_MATCH = re.compile(
    br".*?(<table:table-row.*?<\/.*?:table-row>).*?",
    re.DOTALL)

# Namespace URI templates; ``%s`` is replaced by the "<name>:<version>" part.
NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
Expand Down Expand Up @@ -172,6 +179,132 @@ def raw(self, sample=False):
del rows


class FODSTableSet(object):
    """
    A wrapper around a flat XML ODS (``.fods``) document.

    The whole document is read into memory as bytes when the object is
    created; the sheets are then located in that buffer with regular
    expressions.
    """

    def __init__(self, fileobj, window=None, **kw):
        '''Initialize the object.

        :param fileobj: either a file path or a file-like object exposing
            ``read()``. A file-like object is read to its end; it does not
            need to be seekable. A path is opened in binary mode and closed
            again once its content has been read.
        :param window: stored on the instance and handed to every
            ``FODSRowSet`` created by ``make_tables``.
        :param kw: accepted for signature compatibility and ignored.
        '''
        if hasattr(fileobj, 'read'):
            content = fileobj.read()
        else:
            # A context manager guarantees the handle is released even if
            # reading fails.
            with open(fileobj, 'rb') as handle:
                content = handle.read()

        # The sheet patterns are bytes patterns, so a text stream has to be
        # encoded first.
        # NOTE(review): assumes the document is UTF-8 encoded - confirm.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        self.window = window
        self.content = content

    def make_tables(self):
        """
        Return the sheets in the workbook.

        A regex is used to cut each ``<table:table>`` element out of the
        document; every fragment is wrapped in a ``FODSRowSet`` together
        with the document's opening/closing root tags so that the fragment
        can later be parsed with the namespaces the document declares.

        :returns: a list of ``FODSRowSet`` objects, one per sheet, in
            document order.
        :raises ValueError: if the document has no ``<office:document>``
            opening tag.
        """
        namespace_tags = self._get_namespace_tags()
        return [FODSRowSet(m.group(1), self.window, namespace_tags)
                for m in FODS_TABLE_MATCH.finditer(self.content)]

    def _get_namespace_tags(self):
        """
        Return the ``(opening tag, closing tag)`` pair of the document root.

        The opening tag is copied verbatim from the document so that it
        carries every namespace declaration the document uses.

        :raises ValueError: if no ``<office:document ...>`` tag is found.
        """
        match = FODS_NAMESPACES_TAG_MATCH.search(self.content)
        if match is None:
            # An explicit raise (rather than ``assert``) keeps the check
            # alive when Python runs with -O.
            raise ValueError(
                "No <office:document> tag found; not a flat XML ODS file")
        tag_open = match.group(1)
        tag_close = b'</office:document>'
        return tag_open, tag_close


class FODSRowSet(object):
    """FODS support for a single sheet of a flat XML ODS workbook.

    Unlike the CSV row set this is not a streaming operation: the sheet is
    held as one bytes fragment and all of its rows are located up front.
    """

    def __init__(self, sheet, window=None, namespace_tags=None):
        """Store the sheet fragment and work out its name and wrapper tags.

        :param sheet: bytes holding one ``<table:table>`` element.
        :param window: stored as ``self.window``; a falsy value is replaced
            by 1000.
        :param namespace_tags: optional ``(opening, closing)`` pair of bytes
            placed around every row before it is parsed. When omitted, a
            ``<wrapper>`` element declaring a default set of namespaces is
            used instead.
        """
        self.sheet = sheet

        self.name = "Unknown"
        m = FODS_TABLE_NAME.match(self.sheet)
        if m:
            self.name = m.group(1)
            if not PY2 and isinstance(self.name, bytes):
                self.name = self.name.decode('utf-8')

        self.window = window or 1000

        # We must wrap the XML fragments in a valid header otherwise iterparse
        # will explode with certain (undefined) versions of libxml2. The
        # namespaces are in the FODS file, and change with the libreoffice
        # version saving it, so get them from the file if possible. The
        # default namespaces are a fallback for callers that do not supply
        # the document's own tags.
        if namespace_tags:
            self.namespace_tags = namespace_tags
        else:
            namespaces = {
                "dc": u"http://purl.org/dc/elements/1.1/",
                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
                "calcext": NS_CAL_PTTN % u"calcext:1.0",
            }

            # ``items()`` exists on both Python 2 and 3 (``iteritems()`` is
            # Python 2 only); sorting keeps the generated header
            # deterministic.
            declarations = u" ".join(
                u'xmlns:{0}="{1}"'.format(prefix, uri)
                for prefix, uri in sorted(namespaces.items()))
            ods_header = u"<wrapper {0}>".format(declarations).encode('utf-8')
            ods_footer = u"</wrapper>".encode('utf-8')
            self.namespace_tags = (ods_header, ods_footer)

    def raw(self, sample=False):
        """Iterate over all rows in this sheet.

        Each row fragment is wrapped in ``self.namespace_tags`` and parsed
        on its own; every table cell is converted with ``_read_cell``. A
        cell carrying the column-repeat attribute is emitted that many
        times (the same cell object is repeated, not copied).

        :param sample: accepted but not used by this implementation.
        :returns: a generator yielding one list of cells per row.
        """
        tag_open, tag_close = self.namespace_tags
        rows = FODS_ROW_MATCH.findall(self.sheet)

        for row in rows:
            row_data = []

            partial = io.BytesIO(tag_open + row + tag_close)

            for action, element in etree.iterparse(partial, ('end',)):
                if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
                    continue

                cell = _read_cell(element)
                repeat = element.attrib.get(
                    _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))

                if repeat:
                    row_data += [cell] * int(repeat)
                else:
                    row_data.append(cell)

            yield row_data
        del rows


def _read_cell(element):
cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
value_token = VALUE_TOKEN.get(cell_type, 'value')
Expand Down
27 changes: 26 additions & 1 deletion pyexcel_odsr/odsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pyexcel_io.sheet import SheetReader
from pyexcel_io._compact import OrderedDict

from pyexcel_odsr.messyods import ODSTableSet
from pyexcel_odsr.messyods import ODSTableSet, FODSTableSet
import pyexcel_odsr.converter as converter


Expand Down Expand Up @@ -104,6 +104,31 @@ def _load_from_file(self):
self._native_book = ODSTableSet(self._file_name)


class FODSBook(BookReader):
    """Book reader for flat XML ODS (fods) files."""

    def open(self, file_name, **keywords):
        """Run ``BookReader.open`` and then build the native book."""
        BookReader.open(self, file_name, **keywords)
        self._load_from_file()

    def read_all(self):
        """Return an ordered mapping of sheet name to that sheet's array."""
        book_data = OrderedDict()
        for native_sheet in self._native_book.make_tables():
            reader = ODSSheet(native_sheet, **self._keywords)
            book_data[reader.name] = reader.to_array()
        return book_data

    def read_sheet(self, native_sheet):
        """Return a one-entry dict mapping the sheet's name to its array."""
        reader = ODSSheet(native_sheet, **self._keywords)
        return {reader.name: reader.to_array()}

    def _load_from_file(self):
        """Create the ``FODSTableSet`` for the stored file name."""
        self._native_book = FODSTableSet(self._file_name)


def is_integer_ok_for_xl_float(value):
    """Tell whether ``value`` equals its own floor (no fractional part)."""
    floored = math.floor(value)
    return value == floored
Loading