Skip to content

Commit

Permalink
Merge branch 'master' into cleanup-mt2-redux
Browse files Browse the repository at this point in the history
  • Loading branch information
David Read committed Jul 5, 2019
2 parents 6638e58 + f6f2250 commit 4aa3101
Show file tree
Hide file tree
Showing 17 changed files with 217 additions and 63 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.15.2 (8 February 2017)
* #165: Detect ODS types: boolean, currency, time and percentage. Support repeated columns
* #160: Correct spelling of separator in source

0.15.1 (29 September 2016)
* #158: Add CDFV2-unknown to MIMELOOKUP
* #157: Fix for Python Magic API change
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/)

A library for dealing with messy tabular data in several formats, guessing types and detecting headers.

Expand Down
11 changes: 2 additions & 9 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,8 @@ of a given column into all types and searching for the best match.

.. automethod:: messytables.types.type_guess

The supported types include:

.. autoclass:: messytables.types.StringType
.. autoclass:: messytables.types.IntegerType
.. autoclass:: messytables.types.FloatType
.. autoclass:: messytables.types.DecimalType
.. autoclass:: messytables.types.BoolType
.. autoclass:: messytables.types.DateType
.. autoclass:: messytables.types.DateUtilType
The supported types are detailed in
`typecast <https://github.com/pudo/typecast#typecast>`_

Headers detection
-----------------
Expand Down
Binary file added horror/multilineods.ods
Binary file not shown.
Binary file added horror/ods_formats.ods
Binary file not shown.
1 change: 1 addition & 0 deletions messytables/any.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
'text/plain': 'CSV', # could be TAB.
'application/CDFV2-corrupt': 'XLS',
'application/CDFV2-unknown': 'XLS',
'application/CDFV2': 'XLS',
'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS',
}
Expand Down
2 changes: 1 addition & 1 deletion messytables/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def headers_guess(rows, tolerance=1):
"""Guess the offset and names of the headers of the row set.
This will attempt to locate the first row within ``tolerance``
of the mode of the number of rows in the row set sample.
of the mode of the number of columns in the row set sample.
The return value is a tuple of the offset of the header row
and the names of the columns.
Expand Down
4 changes: 2 additions & 2 deletions messytables/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def identify_anatomy(tag):


class FakeHTMLCell(Cell):
"""FakeHTMLCells are not present because of column or row spannning."""
"""FakeHTMLCells are not present because of column or row spanning."""

def __init__(self):
super(FakeHTMLCell, self).__init__("")
Expand All @@ -145,7 +145,7 @@ class HTMLCell(Cell):
""" The Cell __init__ signature is:
def __init__(self, value=None, column=None, type=None):
where 'value' is the primary input, 'column' is a column name, and
type is messytables.types.StringType() or better."""
type is messytables.types.String() or better."""

def __init__(self, value=None, column=None, type=None, source=None):
assert value is None
Expand Down
109 changes: 90 additions & 19 deletions messytables/ods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,44 @@

from lxml import etree
from typecast import String, Decimal, Date
# TODO: do we add CurrencyType, BoolType, PercentageType, TimeType to typecast?

from messytables.core import RowSet, TableSet, Cell


# Compiled bytes regexes matching, respectively: the opening
# <office:document-content> tag, a <table:table> element, the value of a
# table:name attribute, and a <table:table-row> element.
# NOTE(review): ODS_NAMESPACES_TAG_MATCH, ODS_TABLE_MATCH and ODS_ROW_MATCH are
# each assigned twice below (one-line form, then wrapped form) with the same
# pattern and flags; the later assignment simply rebinds the name. This looks
# like merge/diff residue -- confirm and keep a single copy.
ODS_NAMESPACES_TAG_MATCH = re.compile(b"(<office:document-content[^>]*>)", re.MULTILINE)
ODS_TABLE_MATCH = re.compile(b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
ODS_NAMESPACES_TAG_MATCH = re.compile(
b"(<office:document-content[^>]*>)", re.MULTILINE)
ODS_TABLE_MATCH = re.compile(
b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?')
ODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
ODS_ROW_MATCH = re.compile(
b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)

# Namespace URI templates; the "%s" slot takes a "<name>:<version>" suffix.
NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
# Fully expanded URIs for the table and office namespaces, used with _tag()
# to build qualified element/attribute names.
NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0"

# Local (un-namespaced) element and attribute names looked up while parsing.
TABLE_CELL = 'table-cell'
VALUE_TYPE = 'value-type'
COLUMN_REPEAT = 'number-columns-repeated'
# Value given to cells that carry no value-type, and used as the placeholder
# for text children without text.
EMPTY_CELL_VALUE = ''

# Maps a cell's office:value-type to the local name of the office-namespace
# attribute that holds its value; _read_cell falls back to 'value' for
# value-types not listed here.
ODS_VALUE_TOKEN = {
"float": "value",
"date": "date-value",
"time": "time-value",
"boolean": "boolean-value",
"percentage": "value",
"currency": "value"
}

# Maps a cell's office:value-type to the type instance attached to the Cell;
# _read_cell falls back to String() for value-types not listed here.
# NOTE(review): BoolType, PercentageType and TimeType are not among the names
# imported in the lines visible above (only String, Decimal and Date come from
# typecast) -- confirm they are defined/imported, otherwise this raises
# NameError at import time.
ODS_TYPES = {
'float': Decimal(),
'date': Date(),
'boolean': BoolType(),
'percentage': PercentageType(),
'time': TimeType()
}


Expand Down Expand Up @@ -102,13 +128,13 @@ def __init__(self, sheet, window=None, namespace_tags=None):
else:
namespaces = {
"dc": u"http://purl.org/dc/elements/1.1/",
"draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
"number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0",
"office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0",
"svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0",
"table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0",
"text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0",
"calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0",
"draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
"number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
"office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
"svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
"table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
"text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
"calcext": NS_CAL_PTTN % u"calcext:1.0",
}

ods_header = u"<wrapper {0}>"\
Expand All @@ -128,20 +154,65 @@ def raw(self, sample=False):

block = self.namespace_tags[0] + row + self.namespace_tags[1]
partial = io.BytesIO(block)
empty_row = True

for action, element in etree.iterparse(partial, ('end',)):
if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
continue

for action, elem in etree.iterparse(partial, ('end',)):
if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell':
cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type')
children = elem.getchildren()
if children:
c = Cell(children[0].text,
type=ODS_TYPES.get(cell_type, String()))
row_data.append(c)
cell = _read_cell(element)
if empty_row is True and cell.value != EMPTY_CELL_VALUE:
empty_row = False

if not row_data:
repeat = element.attrib.get(
_tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))
if repeat:
number_of_repeat = int(repeat)
row_data += [cell] * number_of_repeat
else:
row_data.append(cell)

if empty_row:
# ignore blank lines
continue

del partial
yield row_data
del rows


def _read_cell(element):
    """Convert a ``table:table-cell`` element into a ``Cell``.

    The cell's ``office:value-type`` attribute decides how it is read:

    * ``string``: the text of the child elements, via ``_read_text_cell``.
    * ``currency``: the numeric value followed by the currency code,
      separated by a single space.
    * any other value-type: the raw attribute value named by
      ``ODS_VALUE_TOKEN``, typed via ``ODS_TYPES`` (``String()`` when the
      value-type is not listed there).
    * no value-type at all: an empty ``String`` cell.

    :param element: the parsed cell element; only ``attrib`` and, for string
        cells, its children are read.
    :returns: a ``Cell`` carrying the value and its type.
    """
    attributes = element.attrib
    cell_type = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))

    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=String())
    if cell_type == 'string':
        return _read_text_cell(element)

    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
    value = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))

    if cell_type == 'currency':
        currency = attributes.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # Either attribute may be missing, in which case .get() returns None
        # and plain string concatenation would raise TypeError. Join only the
        # parts that are present; with both present the result is unchanged
        # ("<value> <currency>").
        text = ' '.join(part for part in (value, currency) if part)
        # NOTE(review): CurrencyType is not imported in the lines of this
        # module visible during review -- confirm it is defined.
        return Cell(text, type=CurrencyType())

    return Cell(value, type=ODS_TYPES.get(cell_type, String()))


def _read_text_cell(element):
    """Build a ``String`` cell from the text of an element's direct children.

    Each direct child contributes its ``.text``; a child without text
    contributes ``EMPTY_CELL_VALUE``. The contributions are joined with
    newlines, so a cell with several child elements becomes a multi-line
    value. An element with no children yields ``EMPTY_CELL_VALUE``.

    :param element: the parsed cell element whose children are read.
    :returns: a ``Cell`` of type ``String``.
    """
    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()`` call without changing which nodes are seen.
    text_content = [child.text or EMPTY_CELL_VALUE for child in element]
    if text_content:
        cell_value = '\n'.join(text_content)
    else:
        cell_value = EMPTY_CELL_VALUE
    return Cell(cell_value, type=String())


def _tag(namespace, tag):
return '{%s}%s' % (namespace, tag)
19 changes: 0 additions & 19 deletions messytables/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,3 @@ def type_guess(rows, types=GUESS_TYPES, strict=False):
# add string guess so that we have at least one guess
guessers[j].add(cell.value)
return [g.best for g in guessers]


def types_processor(types, strict=False):
    """Return a row processor that casts each cell to its column type.

    :param types: sequence of type objects, one per column, each exposing a
        ``cast(value)`` method; ``None`` disables casting entirely.
    :param strict: when true, casting errors are re-raised instead of being
        ignored.
    :returns: a function ``apply_types(row_set, row)`` that casts the cells
        of ``row`` in place and returns the same ``row``.
    """
    def apply_types(row_set, row):
        if types is None:
            return row
        # zip_longest pads the shorter of row/types with None.
        for cell, column_type in six.moves.zip_longest(row, types):
            if column_type is None:
                # More cells than declared types: leave the cell untouched.
                continue
            try:
                cell.value = column_type.cast(cell.value)
                cell.type = column_type
            except Exception:
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt/SystemExit are never swallowed.
                if strict and column_type:
                    raise
        return row
    return apply_types
2 changes: 1 addition & 1 deletion test/test_any.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import unittest

from .util import horror_fobj
from util import horror_fobj
from nose.tools import assert_equal
from nose.plugins.skip import SkipTest
from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,
Expand Down
2 changes: 1 addition & 1 deletion test/test_guessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# import cProfile
# from pstats import Stats

from .util import horror_fobj
from util import horror_fobj
from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal
Expand Down
2 changes: 1 addition & 1 deletion test/test_properties.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

import unittest
from .util import horror_fobj
from util import horror_fobj
from messytables.any import any_tableset
from messytables.error import NoSuchPropertyError
from nose.tools import (
Expand Down
Loading

0 comments on commit 4aa3101

Please sign in to comment.