Skip to content

Commit

Permalink
Merge branch 'master' into cleanup-mt2-redux
Browse files Browse the repository at this point in the history
  • Loading branch information
David Read committed Jul 5, 2019
2 parents 6638e58 + f6f2250 commit 126630d
Show file tree
Hide file tree
Showing 16 changed files with 217 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.15.2 (8 February 2017)
* #165: detect ods types: boolean, currency, time and percentage. support repeated columns
* #160: Correct spelling of separator in source

0.15.1 (29 September 2016)
* #158: Add CDFV2-unknown to MIMELOOKUP
* #157: Fix for Python Magic API change
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/)

A library for dealing with messy tabular data in several formats, guessing types and detecting headers.

Expand Down
11 changes: 2 additions & 9 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,8 @@ of a given column into all types and searching for the best match.

.. automethod:: messytables.types.type_guess

The supported types include:

.. autoclass:: messytables.types.StringType
.. autoclass:: messytables.types.IntegerType
.. autoclass:: messytables.types.FloatType
.. autoclass:: messytables.types.DecimalType
.. autoclass:: messytables.types.BoolType
.. autoclass:: messytables.types.DateType
.. autoclass:: messytables.types.DateUtilType
The supported types are detailed in
`typecast <https://github.com/pudo/typecast#typecast>`_

Headers detection
-----------------
Expand Down
Binary file added horror/multilineods.ods
Binary file not shown.
Binary file added horror/ods_formats.ods
Binary file not shown.
1 change: 1 addition & 0 deletions messytables/any.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
'text/plain': 'CSV', # could be TAB.
'application/CDFV2-corrupt': 'XLS',
'application/CDFV2-unknown': 'XLS',
'application/CDFV2': 'XLS',
'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS',
}
Expand Down
2 changes: 1 addition & 1 deletion messytables/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def headers_guess(rows, tolerance=1):
"""Guess the offset and names of the headers of the row set.
This will attempt to locate the first row within ``tolerance``
of the mode of the number of rows in the row set sample.
of the mode of the number of columns in the row set sample.
The return value is a tuple of the offset of the header row
and the names of the columns.
Expand Down
4 changes: 2 additions & 2 deletions messytables/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def identify_anatomy(tag):


class FakeHTMLCell(Cell):
"""FakeHTMLCells are not present because of column or row spannning."""
"""FakeHTMLCells are not present because of column or row spanning."""

def __init__(self):
super(FakeHTMLCell, self).__init__("")
Expand All @@ -145,7 +145,7 @@ class HTMLCell(Cell):
""" The Cell __init__ signature is:
def __init__(self, value=None, column=None, type=None):
where 'value' is the primary input, 'column' is a column name, and
type is messytables.types.StringType() or better."""
type is messytables.types.String() or better."""

def __init__(self, value=None, column=None, type=None, source=None):
assert value is None
Expand Down
109 changes: 90 additions & 19 deletions messytables/ods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,44 @@

from lxml import etree
from typecast import String, Decimal, Date
# TODO: do we add CurrencyType, BoolType, PercentageType, TimeType to typecast?

from messytables.core import RowSet, TableSet, Cell


ODS_NAMESPACES_TAG_MATCH = re.compile(b"(<office:document-content[^>]*>)", re.MULTILINE)
ODS_TABLE_MATCH = re.compile(b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
ODS_NAMESPACES_TAG_MATCH = re.compile(
b"(<office:document-content[^>]*>)", re.MULTILINE)
ODS_TABLE_MATCH = re.compile(
b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?')
ODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
ODS_ROW_MATCH = re.compile(
b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)

NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0"

TABLE_CELL = 'table-cell'
VALUE_TYPE = 'value-type'
COLUMN_REPEAT = 'number-columns-repeated'
EMPTY_CELL_VALUE = ''

ODS_VALUE_TOKEN = {
"float": "value",
"date": "date-value",
"time": "time-value",
"boolean": "boolean-value",
"percentage": "value",
"currency": "value"
}

ODS_TYPES = {
'float': Decimal(),
'date': Date(),
'boolean': BoolType(),
'percentage': PercentageType(),
'time': TimeType()
}


Expand Down Expand Up @@ -102,13 +128,13 @@ def __init__(self, sheet, window=None, namespace_tags=None):
else:
namespaces = {
"dc": u"http://purl.org/dc/elements/1.1/",
"draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
"number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0",
"office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0",
"svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0",
"table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0",
"text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0",
"calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0",
"draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
"number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
"office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
"svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
"table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
"text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
"calcext": NS_CAL_PTTN % u"calcext:1.0",
}

ods_header = u"<wrapper {0}>"\
Expand All @@ -128,20 +154,65 @@ def raw(self, sample=False):

block = self.namespace_tags[0] + row + self.namespace_tags[1]
partial = io.BytesIO(block)
empty_row = True

for action, element in etree.iterparse(partial, ('end',)):
if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
continue

for action, elem in etree.iterparse(partial, ('end',)):
if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell':
cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type')
children = elem.getchildren()
if children:
c = Cell(children[0].text,
type=ODS_TYPES.get(cell_type, String()))
row_data.append(c)
cell = _read_cell(element)
if empty_row is True and cell.value != EMPTY_CELL_VALUE:
empty_row = False

if not row_data:
repeat = element.attrib.get(
_tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))
if repeat:
number_of_repeat = int(repeat)
row_data += [cell] * number_of_repeat
else:
row_data.append(cell)

if empty_row:
# ignore blank lines
continue

del partial
yield row_data
del rows


def _read_cell(element):
    """Convert one ``table-cell`` element into a ``Cell``.

    The ``office:value-type`` attribute of ``element`` selects the handling:

    * missing -> an empty ``String`` cell (``EMPTY_CELL_VALUE``);
    * ``'string'`` -> delegated to ``_read_text_cell``;
    * ``'currency'`` -> ``"<value> <currency>"`` typed as ``CurrencyType``;
    * anything else -> the attribute named by ``ODS_VALUE_TOKEN`` (default
      ``'value'``), typed via ``ODS_TYPES`` with ``String`` as fallback.

    :param element: the parsed ``table-cell`` element.
    :returns: the ``Cell`` built from the element's attributes or text.
    """
    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=String())
    if cell_type == 'string':
        return _read_text_cell(element)

    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
    value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
    if cell_type == 'currency':
        currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # Either attribute may be absent (attrib.get returns None); join only
        # what is present instead of raising TypeError on str + None.
        parts = [part for part in (value, currency) if part is not None]
        return Cell(' '.join(parts), type=CurrencyType())
    return Cell(value, type=ODS_TYPES.get(cell_type, String()))


def _read_text_cell(element):
    """Build a ``String`` cell from the text of ``element``'s direct children.

    Every direct child contributes one line: its ``.text``, or
    ``EMPTY_CELL_VALUE`` when the child has no text. The lines are joined
    with ``'\\n'`` (so a multi-line cell yields a multi-line string). Only
    each child's own ``.text`` is read; nested elements and tails are not.

    :param element: the parsed ``table-cell`` element.
    :returns: a ``Cell`` of type ``String``; its value is
        ``EMPTY_CELL_VALUE`` when the element has no children.
    """
    # Iterate the element itself: Element.getchildren() is deprecated.
    text_content = [child.text or EMPTY_CELL_VALUE for child in element]
    if text_content:
        cell_value = '\n'.join(text_content)
    else:
        cell_value = EMPTY_CELL_VALUE
    return Cell(cell_value, type=String())


def _tag(namespace, tag):
return '{%s}%s' % (namespace, tag)
2 changes: 1 addition & 1 deletion test/test_any.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import unittest

from .util import horror_fobj
from util import horror_fobj
from nose.tools import assert_equal
from nose.plugins.skip import SkipTest
from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,
Expand Down
2 changes: 1 addition & 1 deletion test/test_guessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# import cProfile
# from pstats import Stats

from .util import horror_fobj
from util import horror_fobj
from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal
Expand Down
2 changes: 1 addition & 1 deletion test/test_properties.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

import unittest
from .util import horror_fobj
from util import horror_fobj
from messytables.any import any_tableset
from messytables.error import NoSuchPropertyError
from nose.tools import (
Expand Down
116 changes: 110 additions & 6 deletions test/test_read.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
import unittest

from .util import horror_fobj
from util import horror_fobj
from decimal import Decimal
from nose.plugins.attrib import attr
from nose.tools import assert_equal
from nose.plugins.skip import SkipTest
Expand Down Expand Up @@ -190,7 +191,8 @@ def rows(skip_policy):
row_set = table_set.tables[0]
return row_set

second = lambda r: r[1].value
def second(row):
return row[1].value

assert "goodbye" in list(map(second, rows(True)))
assert " goodbye" in list(map(second, rows(False)))
Expand Down Expand Up @@ -310,9 +312,9 @@ def test_read_large_ods(self):
assert_equal(6, len(table_set.tables))
row_set = table_set.tables[0]
row = next(row_set.raw())
assert len(row) == 5, len(row)
assert len(row) == 16384, len(row)
for row in row_set.sample:
assert len(row) == 5, len(row)
assert len(row) == 16384, len(row)

def test_ods_version_4412(self):
fh = horror_fobj('loffice-4.4.1.2.ods')
Expand All @@ -336,13 +338,115 @@ def test_ods_read_past_blank_lines(self):
assert_equal(rows[2][0], 'Jane')
assert_equal(rows[3][0], 'Ian')

def test_ods_read_all_supported_formats(self):
# Loads the ods_formats.ods fixture and checks the raw (uncast) value of
# cells in the first of its three tables, one column per ODS value type.
# Row 0 holds the column titles; later rows hold sample values.
fh = horror_fobj('ods_formats.ods')
table_set = ODSTableSet(fh)
assert_equal(3, len(table_set.tables))
row_set = table_set.tables[0]
rows = row_set_to_rows(row_set)
assert_equal(rows[0][0], "Date")
assert_equal(rows[1][0], "2014-11-11")
assert_equal(rows[2][0], "2001-01-01")
assert_equal(rows[3][0], '')
# time formats
assert_equal(rows[0][1], "Time")
assert_equal(rows[1][1], "PT11H12M12S")
assert_equal(rows[2][1], "PT00H00M12S")
assert_equal(rows[4][1], 'PT27H17M54S')
assert_equal(rows[5][1], "Other")
# boolean
assert_equal(rows[0][2], "Boolean")
assert_equal(rows[1][2], 'true')
assert_equal(rows[2][2], 'false')
# Float
assert_equal(rows[0][3], "Float")
assert_equal(rows[1][3], '11.11')
# Currency
# (raw currency values are expected as "<amount> <currency code>")
assert_equal(rows[0][4], "Currency")
assert_equal(rows[1][4], '1 GBP')
assert_equal(rows[2][4], '-10000 GBP')
# Percentage
assert_equal(rows[0][5], "Percentage")
assert_equal(rows[1][5], '2')
# int
assert_equal(rows[0][6], "Int")
assert_equal(rows[1][6], '3')
assert_equal(rows[4][6], '11')
# Scientific value is used but its notation is not
assert_equal(rows[1][7], '100000')
# Fraction
assert_equal(rows[1][8], '1.25')
# Text
assert_equal(rows[1][9], "abc")

def test_ods_read_all_supported_formats_casted(self):
# Same fixture as the uncast test, but every cell value is passed through
# cell.type.cast() (via cast_row_set_to_rows) before being compared.
# NOTE(review): datetime.timedelta is used below, but no `import datetime`
# is visible in this hunk -- confirm the module imports it.
fh = horror_fobj('ods_formats.ods')
table_set = ODSTableSet(fh)
assert_equal(3, len(table_set.tables))
row_set = table_set.tables[0]
rows = cast_row_set_to_rows(row_set)
date_format = "%d/%m/%Y"
assert_equal(rows[0][0], "Date")
assert_equal(rows[1][0].strftime(date_format), "11/11/2014")
assert_equal(rows[2][0].strftime(date_format), "01/01/2001")
assert_equal(rows[3][0], '')
# time formats
# The format string is seconds:minutes:hours, so the expected strings
# below read in that order.
time_format = "%S:%M:%H"
assert_equal(rows[0][1], "Time")
assert_equal(rows[1][1].strftime(time_format), "12:12:11")
assert_equal(rows[2][1].strftime(time_format), "12:00:00")
assert_equal(rows[3][1], 0)
assert_equal(rows[4][1], datetime.timedelta(hours=27,
minutes=17,
seconds=54))
assert_equal(rows[5][1], "Other")
# boolean
assert_equal(rows[0][2], "Boolean")
assert_equal(rows[1][2], True)
assert_equal(rows[2][2], False)
# Float
assert_equal(rows[0][3], "Float")
assert_equal(rows[1][3], Decimal('11.11'))
# Currency
assert_equal(rows[0][4], "Currency")
assert_equal(rows[1][4], Decimal('1'))
assert_equal(rows[2][4], Decimal('-10000'))
# Percentage
assert_equal(rows[0][5], "Percentage")
assert_equal(rows[1][5], Decimal('0.02'))
# int
assert_equal(rows[0][6], "Int")
assert_equal(rows[1][6], 3)
assert_equal(rows[4][6], 11)
# Scientific value is used but its notation is not
assert_equal(rows[1][7], 100000)
# Fraction
assert_equal(rows[1][8], Decimal('1.25'))
# Text
assert_equal(rows[1][9], "abc")

def test_ods_read_multi_line_cell(self):
# Checks that the first cell of the multilineods.ods fixture is read as a
# single value with its lines joined by newline characters.
fh = horror_fobj('multilineods.ods')
table_set = ODSTableSet(fh)
row_set = table_set.tables[0]
rows = row_set_to_rows(row_set)
assert_equal(rows[0][0], '1\n2\n3\n4')


def row_set_to_rows(row_set):
    """Return the raw values of ``row_set`` as a list of lists.

    :param row_set: an iterable of rows, each an iterable of cells exposing
        a ``value`` attribute.
    :returns: one inner list per row holding each cell's ``value``.
    """
    return [[cell.value for cell in row] for row in row_set]


def cast_row_set_to_rows(row_set):
    """Return the cast values of ``row_set`` as a list of lists.

    :param row_set: an iterable of rows, each an iterable of cells exposing
        ``value`` and a ``type`` object with a ``cast`` method.
    :returns: one inner list per row holding ``cell.type.cast(cell.value)``
        for each cell.
    """
    return [[cell.type.cast(cell.value) for cell in row] for row in row_set]


class XlsxBackwardsCompatibilityTest(unittest.TestCase):
def test_that_xlsx_is_handled_by_xls_table_set(self):
"""
Expand Down Expand Up @@ -575,8 +679,8 @@ def setUp(self):
PDFTableSet(fh)
except ImportError:
# Optional library isn't installed. Skip the tests.
raise SkipTest("pdftables is not installed, skipping PDF tests")

raise SkipTest(
"pdftables is not installed, skipping PDF tests")

def test_read_simple_pdf(self):
with horror_fobj('simple.pdf') as fh:
Expand Down
Loading

0 comments on commit 126630d

Please sign in to comment.