diff --git a/CHANGELOG.md b/CHANGELOG.md index a3b1d91..5e8f8cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +0.15.2 (8 February 2017) +* #165: detect ods types: boolean, currency, time and percentage. support repeated columns +* #160: Correct spelling of separator in source + 0.15.1 (29 September 2016) * #158: Add CDFV2-unknown to MIMELOOKUP * #157: Fix for Python Magic API change diff --git a/README.md b/README.md index 787a362..f160196 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) +# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/) A library for dealing with messy tabular data in several formats, guessing types and detecting headers. diff --git a/doc/index.rst b/doc/index.rst index 176e3f6..bb4c8f3 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -182,15 +182,8 @@ of a given column into all types and searching for the best match. .. automethod:: messytables.types.type_guess -The supported types include: - -.. autoclass:: messytables.types.StringType -.. autoclass:: messytables.types.IntegerType -.. autoclass:: messytables.types.FloatType -.. autoclass:: messytables.types.DecimalType -.. autoclass:: messytables.types.BoolType -.. autoclass:: messytables.types.DateType -.. autoclass:: messytables.types.DateUtilType +The supported types are detailed in +`typecast `_ Headers detection ----------------- diff --git a/horror/multilineods.ods b/horror/multilineods.ods new file mode 100644 index 0000000..c03f645 Binary files /dev/null and b/horror/multilineods.ods differ diff --git a/horror/ods_formats.ods b/horror/ods_formats.ods new file mode 100644 index 0000000..fa1f7ff Binary files /dev/null and b/horror/ods_formats.ods differ diff --git a/messytables/any.py b/messytables/any.py index 802aeb6..477e725 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -27,6 +27,7 @@ 'text/plain': 'CSV', # could be TAB. 'application/CDFV2-corrupt': 'XLS', 'application/CDFV2-unknown': 'XLS', + 'application/CDFV2': 'XLS', 'application/vnd.oasis.opendocument.spreadsheet': 'ODS', 'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS', } diff --git a/messytables/headers.py b/messytables/headers.py index 0b20453..cd53d39 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -24,7 +24,7 @@ def headers_guess(rows, tolerance=1): """Guess the offset and names of the headers of the row set. This will attempt to locate the first row within ``tolerance`` - of the mode of the number of rows in the row set sample. + of the mode of the number of columns in the row set sample. The return value is a tuple of the offset of the header row and the names of the columns. diff --git a/messytables/html.py b/messytables/html.py index 20c0f35..62c59d8 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -130,7 +130,7 @@ def identify_anatomy(tag): class FakeHTMLCell(Cell): - """FakeHTMLCells are not present because of column or row spannning.""" + """FakeHTMLCells are not present because of column or row spanning.""" def __init__(self): super(FakeHTMLCell, self).__init__("") @@ -145,7 +145,7 @@ class HTMLCell(Cell): """ The Cell __init__ signature is: def __init__(self, value=None, column=None, type=None): where 'value' is the primary input, 'column' is a column name, and - type is messytables.types.StringType() or better.""" + type is messytables.types.String() or better.""" def __init__(self, value=None, column=None, type=None, source=None): assert value is None diff --git a/messytables/ods.py b/messytables/ods.py index da35d57..140c2c6 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -4,18 +4,44 @@ from lxml import etree from typecast import String, Decimal, Date +# TODO: do we add CurrencyType, BoolType, PercentagePage, TimeType to typecast? from messytables.core import RowSet, TableSet, Cell -ODS_NAMESPACES_TAG_MATCH = re.compile(b"(]*>)", re.MULTILINE) -ODS_TABLE_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) +ODS_TABLE_MATCH = re.compile( + b".*?().*?", re.MULTILINE) ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') -ODS_ROW_MATCH = re.compile(b".*?().*?", re.MULTILINE) +ODS_ROW_MATCH = re.compile( + b".*?().*?", re.MULTILINE) + +NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" +NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" +NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0" +NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0" + +TABLE_CELL = 'table-cell' +VALUE_TYPE = 'value-type' +COLUMN_REPEAT = 'number-columns-repeated' +EMPTY_CELL_VALUE = '' + +ODS_VALUE_TOKEN = { + "float": "value", + "date": "date-value", + "time": "time-value", + "boolean": "boolean-value", + "percentage": "value", + "currency": "value" +} ODS_TYPES = { 'float': Decimal(), 'date': Date(), + 'boolean': BoolType(), + 'percentage': PercentageType(), + 'time': TimeType() } @@ -102,13 +128,13 @@ def __init__(self, sheet, window=None, namespace_tags=None): else: namespaces = { "dc": u"http://purl.org/dc/elements/1.1/", - "draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0", - "number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0", - "office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0", - "svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0", - "table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0", - "text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0", - "calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0", + "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0", + "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0", + "office": NS_OPENDOCUMENT_PTTN % u"office:1.0", + "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0", + "table": NS_OPENDOCUMENT_PTTN % u"table:1.0", + "text": NS_OPENDOCUMENT_PTTN % u"text:1.0", + "calcext": NS_CAL_PTTN % u"calcext:1.0", } ods_header = u""\ @@ -128,20 +154,65 @@ def raw(self, sample=False): block = self.namespace_tags[0] + row + self.namespace_tags[1] partial = io.BytesIO(block) + empty_row = True + + for action, element in etree.iterparse(partial, ('end',)): + if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): + continue - for action, elem in etree.iterparse(partial, ('end',)): - if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell': - cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type') - children = elem.getchildren() - if children: - c = Cell(children[0].text, - type=ODS_TYPES.get(cell_type, String())) - row_data.append(c) + cell = _read_cell(element) + if empty_row is True and cell.value != EMPTY_CELL_VALUE: + empty_row = False - if not row_data: + repeat = element.attrib.get( + _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) + if repeat: + number_of_repeat = int(repeat) + row_data += [cell] * number_of_repeat + else: + row_data.append(cell) + + if empty_row: # ignore blank lines continue del partial yield row_data del rows + + +def _read_cell(element): + cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, type=ODS_TYPES.get(cell_type, String())) + else: + cell = Cell(EMPTY_CELL_VALUE, type=String()) + + return cell + + +def _read_text_cell(element): + children = element.getchildren() + text_content = [] + for child in children: + if child.text: + text_content.append(child.text) + else: + text_content.append(EMPTY_CELL_VALUE) + if len(text_content) > 0: + cell_value = '\n'.join(text_content) + else: + cell_value = EMPTY_CELL_VALUE + return Cell(cell_value, type=String()) + + +def _tag(namespace, tag): + return '{%s}%s' % (namespace, tag) diff --git a/test/test_any.py b/test/test_any.py index bfb37a1..ce39b1c 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from .util import horror_fobj +from util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index 2d3e7a4..2150340 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -4,7 +4,7 @@ # import cProfile # from pstats import Stats -from .util import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr from nose.plugins.skip import SkipTest from nose.tools import assert_equal diff --git a/test/test_properties.py b/test/test_properties.py index b4e4a0c..0a7ca09 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from .util import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index 786c901..8e7e8e6 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- import unittest -from .util import horror_fobj +from util import horror_fobj +from decimal import Decimal from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest @@ -190,7 +191,8 @@ def rows(skip_policy): row_set = table_set.tables[0] return row_set - second = lambda r: r[1].value + def second(row): + return row[1].value assert "goodbye" in list(map(second, rows(True))) assert " goodbye" in list(map(second, rows(False))) @@ -310,9 +312,9 @@ def test_read_large_ods(self): assert_equal(6, len(table_set.tables)) row_set = table_set.tables[0] row = next(row_set.raw()) - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) for row in row_set.sample: - assert len(row) == 5, len(row) + assert len(row) == 16384, len(row) def test_ods_version_4412(self): fh = horror_fobj('loffice-4.4.1.2.ods') @@ -336,6 +338,101 @@ def test_ods_read_past_blank_lines(self): assert_equal(rows[2][0], 'Jane') assert_equal(rows[3][0], 'Ian') + def test_ods_read_all_supported_formats(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = row_set_to_rows(row_set) + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0], "2014-11-11") + assert_equal(rows[2][0], "2001-01-01") + assert_equal(rows[3][0], '') + # time formats + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1], "PT11H12M12S") + assert_equal(rows[2][1], "PT00H00M12S") + assert_equal(rows[4][1], 'PT27H17M54S') + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], 'true') + assert_equal(rows[2][2], 'false') + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], '11.11') + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], '1 GBP') + assert_equal(rows[2][4], '-10000 GBP') + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], '2') + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], '3') + assert_equal(rows[4][6], '11') + # Scientific value is used but its notation is not + assert_equal(rows[1][7], '100000') + # Fraction + assert_equal(rows[1][8], '1.25') + # Text + assert_equal(rows[1][9], "abc") + + def test_ods_read_all_supported_formats_casted(self): + fh = horror_fobj('ods_formats.ods') + table_set = ODSTableSet(fh) + assert_equal(3, len(table_set.tables)) + row_set = table_set.tables[0] + rows = cast_row_set_to_rows(row_set) + date_format = "%d/%m/%Y" + assert_equal(rows[0][0], "Date") + assert_equal(rows[1][0].strftime(date_format), "11/11/2014") + assert_equal(rows[2][0].strftime(date_format), "01/01/2001") + assert_equal(rows[3][0], '') + # time formats + time_format = "%S:%M:%H" + assert_equal(rows[0][1], "Time") + assert_equal(rows[1][1].strftime(time_format), "12:12:11") + assert_equal(rows[2][1].strftime(time_format), "12:00:00") + assert_equal(rows[3][1], 0) + assert_equal(rows[4][1], datetime.timedelta(hours=27, + minutes=17, + seconds=54)) + assert_equal(rows[5][1], "Other") + # boolean + assert_equal(rows[0][2], "Boolean") + assert_equal(rows[1][2], True) + assert_equal(rows[2][2], False) + # Float + assert_equal(rows[0][3], "Float") + assert_equal(rows[1][3], Decimal('11.11')) + # Currency + assert_equal(rows[0][4], "Currency") + assert_equal(rows[1][4], Decimal('1')) + assert_equal(rows[2][4], Decimal('-10000')) + # Percentage + assert_equal(rows[0][5], "Percentage") + assert_equal(rows[1][5], Decimal('0.02')) + # int + assert_equal(rows[0][6], "Int") + assert_equal(rows[1][6], 3) + assert_equal(rows[4][6], 11) + # Scientific value is used but its notation is not + assert_equal(rows[1][7], 100000) + # Fraction + assert_equal(rows[1][8], Decimal('1.25')) + # Text + assert_equal(rows[1][9], "abc") + + def test_ods_read_multi_line_cell(self): + fh = horror_fobj('multilineods.ods') + table_set = ODSTableSet(fh) + row_set = table_set.tables[0] + rows = row_set_to_rows(row_set) + assert_equal(rows[0][0], '1\n2\n3\n4') + + def row_set_to_rows(row_set): rows = [] for row in row_set: @@ -343,6 +440,13 @@ def row_set_to_rows(row_set): return rows +def cast_row_set_to_rows(row_set): + rows = [] + for row in row_set: + rows.append([cell.type.cast(cell.value) for cell in row]) + return rows + + class XlsxBackwardsCompatibilityTest(unittest.TestCase): def test_that_xlsx_is_handled_by_xls_table_set(self): """ @@ -575,8 +679,8 @@ def setUp(self): PDFTableSet(fh) except ImportError: # Optional library isn't installed. Skip the tests. - raise SkipTest("pdftables is not installed, skipping PDF tests") - + raise SkipTest( + "pdftables is not installed, skipping PDF tests") def test_read_simple_pdf(self): with horror_fobj('simple.pdf') as fh: diff --git a/test/test_rowset.py b/test/test_rowset.py index 39077d9..52e3928 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from .util import horror_fobj +from util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index f2e5723..2ed6efd 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -4,7 +4,7 @@ import requests import six.moves.urllib as urllib -from .util import horror_fobj +from util import horror_fobj import httpretty from nose.tools import assert_equal diff --git a/test/test_tableset.py b/test/test_tableset.py index 9d0c127..d03de88 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from .util import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError