Merge branch 'master' into cleanup-mt2-redux

okfn · Jul 5, 2019 · 126630d · 126630d
2 parents 6638e58 + f6f2250
commit 126630d
Show file tree

Hide file tree

Showing 16 changed files with 217 additions and 44 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+0.15.2 (8 February 2017)
+* #165: Detect ODS types: boolean, currency, time and percentage. Support repeated columns
+* #160: Correct spelling of separator in source
+
 0.15.1 (29 September 2016)
 * #158: Add CDFV2-unknown to MIMELOOKUP
 * #157: Fix for Python Magic API change

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
+# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/)
 
 A library for dealing with messy tabular data in several formats, guessing types and detecting headers.
 

diff --git a/doc/index.rst b/doc/index.rst
@@ -182,15 +182,8 @@ of a given column into all types and searching for the best match.
 
 .. automethod:: messytables.types.type_guess
 
-The supported types include:
-
-.. autoclass:: messytables.types.StringType
-.. autoclass:: messytables.types.IntegerType
-.. autoclass:: messytables.types.FloatType
-.. autoclass:: messytables.types.DecimalType
-.. autoclass:: messytables.types.BoolType
-.. autoclass:: messytables.types.DateType
-.. autoclass:: messytables.types.DateUtilType
+The supported types are detailed in
+`typecast <https://github.com/pudo/typecast#typecast>`_
 
 Headers detection
 -----------------

diff --git a/horror/multilineods.ods b/horror/multilineods.ods
diff --git a/horror/ods_formats.ods b/horror/ods_formats.ods
diff --git a/messytables/any.py b/messytables/any.py
@@ -27,6 +27,7 @@
               'text/plain': 'CSV',  # could be TAB.
               'application/CDFV2-corrupt': 'XLS',
               'application/CDFV2-unknown': 'XLS',
+              'application/CDFV2': 'XLS',
               'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
               'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS',
               }

diff --git a/messytables/headers.py b/messytables/headers.py
@@ -24,7 +24,7 @@ def headers_guess(rows, tolerance=1):
     """Guess the offset and names of the headers of the row set.
 
     This will attempt to locate the first row within ``tolerance``
-    of the mode of the number of rows in the row set sample.
+    of the mode of the number of columns in the row set sample.
 
     The return value is a tuple of the offset of the header row
     and the names of the columns.

diff --git a/messytables/html.py b/messytables/html.py
@@ -130,7 +130,7 @@ def identify_anatomy(tag):
 
 
 class FakeHTMLCell(Cell):
-    """FakeHTMLCells are not present because of column or row spannning."""
+    """FakeHTMLCells are not present because of column or row spanning."""
 
     def __init__(self):
         super(FakeHTMLCell, self).__init__("")
@@ -145,7 +145,7 @@ class HTMLCell(Cell):
     """ The Cell __init__ signature is:
     def __init__(self, value=None, column=None, type=None):
     where 'value' is the primary input, 'column' is a column name, and
-    type is messytables.types.StringType() or better."""
+    type is messytables.types.String() or better."""
 
     def __init__(self, value=None, column=None, type=None, source=None):
         assert value is None

diff --git a/messytables/ods.py b/messytables/ods.py
@@ -4,18 +4,44 @@
 
 from lxml import etree
 from typecast import String, Decimal, Date
+# TODO: should CurrencyType, BoolType, PercentageType and TimeType be added to typecast?
 
 from messytables.core import RowSet, TableSet, Cell
 
 
-ODS_NAMESPACES_TAG_MATCH = re.compile(b"(<office:document-content[^>]*>)", re.MULTILINE)
-ODS_TABLE_MATCH = re.compile(b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
+ODS_NAMESPACES_TAG_MATCH = re.compile(
+    b"(<office:document-content[^>]*>)", re.MULTILINE)
+ODS_TABLE_MATCH = re.compile(
+    b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
 ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?')
-ODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
+ODS_ROW_MATCH = re.compile(
+    b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
+
+NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
+NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
+NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
+NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0"
+
+TABLE_CELL = 'table-cell'
+VALUE_TYPE = 'value-type'
+COLUMN_REPEAT = 'number-columns-repeated'
+EMPTY_CELL_VALUE = ''
+
+ODS_VALUE_TOKEN = {
+    "float": "value",
+    "date": "date-value",
+    "time": "time-value",
+    "boolean": "boolean-value",
+    "percentage": "value",
+    "currency": "value"
+}
 
 ODS_TYPES = {
     'float': Decimal(),
     'date': Date(),
+    'boolean': BoolType(),
+    'percentage': PercentageType(),
+    'time': TimeType()
 }
 
 
@@ -102,13 +128,13 @@ def __init__(self, sheet, window=None, namespace_tags=None):
         else:
             namespaces = {
                 "dc": u"http://purl.org/dc/elements/1.1/",
-                "draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
-                "number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0",
-                "office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0",
-                "svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0",
-                "table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0",
-                "text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0",
-                "calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0",
+                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
+                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
+                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
+                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
+                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
+                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
+                "calcext": NS_CAL_PTTN % u"calcext:1.0",
             }
 
             ods_header = u"<wrapper {0}>"\
@@ -128,20 +154,65 @@ def raw(self, sample=False):
 
             block = self.namespace_tags[0] + row + self.namespace_tags[1]
             partial = io.BytesIO(block)
+            empty_row = True
+
+            for action, element in etree.iterparse(partial, ('end',)):
+                if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
+                    continue
 
-            for action, elem in etree.iterparse(partial, ('end',)):
-                if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell':
-                    cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type')
-                    children = elem.getchildren()
-                    if children:
-                        c = Cell(children[0].text,
-                                 type=ODS_TYPES.get(cell_type, String()))
-                        row_data.append(c)
+                cell = _read_cell(element)
+                if empty_row is True and cell.value != EMPTY_CELL_VALUE:
+                    empty_row = False
 
-            if not row_data:
+                repeat = element.attrib.get(
+                    _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))
+                if repeat:
+                    number_of_repeat = int(repeat)
+                    row_data += [cell] * number_of_repeat
+                else:
+                    row_data.append(cell)
+
+            if empty_row:
                 # ignore blank lines
                 continue
 
             del partial
             yield row_data
         del rows
+
+
+def _read_cell(element):
+    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
+    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
+    if cell_type == 'string':
+        cell = _read_text_cell(element)
+    elif cell_type == 'currency':
+        value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
+        currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
+        cell = Cell(value + ' ' + currency, type=CurrencyType())
+    elif cell_type is not None:
+        value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
+        cell = Cell(value, type=ODS_TYPES.get(cell_type, String()))
+    else:
+        cell = Cell(EMPTY_CELL_VALUE, type=String())
+
+    return cell
+
+
+def _read_text_cell(element):
+    children = element.getchildren()
+    text_content = []
+    for child in children:
+        if child.text:
+            text_content.append(child.text)
+        else:
+            text_content.append(EMPTY_CELL_VALUE)
+    if len(text_content) > 0:
+        cell_value = '\n'.join(text_content)
+    else:
+        cell_value = EMPTY_CELL_VALUE
+    return Cell(cell_value, type=String())
+
+
+def _tag(namespace, tag):
+    return '{%s}%s' % (namespace, tag)
diff --git a/test/test_any.py b/test/test_any.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import unittest
 
-from .util import horror_fobj
+from util import horror_fobj
 from nose.tools import assert_equal
 from nose.plugins.skip import SkipTest
 from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,

diff --git a/test/test_guessing.py b/test/test_guessing.py
@@ -4,7 +4,7 @@
 # import cProfile
 # from pstats import Stats
 
-from .util import horror_fobj
+from util import horror_fobj
 from nose.plugins.attrib import attr
 from nose.plugins.skip import SkipTest
 from nose.tools import assert_equal

diff --git a/test/test_properties.py b/test/test_properties.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import unittest
-from .util import horror_fobj
+from util import horror_fobj
 from messytables.any import any_tableset
 from messytables.error import NoSuchPropertyError
 from nose.tools import (

diff --git a/test/test_read.py b/test/test_read.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import unittest
 
-from .util import horror_fobj
+from util import horror_fobj
+from decimal import Decimal
 from nose.plugins.attrib import attr
 from nose.tools import assert_equal
 from nose.plugins.skip import SkipTest
@@ -190,7 +191,8 @@ def rows(skip_policy):
             row_set = table_set.tables[0]
             return row_set
 
-        second = lambda r: r[1].value
+        def second(row):
+            return row[1].value
 
         assert "goodbye" in list(map(second, rows(True)))
         assert "    goodbye" in list(map(second, rows(False)))
@@ -310,9 +312,9 @@ def test_read_large_ods(self):
         assert_equal(6, len(table_set.tables))
         row_set = table_set.tables[0]
         row = next(row_set.raw())
-        assert len(row) == 5, len(row)
+        assert len(row) == 16384, len(row)
         for row in row_set.sample:
-            assert len(row) == 5, len(row)
+            assert len(row) == 16384, len(row)
 
     def test_ods_version_4412(self):
         fh = horror_fobj('loffice-4.4.1.2.ods')
@@ -336,13 +338,115 @@ def test_ods_read_past_blank_lines(self):
         assert_equal(rows[2][0], 'Jane')
         assert_equal(rows[3][0], 'Ian')
 
+    def test_ods_read_all_supported_formats(self):
+        fh = horror_fobj('ods_formats.ods')
+        table_set = ODSTableSet(fh)
+        assert_equal(3, len(table_set.tables))
+        row_set = table_set.tables[0]
+        rows = row_set_to_rows(row_set)
+        assert_equal(rows[0][0], "Date")
+        assert_equal(rows[1][0], "2014-11-11")
+        assert_equal(rows[2][0], "2001-01-01")
+        assert_equal(rows[3][0], '')
+        # time formats
+        assert_equal(rows[0][1], "Time")
+        assert_equal(rows[1][1], "PT11H12M12S")
+        assert_equal(rows[2][1], "PT00H00M12S")
+        assert_equal(rows[4][1], 'PT27H17M54S')
+        assert_equal(rows[5][1], "Other")
+        # boolean
+        assert_equal(rows[0][2], "Boolean")
+        assert_equal(rows[1][2], 'true')
+        assert_equal(rows[2][2], 'false')
+        # Float
+        assert_equal(rows[0][3], "Float")
+        assert_equal(rows[1][3], '11.11')
+        # Currency
+        assert_equal(rows[0][4], "Currency")
+        assert_equal(rows[1][4], '1 GBP')
+        assert_equal(rows[2][4], '-10000 GBP')
+        # Percentage
+        assert_equal(rows[0][5], "Percentage")
+        assert_equal(rows[1][5], '2')
+        # int
+        assert_equal(rows[0][6], "Int")
+        assert_equal(rows[1][6], '3')
+        assert_equal(rows[4][6], '11')
+        # Scientific: the cell's numeric value is read, not its displayed notation
+        assert_equal(rows[1][7], '100000')
+        # Fraction
+        assert_equal(rows[1][8], '1.25')
+        # Text
+        assert_equal(rows[1][9], "abc")
+
+    def test_ods_read_all_supported_formats_casted(self):
+        fh = horror_fobj('ods_formats.ods')
+        table_set = ODSTableSet(fh)
+        assert_equal(3, len(table_set.tables))
+        row_set = table_set.tables[0]
+        rows = cast_row_set_to_rows(row_set)
+        date_format = "%d/%m/%Y"
+        assert_equal(rows[0][0], "Date")
+        assert_equal(rows[1][0].strftime(date_format), "11/11/2014")
+        assert_equal(rows[2][0].strftime(date_format), "01/01/2001")
+        assert_equal(rows[3][0], '')
+        # time formats
+        time_format = "%S:%M:%H"
+        assert_equal(rows[0][1], "Time")
+        assert_equal(rows[1][1].strftime(time_format), "12:12:11")
+        assert_equal(rows[2][1].strftime(time_format), "12:00:00")
+        assert_equal(rows[3][1], 0)
+        assert_equal(rows[4][1], datetime.timedelta(hours=27,
+                                                    minutes=17,
+                                                    seconds=54))
+        assert_equal(rows[5][1], "Other")
+        # boolean
+        assert_equal(rows[0][2], "Boolean")
+        assert_equal(rows[1][2], True)
+        assert_equal(rows[2][2], False)
+        # Float
+        assert_equal(rows[0][3], "Float")
+        assert_equal(rows[1][3], Decimal('11.11'))
+        # Currency
+        assert_equal(rows[0][4], "Currency")
+        assert_equal(rows[1][4], Decimal('1'))
+        assert_equal(rows[2][4], Decimal('-10000'))
+        # Percentage
+        assert_equal(rows[0][5], "Percentage")
+        assert_equal(rows[1][5], Decimal('0.02'))
+        # int
+        assert_equal(rows[0][6], "Int")
+        assert_equal(rows[1][6], 3)
+        assert_equal(rows[4][6], 11)
+        # Scientific: the cell's numeric value is read, not its displayed notation
+        assert_equal(rows[1][7], 100000)
+        # Fraction
+        assert_equal(rows[1][8], Decimal('1.25'))
+        # Text
+        assert_equal(rows[1][9], "abc")
+
+    def test_ods_read_multi_line_cell(self):
+        fh = horror_fobj('multilineods.ods')
+        table_set = ODSTableSet(fh)
+        row_set = table_set.tables[0]
+        rows = row_set_to_rows(row_set)
+        assert_equal(rows[0][0], '1\n2\n3\n4')
+
+
 def row_set_to_rows(row_set):
     rows = []
     for row in row_set:
         rows.append([cell.value for cell in row])
     return rows
 
 
+def cast_row_set_to_rows(row_set):
+    rows = []
+    for row in row_set:
+        rows.append([cell.type.cast(cell.value) for cell in row])
+    return rows
+
+
 class XlsxBackwardsCompatibilityTest(unittest.TestCase):
     def test_that_xlsx_is_handled_by_xls_table_set(self):
         """
@@ -575,8 +679,8 @@ def setUp(self):
                 PDFTableSet(fh)
             except ImportError:
                 # Optional library isn't installed. Skip the tests.
-                raise SkipTest("pdftables is not installed, skipping PDF tests")
-
+                raise SkipTest(
+                    "pdftables is not installed, skipping PDF tests")
 
     def test_read_simple_pdf(self):
         with horror_fobj('simple.pdf') as fh: