Merge branch 'master' into cleanup-mt2-redux

okfn · Jul 5, 2019 · 4aa3101 · 4aa3101
2 parents 6638e58 + f6f2250
commit 4aa3101
Show file tree

Hide file tree

Showing 17 changed files with 217 additions and 63 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+0.15.2 (8 February 2017)
+* #165: Detect ODS types: boolean, currency, time and percentage. Support repeated columns
+* #160: Correct spelling of separator in source
+
 0.15.1 (29 September 2016)
 * #158: Add CDFV2-unknown to MIMELOOKUP
 * #157: Fix for Python Magic API change

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master)
+# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/)
 
 A library for dealing with messy tabular data in several formats, guessing types and detecting headers.
 

diff --git a/doc/index.rst b/doc/index.rst
@@ -182,15 +182,8 @@ of a given column into all types and searching for the best match.
 
 .. automethod:: messytables.types.type_guess
 
-The supported types include:
-
-.. autoclass:: messytables.types.StringType
-.. autoclass:: messytables.types.IntegerType
-.. autoclass:: messytables.types.FloatType
-.. autoclass:: messytables.types.DecimalType
-.. autoclass:: messytables.types.BoolType
-.. autoclass:: messytables.types.DateType
-.. autoclass:: messytables.types.DateUtilType
+The supported types are detailed in the
+`typecast <https://github.com/pudo/typecast#typecast>`_ documentation.
 
 Headers detection
 -----------------

diff --git a/horror/multilineods.ods b/horror/multilineods.ods
diff --git a/horror/ods_formats.ods b/horror/ods_formats.ods
diff --git a/messytables/any.py b/messytables/any.py
@@ -27,6 +27,7 @@
               'text/plain': 'CSV',  # could be TAB.
               'application/CDFV2-corrupt': 'XLS',
               'application/CDFV2-unknown': 'XLS',
+              'application/CDFV2': 'XLS',
               'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
               'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS',
               }

diff --git a/messytables/headers.py b/messytables/headers.py
@@ -24,7 +24,7 @@ def headers_guess(rows, tolerance=1):
     """Guess the offset and names of the headers of the row set.
 
     This will attempt to locate the first row within ``tolerance``
-    of the mode of the number of rows in the row set sample.
+    of the mode of the number of columns in the row set sample.
 
     The return value is a tuple of the offset of the header row
     and the names of the columns.

diff --git a/messytables/html.py b/messytables/html.py
@@ -130,7 +130,7 @@ def identify_anatomy(tag):
 
 
 class FakeHTMLCell(Cell):
-    """FakeHTMLCells are not present because of column or row spannning."""
+    """FakeHTMLCells are not present because of column or row spanning."""
 
     def __init__(self):
         super(FakeHTMLCell, self).__init__("")
@@ -145,7 +145,7 @@ class HTMLCell(Cell):
     """ The Cell __init__ signature is:
     def __init__(self, value=None, column=None, type=None):
     where 'value' is the primary input, 'column' is a column name, and
-    type is messytables.types.StringType() or better."""
+    type is messytables.types.String() or better."""
 
     def __init__(self, value=None, column=None, type=None, source=None):
         assert value is None

diff --git a/messytables/ods.py b/messytables/ods.py
@@ -4,18 +4,44 @@
 
 from lxml import etree
 from typecast import String, Decimal, Date
+# TODO: should CurrencyType, BoolType, PercentageType and TimeType be added to typecast?
 
 from messytables.core import RowSet, TableSet, Cell
 
 
-ODS_NAMESPACES_TAG_MATCH = re.compile(b"(<office:document-content[^>]*>)", re.MULTILINE)
-ODS_TABLE_MATCH = re.compile(b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
+ODS_NAMESPACES_TAG_MATCH = re.compile(
+    b"(<office:document-content[^>]*>)", re.MULTILINE)
+ODS_TABLE_MATCH = re.compile(
+    b".*?(<table:table.*?<\/.*?:table>).*?", re.MULTILINE)
 ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?')
-ODS_ROW_MATCH = re.compile(b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
+ODS_ROW_MATCH = re.compile(
+    b".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.MULTILINE)
+
+NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
+NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
+NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
+NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0"
+
+TABLE_CELL = 'table-cell'
+VALUE_TYPE = 'value-type'
+COLUMN_REPEAT = 'number-columns-repeated'
+EMPTY_CELL_VALUE = ''
+
+ODS_VALUE_TOKEN = {
+    "float": "value",
+    "date": "date-value",
+    "time": "time-value",
+    "boolean": "boolean-value",
+    "percentage": "value",
+    "currency": "value"
+}
 
 ODS_TYPES = {
     'float': Decimal(),
     'date': Date(),
+    'boolean': BoolType(),
+    'percentage': PercentageType(),
+    'time': TimeType()
 }
 
 
@@ -102,13 +128,13 @@ def __init__(self, sheet, window=None, namespace_tags=None):
         else:
             namespaces = {
                 "dc": u"http://purl.org/dc/elements/1.1/",
-                "draw": u"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
-                "number": u"urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0",
-                "office": u"urn:oasis:names:tc:opendocument:xmlns:office:1.0",
-                "svg": u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0",
-                "table": u"urn:oasis:names:tc:opendocument:xmlns:table:1.0",
-                "text": u"urn:oasis:names:tc:opendocument:xmlns:text:1.0",
-                "calcext": u"urn:org:documentfoundation:names:experimental:calc:xmlns:calcext:1.0",
+                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
+                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
+                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
+                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
+                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
+                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
+                "calcext": NS_CAL_PTTN % u"calcext:1.0",
             }
 
             ods_header = u"<wrapper {0}>"\
@@ -128,20 +154,65 @@ def raw(self, sample=False):
 
             block = self.namespace_tags[0] + row + self.namespace_tags[1]
             partial = io.BytesIO(block)
+            empty_row = True
+
+            for action, element in etree.iterparse(partial, ('end',)):
+                if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL):
+                    continue
 
-            for action, elem in etree.iterparse(partial, ('end',)):
-                if elem.tag == '{urn:oasis:names:tc:opendocument:xmlns:table:1.0}table-cell':
-                    cell_type = elem.attrib.get('urn:oasis:names:tc:opendocument:xmlns:office:1.0:value-type')
-                    children = elem.getchildren()
-                    if children:
-                        c = Cell(children[0].text,
-                                 type=ODS_TYPES.get(cell_type, String()))
-                        row_data.append(c)
+                cell = _read_cell(element)
+                if empty_row is True and cell.value != EMPTY_CELL_VALUE:
+                    empty_row = False
 
-            if not row_data:
+                repeat = element.attrib.get(
+                    _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT))
+                if repeat:
+                    number_of_repeat = int(repeat)
+                    row_data += [cell] * number_of_repeat
+                else:
+                    row_data.append(cell)
+
+            if empty_row:
                 # ignore blank lines
                 continue
 
             del partial
             yield row_data
         del rows
+
+
+def _read_cell(element):
+    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
+    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
+    if cell_type == 'string':
+        cell = _read_text_cell(element)
+    elif cell_type == 'currency':
+        value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
+        currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
+        cell = Cell(value + ' ' + currency, type=CurrencyType())
+    elif cell_type is not None:
+        value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))
+        cell = Cell(value, type=ODS_TYPES.get(cell_type, String()))
+    else:
+        cell = Cell(EMPTY_CELL_VALUE, type=String())
+
+    return cell
+
+
+def _read_text_cell(element):
+    children = element.getchildren()
+    text_content = []
+    for child in children:
+        if child.text:
+            text_content.append(child.text)
+        else:
+            text_content.append(EMPTY_CELL_VALUE)
+    if len(text_content) > 0:
+        cell_value = '\n'.join(text_content)
+    else:
+        cell_value = EMPTY_CELL_VALUE
+    return Cell(cell_value, type=String())
+
+
+def _tag(namespace, tag):
+    return '{%s}%s' % (namespace, tag)
diff --git a/messytables/types.py b/messytables/types.py
@@ -21,22 +21,3 @@ def type_guess(rows, types=GUESS_TYPES, strict=False):
             # add string guess so that we have at least one guess
             guessers[j].add(cell.value)
     return [g.best for g in guessers]
-
-
-def types_processor(types, strict=False):
-    """Apply the column types to the each row.
-
-    Strict means that casting errors are not ignored.
-    """
-    def apply_types(row_set, row):
-        if types is None:
-            return row
-        for cell, type in six.moves.zip_longest(row, types):
-            try:
-                cell.value = type.cast(cell.value)
-                cell.type = type
-            except:
-                if strict and type:
-                    raise
-        return row
-    return apply_types
diff --git a/test/test_any.py b/test/test_any.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import unittest
 
-from .util import horror_fobj
+from util import horror_fobj
 from nose.tools import assert_equal
 from nose.plugins.skip import SkipTest
 from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,

diff --git a/test/test_guessing.py b/test/test_guessing.py
@@ -4,7 +4,7 @@
 # import cProfile
 # from pstats import Stats
 
-from .util import horror_fobj
+from util import horror_fobj
 from nose.plugins.attrib import attr
 from nose.plugins.skip import SkipTest
 from nose.tools import assert_equal

diff --git a/test/test_properties.py b/test/test_properties.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import unittest
-from .util import horror_fobj
+from util import horror_fobj
 from messytables.any import any_tableset
 from messytables.error import NoSuchPropertyError
 from nose.tools import (