From 1390f094f64f343cee900a7cad1dd0a0fb425313 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Thu, 6 Aug 2015 15:17:26 +0200 Subject: [PATCH 01/35] Use `typecast` for type conversion. --- Makefile | 5 +- messytables/dateparser.py | 66 --------- messytables/types.py | 282 ++++++-------------------------------- setup.py | 9 +- test/test_guessing.py | 4 +- test/test_read.py | 1 + 6 files changed, 58 insertions(+), 309 deletions(-) delete mode 100644 messytables/dateparser.py diff --git a/Makefile b/Makefile index c5cf657..8214231 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ run: build build: @docker build -t messytables . -.PHONY: run build +test: + nosetests --with-coverage --cover-package=messytables --cover-erase + +.PHONY: run build test diff --git a/messytables/dateparser.py b/messytables/dateparser.py deleted file mode 100644 index 05d7c93..0000000 --- a/messytables/dateparser.py +++ /dev/null @@ -1,66 +0,0 @@ -import re - -date_regex = re.compile(r'''^\d{1,4}[-\/\.\s]\S+[-\/\.\s]\S+''') - - -def is_date(value): - return len(value) != 1 and date_regex.match(value) - - -def create_date_formats(day_first=True): - """generate combinations of time and date - formats with different delimeters - """ - - if day_first: - date_formats = ['dd/mm/yyyy', 'dd/mm/yy', 'yyyy/mm/dd'] - python_date_formats = ['%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d'] - else: - date_formats = ['mm/dd/yyyy', 'mm/dd/yy', 'yyyy/mm/dd'] - python_date_formats = ['%m/%d/%Y', '%m/%d/%y', '%Y/%m/%d'] - - date_formats += [ - # Things with words in - 'dd/bb/yyyy', 'dd/bbb/yyyy' - ] - python_date_formats += [ - # Things with words in - '%d/%b/%Y', '%d/%B/%Y' - ] - - both_date_formats = list(zip(date_formats, python_date_formats)) - - #time_formats = "hh:mmz hh:mm:ssz hh:mmtzd hh:mm:sstzd".split() - time_formats = "hh:mm:ssz hh:mm:ss hh:mm:sstzd".split() - python_time_formats = "%H:%M%Z %H:%M:%S %H:%M:%S%Z %H:%M%z %H:%M:%S%z".split() - both_time_formats = list(zip(time_formats, python_time_formats)) - - #date_separators = ["-","."," ","","/","\\"] - date_separators = ["-", ".", "/", " "] - - all_date_formats = [] - - for separator in date_separators: - for date_format, python_date_format in both_date_formats: - all_date_formats.append( - (date_format.replace("/", separator), - python_date_format.replace("/", separator)) - ) - - all_formats = {} - - for date_format, python_date_format in all_date_formats: - all_formats[date_format] = python_date_format - for time_format, python_time_format in both_time_formats: - - all_formats[date_format + time_format] = \ - python_date_format + python_time_format - - all_formats[date_format + "T" + time_format] =\ - python_date_format + "T" + python_time_format - - all_formats[date_format + " " + time_format] =\ - python_date_format + " " + python_time_format - return list(all_formats.values()) - -DATE_FORMATS = create_date_formats() diff --git a/messytables/types.py b/messytables/types.py index ba017f3..29a4356 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,201 +1,30 @@ -import decimal -import datetime from collections import defaultdict -from messytables.compat23 import izip_longest, unicode_string, string_types -import locale -import sys - -import dateutil.parser as parser - -from messytables.dateparser import DATE_FORMATS, is_date - - -class CellType(object): - """ A cell type maintains information about the format - of the cell, providing methods to check if a type is - applicable to a given value and to convert a value to the - type. 
""" - - guessing_weight = 1 - # the type that the result will have - result_type = None - - def test(self, value): - """ Test if the value is of the given type. The - default implementation calls ``cast`` and checks if - that throws an exception. True or False""" - if isinstance(value, self.result_type): - return True - try: - self.cast(value) - return True - except: - return False - - @classmethod - def instances(cls): - return [cls()] - - def cast(self, value): - """ Convert the value to the type. This may throw - a quasi-random exception if conversion fails. """ - return value - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __hash__(self): - return hash(self.__class__) - - def __repr__(self): - return self.__class__.__name__.rsplit('Type', 1)[0] - - -class StringType(CellType): - """ A string or other unconverted type. """ - result_type = unicode_string - - def cast(self, value): - if value is None: - return None - if isinstance(value, self.result_type): - return value - try: - return unicode_string(value) - except UnicodeEncodeError: - return str(value) - - -class IntegerType(CellType): - """ An integer field. """ - guessing_weight = 6 - result_type = int - - def cast(self, value): - if value in ('', None): - return None - - try: - value = float(value) - except: - return locale.atoi(value) - - if value.is_integer(): - return int(value) - else: - raise ValueError('Invalid integer: %s' % value) - - -class DecimalType(CellType): - """ Decimal number, ``decimal.Decimal`` or float numbers. """ - guessing_weight = 4 - result_type = decimal.Decimal - - def cast(self, value): - if value in ('', None): - return None - try: - return decimal.Decimal(value) - except: - value = locale.atof(value) - if sys.version_info < (2, 7): - value = str(value) - return decimal.Decimal(value) - - -class FloatType(DecimalType): - """ FloatType is deprecated """ - pass - - -class BoolType(CellType): - """ A boolean field. Matches true/false, yes/no and 0/1 by default, - but a custom set of values can be optionally provided. - """ - guessing_weight = 7 - result_type = bool - true_values = ('yes', 'true', '0') - false_values = ('no', 'false', '1') - - def __init__(self, true_values=None, false_values=None): - if true_values is not None: - self.true_values = true_values - if false_values is not None: - self.false_values = false_values - - def cast(self, value): - s = value.strip().lower() - if value in ('', None): - return None - if s in self.true_values: - return True - if s in self.false_values: - return False - raise ValueError - - -class DateType(CellType): - """ The date type is special in that it also includes a specific - date format that is used to parse the date, additionally to the - basic type information. 
""" - guessing_weight = 3 - formats = DATE_FORMATS - result_type = datetime.datetime - - def __init__(self, format): - self.format = format - - @classmethod - def instances(cls): - return [cls(v) for v in cls.formats] - - def test(self, value): - if isinstance(value, string_types) and not is_date(value): - return False - return CellType.test(self, value) - - def cast(self, value): - if isinstance(value, self.result_type): - return value - if value in ('', None): - return None - if self.format is None: - return value - return datetime.datetime.strptime(value, self.format) - - def __eq__(self, other): - return (isinstance(other, DateType) and - self.format == other.format) - - def __repr__(self): - return "Date(%s)" % self.format - - def __hash__(self): - return hash(self.__class__) + hash(self.format) - - -class DateUtilType(CellType): - """ The date util type uses the dateutil library to - parse the dates. The advantage of this type over - DateType is the speed and better date detection. However, - it does not offer format detection. - - Do not use this together with the DateType""" - guessing_weight = 3 - result_type = datetime.datetime - - def test(self, value): - if len(value) == 1: - return False - return CellType.test(self, value) - - def cast(self, value): - if value in ('', None): - return None - return parser.parse(value) - - -TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType] +from messytables.compat23 import izip_longest + +import typecast + +# For legacy support: +StringType = typecast.String +IntegerType = typecast.Integer +DecimalType = typecast.Decimal +FloatType = typecast.Decimal +BoolType = typecast.Boolean +DateType = typecast.Date +DateTimeType = typecast.DateTime +DateUtilType = typecast.Date + + +WEIGHTS = { + typecast.String: 1, + typecast.Integer: 6, + typecast.Decimal: 4, + typecast.Boolean: 7, + typecast.Date: 3, + typecast.DateTime: 3 +} +TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, + DateTimeType] +FAILED = 'failed' def type_guess(rows, types=TYPES, strict=False): @@ -209,54 +38,31 @@ def type_guess(rows, types=TYPES, strict=False): if parsing fails for a single cell in the column.""" guesses = [] type_instances = [i for t in types for i in t.instances()] - if strict: - at_least_one_value = [] - for ri, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - typesdict = {} - for type in type_instances: - typesdict[type] = 0 - guesses.append(typesdict) - at_least_one_value.append(False) - for ci, cell in enumerate(row): - if not cell.value: + for i, row in enumerate(rows): + diff = len(row) - len(guesses) + for _ in range(diff): + guesses.append(defaultdict(int)) + for i, cell in enumerate(row): + # add string guess so that we have at least one guess + guesses[i][StringType()] = guesses[i].get(StringType(), 0) + for type in type_instances: + if guesses[i][type] == FAILED: continue - at_least_one_value[ci] = True - for type in list(guesses[ci].keys()): - if not type.test(cell.value): - guesses[ci].pop(type) - # no need to set guessing weights before this - # because we only accept a type if it never fails - for i, guess in enumerate(guesses): - for type in guess: - guesses[i][type] = type.guessing_weight - # in case there were no values at all in the column, - # we just set the guessed type to string - for i, v in enumerate(at_least_one_value): - if not v: - guesses[i] = {StringType(): 0} - else: - for i, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - 
guesses.append(defaultdict(int)) - for i, cell in enumerate(row): - # add string guess so that we have at least one guess - guesses[i][StringType()] = guesses[i].get(StringType(), 0) - if not cell.value: - continue - for type in type_instances: - if type.test(cell.value): - guesses[i][type] += type.guessing_weight - _columns = [] + result = type.test(cell.value) == 1 + weight = WEIGHTS[type.__class__] + if strict and not result and not isinstance(type, StringType): + guesses[i][type] = FAILED + elif result: + guesses[i][type] += weight + _columns = [] for guess in guesses: # this first creates an array of tuples because we want the types to be # sorted. Even though it is not specified, python chooses the first # element in case of a tie # See: http://stackoverflow.com/a/6783101/214950 - guesses_tuples = [(t, guess[t]) for t in type_instances if t in guess] + guesses_tuples = [(t, guess[t]) for t in type_instances + if t in guess and guess[t] != FAILED] _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns diff --git a/setup.py b/setup.py index 16fdb73..bff0284 100644 --- a/setup.py +++ b/setup.py @@ -44,12 +44,17 @@ 'chardet>=2.3.0', 'python-dateutil>=1.5.0', 'lxml>=3.2', - 'requests', + 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' + 'typecast', ], extras_require={'pdf': ['pdftables>=0.0.4']}, - tests_require=[], + tests_require=[ + 'nose', + 'httpretty', + 'coverage' + ], entry_points=\ """ """, diff --git a/test/test_guessing.py b/test/test_guessing.py index b843c4e..7cd1d53 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -135,11 +135,11 @@ def test_file_with_few_strings_among_integers(self): def test_integer_and_float_detection(self): def helper(value): - return any(i.test(value) for i in IntegerType.instances()) + return any(i.test(value) == 1 for i in IntegerType.instances()) assert_equal(helper(123), True) assert_equal(helper('123'), True) assert_equal(helper(123.0), True) - assert_equal(helper('123.0'), True) + assert_equal(helper('123.0'), False) assert_equal(helper(123.1), False) assert_equal(helper('123.1'), False) diff --git a/test/test_read.py b/test/test_read.py index f4b73d1..a79a358 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -23,6 +23,7 @@ stringy = type(u'') class ReadCsvTest(unittest.TestCase): + def test_utf8bom_lost(self): fh = horror_fobj('utf8bom.csv') table_set = CSVTableSet(fh) From 879dc695440974f76c2374a559775b108cc16340 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:52:16 +0200 Subject: [PATCH 02/35] Fix up type guessing tests. 
--- messytables/__init__.py | 3 ++ messytables/types.py | 14 ++++--- test/test_guessing.py | 91 +++++++++++++++++++---------------------- test/test_read.py | 6 +-- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..30a3690 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -23,3 +23,6 @@ from messytables.any import any_tableset, AnyTableSet from messytables.jts import rowset_as_jts, headers_and_typed_as_jts + +import warnings +warnings.filterwarnings('ignore', "Coercing non-XML name") diff --git a/messytables/types.py b/messytables/types.py index 29a4356..96aeaf0 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -17,10 +17,10 @@ WEIGHTS = { typecast.String: 1, typecast.Integer: 6, - typecast.Decimal: 4, + typecast.Decimal: 3, typecast.Boolean: 7, - typecast.Date: 3, - typecast.DateTime: 3 + typecast.Date: 4, + typecast.DateTime: 5 } TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, DateTimeType] @@ -48,11 +48,12 @@ def type_guess(rows, types=TYPES, strict=False): for type in type_instances: if guesses[i][type] == FAILED: continue - result = type.test(cell.value) == 1 + result = type.test(cell.value) weight = WEIGHTS[type.__class__] - if strict and not result and not isinstance(type, StringType): + if strict and (result == -1) and \ + (not isinstance(type, StringType)): guesses[i][type] = FAILED - elif result: + elif result == 1: guesses[i][type] += weight _columns = [] @@ -63,6 +64,7 @@ def type_guess(rows, types=TYPES, strict=False): # See: http://stackoverflow.com/a/6783101/214950 guesses_tuples = [(t, guess[t]) for t in type_instances if t in guess and guess[t] != FAILED] + # print 'GUESSES', zip(row, guesses_tuples) _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns diff --git a/test/test_guessing.py b/test/test_guessing.py index 7cd1d53..351ab19 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -5,10 +5,9 @@ from . 
import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal +from typecast import Date, String, Decimal, Integer, Boolean from messytables import (CSVTableSet, type_guess, headers_guess, - offset_processor, DateType, StringType, - DecimalType, IntegerType, - DateUtilType, BoolType) + offset_processor) class TypeGuessTest(unittest.TestCase): @@ -25,8 +24,8 @@ def test_type_guess(self): guessed_types = type_guess(rows.sample) assert_equal(guessed_types, [ - DecimalType(), DateType('%Y/%m/%d'), IntegerType(), - DateType('%d %B %Y'), BoolType(), BoolType()]) + Decimal(), Date('%Y/%m/%d'), Integer(), + Date('%d %b %Y'), Boolean(), Integer()]) def test_type_guess_strict(self): import locale @@ -40,9 +39,9 @@ def test_type_guess_strict(self): rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample, strict=True) assert_equal(guessed_types, [ - StringType(), StringType(), - DecimalType(), IntegerType(), DateType('%d %B %Y'), - DecimalType()]) + String(), String(), + Decimal(), Integer(), Date('%d %b %Y'), + Decimal()]) def test_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -53,7 +52,7 @@ def test_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=True) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [StringType(), StringType(), DecimalType()]) + [String(), String(), Decimal()]) def test_non_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -64,21 +63,22 @@ def test_non_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=False) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [IntegerType(), StringType(), DecimalType()]) + [Integer(), String(), Decimal()]) def test_guessing_uses_first_in_case_of_tie(self): csv_file = io.BytesIO(b''' 2 1.1 + 2.1 1500''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess( - rows.sample, types=[DecimalType, IntegerType], strict=False) - assert_equal(guessed_types, [DecimalType()]) + rows.sample, types=[Decimal, Integer], strict=False) + assert_equal(guessed_types, [Decimal()]) guessed_types = type_guess( - rows.sample, types=[IntegerType, DecimalType], strict=False) - assert_equal(guessed_types, [IntegerType()]) + rows.sample, types=[Integer, Decimal], strict=False) + assert_equal(guessed_types, [Integer()]) @attr("slow") def test_strict_type_guessing_with_large_file(self): @@ -86,56 +86,49 @@ def test_strict_type_guessing_with_large_file(self): rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] - guessed_types = type_guess(rows.sample, types, True) + types = [String, Integer, Decimal, Date] + guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) + print guessed_types assert_equal(guessed_types, [ - IntegerType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), IntegerType(), StringType(), DecimalType(), - DecimalType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), 
StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), DateUtilType(), - DateUtilType(), DateUtilType(), DateUtilType(), StringType(), - StringType(), StringType()]) + Integer(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), Integer(), String(), String(), + String(), String(), String(), String(), Integer(), String(), + String(), String(), String(), String(), String(), String(), + String(), Decimal(), Decimal(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), Date('%d/%m/%y'), Date('%d/%m/%y'), + String(), String(), String()]) def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] + types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) print(guessed_types) assert_equal(guessed_types, [ - IntegerType(), IntegerType(), - IntegerType(), IntegerType(), IntegerType(), IntegerType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), IntegerType(), StringType(), - StringType()]) + Integer(), Integer(), + Integer(), Integer(), Integer(), Integer(), + String(), String(), String(), String(), + String(), String(), String(), String(), + String(), String(), Integer(), String(), + String()]) def test_integer_and_float_detection(self): def helper(value): - return any(i.test(value) == 1 for i in IntegerType.instances()) + return any(i.test(value) == 1 for i in Integer.instances()) assert_equal(helper(123), True) assert_equal(helper('123'), True) diff --git a/test/test_read.py b/test/test_read.py index a79a358..88b9214 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -116,7 +116,7 @@ def test_apply_null_values(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), StringType(), BoolType(), + expected_types = [IntegerType(), StringType(), IntegerType(), StringType()] assert_equal(types, expected_types) @@ -146,8 +146,8 @@ def 
test_null_process(self): assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), BoolType(), BoolType(), - BoolType()] + expected_types = [IntegerType(), IntegerType(), IntegerType(), + IntegerType()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) From 2fdaf25a739d1e1969dc48e84247a8f4838881fd Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:52:42 +0200 Subject: [PATCH 03/35] Hide coverage results. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e6fac63..ebba6d9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ */_build/* *.py~ *.~lock.*# +.coverage From d1d097257ed33c0ae304dca18dc94646f765c934 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:55:52 +0200 Subject: [PATCH 04/35] Clean up imports. --- messytables/types.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/messytables/types.py b/messytables/types.py index 96aeaf0..65842c4 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,29 +1,28 @@ from collections import defaultdict from messytables.compat23 import izip_longest -import typecast +from typecast import String, Integer, Decimal, Boolean, Date, DateTime # For legacy support: -StringType = typecast.String -IntegerType = typecast.Integer -DecimalType = typecast.Decimal -FloatType = typecast.Decimal -BoolType = typecast.Boolean -DateType = typecast.Date -DateTimeType = typecast.DateTime -DateUtilType = typecast.Date +StringType = String +IntegerType = Integer +DecimalType = Decimal +FloatType = Decimal +BoolType = Boolean +DateType = Date +DateTimeType = DateTime +DateUtilType = Date WEIGHTS = { - typecast.String: 1, - typecast.Integer: 6, - typecast.Decimal: 3, - typecast.Boolean: 7, - typecast.Date: 4, - typecast.DateTime: 5 + String: 1, + Integer: 6, + Decimal: 3, + Boolean: 7, + Date: 4, + DateTime: 5 } -TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, - DateTimeType] +TYPES = [String, Decimal, Integer, Boolean, Date, DateTime] FAILED = 'failed' From 2f71c24f21eec92ded4aac4623c94c39f0ad04e0 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:09:32 +0200 Subject: [PATCH 05/35] Get rid of old type names. 
--- messytables/__init__.py | 2 -- messytables/core.py | 11 +++++++---- messytables/excel.py | 25 ++++++++++++++----------- messytables/html.py | 12 +++++++----- messytables/jts.py | 21 ++++++++++++--------- messytables/ods.py | 9 ++++----- messytables/pdf.py | 6 +++--- messytables/types.py | 16 ++-------------- test/test_guessing.py | 4 ++-- test/test_read.py | 33 +++++++++++++++++---------------- test/test_unit.py | 14 +------------- 11 files changed, 69 insertions(+), 84 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index 30a3690..baecc4a 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -2,8 +2,6 @@ from messytables.util import offset_processor, null_processor from messytables.headers import headers_guess, headers_processor, headers_make_unique from messytables.types import type_guess, types_processor -from messytables.types import StringType, IntegerType, FloatType, \ - DecimalType, DateType, DateUtilType, BoolType from messytables.error import ReadError from messytables.core import Cell, TableSet, RowSet, seekable_stream diff --git a/messytables/core.py b/messytables/core.py index 28ad7eb..3094c34 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,9 +1,13 @@ -from messytables.util import OrderedDict +import io from collections import Mapping + +from typecast import String + +from messytables.util import OrderedDict from messytables.error import TableError, NoSuchPropertyError -import io from messytables.compat23 import * + def seekable_stream(fileobj): try: fileobj.seek(0) @@ -115,8 +119,7 @@ class Cell(object): def __init__(self, value, column=None, type=None): if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.value = value self.column = column self.column_autogenerated = False diff --git a/messytables/excel.py b/messytables/excel.py index 9d30131..744e70c 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -1,28 +1,29 @@ import sys from datetime import datetime, time + import xlrd from xlrd.biffh import XLRDError +from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties -from messytables.types import (StringType, IntegerType, - DateType, FloatType) from messytables.error import ReadError from messytables.compat23 import PY2 + class InvalidDateError(Exception): pass XLS_TYPES = { - 1: StringType(), + 1: String(), # NB: Excel does not distinguish floats from integers so we use floats # We could try actual type detection between floats and ints later # or use the excel format string info - see # https://groups.google.com/forum/?fromgroups=#!topic/ # python-excel/cAQ1ndsCVxk - 2: FloatType(), - 3: DateType(None), + 2: Float(), + 3: Date(), # this is actually boolean but we do not have a boolean type yet - 4: IntegerType() + 4: Integer() } @@ -45,7 +46,7 @@ def get_workbook(): file_contents=read_obj, encoding_override=encoding, formatting_info=with_formatting_info) - except XLRDError as e: + except XLRDError: _, value, traceback = sys.exc_info() if PY2: raise ReadError("Can't read Excel file: %r" % value, traceback) @@ -76,7 +77,7 @@ def get_workbook(): try: self.workbook = get_workbook() - except NotImplementedError as e: + except NotImplementedError: if not with_formatting_info: raise else: @@ -115,12 +116,13 @@ def raw(self, sample=False): self.sheet.name, colnum+1, rownum+1)) yield row + class XLSCell(Cell): @staticmethod def from_xlrdcell(xlrd_cell, sheet, col, row): value = xlrd_cell.value - 
cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType()) - if cell_type == DateType(None): + cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) + if cell_type == Date(): if value == 0: raise InvalidDateError year, month, day, hour, minute, second = \ @@ -143,10 +145,12 @@ def topleft(self): def properties(self): return XLSProperties(self) + class XLSProperties(CoreProperties): KEYS = ['bold', 'size', 'italic', 'font_name', 'strikeout', 'underline', 'font_colour', 'background_colour', 'any_border', 'all_border', 'richtext', 'blank', 'a_date', 'formatting_string'] + def __init__(self, cell): self.cell = cell self.merged = {} @@ -243,4 +247,3 @@ def get_all_border(self): b = self.xf.border return b.top_line_style > 0 and b.bottom_line_style > 0 and \ b.left_line_style > 0 and b.right_line_style > 0 - diff --git a/messytables/html.py b/messytables/html.py index 2214363..4f02f26 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -1,9 +1,12 @@ -from messytables.core import RowSet, TableSet, Cell, CoreProperties -import lxml.html from collections import defaultdict -import html5lib import xml.etree.ElementTree as etree +import html5lib +import lxml.html +from typecast import String + +from messytables.core import RowSet, TableSet, Cell, CoreProperties + def fromstring(s): tb = html5lib.getTreeBuilder("lxml", implementation=etree) @@ -159,8 +162,7 @@ def __init__(self, value=None, column=None, type=None, source=None): assert isinstance(source, lxml.etree._Element) self._lxml = source if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.type = type self.column = column self.column_autogenerated = False diff --git a/messytables/jts.py b/messytables/jts.py index 031528f..056d9c3 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -2,18 +2,20 @@ Convert a rowset to the json table schema (http://www.dataprotocols.org/en/latest/json-table-schema.html) ''' +import jsontableschema +from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean import messytables -import jsontableschema + MESSYTABLES_TO_JTS_MAPPING = { - messytables.StringType: 'string', - messytables.IntegerType: 'integer', - messytables.FloatType: 'number', - messytables.DecimalType: 'number', - messytables.DateType: 'date', - messytables.DateUtilType: 'date', - messytables.BoolType: 'boolean' + String: 'string', + Integer: 'integer', + Float: 'number', + Decimal: 'number', + Date: 'date', + DateTime: 'datetime', + Boolean: 'boolean' } @@ -25,7 +27,8 @@ def rowset_as_jts(rowset, headers=None, types=None): ''' Create a json table schema from a rowset ''' _, headers = messytables.headers_guess(rowset.sample) - types = list(map(celltype_as_string, messytables.type_guess(rowset.sample))) + types = list(map(celltype_as_string, + messytables.type_guess(rowset.sample))) return headers_and_typed_as_jts(headers, types) diff --git a/messytables/ods.py b/messytables/ods.py index 7b03d74..4351c85 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -3,10 +3,9 @@ import zipfile from lxml import etree +from typecast import String, Decimal, Date from messytables.core import RowSet, TableSet, Cell -from messytables.types import (StringType, DecimalType, - DateType) ODS_NAMESPACES_TAG_MATCH = re.compile(b"(]*>)", re.MULTILINE) @@ -15,8 +14,8 @@ ODS_ROW_MATCH = re.compile(b".*?().*?", re.MULTILINE) ODS_TYPES = { - 'float': DecimalType(), - 'date': DateType(None), + 'float': Decimal(), + 'date': Date(), } @@ -135,7 +134,7 @@ def raw(self, sample=False): children 
= elem.getchildren() if children: c = Cell(children[0].text, - type=ODS_TYPES.get(cell_type, StringType())) + type=ODS_TYPES.get(cell_type, String())) row_data.append(c) if not row_data: diff --git a/messytables/pdf.py b/messytables/pdf.py index 4f9052e..11aa907 100644 --- a/messytables/pdf.py +++ b/messytables/pdf.py @@ -1,6 +1,6 @@ -from messytables.core import RowSet, TableSet, Cell +from typecast import String -from messytables.types import StringType +from messytables.core import RowSet, TableSet, Cell try: from pdftables import get_tables @@ -30,7 +30,7 @@ def __init__(self, pdftables_cell): self.column = None self.column_autogenerated = False - self.type = StringType() + self.type = String() @property def topleft(self): diff --git a/messytables/types.py b/messytables/types.py index 65842c4..5709332 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -3,17 +3,6 @@ from typecast import String, Integer, Decimal, Boolean, Date, DateTime -# For legacy support: -StringType = String -IntegerType = Integer -DecimalType = Decimal -FloatType = Decimal -BoolType = Boolean -DateType = Date -DateTimeType = DateTime -DateUtilType = Date - - WEIGHTS = { String: 1, Integer: 6, @@ -43,14 +32,13 @@ def type_guess(rows, types=TYPES, strict=False): guesses.append(defaultdict(int)) for i, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[i][StringType()] = guesses[i].get(StringType(), 0) + guesses[i][String()] = guesses[i].get(String(), 0) for type in type_instances: if guesses[i][type] == FAILED: continue result = type.test(cell.value) weight = WEIGHTS[type.__class__] - if strict and (result == -1) and \ - (not isinstance(type, StringType)): + if strict and (result == -1) and not isinstance(type, String): guesses[i][type] = FAILED elif result == 1: guesses[i][type] += weight diff --git a/test/test_guessing.py b/test/test_guessing.py index 351ab19..48e9e27 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -6,8 +6,8 @@ from nose.plugins.attrib import attr from nose.tools import assert_equal from typecast import Date, String, Decimal, Integer, Boolean -from messytables import (CSVTableSet, type_guess, headers_guess, - offset_processor) +from messytables import CSVTableSet, type_guess, headers_guess +from messytables import offset_processor class TypeGuessTest(unittest.TestCase): diff --git a/test/test_read.py b/test/test_read.py index 88b9214..38014d8 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -12,16 +12,17 @@ except ImportError: from .shim26 import assert_is_instance, assert_greater_equal -from messytables import (CSVTableSet, StringType, HTMLTableSet, +from typecast import Date, Float, Integer, String +from messytables import (CSVTableSet, HTMLTableSet, ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, ODSTableSet, headers_guess, headers_processor, - offset_processor, DateType, FloatType, - IntegerType, BoolType, rowset_as_jts, + offset_processor, rowset_as_jts, types_processor, type_guess, ReadError, null_processor) import datetime stringy = type(u'') + class ReadCsvTest(unittest.TestCase): def test_utf8bom_lost(self): @@ -42,7 +43,7 @@ def test_read_simple_csv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) def test_read_complex_csv(self): fh = horror_fobj('complex.csv') @@ -57,7 +58,7 @@ def test_read_complex_csv(self): for row in list(row_set): assert_equal(4, len(row)) - assert_equal(row[0].type, StringType()) + 
assert_equal(row[0].type, String()) def test_overriding_sniffed(self): # semicolon separated values @@ -101,13 +102,13 @@ def test_read_type_guess_simple(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample) - expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()] + expected_types = [Date("%Y-%m-%d"), Integer(), String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) data = list(row_set) header_types = [c.type for c in data[0]] - assert_equal(header_types, [StringType()] * 3) + assert_equal(header_types, [String()] * 3) row_types = [c.type for c in data[2]] assert_equal(expected_types, row_types) @@ -116,8 +117,8 @@ def test_apply_null_values(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), StringType(), IntegerType(), - StringType()] + expected_types = [Integer(), String(), Integer(), + String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -146,8 +147,8 @@ def test_null_process(self): assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), IntegerType(), IntegerType(), - IntegerType()] + expected_types = [Integer(), Integer(), Integer(), + Integer()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -237,7 +238,7 @@ def test_read_simple_zip(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadTsvTest(unittest.TestCase): @@ -251,7 +252,7 @@ def test_read_simple_tsv(self): assert_equal(row[1].value, 'expr1_0_imp') for row in list(row_set): assert_equal(17, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadSsvTest(unittest.TestCase): @@ -267,7 +268,7 @@ def test_read_simple_ssv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadPsvTest(unittest.TestCase): @@ -283,7 +284,7 @@ def test_read_simple_psv(self): for row in list(row_set): assert_equal(6, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadODSTest(unittest.TestCase): @@ -471,7 +472,7 @@ def test_read_type_know_simple(self): row_set = table_set.tables[0] row = list(row_set.sample)[1] types = [c.type for c in row] - assert_equal(types, [DateType(None), FloatType(), StringType()]) + assert_equal(types, [Date(None), Float(), String()]) def test_bad_first_sheet(self): # First sheet appears to have no cells diff --git a/test/test_unit.py b/test/test_unit.py index 27c63aa..696604d 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -1,19 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from messytables import dateparser, Cell - - -class DateParserTest(unittest.TestCase): - def test_date_regex(self): - assert dateparser.is_date('2012 12 22') - assert dateparser.is_date('2012/12/22') - assert dateparser.is_date('2012-12-22') - assert dateparser.is_date('22.12.2012') - assert dateparser.is_date('12 12 22') - assert dateparser.is_date('22 Dec 2012') - assert dateparser.is_date('2012 12 22 13:17') - assert dateparser.is_date('2012 12 22 T 13:17') +from messytables import Cell class CellReprTest(unittest.TestCase): From f06a3c1594890ce7bfd77df143179d19bee2c581 Mon Sep 17 00:00:00 2001 From: 
Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:11:46 +0200 Subject: [PATCH 06/35] Clean out old aliases for XLSXTableSet --- messytables/__init__.py | 7 ------- test/test_read.py | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index baecc4a..4abdaa9 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -8,13 +8,6 @@ from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet - -# XLSXTableSet has been deprecated and its functionality is now provided by -# XLSTableSet. This is to retain backwards compatibility with anyone -# constructing XLSXTableSet directly (rather than using any_tableset) -XLSXTableSet = XLSTableSet -XLSXRowSet = XLSRowSet - from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet from messytables.pdf import PDFTableSet, PDFRowSet diff --git a/test/test_read.py b/test/test_read.py index 38014d8..2901c67 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -14,7 +14,7 @@ from typecast import Date, Float, Integer, String from messytables import (CSVTableSet, HTMLTableSet, - ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, + ZIPTableSet, XLSTableSet, PDFTableSet, ODSTableSet, headers_guess, headers_processor, offset_processor, rowset_as_jts, types_processor, type_guess, ReadError, @@ -349,7 +349,7 @@ def test_that_xlsx_is_handled_by_xls_table_set(self): Should emit a DeprecationWarning. """ fh = horror_fobj('simple.xlsx') - assert_is_instance(XLSXTableSet(fh), XLSTableSet) + assert_is_instance(XLSTableSet(fh), XLSTableSet) class ReadXlsTest(unittest.TestCase): From 92fb2159551e2af5b0c32b59d630f7ab53556c83 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:28:02 +0200 Subject: [PATCH 07/35] Further pieces of clean up. --- .travis.yml | 1 - Dockerfile | 30 ------------------------------ Makefile | 9 --------- messytables/__init__.py | 5 +++-- messytables/any.py | 7 ------- messytables/commas.py | 4 +--- messytables/error.py | 10 ++++++---- messytables/jts.py | 1 - setup.py | 8 +++----- 9 files changed, 13 insertions(+), 62 deletions(-) delete mode 100644 Dockerfile diff --git a/.travis.yml b/.travis.yml index bd19ad7..25aaf2b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.6" - "2.7" - "3.4" install: diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b682622..0000000 --- a/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM ubuntu:14.04 - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && \ - apt-get install -y \ - python-pip \ - python-dev - -RUN apt-get install -y python-numpy python-lxml -RUN apt-get install -y python3 python3-pip python3-lxml python3-nose -# chardet version is out of date; old version doesn't detect UTF8 w/ BOM -RUN pip3 install --upgrade chardet -RUN apt-get install -y python-nose -RUN locale-gen en_GB.UTF-8 - -RUN mkdir /home/messytables && \ - chown nobody /home/messytables -USER nobody -ENV HOME=/home/messytables \ - PATH=/home/messytables/.local/bin:$PATH \ - LANG=en_GB.UTF-8 -# LANG needed for httpretty install on Py3 -WORKDIR /home/messytables - -COPY ./requirements-test.txt /home/messytables/ -RUN pip install --user -r /home/messytables/requirements-test.txt -RUN pip3 install --user -r /home/messytables/requirements-test.txt -RUN pip install --user pdftables -COPY . 
/home/messytables/ diff --git a/Makefile b/Makefile index 8214231..d22fbb6 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,3 @@ -run: build - @docker run \ - --rm \ - -ti \ - messytables - -build: - @docker build -t messytables . - test: nosetests --with-coverage --cover-package=messytables --cover-erase diff --git a/messytables/__init__.py b/messytables/__init__.py index 4abdaa9..53e1dc6 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -1,6 +1,7 @@ from messytables.util import offset_processor, null_processor -from messytables.headers import headers_guess, headers_processor, headers_make_unique +from messytables.headers import headers_guess, headers_processor +from messytables.headers import headers_make_unique from messytables.types import type_guess, types_processor from messytables.error import ReadError @@ -11,7 +12,7 @@ from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet from messytables.pdf import PDFTableSet, PDFRowSet -from messytables.any import any_tableset, AnyTableSet +from messytables.any import any_tableset from messytables.jts import rowset_as_jts, headers_and_typed_as_jts diff --git a/messytables/any.py b/messytables/any.py index c497391..cdd24b7 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -163,10 +163,3 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw): raise messytables.ReadError('any: \n'.join(error)) else: raise messytables.ReadError("any: Did not attempt any detection.") - - -class AnyTableSet: - '''Deprecated - use any_tableset instead.''' - @staticmethod - def from_fileobj(fileobj, mimetype=None, extension=None): - return any_tableset(fileobj, mimetype=mimetype, extension=extension) diff --git a/messytables/commas.py b/messytables/commas.py index 65dd999..7263a75 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -8,9 +8,7 @@ class UTF8Recoder: - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ + """ Iterator that reads an encoded stream and re-encodes it to UTF-8. """ # maps between chardet encoding and codecs bom keys BOM_MAPPING = { diff --git a/messytables/error.py b/messytables/error.py index a65429c..3df3f63 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -1,16 +1,18 @@ + class MessytablesError(Exception): - """A generic error to inherit from""" + """ A generic error to inherit from. """ class ReadError(MessytablesError): - '''Error reading the file/stream in terms of the expected format.''' + """ Error reading the file/stream in terms of the expected format. """ pass class TableError(MessytablesError, LookupError): - """Couldn't identify correct table.""" + """ Couldn't identify correct table. """ pass + class NoSuchPropertyError(MessytablesError, KeyError): - """The requested property doesn't exist""" + """ The requested property doesn't exist. 
""" pass diff --git a/messytables/jts.py b/messytables/jts.py index 056d9c3..0254259 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -44,5 +44,4 @@ def headers_and_typed_as_jts(headers, types): j.add_field(field_id=field_id, label=field_id, field_type=field_type) - return j diff --git a/setup.py b/setup.py index bff0284..08e7d52 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='messytables', - version='0.15.1', + version='1.99.0', description="Parse messy tabular data in various formats", long_description=long_desc, classifiers=[ @@ -42,12 +42,12 @@ 'xlrd>=0.8.0', 'python-magic>=0.4.12', # used for type guessing 'chardet>=2.3.0', - 'python-dateutil>=1.5.0', 'lxml>=3.2', 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' 'typecast', + 'json-table-schema>=0.2' ], extras_require={'pdf': ['pdftables>=0.0.4']}, tests_require=[ @@ -55,7 +55,5 @@ 'httpretty', 'coverage' ], - entry_points=\ - """ - """, + entry_points={} ) From 1108885d6983d14cbe147f00d52bff53aace6b7d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:05:58 +0200 Subject: [PATCH 08/35] Start getting rid of the compatibility layer --- messytables/any.py | 4 +++- messytables/commas.py | 26 ++++++++++++++------------ messytables/compat23.py | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/messytables/any.py b/messytables/any.py index cdd24b7..8aa0b2f 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,7 +1,8 @@ +import re + from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, HTMLTableSet, ODSTableSet) import messytables -import re MIMELOOKUP = {'application/x-zip-compressed': 'ZIP', @@ -29,6 +30,7 @@ 'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS', } + def TABTableSet(fileobj): return CSVTableSet(fileobj, delimiter='\t') diff --git a/messytables/commas.py b/messytables/commas.py index 7263a75..29fa243 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -2,9 +2,11 @@ import codecs import chardet +from six import text_type, binary_type, PY2 + +from messytables.core import seekable_stream from messytables.core import RowSet, TableSet, Cell -import messytables -from messytables.compat23 import unicode_string, byte_string, native_string, PY2 +from messytables.error import ReadError class UTF8Recoder: @@ -66,8 +68,8 @@ def __next__(self): def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, byte_string): - obj = unicode_string(obj, encoding) + if isinstance(obj, binary_type): + obj = text_type(obj, encoding) return obj @@ -78,7 +80,7 @@ class CSVTableSet(TableSet): def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, encoding=None, window=None, doublequote=None, lineterminator=None, skipinitialspace=None, **kw): - self.fileobj = messytables.seekable_stream(fileobj) + self.fileobj = seekable_stream(fileobj) self.name = name or 'table' self.delimiter = delimiter self.quotechar = quotechar @@ -110,7 +112,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=None, lineterminator=None, skipinitialspace=None): self.name = name - seekable_fileobj = messytables.seekable_stream(fileobj) + seekable_fileobj = seekable_stream(fileobj) self.fileobj = UTF8Recoder(seekable_fileobj, encoding) def fake_ilines(fobj): @@ -137,9 +139,9 @@ def _dialect(self): sample = delim.join(self._sample) try: dialect = csv.Sniffer().sniff(sample, - delimiters=['\t', ',', ';', '|']) # NATIVE - dialect.delimiter = 
native_string(dialect.delimiter) - dialect.quotechar = native_string(dialect.quotechar) + delimiters=['\t', ',', ';', '|']) + dialect.delimiter = str(dialect.delimiter) + dialect.quotechar = str(dialect.quotechar) dialect.lineterminator = delim dialect.doublequote = True return dialect @@ -184,9 +186,9 @@ def rows(): dialect=self._dialect, **self._overrides): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: - if u'newline inside string' in unicode_string(err) and sample: + if u'newline inside string' in text_type(err) and sample: pass - elif u'line contains NULL byte' in unicode_string(err): + elif u'line contains NULL byte' in text_type(err): pass else: - raise messytables.ReadError('Error reading CSV: %r', err) + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/compat23.py b/messytables/compat23.py index 7970666..993d946 100644 --- a/messytables/compat23.py +++ b/messytables/compat23.py @@ -14,6 +14,6 @@ unicode_string = str native_string = str byte_string = bytes - + string_types = (str,) urlopen = urllib.request.urlopen From ed8cda1d68eb3914603d773b285a7b8b13451858 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:28:31 +0200 Subject: [PATCH 09/35] Remove remaining awkward compatibility work-arounds. --- messytables/any.py | 13 +++---- messytables/compat23.py | 19 ----------- messytables/core.py | 11 ++++-- messytables/excel.py | 2 +- messytables/headers.py | 6 ++-- messytables/jts.py | 7 ++-- messytables/types.py | 5 +-- messytables/util.py | 75 ----------------------------------------- messytables/zip.py | 17 +++++----- setup.py | 3 +- test/test_stream.py | 12 ++++--- 11 files changed, 44 insertions(+), 126 deletions(-) delete mode 100644 messytables/compat23.py diff --git a/messytables/any.py b/messytables/any.py index 8aa0b2f..13cac56 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,8 +1,9 @@ import re -from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, - HTMLTableSet, ODSTableSet) -import messytables +from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet +from messytables import HTMLTableSet, ODSTableSet +from messytables.core import seekable_stream +from messytables.error import ReadError MIMELOOKUP = {'application/x-zip-compressed': 'ZIP', @@ -64,7 +65,7 @@ def get_mime(fileobj): import magic # Since we need to peek the start of the stream, make sure we can # seek back later. If not, slurp in the contents into a StringIO. - fileobj = messytables.seekable_stream(fileobj) + fileobj = seekable_stream(fileobj) header = fileobj.read(4096) mimetype = magic.from_buffer(header, mime=True) fileobj.seek(0) @@ -162,6 +163,6 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw): mimetype=magic_mime)) if error: - raise messytables.ReadError('any: \n'.join(error)) + raise ReadError('any: \n'.join(error)) else: - raise messytables.ReadError("any: Did not attempt any detection.") + raise ReadError("any: Did not attempt any detection.") diff --git a/messytables/compat23.py b/messytables/compat23.py deleted file mode 100644 index 993d946..0000000 --- a/messytables/compat23.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -PY2 = sys.version_info[0] == 2 -if PY2: - import urllib2 - from itertools import izip_longest - unicode_string = unicode - native_string = str - byte_string = str - string_types = (str, unicode) - urlopen = urllib2.urlopen -else: # i.e. 
PY3 - import urllib.request - from itertools import zip_longest as izip_longest - unicode_string = str - native_string = str - byte_string = bytes - - string_types = (str,) - urlopen = urllib.request.urlopen diff --git a/messytables/core.py b/messytables/core.py index 3094c34..2042262 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,11 +1,16 @@ import io from collections import Mapping +try: + # python 2.7: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict # noqa + +from six import text_type, string_types from typecast import String -from messytables.util import OrderedDict from messytables.error import TableError, NoSuchPropertyError -from messytables.compat23 import * def seekable_stream(fileobj): @@ -138,7 +143,7 @@ def empty(self): return True value = self.value if not isinstance(value, string_types): - value = unicode_string(value) + value = text_type(value) if len(value.strip()): return False return True diff --git a/messytables/excel.py b/messytables/excel.py index 744e70c..abd658b 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -3,11 +3,11 @@ import xlrd from xlrd.biffh import XLRDError +from six import PY2 from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties from messytables.error import ReadError -from messytables.compat23 import PY2 class InvalidDateError(Exception): diff --git a/messytables/headers.py b/messytables/headers.py index 4434618..664352c 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -1,5 +1,7 @@ from collections import defaultdict -from messytables.compat23 import izip_longest + +import six + from messytables.core import Cell @@ -43,7 +45,7 @@ def headers_processor(headers): def apply_headers(row_set, row): _row = [] - pairs = izip_longest(row, headers) + pairs = six.itertools.izip_longest(row, headers) for i, (cell, header) in enumerate(pairs): if cell is None: cell = Cell(None) diff --git a/messytables/jts.py b/messytables/jts.py index 0254259..e2aeb61 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -5,7 +5,8 @@ import jsontableschema from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean -import messytables +from messytables.headers import headers_guess +from messytables.types import type_guess MESSYTABLES_TO_JTS_MAPPING = { @@ -26,9 +27,9 @@ def celltype_as_string(celltype): def rowset_as_jts(rowset, headers=None, types=None): ''' Create a json table schema from a rowset ''' - _, headers = messytables.headers_guess(rowset.sample) + _, headers = headers_guess(rowset.sample) types = list(map(celltype_as_string, - messytables.type_guess(rowset.sample))) + type_guess(rowset.sample))) return headers_and_typed_as_jts(headers, types) diff --git a/messytables/types.py b/messytables/types.py index 5709332..0b793b7 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,5 +1,6 @@ from collections import defaultdict -from messytables.compat23 import izip_longest + +import six from typecast import String, Integer, Decimal, Boolean, Date, DateTime @@ -65,7 +66,7 @@ def types_processor(types, strict=False): def apply_types(row_set, row): if types is None: return row - for cell, type in izip_longest(row, types): + for cell, type in six.itertools.izip_longest(row, types): try: cell.value = type.cast(cell.value) cell.type = type diff --git a/messytables/util.py b/messytables/util.py index 04dd160..df5f2fa 100644 --- a/messytables/util.py +++ b/messytables/util.py @@ -1,78 
+1,3 @@ -try: - # python 2.7: - from collections import OrderedDict -except ImportError: - ## {{{ http://code.activestate.com/recipes/576669/ (r18) - ## Raymond Hettingers proporsal to go in 2.7 - from collections import MutableMapping - - class OrderedDict(dict, MutableMapping): - - # Methods with direct access to underlying attributes - - def __init__(self, *args, **kwds): - if len(args) > 1: - raise TypeError('expected at 1 argument, got %d', len(args)) - if not hasattr(self, '_keys'): - self._keys = [] - self.update(*args, **kwds) - - def clear(self): - del self._keys[:] - dict.clear(self) - - def __setitem__(self, key, value): - if key not in self: - self._keys.append(key) - dict.__setitem__(self, key, value) - - def __delitem__(self, key): - dict.__delitem__(self, key) - self._keys.remove(key) - - def __iter__(self): - return iter(self._keys) - - def __reversed__(self): - return reversed(self._keys) - - def popitem(self): - if not self: - raise KeyError - key = self._keys.pop() - value = dict.pop(self, key) - return key, value - - def __reduce__(self): - items = [[k, self[k]] for k in self] - inst_dict = vars(self).copy() - inst_dict.pop('_keys', None) - return (self.__class__, (items,), inst_dict) - - # Methods with indirect access via the above methods - - setdefault = MutableMapping.setdefault - update = MutableMapping.update - pop = MutableMapping.pop - keys = MutableMapping.keys - values = MutableMapping.values - items = MutableMapping.items - - def __repr__(self): - pairs = ', '.join(map('%r: %r'.__mod__, self.items())) - return '%s({%s})' % (self.__class__.__name__, pairs) - - def copy(self): - return self.__class__(self) - - @classmethod - def fromkeys(cls, iterable, value=None): - d = cls() - for key in iterable: - d[key] = value - return d - ## end of http://code.activestate.com/recipes/576669/ }}} - def offset_processor(offset): """ Skip ``offset`` from the given iterator. This can diff --git a/messytables/zip.py b/messytables/zip.py index 4707d47..59f1a1b 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -1,15 +1,15 @@ import zipfile -import messytables +from messytables.core import TableSet +from messytables.any import any_tableset +from messytables.error import ReadError -class ZIPTableSet(messytables.TableSet): +class ZIPTableSet(TableSet): """ Reads TableSets from inside a ZIP file """ def __init__(self, fileobj, **kw): - """ - On error it will raise messytables.ReadError. - """ + """ On error it will raise ReadError. 
""" tables = [] found = [] z = zipfile.ZipFile(fileobj, 'r') @@ -25,8 +25,7 @@ def __init__(self, fileobj, **kw): ext = f.filename[f.filename.rindex(".") + 1:] try: - filetables = messytables.any.any_tableset( - z.open(f), extension=ext, **kw) + filetables = any_tableset(z.open(f), extension=ext, **kw) except ValueError as e: found.append(f.filename + ": " + e.message) continue @@ -34,8 +33,8 @@ def __init__(self, fileobj, **kw): tables.extend(filetables.tables) if len(tables) == 0: - raise messytables.ReadError('''ZIP file has no recognized - tables (%s).''' % ', '.join(found)) + raise ReadError('''ZIP file has no recognized tables (%s).''' + % ', '.join(found)) finally: z.close() diff --git a/setup.py b/setup.py index 08e7d52..218ad4f 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,8 @@ 'html5lib', 'json-table-schema>=0.2, <=0.2.1' 'typecast', - 'json-table-schema>=0.2' + 'six', + 'ordereddict', ], extras_require={'pdf': ['pdftables>=0.0.4']}, tests_require=[ diff --git a/test/test_stream.py b/test/test_stream.py index 1d677d5..9151e5f 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- import unittest -from messytables.compat23 import urlopen import requests import io +import six.moves.urllib as urllib + from . import horror_fobj from nose.tools import assert_equal import httpretty from messytables import CSVTableSet, XLSTableSet + class StreamInputTest(unittest.TestCase): @httpretty.activate def test_http_csv(self): @@ -18,7 +20,7 @@ def test_http_csv(self): httpretty.GET, url, body=horror_fobj('long.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -46,7 +48,7 @@ def test_http_csv_encoding(self): httpretty.GET, url, body=horror_fobj('utf-16le_encoded.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -59,7 +61,7 @@ def test_http_xls(self): httpretty.GET, url, body=horror_fobj('simple.xls').read(), content_type="application/ms-excel") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -72,7 +74,7 @@ def test_http_xlsx(self): httpretty.GET, url, body=horror_fobj('simple.xlsx').read(), content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) From e87c77472f1b6ab26df005e4b98ae00978b30c58 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:39:31 +0200 Subject: [PATCH 10/35] avoid circular import --- messytables/zip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/messytables/zip.py b/messytables/zip.py index 59f1a1b..680b44a 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -1,7 +1,6 @@ import zipfile from messytables.core import TableSet -from messytables.any import any_tableset from messytables.error import ReadError @@ -10,6 +9,7 @@ class ZIPTableSet(TableSet): def __init__(self, fileobj, **kw): """ On error it will raise ReadError. 
""" + from messytables.any import any_tableset tables = [] found = [] z = zipfile.ZipFile(fileobj, 'r') From 3dd9baddff29ce1fa3b28334d40a00dfe0c875e4 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:41:21 +0200 Subject: [PATCH 11/35] Clean up README. --- README.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 75667cd..787a362 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,4 @@ -# Parsing for messy tables - -[![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) -[![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) -[![Latest Version](https://pypip.in/version/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Downloads](https://pypip.in/download/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Supported Python versions](https://pypip.in/py_versions/messytables/badge.svg)](https://pypi.python.org/pypi/ckanserviceprovider/) -[![Development Status](https://pypip.in/status/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![License](https://pypip.in/license/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) +# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) A library for dealing with messy tabular data in several formats, guessing types and detecting headers. @@ -14,6 +6,6 @@ See the documentation at: https://messytables.readthedocs.io Find the package at: https://pypi.python.org/pypi/messytables -See CONTRIBUTING.md for how to send patches, run tests. +See ``CONTRIBUTING.md`` for how to send patches, run tests. **Contact**: Open Knowledge Labs - http://okfnlabs.org/contact/. 
We especially recommend the forum: http://discuss.okfn.org/category/open-knowledge-labs/ From 8a56e5dcd98ce8405219965f858088819b7c76e0 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:59:17 +0200 Subject: [PATCH 12/35] fix py3 compat --- .gitignore | 2 ++ messytables/headers.py | 2 +- messytables/types.py | 2 +- test/test_guessing.py | 1 - 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ebba6d9..0b3fb13 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ *.py~ *.~lock.*# .coverage + +pyenv3 diff --git a/messytables/headers.py b/messytables/headers.py index 664352c..6c28625 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -45,7 +45,7 @@ def headers_processor(headers): def apply_headers(row_set, row): _row = [] - pairs = six.itertools.izip_longest(row, headers) + pairs = six.moves.zip_longest(row, headers) for i, (cell, header) in enumerate(pairs): if cell is None: cell = Cell(None) diff --git a/messytables/types.py b/messytables/types.py index 0b793b7..92710e0 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -66,7 +66,7 @@ def types_processor(types, strict=False): def apply_types(row_set, row): if types is None: return row - for cell, type in six.itertools.izip_longest(row, types): + for cell, type in six.moves.zip_longest(row, types): try: cell.value = type.cast(cell.value) cell.type = type diff --git a/test/test_guessing.py b/test/test_guessing.py index 48e9e27..80883b5 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -89,7 +89,6 @@ def test_strict_type_guessing_with_large_file(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) - print guessed_types assert_equal(guessed_types, [ Integer(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), From afca9173469b544148a41d86022f2d7dfce2b02a Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:03:38 +0200 Subject: [PATCH 13/35] =?UTF-8?q?Don=E2=80=99t=20raise=20for=200=20as=20a?= =?UTF-8?q?=20date.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- messytables/excel.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/messytables/excel.py b/messytables/excel.py index abd658b..92bc4ed 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -113,7 +113,7 @@ def raw(self, sample=False): row.append(XLSCell.from_xlrdcell(cell, self.sheet, colnum, rownum)) except InvalidDateError: raise ValueError("Invalid date at '%s':%d,%d" % ( - self.sheet.name, colnum+1, rownum+1)) + self.sheet.name, colnum+1, rownum+1)) yield row @@ -123,14 +123,13 @@ def from_xlrdcell(xlrd_cell, sheet, col, row): value = xlrd_cell.value cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) if cell_type == Date(): - if value == 0: - raise InvalidDateError - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) + if value != 0: + year, month, day, hour, minute, second = \ + xlrd.xldate_as_tuple(value, sheet.book.datemode) + if (year, month, day) == (0, 0, 0): + value = time(hour, minute, second) + else: + value = datetime(year, month, day, hour, minute, second) messy_cell = XLSCell(value, type=cell_type) messy_cell.sheet = sheet 
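For illustration only (not part of the patch): the conversion this hunk now applies can be reproduced standalone with xlrd. A serial value of 0 has no calendar equivalent, and values below 1 carry only a time of day, which is what the `if value != 0` guard and the `(0, 0, 0)` check above account for.

    import xlrd
    from datetime import datetime, time

    def xls_value_to_python(value, datemode=0):
        # Mirror the guard above: a raw 0 is left untouched rather than
        # being forced into an invalid datetime.
        if value == 0:
            return value
        year, month, day, hour, minute, second = \
            xlrd.xldate_as_tuple(value, datemode)
        if (year, month, day) == (0, 0, 0):
            return time(hour, minute, second)
        return datetime(year, month, day, hour, minute, second)

    print(xls_value_to_python(0))    # 0 (left as-is)
    print(xls_value_to_python(0.5))  # 12:00:00 (time-only cell)
    print(xls_value_to_python(1.0))  # 1900-01-01 00:00:00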
messy_cell.xlrd_cell = xlrd_cell From 5f4d97898b590c53a3c6c869e0bc43f4a3c3af0d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:24:17 +0200 Subject: [PATCH 14/35] fix up test errors, attempt to make travis pass --- test/test_stream.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_stream.py b/test/test_stream.py index 9151e5f..335022e 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- +import io import unittest import requests -import io - import six.moves.urllib as urllib from . import horror_fobj -from nose.tools import assert_equal import httpretty +from nose.tools import assert_equal from messytables import CSVTableSet, XLSTableSet From 145e2eed866240ba6913bacd70bd7ca3e2dc3905 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:37:52 +0200 Subject: [PATCH 15/35] skip tests if en_GB is not supported --- test/test_guessing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_guessing.py b/test/test_guessing.py index 80883b5..ee8924a 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -4,6 +4,7 @@ from . import horror_fobj from nose.plugins.attrib import attr +from nose.plugins.skip import SkipTest from nose.tools import assert_equal from typecast import Date, String, Decimal, Integer, Boolean from messytables import CSVTableSet, type_guess, headers_guess @@ -28,8 +29,11 @@ def test_type_guess(self): Date('%d %b %Y'), Boolean(), Integer()]) def test_type_guess_strict(self): - import locale - locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + try: + import locale + locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + except: + raise SkipTest("Locale en_GB.UTF-8 not available.") csv_file = io.BytesIO(b''' 1, 2012/2/12, 2, 2,02 October 2011,"100.234354" 2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12" From dcdf21d14c1d0e7e4132a9b299dfed73603454aa Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:59:09 +0200 Subject: [PATCH 16/35] remove ambiguous var --- messytables/headers.py | 6 +++--- messytables/types.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/messytables/headers.py b/messytables/headers.py index 6c28625..04c05d2 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -1,6 +1,6 @@ -from collections import defaultdict - import six +from collections import defaultdict +from itertools import islice from messytables.core import Cell @@ -27,7 +27,7 @@ def headers_guess(rows, tolerance=1): The return value is a tuple of the offset of the header row and the names of the columns. 
""" - rows = list(rows) + rows = list(islice(rows, 1000)) modal = column_count_modal(rows) for i, row in enumerate(rows): length = len([c for c in row if not c.empty]) diff --git a/messytables/types.py b/messytables/types.py index 92710e0..0575201 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -31,18 +31,18 @@ def type_guess(rows, types=TYPES, strict=False): diff = len(row) - len(guesses) for _ in range(diff): guesses.append(defaultdict(int)) - for i, cell in enumerate(row): + for j, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[i][String()] = guesses[i].get(String(), 0) + guesses[j][String()] = guesses[j].get(String(), 0) for type in type_instances: - if guesses[i][type] == FAILED: + if guesses[j][type] == FAILED: continue result = type.test(cell.value) weight = WEIGHTS[type.__class__] if strict and (result == -1) and not isinstance(type, String): - guesses[i][type] = FAILED + guesses[j][type] = FAILED elif result == 1: - guesses[i][type] += weight + guesses[j][type] += weight _columns = [] for guess in guesses: From de3e84060112a194b4c6bb0734e64af189c7868e Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 15:21:07 +0200 Subject: [PATCH 17/35] dont score null values in type detection --- messytables/types.py | 14 +++++++------- test/test_guessing.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/messytables/types.py b/messytables/types.py index 0575201..24813ec 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -34,15 +34,15 @@ def type_guess(rows, types=TYPES, strict=False): for j, cell in enumerate(row): # add string guess so that we have at least one guess guesses[j][String()] = guesses[j].get(String(), 0) - for type in type_instances: - if guesses[j][type] == FAILED: + for inst in type_instances: + if guesses[j][inst] == FAILED or cell.empty: continue - result = type.test(cell.value) - weight = WEIGHTS[type.__class__] - if strict and (result == -1) and not isinstance(type, String): - guesses[j][type] = FAILED + result = inst.test(cell.value) + weight = WEIGHTS[inst.__class__] + if strict and (result == -1) and not isinstance(inst, String): + guesses[j][inst] = FAILED elif result == 1: - guesses[j][type] += weight + guesses[j][inst] += weight _columns = [] for guess in guesses: diff --git a/test/test_guessing.py b/test/test_guessing.py index ee8924a..4e8dcac 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -93,24 +93,26 @@ def test_strict_type_guessing_with_large_file(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) - assert_equal(guessed_types, [ - Integer(), String(), String(), String(), + assumed_types = [Integer(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), Integer(), String(), - String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), Integer(), String(), Decimal(), Decimal(), String(), String(), String(), String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), + String(), String(), String(), Integer(), String(), Integer(), String(), String(), String(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), 
String(), String(), String(), String(), String(), String(), String(), String(), String(), + Integer(), String(), String(), String(), String(), String(), String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), - String(), String(), String(), Date('%d/%m/%y'), Date('%d/%m/%y'), - String(), String(), String()]) + String(), String(), String(), String(), String(), Integer(), + String(), Date('%d/%m/%y'), Date('%d/%m/%y'), Date('%d/%m/%y'), + Date('%d/%m/%y'), String(), String(), String()] + # for (ta, tb) in zip(guessed_types, assumed_types): + # print (ta, tb) + assert_equal(guessed_types, assumed_types) def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') From 10576f3454d40c42a73213034c7e8f8f35ed9f0b Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:35:33 +0200 Subject: [PATCH 18/35] Move test utilities to a specific module. --- test/__init__.py | 6 ------ test/test_any.py | 2 +- test/test_guessing.py | 21 +++++++++++++++++++-- test/test_properties.py | 2 +- test/test_read.py | 2 +- test/test_rowset.py | 2 +- test/test_stream.py | 2 +- test/test_tableset.py | 2 +- test/util.py | 6 ++++++ 9 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 test/util.py diff --git a/test/__init__.py b/test/__init__.py index 060bb3e..e69de29 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,6 +0,0 @@ -import os - - -def horror_fobj(name): - fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) - return open(fn, 'rb') diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..ce39b1c 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index 4e8dcac..024558e 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -1,8 +1,10 @@ # -*- coding: utf-8 -*- import unittest import io +# import cProfile +# from pstats import Stats -from . import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr from nose.plugins.skip import SkipTest from nose.tools import assert_equal @@ -12,6 +14,17 @@ class TypeGuessTest(unittest.TestCase): + + # def setUp(self): + # self.pr = cProfile.Profile() + # self.pr.enable() + + # def tearDown(self): + # p = Stats(self.pr) + # p.strip_dirs() + # p.sort_stats('cumtime') + # p.print_stats() + @attr("slow") def test_type_guess(self): csv_file = io.BytesIO(b''' @@ -122,7 +135,7 @@ def test_file_with_few_strings_among_integers(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) - print(guessed_types) + # print(guessed_types) assert_equal(guessed_types, [ Integer(), Integer(), Integer(), Integer(), Integer(), Integer(), @@ -141,3 +154,7 @@ def helper(value): assert_equal(helper('123.0'), False) assert_equal(helper(123.1), False) assert_equal(helper('123.1'), False) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_properties.py b/test/test_properties.py index 5ec3f6d..0a7ca09 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . 
import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index 2901c67..092e744 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest diff --git a/test/test_rowset.py b/test/test_rowset.py index 4b47e7c..52e3928 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index 335022e..2ed6efd 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -4,7 +4,7 @@ import requests import six.moves.urllib as urllib -from . import horror_fobj +from util import horror_fobj import httpretty from nose.tools import assert_equal diff --git a/test/test_tableset.py b/test/test_tableset.py index 4c2148c..d03de88 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError diff --git a/test/util.py b/test/util.py new file mode 100644 index 0000000..060bb3e --- /dev/null +++ b/test/util.py @@ -0,0 +1,6 @@ +import os + + +def horror_fobj(name): + fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) + return open(fn, 'rb') From 7da15bfdce4191b3f75325d24728080da0b14d14 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:36:03 +0200 Subject: [PATCH 19/35] =?UTF-8?q?Move=20the=20buffered=20reader=20to=20it?= =?UTF-8?q?=E2=80=99s=20own=20module.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- messytables/__init__.py | 3 +- messytables/any.py | 2 +- messytables/buffered.py | 87 +++++++++++++++++++++++++++++++++ messytables/commas.py | 4 +- messytables/core.py | 104 ++++------------------------------------ 5 files changed, 101 insertions(+), 99 deletions(-) create mode 100644 messytables/buffered.py diff --git a/messytables/__init__.py b/messytables/__init__.py index 53e1dc6..014a095 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -5,7 +5,8 @@ from messytables.types import type_guess, types_processor from messytables.error import ReadError -from messytables.core import Cell, TableSet, RowSet, seekable_stream +from messytables.buffered import seekable_stream +from messytables.core import Cell, TableSet, RowSet from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet diff --git a/messytables/any.py b/messytables/any.py index 13cac56..9d305ee 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -2,7 +2,7 @@ from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet from messytables import HTMLTableSet, ODSTableSet -from messytables.core import seekable_stream +from messytables.buffered import seekable_stream from messytables.error import ReadError diff --git a/messytables/buffered.py b/messytables/buffered.py new file mode 100644 index 0000000..60335cb --- /dev/null +++ 
b/messytables/buffered.py @@ -0,0 +1,87 @@ +import io + + +def seekable_stream(fileobj): + try: + fileobj.seek(0) + # if we got here, the stream is seekable + except: + # otherwise seek failed, so slurp in stream and wrap + # it in a BytesIO + fileobj = BufferedFile(fileobj) + return fileobj + + +class BufferedFile(object): + """A buffered file that preserves the beginning of a stream.""" + + def __init__(self, fp, buffer_size=2048): + self.data = io.BytesIO() + self.fp = fp + self.offset = 0 + self.len = 0 + self.fp_offset = 0 + self.buffer_size = buffer_size + + def _next_line(self): + try: + return self.fp.readline() + except AttributeError: + return next(self.fp) + + def _read(self, n): + return self.fp.read(n) + + @property + def _buffer_full(self): + return self.len >= self.buffer_size + + def readline(self): + if self.len < self.offset < self.fp_offset: + raise BufferError('Line is not available anymore') + if self.offset >= self.len: + line = self._next_line() + self.fp_offset += len(line) + + self.offset += len(line) + + if not self._buffer_full: + self.data.write(line) + self.len += len(line) + else: + line = self.data.readline() + self.offset += len(line) + return line + + def read(self, n=-1): + if n == -1: + # if the request is to do a complete read, then do a complete + # read. + self.data.seek(self.offset) + return self.data.read(-1) + self.fp.read(-1) + + if self.len < self.offset < self.fp_offset: + raise BufferError('Data is not available anymore') + if self.offset >= self.len: + byte = self._read(n) + self.fp_offset += len(byte) + + self.offset += len(byte) + + if not self._buffer_full: + self.data.write(byte) + self.len += len(byte) + else: + byte = self.data.read(n) + self.offset += len(byte) + return byte + + def tell(self): + return self.offset + + def seek(self, offset): + if self.len < offset < self.fp_offset: + raise BufferError('Cannot seek because data is not buffered here') + self.offset = offset + if offset < self.len: + self.data.seek(offset) diff --git a/messytables/commas.py b/messytables/commas.py index 29fa243..4e10b55 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -4,13 +4,13 @@ from six import text_type, binary_type, PY2 -from messytables.core import seekable_stream +from messytables.buffered import seekable_stream from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError class UTF8Recoder: - """ Iterator that reads an encoded stream and re-encodes it to UTF-8. 
""" + """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" # maps between chardet encoding and codecs bom keys BOM_MAPPING = { diff --git a/messytables/core.py b/messytables/core.py index 2042262..8915229 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,4 +1,3 @@ -import io from collections import Mapping try: # python 2.7: @@ -13,93 +12,6 @@ from messytables.error import TableError, NoSuchPropertyError -def seekable_stream(fileobj): - try: - fileobj.seek(0) - # if we got here, the stream is seekable - except: - # otherwise seek failed, so slurp in stream and wrap - # it in a BytesIO - fileobj = BufferedFile(fileobj) - return fileobj - - -class BufferedFile(object): - ''' A buffered file that preserves the beginning of - a stream up to buffer_size - ''' - def __init__(self, fp, buffer_size=2048): - self.data = io.BytesIO() - self.fp = fp - self.offset = 0 - self.len = 0 - self.fp_offset = 0 - self.buffer_size = buffer_size - - def _next_line(self): - try: - return self.fp.readline() - except AttributeError: - return next(self.fp) - - def _read(self, n): - return self.fp.read(n) - - @property - def _buffer_full(self): - return self.len >= self.buffer_size - - def readline(self): - if self.len < self.offset < self.fp_offset: - raise BufferError('Line is not available anymore') - if self.offset >= self.len: - line = self._next_line() - self.fp_offset += len(line) - - self.offset += len(line) - - if not self._buffer_full: - self.data.write(line) - self.len += len(line) - else: - line = self.data.readline() - self.offset += len(line) - return line - - def read(self, n=-1): - if n == -1: - # if the request is to do a complete read, then do a complete - # read. - self.data.seek(self.offset) - return self.data.read(-1) + self.fp.read(-1) - - if self.len < self.offset < self.fp_offset: - raise BufferError('Data is not available anymore') - if self.offset >= self.len: - byte = self._read(n) - self.fp_offset += len(byte) - - self.offset += len(byte) - - if not self._buffer_full: - self.data.write(byte) - self.len += len(byte) - else: - byte = self.data.read(n) - self.offset += len(byte) - return byte - - def tell(self): - return self.offset - - def seek(self, offset): - if self.len < offset < self.fp_offset: - raise BufferError('Cannot seek because data is not buffered here') - self.offset = offset - if offset < self.len: - self.data.seek(offset) - - class CoreProperties(Mapping): KEYS = [] @@ -117,10 +29,12 @@ def __len__(self): class Cell(object): - """ A cell is the basic value type. It always has a ``value`` (that - may be ``None`` and may optionally also have a type and column name - associated with it. If no ``type`` is set, the String type is set - but no type conversion is set. """ + """A cell is the basic value type. + + It always has a ``value`` (that may be ``None`` and may optionally + also have a type and column name associated with it. If no ``type`` + is set, the String type is set but no type conversion is set. + """ def __init__(self, value, column=None, type=None): if type is None: @@ -138,7 +52,7 @@ def __repr__(self): @property def empty(self): - """ Stringify the value and check that it has a length. """ + """Stringify the value and check that it has a length.""" if self.value is None: return True value = self.value @@ -150,7 +64,7 @@ def empty(self): @property def properties(self): - """ Source-specific information. Only a placeholder here. """ + """Source-specific information. 
Only a placeholder here.""" return CoreProperties() @property @@ -240,7 +154,7 @@ def register_processor(self, processor): self._processors.append(processor) def __iter__(self, sample=False): - """ Apply processors to the row data. """ + """Apply processors to the row data.""" for row in self.raw(sample=sample): for processor in self._processors: row = processor(self, row) From ccb094c1f28dce805468ba8c544bc41e37b39a05 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:36:15 +0200 Subject: [PATCH 20/35] Move guesser class to typecast. --- .gitignore | 3 +- messytables/types.py | 71 +++++++++++--------------------------------- setup.py | 2 +- 3 files changed, 21 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 0b3fb13..2df0131 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ *.swp *.egg-info *.pyc +*.eggs *.DS_Store */_build/* *.py~ *.~lock.*# .coverage - +dist/* pyenv3 diff --git a/messytables/types.py b/messytables/types.py index 24813ec..815d846 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,68 +1,33 @@ -from collections import defaultdict - import six +from typecast import guesser, GUESS_TYPES -from typecast import String, Integer, Decimal, Boolean, Date, DateTime - -WEIGHTS = { - String: 1, - Integer: 6, - Decimal: 3, - Boolean: 7, - Date: 4, - DateTime: 5 -} -TYPES = [String, Decimal, Integer, Boolean, Date, DateTime] -FAILED = 'failed' +def type_guess(rows, types=GUESS_TYPES, strict=False): + """Guess the best type for a given row set. -def type_guess(rows, types=TYPES, strict=False): - """ The type guesser aggregates the number of successful - conversions of each column to each type, weights them by a - fixed type priority and select the most probable type for - each column based on that figure. It returns a list of - ``CellType``. Empty cells are ignored. + The type guesser aggregates the number of successful conversions of each + column to each type, weights them by a fixed type priority and select the + most probable type for each column based on that figure. It returns a list + of ``CellType``. Empty cells are ignored. - Strict means that a type will not be guessed - if parsing fails for a single cell in the column.""" - guesses = [] - type_instances = [i for t in types for i in t.instances()] + Strict means that a type will not be guessed if parsing fails for a single + cell in the column. + """ + guessers = [] for i, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - guesses.append(defaultdict(int)) + for _ in range(len(row) - len(guessers)): + guessers.append(guesser(types=types, strict=strict)) for j, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[j][String()] = guesses[j].get(String(), 0) - for inst in type_instances: - if guesses[j][inst] == FAILED or cell.empty: - continue - result = inst.test(cell.value) - weight = WEIGHTS[inst.__class__] - if strict and (result == -1) and not isinstance(inst, String): - guesses[j][inst] = FAILED - elif result == 1: - guesses[j][inst] += weight - - _columns = [] - for guess in guesses: - # this first creates an array of tuples because we want the types to be - # sorted. 
Even though it is not specified, python chooses the first - # element in case of a tie - # See: http://stackoverflow.com/a/6783101/214950 - guesses_tuples = [(t, guess[t]) for t in type_instances - if t in guess and guess[t] != FAILED] - # print 'GUESSES', zip(row, guesses_tuples) - _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) - return _columns + guessers[j].add(cell.value) + return [g.best for g in guessers] def types_processor(types, strict=False): - """ Apply the column types set on the instance to the - current row, attempting to cast each cell to the specified - type. + """Apply the column types to the each row. - Strict means that casting errors are not ignored""" + Strict means that casting errors are not ignored. + """ def apply_types(row_set, row): if types is None: return row diff --git a/setup.py b/setup.py index 218ad4f..5b729de 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast', + 'typecast>=0.3.0', 'six', 'ordereddict', ], From 2565632990a64cb7b1235cfdb3c14665367aac10 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:06:54 +0200 Subject: [PATCH 21/35] Factor out CSV re-coder --- messytables/buffered.py | 8 +++-- messytables/commas.py | 79 +++++------------------------------------ messytables/text.py | 69 +++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 74 deletions(-) create mode 100644 messytables/text.py diff --git a/messytables/buffered.py b/messytables/buffered.py index 60335cb..dea877f 100644 --- a/messytables/buffered.py +++ b/messytables/buffered.py @@ -1,21 +1,23 @@ import io +BUFFER_SIZE = 4096 + def seekable_stream(fileobj): try: fileobj.seek(0) # if we got here, the stream is seekable + return fileobj except: # otherwise seek failed, so slurp in stream and wrap # it in a BytesIO - fileobj = BufferedFile(fileobj) - return fileobj + return BufferedFile(fileobj) class BufferedFile(object): """A buffered file that preserves the beginning of a stream.""" - def __init__(self, fp, buffer_size=2048): + def __init__(self, fp, buffer_size=BUFFER_SIZE): self.data = io.BytesIO() self.fp = fp self.offset = 0 diff --git a/messytables/commas.py b/messytables/commas.py index 4e10b55..c89e44d 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,76 +1,13 @@ import csv -import codecs -import chardet -from six import text_type, binary_type, PY2 +from six import text_type, PY2 -from messytables.buffered import seekable_stream +from messytables.buffered import seekable_stream, BUFFER_SIZE +from messytables.text import UTF8Recoder, to_unicode_or_bust from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError - -class UTF8Recoder: - """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" - - # maps between chardet encoding and codecs bom keys - BOM_MAPPING = { - 'utf-16le': 'BOM_UTF16_LE', - 'utf-16be': 'BOM_UTF16_BE', - 'utf-32le': 'BOM_UTF32_LE', - 'utf-32be': 'BOM_UTF32_BE', - 'utf-8': 'BOM_UTF8', - 'utf-8-sig': 'BOM_UTF8', - - } - - def __init__(self, f, encoding): - sample = f.read(2000) - if not encoding: - results = chardet.detect(sample) - encoding = results['encoding'] - if not encoding: - # Don't break, just try and load the data with - # a semi-sane encoding - encoding = 'utf-8' - f.seek(0) - self.reader = codecs.getreader(encoding)(f, 'ignore') - - # The reader only skips a BOM if the encoding isn't explicit about its - # endianness (i.e. 
if encoding is UTF-16 a BOM is handled properly - # and taken out, but if encoding is UTF-16LE a BOM is ignored). - # However, if chardet sees a BOM it returns an encoding with the - # endianness explicit, which results in the codecs stream leaving the - # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} - # encodings, check for a BOM and remove it if it's there. - if encoding.lower() in self.BOM_MAPPING: - bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) - if bom: - # Try to read the BOM, which is a byte sequence, from - # the underlying stream. If all characters match, then - # go on. Otherwise when a character doesn't match, seek - # the stream back to the beginning and go on. - for c in bom: - if f.read(1) != c: - f.seek(0) - break - - def __iter__(self): - return self - - def __next__(self): - line = self.reader.readline() - if not line or line == '\0': - raise StopIteration - result = line.encode("utf-8") - return result - - next = __next__ - - -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, binary_type): - obj = text_type(obj, encoding) - return obj +DELIMITERS = ['\t', ',', ';', '|'] class CSVTableSet(TableSet): @@ -91,7 +28,7 @@ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, self.skipinitialspace = skipinitialspace def make_tables(self): - """ Return the actual CSV table. """ + """Return the actual CSV table.""" return [CSVRowSet(self.name, self.fileobj, delimiter=self.delimiter, quotechar=self.quotechar, @@ -112,12 +49,12 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=None, lineterminator=None, skipinitialspace=None): self.name = name - seekable_fileobj = seekable_stream(fileobj) - self.fileobj = UTF8Recoder(seekable_fileobj, encoding) + self.fh = seekable_stream(fileobj) + self.fileobj = UTF8Recoder(self.fh, encoding) def fake_ilines(fobj): for row in fobj: - yield row.decode('utf-8') + yield row.decode('utf-8') self.lines = fake_ilines(self.fileobj) self._sample = [] self.delimiter = delimiter diff --git a/messytables/text.py b/messytables/text.py new file mode 100644 index 0000000..fe8121d --- /dev/null +++ b/messytables/text.py @@ -0,0 +1,69 @@ +import codecs +import chardet +from six import text_type, binary_type + +from messytables.buffered import BUFFER_SIZE + + +class UTF8Recoder: + """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" + + # maps between chardet encoding and codecs bom keys + BOM_MAPPING = { + 'utf-16le': 'BOM_UTF16_LE', + 'utf-16be': 'BOM_UTF16_BE', + 'utf-32le': 'BOM_UTF32_LE', + 'utf-32be': 'BOM_UTF32_BE', + 'utf-8': 'BOM_UTF8', + 'utf-8-sig': 'BOM_UTF8', + + } + + def __init__(self, f, encoding): + sample = f.read(BUFFER_SIZE) + if not encoding: + results = chardet.detect(sample) + encoding = results['encoding'] + if not encoding: + # Don't break, just try and load the data with + # a semi-sane encoding + encoding = 'utf-8' + f.seek(0) + self.reader = codecs.getreader(encoding)(f, 'ignore') + + # The reader only skips a BOM if the encoding isn't explicit about its + # endianness (i.e. if encoding is UTF-16 a BOM is handled properly + # and taken out, but if encoding is UTF-16LE a BOM is ignored). + # However, if chardet sees a BOM it returns an encoding with the + # endianness explicit, which results in the codecs stream leaving the + # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} + # encodings, check for a BOM and remove it if it's there. 
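For illustration only (not part of the patch), the BOM behaviour described in the comment block above, shown with the stdlib codecs: an endianness-explicit codec leaves the BOM in the decoded text, while the endianness-agnostic one consumes it, which is why the bytes are checked and skipped manually below.

    import codecs

    raw = codecs.BOM_UTF16_LE + u'A'.encode('utf-16-le')
    assert raw.decode('utf-16') == u'A'           # BOM consumed
    assert raw.decode('utf-16-le') == u'\ufeffA'  # BOM left in the text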
+ if encoding.lower() in self.BOM_MAPPING: + bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) + if bom: + # Try to read the BOM, which is a byte sequence, from + # the underlying stream. If all characters match, then + # go on. Otherwise when a character doesn't match, seek + # the stream back to the beginning and go on. + for c in bom: + if f.read(1) != c: + f.seek(0) + break + + def __iter__(self): + return self + + def __next__(self): + line = self.reader.readline() + if not line or line == '\0': + raise StopIteration + result = line.encode("utf-8") + return result + + next = __next__ + + +def to_unicode_or_bust(obj, encoding='utf-8'): + if isinstance(obj, binary_type): + obj = text_type(obj, encoding) + return obj From b63baeb647e61bf4d593f0ac79522dadee1f8a18 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:08:52 +0200 Subject: [PATCH 22/35] use cchardet --- messytables/text.py | 5 ++++- setup.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/messytables/text.py b/messytables/text.py index fe8121d..4ebec79 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -1,5 +1,8 @@ import codecs -import chardet +try: + import cchardet as chardet +except ImportError: + import chardet from six import text_type, binary_type from messytables.buffered import BUFFER_SIZE diff --git a/setup.py b/setup.py index 5b729de..2da635c 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'xlrd>=0.8.0', 'python-magic>=0.4.12', # used for type guessing 'chardet>=2.3.0', + 'cchardet', 'lxml>=3.2', 'requests>=2.0', 'html5lib', From 2e4b96c0ffa87401e83e83629b7ad1db72d3287d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:23:37 +0200 Subject: [PATCH 23/35] simplify the handling of CSV dialects --- messytables/commas.py | 42 ++++++++++++++---------------------------- messytables/text.py | 7 +------ setup.py | 2 +- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/messytables/commas.py b/messytables/commas.py index c89e44d..f0b79e3 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -9,13 +9,16 @@ DELIMITERS = ['\t', ',', ';', '|'] +# Fix the maximum field size to something a little larger +csv.field_size_limit(256000) + class CSVTableSet(TableSet): """ A CSV table set. Since CSV is always just a single table, this is just a pass-through for the row set. """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, - encoding=None, window=None, doublequote=None, + encoding=None, window=None, doublequote=True, lineterminator=None, skipinitialspace=None, **kw): self.fileobj = seekable_stream(fileobj) self.name = name or 'table' @@ -46,7 +49,7 @@ class CSVRowSet(RowSet): fragment. 
""" def __init__(self, name, fileobj, delimiter=None, quotechar=None, - encoding='utf-8', window=None, doublequote=None, + encoding='utf-8', window=None, doublequote=True, lineterminator=None, skipinitialspace=None): self.name = name self.fh = seekable_stream(fileobj) @@ -75,32 +78,19 @@ def _dialect(self): delim = '\n' # NATIVE sample = delim.join(self._sample) try: - dialect = csv.Sniffer().sniff(sample, - delimiters=['\t', ',', ';', '|']) - dialect.delimiter = str(dialect.delimiter) - dialect.quotechar = str(dialect.quotechar) - dialect.lineterminator = delim + dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) + dialect.delimiter = self.delimiter or str(dialect.delimiter) + dialect.quotechar = self.quotechar or str(dialect.quotechar) + dialect.lineterminator = self.lineterminator or delim + if self.skipinitialspace is not None: + dialect.skipinitialspace = self.skipinitialspace + if self.lineterminator is not None: + dialect.lineterminator = self.lineterminator dialect.doublequote = True return dialect except csv.Error: return csv.excel - @property - def _overrides(self): - # some variables in the dialect can be overridden - d = {} - if self.delimiter: - d['delimiter'] = self.delimiter - if self.quotechar: - d['quotechar'] = self.quotechar - if self.doublequote: - d['doublequote'] = self.doublequote - if self.lineterminator: - d['lineterminator'] = self.lineterminator - if self.skipinitialspace is not None: - d['skipinitialspace'] = self.skipinitialspace - return d - def raw(self, sample=False): def rows(): for line in self._sample: @@ -115,12 +105,8 @@ def rows(): else: yield line - # Fix the maximum field size to something a little larger - csv.field_size_limit(256000) - try: - for row in csv.reader(rows(), - dialect=self._dialect, **self._overrides): + for row in csv.reader(rows(), dialect=self._dialect): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: if u'newline inside string' in text_type(err) and sample: diff --git a/messytables/text.py b/messytables/text.py index 4ebec79..17c8097 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -25,12 +25,7 @@ class UTF8Recoder: def __init__(self, f, encoding): sample = f.read(BUFFER_SIZE) if not encoding: - results = chardet.detect(sample) - encoding = results['encoding'] - if not encoding: - # Don't break, just try and load the data with - # a semi-sane encoding - encoding = 'utf-8' + encoding = chardet.detect(sample).get('encoding') or 'utf-8' f.seek(0) self.reader = codecs.getreader(encoding)(f, 'ignore') diff --git a/setup.py b/setup.py index 2da635c..33a9edb 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast>=0.3.0', + 'typecast>=0.3.1', 'six', 'ordereddict', ], From f3733258a354d438b57ba81b85d40dbfb9267718 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:34:03 +0200 Subject: [PATCH 24/35] try relative imports with py3 --- test/test_any.py | 2 +- test/test_guessing.py | 2 +- test/test_properties.py | 2 +- test/test_read.py | 2 +- test/test_rowset.py | 2 +- test/test_stream.py | 2 +- test/test_tableset.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_any.py b/test/test_any.py index ce39b1c..bfb37a1 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from 
messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index 024558e..141a3ff 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -4,7 +4,7 @@ # import cProfile # from pstats import Stats -from util import horror_fobj +from .util import horror_fobj from nose.plugins.attrib import attr from nose.plugins.skip import SkipTest from nose.tools import assert_equal diff --git a/test/test_properties.py b/test/test_properties.py index 0a7ca09..b4e4a0c 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index 092e744..ac9b384 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest diff --git a/test/test_rowset.py b/test/test_rowset.py index 52e3928..39077d9 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index 2ed6efd..f2e5723 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -4,7 +4,7 @@ import requests import six.moves.urllib as urllib -from util import horror_fobj +from .util import horror_fobj import httpretty from nose.tools import assert_equal diff --git a/test/test_tableset.py b/test/test_tableset.py index d03de88..9d0c127 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError From 96549a9e9a21bfbba9ac54663486316dd36ab3c7 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:59:48 +0200 Subject: [PATCH 25/35] PEP8. --- messytables/core.py | 57 +++++++++++++++++++++++------------------- messytables/error.py | 11 +++----- messytables/headers.py | 24 ++++++++++-------- messytables/html.py | 40 +++++++++-------------------- messytables/ods.py | 31 ++++++++++++----------- messytables/util.py | 7 +++--- messytables/zip.py | 4 +-- 7 files changed, 83 insertions(+), 91 deletions(-) diff --git a/messytables/core.py b/messytables/core.py index 8915229..7adc9df 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -69,21 +69,21 @@ def properties(self): @property def topleft(self): - """ - Is the cell the top-left of a span? Non-spanning cells are the top left. - - This is used for example in HTML generation where the top left cell - is the only one which is written into the output representation. + """Non-spanning cells are the top left. + This is used for example in HTML generation where the top left + cell is the only one which is written into the output representation. In absense of other knowledge, we assume that all cells are top left. """ + # This seems oddly over-specific, can we solve it otherwise? 
return True class TableSet(object): - """ A table set is used for data formats in which multiple tabular - objects are bundled. This might include relational databases and - workbooks used in spreadsheet software (Excel, LibreOffice). + """A table set bundles multiple tabular objects. + + This might include relational databases and workbooks used in spreadsheet + software (Excel, LibreOffice). For each format, we derive from this abstract base class, providing a constructor that takes a file object and tables() that returns each table. @@ -92,14 +92,14 @@ class TableSet(object): On any fatal errors, it should raise messytables.ReadError """ + def __init__(self, fileobj): - """ Store the fileobj, and perhaps all or part of the file. """ + """Store the fileobj, and perhaps all or part of the file.""" pass @property def tables(self): - """ Return a listing of tables (i.e. RowSets) in the ``TableSet``. - Each table has a name. """ + """Get a listing of ``RowSets``.""" if getattr(self, "_tables", None) is None: self._tables = self.make_tables() return self._tables @@ -107,8 +107,9 @@ def tables(self): def make_tables(self): raise NotImplementedError("make_tables() not implemented on {0}" .format(type(self))) + def __getitem__(self, name): - """ Return a RowSet based on the name given """ + """Return a RowSet based on the name given.""" matching = [table for table in self.tables if table.name == name] if not matching: raise TableError("No table called %r" % name) @@ -118,16 +119,18 @@ def __getitem__(self, name): @classmethod def from_fileobj(cls, fileobj, *args, **kwargs): - """ Deprecated, only for compatibility reasons """ + """Deprecated, only for compatibility reasons.""" return cls(fileobj, *args, **kwargs) class RowSet(object): - """ A row set (aka: table) is a simple wrapper for an iterator of - rows (which in turn is a list of ``Cell`` objects). The main table - iterable can only be traversed once, so on order to allow analytics - like type and header guessing on the data, a sample of ``window`` - rows is read, cached, and made available. + """A single table, which allows iterating over individual rows. + + A row set (aka: table) is a simple wrapper for an iterator of rows + (which in turn is a list of ``Cell`` objects). The main table iterable + can only be traversed once, so on order to allow analytics like type and + header guessing on the data, a sample of ``window`` rows is read, cached, + and made available. On any fatal errors, it should raise messytables.ReadError """ @@ -147,10 +150,11 @@ def get_types(self): types = property(get_types, set_types) def register_processor(self, processor): - """ Register a stream processor to be used on each row. A - processor is a function called with the ``RowSet`` as its - first argument and the row to be processed as the second - argument. """ + """Register a stream processor to be used on each row. + + A processor is a function called with the ``RowSet`` as its first + argument and the row to be processed as the second argument. + """ self._processors.append(processor) def __iter__(self, sample=False): @@ -171,10 +175,11 @@ def sample(self): return self.__iter__(sample=True) def dicts(self, sample=False): - """ Return a representation of the data as an iterator of - ordered dictionaries. This is less specific than the cell - format returned by the generic iterator but only gives a - subset of the information. """ + """Return the table data as an iterator of ordered dictionaries. 
+ + This is less specific than the cell format returned by the generic + iterator but only gives a subset of the information. + """ generator = self.sample if sample else self for row in generator: yield OrderedDict([(c.column, c.value) for c in row]) diff --git a/messytables/error.py b/messytables/error.py index 3df3f63..255f4ab 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -1,18 +1,15 @@ class MessytablesError(Exception): - """ A generic error to inherit from. """ + """A generic error to inherit from.""" class ReadError(MessytablesError): - """ Error reading the file/stream in terms of the expected format. """ - pass + """Error reading the file/stream in terms of the expected format.""" class TableError(MessytablesError, LookupError): - """ Couldn't identify correct table. """ - pass + """Couldn't identify correct table.""" class NoSuchPropertyError(MessytablesError, KeyError): - """ The requested property doesn't exist. """ - pass + """The requested property doesn't exist.""" diff --git a/messytables/headers.py b/messytables/headers.py index 04c05d2..0b20453 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -6,9 +6,10 @@ def column_count_modal(rows): - """ Return the modal value of columns in the row_set's - sample. This can be assumed to be the number of columns - of the table. """ + """Return the modal value of columns in the row_set's sample. + + This can be assumed to be the number of columns of the table. + """ counts = defaultdict(int) for row in rows: length = len([c for c in row if not c.empty]) @@ -20,7 +21,8 @@ def column_count_modal(rows): def headers_guess(rows, tolerance=1): - """ Guess the offset and names of the headers of the row set. + """Guess the offset and names of the headers of the row set. + This will attempt to locate the first row within ``tolerance`` of the mode of the number of rows in the row set sample. @@ -40,9 +42,10 @@ def headers_guess(rows, tolerance=1): def headers_processor(headers): - """ Add column names to the cells in a row_set. If no header is - defined, use an autogenerated name. """ + """Add column names to the cells in a row_set. + If no header is defined, use an autogenerated name. + """ def apply_headers(row_set, row): _row = [] pairs = six.moves.zip_longest(row, headers) @@ -59,11 +62,12 @@ def apply_headers(row_set, row): def headers_make_unique(headers, max_length=None): - """Make sure the header names are unique. For non-unique - columns, append 1, 2, 3, ... after the name. If max_length - is set, truncate the original string so that the headers are - unique up to that length.""" + """Make sure the header names are unique. + For non-unique columns, append 1, 2, 3, ... after the name. If max_length + is set, truncate the original string so that the headers are unique up to + that length. + """ headers = [h.strip() for h in headers] new_digits_length = 0 diff --git a/messytables/html.py b/messytables/html.py index 4f02f26..20c0f35 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -15,9 +15,8 @@ def fromstring(s): class HTMLTableSet(TableSet): - """ - A TableSet from a HTML document. - """ + """A TableSet from a HTML document.""" + def __init__(self, fileobj=None, filename=None, window=None, **kw): if filename is not None: @@ -45,9 +44,7 @@ def __init__(self, fileobj=None, filename=None, window=None, **kw): "other tables. This is a bug." # avoid infinite loops def make_tables(self): - """ - Return a listing of tables (as HTMLRowSets) in the table set. 
- """ + """Return a listing of tables (as HTMLRowSets) in the table set.""" def rowset_name(rowset, table_index): return "Table {0} of {1}".format(table_index + 1, len(self.htmltables)) @@ -71,9 +68,8 @@ def insert_blank_cells(row, blanks): class HTMLRowSet(RowSet): - """ - A RowSet representing a HTML table. - """ + """A RowSet representing a HTML table.""" + def __init__(self, name, sheet, window=None): self.name = name self.sheet = sheet @@ -81,11 +77,8 @@ def __init__(self, name, sheet, window=None): super(HTMLRowSet, self).__init__() def in_table(self, els): - """ - takes a list of xpath elements and returns only those - whose parent table is this one - """ - + # Accept a list of xpath elements and returns only those + # whose parent table is this one return [e for e in els if self.sheet in e.xpath("./ancestor::table[1]")] @@ -137,17 +130,14 @@ def identify_anatomy(tag): class FakeHTMLCell(Cell): + """FakeHTMLCells are not present because of column or row spannning.""" + def __init__(self): super(FakeHTMLCell, self).__init__("") @property def topleft(self): - """ - FakeHTMLCells are those which are not physically present in the HTML - because of column or row spannning. - - See also: HTMLCell.topleft - """ + """See also: HTMLCell.topleft.""" return False @@ -169,12 +159,7 @@ def __init__(self, value=None, column=None, type=None, source=None): @property def topleft(self): - """ - HTMLCells are those which are physically present in the HTML. They are - always the top-left in their span. - - See also: FakeHTMLCell.topleft - """ + """See also: FakeHTMLCell.topleft.""" return True @property @@ -198,7 +183,7 @@ def text_from_element(elem): """ builder = [] for x in elem.iter(): - #print x.tag, x.attrib, x.text, x.tail + # print x.tag, x.attrib, x.text, x.tail if is_invisible_text(x): cell_str = x.tail or '' # handle None values. else: @@ -216,7 +201,6 @@ def is_invisible_text(elem): if 'style' in elem.attrib: if 'display:none' in elem.attrib['style']: flag = True - return flag diff --git a/messytables/ods.py b/messytables/ods.py index 4351c85..da35d57 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -20,15 +20,15 @@ class ODSTableSet(TableSet): - """ - A wrapper around ODS files. Because they are zipped and the info we want - is in the zipped file as content.xml we must ensure that we either have - a seekable object (local file) or that we retrieve all of the content from - the remote URL. + """A wrapper around ODS files. + + Because they are zipped and the info we want is in the zipped file as + content.xml we must ensure that we either have a seekable object (local + file) or that we retrieve all of the content from the remote URL. """ def __init__(self, fileobj, window=None, **kw): - '''Initialize the object. + """Initialize the object. :param fileobj: may be a file path or a file-like object. Note the file-like object *must* be in binary mode and must be seekable (it will @@ -40,7 +40,7 @@ def __init__(self, fileobj, window=None, **kw): To get a seekable file you *cannot* use messytables.core.seekable_stream as it does not support the full seek functionality. - ''' + """ if hasattr(fileobj, 'read'): # wrap in a StringIO so we do not have hassle with seeks and # binary etc (see notes to __init__ above) @@ -54,13 +54,12 @@ def __init__(self, fileobj, window=None, **kw): zf.close() def make_tables(self): - """ - Return the sheets in the workbook. + """Return the sheets in the workbook. 
- A regex is used for this to avoid having to: + A regex is used for this to avoid having to: - 1. load large the entire file into memory, or - 2. SAX parse the file more than once + 1. load large the entire file into memory, or + 2. SAX parse the file more than once """ namespace_tags = self._get_namespace_tags() sheets = [m.groups(0)[0] @@ -77,8 +76,10 @@ def _get_namespace_tags(self): class ODSRowSet(RowSet): - """ ODS support for a single sheet in the ODS workbook. Unlike - the CSV row set this is not a streaming operation. """ + """ODS support for a single sheet in the ODS workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, sheet, window=None, namespace_tags=None): self.sheet = sheet @@ -119,7 +120,7 @@ def __init__(self, sheet, window=None, namespace_tags=None): super(ODSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. """ + """Iterate over all rows in this sheet.""" rows = ODS_ROW_MATCH.findall(self.sheet) for row in rows: diff --git a/messytables/util.py b/messytables/util.py index df5f2fa..a83d456 100644 --- a/messytables/util.py +++ b/messytables/util.py @@ -1,7 +1,8 @@ def offset_processor(offset): - """ Skip ``offset`` from the given iterator. This can - be used in combination with the ``headers_processor`` to + """Skip ``offset`` from the given iterator. + + This can be used in combination with the ``headers_processor`` to apply the result of a header scan to the table. :param offset: Offset to be skipped @@ -17,7 +18,7 @@ def apply_offset(row_set, row): def null_processor(nulls): - """ Replaces every occurrence of items from `nulls` with None. + """Replace every occurrence of items from `nulls` with None. :param nulls: List of items to be replaced :type nulls: list diff --git a/messytables/zip.py b/messytables/zip.py index 680b44a..a15c90f 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -5,10 +5,10 @@ class ZIPTableSet(TableSet): - """ Reads TableSets from inside a ZIP file """ + """Reads TableSets from inside a ZIP file.""" def __init__(self, fileobj, **kw): - """ On error it will raise ReadError. """ + """On error it will raise ReadError.""" from messytables.any import any_tableset tables = [] found = [] From 910b6c2f52b02070f2a6548d43ef2727d759aab9 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:59:56 +0200 Subject: [PATCH 26/35] Simplify JTS code. --- messytables/jts.py | 48 +++++++++++++++------------------------------- setup.py | 2 +- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/messytables/jts.py b/messytables/jts.py index e2aeb61..1bafb68 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -1,48 +1,30 @@ -''' -Convert a rowset to the json table schema +"""Convert a rowset to the json table schema. 
+ (http://www.dataprotocols.org/en/latest/json-table-schema.html) -''' +""" import jsontableschema -from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean from messytables.headers import headers_guess from messytables.types import type_guess -MESSYTABLES_TO_JTS_MAPPING = { - String: 'string', - Integer: 'integer', - Float: 'number', - Decimal: 'number', - Date: 'date', - DateTime: 'datetime', - Boolean: 'boolean' -} - - -def celltype_as_string(celltype): - return MESSYTABLES_TO_JTS_MAPPING[celltype.__class__] - - def rowset_as_jts(rowset, headers=None, types=None): - ''' Create a json table schema from a rowset - ''' + """Create a json table schema from a rowset.""" _, headers = headers_guess(rowset.sample) - types = list(map(celltype_as_string, - type_guess(rowset.sample))) - + types = type_guess(rowset.sample) + types = [t.jts_name for t in types] return headers_and_typed_as_jts(headers, types) def headers_and_typed_as_jts(headers, types): - ''' Create a json table schema from headers and types as - returned from :meth:`~messytables.headers.headers_guess` - and :meth:`~messytables.types.type_guess`. - ''' - j = jsontableschema.JSONTableSchema() + """Create a json table schema from headers and types. + Those specs are returned from :meth:`~messytables.headers.headers_guess` + and :meth:`~messytables.types.type_guess`. + """ + jts = jsontableschema.JSONTableSchema() for field_id, field_type in zip(headers, types): - j.add_field(field_id=field_id, - label=field_id, - field_type=field_type) - return j + jts.add_field(field_id=field_id, + label=field_id, + field_type=field_type) + return jts diff --git a/setup.py b/setup.py index 33a9edb..6c1e70e 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast>=0.3.1', + 'typecast>=0.3.3', 'six', 'ordereddict', ], From a4c22f3415ac703494a89a5955ed0b09326973b7 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:08:24 +0200 Subject: [PATCH 27/35] pep8 --- messytables/pdf.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/messytables/pdf.py b/messytables/pdf.py index 11aa907..1998ac8 100644 --- a/messytables/pdf.py +++ b/messytables/pdf.py @@ -42,9 +42,8 @@ def properties(self): class PDFTableSet(TableSet): - """ - A TableSet from a PDF document. - """ + """A TableSet from a PDF document.""" + def __init__(self, fileobj=None, filename=None, **kw): if get_tables is None: raise ImportError("pdftables is not installed") @@ -57,9 +56,7 @@ def __init__(self, fileobj=None, filename=None, **kw): self.raw_tables = get_tables(self.fh) def make_tables(self): - """ - Return a listing of tables (as PDFRowSets) in the table set. - """ + """Return a listing of tables in the table set.""" def table_name(table): return "Table {0} of {1} on page {2} of {3}".format( table.table_number_on_page, @@ -71,9 +68,8 @@ def table_name(table): class PDFRowSet(RowSet): - """ - A RowSet representing a PDF table. - """ + """A RowSet representing a PDF table.""" + def __init__(self, name, table): if get_tables is None: raise ImportError("pdftables is not installed") @@ -85,9 +81,7 @@ def __init__(self, name, table): ) def raw(self, sample=False): - """ - Yield one row of cells at a time - """ + """Yield one row of cells at a time.""" if hasattr(self.table, "cell_data"): # New style of cell data. 
for row in self.table.cell_data: From b7b485146eadad8b480e5917ef12d70756452ab4 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:09:51 +0200 Subject: [PATCH 28/35] Move stuff around. --- messytables/commas.py | 23 ++++++++++++++--------- messytables/error.py | 4 ++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/messytables/commas.py b/messytables/commas.py index f0b79e3..6693cef 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -2,7 +2,7 @@ from six import text_type, PY2 -from messytables.buffered import seekable_stream, BUFFER_SIZE +from messytables.buffered import seekable_stream from messytables.text import UTF8Recoder, to_unicode_or_bust from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError @@ -14,8 +14,11 @@ class CSVTableSet(TableSet): - """ A CSV table set. Since CSV is always just a single table, - this is just a pass-through for the row set. """ + """A CSV table set. + + Since CSV is always just a single table, this is just a pass-through for + the row set. + """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, encoding=None, window=None, doublequote=True, @@ -43,10 +46,12 @@ def make_tables(self): class CSVRowSet(RowSet): - """ A CSV row set is an iterator on a CSV file-like object + """A CSV row set is an iterator on a CSV file-like object. + (which can potentially be infinetly large). When loading, a sample is read and cached so you can run analysis on the - fragment. """ + fragment. + """ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=True, @@ -58,6 +63,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, def fake_ilines(fobj): for row in fobj: yield row.decode('utf-8') + self.lines = fake_ilines(self.fileobj) self._sample = [] self.delimiter = delimiter @@ -73,8 +79,7 @@ def fake_ilines(fobj): pass super(CSVRowSet, self).__init__() - @property - def _dialect(self): + def dialect(self): delim = '\n' # NATIVE sample = delim.join(self._sample) try: @@ -86,7 +91,7 @@ def _dialect(self): dialect.skipinitialspace = self.skipinitialspace if self.lineterminator is not None: dialect.lineterminator = self.lineterminator - dialect.doublequote = True + dialect.doublequote = self.doublequote return dialect except csv.Error: return csv.excel @@ -106,7 +111,7 @@ def rows(): yield line try: - for row in csv.reader(rows(), dialect=self._dialect): + for row in csv.reader(rows(), dialect=self.dialect()): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: if u'newline inside string' in text_type(err) and sample: diff --git a/messytables/error.py b/messytables/error.py index 255f4ab..4996bbd 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -13,3 +13,7 @@ class TableError(MessytablesError, LookupError): class NoSuchPropertyError(MessytablesError, KeyError): """The requested property doesn't exist.""" + + +class InvalidDateError(Exception): + """Invalid date in structured data sources.""" From b8f15ed7621b3c36d1246a805eb46ca0ed7cdafb Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:13:25 +0200 Subject: [PATCH 29/35] Formatting. 
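
Besides the PEP8/docstring cleanup, this patch starts referring to the
`typecast` cell types as classes rather than instances in XLS_TYPES and pulls
the Excel date conversion out into `XLSCell.get_xl_date`. Roughly, the
conversion it wraps looks like the sketch below (the file name and cell
position are illustrative, not taken from the test suite):

    from datetime import datetime
    from xlrd import open_workbook, xldate_as_tuple

    book = open_workbook('example.xls')      # hypothetical workbook
    sheet = book.sheet_by_index(0)
    serial = sheet.cell_value(0, 0)          # e.g. 42575.0 for 2016-07-24
    if serial:                               # 0 is treated as "no date"
        # xldate_as_tuple returns (year, month, day, hour, minute, second)
        value = datetime(*xldate_as_tuple(serial, book.datemode))
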
--- messytables/excel.py | 146 ++++++++++++++++++++++--------------------- messytables/text.py | 1 - 2 files changed, 75 insertions(+), 72 deletions(-) diff --git a/messytables/excel.py b/messytables/excel.py index 92bc4ed..28bb235 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -1,70 +1,54 @@ -import sys from datetime import datetime, time - -import xlrd from xlrd.biffh import XLRDError -from six import PY2 +from xlrd import open_workbook, xldate_as_tuple from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties -from messytables.error import ReadError - - -class InvalidDateError(Exception): - pass +from messytables.error import ReadError, InvalidDateError XLS_TYPES = { - 1: String(), + 1: String, # NB: Excel does not distinguish floats from integers so we use floats # We could try actual type detection between floats and ints later # or use the excel format string info - see # https://groups.google.com/forum/?fromgroups=#!topic/ # python-excel/cAQ1ndsCVxk - 2: Float(), - 3: Date(), + 2: Float, + 3: Date, # this is actually boolean but we do not have a boolean type yet - 4: Integer() + 4: Integer } class XLSTableSet(TableSet): - """An excel workbook wrapper object. - """ + """An excel workbook wrapper object.""" def __init__(self, fileobj=None, filename=None, window=None, encoding=None, with_formatting_info=True, **kw): - '''Initialize the tableset. + """Initilize the tableset. :param encoding: passed on to xlrd.open_workbook function as encoding_override - :param with_formatting_info: passed to xlrd to get font details of cells - ''' + :param with_formatting_info: whether xlrd should provide details + of the cells contents (e.g. colour, borders, etc. + Not sure what the behaviour of properties is with this turned off. + Turning this on apparently may have memory implications in xlrd. + + The convoluted "try it with with_formatting_info, then try it without" + is necessary because xlrd doesn't currently support getting this + information from XLSX files. Workarounds include converting the XLSX + document in LibreOffice. + """ def get_workbook(): try: - return xlrd.open_workbook( + return open_workbook( filename=filename, file_contents=read_obj, encoding_override=encoding, formatting_info=with_formatting_info) - except XLRDError: - _, value, traceback = sys.exc_info() - if PY2: - raise ReadError("Can't read Excel file: %r" % value, traceback) - else: - raise ReadError("Can't read Excel file: %r" % value).with_traceback(traceback) - '''Initilize the tableset. + except XLRDError as xlrdexc: + raise ReadError("Can't read Excel file: %r" % xlrdexc) - :param encoding: passed on to xlrd.open_workbook function - as encoding_override - :param with_formatting_info: whether xlrd should provide details - of the cells contents (e.g. colour, borders, etc. - Not sure what the behaviour of properties is with this turned off. - Turning this on apparently may have memory implications in xlrd. - - The convoluted "try it with with_formatting_info, then try it without" is - necessary because xlrd doesn't currently support getting this information - from XLSX files. Workarounds include converting the XLSX document in LibreOffice. - ''' self.window = window if not filename and not fileobj: @@ -81,19 +65,20 @@ def get_workbook(): if not with_formatting_info: raise else: - with_formatting_info=False + with_formatting_info = False self.workbook = get_workbook() - def make_tables(self): - """ Return the sheets in the workbook. 
""" + """Return the sheets in the workbook.""" return [XLSRowSet(name, self.workbook.sheet_by_name(name), self.window) for name in self.workbook.sheet_names()] class XLSRowSet(RowSet): - """ Excel support for a single sheet in the excel workbook. Unlike - the CSV row set this is not a streaming operation. """ + """Excel support for a single sheet in the excel workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, name, sheet, window=None): self.name = name @@ -102,38 +87,47 @@ def __init__(self, name, sheet, window=None): super(XLSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. Types are automatically - converted according to the excel data types specified, including - conversion of excel dates, which are notoriously buggy. """ + """Iterate over all rows in this sheet. + + Types are automatically converted according to the excel data types + specified, including conversion of excel dates, which are notoriously + buggy. + """ num_rows = self.sheet.nrows - for rownum in range(min(self.window, num_rows) if sample else num_rows): + num_rows = min(self.window, num_rows) if sample else num_rows + for rownum in xrange(num_rows): row = [] for colnum, cell in enumerate(self.sheet.row(rownum)): try: - row.append(XLSCell.from_xlrdcell(cell, self.sheet, colnum, rownum)) + row.append(XLSCell.from_xlrdcell(cell, self.sheet, + colnum, rownum)) except InvalidDateError: - raise ValueError("Invalid date at '%s':%d,%d" % ( - self.sheet.name, colnum+1, rownum+1)) + raise ValueError("Invalid date at '%s':%d,%d" % + (self.sheet.name, colnum + 1, rownum + 1)) yield row class XLSCell(Cell): - @staticmethod - def from_xlrdcell(xlrd_cell, sheet, col, row): + + @classmethod + def get_xl_date(cls, sheet, value): + if value == 0: + return None + date = xldate_as_tuple(value, sheet.book.datemode) + year, month, day, hour, minute, second = date + return datetime(year, month, day, hour, minute, second) + + @classmethod + def from_xlrdcell(cls, xlrd_cell, sheet, col, row): value = xlrd_cell.value - cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) - if cell_type == Date(): - if value != 0: - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) - messy_cell = XLSCell(value, type=cell_type) + cell_type = XLS_TYPES.get(xlrd_cell.ctype, String) + if cell_type == Date: + value = cls.get_xl_date(sheet, value) + messy_cell = XLSCell(value, type=cell_type()) messy_cell.sheet = sheet messy_cell.xlrd_cell = xlrd_cell - messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y) + # necessary for properties, note not (x,y) + messy_cell.xlrd_pos = (row, col) return messy_cell @property @@ -146,6 +140,7 @@ def properties(self): class XLSProperties(CoreProperties): + KEYS = ['bold', 'size', 'italic', 'font_name', 'strikeout', 'underline', 'font_colour', 'background_colour', 'any_border', 'all_border', 'richtext', 'blank', 'a_date', 'formatting_string'] @@ -168,13 +163,19 @@ def formatting(self): @property def rich(self): - """returns a tuple of character position, font number which starts at that position - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute""" - return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, None) + """Return a tuple of character 
position, font number. + + Starts at that position: + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute + """ + return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, + None) def raw_span(self, always=False): - """return the bounding box of the cells it's part of. - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute""" + """Return the bounding box of the cells it's part of. + + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute + """ row, col = self.cell.xlrd_pos for box in self.cell.sheet.merged_cells: rlo, rhi, clo, chi = box @@ -210,7 +211,7 @@ def get_bold(self): return self.font.weight > 500 def get_size(self): - """in pixels""" + """In pixels.""" return self.font.height / 20.0 def get_italic(self): @@ -227,15 +228,18 @@ def get_underline(self): def get_font_colour(self): # TODO - return self.font.color_index ## more lookup required + return self.font.color_index # more lookup required def get_blank(self): """Note that cells might not exist at all. - Behaviour for spanned cells might be complicated: hence this function""" + + Behaviour for spanned cells might be complicated: hence this function + """ return self.cell.value == '' def get_background_colour(self): - return self.xf.background.background_color_index ## more lookup required + # more lookup required: + return self.xf.background.background_color_index def get_any_border(self): b = self.xf.border diff --git a/messytables/text.py b/messytables/text.py index 17c8097..d7938e0 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -19,7 +19,6 @@ class UTF8Recoder: 'utf-32be': 'BOM_UTF32_BE', 'utf-8': 'BOM_UTF8', 'utf-8-sig': 'BOM_UTF8', - } def __init__(self, f, encoding): From 3c96240275f49697927b89743e5e228104f795fa Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 18:26:41 +0200 Subject: [PATCH 30/35] Replace CSV reader with a fully streaming implementation. 
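
The new CSVRowSet detects the character encoding once from an initial sample
(see analyze_stream in messytables/text.py) and then splits the raw byte
stream on line terminators itself, decoding one line at a time, instead of
wrapping the stream in a codecs reader. A rough standalone sketch of that
splitting loop -- the names, buffer size and sample input are illustrative,
not the actual messytables implementation:

    import csv
    import io
    import re

    SEPARATORS = [b'\r\n', b'\r', b'\n', b'\0']
    LINESEP = re.compile(b'(' + b'|'.join(re.escape(s) for s in SEPARATORS) + b')')

    def iter_lines(fileobj, encoding='utf-8', chunk_size=4096):
        """Yield decoded lines without holding the whole file in memory."""
        buf = b''
        while True:
            match = LINESEP.search(buf)
            if match is None:
                data = fileobj.read(chunk_size)
                if not data:        # EOF: flush whatever is left in the buffer
                    if buf:
                        yield buf.decode(encoding)
                    return
                buf += data
                continue
            # split off everything up to and including the terminator
            line, buf = buf[:match.end()], buf[match.end():]
            yield line.decode(encoding)

    rows = csv.reader(iter_lines(io.BytesIO(b'a,b\r\n1,2\r\n')))
    print(list(rows))  # [['a', 'b'], ['1', '2']]
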
--- .gitignore | 1 + messytables/buffered.py | 2 +- messytables/commas.py | 144 +++++++++++++++++++--------------------- messytables/text.py | 86 ++++++++---------------- test/test_guessing.py | 2 +- test/test_read.py | 2 +- 6 files changed, 99 insertions(+), 138 deletions(-) diff --git a/.gitignore b/.gitignore index 2df0131..33f6a3f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ *.~lock.*# .coverage dist/* +.tox/* pyenv3 diff --git a/messytables/buffered.py b/messytables/buffered.py index dea877f..dd4daf8 100644 --- a/messytables/buffered.py +++ b/messytables/buffered.py @@ -17,7 +17,7 @@ def seekable_stream(fileobj): class BufferedFile(object): """A buffered file that preserves the beginning of a stream.""" - def __init__(self, fp, buffer_size=BUFFER_SIZE): + def __init__(self, fp, buffer_size=BUFFER_SIZE + 2): self.data = io.BytesIO() self.fp = fp self.offset = 0 diff --git a/messytables/commas.py b/messytables/commas.py index 6693cef..add6049 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,16 +1,18 @@ +import re import csv +import logging -from six import text_type, PY2 - -from messytables.buffered import seekable_stream -from messytables.text import UTF8Recoder, to_unicode_or_bust +from messytables.buffered import BUFFER_SIZE +from messytables.text import analyze_stream from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError DELIMITERS = ['\t', ',', ';', '|'] +TERMINATORS = ['\r\n', '\r', '\n', '\0'] # Fix the maximum field size to something a little larger csv.field_size_limit(256000) +log = logging.getLogger(__name__) class CSVTableSet(TableSet): @@ -21,28 +23,19 @@ class CSVTableSet(TableSet): """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, - encoding=None, window=None, doublequote=True, - lineterminator=None, skipinitialspace=None, **kw): - self.fileobj = seekable_stream(fileobj) - self.name = name or 'table' - self.delimiter = delimiter - self.quotechar = quotechar - self.encoding = encoding - self.window = window - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + self._tables = [CSVRowSet(name or 'table', fileobj, + delimiter=delimiter, + quotechar=quotechar, + encoding=encoding, + window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace)] def make_tables(self): """Return the actual CSV table.""" - return [CSVRowSet(self.name, self.fileobj, - delimiter=self.delimiter, - quotechar=self.quotechar, - encoding=self.encoding, - window=self.window, - doublequote=self.doublequote, - lineterminator=self.lineterminator, - skipinitialspace=self.skipinitialspace)] + return self._tables class CSVRowSet(RowSet): @@ -54,69 +47,66 @@ class CSVRowSet(RowSet): """ def __init__(self, name, fileobj, delimiter=None, quotechar=None, - encoding='utf-8', window=None, doublequote=True, - lineterminator=None, skipinitialspace=None): + encoding=None, window=1000, doublequote=None, + skipinitialspace=None): self.name = name - self.fh = seekable_stream(fileobj) - self.fileobj = UTF8Recoder(self.fh, encoding) + self.encoding, self.buf = analyze_stream(fileobj, encoding=encoding) + self.fileobj = fileobj - def fake_ilines(fobj): - for row in fobj: - yield row.decode('utf-8') + # For line breaking, use the (detected) encoding of the file: + terminators = [t.encode(self.encoding) for t in TERMINATORS] + self.terminators_re = re.compile('(%s)' % 
'|'.join(terminators)) - self.lines = fake_ilines(self.fileobj) self._sample = [] - self.delimiter = delimiter - self.quotechar = quotechar - self.window = window or 1000 - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace - try: - for i in range(self.window): - self._sample.append(next(self.lines)) - except StopIteration: - pass - super(CSVRowSet, self).__init__() + self.window = window - def dialect(self): - delim = '\n' # NATIVE - sample = delim.join(self._sample) try: - dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) - dialect.delimiter = self.delimiter or str(dialect.delimiter) - dialect.quotechar = self.quotechar or str(dialect.quotechar) - dialect.lineterminator = self.lineterminator or delim - if self.skipinitialspace is not None: - dialect.skipinitialspace = self.skipinitialspace - if self.lineterminator is not None: - dialect.lineterminator = self.lineterminator - dialect.doublequote = self.doublequote - return dialect + sample = self.buf.decode(self.encoding).encode('utf-8') + self.dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) except csv.Error: - return csv.excel + self.dialect = csv.excel + # override detected dialect with constructor values. + self.dialect.delimiter = delimiter or str(self.dialect.delimiter) + self.dialect.quotechar = quotechar or str(self.dialect.quotechar) + if skipinitialspace is not None: + self.dialect.skipinitialspace = skipinitialspace + if doublequote is not None: + self.dialect.doublequote = doublequote + super(CSVRowSet, self).__init__() - def raw(self, sample=False): - def rows(): - for line in self._sample: - if PY2: - yield line.encode('utf-8') + def get_lines(self, sample=False): + for line in self._sample: + yield line + + while True: + if self.buf is None: + break + if sample and len(self._sample) >= self.window: + break + match = self.terminators_re.search(self.buf) + if match is not None: + line = self.buf[:match.end(0)] + self.buf = self.buf[match.end(0):] + else: + buf = self.fileobj.read(BUFFER_SIZE) + if len(buf): + self.buf += buf + continue else: - yield line - if not sample: - for line in self.lines: - if PY2: - yield line.encode('utf-8') - else: - yield line + line, self.buf = self.buf, None + + line = line.decode(self.encoding).encode('utf-8') + if line in TERMINATORS or not len(line): + continue + if self.window >= len(self._sample): + self._sample.append(line) + yield line + + def raw(self, sample=False): try: - for row in csv.reader(rows(), dialect=self.dialect()): - yield [Cell(to_unicode_or_bust(c)) for c in row] + for row in csv.reader(self.get_lines(sample=sample), + dialect=self.dialect): + yield [Cell(c.decode('utf-8')) for c in row] except csv.Error as err: - if u'newline inside string' in text_type(err) and sample: - pass - elif u'line contains NULL byte' in text_type(err): - pass - else: - raise ReadError('Error reading CSV: %r', err) + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/text.py b/messytables/text.py index d7938e0..ee71179 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -3,64 +3,34 @@ import cchardet as chardet except ImportError: import chardet -from six import text_type, binary_type from messytables.buffered import BUFFER_SIZE - -class UTF8Recoder: - """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" - - # maps between chardet encoding and codecs bom keys - BOM_MAPPING = { - 'utf-16le': 'BOM_UTF16_LE', - 'utf-16be': 'BOM_UTF16_BE', - 'utf-32le': 'BOM_UTF32_LE', 
- 'utf-32be': 'BOM_UTF32_BE', - 'utf-8': 'BOM_UTF8', - 'utf-8-sig': 'BOM_UTF8', - } - - def __init__(self, f, encoding): - sample = f.read(BUFFER_SIZE) - if not encoding: - encoding = chardet.detect(sample).get('encoding') or 'utf-8' - f.seek(0) - self.reader = codecs.getreader(encoding)(f, 'ignore') - - # The reader only skips a BOM if the encoding isn't explicit about its - # endianness (i.e. if encoding is UTF-16 a BOM is handled properly - # and taken out, but if encoding is UTF-16LE a BOM is ignored). - # However, if chardet sees a BOM it returns an encoding with the - # endianness explicit, which results in the codecs stream leaving the - # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} - # encodings, check for a BOM and remove it if it's there. - if encoding.lower() in self.BOM_MAPPING: - bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) - if bom: - # Try to read the BOM, which is a byte sequence, from - # the underlying stream. If all characters match, then - # go on. Otherwise when a character doesn't match, seek - # the stream back to the beginning and go on. - for c in bom: - if f.read(1) != c: - f.seek(0) - break - - def __iter__(self): - return self - - def __next__(self): - line = self.reader.readline() - if not line or line == '\0': - raise StopIteration - result = line.encode("utf-8") - return result - - next = __next__ - - -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, binary_type): - obj = text_type(obj, encoding) - return obj +# maps between chardet encoding and codecs bom keys +BOM_MAPPING = { + 'utf-16le': 'BOM_UTF16_LE', + 'utf-16be': 'BOM_UTF16_BE', + 'utf-32le': 'BOM_UTF32_LE', + 'utf-32be': 'BOM_UTF32_BE', + 'utf-8': 'BOM_UTF8', + 'utf-8-sig': 'BOM_UTF8', +} + + +def analyze_stream(stream, encoding=None): + sample = stream.read(BUFFER_SIZE) + if encoding is None: + encoding = chardet.detect(sample).get('encoding') or 'utf-8' + encoding = encoding.lower() + # The reader only skips a BOM if the encoding isn't explicit about its + # endianness (i.e. if encoding is UTF-16 a BOM is handled properly + # and taken out, but if encoding is UTF-16LE a BOM is ignored). + # However, if chardet sees a BOM it returns an encoding with the + # endianness explicit, which results in the codecs stream leaving the + # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} + # encodings, check for a BOM and remove it if it's there. 
+ if encoding in BOM_MAPPING: + bom = getattr(codecs, BOM_MAPPING[encoding], None) + if sample[:len(bom)] == bom: + return encoding, sample[len(bom):] + return encoding, sample diff --git a/test/test_guessing.py b/test/test_guessing.py index 141a3ff..2d3e7a4 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -100,7 +100,7 @@ def test_guessing_uses_first_in_case_of_tie(self): @attr("slow") def test_strict_type_guessing_with_large_file(self): fh = horror_fobj('211.csv') - rows = CSVTableSet(fh).tables[0] + rows = CSVTableSet(fh, encoding='iso-8859-2').tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) types = [String, Integer, Decimal, Date] diff --git a/test/test_read.py b/test/test_read.py index ac9b384..786c901 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -211,7 +211,7 @@ def test_guess_headers(self): row_set.register_processor(headers_processor(['foo', 'bar'])) data = list(row_set) assert 'foo' in data[12][0].column, data[12][0] - assert 'Chirurgie' in data[12][0].value, data[12][0].value + assert 'Chirurgie' in data[10][0].value, data[10][0].value def test_read_encoded_characters_csv(self): fh = horror_fobj('characters.csv') From ce3627c0a9eecb74af79a180c07c706c0f5353dc Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 19:17:36 +0200 Subject: [PATCH 31/35] Fix up Python 3 support --- messytables/__init__.py | 2 +- messytables/any.py | 7 ++----- messytables/commas.py | 46 +++++++++++++++++++++++++++++++++-------- messytables/excel.py | 2 +- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index 014a095..c1ca1ba 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -7,7 +7,7 @@ from messytables.buffered import seekable_stream from messytables.core import Cell, TableSet, RowSet -from messytables.commas import CSVTableSet, CSVRowSet +from messytables.commas import CSVTableSet, CSVRowSet, TSVTableSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet from messytables.zip import ZIPTableSet diff --git a/messytables/any.py b/messytables/any.py index 9d305ee..802aeb6 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,7 +1,7 @@ import re from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet -from messytables import HTMLTableSet, ODSTableSet +from messytables import HTMLTableSet, ODSTableSet, TSVTableSet from messytables.buffered import seekable_stream from messytables.error import ReadError @@ -32,10 +32,7 @@ } -def TABTableSet(fileobj): - return CSVTableSet(fileobj, delimiter='\t') - -parsers = {'TAB': TABTableSet, +parsers = {'TAB': TSVTableSet, 'ZIP': ZIPTableSet, 'XLS': XLSTableSet, 'HTML': HTMLTableSet, diff --git a/messytables/commas.py b/messytables/commas.py index add6049..27261a5 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,5 +1,6 @@ import re import csv +import six import logging from messytables.buffered import BUFFER_SIZE @@ -8,7 +9,7 @@ from messytables.error import ReadError DELIMITERS = ['\t', ',', ';', '|'] -TERMINATORS = ['\r\n', '\r', '\n', '\0'] +LINE_SEPARATOR = ['\r\n', '\r', '\n', '\0'] # Fix the maximum field size to something a little larger csv.field_size_limit(256000) @@ -38,6 +39,24 @@ def make_tables(self): return self._tables +class TSVTableSet(CSVTableSet): + """A TSV table set. 
+ + This is a slightly specialised version of the CSVTableSet that will always + generate a tab-based table parser. + """ + + def __init__(self, fileobj, quotechar=None, name=None, + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + super(TSVTableSet, self).__init__(fileobj, delimiter='\t', + quotechar=quotechar, name=name, + encoding=encoding, window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace, + **kw) + + class CSVRowSet(RowSet): """A CSV row set is an iterator on a CSV file-like object. @@ -54,14 +73,17 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, self.fileobj = fileobj # For line breaking, use the (detected) encoding of the file: - terminators = [t.encode(self.encoding) for t in TERMINATORS] - self.terminators_re = re.compile('(%s)' % '|'.join(terminators)) + linesep = [t.encode(self.encoding) for t in LINE_SEPARATOR] + linesep = b'(%s)' % b'|'.join(linesep) + self.linesep = re.compile(linesep) self._sample = [] self.window = window try: - sample = self.buf.decode(self.encoding).encode('utf-8') + sample = self.buf.decode(self.encoding) + if six.PY2: + sample = sample.encode('utf-8') self.dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) except csv.Error: self.dialect = csv.excel @@ -83,7 +105,7 @@ def get_lines(self, sample=False): break if sample and len(self._sample) >= self.window: break - match = self.terminators_re.search(self.buf) + match = self.linesep.search(self.buf) if match is not None: line = self.buf[:match.end(0)] self.buf = self.buf[match.end(0):] @@ -95,8 +117,11 @@ def get_lines(self, sample=False): else: line, self.buf = self.buf, None - line = line.decode(self.encoding).encode('utf-8') - if line in TERMINATORS or not len(line): + line = line.decode(self.encoding) + if six.PY2: + line = line.encode('utf-8') + + if line in LINE_SEPARATOR or not len(line): continue if self.window >= len(self._sample): @@ -107,6 +132,9 @@ def raw(self, sample=False): try: for row in csv.reader(self.get_lines(sample=sample), dialect=self.dialect): - yield [Cell(c.decode('utf-8')) for c in row] + if six.PY2: + row = [c.decode('utf-8') for c in row] + yield [Cell(c) for c in row] except csv.Error as err: - raise ReadError('Error reading CSV: %r', err) + if 'new-line character' not in repr(err): + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/excel.py b/messytables/excel.py index 28bb235..93c8004 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -95,7 +95,7 @@ def raw(self, sample=False): """ num_rows = self.sheet.nrows num_rows = min(self.window, num_rows) if sample else num_rows - for rownum in xrange(num_rows): + for rownum in range(num_rows): row = [] for colnum, cell in enumerate(self.sheet.row(rownum)): try: From 7dd9e5b8dc784c8cc3fa79f78ff9ab8244831e84 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 19:23:48 +0200 Subject: [PATCH 32/35] confirm at least python 3.5 is working --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 25aaf2b..9399f16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - "2.7" - - "3.4" + - "3.5" install: # Fix for html5lib, probably can be removed after the version after # 0.999999999/1.0b10 is released. 
From 6cd1222754bf1292e5785e37a9ebd33a2174b3ed Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:07:10 +0100
Subject: [PATCH 33/35] Readd Python 3.4 to Travis

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 9399f16..e6af8c8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,7 @@
 language: python
 python:
   - "2.7"
+  - "3.4"
   - "3.5"
 install:
   # Fix for html5lib, probably can be removed after the version after

From 506269e403b915165973dc38fb47f91d279f8795 Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:16:48 +0100
Subject: [PATCH 34/35] Fix missing comma in setup.py

Probably occurred during rebasing.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6c1e70e..8418bba 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ 'lxml>=3.2',
         'requests>=2.0',
         'html5lib',
-        'json-table-schema>=0.2, <=0.2.1'
+        'json-table-schema>=0.2, <=0.2.1',
         'typecast>=0.3.3',
         'six',
         'ordereddict',
     ],

From 6638e58bf88f04d0d83c8e373b85e245b4f0e5f4 Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:32:54 +0100
Subject: [PATCH 35/35] Fix byte concatenation in Python 3.4

This line worked in Python 3.5 because of PEP 461:
https://www.python.org/dev/peps/pep-0461/
but not in Python 3.4.
---
 messytables/commas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/messytables/commas.py b/messytables/commas.py
index 27261a5..1a75613 100644
--- a/messytables/commas.py
+++ b/messytables/commas.py
@@ -74,7 +74,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None,
 
         # For line breaking, use the (detected) encoding of the file:
         linesep = [t.encode(self.encoding) for t in LINE_SEPARATOR]
-        linesep = b'(%s)' % b'|'.join(linesep)
+        linesep = b'(' + b'|'.join(linesep) + b')'
         self.linesep = re.compile(linesep)
 
         self._sample = []
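
A small illustration of the incompatibility this final patch works around
(the snippet below is an illustrative sketch, not code from the repository):
%-formatting for bytes exists on Python 2.7 and, via PEP 461, on Python 3.5+,
so on Python 3.3/3.4 the old line raises a TypeError and plain concatenation
is used instead.

    parts = [b'\r\n', b'\r', b'\n', b'\0']
    # pattern = b'(%s)' % b'|'.join(parts)    # OK on 2.7 and 3.5+, TypeError on 3.4
    pattern = b'(' + b'|'.join(parts) + b')'  # works on every supported interpreter
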