From 1390f094f64f343cee900a7cad1dd0a0fb425313 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Thu, 6 Aug 2015 15:17:26 +0200 Subject: [PATCH 01/35] Use `typecast` for type conversion. --- Makefile | 5 +- messytables/dateparser.py | 66 --------- messytables/types.py | 282 ++++++-------------------------------- setup.py | 9 +- test/test_guessing.py | 4 +- test/test_read.py | 1 + 6 files changed, 58 insertions(+), 309 deletions(-) delete mode 100644 messytables/dateparser.py diff --git a/Makefile b/Makefile index c5cf657..8214231 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ run: build build: @docker build -t messytables . -.PHONY: run build +test: + nosetests --with-coverage --cover-package=messytables --cover-erase + +.PHONY: run build test diff --git a/messytables/dateparser.py b/messytables/dateparser.py deleted file mode 100644 index 05d7c93..0000000 --- a/messytables/dateparser.py +++ /dev/null @@ -1,66 +0,0 @@ -import re - -date_regex = re.compile(r'''^\d{1,4}[-\/\.\s]\S+[-\/\.\s]\S+''') - - -def is_date(value): - return len(value) != 1 and date_regex.match(value) - - -def create_date_formats(day_first=True): - """generate combinations of time and date - formats with different delimeters - """ - - if day_first: - date_formats = ['dd/mm/yyyy', 'dd/mm/yy', 'yyyy/mm/dd'] - python_date_formats = ['%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d'] - else: - date_formats = ['mm/dd/yyyy', 'mm/dd/yy', 'yyyy/mm/dd'] - python_date_formats = ['%m/%d/%Y', '%m/%d/%y', '%Y/%m/%d'] - - date_formats += [ - # Things with words in - 'dd/bb/yyyy', 'dd/bbb/yyyy' - ] - python_date_formats += [ - # Things with words in - '%d/%b/%Y', '%d/%B/%Y' - ] - - both_date_formats = list(zip(date_formats, python_date_formats)) - - #time_formats = "hh:mmz hh:mm:ssz hh:mmtzd hh:mm:sstzd".split() - time_formats = "hh:mm:ssz hh:mm:ss hh:mm:sstzd".split() - python_time_formats = "%H:%M%Z %H:%M:%S %H:%M:%S%Z %H:%M%z %H:%M:%S%z".split() - both_time_formats = list(zip(time_formats, python_time_formats)) - - #date_separators = ["-","."," ","","/","\\"] - date_separators = ["-", ".", "/", " "] - - all_date_formats = [] - - for separator in date_separators: - for date_format, python_date_format in both_date_formats: - all_date_formats.append( - (date_format.replace("/", separator), - python_date_format.replace("/", separator)) - ) - - all_formats = {} - - for date_format, python_date_format in all_date_formats: - all_formats[date_format] = python_date_format - for time_format, python_time_format in both_time_formats: - - all_formats[date_format + time_format] = \ - python_date_format + python_time_format - - all_formats[date_format + "T" + time_format] =\ - python_date_format + "T" + python_time_format - - all_formats[date_format + " " + time_format] =\ - python_date_format + " " + python_time_format - return list(all_formats.values()) - -DATE_FORMATS = create_date_formats() diff --git a/messytables/types.py b/messytables/types.py index ba017f3..29a4356 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,201 +1,30 @@ -import decimal -import datetime from collections import defaultdict -from messytables.compat23 import izip_longest, unicode_string, string_types -import locale -import sys - -import dateutil.parser as parser - -from messytables.dateparser import DATE_FORMATS, is_date - - -class CellType(object): - """ A cell type maintains information about the format - of the cell, providing methods to check if a type is - applicable to a given value and to convert a value to the - type. 
""" - - guessing_weight = 1 - # the type that the result will have - result_type = None - - def test(self, value): - """ Test if the value is of the given type. The - default implementation calls ``cast`` and checks if - that throws an exception. True or False""" - if isinstance(value, self.result_type): - return True - try: - self.cast(value) - return True - except: - return False - - @classmethod - def instances(cls): - return [cls()] - - def cast(self, value): - """ Convert the value to the type. This may throw - a quasi-random exception if conversion fails. """ - return value - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __hash__(self): - return hash(self.__class__) - - def __repr__(self): - return self.__class__.__name__.rsplit('Type', 1)[0] - - -class StringType(CellType): - """ A string or other unconverted type. """ - result_type = unicode_string - - def cast(self, value): - if value is None: - return None - if isinstance(value, self.result_type): - return value - try: - return unicode_string(value) - except UnicodeEncodeError: - return str(value) - - -class IntegerType(CellType): - """ An integer field. """ - guessing_weight = 6 - result_type = int - - def cast(self, value): - if value in ('', None): - return None - - try: - value = float(value) - except: - return locale.atoi(value) - - if value.is_integer(): - return int(value) - else: - raise ValueError('Invalid integer: %s' % value) - - -class DecimalType(CellType): - """ Decimal number, ``decimal.Decimal`` or float numbers. """ - guessing_weight = 4 - result_type = decimal.Decimal - - def cast(self, value): - if value in ('', None): - return None - try: - return decimal.Decimal(value) - except: - value = locale.atof(value) - if sys.version_info < (2, 7): - value = str(value) - return decimal.Decimal(value) - - -class FloatType(DecimalType): - """ FloatType is deprecated """ - pass - - -class BoolType(CellType): - """ A boolean field. Matches true/false, yes/no and 0/1 by default, - but a custom set of values can be optionally provided. - """ - guessing_weight = 7 - result_type = bool - true_values = ('yes', 'true', '0') - false_values = ('no', 'false', '1') - - def __init__(self, true_values=None, false_values=None): - if true_values is not None: - self.true_values = true_values - if false_values is not None: - self.false_values = false_values - - def cast(self, value): - s = value.strip().lower() - if value in ('', None): - return None - if s in self.true_values: - return True - if s in self.false_values: - return False - raise ValueError - - -class DateType(CellType): - """ The date type is special in that it also includes a specific - date format that is used to parse the date, additionally to the - basic type information. 
""" - guessing_weight = 3 - formats = DATE_FORMATS - result_type = datetime.datetime - - def __init__(self, format): - self.format = format - - @classmethod - def instances(cls): - return [cls(v) for v in cls.formats] - - def test(self, value): - if isinstance(value, string_types) and not is_date(value): - return False - return CellType.test(self, value) - - def cast(self, value): - if isinstance(value, self.result_type): - return value - if value in ('', None): - return None - if self.format is None: - return value - return datetime.datetime.strptime(value, self.format) - - def __eq__(self, other): - return (isinstance(other, DateType) and - self.format == other.format) - - def __repr__(self): - return "Date(%s)" % self.format - - def __hash__(self): - return hash(self.__class__) + hash(self.format) - - -class DateUtilType(CellType): - """ The date util type uses the dateutil library to - parse the dates. The advantage of this type over - DateType is the speed and better date detection. However, - it does not offer format detection. - - Do not use this together with the DateType""" - guessing_weight = 3 - result_type = datetime.datetime - - def test(self, value): - if len(value) == 1: - return False - return CellType.test(self, value) - - def cast(self, value): - if value in ('', None): - return None - return parser.parse(value) - - -TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType] +from messytables.compat23 import izip_longest + +import typecast + +# For legacy support: +StringType = typecast.String +IntegerType = typecast.Integer +DecimalType = typecast.Decimal +FloatType = typecast.Decimal +BoolType = typecast.Boolean +DateType = typecast.Date +DateTimeType = typecast.DateTime +DateUtilType = typecast.Date + + +WEIGHTS = { + typecast.String: 1, + typecast.Integer: 6, + typecast.Decimal: 4, + typecast.Boolean: 7, + typecast.Date: 3, + typecast.DateTime: 3 +} +TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, + DateTimeType] +FAILED = 'failed' def type_guess(rows, types=TYPES, strict=False): @@ -209,54 +38,31 @@ def type_guess(rows, types=TYPES, strict=False): if parsing fails for a single cell in the column.""" guesses = [] type_instances = [i for t in types for i in t.instances()] - if strict: - at_least_one_value = [] - for ri, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - typesdict = {} - for type in type_instances: - typesdict[type] = 0 - guesses.append(typesdict) - at_least_one_value.append(False) - for ci, cell in enumerate(row): - if not cell.value: + for i, row in enumerate(rows): + diff = len(row) - len(guesses) + for _ in range(diff): + guesses.append(defaultdict(int)) + for i, cell in enumerate(row): + # add string guess so that we have at least one guess + guesses[i][StringType()] = guesses[i].get(StringType(), 0) + for type in type_instances: + if guesses[i][type] == FAILED: continue - at_least_one_value[ci] = True - for type in list(guesses[ci].keys()): - if not type.test(cell.value): - guesses[ci].pop(type) - # no need to set guessing weights before this - # because we only accept a type if it never fails - for i, guess in enumerate(guesses): - for type in guess: - guesses[i][type] = type.guessing_weight - # in case there were no values at all in the column, - # we just set the guessed type to string - for i, v in enumerate(at_least_one_value): - if not v: - guesses[i] = {StringType(): 0} - else: - for i, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - 
guesses.append(defaultdict(int)) - for i, cell in enumerate(row): - # add string guess so that we have at least one guess - guesses[i][StringType()] = guesses[i].get(StringType(), 0) - if not cell.value: - continue - for type in type_instances: - if type.test(cell.value): - guesses[i][type] += type.guessing_weight - _columns = [] + result = type.test(cell.value) == 1 + weight = WEIGHTS[type.__class__] + if strict and not result and not isinstance(type, StringType): + guesses[i][type] = FAILED + elif result: + guesses[i][type] += weight + _columns = [] for guess in guesses: # this first creates an array of tuples because we want the types to be # sorted. Even though it is not specified, python chooses the first # element in case of a tie # See: http://stackoverflow.com/a/6783101/214950 - guesses_tuples = [(t, guess[t]) for t in type_instances if t in guess] + guesses_tuples = [(t, guess[t]) for t in type_instances + if t in guess and guess[t] != FAILED] _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns diff --git a/setup.py b/setup.py index 16fdb73..bff0284 100644 --- a/setup.py +++ b/setup.py @@ -44,12 +44,17 @@ 'chardet>=2.3.0', 'python-dateutil>=1.5.0', 'lxml>=3.2', - 'requests', + 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' + 'typecast', ], extras_require={'pdf': ['pdftables>=0.0.4']}, - tests_require=[], + tests_require=[ + 'nose', + 'httpretty', + 'coverage' + ], entry_points=\ """ """, diff --git a/test/test_guessing.py b/test/test_guessing.py index b843c4e..7cd1d53 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -135,11 +135,11 @@ def test_file_with_few_strings_among_integers(self): def test_integer_and_float_detection(self): def helper(value): - return any(i.test(value) for i in IntegerType.instances()) + return any(i.test(value) == 1 for i in IntegerType.instances()) assert_equal(helper(123), True) assert_equal(helper('123'), True) assert_equal(helper(123.0), True) - assert_equal(helper('123.0'), True) + assert_equal(helper('123.0'), False) assert_equal(helper(123.1), False) assert_equal(helper('123.1'), False) diff --git a/test/test_read.py b/test/test_read.py index f4b73d1..a79a358 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -23,6 +23,7 @@ stringy = type(u'') class ReadCsvTest(unittest.TestCase): + def test_utf8bom_lost(self): fh = horror_fobj('utf8bom.csv') table_set = CSVTableSet(fh) From 879dc695440974f76c2374a559775b108cc16340 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:52:16 +0200 Subject: [PATCH 02/35] Fix up type guessing tests. 
--- messytables/__init__.py | 3 ++ messytables/types.py | 14 ++++--- test/test_guessing.py | 91 +++++++++++++++++++---------------------- test/test_read.py | 6 +-- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..30a3690 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -23,3 +23,6 @@ from messytables.any import any_tableset, AnyTableSet from messytables.jts import rowset_as_jts, headers_and_typed_as_jts + +import warnings +warnings.filterwarnings('ignore', "Coercing non-XML name") diff --git a/messytables/types.py b/messytables/types.py index 29a4356..96aeaf0 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -17,10 +17,10 @@ WEIGHTS = { typecast.String: 1, typecast.Integer: 6, - typecast.Decimal: 4, + typecast.Decimal: 3, typecast.Boolean: 7, - typecast.Date: 3, - typecast.DateTime: 3 + typecast.Date: 4, + typecast.DateTime: 5 } TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, DateTimeType] @@ -48,11 +48,12 @@ def type_guess(rows, types=TYPES, strict=False): for type in type_instances: if guesses[i][type] == FAILED: continue - result = type.test(cell.value) == 1 + result = type.test(cell.value) weight = WEIGHTS[type.__class__] - if strict and not result and not isinstance(type, StringType): + if strict and (result == -1) and \ + (not isinstance(type, StringType)): guesses[i][type] = FAILED - elif result: + elif result == 1: guesses[i][type] += weight _columns = [] @@ -63,6 +64,7 @@ def type_guess(rows, types=TYPES, strict=False): # See: http://stackoverflow.com/a/6783101/214950 guesses_tuples = [(t, guess[t]) for t in type_instances if t in guess and guess[t] != FAILED] + # print 'GUESSES', zip(row, guesses_tuples) _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns diff --git a/test/test_guessing.py b/test/test_guessing.py index 7cd1d53..351ab19 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -5,10 +5,9 @@ from . 
import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal +from typecast import Date, String, Decimal, Integer, Boolean from messytables import (CSVTableSet, type_guess, headers_guess, - offset_processor, DateType, StringType, - DecimalType, IntegerType, - DateUtilType, BoolType) + offset_processor) class TypeGuessTest(unittest.TestCase): @@ -25,8 +24,8 @@ def test_type_guess(self): guessed_types = type_guess(rows.sample) assert_equal(guessed_types, [ - DecimalType(), DateType('%Y/%m/%d'), IntegerType(), - DateType('%d %B %Y'), BoolType(), BoolType()]) + Decimal(), Date('%Y/%m/%d'), Integer(), + Date('%d %b %Y'), Boolean(), Integer()]) def test_type_guess_strict(self): import locale @@ -40,9 +39,9 @@ def test_type_guess_strict(self): rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample, strict=True) assert_equal(guessed_types, [ - StringType(), StringType(), - DecimalType(), IntegerType(), DateType('%d %B %Y'), - DecimalType()]) + String(), String(), + Decimal(), Integer(), Date('%d %b %Y'), + Decimal()]) def test_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -53,7 +52,7 @@ def test_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=True) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [StringType(), StringType(), DecimalType()]) + [String(), String(), Decimal()]) def test_non_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -64,21 +63,22 @@ def test_non_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=False) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [IntegerType(), StringType(), DecimalType()]) + [Integer(), String(), Decimal()]) def test_guessing_uses_first_in_case_of_tie(self): csv_file = io.BytesIO(b''' 2 1.1 + 2.1 1500''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess( - rows.sample, types=[DecimalType, IntegerType], strict=False) - assert_equal(guessed_types, [DecimalType()]) + rows.sample, types=[Decimal, Integer], strict=False) + assert_equal(guessed_types, [Decimal()]) guessed_types = type_guess( - rows.sample, types=[IntegerType, DecimalType], strict=False) - assert_equal(guessed_types, [IntegerType()]) + rows.sample, types=[Integer, Decimal], strict=False) + assert_equal(guessed_types, [Integer()]) @attr("slow") def test_strict_type_guessing_with_large_file(self): @@ -86,56 +86,49 @@ def test_strict_type_guessing_with_large_file(self): rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] - guessed_types = type_guess(rows.sample, types, True) + types = [String, Integer, Decimal, Date] + guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) + print guessed_types assert_equal(guessed_types, [ - IntegerType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), IntegerType(), StringType(), DecimalType(), - DecimalType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), 
StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), DateUtilType(), - DateUtilType(), DateUtilType(), DateUtilType(), StringType(), - StringType(), StringType()]) + Integer(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), Integer(), String(), String(), + String(), String(), String(), String(), Integer(), String(), + String(), String(), String(), String(), String(), String(), + String(), Decimal(), Decimal(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), Date('%d/%m/%y'), Date('%d/%m/%y'), + String(), String(), String()]) def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] + types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) print(guessed_types) assert_equal(guessed_types, [ - IntegerType(), IntegerType(), - IntegerType(), IntegerType(), IntegerType(), IntegerType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), IntegerType(), StringType(), - StringType()]) + Integer(), Integer(), + Integer(), Integer(), Integer(), Integer(), + String(), String(), String(), String(), + String(), String(), String(), String(), + String(), String(), Integer(), String(), + String()]) def test_integer_and_float_detection(self): def helper(value): - return any(i.test(value) == 1 for i in IntegerType.instances()) + return any(i.test(value) == 1 for i in Integer.instances()) assert_equal(helper(123), True) assert_equal(helper('123'), True) diff --git a/test/test_read.py b/test/test_read.py index a79a358..88b9214 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -116,7 +116,7 @@ def test_apply_null_values(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), StringType(), BoolType(), + expected_types = [IntegerType(), StringType(), IntegerType(), StringType()] assert_equal(types, expected_types) @@ -146,8 +146,8 @@ def 
test_null_process(self): assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), BoolType(), BoolType(), - BoolType()] + expected_types = [IntegerType(), IntegerType(), IntegerType(), + IntegerType()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) From 2fdaf25a739d1e1969dc48e84247a8f4838881fd Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:52:42 +0200 Subject: [PATCH 03/35] Hide coverage results. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e6fac63..ebba6d9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ */_build/* *.py~ *.~lock.*# +.coverage From d1d097257ed33c0ae304dca18dc94646f765c934 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 24 Aug 2015 23:55:52 +0200 Subject: [PATCH 04/35] Clean up imports. --- messytables/types.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/messytables/types.py b/messytables/types.py index 96aeaf0..65842c4 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,29 +1,28 @@ from collections import defaultdict from messytables.compat23 import izip_longest -import typecast +from typecast import String, Integer, Decimal, Boolean, Date, DateTime # For legacy support: -StringType = typecast.String -IntegerType = typecast.Integer -DecimalType = typecast.Decimal -FloatType = typecast.Decimal -BoolType = typecast.Boolean -DateType = typecast.Date -DateTimeType = typecast.DateTime -DateUtilType = typecast.Date +StringType = String +IntegerType = Integer +DecimalType = Decimal +FloatType = Decimal +BoolType = Boolean +DateType = Date +DateTimeType = DateTime +DateUtilType = Date WEIGHTS = { - typecast.String: 1, - typecast.Integer: 6, - typecast.Decimal: 3, - typecast.Boolean: 7, - typecast.Date: 4, - typecast.DateTime: 5 + String: 1, + Integer: 6, + Decimal: 3, + Boolean: 7, + Date: 4, + DateTime: 5 } -TYPES = [StringType, DecimalType, IntegerType, BoolType, DateType, - DateTimeType] +TYPES = [String, Decimal, Integer, Boolean, Date, DateTime] FAILED = 'failed' From 2f71c24f21eec92ded4aac4623c94c39f0ad04e0 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:09:32 +0200 Subject: [PATCH 05/35] Get rid of old type names. 
--- messytables/__init__.py | 2 -- messytables/core.py | 11 +++++++---- messytables/excel.py | 25 ++++++++++++++----------- messytables/html.py | 12 +++++++----- messytables/jts.py | 21 ++++++++++++--------- messytables/ods.py | 9 ++++----- messytables/pdf.py | 6 +++--- messytables/types.py | 16 ++-------------- test/test_guessing.py | 4 ++-- test/test_read.py | 33 +++++++++++++++++---------------- test/test_unit.py | 14 +------------- 11 files changed, 69 insertions(+), 84 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index 30a3690..baecc4a 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -2,8 +2,6 @@ from messytables.util import offset_processor, null_processor from messytables.headers import headers_guess, headers_processor, headers_make_unique from messytables.types import type_guess, types_processor -from messytables.types import StringType, IntegerType, FloatType, \ - DecimalType, DateType, DateUtilType, BoolType from messytables.error import ReadError from messytables.core import Cell, TableSet, RowSet, seekable_stream diff --git a/messytables/core.py b/messytables/core.py index 28ad7eb..3094c34 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,9 +1,13 @@ -from messytables.util import OrderedDict +import io from collections import Mapping + +from typecast import String + +from messytables.util import OrderedDict from messytables.error import TableError, NoSuchPropertyError -import io from messytables.compat23 import * + def seekable_stream(fileobj): try: fileobj.seek(0) @@ -115,8 +119,7 @@ class Cell(object): def __init__(self, value, column=None, type=None): if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.value = value self.column = column self.column_autogenerated = False diff --git a/messytables/excel.py b/messytables/excel.py index 9d30131..744e70c 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -1,28 +1,29 @@ import sys from datetime import datetime, time + import xlrd from xlrd.biffh import XLRDError +from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties -from messytables.types import (StringType, IntegerType, - DateType, FloatType) from messytables.error import ReadError from messytables.compat23 import PY2 + class InvalidDateError(Exception): pass XLS_TYPES = { - 1: StringType(), + 1: String(), # NB: Excel does not distinguish floats from integers so we use floats # We could try actual type detection between floats and ints later # or use the excel format string info - see # https://groups.google.com/forum/?fromgroups=#!topic/ # python-excel/cAQ1ndsCVxk - 2: FloatType(), - 3: DateType(None), + 2: Float(), + 3: Date(), # this is actually boolean but we do not have a boolean type yet - 4: IntegerType() + 4: Integer() } @@ -45,7 +46,7 @@ def get_workbook(): file_contents=read_obj, encoding_override=encoding, formatting_info=with_formatting_info) - except XLRDError as e: + except XLRDError: _, value, traceback = sys.exc_info() if PY2: raise ReadError("Can't read Excel file: %r" % value, traceback) @@ -76,7 +77,7 @@ def get_workbook(): try: self.workbook = get_workbook() - except NotImplementedError as e: + except NotImplementedError: if not with_formatting_info: raise else: @@ -115,12 +116,13 @@ def raw(self, sample=False): self.sheet.name, colnum+1, rownum+1)) yield row + class XLSCell(Cell): @staticmethod def from_xlrdcell(xlrd_cell, sheet, col, row): value = xlrd_cell.value - 
cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType()) - if cell_type == DateType(None): + cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) + if cell_type == Date(): if value == 0: raise InvalidDateError year, month, day, hour, minute, second = \ @@ -143,10 +145,12 @@ def topleft(self): def properties(self): return XLSProperties(self) + class XLSProperties(CoreProperties): KEYS = ['bold', 'size', 'italic', 'font_name', 'strikeout', 'underline', 'font_colour', 'background_colour', 'any_border', 'all_border', 'richtext', 'blank', 'a_date', 'formatting_string'] + def __init__(self, cell): self.cell = cell self.merged = {} @@ -243,4 +247,3 @@ def get_all_border(self): b = self.xf.border return b.top_line_style > 0 and b.bottom_line_style > 0 and \ b.left_line_style > 0 and b.right_line_style > 0 - diff --git a/messytables/html.py b/messytables/html.py index 2214363..4f02f26 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -1,9 +1,12 @@ -from messytables.core import RowSet, TableSet, Cell, CoreProperties -import lxml.html from collections import defaultdict -import html5lib import xml.etree.ElementTree as etree +import html5lib +import lxml.html +from typecast import String + +from messytables.core import RowSet, TableSet, Cell, CoreProperties + def fromstring(s): tb = html5lib.getTreeBuilder("lxml", implementation=etree) @@ -159,8 +162,7 @@ def __init__(self, value=None, column=None, type=None, source=None): assert isinstance(source, lxml.etree._Element) self._lxml = source if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.type = type self.column = column self.column_autogenerated = False diff --git a/messytables/jts.py b/messytables/jts.py index 031528f..056d9c3 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -2,18 +2,20 @@ Convert a rowset to the json table schema (http://www.dataprotocols.org/en/latest/json-table-schema.html) ''' +import jsontableschema +from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean import messytables -import jsontableschema + MESSYTABLES_TO_JTS_MAPPING = { - messytables.StringType: 'string', - messytables.IntegerType: 'integer', - messytables.FloatType: 'number', - messytables.DecimalType: 'number', - messytables.DateType: 'date', - messytables.DateUtilType: 'date', - messytables.BoolType: 'boolean' + String: 'string', + Integer: 'integer', + Float: 'number', + Decimal: 'number', + Date: 'date', + DateTime: 'datetime', + Boolean: 'boolean' } @@ -25,7 +27,8 @@ def rowset_as_jts(rowset, headers=None, types=None): ''' Create a json table schema from a rowset ''' _, headers = messytables.headers_guess(rowset.sample) - types = list(map(celltype_as_string, messytables.type_guess(rowset.sample))) + types = list(map(celltype_as_string, + messytables.type_guess(rowset.sample))) return headers_and_typed_as_jts(headers, types) diff --git a/messytables/ods.py b/messytables/ods.py index 7b03d74..4351c85 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -3,10 +3,9 @@ import zipfile from lxml import etree +from typecast import String, Decimal, Date from messytables.core import RowSet, TableSet, Cell -from messytables.types import (StringType, DecimalType, - DateType) ODS_NAMESPACES_TAG_MATCH = re.compile(b"(]*>)", re.MULTILINE) @@ -15,8 +14,8 @@ ODS_ROW_MATCH = re.compile(b".*?().*?", re.MULTILINE) ODS_TYPES = { - 'float': DecimalType(), - 'date': DateType(None), + 'float': Decimal(), + 'date': Date(), } @@ -135,7 +134,7 @@ def raw(self, sample=False): children 
= elem.getchildren() if children: c = Cell(children[0].text, - type=ODS_TYPES.get(cell_type, StringType())) + type=ODS_TYPES.get(cell_type, String())) row_data.append(c) if not row_data: diff --git a/messytables/pdf.py b/messytables/pdf.py index 4f9052e..11aa907 100644 --- a/messytables/pdf.py +++ b/messytables/pdf.py @@ -1,6 +1,6 @@ -from messytables.core import RowSet, TableSet, Cell +from typecast import String -from messytables.types import StringType +from messytables.core import RowSet, TableSet, Cell try: from pdftables import get_tables @@ -30,7 +30,7 @@ def __init__(self, pdftables_cell): self.column = None self.column_autogenerated = False - self.type = StringType() + self.type = String() @property def topleft(self): diff --git a/messytables/types.py b/messytables/types.py index 65842c4..5709332 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -3,17 +3,6 @@ from typecast import String, Integer, Decimal, Boolean, Date, DateTime -# For legacy support: -StringType = String -IntegerType = Integer -DecimalType = Decimal -FloatType = Decimal -BoolType = Boolean -DateType = Date -DateTimeType = DateTime -DateUtilType = Date - - WEIGHTS = { String: 1, Integer: 6, @@ -43,14 +32,13 @@ def type_guess(rows, types=TYPES, strict=False): guesses.append(defaultdict(int)) for i, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[i][StringType()] = guesses[i].get(StringType(), 0) + guesses[i][String()] = guesses[i].get(String(), 0) for type in type_instances: if guesses[i][type] == FAILED: continue result = type.test(cell.value) weight = WEIGHTS[type.__class__] - if strict and (result == -1) and \ - (not isinstance(type, StringType)): + if strict and (result == -1) and not isinstance(type, String): guesses[i][type] = FAILED elif result == 1: guesses[i][type] += weight diff --git a/test/test_guessing.py b/test/test_guessing.py index 351ab19..48e9e27 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -6,8 +6,8 @@ from nose.plugins.attrib import attr from nose.tools import assert_equal from typecast import Date, String, Decimal, Integer, Boolean -from messytables import (CSVTableSet, type_guess, headers_guess, - offset_processor) +from messytables import CSVTableSet, type_guess, headers_guess +from messytables import offset_processor class TypeGuessTest(unittest.TestCase): diff --git a/test/test_read.py b/test/test_read.py index 88b9214..38014d8 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -12,16 +12,17 @@ except ImportError: from .shim26 import assert_is_instance, assert_greater_equal -from messytables import (CSVTableSet, StringType, HTMLTableSet, +from typecast import Date, Float, Integer, String +from messytables import (CSVTableSet, HTMLTableSet, ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, ODSTableSet, headers_guess, headers_processor, - offset_processor, DateType, FloatType, - IntegerType, BoolType, rowset_as_jts, + offset_processor, rowset_as_jts, types_processor, type_guess, ReadError, null_processor) import datetime stringy = type(u'') + class ReadCsvTest(unittest.TestCase): def test_utf8bom_lost(self): @@ -42,7 +43,7 @@ def test_read_simple_csv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) def test_read_complex_csv(self): fh = horror_fobj('complex.csv') @@ -57,7 +58,7 @@ def test_read_complex_csv(self): for row in list(row_set): assert_equal(4, len(row)) - assert_equal(row[0].type, StringType()) + 
assert_equal(row[0].type, String()) def test_overriding_sniffed(self): # semicolon separated values @@ -101,13 +102,13 @@ def test_read_type_guess_simple(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample) - expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()] + expected_types = [Date("%Y-%m-%d"), Integer(), String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) data = list(row_set) header_types = [c.type for c in data[0]] - assert_equal(header_types, [StringType()] * 3) + assert_equal(header_types, [String()] * 3) row_types = [c.type for c in data[2]] assert_equal(expected_types, row_types) @@ -116,8 +117,8 @@ def test_apply_null_values(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), StringType(), IntegerType(), - StringType()] + expected_types = [Integer(), String(), Integer(), + String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -146,8 +147,8 @@ def test_null_process(self): assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), IntegerType(), IntegerType(), - IntegerType()] + expected_types = [Integer(), Integer(), Integer(), + Integer()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -237,7 +238,7 @@ def test_read_simple_zip(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadTsvTest(unittest.TestCase): @@ -251,7 +252,7 @@ def test_read_simple_tsv(self): assert_equal(row[1].value, 'expr1_0_imp') for row in list(row_set): assert_equal(17, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadSsvTest(unittest.TestCase): @@ -267,7 +268,7 @@ def test_read_simple_ssv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadPsvTest(unittest.TestCase): @@ -283,7 +284,7 @@ def test_read_simple_psv(self): for row in list(row_set): assert_equal(6, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadODSTest(unittest.TestCase): @@ -471,7 +472,7 @@ def test_read_type_know_simple(self): row_set = table_set.tables[0] row = list(row_set.sample)[1] types = [c.type for c in row] - assert_equal(types, [DateType(None), FloatType(), StringType()]) + assert_equal(types, [Date(None), Float(), String()]) def test_bad_first_sheet(self): # First sheet appears to have no cells diff --git a/test/test_unit.py b/test/test_unit.py index 27c63aa..696604d 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -1,19 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from messytables import dateparser, Cell - - -class DateParserTest(unittest.TestCase): - def test_date_regex(self): - assert dateparser.is_date('2012 12 22') - assert dateparser.is_date('2012/12/22') - assert dateparser.is_date('2012-12-22') - assert dateparser.is_date('22.12.2012') - assert dateparser.is_date('12 12 22') - assert dateparser.is_date('22 Dec 2012') - assert dateparser.is_date('2012 12 22 13:17') - assert dateparser.is_date('2012 12 22 T 13:17') +from messytables import Cell class CellReprTest(unittest.TestCase): From f06a3c1594890ce7bfd77df143179d19bee2c581 Mon Sep 17 00:00:00 2001 From: 
Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:11:46 +0200 Subject: [PATCH 06/35] Clean out old aliases for XLSXTableSet --- messytables/__init__.py | 7 ------- test/test_read.py | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index baecc4a..4abdaa9 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -8,13 +8,6 @@ from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet - -# XLSXTableSet has been deprecated and its functionality is now provided by -# XLSTableSet. This is to retain backwards compatibility with anyone -# constructing XLSXTableSet directly (rather than using any_tableset) -XLSXTableSet = XLSTableSet -XLSXRowSet = XLSRowSet - from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet from messytables.pdf import PDFTableSet, PDFRowSet diff --git a/test/test_read.py b/test/test_read.py index 38014d8..2901c67 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -14,7 +14,7 @@ from typecast import Date, Float, Integer, String from messytables import (CSVTableSet, HTMLTableSet, - ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, + ZIPTableSet, XLSTableSet, PDFTableSet, ODSTableSet, headers_guess, headers_processor, offset_processor, rowset_as_jts, types_processor, type_guess, ReadError, @@ -349,7 +349,7 @@ def test_that_xlsx_is_handled_by_xls_table_set(self): Should emit a DeprecationWarning. """ fh = horror_fobj('simple.xlsx') - assert_is_instance(XLSXTableSet(fh), XLSTableSet) + assert_is_instance(XLSTableSet(fh), XLSTableSet) class ReadXlsTest(unittest.TestCase): From 92fb2159551e2af5b0c32b59d630f7ab53556c83 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 00:28:02 +0200 Subject: [PATCH 07/35] Further pieces of clean up. --- .travis.yml | 1 - Dockerfile | 30 ------------------------------ Makefile | 9 --------- messytables/__init__.py | 5 +++-- messytables/any.py | 7 ------- messytables/commas.py | 4 +--- messytables/error.py | 10 ++++++---- messytables/jts.py | 1 - setup.py | 8 +++----- 9 files changed, 13 insertions(+), 62 deletions(-) delete mode 100644 Dockerfile diff --git a/.travis.yml b/.travis.yml index bd19ad7..25aaf2b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.6" - "2.7" - "3.4" install: diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b682622..0000000 --- a/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM ubuntu:14.04 - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && \ - apt-get install -y \ - python-pip \ - python-dev - -RUN apt-get install -y python-numpy python-lxml -RUN apt-get install -y python3 python3-pip python3-lxml python3-nose -# chardet version is out of date; old version doesn't detect UTF8 w/ BOM -RUN pip3 install --upgrade chardet -RUN apt-get install -y python-nose -RUN locale-gen en_GB.UTF-8 - -RUN mkdir /home/messytables && \ - chown nobody /home/messytables -USER nobody -ENV HOME=/home/messytables \ - PATH=/home/messytables/.local/bin:$PATH \ - LANG=en_GB.UTF-8 -# LANG needed for httpretty install on Py3 -WORKDIR /home/messytables - -COPY ./requirements-test.txt /home/messytables/ -RUN pip install --user -r /home/messytables/requirements-test.txt -RUN pip3 install --user -r /home/messytables/requirements-test.txt -RUN pip install --user pdftables -COPY . 
/home/messytables/ diff --git a/Makefile b/Makefile index 8214231..d22fbb6 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,3 @@ -run: build - @docker run \ - --rm \ - -ti \ - messytables - -build: - @docker build -t messytables . - test: nosetests --with-coverage --cover-package=messytables --cover-erase diff --git a/messytables/__init__.py b/messytables/__init__.py index 4abdaa9..53e1dc6 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -1,6 +1,7 @@ from messytables.util import offset_processor, null_processor -from messytables.headers import headers_guess, headers_processor, headers_make_unique +from messytables.headers import headers_guess, headers_processor +from messytables.headers import headers_make_unique from messytables.types import type_guess, types_processor from messytables.error import ReadError @@ -11,7 +12,7 @@ from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet from messytables.pdf import PDFTableSet, PDFRowSet -from messytables.any import any_tableset, AnyTableSet +from messytables.any import any_tableset from messytables.jts import rowset_as_jts, headers_and_typed_as_jts diff --git a/messytables/any.py b/messytables/any.py index c497391..cdd24b7 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -163,10 +163,3 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw): raise messytables.ReadError('any: \n'.join(error)) else: raise messytables.ReadError("any: Did not attempt any detection.") - - -class AnyTableSet: - '''Deprecated - use any_tableset instead.''' - @staticmethod - def from_fileobj(fileobj, mimetype=None, extension=None): - return any_tableset(fileobj, mimetype=mimetype, extension=extension) diff --git a/messytables/commas.py b/messytables/commas.py index 65dd999..7263a75 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -8,9 +8,7 @@ class UTF8Recoder: - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ + """ Iterator that reads an encoded stream and re-encodes it to UTF-8. """ # maps between chardet encoding and codecs bom keys BOM_MAPPING = { diff --git a/messytables/error.py b/messytables/error.py index a65429c..3df3f63 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -1,16 +1,18 @@ + class MessytablesError(Exception): - """A generic error to inherit from""" + """ A generic error to inherit from. """ class ReadError(MessytablesError): - '''Error reading the file/stream in terms of the expected format.''' + """ Error reading the file/stream in terms of the expected format. """ pass class TableError(MessytablesError, LookupError): - """Couldn't identify correct table.""" + """ Couldn't identify correct table. """ pass + class NoSuchPropertyError(MessytablesError, KeyError): - """The requested property doesn't exist""" + """ The requested property doesn't exist. 
""" pass diff --git a/messytables/jts.py b/messytables/jts.py index 056d9c3..0254259 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -44,5 +44,4 @@ def headers_and_typed_as_jts(headers, types): j.add_field(field_id=field_id, label=field_id, field_type=field_type) - return j diff --git a/setup.py b/setup.py index bff0284..08e7d52 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='messytables', - version='0.15.1', + version='1.99.0', description="Parse messy tabular data in various formats", long_description=long_desc, classifiers=[ @@ -42,12 +42,12 @@ 'xlrd>=0.8.0', 'python-magic>=0.4.12', # used for type guessing 'chardet>=2.3.0', - 'python-dateutil>=1.5.0', 'lxml>=3.2', 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' 'typecast', + 'json-table-schema>=0.2' ], extras_require={'pdf': ['pdftables>=0.0.4']}, tests_require=[ @@ -55,7 +55,5 @@ 'httpretty', 'coverage' ], - entry_points=\ - """ - """, + entry_points={} ) From 1108885d6983d14cbe147f00d52bff53aace6b7d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:05:58 +0200 Subject: [PATCH 08/35] Start getting rid of the compatibility layer --- messytables/any.py | 4 +++- messytables/commas.py | 26 ++++++++++++++------------ messytables/compat23.py | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/messytables/any.py b/messytables/any.py index cdd24b7..8aa0b2f 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,7 +1,8 @@ +import re + from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, HTMLTableSet, ODSTableSet) import messytables -import re MIMELOOKUP = {'application/x-zip-compressed': 'ZIP', @@ -29,6 +30,7 @@ 'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS', } + def TABTableSet(fileobj): return CSVTableSet(fileobj, delimiter='\t') diff --git a/messytables/commas.py b/messytables/commas.py index 7263a75..29fa243 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -2,9 +2,11 @@ import codecs import chardet +from six import text_type, binary_type, PY2 + +from messytables.core import seekable_stream from messytables.core import RowSet, TableSet, Cell -import messytables -from messytables.compat23 import unicode_string, byte_string, native_string, PY2 +from messytables.error import ReadError class UTF8Recoder: @@ -66,8 +68,8 @@ def __next__(self): def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, byte_string): - obj = unicode_string(obj, encoding) + if isinstance(obj, binary_type): + obj = text_type(obj, encoding) return obj @@ -78,7 +80,7 @@ class CSVTableSet(TableSet): def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, encoding=None, window=None, doublequote=None, lineterminator=None, skipinitialspace=None, **kw): - self.fileobj = messytables.seekable_stream(fileobj) + self.fileobj = seekable_stream(fileobj) self.name = name or 'table' self.delimiter = delimiter self.quotechar = quotechar @@ -110,7 +112,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=None, lineterminator=None, skipinitialspace=None): self.name = name - seekable_fileobj = messytables.seekable_stream(fileobj) + seekable_fileobj = seekable_stream(fileobj) self.fileobj = UTF8Recoder(seekable_fileobj, encoding) def fake_ilines(fobj): @@ -137,9 +139,9 @@ def _dialect(self): sample = delim.join(self._sample) try: dialect = csv.Sniffer().sniff(sample, - delimiters=['\t', ',', ';', '|']) # NATIVE - dialect.delimiter = 
native_string(dialect.delimiter) - dialect.quotechar = native_string(dialect.quotechar) + delimiters=['\t', ',', ';', '|']) + dialect.delimiter = str(dialect.delimiter) + dialect.quotechar = str(dialect.quotechar) dialect.lineterminator = delim dialect.doublequote = True return dialect @@ -184,9 +186,9 @@ def rows(): dialect=self._dialect, **self._overrides): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: - if u'newline inside string' in unicode_string(err) and sample: + if u'newline inside string' in text_type(err) and sample: pass - elif u'line contains NULL byte' in unicode_string(err): + elif u'line contains NULL byte' in text_type(err): pass else: - raise messytables.ReadError('Error reading CSV: %r', err) + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/compat23.py b/messytables/compat23.py index 7970666..993d946 100644 --- a/messytables/compat23.py +++ b/messytables/compat23.py @@ -14,6 +14,6 @@ unicode_string = str native_string = str byte_string = bytes - + string_types = (str,) urlopen = urllib.request.urlopen From ed8cda1d68eb3914603d773b285a7b8b13451858 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:28:31 +0200 Subject: [PATCH 09/35] Remove remaining awkward compatibility work-arounds. --- messytables/any.py | 13 +++---- messytables/compat23.py | 19 ----------- messytables/core.py | 11 ++++-- messytables/excel.py | 2 +- messytables/headers.py | 6 ++-- messytables/jts.py | 7 ++-- messytables/types.py | 5 +-- messytables/util.py | 75 ----------------------------------------- messytables/zip.py | 17 +++++----- setup.py | 3 +- test/test_stream.py | 12 ++++--- 11 files changed, 44 insertions(+), 126 deletions(-) delete mode 100644 messytables/compat23.py diff --git a/messytables/any.py b/messytables/any.py index 8aa0b2f..13cac56 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,8 +1,9 @@ import re -from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, - HTMLTableSet, ODSTableSet) -import messytables +from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet +from messytables import HTMLTableSet, ODSTableSet +from messytables.core import seekable_stream +from messytables.error import ReadError MIMELOOKUP = {'application/x-zip-compressed': 'ZIP', @@ -64,7 +65,7 @@ def get_mime(fileobj): import magic # Since we need to peek the start of the stream, make sure we can # seek back later. If not, slurp in the contents into a StringIO. - fileobj = messytables.seekable_stream(fileobj) + fileobj = seekable_stream(fileobj) header = fileobj.read(4096) mimetype = magic.from_buffer(header, mime=True) fileobj.seek(0) @@ -162,6 +163,6 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw): mimetype=magic_mime)) if error: - raise messytables.ReadError('any: \n'.join(error)) + raise ReadError('any: \n'.join(error)) else: - raise messytables.ReadError("any: Did not attempt any detection.") + raise ReadError("any: Did not attempt any detection.") diff --git a/messytables/compat23.py b/messytables/compat23.py deleted file mode 100644 index 993d946..0000000 --- a/messytables/compat23.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -PY2 = sys.version_info[0] == 2 -if PY2: - import urllib2 - from itertools import izip_longest - unicode_string = unicode - native_string = str - byte_string = str - string_types = (str, unicode) - urlopen = urllib2.urlopen -else: # i.e. 
PY3 - import urllib.request - from itertools import zip_longest as izip_longest - unicode_string = str - native_string = str - byte_string = bytes - - string_types = (str,) - urlopen = urllib.request.urlopen diff --git a/messytables/core.py b/messytables/core.py index 3094c34..2042262 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,11 +1,16 @@ import io from collections import Mapping +try: + # python 2.7: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict # noqa + +from six import text_type, string_types from typecast import String -from messytables.util import OrderedDict from messytables.error import TableError, NoSuchPropertyError -from messytables.compat23 import * def seekable_stream(fileobj): @@ -138,7 +143,7 @@ def empty(self): return True value = self.value if not isinstance(value, string_types): - value = unicode_string(value) + value = text_type(value) if len(value.strip()): return False return True diff --git a/messytables/excel.py b/messytables/excel.py index 744e70c..abd658b 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -3,11 +3,11 @@ import xlrd from xlrd.biffh import XLRDError +from six import PY2 from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties from messytables.error import ReadError -from messytables.compat23 import PY2 class InvalidDateError(Exception): diff --git a/messytables/headers.py b/messytables/headers.py index 4434618..664352c 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -1,5 +1,7 @@ from collections import defaultdict -from messytables.compat23 import izip_longest + +import six + from messytables.core import Cell @@ -43,7 +45,7 @@ def headers_processor(headers): def apply_headers(row_set, row): _row = [] - pairs = izip_longest(row, headers) + pairs = six.itertools.izip_longest(row, headers) for i, (cell, header) in enumerate(pairs): if cell is None: cell = Cell(None) diff --git a/messytables/jts.py b/messytables/jts.py index 0254259..e2aeb61 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -5,7 +5,8 @@ import jsontableschema from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean -import messytables +from messytables.headers import headers_guess +from messytables.types import type_guess MESSYTABLES_TO_JTS_MAPPING = { @@ -26,9 +27,9 @@ def celltype_as_string(celltype): def rowset_as_jts(rowset, headers=None, types=None): ''' Create a json table schema from a rowset ''' - _, headers = messytables.headers_guess(rowset.sample) + _, headers = headers_guess(rowset.sample) types = list(map(celltype_as_string, - messytables.type_guess(rowset.sample))) + type_guess(rowset.sample))) return headers_and_typed_as_jts(headers, types) diff --git a/messytables/types.py b/messytables/types.py index 5709332..0b793b7 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,5 +1,6 @@ from collections import defaultdict -from messytables.compat23 import izip_longest + +import six from typecast import String, Integer, Decimal, Boolean, Date, DateTime @@ -65,7 +66,7 @@ def types_processor(types, strict=False): def apply_types(row_set, row): if types is None: return row - for cell, type in izip_longest(row, types): + for cell, type in six.itertools.izip_longest(row, types): try: cell.value = type.cast(cell.value) cell.type = type diff --git a/messytables/util.py b/messytables/util.py index 04dd160..df5f2fa 100644 --- a/messytables/util.py +++ b/messytables/util.py @@ -1,78 
+1,3 @@ -try: - # python 2.7: - from collections import OrderedDict -except ImportError: - ## {{{ http://code.activestate.com/recipes/576669/ (r18) - ## Raymond Hettingers proporsal to go in 2.7 - from collections import MutableMapping - - class OrderedDict(dict, MutableMapping): - - # Methods with direct access to underlying attributes - - def __init__(self, *args, **kwds): - if len(args) > 1: - raise TypeError('expected at 1 argument, got %d', len(args)) - if not hasattr(self, '_keys'): - self._keys = [] - self.update(*args, **kwds) - - def clear(self): - del self._keys[:] - dict.clear(self) - - def __setitem__(self, key, value): - if key not in self: - self._keys.append(key) - dict.__setitem__(self, key, value) - - def __delitem__(self, key): - dict.__delitem__(self, key) - self._keys.remove(key) - - def __iter__(self): - return iter(self._keys) - - def __reversed__(self): - return reversed(self._keys) - - def popitem(self): - if not self: - raise KeyError - key = self._keys.pop() - value = dict.pop(self, key) - return key, value - - def __reduce__(self): - items = [[k, self[k]] for k in self] - inst_dict = vars(self).copy() - inst_dict.pop('_keys', None) - return (self.__class__, (items,), inst_dict) - - # Methods with indirect access via the above methods - - setdefault = MutableMapping.setdefault - update = MutableMapping.update - pop = MutableMapping.pop - keys = MutableMapping.keys - values = MutableMapping.values - items = MutableMapping.items - - def __repr__(self): - pairs = ', '.join(map('%r: %r'.__mod__, self.items())) - return '%s({%s})' % (self.__class__.__name__, pairs) - - def copy(self): - return self.__class__(self) - - @classmethod - def fromkeys(cls, iterable, value=None): - d = cls() - for key in iterable: - d[key] = value - return d - ## end of http://code.activestate.com/recipes/576669/ }}} - def offset_processor(offset): """ Skip ``offset`` from the given iterator. This can diff --git a/messytables/zip.py b/messytables/zip.py index 4707d47..59f1a1b 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -1,15 +1,15 @@ import zipfile -import messytables +from messytables.core import TableSet +from messytables.any import any_tableset +from messytables.error import ReadError -class ZIPTableSet(messytables.TableSet): +class ZIPTableSet(TableSet): """ Reads TableSets from inside a ZIP file """ def __init__(self, fileobj, **kw): - """ - On error it will raise messytables.ReadError. - """ + """ On error it will raise ReadError. 
""" tables = [] found = [] z = zipfile.ZipFile(fileobj, 'r') @@ -25,8 +25,7 @@ def __init__(self, fileobj, **kw): ext = f.filename[f.filename.rindex(".") + 1:] try: - filetables = messytables.any.any_tableset( - z.open(f), extension=ext, **kw) + filetables = any_tableset(z.open(f), extension=ext, **kw) except ValueError as e: found.append(f.filename + ": " + e.message) continue @@ -34,8 +33,8 @@ def __init__(self, fileobj, **kw): tables.extend(filetables.tables) if len(tables) == 0: - raise messytables.ReadError('''ZIP file has no recognized - tables (%s).''' % ', '.join(found)) + raise ReadError('''ZIP file has no recognized tables (%s).''' + % ', '.join(found)) finally: z.close() diff --git a/setup.py b/setup.py index 08e7d52..218ad4f 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,8 @@ 'html5lib', 'json-table-schema>=0.2, <=0.2.1' 'typecast', - 'json-table-schema>=0.2' + 'six', + 'ordereddict', ], extras_require={'pdf': ['pdftables>=0.0.4']}, tests_require=[ diff --git a/test/test_stream.py b/test/test_stream.py index 1d677d5..9151e5f 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- import unittest -from messytables.compat23 import urlopen import requests import io +import six.moves.urllib as urllib + from . import horror_fobj from nose.tools import assert_equal import httpretty from messytables import CSVTableSet, XLSTableSet + class StreamInputTest(unittest.TestCase): @httpretty.activate def test_http_csv(self): @@ -18,7 +20,7 @@ def test_http_csv(self): httpretty.GET, url, body=horror_fobj('long.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -46,7 +48,7 @@ def test_http_csv_encoding(self): httpretty.GET, url, body=horror_fobj('utf-16le_encoded.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -59,7 +61,7 @@ def test_http_xls(self): httpretty.GET, url, body=horror_fobj('simple.xls').read(), content_type="application/ms-excel") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -72,7 +74,7 @@ def test_http_xlsx(self): httpretty.GET, url, body=horror_fobj('simple.xlsx').read(), content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) From e87c77472f1b6ab26df005e4b98ae00978b30c58 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:39:31 +0200 Subject: [PATCH 10/35] avoid circular import --- messytables/zip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/messytables/zip.py b/messytables/zip.py index 59f1a1b..680b44a 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -1,7 +1,6 @@ import zipfile from messytables.core import TableSet -from messytables.any import any_tableset from messytables.error import ReadError @@ -10,6 +9,7 @@ class ZIPTableSet(TableSet): def __init__(self, fileobj, **kw): """ On error it will raise ReadError. 
""" + from messytables.any import any_tableset tables = [] found = [] z = zipfile.ZipFile(fileobj, 'r') From 3dd9baddff29ce1fa3b28334d40a00dfe0c875e4 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:41:21 +0200 Subject: [PATCH 11/35] Clean up README. --- README.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 75667cd..787a362 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,4 @@ -# Parsing for messy tables - -[![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) -[![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) -[![Latest Version](https://pypip.in/version/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Downloads](https://pypip.in/download/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![Supported Python versions](https://pypip.in/py_versions/messytables/badge.svg)](https://pypi.python.org/pypi/ckanserviceprovider/) -[![Development Status](https://pypip.in/status/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) -[![License](https://pypip.in/license/messytables/badge.svg)](https://pypi.python.org/pypi/messytables/) +# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) A library for dealing with messy tabular data in several formats, guessing types and detecting headers. @@ -14,6 +6,6 @@ See the documentation at: https://messytables.readthedocs.io Find the package at: https://pypi.python.org/pypi/messytables -See CONTRIBUTING.md for how to send patches, run tests. +See ``CONTRIBUTING.md`` for how to send patches, run tests. **Contact**: Open Knowledge Labs - http://okfnlabs.org/contact/. 
We especially recommend the forum: http://discuss.okfn.org/category/open-knowledge-labs/ From 8a56e5dcd98ce8405219965f858088819b7c76e0 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 25 Aug 2015 10:59:17 +0200 Subject: [PATCH 12/35] fix py3 compat --- .gitignore | 2 ++ messytables/headers.py | 2 +- messytables/types.py | 2 +- test/test_guessing.py | 1 - 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ebba6d9..0b3fb13 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ *.py~ *.~lock.*# .coverage + +pyenv3 diff --git a/messytables/headers.py b/messytables/headers.py index 664352c..6c28625 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -45,7 +45,7 @@ def headers_processor(headers): def apply_headers(row_set, row): _row = [] - pairs = six.itertools.izip_longest(row, headers) + pairs = six.moves.zip_longest(row, headers) for i, (cell, header) in enumerate(pairs): if cell is None: cell = Cell(None) diff --git a/messytables/types.py b/messytables/types.py index 0b793b7..92710e0 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -66,7 +66,7 @@ def types_processor(types, strict=False): def apply_types(row_set, row): if types is None: return row - for cell, type in six.itertools.izip_longest(row, types): + for cell, type in six.moves.zip_longest(row, types): try: cell.value = type.cast(cell.value) cell.type = type diff --git a/test/test_guessing.py b/test/test_guessing.py index 48e9e27..80883b5 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -89,7 +89,6 @@ def test_strict_type_guessing_with_large_file(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) - print guessed_types assert_equal(guessed_types, [ Integer(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), From afca9173469b544148a41d86022f2d7dfce2b02a Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:03:38 +0200 Subject: [PATCH 13/35] =?UTF-8?q?Don=E2=80=99t=20raise=20for=200=20as=20a?= =?UTF-8?q?=20date.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- messytables/excel.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/messytables/excel.py b/messytables/excel.py index abd658b..92bc4ed 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -113,7 +113,7 @@ def raw(self, sample=False): row.append(XLSCell.from_xlrdcell(cell, self.sheet, colnum, rownum)) except InvalidDateError: raise ValueError("Invalid date at '%s':%d,%d" % ( - self.sheet.name, colnum+1, rownum+1)) + self.sheet.name, colnum+1, rownum+1)) yield row @@ -123,14 +123,13 @@ def from_xlrdcell(xlrd_cell, sheet, col, row): value = xlrd_cell.value cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) if cell_type == Date(): - if value == 0: - raise InvalidDateError - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) + if value != 0: + year, month, day, hour, minute, second = \ + xlrd.xldate_as_tuple(value, sheet.book.datemode) + if (year, month, day) == (0, 0, 0): + value = time(hour, minute, second) + else: + value = datetime(year, month, day, hour, minute, second) messy_cell = XLSCell(value, type=cell_type) messy_cell.sheet = sheet 
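For illustration only (not part of the patch): the conversion this hunk now applies can be reproduced standalone with xlrd. A serial value of 0 has no calendar equivalent, and values below 1 carry only a time of day, which is what the `if value != 0` guard and the `(0, 0, 0)` check above account for.

    import xlrd
    from datetime import datetime, time

    def xls_value_to_python(value, datemode=0):
        # Mirror the guard above: a raw 0 is left untouched rather than
        # being forced into an invalid datetime.
        if value == 0:
            return value
        year, month, day, hour, minute, second = \
            xlrd.xldate_as_tuple(value, datemode)
        if (year, month, day) == (0, 0, 0):
            return time(hour, minute, second)
        return datetime(year, month, day, hour, minute, second)

    print(xls_value_to_python(0))    # 0 (left as-is)
    print(xls_value_to_python(0.5))  # 12:00:00 (time-only cell)
    print(xls_value_to_python(1.0))  # 1900-01-01 00:00:00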
messy_cell.xlrd_cell = xlrd_cell From 5f4d97898b590c53a3c6c869e0bc43f4a3c3af0d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:24:17 +0200 Subject: [PATCH 14/35] fix up test errors, attempt to make travis pass --- test/test_stream.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_stream.py b/test/test_stream.py index 9151e5f..335022e 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- +import io import unittest import requests -import io - import six.moves.urllib as urllib from . import horror_fobj -from nose.tools import assert_equal import httpretty +from nose.tools import assert_equal from messytables import CSVTableSet, XLSTableSet From 145e2eed866240ba6913bacd70bd7ca3e2dc3905 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:37:52 +0200 Subject: [PATCH 15/35] skip tests if en_GB is not supported --- test/test_guessing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_guessing.py b/test/test_guessing.py index 80883b5..ee8924a 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -4,6 +4,7 @@ from . import horror_fobj from nose.plugins.attrib import attr +from nose.plugins.skip import SkipTest from nose.tools import assert_equal from typecast import Date, String, Decimal, Integer, Boolean from messytables import CSVTableSet, type_guess, headers_guess @@ -28,8 +29,11 @@ def test_type_guess(self): Date('%d %b %Y'), Boolean(), Integer()]) def test_type_guess_strict(self): - import locale - locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + try: + import locale + locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + except: + raise SkipTest("Locale en_GB.UTF-8 not available.") csv_file = io.BytesIO(b''' 1, 2012/2/12, 2, 2,02 October 2011,"100.234354" 2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12" From dcdf21d14c1d0e7e4132a9b299dfed73603454aa Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 14:59:09 +0200 Subject: [PATCH 16/35] remove ambiguous var --- messytables/headers.py | 6 +++--- messytables/types.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/messytables/headers.py b/messytables/headers.py index 6c28625..04c05d2 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -1,6 +1,6 @@ -from collections import defaultdict - import six +from collections import defaultdict +from itertools import islice from messytables.core import Cell @@ -27,7 +27,7 @@ def headers_guess(rows, tolerance=1): The return value is a tuple of the offset of the header row and the names of the columns. 
""" - rows = list(rows) + rows = list(islice(rows, 1000)) modal = column_count_modal(rows) for i, row in enumerate(rows): length = len([c for c in row if not c.empty]) diff --git a/messytables/types.py b/messytables/types.py index 92710e0..0575201 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -31,18 +31,18 @@ def type_guess(rows, types=TYPES, strict=False): diff = len(row) - len(guesses) for _ in range(diff): guesses.append(defaultdict(int)) - for i, cell in enumerate(row): + for j, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[i][String()] = guesses[i].get(String(), 0) + guesses[j][String()] = guesses[j].get(String(), 0) for type in type_instances: - if guesses[i][type] == FAILED: + if guesses[j][type] == FAILED: continue result = type.test(cell.value) weight = WEIGHTS[type.__class__] if strict and (result == -1) and not isinstance(type, String): - guesses[i][type] = FAILED + guesses[j][type] = FAILED elif result == 1: - guesses[i][type] += weight + guesses[j][type] += weight _columns = [] for guess in guesses: From de3e84060112a194b4c6bb0734e64af189c7868e Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 15:21:07 +0200 Subject: [PATCH 17/35] dont score null values in type detection --- messytables/types.py | 14 +++++++------- test/test_guessing.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/messytables/types.py b/messytables/types.py index 0575201..24813ec 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -34,15 +34,15 @@ def type_guess(rows, types=TYPES, strict=False): for j, cell in enumerate(row): # add string guess so that we have at least one guess guesses[j][String()] = guesses[j].get(String(), 0) - for type in type_instances: - if guesses[j][type] == FAILED: + for inst in type_instances: + if guesses[j][inst] == FAILED or cell.empty: continue - result = type.test(cell.value) - weight = WEIGHTS[type.__class__] - if strict and (result == -1) and not isinstance(type, String): - guesses[j][type] = FAILED + result = inst.test(cell.value) + weight = WEIGHTS[inst.__class__] + if strict and (result == -1) and not isinstance(inst, String): + guesses[j][inst] = FAILED elif result == 1: - guesses[j][type] += weight + guesses[j][inst] += weight _columns = [] for guess in guesses: diff --git a/test/test_guessing.py b/test/test_guessing.py index ee8924a..4e8dcac 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -93,24 +93,26 @@ def test_strict_type_guessing_with_large_file(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) - assert_equal(guessed_types, [ - Integer(), String(), String(), String(), + assumed_types = [Integer(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), Integer(), String(), - String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), Integer(), String(), Decimal(), Decimal(), String(), String(), String(), String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), + String(), String(), String(), Integer(), String(), Integer(), String(), String(), String(), String(), String(), String(), String(), String(), Integer(), String(), String(), String(), String(), String(), String(), 
String(), String(), String(), String(), String(), String(), String(), String(), String(), + Integer(), String(), String(), String(), String(), String(), String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), - String(), String(), String(), String(), String(), String(), - String(), String(), String(), Date('%d/%m/%y'), Date('%d/%m/%y'), - String(), String(), String()]) + String(), String(), String(), String(), String(), Integer(), + String(), Date('%d/%m/%y'), Date('%d/%m/%y'), Date('%d/%m/%y'), + Date('%d/%m/%y'), String(), String(), String()] + # for (ta, tb) in zip(guessed_types, assumed_types): + # print (ta, tb) + assert_equal(guessed_types, assumed_types) def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') From 10576f3454d40c42a73213034c7e8f8f35ed9f0b Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:35:33 +0200 Subject: [PATCH 18/35] Move test utilities to a specific module. --- test/__init__.py | 6 ------ test/test_any.py | 2 +- test/test_guessing.py | 21 +++++++++++++++++++-- test/test_properties.py | 2 +- test/test_read.py | 2 +- test/test_rowset.py | 2 +- test/test_stream.py | 2 +- test/test_tableset.py | 2 +- test/util.py | 6 ++++++ 9 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 test/util.py diff --git a/test/__init__.py b/test/__init__.py index 060bb3e..e69de29 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,6 +0,0 @@ -import os - - -def horror_fobj(name): - fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) - return open(fn, 'rb') diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..ce39b1c 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index 4e8dcac..024558e 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -1,8 +1,10 @@ # -*- coding: utf-8 -*- import unittest import io +# import cProfile +# from pstats import Stats -from . import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr from nose.plugins.skip import SkipTest from nose.tools import assert_equal @@ -12,6 +14,17 @@ class TypeGuessTest(unittest.TestCase): + + # def setUp(self): + # self.pr = cProfile.Profile() + # self.pr.enable() + + # def tearDown(self): + # p = Stats(self.pr) + # p.strip_dirs() + # p.sort_stats('cumtime') + # p.print_stats() + @attr("slow") def test_type_guess(self): csv_file = io.BytesIO(b''' @@ -122,7 +135,7 @@ def test_file_with_few_strings_among_integers(self): types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) - print(guessed_types) + # print(guessed_types) assert_equal(guessed_types, [ Integer(), Integer(), Integer(), Integer(), Integer(), Integer(), @@ -141,3 +154,7 @@ def helper(value): assert_equal(helper('123.0'), False) assert_equal(helper(123.1), False) assert_equal(helper('123.1'), False) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_properties.py b/test/test_properties.py index 5ec3f6d..0a7ca09 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . 
import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index 2901c67..092e744 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest diff --git a/test/test_rowset.py b/test/test_rowset.py index 4b47e7c..52e3928 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index 335022e..2ed6efd 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -4,7 +4,7 @@ import requests import six.moves.urllib as urllib -from . import horror_fobj +from util import horror_fobj import httpretty from nose.tools import assert_equal diff --git a/test/test_tableset.py b/test/test_tableset.py index 4c2148c..d03de88 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError diff --git a/test/util.py b/test/util.py new file mode 100644 index 0000000..060bb3e --- /dev/null +++ b/test/util.py @@ -0,0 +1,6 @@ +import os + + +def horror_fobj(name): + fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) + return open(fn, 'rb') From 7da15bfdce4191b3f75325d24728080da0b14d14 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:36:03 +0200 Subject: [PATCH 19/35] =?UTF-8?q?Move=20the=20buffered=20reader=20to=20it?= =?UTF-8?q?=E2=80=99s=20own=20module.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- messytables/__init__.py | 3 +- messytables/any.py | 2 +- messytables/buffered.py | 87 +++++++++++++++++++++++++++++++++ messytables/commas.py | 4 +- messytables/core.py | 104 ++++------------------------------------ 5 files changed, 101 insertions(+), 99 deletions(-) create mode 100644 messytables/buffered.py diff --git a/messytables/__init__.py b/messytables/__init__.py index 53e1dc6..014a095 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -5,7 +5,8 @@ from messytables.types import type_guess, types_processor from messytables.error import ReadError -from messytables.core import Cell, TableSet, RowSet, seekable_stream +from messytables.buffered import seekable_stream +from messytables.core import Cell, TableSet, RowSet from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet diff --git a/messytables/any.py b/messytables/any.py index 13cac56..9d305ee 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -2,7 +2,7 @@ from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet from messytables import HTMLTableSet, ODSTableSet -from messytables.core import seekable_stream +from messytables.buffered import seekable_stream from messytables.error import ReadError diff --git a/messytables/buffered.py b/messytables/buffered.py new file mode 100644 index 0000000..60335cb --- /dev/null +++ 
b/messytables/buffered.py @@ -0,0 +1,87 @@ +import io + + +def seekable_stream(fileobj): + try: + fileobj.seek(0) + # if we got here, the stream is seekable + except: + # otherwise seek failed, so slurp in stream and wrap + # it in a BytesIO + fileobj = BufferedFile(fileobj) + return fileobj + + +class BufferedFile(object): + """A buffered file that preserves the beginning of a stream.""" + + def __init__(self, fp, buffer_size=2048): + self.data = io.BytesIO() + self.fp = fp + self.offset = 0 + self.len = 0 + self.fp_offset = 0 + self.buffer_size = buffer_size + + def _next_line(self): + try: + return self.fp.readline() + except AttributeError: + return next(self.fp) + + def _read(self, n): + return self.fp.read(n) + + @property + def _buffer_full(self): + return self.len >= self.buffer_size + + def readline(self): + if self.len < self.offset < self.fp_offset: + raise BufferError('Line is not available anymore') + if self.offset >= self.len: + line = self._next_line() + self.fp_offset += len(line) + + self.offset += len(line) + + if not self._buffer_full: + self.data.write(line) + self.len += len(line) + else: + line = self.data.readline() + self.offset += len(line) + return line + + def read(self, n=-1): + if n == -1: + # if the request is to do a complete read, then do a complete + # read. + self.data.seek(self.offset) + return self.data.read(-1) + self.fp.read(-1) + + if self.len < self.offset < self.fp_offset: + raise BufferError('Data is not available anymore') + if self.offset >= self.len: + byte = self._read(n) + self.fp_offset += len(byte) + + self.offset += len(byte) + + if not self._buffer_full: + self.data.write(byte) + self.len += len(byte) + else: + byte = self.data.read(n) + self.offset += len(byte) + return byte + + def tell(self): + return self.offset + + def seek(self, offset): + if self.len < offset < self.fp_offset: + raise BufferError('Cannot seek because data is not buffered here') + self.offset = offset + if offset < self.len: + self.data.seek(offset) diff --git a/messytables/commas.py b/messytables/commas.py index 29fa243..4e10b55 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -4,13 +4,13 @@ from six import text_type, binary_type, PY2 -from messytables.core import seekable_stream +from messytables.buffered import seekable_stream from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError class UTF8Recoder: - """ Iterator that reads an encoded stream and re-encodes it to UTF-8. 
""" + """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" # maps between chardet encoding and codecs bom keys BOM_MAPPING = { diff --git a/messytables/core.py b/messytables/core.py index 2042262..8915229 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,4 +1,3 @@ -import io from collections import Mapping try: # python 2.7: @@ -13,93 +12,6 @@ from messytables.error import TableError, NoSuchPropertyError -def seekable_stream(fileobj): - try: - fileobj.seek(0) - # if we got here, the stream is seekable - except: - # otherwise seek failed, so slurp in stream and wrap - # it in a BytesIO - fileobj = BufferedFile(fileobj) - return fileobj - - -class BufferedFile(object): - ''' A buffered file that preserves the beginning of - a stream up to buffer_size - ''' - def __init__(self, fp, buffer_size=2048): - self.data = io.BytesIO() - self.fp = fp - self.offset = 0 - self.len = 0 - self.fp_offset = 0 - self.buffer_size = buffer_size - - def _next_line(self): - try: - return self.fp.readline() - except AttributeError: - return next(self.fp) - - def _read(self, n): - return self.fp.read(n) - - @property - def _buffer_full(self): - return self.len >= self.buffer_size - - def readline(self): - if self.len < self.offset < self.fp_offset: - raise BufferError('Line is not available anymore') - if self.offset >= self.len: - line = self._next_line() - self.fp_offset += len(line) - - self.offset += len(line) - - if not self._buffer_full: - self.data.write(line) - self.len += len(line) - else: - line = self.data.readline() - self.offset += len(line) - return line - - def read(self, n=-1): - if n == -1: - # if the request is to do a complete read, then do a complete - # read. - self.data.seek(self.offset) - return self.data.read(-1) + self.fp.read(-1) - - if self.len < self.offset < self.fp_offset: - raise BufferError('Data is not available anymore') - if self.offset >= self.len: - byte = self._read(n) - self.fp_offset += len(byte) - - self.offset += len(byte) - - if not self._buffer_full: - self.data.write(byte) - self.len += len(byte) - else: - byte = self.data.read(n) - self.offset += len(byte) - return byte - - def tell(self): - return self.offset - - def seek(self, offset): - if self.len < offset < self.fp_offset: - raise BufferError('Cannot seek because data is not buffered here') - self.offset = offset - if offset < self.len: - self.data.seek(offset) - - class CoreProperties(Mapping): KEYS = [] @@ -117,10 +29,12 @@ def __len__(self): class Cell(object): - """ A cell is the basic value type. It always has a ``value`` (that - may be ``None`` and may optionally also have a type and column name - associated with it. If no ``type`` is set, the String type is set - but no type conversion is set. """ + """A cell is the basic value type. + + It always has a ``value`` (that may be ``None`` and may optionally + also have a type and column name associated with it. If no ``type`` + is set, the String type is set but no type conversion is set. + """ def __init__(self, value, column=None, type=None): if type is None: @@ -138,7 +52,7 @@ def __repr__(self): @property def empty(self): - """ Stringify the value and check that it has a length. """ + """Stringify the value and check that it has a length.""" if self.value is None: return True value = self.value @@ -150,7 +64,7 @@ def empty(self): @property def properties(self): - """ Source-specific information. Only a placeholder here. """ + """Source-specific information. 
Only a placeholder here.""" return CoreProperties() @property @@ -240,7 +154,7 @@ def register_processor(self, processor): self._processors.append(processor) def __iter__(self, sample=False): - """ Apply processors to the row data. """ + """Apply processors to the row data.""" for row in self.raw(sample=sample): for processor in self._processors: row = processor(self, row) From ccb094c1f28dce805468ba8c544bc41e37b39a05 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 18:36:15 +0200 Subject: [PATCH 20/35] Move guesser class to typecast. --- .gitignore | 3 +- messytables/types.py | 71 +++++++++++--------------------------------- setup.py | 2 +- 3 files changed, 21 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 0b3fb13..2df0131 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ *.swp *.egg-info *.pyc +*.eggs *.DS_Store */_build/* *.py~ *.~lock.*# .coverage - +dist/* pyenv3 diff --git a/messytables/types.py b/messytables/types.py index 24813ec..815d846 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,68 +1,33 @@ -from collections import defaultdict - import six +from typecast import guesser, GUESS_TYPES -from typecast import String, Integer, Decimal, Boolean, Date, DateTime - -WEIGHTS = { - String: 1, - Integer: 6, - Decimal: 3, - Boolean: 7, - Date: 4, - DateTime: 5 -} -TYPES = [String, Decimal, Integer, Boolean, Date, DateTime] -FAILED = 'failed' +def type_guess(rows, types=GUESS_TYPES, strict=False): + """Guess the best type for a given row set. -def type_guess(rows, types=TYPES, strict=False): - """ The type guesser aggregates the number of successful - conversions of each column to each type, weights them by a - fixed type priority and select the most probable type for - each column based on that figure. It returns a list of - ``CellType``. Empty cells are ignored. + The type guesser aggregates the number of successful conversions of each + column to each type, weights them by a fixed type priority and select the + most probable type for each column based on that figure. It returns a list + of ``CellType``. Empty cells are ignored. - Strict means that a type will not be guessed - if parsing fails for a single cell in the column.""" - guesses = [] - type_instances = [i for t in types for i in t.instances()] + Strict means that a type will not be guessed if parsing fails for a single + cell in the column. + """ + guessers = [] for i, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - guesses.append(defaultdict(int)) + for _ in range(len(row) - len(guessers)): + guessers.append(guesser(types=types, strict=strict)) for j, cell in enumerate(row): # add string guess so that we have at least one guess - guesses[j][String()] = guesses[j].get(String(), 0) - for inst in type_instances: - if guesses[j][inst] == FAILED or cell.empty: - continue - result = inst.test(cell.value) - weight = WEIGHTS[inst.__class__] - if strict and (result == -1) and not isinstance(inst, String): - guesses[j][inst] = FAILED - elif result == 1: - guesses[j][inst] += weight - - _columns = [] - for guess in guesses: - # this first creates an array of tuples because we want the types to be - # sorted. 
Even though it is not specified, python chooses the first - # element in case of a tie - # See: http://stackoverflow.com/a/6783101/214950 - guesses_tuples = [(t, guess[t]) for t in type_instances - if t in guess and guess[t] != FAILED] - # print 'GUESSES', zip(row, guesses_tuples) - _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) - return _columns + guessers[j].add(cell.value) + return [g.best for g in guessers] def types_processor(types, strict=False): - """ Apply the column types set on the instance to the - current row, attempting to cast each cell to the specified - type. + """Apply the column types to the each row. - Strict means that casting errors are not ignored""" + Strict means that casting errors are not ignored. + """ def apply_types(row_set, row): if types is None: return row diff --git a/setup.py b/setup.py index 218ad4f..5b729de 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast', + 'typecast>=0.3.0', 'six', 'ordereddict', ], From 2565632990a64cb7b1235cfdb3c14665367aac10 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:06:54 +0200 Subject: [PATCH 21/35] Factor out CSV re-coder --- messytables/buffered.py | 8 +++-- messytables/commas.py | 79 +++++------------------------------------ messytables/text.py | 69 +++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 74 deletions(-) create mode 100644 messytables/text.py diff --git a/messytables/buffered.py b/messytables/buffered.py index 60335cb..dea877f 100644 --- a/messytables/buffered.py +++ b/messytables/buffered.py @@ -1,21 +1,23 @@ import io +BUFFER_SIZE = 4096 + def seekable_stream(fileobj): try: fileobj.seek(0) # if we got here, the stream is seekable + return fileobj except: # otherwise seek failed, so slurp in stream and wrap # it in a BytesIO - fileobj = BufferedFile(fileobj) - return fileobj + return BufferedFile(fileobj) class BufferedFile(object): """A buffered file that preserves the beginning of a stream.""" - def __init__(self, fp, buffer_size=2048): + def __init__(self, fp, buffer_size=BUFFER_SIZE): self.data = io.BytesIO() self.fp = fp self.offset = 0 diff --git a/messytables/commas.py b/messytables/commas.py index 4e10b55..c89e44d 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,76 +1,13 @@ import csv -import codecs -import chardet -from six import text_type, binary_type, PY2 +from six import text_type, PY2 -from messytables.buffered import seekable_stream +from messytables.buffered import seekable_stream, BUFFER_SIZE +from messytables.text import UTF8Recoder, to_unicode_or_bust from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError - -class UTF8Recoder: - """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" - - # maps between chardet encoding and codecs bom keys - BOM_MAPPING = { - 'utf-16le': 'BOM_UTF16_LE', - 'utf-16be': 'BOM_UTF16_BE', - 'utf-32le': 'BOM_UTF32_LE', - 'utf-32be': 'BOM_UTF32_BE', - 'utf-8': 'BOM_UTF8', - 'utf-8-sig': 'BOM_UTF8', - - } - - def __init__(self, f, encoding): - sample = f.read(2000) - if not encoding: - results = chardet.detect(sample) - encoding = results['encoding'] - if not encoding: - # Don't break, just try and load the data with - # a semi-sane encoding - encoding = 'utf-8' - f.seek(0) - self.reader = codecs.getreader(encoding)(f, 'ignore') - - # The reader only skips a BOM if the encoding isn't explicit about its - # endianness (i.e. 
if encoding is UTF-16 a BOM is handled properly - # and taken out, but if encoding is UTF-16LE a BOM is ignored). - # However, if chardet sees a BOM it returns an encoding with the - # endianness explicit, which results in the codecs stream leaving the - # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} - # encodings, check for a BOM and remove it if it's there. - if encoding.lower() in self.BOM_MAPPING: - bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) - if bom: - # Try to read the BOM, which is a byte sequence, from - # the underlying stream. If all characters match, then - # go on. Otherwise when a character doesn't match, seek - # the stream back to the beginning and go on. - for c in bom: - if f.read(1) != c: - f.seek(0) - break - - def __iter__(self): - return self - - def __next__(self): - line = self.reader.readline() - if not line or line == '\0': - raise StopIteration - result = line.encode("utf-8") - return result - - next = __next__ - - -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, binary_type): - obj = text_type(obj, encoding) - return obj +DELIMITERS = ['\t', ',', ';', '|'] class CSVTableSet(TableSet): @@ -91,7 +28,7 @@ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, self.skipinitialspace = skipinitialspace def make_tables(self): - """ Return the actual CSV table. """ + """Return the actual CSV table.""" return [CSVRowSet(self.name, self.fileobj, delimiter=self.delimiter, quotechar=self.quotechar, @@ -112,12 +49,12 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=None, lineterminator=None, skipinitialspace=None): self.name = name - seekable_fileobj = seekable_stream(fileobj) - self.fileobj = UTF8Recoder(seekable_fileobj, encoding) + self.fh = seekable_stream(fileobj) + self.fileobj = UTF8Recoder(self.fh, encoding) def fake_ilines(fobj): for row in fobj: - yield row.decode('utf-8') + yield row.decode('utf-8') self.lines = fake_ilines(self.fileobj) self._sample = [] self.delimiter = delimiter diff --git a/messytables/text.py b/messytables/text.py new file mode 100644 index 0000000..fe8121d --- /dev/null +++ b/messytables/text.py @@ -0,0 +1,69 @@ +import codecs +import chardet +from six import text_type, binary_type + +from messytables.buffered import BUFFER_SIZE + + +class UTF8Recoder: + """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" + + # maps between chardet encoding and codecs bom keys + BOM_MAPPING = { + 'utf-16le': 'BOM_UTF16_LE', + 'utf-16be': 'BOM_UTF16_BE', + 'utf-32le': 'BOM_UTF32_LE', + 'utf-32be': 'BOM_UTF32_BE', + 'utf-8': 'BOM_UTF8', + 'utf-8-sig': 'BOM_UTF8', + + } + + def __init__(self, f, encoding): + sample = f.read(BUFFER_SIZE) + if not encoding: + results = chardet.detect(sample) + encoding = results['encoding'] + if not encoding: + # Don't break, just try and load the data with + # a semi-sane encoding + encoding = 'utf-8' + f.seek(0) + self.reader = codecs.getreader(encoding)(f, 'ignore') + + # The reader only skips a BOM if the encoding isn't explicit about its + # endianness (i.e. if encoding is UTF-16 a BOM is handled properly + # and taken out, but if encoding is UTF-16LE a BOM is ignored). + # However, if chardet sees a BOM it returns an encoding with the + # endianness explicit, which results in the codecs stream leaving the + # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} + # encodings, check for a BOM and remove it if it's there. 
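For illustration only (not part of the patch), the BOM behaviour described in the comment block above, shown with the stdlib codecs: an endianness-explicit codec leaves the BOM in the decoded text, while the endianness-agnostic one consumes it, which is why the bytes are checked and skipped manually below.

    import codecs

    raw = codecs.BOM_UTF16_LE + u'A'.encode('utf-16-le')
    assert raw.decode('utf-16') == u'A'           # BOM consumed
    assert raw.decode('utf-16-le') == u'\ufeffA'  # BOM left in the text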
+ if encoding.lower() in self.BOM_MAPPING: + bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) + if bom: + # Try to read the BOM, which is a byte sequence, from + # the underlying stream. If all characters match, then + # go on. Otherwise when a character doesn't match, seek + # the stream back to the beginning and go on. + for c in bom: + if f.read(1) != c: + f.seek(0) + break + + def __iter__(self): + return self + + def __next__(self): + line = self.reader.readline() + if not line or line == '\0': + raise StopIteration + result = line.encode("utf-8") + return result + + next = __next__ + + +def to_unicode_or_bust(obj, encoding='utf-8'): + if isinstance(obj, binary_type): + obj = text_type(obj, encoding) + return obj From b63baeb647e61bf4d593f0ac79522dadee1f8a18 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:08:52 +0200 Subject: [PATCH 22/35] use cchardet --- messytables/text.py | 5 ++++- setup.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/messytables/text.py b/messytables/text.py index fe8121d..4ebec79 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -1,5 +1,8 @@ import codecs -import chardet +try: + import cchardet as chardet +except ImportError: + import chardet from six import text_type, binary_type from messytables.buffered import BUFFER_SIZE diff --git a/setup.py b/setup.py index 5b729de..2da635c 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'xlrd>=0.8.0', 'python-magic>=0.4.12', # used for type guessing 'chardet>=2.3.0', + 'cchardet', 'lxml>=3.2', 'requests>=2.0', 'html5lib', From 2e4b96c0ffa87401e83e83629b7ad1db72d3287d Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:23:37 +0200 Subject: [PATCH 23/35] simplify the handling of CSV dialects --- messytables/commas.py | 42 ++++++++++++++---------------------------- messytables/text.py | 7 +------ setup.py | 2 +- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/messytables/commas.py b/messytables/commas.py index c89e44d..f0b79e3 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -9,13 +9,16 @@ DELIMITERS = ['\t', ',', ';', '|'] +# Fix the maximum field size to something a little larger +csv.field_size_limit(256000) + class CSVTableSet(TableSet): """ A CSV table set. Since CSV is always just a single table, this is just a pass-through for the row set. """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, - encoding=None, window=None, doublequote=None, + encoding=None, window=None, doublequote=True, lineterminator=None, skipinitialspace=None, **kw): self.fileobj = seekable_stream(fileobj) self.name = name or 'table' @@ -46,7 +49,7 @@ class CSVRowSet(RowSet): fragment. 
""" def __init__(self, name, fileobj, delimiter=None, quotechar=None, - encoding='utf-8', window=None, doublequote=None, + encoding='utf-8', window=None, doublequote=True, lineterminator=None, skipinitialspace=None): self.name = name self.fh = seekable_stream(fileobj) @@ -75,32 +78,19 @@ def _dialect(self): delim = '\n' # NATIVE sample = delim.join(self._sample) try: - dialect = csv.Sniffer().sniff(sample, - delimiters=['\t', ',', ';', '|']) - dialect.delimiter = str(dialect.delimiter) - dialect.quotechar = str(dialect.quotechar) - dialect.lineterminator = delim + dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) + dialect.delimiter = self.delimiter or str(dialect.delimiter) + dialect.quotechar = self.quotechar or str(dialect.quotechar) + dialect.lineterminator = self.lineterminator or delim + if self.skipinitialspace is not None: + dialect.skipinitialspace = self.skipinitialspace + if self.lineterminator is not None: + dialect.lineterminator = self.lineterminator dialect.doublequote = True return dialect except csv.Error: return csv.excel - @property - def _overrides(self): - # some variables in the dialect can be overridden - d = {} - if self.delimiter: - d['delimiter'] = self.delimiter - if self.quotechar: - d['quotechar'] = self.quotechar - if self.doublequote: - d['doublequote'] = self.doublequote - if self.lineterminator: - d['lineterminator'] = self.lineterminator - if self.skipinitialspace is not None: - d['skipinitialspace'] = self.skipinitialspace - return d - def raw(self, sample=False): def rows(): for line in self._sample: @@ -115,12 +105,8 @@ def rows(): else: yield line - # Fix the maximum field size to something a little larger - csv.field_size_limit(256000) - try: - for row in csv.reader(rows(), - dialect=self._dialect, **self._overrides): + for row in csv.reader(rows(), dialect=self._dialect): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: if u'newline inside string' in text_type(err) and sample: diff --git a/messytables/text.py b/messytables/text.py index 4ebec79..17c8097 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -25,12 +25,7 @@ class UTF8Recoder: def __init__(self, f, encoding): sample = f.read(BUFFER_SIZE) if not encoding: - results = chardet.detect(sample) - encoding = results['encoding'] - if not encoding: - # Don't break, just try and load the data with - # a semi-sane encoding - encoding = 'utf-8' + encoding = chardet.detect(sample).get('encoding') or 'utf-8' f.seek(0) self.reader = codecs.getreader(encoding)(f, 'ignore') diff --git a/setup.py b/setup.py index 2da635c..33a9edb 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast>=0.3.0', + 'typecast>=0.3.1', 'six', 'ordereddict', ], From f3733258a354d438b57ba81b85d40dbfb9267718 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:34:03 +0200 Subject: [PATCH 24/35] try relative imports with py3 --- test/test_any.py | 2 +- test/test_guessing.py | 2 +- test/test_properties.py | 2 +- test/test_read.py | 2 +- test/test_rowset.py | 2 +- test/test_stream.py | 2 +- test/test_tableset.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_any.py b/test/test_any.py index ce39b1c..bfb37a1 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from 
messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index 024558e..141a3ff 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -4,7 +4,7 @@ # import cProfile # from pstats import Stats -from util import horror_fobj +from .util import horror_fobj from nose.plugins.attrib import attr from nose.plugins.skip import SkipTest from nose.tools import assert_equal diff --git a/test/test_properties.py b/test/test_properties.py index 0a7ca09..b4e4a0c 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index 092e744..ac9b384 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest diff --git a/test/test_rowset.py b/test/test_rowset.py index 52e3928..39077d9 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index 2ed6efd..f2e5723 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -4,7 +4,7 @@ import requests import six.moves.urllib as urllib -from util import horror_fobj +from .util import horror_fobj import httpretty from nose.tools import assert_equal diff --git a/test/test_tableset.py b/test/test_tableset.py index d03de88..9d0c127 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from util import horror_fobj +from .util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError From 96549a9e9a21bfbba9ac54663486316dd36ab3c7 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:59:48 +0200 Subject: [PATCH 25/35] PEP8. --- messytables/core.py | 57 +++++++++++++++++++++++------------------- messytables/error.py | 11 +++----- messytables/headers.py | 24 ++++++++++-------- messytables/html.py | 40 +++++++++-------------------- messytables/ods.py | 31 ++++++++++++----------- messytables/util.py | 7 +++--- messytables/zip.py | 4 +-- 7 files changed, 83 insertions(+), 91 deletions(-) diff --git a/messytables/core.py b/messytables/core.py index 8915229..7adc9df 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -69,21 +69,21 @@ def properties(self): @property def topleft(self): - """ - Is the cell the top-left of a span? Non-spanning cells are the top left. - - This is used for example in HTML generation where the top left cell - is the only one which is written into the output representation. + """Non-spanning cells are the top left. + This is used for example in HTML generation where the top left + cell is the only one which is written into the output representation. In absense of other knowledge, we assume that all cells are top left. """ + # This seems oddly over-specific, can we solve it otherwise? 
return True class TableSet(object): - """ A table set is used for data formats in which multiple tabular - objects are bundled. This might include relational databases and - workbooks used in spreadsheet software (Excel, LibreOffice). + """A table set bundles multiple tabular objects. + + This might include relational databases and workbooks used in spreadsheet + software (Excel, LibreOffice). For each format, we derive from this abstract base class, providing a constructor that takes a file object and tables() that returns each table. @@ -92,14 +92,14 @@ class TableSet(object): On any fatal errors, it should raise messytables.ReadError """ + def __init__(self, fileobj): - """ Store the fileobj, and perhaps all or part of the file. """ + """Store the fileobj, and perhaps all or part of the file.""" pass @property def tables(self): - """ Return a listing of tables (i.e. RowSets) in the ``TableSet``. - Each table has a name. """ + """Get a listing of ``RowSets``.""" if getattr(self, "_tables", None) is None: self._tables = self.make_tables() return self._tables @@ -107,8 +107,9 @@ def tables(self): def make_tables(self): raise NotImplementedError("make_tables() not implemented on {0}" .format(type(self))) + def __getitem__(self, name): - """ Return a RowSet based on the name given """ + """Return a RowSet based on the name given.""" matching = [table for table in self.tables if table.name == name] if not matching: raise TableError("No table called %r" % name) @@ -118,16 +119,18 @@ def __getitem__(self, name): @classmethod def from_fileobj(cls, fileobj, *args, **kwargs): - """ Deprecated, only for compatibility reasons """ + """Deprecated, only for compatibility reasons.""" return cls(fileobj, *args, **kwargs) class RowSet(object): - """ A row set (aka: table) is a simple wrapper for an iterator of - rows (which in turn is a list of ``Cell`` objects). The main table - iterable can only be traversed once, so on order to allow analytics - like type and header guessing on the data, a sample of ``window`` - rows is read, cached, and made available. + """A single table, which allows iterating over individual rows. + + A row set (aka: table) is a simple wrapper for an iterator of rows + (which in turn is a list of ``Cell`` objects). The main table iterable + can only be traversed once, so on order to allow analytics like type and + header guessing on the data, a sample of ``window`` rows is read, cached, + and made available. On any fatal errors, it should raise messytables.ReadError """ @@ -147,10 +150,11 @@ def get_types(self): types = property(get_types, set_types) def register_processor(self, processor): - """ Register a stream processor to be used on each row. A - processor is a function called with the ``RowSet`` as its - first argument and the row to be processed as the second - argument. """ + """Register a stream processor to be used on each row. + + A processor is a function called with the ``RowSet`` as its first + argument and the row to be processed as the second argument. + """ self._processors.append(processor) def __iter__(self, sample=False): @@ -171,10 +175,11 @@ def sample(self): return self.__iter__(sample=True) def dicts(self, sample=False): - """ Return a representation of the data as an iterator of - ordered dictionaries. This is less specific than the cell - format returned by the generic iterator but only gives a - subset of the information. """ + """Return the table data as an iterator of ordered dictionaries. 
+ + This is less specific than the cell format returned by the generic + iterator but only gives a subset of the information. + """ generator = self.sample if sample else self for row in generator: yield OrderedDict([(c.column, c.value) for c in row]) diff --git a/messytables/error.py b/messytables/error.py index 3df3f63..255f4ab 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -1,18 +1,15 @@ class MessytablesError(Exception): - """ A generic error to inherit from. """ + """A generic error to inherit from.""" class ReadError(MessytablesError): - """ Error reading the file/stream in terms of the expected format. """ - pass + """Error reading the file/stream in terms of the expected format.""" class TableError(MessytablesError, LookupError): - """ Couldn't identify correct table. """ - pass + """Couldn't identify correct table.""" class NoSuchPropertyError(MessytablesError, KeyError): - """ The requested property doesn't exist. """ - pass + """The requested property doesn't exist.""" diff --git a/messytables/headers.py b/messytables/headers.py index 04c05d2..0b20453 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -6,9 +6,10 @@ def column_count_modal(rows): - """ Return the modal value of columns in the row_set's - sample. This can be assumed to be the number of columns - of the table. """ + """Return the modal value of columns in the row_set's sample. + + This can be assumed to be the number of columns of the table. + """ counts = defaultdict(int) for row in rows: length = len([c for c in row if not c.empty]) @@ -20,7 +21,8 @@ def column_count_modal(rows): def headers_guess(rows, tolerance=1): - """ Guess the offset and names of the headers of the row set. + """Guess the offset and names of the headers of the row set. + This will attempt to locate the first row within ``tolerance`` of the mode of the number of rows in the row set sample. @@ -40,9 +42,10 @@ def headers_guess(rows, tolerance=1): def headers_processor(headers): - """ Add column names to the cells in a row_set. If no header is - defined, use an autogenerated name. """ + """Add column names to the cells in a row_set. + If no header is defined, use an autogenerated name. + """ def apply_headers(row_set, row): _row = [] pairs = six.moves.zip_longest(row, headers) @@ -59,11 +62,12 @@ def apply_headers(row_set, row): def headers_make_unique(headers, max_length=None): - """Make sure the header names are unique. For non-unique - columns, append 1, 2, 3, ... after the name. If max_length - is set, truncate the original string so that the headers are - unique up to that length.""" + """Make sure the header names are unique. + For non-unique columns, append 1, 2, 3, ... after the name. If max_length + is set, truncate the original string so that the headers are unique up to + that length. + """ headers = [h.strip() for h in headers] new_digits_length = 0 diff --git a/messytables/html.py b/messytables/html.py index 4f02f26..20c0f35 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -15,9 +15,8 @@ def fromstring(s): class HTMLTableSet(TableSet): - """ - A TableSet from a HTML document. - """ + """A TableSet from a HTML document.""" + def __init__(self, fileobj=None, filename=None, window=None, **kw): if filename is not None: @@ -45,9 +44,7 @@ def __init__(self, fileobj=None, filename=None, window=None, **kw): "other tables. This is a bug." # avoid infinite loops def make_tables(self): - """ - Return a listing of tables (as HTMLRowSets) in the table set. 
- """ + """Return a listing of tables (as HTMLRowSets) in the table set.""" def rowset_name(rowset, table_index): return "Table {0} of {1}".format(table_index + 1, len(self.htmltables)) @@ -71,9 +68,8 @@ def insert_blank_cells(row, blanks): class HTMLRowSet(RowSet): - """ - A RowSet representing a HTML table. - """ + """A RowSet representing a HTML table.""" + def __init__(self, name, sheet, window=None): self.name = name self.sheet = sheet @@ -81,11 +77,8 @@ def __init__(self, name, sheet, window=None): super(HTMLRowSet, self).__init__() def in_table(self, els): - """ - takes a list of xpath elements and returns only those - whose parent table is this one - """ - + # Accept a list of xpath elements and returns only those + # whose parent table is this one return [e for e in els if self.sheet in e.xpath("./ancestor::table[1]")] @@ -137,17 +130,14 @@ def identify_anatomy(tag): class FakeHTMLCell(Cell): + """FakeHTMLCells are not present because of column or row spannning.""" + def __init__(self): super(FakeHTMLCell, self).__init__("") @property def topleft(self): - """ - FakeHTMLCells are those which are not physically present in the HTML - because of column or row spannning. - - See also: HTMLCell.topleft - """ + """See also: HTMLCell.topleft.""" return False @@ -169,12 +159,7 @@ def __init__(self, value=None, column=None, type=None, source=None): @property def topleft(self): - """ - HTMLCells are those which are physically present in the HTML. They are - always the top-left in their span. - - See also: FakeHTMLCell.topleft - """ + """See also: FakeHTMLCell.topleft.""" return True @property @@ -198,7 +183,7 @@ def text_from_element(elem): """ builder = [] for x in elem.iter(): - #print x.tag, x.attrib, x.text, x.tail + # print x.tag, x.attrib, x.text, x.tail if is_invisible_text(x): cell_str = x.tail or '' # handle None values. else: @@ -216,7 +201,6 @@ def is_invisible_text(elem): if 'style' in elem.attrib: if 'display:none' in elem.attrib['style']: flag = True - return flag diff --git a/messytables/ods.py b/messytables/ods.py index 4351c85..da35d57 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -20,15 +20,15 @@ class ODSTableSet(TableSet): - """ - A wrapper around ODS files. Because they are zipped and the info we want - is in the zipped file as content.xml we must ensure that we either have - a seekable object (local file) or that we retrieve all of the content from - the remote URL. + """A wrapper around ODS files. + + Because they are zipped and the info we want is in the zipped file as + content.xml we must ensure that we either have a seekable object (local + file) or that we retrieve all of the content from the remote URL. """ def __init__(self, fileobj, window=None, **kw): - '''Initialize the object. + """Initialize the object. :param fileobj: may be a file path or a file-like object. Note the file-like object *must* be in binary mode and must be seekable (it will @@ -40,7 +40,7 @@ def __init__(self, fileobj, window=None, **kw): To get a seekable file you *cannot* use messytables.core.seekable_stream as it does not support the full seek functionality. - ''' + """ if hasattr(fileobj, 'read'): # wrap in a StringIO so we do not have hassle with seeks and # binary etc (see notes to __init__ above) @@ -54,13 +54,12 @@ def __init__(self, fileobj, window=None, **kw): zf.close() def make_tables(self): - """ - Return the sheets in the workbook. + """Return the sheets in the workbook. 
- A regex is used for this to avoid having to: + A regex is used for this to avoid having to: - 1. load large the entire file into memory, or - 2. SAX parse the file more than once + 1. load large the entire file into memory, or + 2. SAX parse the file more than once """ namespace_tags = self._get_namespace_tags() sheets = [m.groups(0)[0] @@ -77,8 +76,10 @@ def _get_namespace_tags(self): class ODSRowSet(RowSet): - """ ODS support for a single sheet in the ODS workbook. Unlike - the CSV row set this is not a streaming operation. """ + """ODS support for a single sheet in the ODS workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, sheet, window=None, namespace_tags=None): self.sheet = sheet @@ -119,7 +120,7 @@ def __init__(self, sheet, window=None, namespace_tags=None): super(ODSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. """ + """Iterate over all rows in this sheet.""" rows = ODS_ROW_MATCH.findall(self.sheet) for row in rows: diff --git a/messytables/util.py b/messytables/util.py index df5f2fa..a83d456 100644 --- a/messytables/util.py +++ b/messytables/util.py @@ -1,7 +1,8 @@ def offset_processor(offset): - """ Skip ``offset`` from the given iterator. This can - be used in combination with the ``headers_processor`` to + """Skip ``offset`` from the given iterator. + + This can be used in combination with the ``headers_processor`` to apply the result of a header scan to the table. :param offset: Offset to be skipped @@ -17,7 +18,7 @@ def apply_offset(row_set, row): def null_processor(nulls): - """ Replaces every occurrence of items from `nulls` with None. + """Replace every occurrence of items from `nulls` with None. :param nulls: List of items to be replaced :type nulls: list diff --git a/messytables/zip.py b/messytables/zip.py index 680b44a..a15c90f 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -5,10 +5,10 @@ class ZIPTableSet(TableSet): - """ Reads TableSets from inside a ZIP file """ + """Reads TableSets from inside a ZIP file.""" def __init__(self, fileobj, **kw): - """ On error it will raise ReadError. """ + """On error it will raise ReadError.""" from messytables.any import any_tableset tables = [] found = [] From 910b6c2f52b02070f2a6548d43ef2727d759aab9 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 19:59:56 +0200 Subject: [PATCH 26/35] Simplify JTS code. --- messytables/jts.py | 48 +++++++++++++++------------------------------- setup.py | 2 +- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/messytables/jts.py b/messytables/jts.py index e2aeb61..1bafb68 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -1,48 +1,30 @@ -''' -Convert a rowset to the json table schema +"""Convert a rowset to the json table schema. 
+ (http://www.dataprotocols.org/en/latest/json-table-schema.html) -''' +""" import jsontableschema -from typecast import String, Integer, Float, Decimal, Date, DateTime, Boolean from messytables.headers import headers_guess from messytables.types import type_guess -MESSYTABLES_TO_JTS_MAPPING = { - String: 'string', - Integer: 'integer', - Float: 'number', - Decimal: 'number', - Date: 'date', - DateTime: 'datetime', - Boolean: 'boolean' -} - - -def celltype_as_string(celltype): - return MESSYTABLES_TO_JTS_MAPPING[celltype.__class__] - - def rowset_as_jts(rowset, headers=None, types=None): - ''' Create a json table schema from a rowset - ''' + """Create a json table schema from a rowset.""" _, headers = headers_guess(rowset.sample) - types = list(map(celltype_as_string, - type_guess(rowset.sample))) - + types = type_guess(rowset.sample) + types = [t.jts_name for t in types] return headers_and_typed_as_jts(headers, types) def headers_and_typed_as_jts(headers, types): - ''' Create a json table schema from headers and types as - returned from :meth:`~messytables.headers.headers_guess` - and :meth:`~messytables.types.type_guess`. - ''' - j = jsontableschema.JSONTableSchema() + """Create a json table schema from headers and types. + Those specs are returned from :meth:`~messytables.headers.headers_guess` + and :meth:`~messytables.types.type_guess`. + """ + jts = jsontableschema.JSONTableSchema() for field_id, field_type in zip(headers, types): - j.add_field(field_id=field_id, - label=field_id, - field_type=field_type) - return j + jts.add_field(field_id=field_id, + label=field_id, + field_type=field_type) + return jts diff --git a/setup.py b/setup.py index 33a9edb..6c1e70e 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'requests>=2.0', 'html5lib', 'json-table-schema>=0.2, <=0.2.1' - 'typecast>=0.3.1', + 'typecast>=0.3.3', 'six', 'ordereddict', ], From a4c22f3415ac703494a89a5955ed0b09326973b7 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:08:24 +0200 Subject: [PATCH 27/35] pep8 --- messytables/pdf.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/messytables/pdf.py b/messytables/pdf.py index 11aa907..1998ac8 100644 --- a/messytables/pdf.py +++ b/messytables/pdf.py @@ -42,9 +42,8 @@ def properties(self): class PDFTableSet(TableSet): - """ - A TableSet from a PDF document. - """ + """A TableSet from a PDF document.""" + def __init__(self, fileobj=None, filename=None, **kw): if get_tables is None: raise ImportError("pdftables is not installed") @@ -57,9 +56,7 @@ def __init__(self, fileobj=None, filename=None, **kw): self.raw_tables = get_tables(self.fh) def make_tables(self): - """ - Return a listing of tables (as PDFRowSets) in the table set. - """ + """Return a listing of tables in the table set.""" def table_name(table): return "Table {0} of {1} on page {2} of {3}".format( table.table_number_on_page, @@ -71,9 +68,8 @@ def table_name(table): class PDFRowSet(RowSet): - """ - A RowSet representing a PDF table. - """ + """A RowSet representing a PDF table.""" + def __init__(self, name, table): if get_tables is None: raise ImportError("pdftables is not installed") @@ -85,9 +81,7 @@ def __init__(self, name, table): ) def raw(self, sample=False): - """ - Yield one row of cells at a time - """ + """Yield one row of cells at a time.""" if hasattr(self.table, "cell_data"): # New style of cell data. 
for row in self.table.cell_data: From b7b485146eadad8b480e5917ef12d70756452ab4 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:09:51 +0200 Subject: [PATCH 28/35] Move stuff around. --- messytables/commas.py | 23 ++++++++++++++--------- messytables/error.py | 4 ++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/messytables/commas.py b/messytables/commas.py index f0b79e3..6693cef 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -2,7 +2,7 @@ from six import text_type, PY2 -from messytables.buffered import seekable_stream, BUFFER_SIZE +from messytables.buffered import seekable_stream from messytables.text import UTF8Recoder, to_unicode_or_bust from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError @@ -14,8 +14,11 @@ class CSVTableSet(TableSet): - """ A CSV table set. Since CSV is always just a single table, - this is just a pass-through for the row set. """ + """A CSV table set. + + Since CSV is always just a single table, this is just a pass-through for + the row set. + """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, encoding=None, window=None, doublequote=True, @@ -43,10 +46,12 @@ def make_tables(self): class CSVRowSet(RowSet): - """ A CSV row set is an iterator on a CSV file-like object + """A CSV row set is an iterator on a CSV file-like object. + (which can potentially be infinetly large). When loading, a sample is read and cached so you can run analysis on the - fragment. """ + fragment. + """ def __init__(self, name, fileobj, delimiter=None, quotechar=None, encoding='utf-8', window=None, doublequote=True, @@ -58,6 +63,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, def fake_ilines(fobj): for row in fobj: yield row.decode('utf-8') + self.lines = fake_ilines(self.fileobj) self._sample = [] self.delimiter = delimiter @@ -73,8 +79,7 @@ def fake_ilines(fobj): pass super(CSVRowSet, self).__init__() - @property - def _dialect(self): + def dialect(self): delim = '\n' # NATIVE sample = delim.join(self._sample) try: @@ -86,7 +91,7 @@ def _dialect(self): dialect.skipinitialspace = self.skipinitialspace if self.lineterminator is not None: dialect.lineterminator = self.lineterminator - dialect.doublequote = True + dialect.doublequote = self.doublequote return dialect except csv.Error: return csv.excel @@ -106,7 +111,7 @@ def rows(): yield line try: - for row in csv.reader(rows(), dialect=self._dialect): + for row in csv.reader(rows(), dialect=self.dialect()): yield [Cell(to_unicode_or_bust(c)) for c in row] except csv.Error as err: if u'newline inside string' in text_type(err) and sample: diff --git a/messytables/error.py b/messytables/error.py index 255f4ab..4996bbd 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -13,3 +13,7 @@ class TableError(MessytablesError, LookupError): class NoSuchPropertyError(MessytablesError, KeyError): """The requested property doesn't exist.""" + + +class InvalidDateError(Exception): + """Invalid date in structured data sources.""" From b8f15ed7621b3c36d1246a805eb46ca0ed7cdafb Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 23 Jul 2016 21:13:25 +0200 Subject: [PATCH 29/35] Formatting. 
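
Besides the PEP8/docstring cleanup, this patch starts referring to the
`typecast` cell types as classes rather than instances in XLS_TYPES and pulls
the Excel date conversion out into `XLSCell.get_xl_date`. Roughly, the
conversion it wraps looks like the sketch below (the file name and cell
position are illustrative, not taken from the test suite):

    from datetime import datetime
    from xlrd import open_workbook, xldate_as_tuple

    book = open_workbook('example.xls')      # hypothetical workbook
    sheet = book.sheet_by_index(0)
    serial = sheet.cell_value(0, 0)          # e.g. 42575.0 for 2016-07-24
    if serial:                               # 0 is treated as "no date"
        # xldate_as_tuple returns (year, month, day, hour, minute, second)
        value = datetime(*xldate_as_tuple(serial, book.datemode))
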
--- messytables/excel.py | 146 ++++++++++++++++++++++--------------------- messytables/text.py | 1 - 2 files changed, 75 insertions(+), 72 deletions(-) diff --git a/messytables/excel.py b/messytables/excel.py index 92bc4ed..28bb235 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -1,70 +1,54 @@ -import sys from datetime import datetime, time - -import xlrd from xlrd.biffh import XLRDError -from six import PY2 +from xlrd import open_workbook, xldate_as_tuple from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties -from messytables.error import ReadError - - -class InvalidDateError(Exception): - pass +from messytables.error import ReadError, InvalidDateError XLS_TYPES = { - 1: String(), + 1: String, # NB: Excel does not distinguish floats from integers so we use floats # We could try actual type detection between floats and ints later # or use the excel format string info - see # https://groups.google.com/forum/?fromgroups=#!topic/ # python-excel/cAQ1ndsCVxk - 2: Float(), - 3: Date(), + 2: Float, + 3: Date, # this is actually boolean but we do not have a boolean type yet - 4: Integer() + 4: Integer } class XLSTableSet(TableSet): - """An excel workbook wrapper object. - """ + """An excel workbook wrapper object.""" def __init__(self, fileobj=None, filename=None, window=None, encoding=None, with_formatting_info=True, **kw): - '''Initialize the tableset. + """Initilize the tableset. :param encoding: passed on to xlrd.open_workbook function as encoding_override - :param with_formatting_info: passed to xlrd to get font details of cells - ''' + :param with_formatting_info: whether xlrd should provide details + of the cells contents (e.g. colour, borders, etc. + Not sure what the behaviour of properties is with this turned off. + Turning this on apparently may have memory implications in xlrd. + + The convoluted "try it with with_formatting_info, then try it without" + is necessary because xlrd doesn't currently support getting this + information from XLSX files. Workarounds include converting the XLSX + document in LibreOffice. + """ def get_workbook(): try: - return xlrd.open_workbook( + return open_workbook( filename=filename, file_contents=read_obj, encoding_override=encoding, formatting_info=with_formatting_info) - except XLRDError: - _, value, traceback = sys.exc_info() - if PY2: - raise ReadError("Can't read Excel file: %r" % value, traceback) - else: - raise ReadError("Can't read Excel file: %r" % value).with_traceback(traceback) - '''Initilize the tableset. + except XLRDError as xlrdexc: + raise ReadError("Can't read Excel file: %r" % xlrdexc) - :param encoding: passed on to xlrd.open_workbook function - as encoding_override - :param with_formatting_info: whether xlrd should provide details - of the cells contents (e.g. colour, borders, etc. - Not sure what the behaviour of properties is with this turned off. - Turning this on apparently may have memory implications in xlrd. - - The convoluted "try it with with_formatting_info, then try it without" is - necessary because xlrd doesn't currently support getting this information - from XLSX files. Workarounds include converting the XLSX document in LibreOffice. - ''' self.window = window if not filename and not fileobj: @@ -81,19 +65,20 @@ def get_workbook(): if not with_formatting_info: raise else: - with_formatting_info=False + with_formatting_info = False self.workbook = get_workbook() - def make_tables(self): - """ Return the sheets in the workbook. 
""" + """Return the sheets in the workbook.""" return [XLSRowSet(name, self.workbook.sheet_by_name(name), self.window) for name in self.workbook.sheet_names()] class XLSRowSet(RowSet): - """ Excel support for a single sheet in the excel workbook. Unlike - the CSV row set this is not a streaming operation. """ + """Excel support for a single sheet in the excel workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, name, sheet, window=None): self.name = name @@ -102,38 +87,47 @@ def __init__(self, name, sheet, window=None): super(XLSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. Types are automatically - converted according to the excel data types specified, including - conversion of excel dates, which are notoriously buggy. """ + """Iterate over all rows in this sheet. + + Types are automatically converted according to the excel data types + specified, including conversion of excel dates, which are notoriously + buggy. + """ num_rows = self.sheet.nrows - for rownum in range(min(self.window, num_rows) if sample else num_rows): + num_rows = min(self.window, num_rows) if sample else num_rows + for rownum in xrange(num_rows): row = [] for colnum, cell in enumerate(self.sheet.row(rownum)): try: - row.append(XLSCell.from_xlrdcell(cell, self.sheet, colnum, rownum)) + row.append(XLSCell.from_xlrdcell(cell, self.sheet, + colnum, rownum)) except InvalidDateError: - raise ValueError("Invalid date at '%s':%d,%d" % ( - self.sheet.name, colnum+1, rownum+1)) + raise ValueError("Invalid date at '%s':%d,%d" % + (self.sheet.name, colnum + 1, rownum + 1)) yield row class XLSCell(Cell): - @staticmethod - def from_xlrdcell(xlrd_cell, sheet, col, row): + + @classmethod + def get_xl_date(cls, sheet, value): + if value == 0: + return None + date = xldate_as_tuple(value, sheet.book.datemode) + year, month, day, hour, minute, second = date + return datetime(year, month, day, hour, minute, second) + + @classmethod + def from_xlrdcell(cls, xlrd_cell, sheet, col, row): value = xlrd_cell.value - cell_type = XLS_TYPES.get(xlrd_cell.ctype, String()) - if cell_type == Date(): - if value != 0: - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) - messy_cell = XLSCell(value, type=cell_type) + cell_type = XLS_TYPES.get(xlrd_cell.ctype, String) + if cell_type == Date: + value = cls.get_xl_date(sheet, value) + messy_cell = XLSCell(value, type=cell_type()) messy_cell.sheet = sheet messy_cell.xlrd_cell = xlrd_cell - messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y) + # necessary for properties, note not (x,y) + messy_cell.xlrd_pos = (row, col) return messy_cell @property @@ -146,6 +140,7 @@ def properties(self): class XLSProperties(CoreProperties): + KEYS = ['bold', 'size', 'italic', 'font_name', 'strikeout', 'underline', 'font_colour', 'background_colour', 'any_border', 'all_border', 'richtext', 'blank', 'a_date', 'formatting_string'] @@ -168,13 +163,19 @@ def formatting(self): @property def rich(self): - """returns a tuple of character position, font number which starts at that position - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute""" - return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, None) + """Return a tuple of character 
position, font number. + + Starts at that position: + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute + """ + return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, + None) def raw_span(self, always=False): - """return the bounding box of the cells it's part of. - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute""" + """Return the bounding box of the cells it's part of. + + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute + """ row, col = self.cell.xlrd_pos for box in self.cell.sheet.merged_cells: rlo, rhi, clo, chi = box @@ -210,7 +211,7 @@ def get_bold(self): return self.font.weight > 500 def get_size(self): - """in pixels""" + """In pixels.""" return self.font.height / 20.0 def get_italic(self): @@ -227,15 +228,18 @@ def get_underline(self): def get_font_colour(self): # TODO - return self.font.color_index ## more lookup required + return self.font.color_index # more lookup required def get_blank(self): """Note that cells might not exist at all. - Behaviour for spanned cells might be complicated: hence this function""" + + Behaviour for spanned cells might be complicated: hence this function + """ return self.cell.value == '' def get_background_colour(self): - return self.xf.background.background_color_index ## more lookup required + # more lookup required: + return self.xf.background.background_color_index def get_any_border(self): b = self.xf.border diff --git a/messytables/text.py b/messytables/text.py index 17c8097..d7938e0 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -19,7 +19,6 @@ class UTF8Recoder: 'utf-32be': 'BOM_UTF32_BE', 'utf-8': 'BOM_UTF8', 'utf-8-sig': 'BOM_UTF8', - } def __init__(self, f, encoding): From 3c96240275f49697927b89743e5e228104f795fa Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 18:26:41 +0200 Subject: [PATCH 30/35] Replace CSV reader with a fully streaming implementation. 
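
The new CSVRowSet detects the character encoding once from an initial sample
(see analyze_stream in messytables/text.py) and then splits the raw byte
stream on line terminators itself, decoding one line at a time, instead of
wrapping the stream in a codecs reader. A rough standalone sketch of that
splitting loop -- the names, buffer size and sample input are illustrative,
not the actual messytables implementation:

    import csv
    import io
    import re

    SEPARATORS = [b'\r\n', b'\r', b'\n', b'\0']
    LINESEP = re.compile(b'(' + b'|'.join(re.escape(s) for s in SEPARATORS) + b')')

    def iter_lines(fileobj, encoding='utf-8', chunk_size=4096):
        """Yield decoded lines without holding the whole file in memory."""
        buf = b''
        while True:
            match = LINESEP.search(buf)
            if match is None:
                data = fileobj.read(chunk_size)
                if not data:        # EOF: flush whatever is left in the buffer
                    if buf:
                        yield buf.decode(encoding)
                    return
                buf += data
                continue
            # split off everything up to and including the terminator
            line, buf = buf[:match.end()], buf[match.end():]
            yield line.decode(encoding)

    rows = csv.reader(iter_lines(io.BytesIO(b'a,b\r\n1,2\r\n')))
    print(list(rows))  # [['a', 'b'], ['1', '2']]
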
--- .gitignore | 1 + messytables/buffered.py | 2 +- messytables/commas.py | 144 +++++++++++++++++++--------------------- messytables/text.py | 86 ++++++++---------------- test/test_guessing.py | 2 +- test/test_read.py | 2 +- 6 files changed, 99 insertions(+), 138 deletions(-) diff --git a/.gitignore b/.gitignore index 2df0131..33f6a3f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ *.~lock.*# .coverage dist/* +.tox/* pyenv3 diff --git a/messytables/buffered.py b/messytables/buffered.py index dea877f..dd4daf8 100644 --- a/messytables/buffered.py +++ b/messytables/buffered.py @@ -17,7 +17,7 @@ def seekable_stream(fileobj): class BufferedFile(object): """A buffered file that preserves the beginning of a stream.""" - def __init__(self, fp, buffer_size=BUFFER_SIZE): + def __init__(self, fp, buffer_size=BUFFER_SIZE + 2): self.data = io.BytesIO() self.fp = fp self.offset = 0 diff --git a/messytables/commas.py b/messytables/commas.py index 6693cef..add6049 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,16 +1,18 @@ +import re import csv +import logging -from six import text_type, PY2 - -from messytables.buffered import seekable_stream -from messytables.text import UTF8Recoder, to_unicode_or_bust +from messytables.buffered import BUFFER_SIZE +from messytables.text import analyze_stream from messytables.core import RowSet, TableSet, Cell from messytables.error import ReadError DELIMITERS = ['\t', ',', ';', '|'] +TERMINATORS = ['\r\n', '\r', '\n', '\0'] # Fix the maximum field size to something a little larger csv.field_size_limit(256000) +log = logging.getLogger(__name__) class CSVTableSet(TableSet): @@ -21,28 +23,19 @@ class CSVTableSet(TableSet): """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, - encoding=None, window=None, doublequote=True, - lineterminator=None, skipinitialspace=None, **kw): - self.fileobj = seekable_stream(fileobj) - self.name = name or 'table' - self.delimiter = delimiter - self.quotechar = quotechar - self.encoding = encoding - self.window = window - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + self._tables = [CSVRowSet(name or 'table', fileobj, + delimiter=delimiter, + quotechar=quotechar, + encoding=encoding, + window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace)] def make_tables(self): """Return the actual CSV table.""" - return [CSVRowSet(self.name, self.fileobj, - delimiter=self.delimiter, - quotechar=self.quotechar, - encoding=self.encoding, - window=self.window, - doublequote=self.doublequote, - lineterminator=self.lineterminator, - skipinitialspace=self.skipinitialspace)] + return self._tables class CSVRowSet(RowSet): @@ -54,69 +47,66 @@ class CSVRowSet(RowSet): """ def __init__(self, name, fileobj, delimiter=None, quotechar=None, - encoding='utf-8', window=None, doublequote=True, - lineterminator=None, skipinitialspace=None): + encoding=None, window=1000, doublequote=None, + skipinitialspace=None): self.name = name - self.fh = seekable_stream(fileobj) - self.fileobj = UTF8Recoder(self.fh, encoding) + self.encoding, self.buf = analyze_stream(fileobj, encoding=encoding) + self.fileobj = fileobj - def fake_ilines(fobj): - for row in fobj: - yield row.decode('utf-8') + # For line breaking, use the (detected) encoding of the file: + terminators = [t.encode(self.encoding) for t in TERMINATORS] + self.terminators_re = re.compile('(%s)' % 
'|'.join(terminators)) - self.lines = fake_ilines(self.fileobj) self._sample = [] - self.delimiter = delimiter - self.quotechar = quotechar - self.window = window or 1000 - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace - try: - for i in range(self.window): - self._sample.append(next(self.lines)) - except StopIteration: - pass - super(CSVRowSet, self).__init__() + self.window = window - def dialect(self): - delim = '\n' # NATIVE - sample = delim.join(self._sample) try: - dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) - dialect.delimiter = self.delimiter or str(dialect.delimiter) - dialect.quotechar = self.quotechar or str(dialect.quotechar) - dialect.lineterminator = self.lineterminator or delim - if self.skipinitialspace is not None: - dialect.skipinitialspace = self.skipinitialspace - if self.lineterminator is not None: - dialect.lineterminator = self.lineterminator - dialect.doublequote = self.doublequote - return dialect + sample = self.buf.decode(self.encoding).encode('utf-8') + self.dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) except csv.Error: - return csv.excel + self.dialect = csv.excel + # override detected dialect with constructor values. + self.dialect.delimiter = delimiter or str(self.dialect.delimiter) + self.dialect.quotechar = quotechar or str(self.dialect.quotechar) + if skipinitialspace is not None: + self.dialect.skipinitialspace = skipinitialspace + if doublequote is not None: + self.dialect.doublequote = doublequote + super(CSVRowSet, self).__init__() - def raw(self, sample=False): - def rows(): - for line in self._sample: - if PY2: - yield line.encode('utf-8') + def get_lines(self, sample=False): + for line in self._sample: + yield line + + while True: + if self.buf is None: + break + if sample and len(self._sample) >= self.window: + break + match = self.terminators_re.search(self.buf) + if match is not None: + line = self.buf[:match.end(0)] + self.buf = self.buf[match.end(0):] + else: + buf = self.fileobj.read(BUFFER_SIZE) + if len(buf): + self.buf += buf + continue else: - yield line - if not sample: - for line in self.lines: - if PY2: - yield line.encode('utf-8') - else: - yield line + line, self.buf = self.buf, None + + line = line.decode(self.encoding).encode('utf-8') + if line in TERMINATORS or not len(line): + continue + if self.window >= len(self._sample): + self._sample.append(line) + yield line + + def raw(self, sample=False): try: - for row in csv.reader(rows(), dialect=self.dialect()): - yield [Cell(to_unicode_or_bust(c)) for c in row] + for row in csv.reader(self.get_lines(sample=sample), + dialect=self.dialect): + yield [Cell(c.decode('utf-8')) for c in row] except csv.Error as err: - if u'newline inside string' in text_type(err) and sample: - pass - elif u'line contains NULL byte' in text_type(err): - pass - else: - raise ReadError('Error reading CSV: %r', err) + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/text.py b/messytables/text.py index d7938e0..ee71179 100644 --- a/messytables/text.py +++ b/messytables/text.py @@ -3,64 +3,34 @@ import cchardet as chardet except ImportError: import chardet -from six import text_type, binary_type from messytables.buffered import BUFFER_SIZE - -class UTF8Recoder: - """Iterator that reads an encoded stream and re-encodes it to UTF-8.""" - - # maps between chardet encoding and codecs bom keys - BOM_MAPPING = { - 'utf-16le': 'BOM_UTF16_LE', - 'utf-16be': 'BOM_UTF16_BE', - 'utf-32le': 'BOM_UTF32_LE', 
- 'utf-32be': 'BOM_UTF32_BE', - 'utf-8': 'BOM_UTF8', - 'utf-8-sig': 'BOM_UTF8', - } - - def __init__(self, f, encoding): - sample = f.read(BUFFER_SIZE) - if not encoding: - encoding = chardet.detect(sample).get('encoding') or 'utf-8' - f.seek(0) - self.reader = codecs.getreader(encoding)(f, 'ignore') - - # The reader only skips a BOM if the encoding isn't explicit about its - # endianness (i.e. if encoding is UTF-16 a BOM is handled properly - # and taken out, but if encoding is UTF-16LE a BOM is ignored). - # However, if chardet sees a BOM it returns an encoding with the - # endianness explicit, which results in the codecs stream leaving the - # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} - # encodings, check for a BOM and remove it if it's there. - if encoding.lower() in self.BOM_MAPPING: - bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) - if bom: - # Try to read the BOM, which is a byte sequence, from - # the underlying stream. If all characters match, then - # go on. Otherwise when a character doesn't match, seek - # the stream back to the beginning and go on. - for c in bom: - if f.read(1) != c: - f.seek(0) - break - - def __iter__(self): - return self - - def __next__(self): - line = self.reader.readline() - if not line or line == '\0': - raise StopIteration - result = line.encode("utf-8") - return result - - next = __next__ - - -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, binary_type): - obj = text_type(obj, encoding) - return obj +# maps between chardet encoding and codecs bom keys +BOM_MAPPING = { + 'utf-16le': 'BOM_UTF16_LE', + 'utf-16be': 'BOM_UTF16_BE', + 'utf-32le': 'BOM_UTF32_LE', + 'utf-32be': 'BOM_UTF32_BE', + 'utf-8': 'BOM_UTF8', + 'utf-8-sig': 'BOM_UTF8', +} + + +def analyze_stream(stream, encoding=None): + sample = stream.read(BUFFER_SIZE) + if encoding is None: + encoding = chardet.detect(sample).get('encoding') or 'utf-8' + encoding = encoding.lower() + # The reader only skips a BOM if the encoding isn't explicit about its + # endianness (i.e. if encoding is UTF-16 a BOM is handled properly + # and taken out, but if encoding is UTF-16LE a BOM is ignored). + # However, if chardet sees a BOM it returns an encoding with the + # endianness explicit, which results in the codecs stream leaving the + # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} + # encodings, check for a BOM and remove it if it's there. 
+ if encoding in BOM_MAPPING: + bom = getattr(codecs, BOM_MAPPING[encoding], None) + if sample[:len(bom)] == bom: + return encoding, sample[len(bom):] + return encoding, sample diff --git a/test/test_guessing.py b/test/test_guessing.py index 141a3ff..2d3e7a4 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -100,7 +100,7 @@ def test_guessing_uses_first_in_case_of_tie(self): @attr("slow") def test_strict_type_guessing_with_large_file(self): fh = horror_fobj('211.csv') - rows = CSVTableSet(fh).tables[0] + rows = CSVTableSet(fh, encoding='iso-8859-2').tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) types = [String, Integer, Decimal, Date] diff --git a/test/test_read.py b/test/test_read.py index ac9b384..786c901 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -211,7 +211,7 @@ def test_guess_headers(self): row_set.register_processor(headers_processor(['foo', 'bar'])) data = list(row_set) assert 'foo' in data[12][0].column, data[12][0] - assert 'Chirurgie' in data[12][0].value, data[12][0].value + assert 'Chirurgie' in data[10][0].value, data[10][0].value def test_read_encoded_characters_csv(self): fh = horror_fobj('characters.csv') From ce3627c0a9eecb74af79a180c07c706c0f5353dc Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 19:17:36 +0200 Subject: [PATCH 31/35] Fix up Python 3 support --- messytables/__init__.py | 2 +- messytables/any.py | 7 ++----- messytables/commas.py | 46 +++++++++++++++++++++++++++++++++-------- messytables/excel.py | 2 +- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index 014a095..c1ca1ba 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -7,7 +7,7 @@ from messytables.buffered import seekable_stream from messytables.core import Cell, TableSet, RowSet -from messytables.commas import CSVTableSet, CSVRowSet +from messytables.commas import CSVTableSet, CSVRowSet, TSVTableSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet from messytables.zip import ZIPTableSet diff --git a/messytables/any.py b/messytables/any.py index 9d305ee..802aeb6 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,7 +1,7 @@ import re from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet -from messytables import HTMLTableSet, ODSTableSet +from messytables import HTMLTableSet, ODSTableSet, TSVTableSet from messytables.buffered import seekable_stream from messytables.error import ReadError @@ -32,10 +32,7 @@ } -def TABTableSet(fileobj): - return CSVTableSet(fileobj, delimiter='\t') - -parsers = {'TAB': TABTableSet, +parsers = {'TAB': TSVTableSet, 'ZIP': ZIPTableSet, 'XLS': XLSTableSet, 'HTML': HTMLTableSet, diff --git a/messytables/commas.py b/messytables/commas.py index add6049..27261a5 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,5 +1,6 @@ import re import csv +import six import logging from messytables.buffered import BUFFER_SIZE @@ -8,7 +9,7 @@ from messytables.error import ReadError DELIMITERS = ['\t', ',', ';', '|'] -TERMINATORS = ['\r\n', '\r', '\n', '\0'] +LINE_SEPARATOR = ['\r\n', '\r', '\n', '\0'] # Fix the maximum field size to something a little larger csv.field_size_limit(256000) @@ -38,6 +39,24 @@ def make_tables(self): return self._tables +class TSVTableSet(CSVTableSet): + """A TSV table set. 
+ + This is a slightly specialised version of the CSVTableSet that will always + generate a tab-based table parser. + """ + + def __init__(self, fileobj, quotechar=None, name=None, + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + super(TSVTableSet, self).__init__(fileobj, delimiter='\t', + quotechar=quotechar, name=name, + encoding=encoding, window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace, + **kw) + + class CSVRowSet(RowSet): """A CSV row set is an iterator on a CSV file-like object. @@ -54,14 +73,17 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None, self.fileobj = fileobj # For line breaking, use the (detected) encoding of the file: - terminators = [t.encode(self.encoding) for t in TERMINATORS] - self.terminators_re = re.compile('(%s)' % '|'.join(terminators)) + linesep = [t.encode(self.encoding) for t in LINE_SEPARATOR] + linesep = b'(%s)' % b'|'.join(linesep) + self.linesep = re.compile(linesep) self._sample = [] self.window = window try: - sample = self.buf.decode(self.encoding).encode('utf-8') + sample = self.buf.decode(self.encoding) + if six.PY2: + sample = sample.encode('utf-8') self.dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) except csv.Error: self.dialect = csv.excel @@ -83,7 +105,7 @@ def get_lines(self, sample=False): break if sample and len(self._sample) >= self.window: break - match = self.terminators_re.search(self.buf) + match = self.linesep.search(self.buf) if match is not None: line = self.buf[:match.end(0)] self.buf = self.buf[match.end(0):] @@ -95,8 +117,11 @@ def get_lines(self, sample=False): else: line, self.buf = self.buf, None - line = line.decode(self.encoding).encode('utf-8') - if line in TERMINATORS or not len(line): + line = line.decode(self.encoding) + if six.PY2: + line = line.encode('utf-8') + + if line in LINE_SEPARATOR or not len(line): continue if self.window >= len(self._sample): @@ -107,6 +132,9 @@ def raw(self, sample=False): try: for row in csv.reader(self.get_lines(sample=sample), dialect=self.dialect): - yield [Cell(c.decode('utf-8')) for c in row] + if six.PY2: + row = [c.decode('utf-8') for c in row] + yield [Cell(c) for c in row] except csv.Error as err: - raise ReadError('Error reading CSV: %r', err) + if 'new-line character' not in repr(err): + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/excel.py b/messytables/excel.py index 28bb235..93c8004 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -95,7 +95,7 @@ def raw(self, sample=False): """ num_rows = self.sheet.nrows num_rows = min(self.window, num_rows) if sample else num_rows - for rownum in xrange(num_rows): + for rownum in range(num_rows): row = [] for colnum, cell in enumerate(self.sheet.row(rownum)): try: From 7dd9e5b8dc784c8cc3fa79f78ff9ab8244831e84 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sun, 24 Jul 2016 19:23:48 +0200 Subject: [PATCH 32/35] confirm at least python 3.5 is working --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 25aaf2b..9399f16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - "2.7" - - "3.4" + - "3.5" install: # Fix for html5lib, probably can be removed after the version after # 0.999999999/1.0b10 is released. 
From 6cd1222754bf1292e5785e37a9ebd33a2174b3ed Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:07:10 +0100
Subject: [PATCH 33/35] Readd Python 3.4 to Travis

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 9399f16..e6af8c8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,7 @@
 language: python
 python:
   - "2.7"
+  - "3.4"
   - "3.5"
 install:
   # Fix for html5lib, probably can be removed after the version after

From 506269e403b915165973dc38fb47f91d279f8795 Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:16:48 +0100
Subject: [PATCH 34/35] Fix missing comma in setup.py

Probably occurred during rebasing.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6c1e70e..8418bba 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ 'lxml>=3.2',
         'requests>=2.0',
         'html5lib',
-        'json-table-schema>=0.2, <=0.2.1'
+        'json-table-schema>=0.2, <=0.2.1',
         'typecast>=0.3.3',
         'six',
         'ordereddict',
     ],

From 6638e58bf88f04d0d83c8e373b85e245b4f0e5f4 Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Tue, 4 Oct 2016 21:32:54 +0100
Subject: [PATCH 35/35] Fix byte concatenation in Python 3.4

This line worked in Python 3.5 because of PEP 461:
https://www.python.org/dev/peps/pep-0461/
but not in Python 3.4.
---
 messytables/commas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/messytables/commas.py b/messytables/commas.py
index 27261a5..1a75613 100644
--- a/messytables/commas.py
+++ b/messytables/commas.py
@@ -74,7 +74,7 @@ def __init__(self, name, fileobj, delimiter=None, quotechar=None,
 
         # For line breaking, use the (detected) encoding of the file:
         linesep = [t.encode(self.encoding) for t in LINE_SEPARATOR]
-        linesep = b'(%s)' % b'|'.join(linesep)
+        linesep = b'(' + b'|'.join(linesep) + b')'
         self.linesep = re.compile(linesep)
 
         self._sample = []
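
A small illustration of the incompatibility this final patch works around
(the snippet below is an illustrative sketch, not code from the repository):
%-formatting for bytes exists on Python 2.7 and, via PEP 461, on Python 3.5+,
so on Python 3.3/3.4 the old line raises a TypeError and plain concatenation
is used instead.

    parts = [b'\r\n', b'\r', b'\n', b'\0']
    # pattern = b'(%s)' % b'|'.join(parts)    # OK on 2.7 and 3.5+, TypeError on 3.4
    pattern = b'(' + b'|'.join(parts) + b')'  # works on every supported interpreter
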