diff --git a/README.md b/README.md index 7b7ed66..6fdfd57 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet - **invalid_format_action**: (optional) By default, the tap will raise an exception if a source file can not be read . Set this key to "ignore" to skip such source files and continue the run. - **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values. +- **encoding**: (optional) The file encoding to use when reading text files (e.g., "utf-8" (default), "latin1", "windows-1252"). - **universal_newlines**: (optional) Should the source file parsers honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html)). Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier. - **skip_initial**: (optional) How many lines should be skipped. The default is 0. - **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line. A sampling rate of 10 (the default) will sample every 10th line. 
diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py index 652c25f..507b000 100644 --- a/tap_spreadsheets_anywhere/configuration.py +++ b/tap_spreadsheets_anywhere/configuration.py @@ -13,6 +13,7 @@ Required('start_date'): str, Required('key_properties'): [str], Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'), + Optional('encoding'): str, Optional('invalid_format_action'): Any('ignore','fail'), Optional('universal_newlines'): bool, Optional('skip_initial'): int, diff --git a/tap_spreadsheets_anywhere/file_utils.py b/tap_spreadsheets_anywhere/file_utils.py index aff4cca..e76a50a 100644 --- a/tap_spreadsheets_anywhere/file_utils.py +++ b/tap_spreadsheets_anywhere/file_utils.py @@ -352,6 +352,7 @@ def config_by_crawl(crawl_config): "pattern": abs_pattern, "key_properties": [], "format": "detect", + "encoding": source.get('encoding', 'utf-8'), "invalid_format_action": "ignore", "delimiter": "detect", "max_records_per_run": source.get('max_records_per_run',-1), @@ -373,6 +374,7 @@ def config_by_crawl(crawl_config): "pattern": abs_pattern, "key_properties": [], "format": "detect", + "encoding": source.get('encoding', 'utf-8'), "invalid_format_action": "ignore", "delimiter": "detect", "max_records_per_run": source.get('max_records_per_run', -1), diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py index 7f843cf..3bf911f 100644 --- a/tap_spreadsheets_anywhere/format_handler.py +++ b/tap_spreadsheets_anywhere/format_handler.py @@ -18,7 +18,7 @@ def __str__(self): return f'{self.name} could not be parsed: {self.message}' -def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'): +def get_streamreader(uri, universal_newlines=True, newline='', open_mode='r', encoding='utf-8'): kwarg_dispatch = { "azure": lambda: { "transport_params": { @@ -32,7 +32,7 @@ def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'): SCHEME_SEP = 
"://" kwargs = kwarg_dispatch.get(uri.split(SCHEME_SEP, 1)[0], lambda: {})() - streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', **kwargs) + streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', encoding=encoding, **kwargs) if not universal_newlines and isinstance(streamreader, StreamReader): return monkey_patch_streamreader(streamreader) @@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False): def get_row_iterator(table_spec, uri): universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True + encoding = table_spec['encoding'] if 'encoding' in table_spec else 'utf-8' skip_initial = table_spec.get("skip_initial", 0) if 'format' not in table_spec or table_spec['format'] == 'detect': @@ -135,7 +136,7 @@ def get_row_iterator(table_spec, uri): format = 'csv' else: # TODO: some protocols provide the ability to pull format (content-type) info & we could make use of that here - reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') + reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding) buf = reader.read(10) reader.seek(0) if len(buf) > 0: @@ -153,7 +154,7 @@ def get_row_iterator(table_spec, uri): try: if format == 'csv': - reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') + reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding) iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader) elif format == 'excel': reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb') @@ -162,10 +163,10 @@ def get_row_iterator(table_spec, uri): else: iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader) elif format == 'json': - reader = get_streamreader(uri, universal_newlines=universal_newlines, 
open_mode='r') + reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding) iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader) elif format == 'jsonl': - reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') + reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding) iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader) except (ValueError,TypeError) as err: raise InvalidFormatError(uri,message=err)