Skip to content

Commit

Permalink
Merge pull request #43 from TyShkan/feature_encoding
Browse files Browse the repository at this point in the history
Add encoding config option
  • Loading branch information
menzenski authored Apr 12, 2023
2 parents 12f4d96 + 6ebb8e5 commit 646c069
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 6 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
- **invalid_format_action**: (optional) By default, the tap will raise an exception if a source file cannot be read. Set this key to "ignore" to skip such source files and continue the run.
- **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values.
- **encoding**: (optional) The file encoding to use when reading text files (e.g., "utf-8" (default), "latin1", "windows-1252").
- **universal_newlines**: (optional) Whether the source file parsers should honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html). Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier.
- **skip_initial**: (optional) How many lines should be skipped. The default is 0.
- **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line. A sampling rate of 10 (the default) will sample every 10th line.
Expand Down
1 change: 1 addition & 0 deletions tap_spreadsheets_anywhere/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Required('start_date'): str,
Required('key_properties'): [str],
Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'),
Optional('encoding'): str,
Optional('invalid_format_action'): Any('ignore','fail'),
Optional('universal_newlines'): bool,
Optional('skip_initial'): int,
Expand Down
2 changes: 2 additions & 0 deletions tap_spreadsheets_anywhere/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def config_by_crawl(crawl_config):
"pattern": abs_pattern,
"key_properties": [],
"format": "detect",
"encoding": source.get('encoding', 'utf-8'),
"invalid_format_action": "ignore",
"delimiter": "detect",
"max_records_per_run": source.get('max_records_per_run',-1),
Expand All @@ -373,6 +374,7 @@ def config_by_crawl(crawl_config):
"pattern": abs_pattern,
"key_properties": [],
"format": "detect",
"encoding": source.get('encoding', 'utf-8'),
"invalid_format_action": "ignore",
"delimiter": "detect",
"max_records_per_run": source.get('max_records_per_run', -1),
Expand Down
13 changes: 7 additions & 6 deletions tap_spreadsheets_anywhere/format_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __str__(self):
return f'{self.name} could not be parsed: {self.message}'


def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'):
def get_streamreader(uri, universal_newlines=True, newline='', open_mode='r', encoding='utf-8'):
kwarg_dispatch = {
"azure": lambda: {
"transport_params": {
Expand All @@ -32,7 +32,7 @@ def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'):
SCHEME_SEP = "://"
kwargs = kwarg_dispatch.get(uri.split(SCHEME_SEP, 1)[0], lambda: {})()

streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', **kwargs)
streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', encoding=encoding, **kwargs)

if not universal_newlines and isinstance(streamreader, StreamReader):
return monkey_patch_streamreader(streamreader)
Expand Down Expand Up @@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False):

def get_row_iterator(table_spec, uri):
universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True
encoding = table_spec['encoding'] if 'encoding' in table_spec else 'utf-8'
skip_initial = table_spec.get("skip_initial", 0)

if 'format' not in table_spec or table_spec['format'] == 'detect':
Expand All @@ -135,7 +136,7 @@ def get_row_iterator(table_spec, uri):
format = 'csv'
else:
# TODO: some protocols provide the ability to pull format (content-type) info & we could make use of that here
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
buf = reader.read(10)
reader.seek(0)
if len(buf) > 0:
Expand All @@ -153,7 +154,7 @@ def get_row_iterator(table_spec, uri):

try:
if format == 'csv':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
elif format == 'excel':
reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb')
Expand All @@ -162,10 +163,10 @@ def get_row_iterator(table_spec, uri):
else:
iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
elif format == 'json':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
elif format == 'jsonl':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
except (ValueError,TypeError) as err:
raise InvalidFormatError(uri,message=err)
Expand Down

0 comments on commit 646c069

Please sign in to comment.