Merge pull request #43 from TyShkan/feature_encoding

Add encoding config option
ets · Apr 12, 2023 · 646c069 · 646c069
2 parents 12f4d96 + 6ebb8e5
commit 646c069
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -102,6 +102,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
 - **invalid_format_action**: (optional) By default, the tap will raise an exception if a source file cannot be read.
 Set this key to "ignore" to skip such source files and continue the run.  
 - **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values. 
+- **encoding**: (optional) The file encoding to use when reading text files (e.g., "utf-8" (default), "latin1", "windows-1252").
 - **universal_newlines**: (optional) Whether the source file parsers should honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html). Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier.
 - **skip_initial**: (optional) How many lines should be skipped. The default is 0.
 - **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line.  A sampling rate of 10 (the default) will sample every 10th line.

diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py
@@ -13,6 +13,7 @@
         Required('start_date'): str,
         Required('key_properties'): [str],
         Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'),
+        Optional('encoding'): str,
         Optional('invalid_format_action'): Any('ignore','fail'),
         Optional('universal_newlines'): bool,
         Optional('skip_initial'): int,

diff --git a/tap_spreadsheets_anywhere/file_utils.py b/tap_spreadsheets_anywhere/file_utils.py
@@ -352,6 +352,7 @@ def config_by_crawl(crawl_config):
                         "pattern": abs_pattern,
                         "key_properties": [],
                         "format": "detect",
+                        "encoding": source.get('encoding', 'utf-8'),
                         "invalid_format_action": "ignore",
                         "delimiter": "detect",
                         "max_records_per_run": source.get('max_records_per_run',-1),
@@ -373,6 +374,7 @@ def config_by_crawl(crawl_config):
                             "pattern": abs_pattern,
                             "key_properties": [],
                             "format": "detect",
+                            "encoding": source.get('encoding', 'utf-8'),
                             "invalid_format_action": "ignore",
                             "delimiter": "detect",
                             "max_records_per_run": source.get('max_records_per_run', -1),

diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py
@@ -18,7 +18,7 @@ def __str__(self):
         return f'{self.name} could not be parsed: {self.message}'
 
 
-def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'):
+def get_streamreader(uri, universal_newlines=True, newline='', open_mode='r', encoding='utf-8'):
     kwarg_dispatch = {
         "azure": lambda: {
             "transport_params": {
@@ -32,7 +32,7 @@ def get_streamreader(uri, universal_newlines=True,newline='',open_mode='r'):
     SCHEME_SEP = "://"
     kwargs = kwarg_dispatch.get(uri.split(SCHEME_SEP, 1)[0], lambda: {})()
 
-    streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', **kwargs)
+    streamreader = smart_open.open(uri, open_mode, newline=newline, errors='surrogateescape', encoding=encoding, **kwargs)
 
     if not universal_newlines and isinstance(streamreader, StreamReader):
         return monkey_patch_streamreader(streamreader)
@@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False):
 
 def get_row_iterator(table_spec, uri):
     universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True
+    encoding = table_spec['encoding'] if 'encoding' in table_spec else 'utf-8'
     skip_initial = table_spec.get("skip_initial", 0)
 
     if 'format' not in table_spec or table_spec['format'] == 'detect':
@@ -135,7 +136,7 @@ def get_row_iterator(table_spec, uri):
             format = 'csv'
         else:
             # TODO: some protocols provide the ability to pull format (content-type) info & we could make use of that here
-            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
+            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
             buf = reader.read(10)
             reader.seek(0)
             if len(buf) > 0:
@@ -153,7 +154,7 @@ def get_row_iterator(table_spec, uri):
 
     try:
         if format == 'csv':
-            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
+            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
             iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
         elif format == 'excel':
             reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb')
@@ -162,10 +163,10 @@ def get_row_iterator(table_spec, uri):
             else:
                 iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
         elif format == 'json':
-            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
+            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
             iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
         elif format == 'jsonl':
-            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
+            reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r', encoding=encoding)
             iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
     except (ValueError,TypeError) as err:
         raise InvalidFormatError(uri,message=err)