Merge pull request #37 from NINAnor/skip-initial

Add skip_initial
ets · Mar 3, 2023 · 12f4d96 · 12f4d96
2 parents 4fb97f2 + 3791733
commit 12f4d96
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -55,6 +55,7 @@ The configuration is also captured in [tables_config_util.py](tap_spreadsheets_a
             "delimiter": "|",
             "quotechar": '"',
             "universal_newlines": false,
+            "skip_initial": 0,
             "sample_rate": 10,
             "max_sampling_read": 2000,
             "max_sampled_files": 3,
@@ -102,6 +103,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
 . Set this key to "ignore" to skip such source files and continue the run.  
 - **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values. 
 - **universal_newlines**: (optional) Whether the source file parsers should honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html). Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier.
+- **skip_initial**: (optional) The number of rows to skip at the start of each file. The default is 0 (no rows are skipped).
 - **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line.  A sampling rate of 10 (the default) will sample every 10th line.
 - **max_sampling_read**: (optional) How many lines of the source file should be sampled when in discovery mode attempting to infer a schema. The default is 1000 samples.
 - **max_sampled_files**: (optional) The maximum number of files in the targeted set that will be sampled. The default is 5.

diff --git a/sample_config.json b/sample_config.json
@@ -8,6 +8,7 @@
             "key_properties": [],
             "format": "csv",
             "universal_newlines": false,
+            "skip_initial": 0,
             "sample_rate": 10,
             "max_sampling_read": 2000,
             "max_sampled_files": 3,

diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py
@@ -15,6 +15,7 @@
         Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'),
         Optional('invalid_format_action'): Any('ignore','fail'),
         Optional('universal_newlines'): bool,
+        Optional('skip_initial'): int,
         Optional('selected'): bool,
         Optional('field_names'): [str],
         Optional('search_prefix'): str,

diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py
@@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False):
 
 def get_row_iterator(table_spec, uri):
     universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True
+    skip_initial = table_spec.get("skip_initial", 0)
 
     if 'format' not in table_spec or table_spec['format'] == 'detect':
         lowered_uri = uri.lower()
@@ -153,19 +154,23 @@ def get_row_iterator(table_spec, uri):
     try:
         if format == 'csv':
             reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
-            return tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
+            iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
         elif format == 'excel':
             reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb')
             if uri.lower().endswith(".xls"):
-                return tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader)
+                iterator = tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader)
             else:
-                return tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
+                iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
         elif format == 'json':
             reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
-            return tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
+            iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
         elif format == 'jsonl':
             reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
-            return tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
+            iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
     except (ValueError,TypeError) as err:
         raise InvalidFormatError(uri,message=err)
 
+    for _ in range(skip_initial):
+        next(iterator)
+
+    return iterator