Skip to content

Commit

Permalink
Merge pull request #37 from NINAnor/skip-initial
Browse files Browse the repository at this point in the history
Add skip_initial
  • Loading branch information
menzenski authored Mar 3, 2023
2 parents 4fb97f2 + 3791733 commit 12f4d96
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 5 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ The configuration is also captured in [tables_config_util.py](tap_spreadsheets_a
"delimiter": "|",
"quotechar": '"',
"universal_newlines": false,
"skip_initial": 0,
"sample_rate": 10,
"max_sampling_read": 2000,
"max_sampled_files": 3,
Expand Down Expand Up @@ -102,6 +103,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
. Set this key to "ignore" to skip such source files and continue the run.
- **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values.
- **universal_newlines**: (optional) Should the source file parsers honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html)? Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier.
- **skip_initial**: (optional) The number of rows to skip at the start of each source file. The default is 0.
- **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line. A sampling rate of 10 (the default) will sample every 10th line.
- **max_sampling_read**: (optional) How many lines of the source file should be sampled when in discovery mode attempting to infer a schema. The default is 1000 samples.
- **max_sampled_files**: (optional) The maximum number of files in the targeted set that will be sampled. The default is 5.
Expand Down
1 change: 1 addition & 0 deletions sample_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"key_properties": [],
"format": "csv",
"universal_newlines": false,
"skip_initial": 0,
"sample_rate": 10,
"max_sampling_read": 2000,
"max_sampled_files": 3,
Expand Down
1 change: 1 addition & 0 deletions tap_spreadsheets_anywhere/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'),
Optional('invalid_format_action'): Any('ignore','fail'),
Optional('universal_newlines'): bool,
Optional('skip_initial'): int,
Optional('selected'): bool,
Optional('field_names'): [str],
Optional('search_prefix'): str,
Expand Down
15 changes: 10 additions & 5 deletions tap_spreadsheets_anywhere/format_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False):

def get_row_iterator(table_spec, uri):
universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True
skip_initial = table_spec.get("skip_initial", 0)

if 'format' not in table_spec or table_spec['format'] == 'detect':
lowered_uri = uri.lower()
Expand Down Expand Up @@ -153,19 +154,23 @@ def get_row_iterator(table_spec, uri):
try:
if format == 'csv':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
return tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader)
elif format == 'excel':
reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb')
if uri.lower().endswith(".xls"):
return tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader)
iterator = tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader)
else:
return tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader)
elif format == 'json':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
return tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader)
elif format == 'jsonl':
reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r')
return tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader)
except (ValueError,TypeError) as err:
raise InvalidFormatError(uri,message=err)

for _ in range(skip_initial):
next(iterator)

return iterator

0 comments on commit 12f4d96

Please sign in to comment.