From e5aa32bd04ac1b029b0b9518c8599feb846ffcbb Mon Sep 17 00:00:00 2001 From: Richard Kooijman Date: Sat, 8 Oct 2022 14:04:34 +0200 Subject: [PATCH 1/4] added set of sensible options to configure CSV reading in more detail (delimiter, quoting) --- .gitignore | 6 ++++++ README.md | 14 ++++++++++++-- meltano.yml | 2 +- tap_csv/client.py | 10 +++++++++- tap_csv/tap.py | 6 ++++++ tap_csv/tests/test_core.py | 24 ++++++++++++++++++++++++ 6 files changed, 58 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 1627579..c1e9bdb 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,9 @@ dmypy.json # Pyre type checker .pyre/ + +/.idea/ + +/data/ + +config_csv.json diff --git a/README.md b/README.md index 585af3c..a0c920f 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,14 @@ The `config.json` contains an array called `files` that consists of dictionary o * `keys`: The names of the columns that constitute the unique keys for that entity * `encoding`: [Optional] The file encoding to use when reading the file (i.e. "latin1", "UTF-8"). Use this setting when you get a `UnicodeDecodeError` error. +The following entries are passed through in an internal CSV dialect that then is used to configure the CSV reader: +* `delimiter`: A one-character string used to separate fields. It defaults to ','. +* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True. +* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. It defaults to None, which disables escaping. +* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '"'. 
+* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False. +* `strict`: When True, raise a `csv.Error` exception on bad CSV input. The default is False. + Example: ```json @@ -35,12 +43,14 @@ Example: "files": [ { "entity" : "leads", "path" : "/path/to/leads.csv", - "keys" : ["Id"] + "keys" : ["Id"], + "delimiter": ";" }, { "entity" : "opportunities", "path" : "/path/to/opportunities.csv", "keys" : ["Id"], - "encoding" : "latin1" + "encoding" : "latin1", + "skipinitialspace": true } ] } diff --git a/meltano.yml b/meltano.yml index f35686f..447df5f 100644 --- a/meltano.yml +++ b/meltano.yml @@ -19,7 +19,7 @@ plugins: keys: - col1 settings: - - description: Array of objects containing keys: `entity`, `file`, `keys`, and `encoding` (Optional) + - description: Array of objects containing keys: `entity`, `file`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional) kind: array name: files - description: Project-relative path to JSON file holding array of objects with diff --git a/tap_csv/client.py b/tap_csv/client.py index 7633407..926638b 100644 --- a/tap_csv/client.py +++ b/tap_csv/client.py @@ -81,8 +81,16 @@ def is_valid_filename(self, file_path: str) -> bool: def get_rows(self, file_path: str) -> Iterable[list]: """Return a generator of the rows in a particular CSV file.""" encoding = self.file_config.get("encoding", None) + csv.register_dialect("tap_dialect", + delimiter = self.file_config.get("delimiter", ","), + doublequote = self.file_config.get("doublequote", True), + escapechar = self.file_config.get("escapechar", None), + quotechar = self.file_config.get("quotechar", '"'), + skipinitialspace = self.file_config.get("skipinitialspace", False), + strict = self.file_config.get("strict", False) + ) with open(file_path, "r", encoding=encoding) as f: - reader = csv.reader(f) + reader = 
csv.reader(f, dialect="tap_dialect") for row in reader: yield row diff --git a/tap_csv/tap.py b/tap_csv/tap.py index ab32316..2b0b785 100644 --- a/tap_csv/tap.py +++ b/tap_csv/tap.py @@ -25,6 +25,12 @@ class TapCSV(Tap): th.Property("path", th.StringType, required=True), th.Property("keys", th.ArrayType(th.StringType), required=True), th.Property("encoding", th.StringType, required=False), + th.Property("delimiter", th.StringType, required=False), + th.Property("doublequote", th.BooleanType, required=False), + th.Property("escapechar", th.StringType, required=False), + th.Property("quotechar", th.StringType, required=False), + th.Property("skipinitialspace", th.BooleanType, required=False), + th.Property("strict", th.BooleanType, required=False), ) ), description="An array of csv file stream settings.", diff --git a/tap_csv/tests/test_core.py b/tap_csv/tests/test_core.py index 46ab362..5d41aca 100644 --- a/tap_csv/tests/test_core.py +++ b/tap_csv/tests/test_core.py @@ -42,3 +42,27 @@ def test_standard_tap_tests_encoding(): tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) for test in tests: test() + + +# Run standard built-in tap tests from the SDK, with different CSV dialect settings: +def test_standard_tap_tests_csv_dialect(): + """Run standard built-in tap tests from the SDK, with different CSV dialect settings.""" + test_data_dir = os.path.dirname(os.path.abspath(__file__)) + SAMPLE_CONFIG = { + "files": [ + { + "entity": "test", + "path": f"{test_data_dir}/data/alphabet_encoding.csv", + "keys": [], + "delimiter": ",", + "doublequote": True, + "escapechar": "^", + "quotechar": "\"", + "skipinitialspace": True, + "strict": True + } + ] + } + tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) + for test in tests: + test() From a75e7337018ed950a1492f2224698b83f81813aa Mon Sep 17 00:00:00 2001 From: Richard Kooijman Date: Sat, 8 Oct 2022 14:16:33 +0200 Subject: [PATCH 2/4] fixed formatting issue --- tap_csv/client.py | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/tap_csv/client.py b/tap_csv/client.py index 926638b..15b22cf 100644 --- a/tap_csv/client.py +++ b/tap_csv/client.py @@ -82,12 +82,12 @@ def get_rows(self, file_path: str) -> Iterable[list]: """Return a generator of the rows in a particular CSV file.""" encoding = self.file_config.get("encoding", None) csv.register_dialect("tap_dialect", - delimiter = self.file_config.get("delimiter", ","), - doublequote = self.file_config.get("doublequote", True), - escapechar = self.file_config.get("escapechar", None), - quotechar = self.file_config.get("quotechar", '"'), - skipinitialspace = self.file_config.get("skipinitialspace", False), - strict = self.file_config.get("strict", False) + delimiter=self.file_config.get("delimiter", ","), + doublequote=self.file_config.get("doublequote", True), + escapechar=self.file_config.get("escapechar", None), + quotechar=self.file_config.get("quotechar", '"'), + skipinitialspace=self.file_config.get("skipinitialspace", False), + strict=self.file_config.get("strict", False) ) with open(file_path, "r", encoding=encoding) as f: reader = csv.reader(f, dialect="tap_dialect") From ae4a878755af60fa004ed0df569b2231aee62271 Mon Sep 17 00:00:00 2001 From: Richard Kooijman Date: Mon, 10 Oct 2022 17:51:05 +0200 Subject: [PATCH 3/4] formatting fix Co-authored-by: Pat Nadolny --- meltano.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meltano.yml b/meltano.yml index 447df5f..c424233 100644 --- a/meltano.yml +++ b/meltano.yml @@ -19,7 +19,7 @@ plugins: keys: - col1 settings: - - description: Array of objects containing keys: `entity`, `file`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional) + - description: Array of objects containing keys - `entity`, `file`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` 
(Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional) kind: array name: files - description: Project-relative path to JSON file holding array of objects with From 3fd5a5342c74ec35e2ba75b5c8df5b343c050e47 Mon Sep 17 00:00:00 2001 From: Richard Kooijman Date: Mon, 10 Oct 2022 17:54:42 +0200 Subject: [PATCH 4/4] cleaned up .gitignore --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index c1e9bdb..e5535dd 100644 --- a/.gitignore +++ b/.gitignore @@ -131,8 +131,3 @@ dmypy.json # Pyre type checker .pyre/ -/.idea/ - -/data/ - -config_csv.json