diff --git a/.gitignore b/.gitignore index 1627579..e5535dd 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,4 @@ dmypy.json # Pyre type checker .pyre/ + diff --git a/README.md b/README.md index 585af3c..a0c920f 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,14 @@ The `config.json` contains an array called `files` that consists of dictionary o * `keys`: The names of the columns that constitute the unique keys for that entity * `encoding`: [Optional] The file encoding to use when reading the file (i.e. "latin1", "UTF-8"). Use this setting when you get a `UnicodeDecodeError` error. +The following optional entries are passed through to an internal CSV dialect that is then used to configure the CSV reader: +* `delimiter`: A one-character string used to separate fields. It defaults to ','. +* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True. +* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. It defaults to None, which disables escaping. +* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '"'. +* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False. +* `strict`: When True, raise a `csv.Error` exception on bad CSV input. The default is False. 
+ Example: ```json @@ -35,12 +43,14 @@ Example: "files": [ { "entity" : "leads", "path" : "/path/to/leads.csv", - "keys" : ["Id"] + "keys" : ["Id"], + "delimiter": ";" }, { "entity" : "opportunities", "path" : "/path/to/opportunities.csv", "keys" : ["Id"], - "encoding" : "latin1" + "encoding" : "latin1", + "skipinitialspace": true } ] } diff --git a/meltano.yml b/meltano.yml index f35686f..c424233 100644 --- a/meltano.yml +++ b/meltano.yml @@ -19,7 +19,7 @@ plugins: keys: - col1 settings: - - description: Array of objects containing keys: `entity`, `file`, `keys`, and `encoding` (Optional) + - description: Array of objects containing keys - `entity`, `path`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional) kind: array name: files - description: Project-relative path to JSON file holding array of objects with diff --git a/tap_csv/client.py b/tap_csv/client.py index 7633407..15b22cf 100644 --- a/tap_csv/client.py +++ b/tap_csv/client.py @@ -81,8 +81,16 @@ def is_valid_filename(self, file_path: str) -> bool: def get_rows(self, file_path: str) -> Iterable[list]: """Return a generator of the rows in a particular CSV file.""" encoding = self.file_config.get("encoding", None) + csv.register_dialect("tap_dialect", + delimiter=self.file_config.get("delimiter", ","), + doublequote=self.file_config.get("doublequote", True), + escapechar=self.file_config.get("escapechar", None), + quotechar=self.file_config.get("quotechar", '"'), + skipinitialspace=self.file_config.get("skipinitialspace", False), + strict=self.file_config.get("strict", False) + ) with open(file_path, "r", encoding=encoding) as f: - reader = csv.reader(f) + reader = csv.reader(f, dialect="tap_dialect") for row in reader: yield row diff --git a/tap_csv/tap.py b/tap_csv/tap.py index ab32316..2b0b785 100644 --- a/tap_csv/tap.py +++ b/tap_csv/tap.py @@ -25,6 +25,12 @@ class 
TapCSV(Tap): th.Property("path", th.StringType, required=True), th.Property("keys", th.ArrayType(th.StringType), required=True), th.Property("encoding", th.StringType, required=False), + th.Property("delimiter", th.StringType, required=False), + th.Property("doublequote", th.BooleanType, required=False), + th.Property("escapechar", th.StringType, required=False), + th.Property("quotechar", th.StringType, required=False), + th.Property("skipinitialspace", th.BooleanType, required=False), + th.Property("strict", th.BooleanType, required=False), ) ), description="An array of csv file stream settings.", diff --git a/tap_csv/tests/test_core.py b/tap_csv/tests/test_core.py index 46ab362..5d41aca 100644 --- a/tap_csv/tests/test_core.py +++ b/tap_csv/tests/test_core.py @@ -42,3 +42,27 @@ def test_standard_tap_tests_encoding(): tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) for test in tests: test() + + +# Run standard built-in tap tests from the SDK, with different CSV dialect settings: +def test_standard_tap_tests_csv_dialect(): + """Run standard built-in tap tests from the SDK, with different CSV dialect settings.""" + test_data_dir = os.path.dirname(os.path.abspath(__file__)) + SAMPLE_CONFIG = { + "files": [ + { + "entity": "test", + "path": f"{test_data_dir}/data/alphabet_encoding.csv", + "keys": [], + "delimiter": ",", + "doublequote": True, + "escapechar": "^", + "quotechar": "\"", + "skipinitialspace": True, + "strict": True + } + ] + } + tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG) + for test in tests: + test()